blob: f031078a5ce0a21df5ad400b4781e1ad3520c247 [file] [log] [blame]
Doug Zongker424296a2014-09-02 08:53:09 -07001# Copyright (C) 2014 The Android Open Source Project
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
Doug Zongkerfc44a512014-08-26 13:10:25 -070015from __future__ import print_function
16
17from collections import deque, OrderedDict
18from hashlib import sha1
Doug Zongker62338182014-09-08 08:29:55 -070019import heapq
Doug Zongkerfc44a512014-08-26 13:10:25 -070020import itertools
21import multiprocessing
22import os
23import pprint
24import re
25import subprocess
26import sys
27import threading
28import tempfile
29
30from rangelib import *
31
Doug Zongkerab7ca1d2014-08-26 10:40:28 -070032__all__ = ["EmptyImage", "DataImage", "BlockImageDiff"]
33
def compute_patch(src, tgt, imgdiff=False):
  """Compute a binary patch turning the 'src' data into the 'tgt' data.

  'src' and 'tgt' are iterables of strings; their elements are
  concatenated to form the source and target contents.  The patch is
  produced by the external "imgdiff -z" tool when 'imgdiff' is True
  (for zip-format data), otherwise by the external "bsdiff" tool.

  Returns the patch as a string.  Raises ValueError if the external
  diff tool exits with a nonzero status.
  """
  srcfd, srcfile = tempfile.mkstemp(prefix="src-")
  tgtfd, tgtfile = tempfile.mkstemp(prefix="tgt-")
  patchfd, patchfile = tempfile.mkstemp(prefix="patch-")
  os.close(patchfd)

  try:
    with os.fdopen(srcfd, "wb") as f_src:
      for p in src:
        f_src.write(p)

    with os.fdopen(tgtfd, "wb") as f_tgt:
      for p in tgt:
        f_tgt.write(p)

    # The diff tool creates the patch file itself; remove the
    # placeholder that mkstemp made for us.
    try:
      os.unlink(patchfile)
    except OSError:
      pass

    if imgdiff:
      # Use os.devnull and a context manager so the sink file object
      # is always closed (the previous open("/dev/null", "a") leaked
      # a file descriptor on every call).
      with open(os.devnull, "a") as devnull:
        p = subprocess.call(["imgdiff", "-z", srcfile, tgtfile, patchfile],
                            stdout=devnull,
                            stderr=subprocess.STDOUT)
    else:
      p = subprocess.call(["bsdiff", srcfile, tgtfile, patchfile])

    if p:
      raise ValueError("diff failed: " + str(p))

    with open(patchfile, "rb") as f:
      return f.read()
  finally:
    # Best-effort cleanup of all three temporary files.
    try:
      os.unlink(srcfile)
      os.unlink(tgtfile)
      os.unlink(patchfile)
    except OSError:
      pass
71
class EmptyImage(object):
  """A zero-length image.

  Provides the image interface (blocksize, care_map, total_blocks,
  file_map, ReadRangeSet, TotalSha1) but contains no data; used as the
  source image when none is supplied to BlockImageDiff.
  """
  blocksize = 4096   # the updater always works on 4k blocks
  care_map = RangeSet()   # no blocks contain data
  total_blocks = 0
  file_map = {}
  def ReadRangeSet(self, ranges):
    # No data, so every read yields an empty sequence of pieces.
    return ()
  def TotalSha1(self):
    # SHA-1 of zero bytes of content.
    return sha1().hexdigest()
82
83
class DataImage(object):
  """An image wrapped around a single string of data."""

  def __init__(self, data, trim=False, pad=False):
    """Wrap 'data' as an image.

    If len(data) is not a multiple of the 4096-byte block size, 'trim'
    drops the partial trailing block and 'pad' zero-fills it to a full
    block; with neither, ValueError is raised.  'trim' and 'pad' are
    mutually exclusive.
    """
    self.data = data
    self.blocksize = 4096

    assert not (trim and pad)

    partial = len(self.data) % self.blocksize
    if partial > 0:
      if trim:
        self.data = self.data[:-partial]
      elif pad:
        self.data += '\0' * (self.blocksize - partial)
      else:
        raise ValueError(("data for DataImage must be multiple of %d bytes "
                          "unless trim or pad is specified") %
                         (self.blocksize,))

    assert len(self.data) % self.blocksize == 0

    # Use floor division ("//") so total_blocks is always an int; the
    # previous plain "/" relied on Python 2's truncating int division.
    self.total_blocks = len(self.data) // self.blocksize
    self.care_map = RangeSet(data=(0, self.total_blocks))

    # Classify each block as all-zero or not; the lists collect
    # start,end pairs in the flat form that RangeSet accepts.
    zero_blocks = []
    nonzero_blocks = []
    reference = '\0' * self.blocksize

    for i in range(self.total_blocks):
      d = self.data[i*self.blocksize : (i+1)*self.blocksize]
      if d == reference:
        zero_blocks.append(i)
        zero_blocks.append(i+1)
      else:
        nonzero_blocks.append(i)
        nonzero_blocks.append(i+1)

    self.file_map = {"__ZERO": RangeSet(zero_blocks),
                     "__NONZERO": RangeSet(nonzero_blocks)}

  def ReadRangeSet(self, ranges):
    # One string per (start, end) block range.
    return [self.data[s*self.blocksize:e*self.blocksize] for (s, e) in ranges]

  def TotalSha1(self):
    # Computed lazily and cached on the instance after first use.
    if not hasattr(self, "sha1"):
      self.sha1 = sha1(self.data).hexdigest()
    return self.sha1
132
Doug Zongkerfc44a512014-08-26 13:10:25 -0700133
class Transfer(object):
  """One step of the update: producing one range of target blocks,
  possibly by reading a range of source blocks.

  On construction the transfer appends itself to 'by_id' and records
  its position there as its 'id'.
  """

  def __init__(self, tgt_name, src_name, tgt_ranges, src_ranges, style, by_id):
    self.tgt_name = tgt_name
    self.src_name = src_name
    self.tgt_ranges = tgt_ranges
    self.src_ranges = src_ranges
    self.style = style

    # "Intact" means both range sets are monotonic, i.e. the file's
    # data is laid out on disk in increasing block order.
    self.intact = (getattr(tgt_ranges, "monotonic", False) and
                   getattr(src_ranges, "monotonic", False))

    # We use OrderedDict rather than dict so that the output is
    # repeatable; otherwise it would depend on the hash values of the
    # Transfer objects.
    self.goes_before = OrderedDict()
    self.goes_after = OrderedDict()

    # (stash id, RangeSet) pairs: stashes created just before this
    # transfer runs, and stashes it consumes.
    self.stash_before = []
    self.use_stash = []

    self.id = len(by_id)
    by_id.append(self)

  def NetStashChange(self):
    """Return the net number of blocks this transfer adds to the stash."""
    added = sum(sr.size() for _, sr in self.stash_before)
    removed = sum(sr.size() for _, sr in self.use_stash)
    return added - removed

  def __str__(self):
    return "%d: <%s %s to %s>" % (
        self.id, self.src_ranges, self.style, self.tgt_ranges)
162
163
164# BlockImageDiff works on two image objects. An image object is
165# anything that provides the following attributes:
166#
167# blocksize: the size in bytes of a block, currently must be 4096.
168#
169# total_blocks: the total size of the partition/image, in blocks.
170#
171# care_map: a RangeSet containing which blocks (in the range [0,
172# total_blocks) we actually care about; i.e. which blocks contain
173# data.
174#
175# file_map: a dict that partitions the blocks contained in care_map
176# into smaller domains that are useful for doing diffs on.
177# (Typically a domain is a file, and the key in file_map is the
178# pathname.)
179#
180# ReadRangeSet(): a function that takes a RangeSet and returns the
181# data contained in the image blocks of that RangeSet. The data
182# is returned as a list or tuple of strings; concatenating the
183# elements together should produce the requested data.
184# Implementations are free to break up the data into list/tuple
185# elements in any way that is convenient.
186#
Doug Zongkerab7ca1d2014-08-26 10:40:28 -0700187# TotalSha1(): a function that returns (as a hex string) the SHA-1
188# hash of all the data in the image (ie, all the blocks in the
189# care_map)
190#
Doug Zongkerfc44a512014-08-26 13:10:25 -0700191# When creating a BlockImageDiff, the src image may be None, in which
192# case the list of transfers produced will never read from the
193# original image.
194
class BlockImageDiff(object):
  def __init__(self, tgt, src=None, threads=None, version=2):
    """Set up a block-level diff from image 'src' to image 'tgt'.

    'src' may be None, in which case an EmptyImage is used and the
    resulting transfer list never reads from the original image.
    'threads' defaults to half the CPU count (minimum 1).  'version'
    selects the transfer-list format and must be 1 or 2.
    """
    if threads is None:
      # Default to half the available cores, but always at least one.
      threads = multiprocessing.cpu_count() // 2
      if threads == 0:
        threads = 1
    self.threads = threads
    self.version = version

    assert version in (1, 2)

    self.tgt = tgt
    # Diffing against an empty source makes this a full update.
    self.src = EmptyImage() if src is None else src

    # The updater code that installs the patch always uses 4k blocks.
    assert tgt.blocksize == 4096
    assert self.src.blocksize == 4096

    # The range sets in each filemap should comprise a partition of
    # the care map.
    self.AssertPartition(self.src.care_map, self.src.file_map.values())
    self.AssertPartition(tgt.care_map, tgt.file_map.values())
218
  def Compute(self, prefix):
    """Run the whole pipeline and write all output files.

    Produces '<prefix>.transfer.list', '<prefix>.new.dat' and
    '<prefix>.patch.dat'.
    """
    # When looking for a source file to use as the diff input for a
    # target file, we try:
    #   1) an exact path match if available, otherwise
    #   2) an exact basename match if available, otherwise
    #   3) a basename match after all runs of digits are replaced by
    #      "#" if available, otherwise
    #   4) we have no source for this target.
    self.AbbreviateSourceNames()
    self.FindTransfers()

    # Find the ordering dependencies among transfers (this is O(n^2)
    # in the number of transfers).
    self.GenerateDigraph()
    # Find a sequence of transfers that satisfies as many ordering
    # dependencies as possible (heuristically).
    self.FindVertexSequence()
    # Fix up the ordering dependencies that the sequence didn't
    # satisfy.
    if self.version == 1:
      # Version 1 cannot stash, so drop the offending source blocks.
      self.RemoveBackwardEdges()
    else:
      # Version 2 can stash, so reverse bad edges by stashing the
      # overlapping blocks, then re-sort to reduce stash usage.
      self.ReverseBackwardEdges()
      self.ImproveVertexSequence()

    # Double-check our work.
    self.AssertSequenceGood()

    self.ComputePatches(prefix)
    self.WriteTransfers(prefix)
249
250 def WriteTransfers(self, prefix):
251 out = []
252
Doug Zongkerfc44a512014-08-26 13:10:25 -0700253 total = 0
254 performs_read = False
255
Doug Zongker62338182014-09-08 08:29:55 -0700256 stashes = {}
257 stashed_blocks = 0
258 max_stashed_blocks = 0
259
260 free_stash_ids = []
261 next_stash_id = 0
262
Doug Zongkerfc44a512014-08-26 13:10:25 -0700263 for xf in self.transfers:
264
Doug Zongker62338182014-09-08 08:29:55 -0700265 if self.version < 2:
266 assert not xf.stash_before
267 assert not xf.use_stash
268
269 for s, sr in xf.stash_before:
270 assert s not in stashes
271 if free_stash_ids:
272 sid = heapq.heappop(free_stash_ids)
273 else:
274 sid = next_stash_id
275 next_stash_id += 1
276 stashes[s] = sid
277 stashed_blocks += sr.size()
278 out.append("stash %d %s\n" % (sid, sr.to_string_raw()))
279
280 if stashed_blocks > max_stashed_blocks:
281 max_stashed_blocks = stashed_blocks
282
Jesse Zhao7b985f62015-03-02 16:53:08 -0800283 free_string = []
284
Doug Zongker62338182014-09-08 08:29:55 -0700285 if self.version == 1:
286 src_string = xf.src_ranges.to_string_raw()
287 elif self.version == 2:
288
289 # <# blocks> <src ranges>
290 # OR
291 # <# blocks> <src ranges> <src locs> <stash refs...>
292 # OR
293 # <# blocks> - <stash refs...>
294
295 size = xf.src_ranges.size()
296 src_string = [str(size)]
297
298 unstashed_src_ranges = xf.src_ranges
299 mapped_stashes = []
300 for s, sr in xf.use_stash:
301 sid = stashes.pop(s)
302 stashed_blocks -= sr.size()
303 unstashed_src_ranges = unstashed_src_ranges.subtract(sr)
304 sr = xf.src_ranges.map_within(sr)
305 mapped_stashes.append(sr)
306 src_string.append("%d:%s" % (sid, sr.to_string_raw()))
307 heapq.heappush(free_stash_ids, sid)
308
309 if unstashed_src_ranges:
310 src_string.insert(1, unstashed_src_ranges.to_string_raw())
311 if xf.use_stash:
312 mapped_unstashed = xf.src_ranges.map_within(unstashed_src_ranges)
313 src_string.insert(2, mapped_unstashed.to_string_raw())
314 mapped_stashes.append(mapped_unstashed)
315 self.AssertPartition(RangeSet(data=(0, size)), mapped_stashes)
316 else:
317 src_string.insert(1, "-")
318 self.AssertPartition(RangeSet(data=(0, size)), mapped_stashes)
319
320 src_string = " ".join(src_string)
321
322 # both versions:
323 # zero <rangeset>
324 # new <rangeset>
325 # erase <rangeset>
326 #
327 # version 1:
328 # bsdiff patchstart patchlen <src rangeset> <tgt rangeset>
329 # imgdiff patchstart patchlen <src rangeset> <tgt rangeset>
330 # move <src rangeset> <tgt rangeset>
331 #
332 # version 2:
333 # bsdiff patchstart patchlen <tgt rangeset> <src_string>
334 # imgdiff patchstart patchlen <tgt rangeset> <src_string>
335 # move <tgt rangeset> <src_string>
Doug Zongkerfc44a512014-08-26 13:10:25 -0700336
337 tgt_size = xf.tgt_ranges.size()
338
339 if xf.style == "new":
340 assert xf.tgt_ranges
341 out.append("%s %s\n" % (xf.style, xf.tgt_ranges.to_string_raw()))
342 total += tgt_size
343 elif xf.style == "move":
344 performs_read = True
345 assert xf.tgt_ranges
346 assert xf.src_ranges.size() == tgt_size
347 if xf.src_ranges != xf.tgt_ranges:
Doug Zongker62338182014-09-08 08:29:55 -0700348 if self.version == 1:
349 out.append("%s %s %s\n" % (
350 xf.style,
351 xf.src_ranges.to_string_raw(), xf.tgt_ranges.to_string_raw()))
352 elif self.version == 2:
353 out.append("%s %s %s\n" % (
354 xf.style,
355 xf.tgt_ranges.to_string_raw(), src_string))
Doug Zongkerfc44a512014-08-26 13:10:25 -0700356 total += tgt_size
357 elif xf.style in ("bsdiff", "imgdiff"):
358 performs_read = True
359 assert xf.tgt_ranges
360 assert xf.src_ranges
Doug Zongker62338182014-09-08 08:29:55 -0700361 if self.version == 1:
362 out.append("%s %d %d %s %s\n" % (
363 xf.style, xf.patch_start, xf.patch_len,
364 xf.src_ranges.to_string_raw(), xf.tgt_ranges.to_string_raw()))
365 elif self.version == 2:
366 out.append("%s %d %d %s %s\n" % (
367 xf.style, xf.patch_start, xf.patch_len,
368 xf.tgt_ranges.to_string_raw(), src_string))
Doug Zongkerfc44a512014-08-26 13:10:25 -0700369 total += tgt_size
370 elif xf.style == "zero":
371 assert xf.tgt_ranges
372 to_zero = xf.tgt_ranges.subtract(xf.src_ranges)
373 if to_zero:
374 out.append("%s %s\n" % (xf.style, to_zero.to_string_raw()))
375 total += to_zero.size()
376 else:
377 raise ValueError, "unknown transfer style '%s'\n" % (xf.style,)
378
Doug Zongker62338182014-09-08 08:29:55 -0700379
380 # sanity check: abort if we're going to need more than 512 MB if
381 # stash space
382 assert max_stashed_blocks * self.tgt.blocksize < (512 << 20)
Doug Zongkerfc44a512014-08-26 13:10:25 -0700383
384 all_tgt = RangeSet(data=(0, self.tgt.total_blocks))
385 if performs_read:
386 # if some of the original data is used, then at the end we'll
387 # erase all the blocks on the partition that don't contain data
388 # in the new image.
389 new_dontcare = all_tgt.subtract(self.tgt.care_map)
390 if new_dontcare:
391 out.append("erase %s\n" % (new_dontcare.to_string_raw(),))
392 else:
393 # if nothing is read (ie, this is a full OTA), then we can start
394 # by erasing the entire partition.
Doug Zongkere985f6f2014-09-09 12:38:47 -0700395 out.insert(0, "erase %s\n" % (all_tgt.to_string_raw(),))
396
397 out.insert(0, "%d\n" % (self.version,)) # format version number
398 out.insert(1, str(total) + "\n")
399 if self.version >= 2:
400 # version 2 only: after the total block count, we give the number
401 # of stash slots needed, and the maximum size needed (in blocks)
402 out.insert(2, str(next_stash_id) + "\n")
403 out.insert(3, str(max_stashed_blocks) + "\n")
Doug Zongkerfc44a512014-08-26 13:10:25 -0700404
405 with open(prefix + ".transfer.list", "wb") as f:
406 for i in out:
407 f.write(i)
408
Doug Zongker62338182014-09-08 08:29:55 -0700409 if self.version >= 2:
410 print("max stashed blocks: %d (%d bytes)\n" % (
411 max_stashed_blocks, max_stashed_blocks * self.tgt.blocksize))
412
  def ComputePatches(self, prefix):
    """Resolve every "diff" transfer and write the data blobs.

    Writes the verbatim data of all "new" transfers to
    '<prefix>.new.dat' and the concatenated patches to
    '<prefix>.patch.dat', recording each transfer's patch_start and
    patch_len offsets into the latter.  "diff" transfers are
    reclassified as "move" (identical content), "imgdiff" or "bsdiff".
    Patch computation is spread over self.threads worker threads.
    """
    print("Reticulating splines...")
    # Work queue of (tgt_size, src, tgt, xf, patch_num) tuples.
    diff_q = []
    patch_num = 0
    with open(prefix + ".new.dat", "wb") as new_f:
      for xf in self.transfers:
        if xf.style == "zero":
          pass
        elif xf.style == "new":
          # No source to diff against: emit the data verbatim.
          for piece in self.tgt.ReadRangeSet(xf.tgt_ranges):
            new_f.write(piece)
        elif xf.style == "diff":
          src = self.src.ReadRangeSet(xf.src_ranges)
          tgt = self.tgt.ReadRangeSet(xf.tgt_ranges)

          # We can't compare src and tgt directly because they may have
          # the same content but be broken up into blocks differently, eg:
          #
          #   ["he", "llo"]  vs  ["h", "ello"]
          #
          # We want those to compare equal, ideally without having to
          # actually concatenate the strings (these may be tens of
          # megabytes).

          src_sha1 = sha1()
          for p in src:
            src_sha1.update(p)
          tgt_sha1 = sha1()
          tgt_size = 0
          for p in tgt:
            tgt_sha1.update(p)
            tgt_size += len(p)

          if src_sha1.digest() == tgt_sha1.digest():
            # These are identical; we don't need to generate a patch,
            # just issue copy commands on the device.
            xf.style = "move"
          else:
            # For files in zip format (eg, APKs, JARs, etc.) we would
            # like to use imgdiff -z if possible (because it usually
            # produces significantly smaller patches than bsdiff).
            # This is permissible if:
            #
            #  - the source and target files are monotonic (ie, the
            #    data is stored with blocks in increasing order), and
            #  - we haven't removed any blocks from the source set.
            #
            # If these conditions are satisfied then appending all the
            # blocks in the set together in order will produce a valid
            # zip file (plus possibly extra zeros in the last block),
            # which is what imgdiff needs to operate.  (imgdiff is
            # fine with extra zeros at the end of the file.)
            imgdiff = (xf.intact and
                       xf.tgt_name.split(".")[-1].lower()
                       in ("apk", "jar", "zip"))
            xf.style = "imgdiff" if imgdiff else "bsdiff"
            diff_q.append((tgt_size, src, tgt, xf, patch_num))
            patch_num += 1

        else:
          assert False, "unknown style " + xf.style

    if diff_q:
      if self.threads > 1:
        print("Computing patches (using %d threads)..." % (self.threads,))
      else:
        print("Computing patches...")
      # Sorted by target size; workers pop from the end, so the
      # largest jobs are started first.
      diff_q.sort()

      patches = [None] * patch_num

      # 'lock' guards both shared structures: diff_q and patches.
      lock = threading.Lock()
      def diff_worker():
        while True:
          with lock:
            if not diff_q: return
            tgt_size, src, tgt, xf, patchnum = diff_q.pop()
          patch = compute_patch(src, tgt, imgdiff=(xf.style == "imgdiff"))
          size = len(patch)
          with lock:
            patches[patchnum] = (patch, xf)
            print("%10d %10d (%6.2f%%) %7s %s" % (
                size, tgt_size, size * 100.0 / tgt_size, xf.style,
                xf.tgt_name if xf.tgt_name == xf.src_name else (
                    xf.tgt_name + " (from " + xf.src_name + ")")))

      threads = [threading.Thread(target=diff_worker)
                 for i in range(self.threads)]
      for th in threads:
        th.start()
      while threads:
        threads.pop().join()
    else:
      patches = []

    # Concatenate all patches into .patch.dat, recording offsets.
    p = 0
    with open(prefix + ".patch.dat", "wb") as patch_f:
      for patch, xf in patches:
        xf.patch_start = p
        xf.patch_len = len(patch)
        patch_f.write(patch)
        p += len(patch)
515
516 def AssertSequenceGood(self):
517 # Simulate the sequences of transfers we will output, and check that:
518 # - we never read a block after writing it, and
519 # - we write every block we care about exactly once.
520
521 # Start with no blocks having been touched yet.
522 touched = RangeSet()
523
524 # Imagine processing the transfers in order.
525 for xf in self.transfers:
526 # Check that the input blocks for this transfer haven't yet been touched.
Doug Zongker62338182014-09-08 08:29:55 -0700527
528 x = xf.src_ranges
529 if self.version >= 2:
530 for _, sr in xf.use_stash:
531 x = x.subtract(sr)
532
533 assert not touched.overlaps(x)
Doug Zongkerfc44a512014-08-26 13:10:25 -0700534 # Check that the output blocks for this transfer haven't yet been touched.
535 assert not touched.overlaps(xf.tgt_ranges)
536 # Touch all the blocks written by this transfer.
537 touched = touched.union(xf.tgt_ranges)
538
539 # Check that we've written every target block.
540 assert touched == self.tgt.care_map
541
Doug Zongker62338182014-09-08 08:29:55 -0700542 def ImproveVertexSequence(self):
543 print("Improving vertex order...")
544
545 # At this point our digraph is acyclic; we reversed any edges that
546 # were backwards in the heuristically-generated sequence. The
547 # previously-generated order is still acceptable, but we hope to
548 # find a better order that needs less memory for stashed data.
549 # Now we do a topological sort to generate a new vertex order,
550 # using a greedy algorithm to choose which vertex goes next
551 # whenever we have a choice.
552
553 # Make a copy of the edge set; this copy will get destroyed by the
554 # algorithm.
555 for xf in self.transfers:
556 xf.incoming = xf.goes_after.copy()
557 xf.outgoing = xf.goes_before.copy()
558
559 L = [] # the new vertex order
560
561 # S is the set of sources in the remaining graph; we always choose
562 # the one that leaves the least amount of stashed data after it's
563 # executed.
564 S = [(u.NetStashChange(), u.order, u) for u in self.transfers
565 if not u.incoming]
566 heapq.heapify(S)
567
568 while S:
569 _, _, xf = heapq.heappop(S)
570 L.append(xf)
571 for u in xf.outgoing:
572 del u.incoming[xf]
573 if not u.incoming:
574 heapq.heappush(S, (u.NetStashChange(), u.order, u))
575
576 # if this fails then our graph had a cycle.
577 assert len(L) == len(self.transfers)
578
579 self.transfers = L
580 for i, xf in enumerate(L):
581 xf.order = i
582
Doug Zongkerfc44a512014-08-26 13:10:25 -0700583 def RemoveBackwardEdges(self):
584 print("Removing backward edges...")
585 in_order = 0
586 out_of_order = 0
587 lost_source = 0
588
589 for xf in self.transfers:
Doug Zongkerfc44a512014-08-26 13:10:25 -0700590 lost = 0
591 size = xf.src_ranges.size()
592 for u in xf.goes_before:
593 # xf should go before u
594 if xf.order < u.order:
595 # it does, hurray!
Doug Zongker62338182014-09-08 08:29:55 -0700596 in_order += 1
Doug Zongkerfc44a512014-08-26 13:10:25 -0700597 else:
598 # it doesn't, boo. trim the blocks that u writes from xf's
599 # source, so that xf can go after u.
Doug Zongker62338182014-09-08 08:29:55 -0700600 out_of_order += 1
Doug Zongkerfc44a512014-08-26 13:10:25 -0700601 assert xf.src_ranges.overlaps(u.tgt_ranges)
602 xf.src_ranges = xf.src_ranges.subtract(u.tgt_ranges)
603 xf.intact = False
604
605 if xf.style == "diff" and not xf.src_ranges:
606 # nothing left to diff from; treat as new data
607 xf.style = "new"
608
609 lost = size - xf.src_ranges.size()
610 lost_source += lost
Doug Zongkerfc44a512014-08-26 13:10:25 -0700611
612 print((" %d/%d dependencies (%.2f%%) were violated; "
613 "%d source blocks removed.") %
614 (out_of_order, in_order + out_of_order,
615 (out_of_order * 100.0 / (in_order + out_of_order))
616 if (in_order + out_of_order) else 0.0,
617 lost_source))
618
Doug Zongker62338182014-09-08 08:29:55 -0700619 def ReverseBackwardEdges(self):
620 print("Reversing backward edges...")
621 in_order = 0
622 out_of_order = 0
623 stashes = 0
624 stash_size = 0
625
626 for xf in self.transfers:
627 lost = 0
628 size = xf.src_ranges.size()
629 for u in xf.goes_before.copy():
630 # xf should go before u
631 if xf.order < u.order:
632 # it does, hurray!
633 in_order += 1
634 else:
635 # it doesn't, boo. modify u to stash the blocks that it
636 # writes that xf wants to read, and then require u to go
637 # before xf.
638 out_of_order += 1
639
640 overlap = xf.src_ranges.intersect(u.tgt_ranges)
641 assert overlap
642
643 u.stash_before.append((stashes, overlap))
644 xf.use_stash.append((stashes, overlap))
645 stashes += 1
646 stash_size += overlap.size()
647
648 # reverse the edge direction; now xf must go after u
649 del xf.goes_before[u]
650 del u.goes_after[xf]
651 xf.goes_after[u] = None # value doesn't matter
652 u.goes_before[xf] = None
653
654 print((" %d/%d dependencies (%.2f%%) were violated; "
655 "%d source blocks stashed.") %
656 (out_of_order, in_order + out_of_order,
657 (out_of_order * 100.0 / (in_order + out_of_order))
658 if (in_order + out_of_order) else 0.0,
659 stash_size))
660
  def FindVertexSequence(self):
    """Order self.transfers to heuristically minimize violated edges.

    Assigns each transfer an 'order' index and rearranges
    self.transfers into that sequence.  The edges that still point
    backwards afterwards are repaired by RemoveBackwardEdges or
    ReverseBackwardEdges.
    """
    print("Finding vertex sequence...")

    # This is based on "A Fast & Effective Heuristic for the Feedback
    # Arc Set Problem" by P. Eades, X. Lin, and W.F. Smyth.  Think of
    # it as starting with the digraph G and moving all the vertices to
    # be on a horizontal line in some order, trying to minimize the
    # number of edges that end up pointing to the left.  Left-pointing
    # edges will get removed to turn the digraph into a DAG.  In this
    # case each edge has a weight which is the number of source blocks
    # we'll lose if that edge is removed; we try to minimize the total
    # weight rather than just the number of edges.

    # Make a copy of the edge set; this copy will get destroyed by the
    # algorithm.
    for xf in self.transfers:
      xf.incoming = xf.goes_after.copy()
      xf.outgoing = xf.goes_before.copy()

    # We use an OrderedDict instead of just a set so that the output
    # is repeatable; otherwise it would depend on the hash values of
    # the transfer objects.
    G = OrderedDict()
    for xf in self.transfers:
      G[xf] = None
    s1 = deque()  # the left side of the sequence, built from left to right
    s2 = deque()  # the right side of the sequence, built from right to left

    while G:

      # Put all sinks at the end of the sequence.
      while True:
        sinks = [u for u in G if not u.outgoing]
        if not sinks: break
        for u in sinks:
          s2.appendleft(u)
          del G[u]
          for iu in u.incoming:
            del iu.outgoing[u]

      # Put all the sources at the beginning of the sequence.
      while True:
        sources = [u for u in G if not u.incoming]
        if not sources: break
        for u in sources:
          s1.append(u)
          del G[u]
          for iu in u.outgoing:
            del iu.incoming[u]

      if not G: break

      # Find the "best" vertex to put next.  "Best" is the one that
      # maximizes the net difference in source blocks saved we get by
      # pretending it's a source rather than a sink.

      max_d = None
      best_u = None
      for u in G:
        d = sum(u.outgoing.values()) - sum(u.incoming.values())
        if best_u is None or d > max_d:
          max_d = d
          best_u = u

      # Emit the best vertex as a source and detach it from the graph.
      u = best_u
      s1.append(u)
      del G[u]
      for iu in u.outgoing:
        del iu.incoming[u]
      for iu in u.incoming:
        del iu.outgoing[u]

    # Now record the sequence in the 'order' field of each transfer,
    # and by rearranging self.transfers to be in the chosen sequence.

    new_transfers = []
    for x in itertools.chain(s1, s2):
      x.order = len(new_transfers)
      new_transfers.append(x)
      del x.incoming
      del x.outgoing

    self.transfers = new_transfers
744
745 def GenerateDigraph(self):
746 print("Generating digraph...")
747 for a in self.transfers:
748 for b in self.transfers:
749 if a is b: continue
750
751 # If the blocks written by A are read by B, then B needs to go before A.
752 i = a.tgt_ranges.intersect(b.src_ranges)
753 if i:
Doug Zongkerab7ca1d2014-08-26 10:40:28 -0700754 if b.src_name == "__ZERO":
755 # the cost of removing source blocks for the __ZERO domain
756 # is (nearly) zero.
757 size = 0
758 else:
759 size = i.size()
Doug Zongkerfc44a512014-08-26 13:10:25 -0700760 b.goes_before[a] = size
761 a.goes_after[b] = size
762
763 def FindTransfers(self):
764 self.transfers = []
765 empty = RangeSet()
766 for tgt_fn, tgt_ranges in self.tgt.file_map.items():
767 if tgt_fn == "__ZERO":
768 # the special "__ZERO" domain is all the blocks not contained
769 # in any file and that are filled with zeros. We have a
770 # special transfer style for zero blocks.
771 src_ranges = self.src.file_map.get("__ZERO", empty)
Doug Zongkerab7ca1d2014-08-26 10:40:28 -0700772 Transfer(tgt_fn, "__ZERO", tgt_ranges, src_ranges,
773 "zero", self.transfers)
Doug Zongkerfc44a512014-08-26 13:10:25 -0700774 continue
775
776 elif tgt_fn in self.src.file_map:
777 # Look for an exact pathname match in the source.
778 Transfer(tgt_fn, tgt_fn, tgt_ranges, self.src.file_map[tgt_fn],
779 "diff", self.transfers)
780 continue
781
782 b = os.path.basename(tgt_fn)
783 if b in self.src_basenames:
784 # Look for an exact basename match in the source.
785 src_fn = self.src_basenames[b]
786 Transfer(tgt_fn, src_fn, tgt_ranges, self.src.file_map[src_fn],
787 "diff", self.transfers)
788 continue
789
790 b = re.sub("[0-9]+", "#", b)
791 if b in self.src_numpatterns:
792 # Look for a 'number pattern' match (a basename match after
793 # all runs of digits are replaced by "#"). (This is useful
794 # for .so files that contain version numbers in the filename
795 # that get bumped.)
796 src_fn = self.src_numpatterns[b]
797 Transfer(tgt_fn, src_fn, tgt_ranges, self.src.file_map[src_fn],
798 "diff", self.transfers)
799 continue
800
801 Transfer(tgt_fn, None, tgt_ranges, empty, "new", self.transfers)
802
803 def AbbreviateSourceNames(self):
804 self.src_basenames = {}
805 self.src_numpatterns = {}
806
807 for k in self.src.file_map.keys():
808 b = os.path.basename(k)
809 self.src_basenames[b] = k
810 b = re.sub("[0-9]+", "#", b)
811 self.src_numpatterns[b] = k
812
813 @staticmethod
814 def AssertPartition(total, seq):
815 """Assert that all the RangeSets in 'seq' form a partition of the
816 'total' RangeSet (ie, they are nonintersecting and their union
817 equals 'total')."""
818 so_far = RangeSet()
819 for i in seq:
820 assert not so_far.overlaps(i)
821 so_far = so_far.union(i)
822 assert so_far == total