blob: f031078a5ce0a21df5ad400b4781e1ad3520c247 [file] [log] [blame]
Doug Zongker424296a2014-09-02 08:53:09 -07001# Copyright (C) 2014 The Android Open Source Project
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
Doug Zongkerfc44a512014-08-26 13:10:25 -070015from __future__ import print_function
16
17from collections import deque, OrderedDict
18from hashlib import sha1
Doug Zongker62338182014-09-08 08:29:55 -070019import heapq
Doug Zongkerfc44a512014-08-26 13:10:25 -070020import itertools
21import multiprocessing
22import os
23import pprint
24import re
25import subprocess
26import sys
27import threading
28import tempfile
29
30from rangelib import *
31
Doug Zongkerab7ca1d2014-08-26 10:40:28 -070032__all__ = ["EmptyImage", "DataImage", "BlockImageDiff"]
33
def compute_patch(src, tgt, imgdiff=False):
  """Compute a binary patch turning the 'src' data into the 'tgt' data.

  'src' and 'tgt' are iterables of strings; their elements are
  concatenated to form the source and target contents.  The patch is
  produced by the external "imgdiff -z" tool when 'imgdiff' is True
  (for zip-format data), otherwise by the external "bsdiff" tool.

  Returns the patch as a string.  Raises ValueError if the external
  diff tool exits with a nonzero status.
  """
  srcfd, srcfile = tempfile.mkstemp(prefix="src-")
  tgtfd, tgtfile = tempfile.mkstemp(prefix="tgt-")
  patchfd, patchfile = tempfile.mkstemp(prefix="patch-")
  os.close(patchfd)

  try:
    with os.fdopen(srcfd, "wb") as f_src:
      for p in src:
        f_src.write(p)

    with os.fdopen(tgtfd, "wb") as f_tgt:
      for p in tgt:
        f_tgt.write(p)

    # The diff tool creates the patch file itself; remove the
    # placeholder that mkstemp made for us.
    try:
      os.unlink(patchfile)
    except OSError:
      pass

    if imgdiff:
      # Use os.devnull and a context manager so the sink file object
      # is always closed (the previous open("/dev/null", "a") leaked
      # a file descriptor on every call).
      with open(os.devnull, "a") as devnull:
        p = subprocess.call(["imgdiff", "-z", srcfile, tgtfile, patchfile],
                            stdout=devnull,
                            stderr=subprocess.STDOUT)
    else:
      p = subprocess.call(["bsdiff", srcfile, tgtfile, patchfile])

    if p:
      raise ValueError("diff failed: " + str(p))

    with open(patchfile, "rb") as f:
      return f.read()
  finally:
    # Best-effort cleanup of all three temporary files.
    try:
      os.unlink(srcfile)
      os.unlink(tgtfile)
      os.unlink(patchfile)
    except OSError:
      pass
71
class EmptyImage(object):
  """A zero-length image.

  Provides the image interface (blocksize, care_map, total_blocks,
  file_map, ReadRangeSet, TotalSha1) but contains no data; used as the
  source image when none is supplied to BlockImageDiff.
  """
  blocksize = 4096   # the updater always works on 4k blocks
  care_map = RangeSet()   # no blocks contain data
  total_blocks = 0
  file_map = {}
  def ReadRangeSet(self, ranges):
    # No data, so every read yields an empty sequence of pieces.
    return ()
  def TotalSha1(self):
    # SHA-1 of zero bytes of content.
    return sha1().hexdigest()
82
83
class DataImage(object):
  """An image wrapped around a single string of data."""

  def __init__(self, data, trim=False, pad=False):
    """Wrap 'data' as an image.

    If len(data) is not a multiple of the 4096-byte block size, 'trim'
    drops the partial trailing block and 'pad' zero-fills it to a full
    block; with neither, ValueError is raised.  'trim' and 'pad' are
    mutually exclusive.
    """
    self.data = data
    self.blocksize = 4096

    assert not (trim and pad)

    partial = len(self.data) % self.blocksize
    if partial > 0:
      if trim:
        self.data = self.data[:-partial]
      elif pad:
        self.data += '\0' * (self.blocksize - partial)
      else:
        raise ValueError(("data for DataImage must be multiple of %d bytes "
                          "unless trim or pad is specified") %
                         (self.blocksize,))

    assert len(self.data) % self.blocksize == 0

    # Use floor division ("//") so total_blocks is always an int; the
    # previous plain "/" relied on Python 2's truncating int division.
    self.total_blocks = len(self.data) // self.blocksize
    self.care_map = RangeSet(data=(0, self.total_blocks))

    # Classify each block as all-zero or not; the lists collect
    # start,end pairs in the flat form that RangeSet accepts.
    zero_blocks = []
    nonzero_blocks = []
    reference = '\0' * self.blocksize

    for i in range(self.total_blocks):
      d = self.data[i*self.blocksize : (i+1)*self.blocksize]
      if d == reference:
        zero_blocks.append(i)
        zero_blocks.append(i+1)
      else:
        nonzero_blocks.append(i)
        nonzero_blocks.append(i+1)

    self.file_map = {"__ZERO": RangeSet(zero_blocks),
                     "__NONZERO": RangeSet(nonzero_blocks)}

  def ReadRangeSet(self, ranges):
    # One string per (start, end) block range.
    return [self.data[s*self.blocksize:e*self.blocksize] for (s, e) in ranges]

  def TotalSha1(self):
    # Computed lazily and cached on the instance after first use.
    if not hasattr(self, "sha1"):
      self.sha1 = sha1(self.data).hexdigest()
    return self.sha1
132
Doug Zongkerfc44a512014-08-26 13:10:25 -0700133
class Transfer(object):
  """One step of the update: producing one range of target blocks,
  possibly by reading a range of source blocks.

  On construction the transfer appends itself to 'by_id' and records
  its position there as its 'id'.
  """

  def __init__(self, tgt_name, src_name, tgt_ranges, src_ranges, style, by_id):
    self.tgt_name = tgt_name
    self.src_name = src_name
    self.tgt_ranges = tgt_ranges
    self.src_ranges = src_ranges
    self.style = style

    # "Intact" means both range sets are monotonic, i.e. the file's
    # data is laid out on disk in increasing block order.
    self.intact = (getattr(tgt_ranges, "monotonic", False) and
                   getattr(src_ranges, "monotonic", False))

    # We use OrderedDict rather than dict so that the output is
    # repeatable; otherwise it would depend on the hash values of the
    # Transfer objects.
    self.goes_before = OrderedDict()
    self.goes_after = OrderedDict()

    # (stash id, RangeSet) pairs: stashes created just before this
    # transfer runs, and stashes it consumes.
    self.stash_before = []
    self.use_stash = []

    self.id = len(by_id)
    by_id.append(self)

  def NetStashChange(self):
    """Return the net number of blocks this transfer adds to the stash."""
    added = sum(sr.size() for _, sr in self.stash_before)
    removed = sum(sr.size() for _, sr in self.use_stash)
    return added - removed

  def __str__(self):
    return "%d: <%s %s to %s>" % (
        self.id, self.src_ranges, self.style, self.tgt_ranges)
162
163
164# BlockImageDiff works on two image objects. An image object is
165# anything that provides the following attributes:
166#
167# blocksize: the size in bytes of a block, currently must be 4096.
168#
169# total_blocks: the total size of the partition/image, in blocks.
170#
171# care_map: a RangeSet containing which blocks (in the range [0,
172# total_blocks) we actually care about; i.e. which blocks contain
173# data.
174#
175# file_map: a dict that partitions the blocks contained in care_map
176# into smaller domains that are useful for doing diffs on.
177# (Typically a domain is a file, and the key in file_map is the
178# pathname.)
179#
180# ReadRangeSet(): a function that takes a RangeSet and returns the
181# data contained in the image blocks of that RangeSet. The data
182# is returned as a list or tuple of strings; concatenating the
183# elements together should produce the requested data.
184# Implementations are free to break up the data into list/tuple
185# elements in any way that is convenient.
186#
Doug Zongkerab7ca1d2014-08-26 10:40:28 -0700187# TotalSha1(): a function that returns (as a hex string) the SHA-1
188# hash of all the data in the image (ie, all the blocks in the
189# care_map)
190#
Doug Zongkerfc44a512014-08-26 13:10:25 -0700191# When creating a BlockImageDiff, the src image may be None, in which
192# case the list of transfers produced will never read from the
193# original image.
194
class BlockImageDiff(object):
  def __init__(self, tgt, src=None, threads=None, version=2):
    """Set up a block-level diff from image 'src' to image 'tgt'.

    'src' may be None, in which case an EmptyImage is used and the
    resulting transfer list never reads from the original image.
    'threads' defaults to half the CPU count (minimum 1).  'version'
    selects the transfer-list format and must be 1 or 2.
    """
    if threads is None:
      # Default to half the available cores, but always at least one.
      threads = multiprocessing.cpu_count() // 2
      if threads == 0:
        threads = 1
    self.threads = threads
    self.version = version

    assert version in (1, 2)

    self.tgt = tgt
    # Diffing against an empty source makes this a full update.
    self.src = EmptyImage() if src is None else src

    # The updater code that installs the patch always uses 4k blocks.
    assert tgt.blocksize == 4096
    assert self.src.blocksize == 4096

    # The range sets in each filemap should comprise a partition of
    # the care map.
    self.AssertPartition(self.src.care_map, self.src.file_map.values())
    self.AssertPartition(tgt.care_map, tgt.file_map.values())
218
  def Compute(self, prefix):
    """Run the whole pipeline and write all output files.

    Produces '<prefix>.transfer.list', '<prefix>.new.dat' and
    '<prefix>.patch.dat'.
    """
    # When looking for a source file to use as the diff input for a
    # target file, we try:
    #   1) an exact path match if available, otherwise
    #   2) an exact basename match if available, otherwise
    #   3) a basename match after all runs of digits are replaced by
    #      "#" if available, otherwise
    #   4) we have no source for this target.
    self.AbbreviateSourceNames()
    self.FindTransfers()

    # Find the ordering dependencies among transfers (this is O(n^2)
    # in the number of transfers).
    self.GenerateDigraph()
    # Find a sequence of transfers that satisfies as many ordering
    # dependencies as possible (heuristically).
    self.FindVertexSequence()
    # Fix up the ordering dependencies that the sequence didn't
    # satisfy.
    if self.version == 1:
      # Version 1 cannot stash, so drop the offending source blocks.
      self.RemoveBackwardEdges()
    else:
      # Version 2 can stash, so reverse bad edges by stashing the
      # overlapping blocks, then re-sort to reduce stash usage.
      self.ReverseBackwardEdges()
      self.ImproveVertexSequence()

    # Double-check our work.
    self.AssertSequenceGood()

    self.ComputePatches(prefix)
    self.WriteTransfers(prefix)
249
250 def WriteTransfers(self, prefix):
251 out = []
252
Doug Zongkerfc44a512014-08-26 13:10:25 -0700253 total = 0
254 performs_read = False
255
Doug Zongker62338182014-09-08 08:29:55 -0700256 stashes = {}
257 stashed_blocks = 0
258 max_stashed_blocks = 0
259
260 free_stash_ids = []
261 next_stash_id = 0
262
Doug Zongkerfc44a512014-08-26 13:10:25 -0700263 for xf in self.transfers:
264
Doug Zongker62338182014-09-08 08:29:55 -0700265 if self.version < 2:
266 assert not xf.stash_before
267 assert not xf.use_stash
268
269 for s, sr in xf.stash_before:
270 assert s not in stashes
271 if free_stash_ids:
272 sid = heapq.heappop(free_stash_ids)
273 else:
274 sid = next_stash_id
275 next_stash_id += 1
276 stashes[s] = sid
277 stashed_blocks += sr.size()
278 out.append("stash %d %s\n" % (sid, sr.to_string_raw()))
279
280 if stashed_blocks > max_stashed_blocks:
281 max_stashed_blocks = stashed_blocks
282
Jesse Zhao7b985f62015-03-02 16:53:08 -0800283 free_string = []
284
Doug Zongker62338182014-09-08 08:29:55 -0700285 if self.version == 1:
286 src_string = xf.src_ranges.to_string_raw()
287 elif self.version == 2:
288
289 # <# blocks> <src ranges>
290 # OR
291 # <# blocks> <src ranges> <src locs> <stash refs...>
292 # OR
293 # <# blocks> - <stash refs...>
294
295 size = xf.src_ranges.size()
296 src_string = [str(size)]
297
298 unstashed_src_ranges = xf.src_ranges
299 mapped_stashes = []
300 for s, sr in xf.use_stash:
301 sid = stashes.pop(s)
302 stashed_blocks -= sr.size()
303 unstashed_src_ranges = unstashed_src_ranges.subtract(sr)
304 sr = xf.src_ranges.map_within(sr)
305 mapped_stashes.append(sr)
306 src_string.append("%d:%s" % (sid, sr.to_string_raw()))
307 heapq.heappush(free_stash_ids, sid)
308
309 if unstashed_src_ranges:
310 src_string.insert(1, unstashed_src_ranges.to_string_raw())
311 if xf.use_stash:
312 mapped_unstashed = xf.src_ranges.map_within(unstashed_src_ranges)
313 src_string.insert(2, mapped_unstashed.to_string_raw())
314 mapped_stashes.append(mapped_unstashed)
315 self.AssertPartition(RangeSet(data=(0, size)), mapped_stashes)
316 else:
317 src_string.insert(1, "-")
318 self.AssertPartition(RangeSet(data=(0, size)), mapped_stashes)
319
320 src_string = " ".join(src_string)
321
322 # both versions:
323 # zero <rangeset>
324 # new <rangeset>
325 # erase <rangeset>
326 #
327 # version 1:
328 # bsdiff patchstart patchlen <src rangeset> <tgt rangeset>
329 # imgdiff patchstart patchlen <src rangeset> <tgt rangeset>
330 # move <src rangeset> <tgt rangeset>
331 #
332 # version 2:
333 # bsdiff patchstart patchlen <tgt rangeset> <src_string>
334 # imgdiff patchstart patchlen <tgt rangeset> <src_string>
335 # move <tgt rangeset> <src_string>
Doug Zongkerfc44a512014-08-26 13:10:25 -0700336
337 tgt_size = xf.tgt_ranges.size()
338
339 if xf.style == "new":
340 assert xf.tgt_ranges
341 out.append("%s %s\n" % (xf.style, xf.tgt_ranges.to_string_raw()))
342 total += tgt_size
343 elif xf.style == "move":
344 performs_read = True
345 assert xf.tgt_ranges
346 assert xf.src_ranges.size() == tgt_size
347 if xf.src_ranges != xf.tgt_ranges:
Doug Zongker62338182014-09-08 08:29:55 -0700348 if self.version == 1:
349 out.append("%s %s %s\n" % (
350 xf.style,
351 xf.src_ranges.to_string_raw(), xf.tgt_ranges.to_string_raw()))
352 elif self.version == 2:
353 out.append("%s %s %s\n" % (
354 xf.style,
355 xf.tgt_ranges.to_string_raw(), src_string))
Doug Zongkerfc44a512014-08-26 13:10:25 -0700356 total += tgt_size
357 elif xf.style in ("bsdiff", "imgdiff"):
358 performs_read = True
359 assert xf.tgt_ranges
360 assert xf.src_ranges
Doug Zongker62338182014-09-08 08:29:55 -0700361 if self.version == 1:
362 out.append("%s %d %d %s %s\n" % (
363 xf.style, xf.patch_start, xf.patch_len,
364 xf.src_ranges.to_string_raw(), xf.tgt_ranges.to_string_raw()))
365 elif self.version == 2:
366 out.append("%s %d %d %s %s\n" % (
367 xf.style, xf.patch_start, xf.patch_len,
368 xf.tgt_ranges.to_string_raw(), src_string))
Doug Zongkerfc44a512014-08-26 13:10:25 -0700369 total += tgt_size
370 elif xf.style == "zero":
371 assert xf.tgt_ranges
372 to_zero = xf.tgt_ranges.subtract(xf.src_ranges)
373 if to_zero:
374 out.append("%s %s\n" % (xf.style, to_zero.to_string_raw()))
375 total += to_zero.size()
376 else:
377 raise ValueError, "unknown transfer style '%s'\n" % (xf.style,)
378
Doug Zongker62338182014-09-08 08:29:55 -0700379
380 # sanity check: abort if we're going to need more than 512 MB if
381 # stash space
382 assert max_stashed_blocks * self.tgt.blocksize < (512 << 20)
Doug Zongkerfc44a512014-08-26 13:10:25 -0700383
384 all_tgt = RangeSet(data=(0, self.tgt.total_blocks))
385 if performs_read:
386 # if some of the original data is used, then at the end we'll
387 # erase all the blocks on the partition that don't contain data
388 # in the new image.
389 new_dontcare = all_tgt.subtract(self.tgt.care_map)
390 if new_dontcare:
391 out.append("erase %s\n" % (new_dontcare.to_string_raw(),))
392 else:
393 # if nothing is read (ie, this is a full OTA), then we can start
394 # by erasing the entire partition.
Doug Zongkere985f6f2014-09-09 12:38:47 -0700395 out.insert(0, "erase %s\n" % (all_tgt.to_string_raw(),))
396
397 out.insert(0, "%d\n" % (self.version,)) # format version number
398 out.insert(1, str(total) + "\n")
399 if self.version >= 2:
400 # version 2 only: after the total block count, we give the number
401 # of stash slots needed, and the maximum size needed (in blocks)
402 out.insert(2, str(next_stash_id) + "\n")
403 out.insert(3, str(max_stashed_blocks) + "\n")
Doug Zongkerfc44a512014-08-26 13:10:25 -0700404
405 with open(prefix + ".transfer.list", "wb") as f:
406 for i in out:
407 f.write(i)
408
Doug Zongker62338182014-09-08 08:29:55 -0700409 if self.version >= 2:
410 print("max stashed blocks: %d (%d bytes)\n" % (
411 max_stashed_blocks, max_stashed_blocks * self.tgt.blocksize))
412
  def ComputePatches(self, prefix):
    """Resolve every "diff" transfer and write the data blobs.

    Writes the verbatim data of all "new" transfers to
    '<prefix>.new.dat' and the concatenated patches to
    '<prefix>.patch.dat', recording each transfer's patch_start and
    patch_len offsets into the latter.  "diff" transfers are
    reclassified as "move" (identical content), "imgdiff" or "bsdiff".
    Patch computation is spread over self.threads worker threads.
    """
    print("Reticulating splines...")
    # Work queue of (tgt_size, src, tgt, xf, patch_num) tuples.
    diff_q = []
    patch_num = 0
    with open(prefix + ".new.dat", "wb") as new_f:
      for xf in self.transfers:
        if xf.style == "zero":
          pass
        elif xf.style == "new":
          # No source to diff against: emit the data verbatim.
          for piece in self.tgt.ReadRangeSet(xf.tgt_ranges):
            new_f.write(piece)
        elif xf.style == "diff":
          src = self.src.ReadRangeSet(xf.src_ranges)
          tgt = self.tgt.ReadRangeSet(xf.tgt_ranges)

          # We can't compare src and tgt directly because they may have
          # the same content but be broken up into blocks differently, eg:
          #
          #   ["he", "llo"]  vs  ["h", "ello"]
          #
          # We want those to compare equal, ideally without having to
          # actually concatenate the strings (these may be tens of
          # megabytes).

          src_sha1 = sha1()
          for p in src:
            src_sha1.update(p)
          tgt_sha1 = sha1()
          tgt_size = 0
          for p in tgt:
            tgt_sha1.update(p)
            tgt_size += len(p)

          if src_sha1.digest() == tgt_sha1.digest():
            # These are identical; we don't need to generate a patch,
            # just issue copy commands on the device.
            xf.style = "move"
          else:
            # For files in zip format (eg, APKs, JARs, etc.) we would
            # like to use imgdiff -z if possible (because it usually
            # produces significantly smaller patches than bsdiff).
            # This is permissible if:
            #
            #  - the source and target files are monotonic (ie, the
            #    data is stored with blocks in increasing order), and
            #  - we haven't removed any blocks from the source set.
            #
            # If these conditions are satisfied then appending all the
            # blocks in the set together in order will produce a valid
            # zip file (plus possibly extra zeros in the last block),
            # which is what imgdiff needs to operate.  (imgdiff is
            # fine with extra zeros at the end of the file.)
            imgdiff = (xf.intact and
                       xf.tgt_name.split(".")[-1].lower()
                       in ("apk", "jar", "zip"))
            xf.style = "imgdiff" if imgdiff else "bsdiff"
            diff_q.append((tgt_size, src, tgt, xf, patch_num))
            patch_num += 1

        else:
          assert False, "unknown style " + xf.style

    if diff_q:
      if self.threads > 1:
        print("Computing patches (using %d threads)..." % (self.threads,))
      else:
        print("Computing patches...")
      # Sorted by target size; workers pop from the end, so the
      # largest jobs are started first.
      diff_q.sort()

      patches = [None] * patch_num

      # 'lock' guards both shared structures: diff_q and patches.
      lock = threading.Lock()
      def diff_worker():
        while True:
          with lock:
            if not diff_q: return
            tgt_size, src, tgt, xf, patchnum = diff_q.pop()
          patch = compute_patch(src, tgt, imgdiff=(xf.style == "imgdiff"))
          size = len(patch)
          with lock:
            patches[patchnum] = (patch, xf)
            print("%10d %10d (%6.2f%%) %7s %s" % (
                size, tgt_size, size * 100.0 / tgt_size, xf.style,
                xf.tgt_name if xf.tgt_name == xf.src_name else (
                    xf.tgt_name + " (from " + xf.src_name + ")")))

      threads = [threading.Thread(target=diff_worker)
                 for i in range(self.threads)]
      for th in threads:
        th.start()
      while threads:
        threads.pop().join()
    else:
      patches = []

    # Concatenate all patches into .patch.dat, recording offsets.
    p = 0
    with open(prefix + ".patch.dat", "wb") as patch_f:
      for patch, xf in patches:
        xf.patch_start = p
        xf.patch_len = len(patch)
        patch_f.write(patch)
        p += len(patch)
515
516 def AssertSequenceGood(self):
517 # Simulate the sequences of transfers we will output, and check that:
518 # - we never read a block after writing it, and
519 # - we write every block we care about exactly once.
520
521 # Start with no blocks having been touched yet.
522 touched = RangeSet()
523
524 # Imagine processing the transfers in order.
525 for xf in self.transfers:
526 # Check that the input blocks for this transfer haven't yet been touched.
Doug Zongker62338182014-09-08 08:29:55 -0700527
528 x = xf.src_ranges
529 if self.version >= 2:
530 for _, sr in xf.use_stash:
531 x = x.subtract(sr)
532
533 assert not touched.overlaps(x)
Doug Zongkerfc44a512014-08-26 13:10:25 -0700534 # Check that the output blocks for this transfer haven't yet been touched.
535 assert not touched.overlaps(xf.tgt_ranges)
536 # Touch all the blocks written by this transfer.
537 touched = touched.union(xf.tgt_ranges)
538
539 # Check that we've written every target block.
540 assert touched == self.tgt.care_map
541
Doug Zongker62338182014-09-08 08:29:55 -0700542 def ImproveVertexSequence(self):
543 print("Improving vertex order...")
544
545 # At this point our digraph is acyclic; we reversed any edges that
546 # were backwards in the heuristically-generated sequence. The
547 # previously-generated order is still acceptable, but we hope to
548 # find a better order that needs less memory for stashed data.
549 # Now we do a topological sort to generate a new vertex order,
550 # using a greedy algorithm to choose which vertex goes next
551 # whenever we have a choice.
552
553 # Make a copy of the edge set; this copy will get destroyed by the
554 # algorithm.
555 for xf in self.transfers:
556 xf.incoming = xf.goes_after.copy()
557 xf.outgoing = xf.goes_before.copy()
558
559 L = [] # the new vertex order
560
561 # S is the set of sources in the remaining graph; we always choose
562 # the one that leaves the least amount of stashed data after it's
563 # executed.
564 S = [(u.NetStashChange(), u.order, u) for u in self.transfers
565 if not u.incoming]
566 heapq.heapify(S)
567
568 while S:
569 _, _, xf = heapq.heappop(S)
570 L.append(xf)
571 for u in xf.outgoing:
572 del u.incoming[xf]
573 if not u.incoming:
574 heapq.heappush(S, (u.NetStashChange(), u.order, u))
575
576 # if this fails then our graph had a cycle.
577 assert len(L) == len(self.transfers)
578
579 self.transfers = L
580 for i, xf in enumerate(L):
581 xf.order = i
582
Doug Zongkerfc44a512014-08-26 13:10:25 -0700583 def RemoveBackwardEdges(self):
584 print("Removing backward edges...")
585 in_order = 0
586 out_of_order = 0
587 lost_source = 0
588
589 for xf in self.transfers:
Doug Zongkerfc44a512014-08-26 13:10:25 -0700590 lost = 0
591 size = xf.src_ranges.size()
592 for u in xf.goes_before:
593 # xf should go before u
594 if xf.order < u.order:
595 # it does, hurray!
Doug Zongker62338182014-09-08 08:29:55 -0700596 in_order += 1
Doug Zongkerfc44a512014-08-26 13:10:25 -0700597 else:
598 # it doesn't, boo. trim the blocks that u writes from xf's
599 # source, so that xf can go after u.
Doug Zongker62338182014-09-08 08:29:55 -0700600 out_of_order += 1
Doug Zongkerfc44a512014-08-26 13:10:25 -0700601 assert xf.src_ranges.overlaps(u.tgt_ranges)
602 xf.src_ranges = xf.src_ranges.subtract(u.tgt_ranges)
603 xf.intact = False
604
605 if xf.style == "diff" and not xf.src_ranges:
606 # nothing left to diff from; treat as new data
607 xf.style = "new"
608
609 lost = size - xf.src_ranges.size()
610 lost_source += lost
Doug Zongkerfc44a512014-08-26 13:10:25 -0700611
612 print((" %d/%d dependencies (%.2f%%) were violated; "
613 "%d source blocks removed.") %
614 (out_of_order, in_order + out_of_order,
615 (out_of_order * 100.0 / (in_order + out_of_order))
616 if (in_order + out_of_order) else 0.0,
617 lost_source))
618
Doug Zongker62338182014-09-08 08:29:55 -0700619 def ReverseBackwardEdges(self):
620 print("Reversing backward edges...")
621 in_order = 0
622 out_of_order = 0
623 stashes = 0
624 stash_size = 0
625
626 for xf in self.transfers:
627 lost = 0
628 size = xf.src_ranges.size()
629 for u in xf.goes_before.copy():
630 # xf should go before u
631 if xf.order < u.order:
632 # it does, hurray!
633 in_order += 1
634 else:
635 # it doesn't, boo. modify u to stash the blocks that it
636 # writes that xf wants to read, and then require u to go
637 # before xf.
638 out_of_order += 1
639
640 overlap = xf.src_ranges.intersect(u.tgt_ranges)
641 assert overlap
642
643 u.stash_before.append((stashes, overlap))
644 xf.use_stash.append((stashes, overlap))
645 stashes += 1
646 stash_size += overlap.size()
647
648 # reverse the edge direction; now xf must go after u
649 del xf.goes_before[u]
650 del u.goes_after[xf]
651 xf.goes_after[u] = None # value doesn't matter
652 u.goes_before[xf] = None
653
654 print((" %d/%d dependencies (%.2f%%) were violated; "
655 "%d source blocks stashed.") %
656 (out_of_order, in_order + out_of_order,
657 (out_of_order * 100.0 / (in_order + out_of_order))
658 if (in_order + out_of_order) else 0.0,
659 stash_size))
660
  def FindVertexSequence(self):
    """Order self.transfers to heuristically minimize violated edges.

    Assigns each transfer an 'order' index and rearranges
    self.transfers into that sequence.  The edges that still point
    backwards afterwards are repaired by RemoveBackwardEdges or
    ReverseBackwardEdges.
    """
    print("Finding vertex sequence...")

    # This is based on "A Fast & Effective Heuristic for the Feedback
    # Arc Set Problem" by P. Eades, X. Lin, and W.F. Smyth.  Think of
    # it as starting with the digraph G and moving all the vertices to
    # be on a horizontal line in some order, trying to minimize the
    # number of edges that end up pointing to the left.  Left-pointing
    # edges will get removed to turn the digraph into a DAG.  In this
    # case each edge has a weight which is the number of source blocks
    # we'll lose if that edge is removed; we try to minimize the total
    # weight rather than just the number of edges.

    # Make a copy of the edge set; this copy will get destroyed by the
    # algorithm.
    for xf in self.transfers:
      xf.incoming = xf.goes_after.copy()
      xf.outgoing = xf.goes_before.copy()

    # We use an OrderedDict instead of just a set so that the output
    # is repeatable; otherwise it would depend on the hash values of
    # the transfer objects.
    G = OrderedDict()
    for xf in self.transfers:
      G[xf] = None
    s1 = deque()  # the left side of the sequence, built from left to right
    s2 = deque()  # the right side of the sequence, built from right to left

    while G:

      # Put all sinks at the end of the sequence.
      while True:
        sinks = [u for u in G if not u.outgoing]
        if not sinks: break
        for u in sinks:
          s2.appendleft(u)
          del G[u]
          for iu in u.incoming:
            del iu.outgoing[u]

      # Put all the sources at the beginning of the sequence.
      while True:
        sources = [u for u in G if not u.incoming]
        if not sources: break
        for u in sources:
          s1.append(u)
          del G[u]
          for iu in u.outgoing:
            del iu.incoming[u]

      if not G: break

      # Find the "best" vertex to put next.  "Best" is the one that
      # maximizes the net difference in source blocks saved we get by
      # pretending it's a source rather than a sink.

      max_d = None
      best_u = None
      for u in G:
        d = sum(u.outgoing.values()) - sum(u.incoming.values())
        if best_u is None or d > max_d:
          max_d = d
          best_u = u

      # Emit the best vertex as a source and detach it from the graph.
      u = best_u
      s1.append(u)
      del G[u]
      for iu in u.outgoing:
        del iu.incoming[u]
      for iu in u.incoming:
        del iu.outgoing[u]

    # Now record the sequence in the 'order' field of each transfer,
    # and by rearranging self.transfers to be in the chosen sequence.

    new_transfers = []
    for x in itertools.chain(s1, s2):
      x.order = len(new_transfers)
      new_transfers.append(x)
      del x.incoming
      del x.outgoing

    self.transfers = new_transfers
744
745 def GenerateDigraph(self):
746 print("Generating digraph...")
747 for a in self.transfers:
748 for b in self.transfers:
749 if a is b: continue
750
751 # If the blocks written by A are read by B, then B needs to go before A.
752 i = a.tgt_ranges.intersect(b.src_ranges)
753 if i:
Doug Zongkerab7ca1d2014-08-26 10:40:28 -0700754 if b.src_name == "__ZERO":
755 # the cost of removing source blocks for the __ZERO domain
756 # is (nearly) zero.
757 size = 0
758 else:
759 size = i.size()
Doug Zongkerfc44a512014-08-26 13:10:25 -0700760 b.goes_before[a] = size
761 a.goes_after[b] = size
762
763 def FindTransfers(self):
764 self.transfers = []
765 empty = RangeSet()
766 for tgt_fn, tgt_ranges in self.tgt.file_map.items():
767 if tgt_fn == "__ZERO":
768 # the special "__ZERO" domain is all the blocks not contained
769 # in any file and that are filled with zeros. We have a
770 # special transfer style for zero blocks.
771 src_ranges = self.src.file_map.get("__ZERO", empty)
Doug Zongkerab7ca1d2014-08-26 10:40:28 -0700772 Transfer(tgt_fn, "__ZERO", tgt_ranges, src_ranges,
773 "zero", self.transfers)
Doug Zongkerfc44a512014-08-26 13:10:25 -0700774 continue
775
776 elif tgt_fn in self.src.file_map:
777 # Look for an exact pathname match in the source.
778 Transfer(tgt_fn, tgt_fn, tgt_ranges, self.src.file_map[tgt_fn],
779 "diff", self.transfers)
780 continue
781
782 b = os.path.basename(tgt_fn)
783 if b in self.src_basenames:
784 # Look for an exact basename match in the source.
785 src_fn = self.src_basenames[b]
786 Transfer(tgt_fn, src_fn, tgt_ranges, self.src.file_map[src_fn],
787 "diff", self.transfers)
788 continue
789
790 b = re.sub("[0-9]+", "#", b)
791 if b in self.src_numpatterns:
792 # Look for a 'number pattern' match (a basename match after
793 # all runs of digits are replaced by "#"). (This is useful
794 # for .so files that contain version numbers in the filename
795 # that get bumped.)
796 src_fn = self.src_numpatterns[b]
797 Transfer(tgt_fn, src_fn, tgt_ranges, self.src.file_map[src_fn],
798 "diff", self.transfers)
799 continue
800
801 Transfer(tgt_fn, None, tgt_ranges, empty, "new", self.transfers)
802
803 def AbbreviateSourceNames(self):
804 self.src_basenames = {}
805 self.src_numpatterns = {}
806
807 for k in self.src.file_map.keys():
808 b = os.path.basename(k)
809 self.src_basenames[b] = k
810 b = re.sub("[0-9]+", "#", b)
811 self.src_numpatterns[b] = k
812
813 @staticmethod
814 def AssertPartition(total, seq):
815 """Assert that all the RangeSets in 'seq' form a partition of the
816 'total' RangeSet (ie, they are nonintersecting and their union
817 equals 'total')."""
818 so_far = RangeSet()
819 for i in seq:
820 assert not so_far.overlaps(i)
821 so_far = so_far.union(i)
822 assert so_far == total