Blame - tools/releasetools/blockimgdiff.py - platform_build_make

blob: 5b5c4cce9200258be9fda81acf34e0fdda5b0f71 [file] [log] [blame]

Doug Zongker	424296a	2014-09-02 08:53:09 -0700	[diff] [blame]	1	# Copyright (C) 2014 The Android Open Source Project
				2	#
				3	# Licensed under the Apache License, Version 2.0 (the "License");
				4	# you may not use this file except in compliance with the License.
				5	# You may obtain a copy of the License at
				6	#
				7	# http://www.apache.org/licenses/LICENSE-2.0
				8	#
				9	# Unless required by applicable law or agreed to in writing, software
				10	# distributed under the License is distributed on an "AS IS" BASIS,
				11	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				12	# See the License for the specific language governing permissions and
				13	# limitations under the License.
				14
Doug Zongker	fc44a51	2014-08-26 13:10:25 -0700	[diff] [blame]	15	from __future__ import print_function
				16
				17	from collections import deque, OrderedDict
				18	from hashlib import sha1
Doug Zongker	6233818	2014-09-08 08:29:55 -0700	[diff] [blame]	19	import heapq
Doug Zongker	fc44a51	2014-08-26 13:10:25 -0700	[diff] [blame]	20	import itertools
				21	import multiprocessing
				22	import os
				23	import pprint
				24	import re
				25	import subprocess
				26	import sys
				27	import threading
				28	import tempfile
				29
				30	from rangelib import *
				31
Doug Zongker	ab7ca1d	2014-08-26 10:40:28 -0700	[diff] [blame]	32	__all__ = ["EmptyImage", "DataImage", "BlockImageDiff"]
				33
def compute_patch(src, tgt, imgdiff=False):
  """Compute a binary patch that transforms the data in src into tgt.

  The pieces of src and tgt are written to temporary files, the external
  diff tool is run on them, and the resulting patch file is read back.

  Args:
    src: iterable of byte strings; concatenated they form the source data.
    tgt: iterable of byte strings; concatenated they form the target data.
    imgdiff: if true, run "imgdiff -z"; otherwise run "bsdiff".

  Returns:
    The contents of the patch file written by the diff tool.

  Raises:
    ValueError: if the diff tool returns a nonzero exit status.
  """
  srcfd, srcfile = tempfile.mkstemp(prefix="src-")
  tgtfd, tgtfile = tempfile.mkstemp(prefix="tgt-")
  patchfd, patchfile = tempfile.mkstemp(prefix="patch-")
  os.close(patchfd)

  try:
    # Wrap both descriptors before writing anything, so that an error
    # while writing the source data still closes the target descriptor.
    with os.fdopen(srcfd, "wb") as f_src, os.fdopen(tgtfd, "wb") as f_tgt:
      for p in src:
        f_src.write(p)
      for p in tgt:
        f_tgt.write(p)

    # mkstemp() created the patch file only to reserve a unique name;
    # remove it so the diff tool creates the file itself.
    try:
      os.unlink(patchfile)
    except OSError:
      pass

    if imgdiff:
      # Discard imgdiff's output, and close the null device afterwards
      # instead of leaking one open file per patch.
      with open(os.devnull, "wb") as devnull:
        p = subprocess.call(["imgdiff", "-z", srcfile, tgtfile, patchfile],
                            stdout=devnull,
                            stderr=subprocess.STDOUT)
    else:
      p = subprocess.call(["bsdiff", srcfile, tgtfile, patchfile])

    if p:
      raise ValueError("diff failed: " + str(p))

    with open(patchfile, "rb") as f:
      return f.read()
  finally:
    # Best-effort cleanup.  Each file is removed independently so that a
    # failure on one of them does not leave the others behind.
    for name in (srcfile, tgtfile, patchfile):
      try:
        os.unlink(name)
      except OSError:
        pass
				71
				72	class EmptyImage(object):
				73	"""A zero-length image."""
				74	blocksize = 4096
				75	care_map = RangeSet()
				76	total_blocks = 0
				77	file_map = {}
				78	def ReadRangeSet(self, ranges):
				79	return ()
Doug Zongker	ab7ca1d	2014-08-26 10:40:28 -0700	[diff] [blame]	80	def TotalSha1(self):
				81	return sha1().hexdigest()
				82
				83
				84	class DataImage(object):
				85	"""An image wrapped around a single string of data."""
				86
				87	def __init__(self, data, trim=False, pad=False):
				88	self.data = data
				89	self.blocksize = 4096
				90
				91	assert not (trim and pad)
				92
				93	partial = len(self.data) % self.blocksize
				94	if partial > 0:
				95	if trim:
				96	self.data = self.data[:-partial]
				97	elif pad:
				98	self.data += '\0' * (self.blocksize - partial)
				99	else:
				100	raise ValueError(("data for DataImage must be multiple of %d bytes "
				101	"unless trim or pad is specified") %
				102	(self.blocksize,))
				103
				104	assert len(self.data) % self.blocksize == 0
				105
				106	self.total_blocks = len(self.data) / self.blocksize
				107	self.care_map = RangeSet(data=(0, self.total_blocks))
				108
				109	zero_blocks = []
				110	nonzero_blocks = []
				111	reference = '\0' * self.blocksize
				112
				113	for i in range(self.total_blocks):
				114	d = self.data[iself.blocksize : (i+1)self.blocksize]
				115	if d == reference:
				116	zero_blocks.append(i)
				117	zero_blocks.append(i+1)
				118	else:
				119	nonzero_blocks.append(i)
				120	nonzero_blocks.append(i+1)
				121
				122	self.file_map = {"__ZERO": RangeSet(zero_blocks),
				123	"__NONZERO": RangeSet(nonzero_blocks)}
				124
				125	def ReadRangeSet(self, ranges):
				126	return [self.data[sself.blocksize:eself.blocksize] for (s, e) in ranges]
				127
				128	def TotalSha1(self):
				129	if not hasattr(self, "sha1"):
				130	self.sha1 = sha1(self.data).hexdigest()
				131	return self.sha1
				132
Doug Zongker	fc44a51	2014-08-26 13:10:25 -0700	[diff] [blame]	133
				134	class Transfer(object):
				135	def __init__(self, tgt_name, src_name, tgt_ranges, src_ranges, style, by_id):
				136	self.tgt_name = tgt_name
				137	self.src_name = src_name
				138	self.tgt_ranges = tgt_ranges
				139	self.src_ranges = src_ranges
				140	self.style = style
				141	self.intact = (getattr(tgt_ranges, "monotonic", False) and
				142	getattr(src_ranges, "monotonic", False))
Tao Bao	b8c8717	2015-03-19 19:42:12 -0700	[diff] [blame^]	143
				144	# We use OrderedDict rather than dict so that the output is repeatable;
				145	# otherwise it would depend on the hash values of the Transfer objects.
				146	self.goes_before = OrderedDict()
				147	self.goes_after = OrderedDict()
Doug Zongker	fc44a51	2014-08-26 13:10:25 -0700	[diff] [blame]	148
Doug Zongker	6233818	2014-09-08 08:29:55 -0700	[diff] [blame]	149	self.stash_before = []
				150	self.use_stash = []
				151
Doug Zongker	fc44a51	2014-08-26 13:10:25 -0700	[diff] [blame]	152	self.id = len(by_id)
				153	by_id.append(self)
				154
Doug Zongker	6233818	2014-09-08 08:29:55 -0700	[diff] [blame]	155	def NetStashChange(self):
				156	return (sum(sr.size() for (_, sr) in self.stash_before) -
				157	sum(sr.size() for (_, sr) in self.use_stash))
				158
Doug Zongker	fc44a51	2014-08-26 13:10:25 -0700	[diff] [blame]	159	def __str__(self):
				160	return (str(self.id) + ": <" + str(self.src_ranges) + " " + self.style +
				161	" to " + str(self.tgt_ranges) + ">")
				162
				163
# BlockImageDiff works on two image objects.  An image object is
# anything that provides the following attributes:
#
#    blocksize: the size in bytes of a block, currently must be 4096.
#
#    total_blocks: the total size of the partition/image, in blocks.
#
#    care_map: a RangeSet containing which blocks (in the range [0,
#      total_blocks)) we actually care about; i.e., which blocks contain
#      data.
#
#    file_map: a dict that partitions the blocks contained in care_map
#      into smaller domains that are useful for doing diffs on.
#      (Typically a domain is a file, and the key in file_map is the
#      pathname.)
#
#    ReadRangeSet(): a function that takes a RangeSet and returns the
#      data contained in the image blocks of that RangeSet.  The data
#      is returned as a list or tuple of strings; concatenating the
#      elements together should produce the requested data.
#      Implementations are free to break up the data into list/tuple
#      elements in any way that is convenient.
#
#    TotalSha1(): a function that returns (as a hex string) the SHA-1
#      hash of all the data in the image (i.e., all the blocks in the
#      care_map).
#
# When creating a BlockImageDiff, the src image may be None, in which
# case the list of transfers produced will never read from the
# original image.
				194
				195	class BlockImageDiff(object):
Sami Tolvanen	dd67a29	2014-12-09 16:40:34 +0000	[diff] [blame]	196	def __init__(self, tgt, src=None, threads=None, version=3):
Doug Zongker	fc44a51	2014-08-26 13:10:25 -0700	[diff] [blame]	197	if threads is None:
				198	threads = multiprocessing.cpu_count() // 2
				199	if threads == 0: threads = 1
				200	self.threads = threads
Doug Zongker	6233818	2014-09-08 08:29:55 -0700	[diff] [blame]	201	self.version = version
				202
Sami Tolvanen	dd67a29	2014-12-09 16:40:34 +0000	[diff] [blame]	203	assert version in (1, 2, 3)
Doug Zongker	fc44a51	2014-08-26 13:10:25 -0700	[diff] [blame]	204
				205	self.tgt = tgt
				206	if src is None:
				207	src = EmptyImage()
				208	self.src = src
				209
				210	# The updater code that installs the patch always uses 4k blocks.
				211	assert tgt.blocksize == 4096
				212	assert src.blocksize == 4096
				213
				214	# The range sets in each filemap should comprise a partition of
				215	# the care map.
				216	self.AssertPartition(src.care_map, src.file_map.values())
				217	self.AssertPartition(tgt.care_map, tgt.file_map.values())
				218
				219	def Compute(self, prefix):
				220	# When looking for a source file to use as the diff input for a
				221	# target file, we try:
				222	# 1) an exact path match if available, otherwise
				223	# 2) a exact basename match if available, otherwise
				224	# 3) a basename match after all runs of digits are replaced by
				225	# "#" if available, otherwise
				226	# 4) we have no source for this target.
				227	self.AbbreviateSourceNames()
				228	self.FindTransfers()
				229
				230	# Find the ordering dependencies among transfers (this is O(n^2)
				231	# in the number of transfers).
				232	self.GenerateDigraph()
				233	# Find a sequence of transfers that satisfies as many ordering
				234	# dependencies as possible (heuristically).
				235	self.FindVertexSequence()
				236	# Fix up the ordering dependencies that the sequence didn't
				237	# satisfy.
Doug Zongker	6233818	2014-09-08 08:29:55 -0700	[diff] [blame]	238	if self.version == 1:
				239	self.RemoveBackwardEdges()
				240	else:
				241	self.ReverseBackwardEdges()
				242	self.ImproveVertexSequence()
				243
Doug Zongker	fc44a51	2014-08-26 13:10:25 -0700	[diff] [blame]	244	# Double-check our work.
				245	self.AssertSequenceGood()
				246
				247	self.ComputePatches(prefix)
				248	self.WriteTransfers(prefix)
				249
Sami Tolvanen	dd67a29	2014-12-09 16:40:34 +0000	[diff] [blame]	250	def HashBlocks(self, source, ranges):
				251	data = source.ReadRangeSet(ranges)
				252	ctx = sha1()
				253
				254	for p in data:
				255	ctx.update(p)
				256
				257	return ctx.hexdigest()
				258
Doug Zongker	fc44a51	2014-08-26 13:10:25 -0700	[diff] [blame]	259	def WriteTransfers(self, prefix):
				260	out = []
				261
Doug Zongker	fc44a51	2014-08-26 13:10:25 -0700	[diff] [blame]	262	total = 0
				263	performs_read = False
				264
Doug Zongker	6233818	2014-09-08 08:29:55 -0700	[diff] [blame]	265	stashes = {}
				266	stashed_blocks = 0
				267	max_stashed_blocks = 0
				268
				269	free_stash_ids = []
				270	next_stash_id = 0
				271
Doug Zongker	fc44a51	2014-08-26 13:10:25 -0700	[diff] [blame]	272	for xf in self.transfers:
				273
Doug Zongker	6233818	2014-09-08 08:29:55 -0700	[diff] [blame]	274	if self.version < 2:
				275	assert not xf.stash_before
				276	assert not xf.use_stash
				277
				278	for s, sr in xf.stash_before:
				279	assert s not in stashes
				280	if free_stash_ids:
				281	sid = heapq.heappop(free_stash_ids)
				282	else:
				283	sid = next_stash_id
				284	next_stash_id += 1
				285	stashes[s] = sid
				286	stashed_blocks += sr.size()
Sami Tolvanen	dd67a29	2014-12-09 16:40:34 +0000	[diff] [blame]	287	if self.version == 2:
				288	out.append("stash %d %s\n" % (sid, sr.to_string_raw()))
				289	else:
				290	sh = self.HashBlocks(self.src, sr)
				291	if sh in stashes:
				292	stashes[sh] += 1
				293	else:
				294	stashes[sh] = 1
				295	out.append("stash %s %s\n" % (sh, sr.to_string_raw()))
Doug Zongker	6233818	2014-09-08 08:29:55 -0700	[diff] [blame]	296
				297	if stashed_blocks > max_stashed_blocks:
				298	max_stashed_blocks = stashed_blocks
				299
Jesse Zhao	7b985f6	2015-03-02 16:53:08 -0800	[diff] [blame]	300	free_string = []
				301
Doug Zongker	6233818	2014-09-08 08:29:55 -0700	[diff] [blame]	302	if self.version == 1:
				303	src_string = xf.src_ranges.to_string_raw()
Sami Tolvanen	dd67a29	2014-12-09 16:40:34 +0000	[diff] [blame]	304	elif self.version >= 2:
Doug Zongker	6233818	2014-09-08 08:29:55 -0700	[diff] [blame]	305
				306	# <# blocks> <src ranges>
				307	# OR
				308	# <# blocks> <src ranges> <src locs> <stash refs...>
				309	# OR
				310	# <# blocks> - <stash refs...>
				311
				312	size = xf.src_ranges.size()
				313	src_string = [str(size)]
				314
				315	unstashed_src_ranges = xf.src_ranges
				316	mapped_stashes = []
				317	for s, sr in xf.use_stash:
				318	sid = stashes.pop(s)
				319	stashed_blocks -= sr.size()
				320	unstashed_src_ranges = unstashed_src_ranges.subtract(sr)
Sami Tolvanen	dd67a29	2014-12-09 16:40:34 +0000	[diff] [blame]	321	sh = self.HashBlocks(self.src, sr)
Doug Zongker	6233818	2014-09-08 08:29:55 -0700	[diff] [blame]	322	sr = xf.src_ranges.map_within(sr)
				323	mapped_stashes.append(sr)
Sami Tolvanen	dd67a29	2014-12-09 16:40:34 +0000	[diff] [blame]	324	if self.version == 2:
				325	src_string.append("%d:%s" % (sid, sr.to_string_raw()))
				326	else:
				327	assert sh in stashes
				328	src_string.append("%s:%s" % (sh, sr.to_string_raw()))
				329	stashes[sh] -= 1
				330	if stashes[sh] == 0:
				331	free_string.append("free %s\n" % (sh))
				332	stashes.pop(sh)
Doug Zongker	6233818	2014-09-08 08:29:55 -0700	[diff] [blame]	333	heapq.heappush(free_stash_ids, sid)
				334
				335	if unstashed_src_ranges:
				336	src_string.insert(1, unstashed_src_ranges.to_string_raw())
				337	if xf.use_stash:
				338	mapped_unstashed = xf.src_ranges.map_within(unstashed_src_ranges)
				339	src_string.insert(2, mapped_unstashed.to_string_raw())
				340	mapped_stashes.append(mapped_unstashed)
				341	self.AssertPartition(RangeSet(data=(0, size)), mapped_stashes)
				342	else:
				343	src_string.insert(1, "-")
				344	self.AssertPartition(RangeSet(data=(0, size)), mapped_stashes)
				345
				346	src_string = " ".join(src_string)
				347
Sami Tolvanen	dd67a29	2014-12-09 16:40:34 +0000	[diff] [blame]	348	# all versions:
Doug Zongker	6233818	2014-09-08 08:29:55 -0700	[diff] [blame]	349	# zero <rangeset>
				350	# new <rangeset>
				351	# erase <rangeset>
				352	#
				353	# version 1:
				354	# bsdiff patchstart patchlen <src rangeset> <tgt rangeset>
				355	# imgdiff patchstart patchlen <src rangeset> <tgt rangeset>
				356	# move <src rangeset> <tgt rangeset>
				357	#
				358	# version 2:
				359	# bsdiff patchstart patchlen <tgt rangeset> <src_string>
				360	# imgdiff patchstart patchlen <tgt rangeset> <src_string>
				361	# move <tgt rangeset> <src_string>
Sami Tolvanen	dd67a29	2014-12-09 16:40:34 +0000	[diff] [blame]	362	#
				363	# version 3:
				364	# bsdiff patchstart patchlen srchash tgthash <tgt rangeset> <src_string>
				365	# imgdiff patchstart patchlen srchash tgthash <tgt rangeset> <src_string>
				366	# move hash <tgt rangeset> <src_string>
Doug Zongker	fc44a51	2014-08-26 13:10:25 -0700	[diff] [blame]	367
				368	tgt_size = xf.tgt_ranges.size()
				369
				370	if xf.style == "new":
				371	assert xf.tgt_ranges
				372	out.append("%s %s\n" % (xf.style, xf.tgt_ranges.to_string_raw()))
				373	total += tgt_size
				374	elif xf.style == "move":
				375	performs_read = True
				376	assert xf.tgt_ranges
				377	assert xf.src_ranges.size() == tgt_size
				378	if xf.src_ranges != xf.tgt_ranges:
Doug Zongker	6233818	2014-09-08 08:29:55 -0700	[diff] [blame]	379	if self.version == 1:
				380	out.append("%s %s %s\n" % (
				381	xf.style,
				382	xf.src_ranges.to_string_raw(), xf.tgt_ranges.to_string_raw()))
				383	elif self.version == 2:
				384	out.append("%s %s %s\n" % (
				385	xf.style,
				386	xf.tgt_ranges.to_string_raw(), src_string))
Sami Tolvanen	dd67a29	2014-12-09 16:40:34 +0000	[diff] [blame]	387	elif self.version >= 3:
				388	out.append("%s %s %s %s\n" % (
				389	xf.style,
				390	self.HashBlocks(self.tgt, xf.tgt_ranges),
				391	xf.tgt_ranges.to_string_raw(), src_string))
Doug Zongker	fc44a51	2014-08-26 13:10:25 -0700	[diff] [blame]	392	total += tgt_size
				393	elif xf.style in ("bsdiff", "imgdiff"):
				394	performs_read = True
				395	assert xf.tgt_ranges
				396	assert xf.src_ranges
Doug Zongker	6233818	2014-09-08 08:29:55 -0700	[diff] [blame]	397	if self.version == 1:
				398	out.append("%s %d %d %s %s\n" % (
				399	xf.style, xf.patch_start, xf.patch_len,
				400	xf.src_ranges.to_string_raw(), xf.tgt_ranges.to_string_raw()))
				401	elif self.version == 2:
				402	out.append("%s %d %d %s %s\n" % (
				403	xf.style, xf.patch_start, xf.patch_len,
				404	xf.tgt_ranges.to_string_raw(), src_string))
Sami Tolvanen	dd67a29	2014-12-09 16:40:34 +0000	[diff] [blame]	405	elif self.version >= 3:
				406	out.append("%s %d %d %s %s %s %s\n" % (
				407	xf.style,
				408	xf.patch_start, xf.patch_len,
				409	self.HashBlocks(self.src, xf.src_ranges),
				410	self.HashBlocks(self.tgt, xf.tgt_ranges),
				411	xf.tgt_ranges.to_string_raw(), src_string))
Doug Zongker	fc44a51	2014-08-26 13:10:25 -0700	[diff] [blame]	412	total += tgt_size
				413	elif xf.style == "zero":
				414	assert xf.tgt_ranges
				415	to_zero = xf.tgt_ranges.subtract(xf.src_ranges)
				416	if to_zero:
				417	out.append("%s %s\n" % (xf.style, to_zero.to_string_raw()))
				418	total += to_zero.size()
				419	else:
				420	raise ValueError, "unknown transfer style '%s'\n" % (xf.style,)
				421
Sami Tolvanen	dd67a29	2014-12-09 16:40:34 +0000	[diff] [blame]	422	if free_string:
				423	out.append("".join(free_string))
				424
Doug Zongker	6233818	2014-09-08 08:29:55 -0700	[diff] [blame]	425
				426	# sanity check: abort if we're going to need more than 512 MB if
				427	# stash space
				428	assert max_stashed_blocks * self.tgt.blocksize < (512 << 20)
Doug Zongker	fc44a51	2014-08-26 13:10:25 -0700	[diff] [blame]	429
				430	all_tgt = RangeSet(data=(0, self.tgt.total_blocks))
				431	if performs_read:
				432	# if some of the original data is used, then at the end we'll
				433	# erase all the blocks on the partition that don't contain data
				434	# in the new image.
				435	new_dontcare = all_tgt.subtract(self.tgt.care_map)
				436	if new_dontcare:
				437	out.append("erase %s\n" % (new_dontcare.to_string_raw(),))
				438	else:
				439	# if nothing is read (ie, this is a full OTA), then we can start
				440	# by erasing the entire partition.
Doug Zongker	e985f6f	2014-09-09 12:38:47 -0700	[diff] [blame]	441	out.insert(0, "erase %s\n" % (all_tgt.to_string_raw(),))
				442
				443	out.insert(0, "%d\n" % (self.version,)) # format version number
				444	out.insert(1, str(total) + "\n")
				445	if self.version >= 2:
				446	# version 2 only: after the total block count, we give the number
				447	# of stash slots needed, and the maximum size needed (in blocks)
				448	out.insert(2, str(next_stash_id) + "\n")
				449	out.insert(3, str(max_stashed_blocks) + "\n")
Doug Zongker	fc44a51	2014-08-26 13:10:25 -0700	[diff] [blame]	450
				451	with open(prefix + ".transfer.list", "wb") as f:
				452	for i in out:
				453	f.write(i)
				454
Doug Zongker	6233818	2014-09-08 08:29:55 -0700	[diff] [blame]	455	if self.version >= 2:
				456	print("max stashed blocks: %d (%d bytes)\n" % (
				457	max_stashed_blocks, max_stashed_blocks * self.tgt.blocksize))
				458
Doug Zongker	fc44a51	2014-08-26 13:10:25 -0700	[diff] [blame]	459	def ComputePatches(self, prefix):
				460	print("Reticulating splines...")
				461	diff_q = []
				462	patch_num = 0
				463	with open(prefix + ".new.dat", "wb") as new_f:
				464	for xf in self.transfers:
				465	if xf.style == "zero":
				466	pass
				467	elif xf.style == "new":
				468	for piece in self.tgt.ReadRangeSet(xf.tgt_ranges):
				469	new_f.write(piece)
				470	elif xf.style == "diff":
				471	src = self.src.ReadRangeSet(xf.src_ranges)
				472	tgt = self.tgt.ReadRangeSet(xf.tgt_ranges)
				473
				474	# We can't compare src and tgt directly because they may have
				475	# the same content but be broken up into blocks differently, eg:
				476	#
				477	# ["he", "llo"] vs ["h", "ello"]
				478	#
				479	# We want those to compare equal, ideally without having to
				480	# actually concatenate the strings (these may be tens of
				481	# megabytes).
				482
				483	src_sha1 = sha1()
				484	for p in src:
				485	src_sha1.update(p)
				486	tgt_sha1 = sha1()
				487	tgt_size = 0
				488	for p in tgt:
				489	tgt_sha1.update(p)
				490	tgt_size += len(p)
				491
				492	if src_sha1.digest() == tgt_sha1.digest():
				493	# These are identical; we don't need to generate a patch,
				494	# just issue copy commands on the device.
				495	xf.style = "move"
				496	else:
				497	# For files in zip format (eg, APKs, JARs, etc.) we would
				498	# like to use imgdiff -z if possible (because it usually
				499	# produces significantly smaller patches than bsdiff).
				500	# This is permissible if:
				501	#
				502	# - the source and target files are monotonic (ie, the
				503	# data is stored with blocks in increasing order), and
				504	# - we haven't removed any blocks from the source set.
				505	#
				506	# If these conditions are satisfied then appending all the
				507	# blocks in the set together in order will produce a valid
				508	# zip file (plus possibly extra zeros in the last block),
				509	# which is what imgdiff needs to operate. (imgdiff is
				510	# fine with extra zeros at the end of the file.)
				511	imgdiff = (xf.intact and
				512	xf.tgt_name.split(".")[-1].lower()
				513	in ("apk", "jar", "zip"))
				514	xf.style = "imgdiff" if imgdiff else "bsdiff"
				515	diff_q.append((tgt_size, src, tgt, xf, patch_num))
				516	patch_num += 1
				517
				518	else:
				519	assert False, "unknown style " + xf.style
				520
				521	if diff_q:
				522	if self.threads > 1:
				523	print("Computing patches (using %d threads)..." % (self.threads,))
				524	else:
				525	print("Computing patches...")
				526	diff_q.sort()
				527
				528	patches = [None] * patch_num
				529
				530	lock = threading.Lock()
				531	def diff_worker():
				532	while True:
				533	with lock:
				534	if not diff_q: return
				535	tgt_size, src, tgt, xf, patchnum = diff_q.pop()
				536	patch = compute_patch(src, tgt, imgdiff=(xf.style == "imgdiff"))
				537	size = len(patch)
				538	with lock:
				539	patches[patchnum] = (patch, xf)
				540	print("%10d %10d (%6.2f%%) %7s %s" % (
				541	size, tgt_size, size * 100.0 / tgt_size, xf.style,
				542	xf.tgt_name if xf.tgt_name == xf.src_name else (
				543	xf.tgt_name + " (from " + xf.src_name + ")")))
				544
				545	threads = [threading.Thread(target=diff_worker)
				546	for i in range(self.threads)]
				547	for th in threads:
				548	th.start()
				549	while threads:
				550	threads.pop().join()
				551	else:
				552	patches = []
				553
				554	p = 0
				555	with open(prefix + ".patch.dat", "wb") as patch_f:
				556	for patch, xf in patches:
				557	xf.patch_start = p
				558	xf.patch_len = len(patch)
				559	patch_f.write(patch)
				560	p += len(patch)
				561
				562	def AssertSequenceGood(self):
				563	# Simulate the sequences of transfers we will output, and check that:
				564	# - we never read a block after writing it, and
				565	# - we write every block we care about exactly once.
				566
				567	# Start with no blocks having been touched yet.
				568	touched = RangeSet()
				569
				570	# Imagine processing the transfers in order.
				571	for xf in self.transfers:
				572	# Check that the input blocks for this transfer haven't yet been touched.
Doug Zongker	6233818	2014-09-08 08:29:55 -0700	[diff] [blame]	573
				574	x = xf.src_ranges
				575	if self.version >= 2:
				576	for _, sr in xf.use_stash:
				577	x = x.subtract(sr)
				578
				579	assert not touched.overlaps(x)
Doug Zongker	fc44a51	2014-08-26 13:10:25 -0700	[diff] [blame]	580	# Check that the output blocks for this transfer haven't yet been touched.
				581	assert not touched.overlaps(xf.tgt_ranges)
				582	# Touch all the blocks written by this transfer.
				583	touched = touched.union(xf.tgt_ranges)
				584
				585	# Check that we've written every target block.
				586	assert touched == self.tgt.care_map
				587
Doug Zongker	6233818	2014-09-08 08:29:55 -0700	[diff] [blame]	588	def ImproveVertexSequence(self):
				589	print("Improving vertex order...")
				590
				591	# At this point our digraph is acyclic; we reversed any edges that
				592	# were backwards in the heuristically-generated sequence. The
				593	# previously-generated order is still acceptable, but we hope to
				594	# find a better order that needs less memory for stashed data.
				595	# Now we do a topological sort to generate a new vertex order,
				596	# using a greedy algorithm to choose which vertex goes next
				597	# whenever we have a choice.
				598
				599	# Make a copy of the edge set; this copy will get destroyed by the
				600	# algorithm.
				601	for xf in self.transfers:
				602	xf.incoming = xf.goes_after.copy()
				603	xf.outgoing = xf.goes_before.copy()
				604
				605	L = [] # the new vertex order
				606
				607	# S is the set of sources in the remaining graph; we always choose
				608	# the one that leaves the least amount of stashed data after it's
				609	# executed.
				610	S = [(u.NetStashChange(), u.order, u) for u in self.transfers
				611	if not u.incoming]
				612	heapq.heapify(S)
				613
				614	while S:
				615	_, _, xf = heapq.heappop(S)
				616	L.append(xf)
				617	for u in xf.outgoing:
				618	del u.incoming[xf]
				619	if not u.incoming:
				620	heapq.heappush(S, (u.NetStashChange(), u.order, u))
				621
				622	# if this fails then our graph had a cycle.
				623	assert len(L) == len(self.transfers)
				624
				625	self.transfers = L
				626	for i, xf in enumerate(L):
				627	xf.order = i
				628
Doug Zongker	fc44a51	2014-08-26 13:10:25 -0700	[diff] [blame]	629	def RemoveBackwardEdges(self):
				630	print("Removing backward edges...")
				631	in_order = 0
				632	out_of_order = 0
				633	lost_source = 0
				634
				635	for xf in self.transfers:
Doug Zongker	fc44a51	2014-08-26 13:10:25 -0700	[diff] [blame]	636	lost = 0
				637	size = xf.src_ranges.size()
				638	for u in xf.goes_before:
				639	# xf should go before u
				640	if xf.order < u.order:
				641	# it does, hurray!
Doug Zongker	6233818	2014-09-08 08:29:55 -0700	[diff] [blame]	642	in_order += 1
Doug Zongker	fc44a51	2014-08-26 13:10:25 -0700	[diff] [blame]	643	else:
				644	# it doesn't, boo. trim the blocks that u writes from xf's
				645	# source, so that xf can go after u.
Doug Zongker	6233818	2014-09-08 08:29:55 -0700	[diff] [blame]	646	out_of_order += 1
Doug Zongker	fc44a51	2014-08-26 13:10:25 -0700	[diff] [blame]	647	assert xf.src_ranges.overlaps(u.tgt_ranges)
				648	xf.src_ranges = xf.src_ranges.subtract(u.tgt_ranges)
				649	xf.intact = False
				650
				651	if xf.style == "diff" and not xf.src_ranges:
				652	# nothing left to diff from; treat as new data
				653	xf.style = "new"
				654
				655	lost = size - xf.src_ranges.size()
				656	lost_source += lost
Doug Zongker	fc44a51	2014-08-26 13:10:25 -0700	[diff] [blame]	657
				658	print((" %d/%d dependencies (%.2f%%) were violated; "
				659	"%d source blocks removed.") %
				660	(out_of_order, in_order + out_of_order,
				661	(out_of_order * 100.0 / (in_order + out_of_order))
				662	if (in_order + out_of_order) else 0.0,
				663	lost_source))
				664
Doug Zongker	6233818	2014-09-08 08:29:55 -0700	[diff] [blame]	665	def ReverseBackwardEdges(self):
				666	print("Reversing backward edges...")
				667	in_order = 0
				668	out_of_order = 0
				669	stashes = 0
				670	stash_size = 0
				671
				672	for xf in self.transfers:
				673	lost = 0
				674	size = xf.src_ranges.size()
				675	for u in xf.goes_before.copy():
				676	# xf should go before u
				677	if xf.order < u.order:
				678	# it does, hurray!
				679	in_order += 1
				680	else:
				681	# it doesn't, boo. modify u to stash the blocks that it
				682	# writes that xf wants to read, and then require u to go
				683	# before xf.
				684	out_of_order += 1
				685
				686	overlap = xf.src_ranges.intersect(u.tgt_ranges)
				687	assert overlap
				688
				689	u.stash_before.append((stashes, overlap))
				690	xf.use_stash.append((stashes, overlap))
				691	stashes += 1
				692	stash_size += overlap.size()
				693
				694	# reverse the edge direction; now xf must go after u
				695	del xf.goes_before[u]
				696	del u.goes_after[xf]
				697	xf.goes_after[u] = None # value doesn't matter
				698	u.goes_before[xf] = None
				699
				700	print((" %d/%d dependencies (%.2f%%) were violated; "
				701	"%d source blocks stashed.") %
				702	(out_of_order, in_order + out_of_order,
				703	(out_of_order * 100.0 / (in_order + out_of_order))
				704	if (in_order + out_of_order) else 0.0,
				705	stash_size))
				706
Doug Zongker	fc44a51	2014-08-26 13:10:25 -0700	[diff] [blame]	707	def FindVertexSequence(self):
				708	print("Finding vertex sequence...")
				709
				710	# This is based on "A Fast & Effective Heuristic for the Feedback
				711	# Arc Set Problem" by P. Eades, X. Lin, and W.F. Smyth. Think of
				712	# it as starting with the digraph G and moving all the vertices to
				713	# be on a horizontal line in some order, trying to minimize the
				714	# number of edges that end up pointing to the left. Left-pointing
				715	# edges will get removed to turn the digraph into a DAG. In this
				716	# case each edge has a weight which is the number of source blocks
				717	# we'll lose if that edge is removed; we try to minimize the total
				718	# weight rather than just the number of edges.
				719
				720	# Make a copy of the edge set; this copy will get destroyed by the
				721	# algorithm.
				722	for xf in self.transfers:
				723	xf.incoming = xf.goes_after.copy()
				724	xf.outgoing = xf.goes_before.copy()
				725
				726	# We use an OrderedDict instead of just a set so that the output
				727	# is repeatable; otherwise it would depend on the hash values of
				728	# the transfer objects.
				729	G = OrderedDict()
				730	for xf in self.transfers:
				731	G[xf] = None
				732	s1 = deque() # the left side of the sequence, built from left to right
				733	s2 = deque() # the right side of the sequence, built from right to left
				734
				735	while G:
				736
				737	# Put all sinks at the end of the sequence.
				738	while True:
				739	sinks = [u for u in G if not u.outgoing]
				740	if not sinks: break
				741	for u in sinks:
				742	s2.appendleft(u)
				743	del G[u]
				744	for iu in u.incoming:
				745	del iu.outgoing[u]
				746
				747	# Put all the sources at the beginning of the sequence.
				748	while True:
				749	sources = [u for u in G if not u.incoming]
				750	if not sources: break
				751	for u in sources:
				752	s1.append(u)
				753	del G[u]
				754	for iu in u.outgoing:
				755	del iu.incoming[u]
				756
				757	if not G: break
				758
				759	# Find the "best" vertex to put next. "Best" is the one that
				760	# maximizes the net difference in source blocks saved we get by
				761	# pretending it's a source rather than a sink.
				762
				763	max_d = None
				764	best_u = None
				765	for u in G:
				766	d = sum(u.outgoing.values()) - sum(u.incoming.values())
				767	if best_u is None or d > max_d:
				768	max_d = d
				769	best_u = u
				770
				771	u = best_u
				772	s1.append(u)
				773	del G[u]
				774	for iu in u.outgoing:
				775	del iu.incoming[u]
				776	for iu in u.incoming:
				777	del iu.outgoing[u]
				778
				779	# Now record the sequence in the 'order' field of each transfer,
				780	# and by rearranging self.transfers to be in the chosen sequence.
				781
				782	new_transfers = []
				783	for x in itertools.chain(s1, s2):
				784	x.order = len(new_transfers)
				785	new_transfers.append(x)
				786	del x.incoming
				787	del x.outgoing
				788
				789	self.transfers = new_transfers
				790
				791	def GenerateDigraph(self):
				792	print("Generating digraph...")
				793	for a in self.transfers:
				794	for b in self.transfers:
				795	if a is b: continue
				796
				797	# If the blocks written by A are read by B, then B needs to go before A.
				798	i = a.tgt_ranges.intersect(b.src_ranges)
				799	if i:
Doug Zongker	ab7ca1d	2014-08-26 10:40:28 -0700	[diff] [blame]	800	if b.src_name == "__ZERO":
				801	# the cost of removing source blocks for the __ZERO domain
				802	# is (nearly) zero.
				803	size = 0
				804	else:
				805	size = i.size()
Doug Zongker	fc44a51	2014-08-26 13:10:25 -0700	[diff] [blame]	806	b.goes_before[a] = size
				807	a.goes_after[b] = size
				808
				809	def FindTransfers(self):
				810	self.transfers = []
				811	empty = RangeSet()
				812	for tgt_fn, tgt_ranges in self.tgt.file_map.items():
				813	if tgt_fn == "__ZERO":
				814	# the special "__ZERO" domain is all the blocks not contained
				815	# in any file and that are filled with zeros. We have a
				816	# special transfer style for zero blocks.
				817	src_ranges = self.src.file_map.get("__ZERO", empty)
Doug Zongker	ab7ca1d	2014-08-26 10:40:28 -0700	[diff] [blame]	818	Transfer(tgt_fn, "__ZERO", tgt_ranges, src_ranges,
				819	"zero", self.transfers)
Doug Zongker	fc44a51	2014-08-26 13:10:25 -0700	[diff] [blame]	820	continue
				821
				822	elif tgt_fn in self.src.file_map:
				823	# Look for an exact pathname match in the source.
				824	Transfer(tgt_fn, tgt_fn, tgt_ranges, self.src.file_map[tgt_fn],
				825	"diff", self.transfers)
				826	continue
				827
				828	b = os.path.basename(tgt_fn)
				829	if b in self.src_basenames:
				830	# Look for an exact basename match in the source.
				831	src_fn = self.src_basenames[b]
				832	Transfer(tgt_fn, src_fn, tgt_ranges, self.src.file_map[src_fn],
				833	"diff", self.transfers)
				834	continue
				835
				836	b = re.sub("[0-9]+", "#", b)
				837	if b in self.src_numpatterns:
				838	# Look for a 'number pattern' match (a basename match after
				839	# all runs of digits are replaced by "#"). (This is useful
				840	# for .so files that contain version numbers in the filename
				841	# that get bumped.)
				842	src_fn = self.src_numpatterns[b]
				843	Transfer(tgt_fn, src_fn, tgt_ranges, self.src.file_map[src_fn],
				844	"diff", self.transfers)
				845	continue
				846
				847	Transfer(tgt_fn, None, tgt_ranges, empty, "new", self.transfers)
				848
				849	def AbbreviateSourceNames(self):
				850	self.src_basenames = {}
				851	self.src_numpatterns = {}
				852
				853	for k in self.src.file_map.keys():
				854	b = os.path.basename(k)
				855	self.src_basenames[b] = k
				856	b = re.sub("[0-9]+", "#", b)
				857	self.src_numpatterns[b] = k
				858
				859	@staticmethod
				860	def AssertPartition(total, seq):
				861	"""Assert that all the RangeSets in 'seq' form a partition of the
				862	'total' RangeSet (ie, they are nonintersecting and their union
				863	equals 'total')."""
				864	so_far = RangeSet()
				865	for i in seq:
				866	assert not so_far.overlaps(i)
				867	so_far = so_far.union(i)
				868	assert so_far == total