blob: 008344cc9480201b7f835410cae9926e1531f773 [file] [log] [blame]
Roozbeh Pournader0e969e22016-03-09 23:08:45 -08001#!/usr/bin/env python
2
3import collections
Roozbeh Pournader5dde0872016-03-31 13:54:56 -07004import copy
Roozbeh Pournader0e969e22016-03-09 23:08:45 -08005import glob
Roozbeh Pournader5dde0872016-03-31 13:54:56 -07006import itertools
Roozbeh Pournader0e969e22016-03-09 23:08:45 -08007from os import path
8import sys
9from xml.etree import ElementTree
10
11from fontTools import ttLib
12
# Code point that compute_expected_emoji() strips out of every emoji sequence
# before comparing it against the font's coverage.
EMOJI_VS = 0xFE0F
14
# Script used by each bare language code. Keys are lower-case language
# subtags; values are four-letter script codes in title case.
LANG_TO_SCRIPT = {
    'as': 'Beng',
    'bg': 'Cyrl',
    'bn': 'Beng',
    'cu': 'Cyrl',
    'cy': 'Latn',
    'da': 'Latn',
    'de': 'Latn',
    'en': 'Latn',
    'es': 'Latn',
    'et': 'Latn',
    'eu': 'Latn',
    'fr': 'Latn',
    'ga': 'Latn',
    'gu': 'Gujr',
    'hi': 'Deva',
    'hr': 'Latn',
    'hu': 'Latn',
    'hy': 'Armn',
    'ja': 'Jpan',
    'kn': 'Knda',
    'ko': 'Kore',
    'ml': 'Mlym',
    'mn': 'Cyrl',
    'mr': 'Deva',
    'nb': 'Latn',
    'nn': 'Latn',
    'or': 'Orya',
    'pa': 'Guru',
    'pt': 'Latn',
    'sl': 'Latn',
    'ta': 'Taml',
    'te': 'Telu',
    'tk': 'Latn',
}


def lang_to_script(lang_code):
    """Return the script code for a hyphen-separated language tag.

    The tag is lower-cased and then shortened from the right, one subtag at
    a time, until it matches a key of LANG_TO_SCRIPT. If a trailing subtag is
    exactly four letters it is taken to be the script itself and returned in
    title case.

    Args:
        lang_code: language tag such as 'en', 'en-US' or 'und-Zsye'.

    Returns:
        A four-letter, title-cased script code.

    Raises:
        AssertionError: if no script can be determined for the tag.
    """
    lang = lang_code.lower()
    while lang not in LANG_TO_SCRIPT:
        hyphen_idx = lang.rfind('-')
        # This check is what terminates the loop for unknown tags, so it is
        # raised explicitly instead of being an `assert` statement: with
        # `python -O` an assert would be dropped and the tag would shrink to
        # the empty string and spin forever.
        if hyphen_idx == -1:
            raise AssertionError(
                'We do not know what script the "%s" language is written in.'
                % lang_code)
        assumed_script = lang[hyphen_idx+1:]
        if len(assumed_script) == 4 and assumed_script.isalpha():
            # This is actually the script
            return assumed_script.title()
        lang = lang[:hyphen_idx]
    return LANG_TO_SCRIPT[lang]
64
65
def printable(inp):
    """Format a code point, a sequence, or a set of them for error messages.

    Args:
        inp: an int code point, a tuple of code points (a character
            sequence), or a set/frozenset whose members are either of those.

    Returns:
        'U+XXXX' for an int, '<U+XXXX, ...>' for a tuple and '{...}' for a
        set. Set members appear in the set's own iteration order.
    """
    # isinstance (rather than an exact type() match) so that frozensets and
    # subclasses are formatted too instead of falling through to the
    # single-character branch and raising TypeError.
    if isinstance(inp, (set, frozenset)):  # set of character sequences
        return '{' + ', '.join(printable(seq) for seq in inp) + '}'
    if isinstance(inp, tuple):  # character sequence
        return '<' + ', '.join(printable(ch) for ch in inp) + '>'
    # single character
    return 'U+%04X' % inp
73
74
def open_font(font):
    """Open a font described by a (file name, collection index) pair.

    Args:
        font: tuple of the font file name, relative to the module-level
            _fonts_dir, and either None or an index that is forwarded to
            ttLib.TTFont as fontNumber.

    Returns:
        The ttLib.TTFont object for the font.
    """
    font_file, index = font
    font_path = path.join(_fonts_dir, font_file)
    # Only pass fontNumber when an index was actually given.
    extra_args = {} if index is None else {'fontNumber': index}
    return ttLib.TTFont(font_path, **extra_args)
82
83
def get_best_cmap(font):
    """Return the code point -> glyph name mapping of a font.

    The (format 12, platform 3, encoding 10) subtable is preferred; the
    (format 4, platform 3, encoding 1) subtable is used as a fallback.

    Args:
        font: (file name, index) pair as accepted by open_font().

    Returns:
        The `cmap` attribute of the selected subtable.

    Raises:
        AssertionError: if either kind of subtable appears more than once,
            or if the font has neither of them.
    """
    ttfont = open_font(font)
    all_unicode_cmap = None
    bmp_cmap = None
    for cmap in ttfont['cmap'].tables:
        specifier = (cmap.format, cmap.platformID, cmap.platEncID)
        if specifier == (4, 3, 1):
            assert bmp_cmap is None, 'More than one BMP cmap in %s' % (font, )
            bmp_cmap = cmap
        elif specifier == (12, 3, 10):
            assert all_unicode_cmap is None, (
                'More than one UCS-4 cmap in %s' % (font, ))
            all_unicode_cmap = cmap

    best = all_unicode_cmap if all_unicode_cmap else bmp_cmap
    # Report a font without a usable subtable as a lint failure instead of
    # crashing with an AttributeError on None below.
    assert best is not None, 'No BMP or UCS-4 cmap in %s' % (font, )
    return best.cmap
99
100
def get_variation_sequences_cmap(font):
    """Return the (format 14, platform 0, encoding 5) cmap subtable.

    Args:
        font: (file name, index) pair as accepted by open_font().

    Returns:
        The matching subtable, or None when the font has none.

    Raises:
        AssertionError: if the font has more than one such subtable.
    """
    ttfont = open_font(font)
    matches = [
        table for table in ttfont['cmap'].tables
        if (table.format, table.platformID, table.platEncID) == (14, 0, 5)]
    assert len(matches) <= 1, 'More than one VS cmap in %s' % (font, )
    if matches:
        return matches[0]
    return None
110
111
def get_emoji_map(font):
    """Build a map from everything the font can render to glyph names.

    Keys are single code points (ints), (base, variation selector) pairs and
    the code point tuples of GSUB ligature sequences; values are glyph names.

    Args:
        font: (file name, index) pair as accepted by open_font().

    Returns:
        A dict combining the best cmap, the variation-sequence cmap and the
        ligature substitutions found in GSUB.

    Raises:
        AssertionError: if GSUB has a lookup that is not of type 4, or if a
            ligature sequence (or any prefix of it with at least two code
            points) is already in the map.
    """
    # Plain characters come straight from the cmap.
    emoji_map = copy.copy(get_best_cmap(font))
    # Glyph name -> code point, needed to translate GSUB glyph sequences.
    glyph_to_code = {glyph: code for code, glyph in emoji_map.items()}

    # Variation sequences. A glyph of None means the sequence uses whatever
    # glyph the base character already maps to.
    uvs_dict = get_variation_sequences_cmap(font).uvsDict
    for selector in uvs_dict:
        for base, glyph in uvs_dict[selector]:
            emoji_map[(base, selector)] = (
                emoji_map[base] if glyph is None else glyph)

    # GSUB ligature rules.
    ttfont = open_font(font)
    for lookup in ttfont['GSUB'].table.LookupList.Lookup:
        assert lookup.LookupType == 4, 'We only understand type 4 lookups'
        for subtable in lookup.SubTable:
            ligature_sets = subtable.ligatures
            for first_glyph in ligature_sets:
                for ligature in ligature_sets[first_glyph]:
                    glyph_names = [first_glyph] + ligature.Component
                    sequence = tuple(
                        glyph_to_code[name] for name in glyph_names)
                    # No prefix of the sequence (two or more code points,
                    # including the sequence itself) may already be mapped.
                    for prefix_len in range(2, len(sequence)+1):
                        assert sequence[:prefix_len] not in emoji_map
                    emoji_map[sequence] = ligature.LigGlyph

    return emoji_map
145
146
def assert_font_supports_any_of_chars(font, chars):
    """Exit the program unless the font maps at least one of chars.

    Args:
        font: (file name, index) pair as accepted by open_font().
        chars: iterable of code points.
    """
    supported = get_best_cmap(font)
    if not any(char in supported for char in chars):
        sys.exit('None of characters in %s were found in %s' % (chars, font))
153
154
def assert_font_supports_all_of_chars(font, chars):
    """Assert that every code point in chars is mapped by the font.

    Args:
        font: (file name, index) pair as accepted by open_font().
        chars: iterable of code points.
    """
    supported = get_best_cmap(font)
    for code_point in chars:
        assert code_point in supported, (
            'U+%04X was not found in %s' % (code_point, font))
160
161
def assert_font_supports_none_of_chars(font, chars):
    """Assert that no code point in chars is mapped by the font.

    Args:
        font: (file name, index) pair as accepted by open_font().
        chars: iterable of code points.
    """
    supported = get_best_cmap(font)
    for code_point in chars:
        assert code_point not in supported, (
            'U+%04X was found in %s' % (code_point, font))
167
168
def assert_font_supports_all_sequences(font, sequences):
    """Assert that the font lists every given variation sequence.

    A sequence counts as supported only when the font's variation-sequence
    table holds a (base, None) entry under the selector.

    Args:
        font: (file name, index) pair as accepted by open_font().
        sequences: iterable of (base, variation selector) pairs.
    """
    uvs_dict = get_variation_sequences_cmap(font).uvsDict
    for base, selector in sorted(sequences):
        is_listed = (
            selector in uvs_dict and (base, None) in uvs_dict[selector])
        assert is_listed, (
            '<U+%04X, U+%04X> was not found in %s' % (base, selector, font))
174
175
def check_hyphens(hyphens_dir):
    """Check that fonts for every hyphenated script contain a hyphen.

    Each 'hyph-<lang>.hyb' file in hyphens_dir contributes the script of
    <lang>; every font registered for those scripts in _script_to_font_map
    must map U+002D or U+2010.

    Args:
        hyphens_dir: directory holding the *.hyb hyphenation files.
    """
    # Collect the scripts that need automatic hyphenation.
    scripts = set()
    for hyb_path in glob.iglob(path.join(hyphens_dir, '*.hyb')):
        hyb_name = path.basename(hyb_path)
        assert hyb_name.startswith('hyph-'), (
            'Unknown hyphenation file %s' % hyb_name)
        # The language code sits between the first '-' and the first '.'.
        lang_start = hyb_name.index('-') + 1
        lang_end = hyb_name.index('.')
        scripts.add(lang_to_script(hyb_name[lang_start:lang_end]))

    HYPHENS = {0x002D, 0x2010}
    for script in scripts:
        script_fonts = _script_to_font_map[script]
        assert script_fonts, 'No fonts found for the "%s" script' % script
        for font in script_fonts:
            assert_font_supports_any_of_chars(font, HYPHENS)
192
193
class FontRecord(object):
    """Plain record describing one font entry of the fallback chain."""

    def __init__(self, name, scripts, variant, weight, style, font):
        """Store the given values unchanged as same-named attributes.

        Args:
            name: the family's 'name' attribute, or None.
            scripts: the scripts derived from the family's 'lang' attribute.
            variant: the family's 'variant' attribute, or None.
            weight: the font weight.
            style: the font style.
            font: (file name, index) pair as accepted by open_font().
        """
        (self.name, self.scripts, self.variant) = (name, scripts, variant)
        (self.weight, self.style, self.font) = (weight, style, font)
202
203
def parse_fonts_xml(fonts_xml_path):
    """Parse fonts.xml into the module-level font tables.

    Resets and fills two globals:
      * _fallback_chain: list with one FontRecord per <font> element, in
        document order.
      * _script_to_font_map: script code -> set of (file name, index) pairs.
        Fonts of named families are registered for Latn, Grek and Cyrl;
        other fonts for the scripts of the family's 'lang' attribute.

    Args:
        fonts_xml_path: path of the fonts.xml file.

    Raises:
        AssertionError: if the file breaks any of the structural rules
            checked below.
    """
    global _script_to_font_map, _fallback_chain
    _script_to_font_map = collections.defaultdict(set)
    _fallback_chain = []
    tree = ElementTree.parse(fonts_xml_path)
    families = tree.findall('family')
    # Minikin supports up to 254 but users can place their own font at the
    # first place. Thus, 253 is the maximum allowed number of font families
    # in the default collection.
    assert len(families) < 254, (
        'System font collection can contain up to 253 font families.')
    for family in families:
        name = family.get('name')
        variant = family.get('variant')
        langs = family.get('lang')
        if name:
            assert variant is None, (
                'No variant expected for LGC font %s.' % name)
            assert langs is None, (
                'No language expected for LGC fonts %s.' % name)
        else:
            assert variant in {None, 'elegant', 'compact'}, (
                'Unexpected value for variant: %s' % variant)

        if langs:
            langs = langs.split()
            scripts = {lang_to_script(lang) for lang in langs}
        else:
            scripts = set()

        for child in family:
            assert child.tag == 'font', (
                'Unknown tag <%s>' % child.tag)
            # Drop whitespace around the file name so that pretty-printed
            # XML (newlines/indentation inside <font>) does not end up in
            # the path that is later opened.
            font_file = (child.text or '').strip()
            assert font_file, 'A <font> element has no file name.'
            weight = int(child.get('weight'))
            assert weight % 100 == 0, (
                'Font weight "%d" is not a multiple of 100.' % weight)

            style = child.get('style')
            assert style in {'normal', 'italic'}, (
                'Unknown style "%s"' % style)

            index = child.get('index')
            if index:
                index = int(index)

            _fallback_chain.append(FontRecord(
                name,
                frozenset(scripts),
                variant,
                weight,
                style,
                (font_file, index)))

            if name:  # non-empty names are used for default LGC fonts
                map_scripts = {'Latn', 'Grek', 'Cyrl'}
            else:
                map_scripts = scripts
            for script in map_scripts:
                _script_to_font_map[script].add((font_file, index))
264
265
def check_emoji_coverage(all_emoji, equivalent_emoji):
    """Run the coverage check against the fallback chain's emoji font.

    Args:
        all_emoji: every code point and sequence the font must support.
        equivalent_emoji: map of sequences that must share a glyph.
    """
    check_emoji_font_coverage(get_emoji_font(), all_emoji, equivalent_emoji)
Doug Feltf874a192016-07-08 17:42:15 -0700269
270
def get_emoji_font():
    """Return the single font in the fallback chain tagged with 'Zsye'.

    Returns:
        The (file name, index) pair of the emoji font.

    Raises:
        AssertionError: if the chain does not hold exactly one such font.
    """
    emoji_fonts = []
    for record in _fallback_chain:
        if 'Zsye' in record.scripts:
            emoji_fonts.append(record.font)
    assert len(emoji_fonts) == 1, 'There are %d emoji fonts.' % len(emoji_fonts)
    return emoji_fonts[0]
Roozbeh Pournader5dde0872016-03-31 13:54:56 -0700277
Doug Feltf874a192016-07-08 17:42:15 -0700278
def check_emoji_font_coverage(emoji_font, all_emoji, equivalent_emoji):
    """Check that the emoji font covers exactly the expected emoji.

    Four things are verified:
      1. every member of all_emoji is supported by the font;
      2. the font supports nothing outside all_emoji, apart from U+0000,
         U+000D and U+0020;
      3. each key/value pair of equivalent_emoji maps to the same glyph;
      4. sequences sharing a glyph are all declared equivalent.

    Args:
        emoji_font: (file name, index) pair as accepted by open_font().
        all_emoji: set of code points (ints) and sequences (tuples).
        equivalent_emoji: dict mapping a code point or sequence to the one
            it must render identically to.

    Raises:
        AssertionError: on the first violated rule.
    """
    coverage = get_emoji_map(emoji_font)
    for sequence in all_emoji:
        assert sequence in coverage, (
            '%s is not supported in the emoji font.' % printable(sequence))

    for sequence in coverage:
        if sequence in {0x0000, 0x000D, 0x0020}:
            # The font needs to support a few extra characters, which is OK
            continue
        assert sequence in all_emoji, (
            'Emoji font should not support %s.' % printable(sequence))

    # Keys are a mix of ints and tuples, which Python 3 refuses to compare
    # directly. The sort key puts all ints first and all tuples after them,
    # each group in its natural order, so the order is well defined on both
    # Python 2 and 3.
    ordered_pairs = sorted(
        equivalent_emoji.items(),
        key=lambda pair: (isinstance(pair[0], tuple), pair[0]))
    for first, second in ordered_pairs:
        assert coverage[first] == coverage[second], (
            '%s and %s should map to the same glyph.' % (
                printable(first),
                printable(second)))

    # Group the sequences by glyph in a single pass instead of rescanning
    # the whole coverage map once per glyph.
    sequences_by_glyph = collections.defaultdict(list)
    for seq, glyph in coverage.items():
        sequences_by_glyph[glyph].append(seq)

    for glyph, maps_to_glyph in sequences_by_glyph.items():
        if len(maps_to_glyph) > 1:
            # There are more than one sequences mapping to the same glyph. We
            # need to make sure they were expected to be equivalent.
            equivalent_seqs = set()
            for seq in maps_to_glyph:
                # Follow the equivalence chain to its final target.
                equivalent_seq = seq
                while equivalent_seq in equivalent_emoji:
                    equivalent_seq = equivalent_emoji[equivalent_seq]
                equivalent_seqs.add(equivalent_seq)
            assert len(equivalent_seqs) == 1, (
                'The sequences %s should not result in the same glyph %s' % (
                    printable(equivalent_seqs),
                    glyph))
Roozbeh Pournader3b3c78e2016-07-25 14:04:34 -0700313
Roozbeh Pournaderfa1facc2016-03-16 13:53:47 -0700314
def check_emoji_defaults(default_emoji):
    """Check which fonts provide emoji that default to emoji style.

    Walks _fallback_chain in order. Every checked font must not map any
    member of default_emoji, and the fonts placed before the emoji font
    must together map every 'Emoji' character that is not in default_emoji
    (characters of Unicode age 7.0 are exempt).

    Args:
        default_emoji: set of code points (ints) and sequences (tuples)
            expected to be shown in emoji style by default.

    Raises:
        AssertionError: if a checked font maps a default emoji, or if text
            style glyphs are missing.
    """
    missing_text_chars = _emoji_properties['Emoji'] - default_emoji
    # Sort once, outside the loop. The set mixes ints and tuples, which
    # Python 3 cannot order directly; the key lists all ints first and all
    # tuples after them, each group in its natural order.
    sorted_default_emoji = sorted(
        default_emoji, key=lambda item: (isinstance(item, tuple), item))
    emoji_font_seen = False
    for record in _fallback_chain:
        if 'Zsye' in record.scripts:
            emoji_font_seen = True
            # No need to check the emoji font
            continue
        # For later fonts, we only check them if they have a script
        # defined, since the defined script may get them to a higher
        # score even if they appear after the emoji font. However,
        # we should skip checking the text symbols font, since
        # symbol fonts should be able to override the emoji display
        # style when 'Zsym' is explicitly specified by the user.
        if emoji_font_seen and (not record.scripts or 'Zsym' in record.scripts):
            continue

        # Check default emoji-style characters
        assert_font_supports_none_of_chars(record.font, sorted_default_emoji)

        # Mark default text-style characters appearing in fonts above the
        # emoji font as seen
        if not emoji_font_seen:
            missing_text_chars -= set(get_best_cmap(record.font))

    # Noto does not have monochrome glyphs for Unicode 7.0 wingdings and
    # webdings yet.
    missing_text_chars -= _chars_by_age['7.0']
    assert missing_text_chars == set(), (
        'Text style version of some emoji characters are missing: ' +
        repr(missing_text_chars))
Roozbeh Pournaderfa1facc2016-03-16 13:53:47 -0700346
347
# Setting reverse to true returns a dictionary that maps the values to sets of
# characters, useful for some binary properties. Otherwise, we get a
# dictionary that maps characters to the property values, assuming there's only
# one property in the file.
def parse_unicode_datafile(file_path, reverse=False):
    """Parse a semicolon-separated Unicode data file.

    Everything after '#' on a line is ignored, as are blank lines. The first
    field of a line is a single code point, a 'start..end' range or a
    space-separated sequence of code points (all hexadecimal); the second
    field is the property value. Further fields are ignored.

    Args:
        file_path: path of the data file.
        reverse: when true, map each property value to the set of its
            characters/sequences; otherwise map each character/sequence to
            its property value.

    Returns:
        A dict (a defaultdict(set) when reverse is true). Single characters
        and range members are ints; sequences are tuples of ints.

    Raises:
        AssertionError: in non-reverse mode, if an entry is listed twice.
    """
    if reverse:
        output_dict = collections.defaultdict(set)
    else:
        output_dict = {}
    with open(file_path) as datafile:
        for line in datafile:
            if '#' in line:
                line = line[:line.index('#')]
            line = line.strip()
            if not line:
                continue

            chars, prop = line.split(';')[:2]
            chars = chars.strip()
            prop = prop.strip()

            if ' ' in chars:  # character sequence
                # split() without an argument tolerates repeated spaces.
                sequence = [int(ch, 16) for ch in chars.split()]
                additions = [tuple(sequence)]
            elif '..' in chars:  # character range
                char_start, char_end = chars.split('..')
                char_start = int(char_start, 16)
                char_end = int(char_end, 16)
                # range() rather than xrange() so that this also runs on
                # Python 3, where xrange no longer exists.
                additions = range(char_start, char_end+1)
            else:  # single character
                additions = [int(chars, 16)]
            if reverse:
                output_dict[prop].update(additions)
            else:
                for addition in additions:
                    assert addition not in output_dict
                    output_dict[addition] = prop
    return output_dict
386
387
def parse_standardized_variants(file_path):
    """Collect the text-style and emoji-style variation sequences of a file.

    Each non-comment line must have exactly three ';'-separated fields: a
    space-separated hexadecimal sequence (its first two code points are
    used), a description, and a third field that is ignored. Lines whose
    description is neither 'text style' nor 'emoji style' are skipped.

    Args:
        file_path: path of the data file.

    Returns:
        A (text_set, emoji_set) pair of sets of (base, variation selector)
        tuples.
    """
    sets_by_description = {'text style': set(), 'emoji style': set()}
    with open(file_path) as datafile:
        for raw_line in datafile:
            # Cut the trailing comment, if any, then surrounding whitespace.
            line = raw_line.split('#', 1)[0].strip()
            if not line:
                continue
            sequence, description, _ = line.split(';')
            code_points = sequence.strip().split(' ')
            pair = (int(code_points[0], 16), int(code_points[1], 16))
            matching_set = sets_by_description.get(description.strip())
            if matching_set is not None:
                matching_set.add(pair)
    return (sets_by_description['text style'],
            sets_by_description['emoji style'])
408
409
def parse_ucd(ucd_path):
    """Load the Unicode data files into module-level tables.

    Sets the globals _emoji_properties and _chars_by_age (property value ->
    set of characters), _text_variation_sequences and
    _emoji_variation_sequences (sets of pairs), and _emoji_sequences and
    _emoji_zwj_sequences (sequence -> property value).

    Args:
        ucd_path: directory holding emoji-data.txt, DerivedAge.txt,
            StandardizedVariants.txt, emoji-sequences.txt and
            emoji-zwj-sequences.txt.
    """
    global _emoji_properties, _chars_by_age
    global _text_variation_sequences, _emoji_variation_sequences
    global _emoji_sequences, _emoji_zwj_sequences

    def ucd_file(file_name):
        return path.join(ucd_path, file_name)

    _emoji_properties = parse_unicode_datafile(
        ucd_file('emoji-data.txt'), reverse=True)
    _chars_by_age = parse_unicode_datafile(
        ucd_file('DerivedAge.txt'), reverse=True)
    _text_variation_sequences, _emoji_variation_sequences = (
        parse_standardized_variants(ucd_file('StandardizedVariants.txt')))
    _emoji_sequences = parse_unicode_datafile(
        ucd_file('emoji-sequences.txt'))
    _emoji_zwj_sequences = parse_unicode_datafile(
        ucd_file('emoji-zwj-sequences.txt'))
425
426
def flag_sequence(territory_code):
    """Map each letter of a territory code to an offset from U+1F1E6.

    'A' maps to U+1F1E6, 'B' to the next code point, and so on.

    Args:
        territory_code: iterable of single characters, e.g. 'US'.

    Returns:
        A tuple with one code point per input character.
    """
    first_indicator = 0x1F1E6
    code_points = []
    for letter in territory_code:
        code_points.append(first_indicator + ord(letter) - ord('A'))
    return tuple(code_points)
429
430
# Flag sequences that compute_expected_emoji() always counts among the
# "tofu" flags, i.e. the flags that must all share one glyph.
UNSUPPORTED_FLAGS = frozenset(
    flag_sequence(territory) for territory in (
        'BL', 'BQ', 'DG', 'EA', 'EH', 'FK', 'GF', 'GP', 'GS', 'MF',
        'MQ', 'NC', 'PM', 'RE', 'TF', 'UN', 'WF', 'XK', 'YT'))

# Flag sequences (keys) that must render with the same glyph as another
# flag sequence (values).
EQUIVALENT_FLAGS = dict(
    (flag_sequence(alias), flag_sequence(target))
    for alias, target in (
        ('BV', 'NO'),
        ('CP', 'FR'),
        ('HM', 'AU'),
        ('SJ', 'NO'),
        ('UM', 'US'),
    ))
448
# Second code point of the keycap sequences in LEGACY_ANDROID_EMOJI.
COMBINING_KEYCAP = 0x20E3

# Characters that Android defaults to emoji style, different from the
# recommendations in UTR #51
ANDROID_DEFAULT_EMOJI = frozenset([
    0x2600,  # BLACK SUN WITH RAYS
    0x2601,  # CLOUD
    0x260E,  # BLACK TELEPHONE
    0x261D,  # WHITE UP POINTING INDEX
    0x263A,  # WHITE SMILING FACE
    0x2660,  # BLACK SPADE SUIT
    0x2663,  # BLACK CLUB SUIT
    0x2665,  # BLACK HEART SUIT
    0x2666,  # BLACK DIAMOND SUIT
    0x270C,  # VICTORY HAND
    0x2744,  # SNOWFLAKE
    0x2764,  # HEAVY BLACK HEART
])
466
# Single code points (keys) that must render like the sequence they map to.
LEGACY_ANDROID_EMOJI = {}
# U+FE4E5..U+FE4EE: ten consecutive code points, one per flag.
LEGACY_ANDROID_EMOJI.update(
    (0xFE4E5 + offset, flag_sequence(territory))
    for offset, territory in enumerate(
        ('JP', 'US', 'FR', 'DE', 'IT', 'GB', 'ES', 'RU', 'CN', 'KR')))
# U+FE82C is the '#' keycap; U+FE82D is not part of the table.
LEGACY_ANDROID_EMOJI[0xFE82C] = (ord('#'), COMBINING_KEYCAP)
# U+FE82E..U+FE837: keycaps for the digits 1-9 followed by 0.
LEGACY_ANDROID_EMOJI.update(
    (0xFE82E + offset, (ord(digit), COMBINING_KEYCAP))
    for offset, digit in enumerate('1234567890'))
490
# Sequences joined with U+200D (keys) that must render with the same glyph
# as a single code point (values).
ZWJ_IDENTICALS = dict([
    # KISS
    ((0x1F469, 0x200D, 0x2764, 0x200D, 0x1F48B, 0x200D, 0x1F468), 0x1F48F),
    # COUPLE WITH HEART
    ((0x1F469, 0x200D, 0x2764, 0x200D, 0x1F468), 0x1F491),
    # FAMILY
    ((0x1F468, 0x200D, 0x1F469, 0x200D, 0x1F466), 0x1F46A),
])
499
Doug Feltf874a192016-07-08 17:42:15 -0700500
def is_fitzpatrick_modifier(cp):
    """Return True if cp lies in the range U+1F3FB..U+1F3FF (inclusive)."""
    return 0x1F3FB <= cp <= 0x1F3FF


def reverse_emoji(seq):
    """Return seq reversed, moving Fitzpatrick modifiers after their base.

    Args:
        seq: sequence of code points.

    Returns:
        A tuple holding the code points in reverse order, except that a
        modifier is swapped with the element that follows it in the
        reversed order.
    """
    rev = list(reversed(seq))
    # if there are fitzpatrick modifiers in the sequence, keep them after
    # the emoji they modify
    # NOTE(review): a modifier that has just been swapped forward is looked
    # at again on the next iteration, so it keeps moving until it reaches the
    # end of the tuple. That equals "right after its base" only when the base
    # is the first element of seq. Behavior is kept unchanged here; confirm
    # against the reversed sequences the emoji font actually ships.
    # range() rather than xrange() so that this also runs on Python 3.
    for i in range(1, len(rev)):
        if is_fitzpatrick_modifier(rev[i-1]):
            rev[i], rev[i-1] = rev[i-1], rev[i]
    return tuple(rev)
Doug Feltf874a192016-07-08 17:42:15 -0700513
514
def compute_expected_emoji():
    """Derive the expected emoji sets from the parsed Unicode data.

    Reads the module-level tables filled by parse_ucd().

    Returns:
        A tuple (all_emoji, default_emoji, equivalent_emoji):
          * all_emoji: every code point and sequence the emoji font is
            expected to support;
          * default_emoji: the code points and sequences expected to
            default to emoji style;
          * equivalent_emoji: dict mapping a code point or sequence to the
            one it must share a glyph with.
    """
    equivalent_emoji = {}
    sequence_pieces = set()
    all_sequences = set()
    all_sequences.update(_emoji_variation_sequences)

    # add zwj sequences not in the current emoji-zwj-sequences.txt
    adjusted_emoji_zwj_sequences = dict(_emoji_zwj_sequences)
    # Wrestlers with modifiers: U+1F93C followed by each of the five
    # modifiers U+1F3FB..U+1F3FF, U+200D, and U+2640 or U+2642.
    for modifier in range(0x1F3FB, 0x1F3FF + 1):
        for sign in (0x2640, 0x2642):
            adjusted_emoji_zwj_sequences[
                (0x1F93C, modifier, 0x200D, sign)] = 'Emoji_ZWJ_Sequence'

    for sequence in _emoji_sequences.keys():
        sequence = tuple(ch for ch in sequence if ch != EMOJI_VS)
        all_sequences.add(sequence)
        sequence_pieces.update(sequence)

    for sequence in adjusted_emoji_zwj_sequences.keys():
        sequence = tuple(ch for ch in sequence if ch != EMOJI_VS)
        all_sequences.add(sequence)
        sequence_pieces.update(sequence)
        # Add reverse of all emoji ZWJ sequences, which are added to the fonts
        # as a workaround to get the sequences work in RTL text.
        reversed_seq = reverse_emoji(sequence)
        all_sequences.add(reversed_seq)
        equivalent_emoji[reversed_seq] = sequence

    # Add all two-letter flag sequences, as even the unsupported ones should
    # resolve to a flag tofu.
    all_letters = [chr(code) for code in range(ord('A'), ord('Z')+1)]
    all_two_letter_codes = itertools.product(all_letters, repeat=2)
    all_flags = {flag_sequence(code) for code in all_two_letter_codes}
    all_sequences.update(all_flags)
    tofu_flags = UNSUPPORTED_FLAGS | (all_flags - set(_emoji_sequences.keys()))

    all_emoji = (
        _emoji_properties['Emoji'] |
        all_sequences |
        sequence_pieces |
        set(LEGACY_ANDROID_EMOJI.keys()))
    default_emoji = (
        _emoji_properties['Emoji_Presentation'] |
        ANDROID_DEFAULT_EMOJI |
        all_sequences |
        set(LEGACY_ANDROID_EMOJI.keys()))

    # Every tofu flag must share a glyph with one representative: the
    # smallest flag sequence.
    first_tofu_flag = min(tofu_flags)
    for flag in tofu_flags:
        if flag != first_tofu_flag:
            equivalent_emoji[flag] = first_tofu_flag
    equivalent_emoji.update(EQUIVALENT_FLAGS)
    equivalent_emoji.update(LEGACY_ANDROID_EMOJI)
    equivalent_emoji.update(ZWJ_IDENTICALS)
    # A variation sequence must render like its base character.
    for seq in _emoji_variation_sequences:
        equivalent_emoji[seq] = seq[0]

    return all_emoji, default_emoji, equivalent_emoji
Roozbeh Pournaderfa1facc2016-03-16 13:53:47 -0700585
586
def check_vertical_metrics():
    """Check the vertical metrics of the default named font families.

    Fonts named 'sans-serif' or 'sans-serif-condensed' must have head.yMax
    2163 and head.yMin -555; those two plus 'serif' and 'monospace' must
    have hhea.ascent 1900 and hhea.descent -500.

    Raises:
        AssertionError: if a font has different values.
    """
    head_checked = ('sans-serif', 'sans-serif-condensed')
    hhea_checked = head_checked + ('serif', 'monospace')
    for record in _fallback_chain:
        if record.name not in hhea_checked:
            continue
        # Open each font once and run every applicable check on it.
        font = open_font(record.font)
        if record.name in head_checked:
            assert font['head'].yMax == 2163 and font['head'].yMin == -555, (
                'yMax and yMin of %s do not match expected values.' % (record.font,))

        assert font['hhea'].ascent == 1900 and font['hhea'].descent == -500, (
            'ascent and descent of %s do not match expected values.' % (record.font,))
Roozbeh Pournaderbac1aec2016-07-27 13:08:37 -0700598
599
def main():
    """Run all checks on a build output directory.

    Command line arguments:
        1: the target output directory, holding 'fonts', 'etc/fonts.xml'
           and 'usr/hyphen-data';
        2: 'true' to also run the emoji checks;
        3: directory with the Unicode data files (read only when argument 2
           is 'true').
    """
    global _fonts_dir
    target_out = sys.argv[1]
    _fonts_dir = path.join(target_out, 'fonts')

    parse_fonts_xml(path.join(target_out, 'etc', 'fonts.xml'))

    check_vertical_metrics()

    check_hyphens(path.join(target_out, 'usr', 'hyphen-data'))

    if sys.argv[2] == 'true':
        parse_ucd(sys.argv[3])
        all_emoji, default_emoji, equivalent_emoji = compute_expected_emoji()
        check_emoji_coverage(all_emoji, equivalent_emoji)
        check_emoji_defaults(default_emoji)


if __name__ == '__main__':
    main()