blob: de81c587566a514c2c486eeefc8852e5caa10eaf [file] [log] [blame]
Shimeng (Simon) Wang56811ab2010-02-10 11:22:01 -08001#!/usr/bin/env python
2
3from urllib2 import urlopen
4
# Java text printed ahead of the generated alternation for
# TOP_LEVEL_DOMAIN_STR: a Javadoc comment followed by the declaration whose
# right-hand side makePattern() appends.
TLD_PREFIX = r"""
 /**
 * Regular expression to match all IANA top-level domains.
 * List accurate as of 2010/02/05. List taken from:
 * http://data.iana.org/TLD/tlds-alpha-by-domain.txt
 * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
 */
 public static final String TOP_LEVEL_DOMAIN_STR =
"""
# Closes the still-open string literal and terminates the Java statement.
TLD_SUFFIX = '";'

# Same as TLD_PREFIX, but for the WEB_URL variant; it also opens the outer
# non-capturing group as its own string literal.
URL_PREFIX = r"""
 /**
 * Regular expression to match all IANA top-level domains for WEB_URL.
 * List accurate as of 2010/02/05. List taken from:
 * http://data.iana.org/TLD/tlds-alpha-by-domain.txt
 * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
 */
 public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
 "(?:"
"""

# Only the statement terminator: makePattern() closes the literal itself for
# the WEB_URL variant.
URL_SUFFIX = ';'
Shimeng (Simon) Wang56811ab2010-02-10 11:22:01 -080028
class Bucket(object):
    """Group of TLD entries that share one first letter.

    Two-character entries are stored as their second character only
    (``letters``) so they can be rendered as ``a[cd]``; every other entry is
    stored whole (``words``).
    """

    def __init__(self, baseLetter):
        """Create an empty bucket for entries that start with baseLetter."""
        self.base = baseLetter
        self.words = []    # entries whose length is not 2, stored whole
        self.letters = []  # second character of each two-character entry

    def dump(self, isWebUrl=False, isFirst=False, isLast=False):
        """Render this bucket as one fragment of the generated Java string.

        Args:
            isWebUrl: group the words with a non-capturing ``(?:`` instead of
                ``(``; for the first bucket, also continue the concatenation
                with ``+ "`` instead of opening the literal with ``"(``.
            isFirst: this fragment starts the alternation, so it is not
                preceded by ``|``.
            isLast: leave the string literal open and emit no newline, so the
                caller can append the closing text on the same line.

        Returns:
            The fragment, or '' when the bucket holds no entries.
        """
        if not self.words and not self.letters:
            return ''

        # Sorted in place, so the stored lists stay sorted after the call.
        self.words.sort()
        self.letters.sort()

        if not isFirst:
            lead = '+ "|'
        elif isWebUrl:
            lead = '+ "'
        else:
            lead = '"('

        pieces = [' ', lead]

        # The group is only needed when whole words are present.
        if self.words:
            if isWebUrl:
                pieces.append('(?:')
            else:
                pieces.append('(')

        # Each '-' is preceded by two backslash characters: the output is a
        # Java string literal, so that pair becomes one backslash in the regex.
        alternatives = [word.replace('-', '\\\\-') for word in self.words]

        if len(self.letters) == 1:
            alternatives.append('%c%c' % (self.base, self.letters[0]))
        elif self.letters:
            alternatives.append('%c[%s]' % (self.base, ''.join(self.letters)))

        pieces.append('|'.join(alternatives))

        if self.words:
            pieces.append(')')

        if not isLast:
            pieces.append('"\n')

        return ''.join(pieces)

    def add(self, line):
        """Store one entry; empty strings and '#' comment lines are ignored."""
        if not line or line.startswith('#'):
            return

        if len(line) == 2:
            self.letters.append(line[1:2])
        else:
            self.words.append(line)
100
def getBucket(buckets, line):
    """Return the bucket for the first character of line, creating it if absent.

    A newly created Bucket is stored in buckets under that character before
    it is returned.
    """
    key = line[0]
    existing = buckets.get(key)

    if existing is not None:
        return existing

    created = Bucket(key)
    buckets[key] = created
    return created
110
def makePattern(prefix, suffix, buckets, isWebUrl=False):
    """Print prefix, the fragments for buckets 'a' through 'z', and suffix.

    Args:
        prefix: text printed before the first fragment.
        suffix: text printed after the closing parentheses.
        buckets: dict mapping a first letter to its Bucket; missing letters
            are created (empty) by getBucket().
        isWebUrl: forwarded to Bucket.dump(); also selects the closing text.
    """
    # Pick the opening and closing fragments from the buckets that actually
    # hold entries. Tying them to 'a' and 'z' would produce an unopened or
    # unterminated Java literal whenever one of those letters has no entry.
    populated = []
    for code in range(ord('a'), ord('z') + 1):
        bucket = getBucket(buckets, chr(code))
        if bucket.words or bucket.letters:
            populated.append(bucket)

    pieces = [prefix]
    final = len(populated) - 1
    for index, bucket in enumerate(populated):
        pieces.append(bucket.dump(isWebUrl=isWebUrl,
                                  isFirst=(index == 0),
                                  isLast=(index == final)))

    # The last fragment leaves its string literal open. The WEB_URL form
    # closes two groups and the literal here; the plain form closes one group
    # and relies on suffix for the closing quote.
    if isWebUrl:
        pieces.append('))"')
    else:
        pieces.append(')')

    pieces.append(suffix)

    # Parenthesised single argument: valid as a Python 2 print statement and
    # as a Python 3 function call.
    print(''.join(pieces))
129
if __name__ == "__main__":
    # Download the IANA list, one entry per line.
    f = urlopen('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')
    try:
        domains = f.readlines()
    finally:
        # Close the connection even if the read fails part-way.
        f.close()

    buckets = {}

    for domain in domains:
        # Strip before testing for emptiness so blank or indented lines are
        # not filed under a whitespace key.
        domain = domain.strip().lower()

        if domain:
            # Bucket.add() discards '#' comment lines.
            getBucket(buckets, domain[0]).add(domain)

    # Emit both Java constants from the same set of buckets.
    makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
    makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)