#!/usr/bin/env python

try:
    from urllib2 import urlopen  # Python 2
except ImportError:
    from urllib.request import urlopen  # Python 3

# Java source emitted ahead of the generated TLD alternation: a Javadoc header
# followed by the opening of the TOP_LEVEL_DOMAIN Pattern.compile( call.
TLD_PREFIX = r"""
 /**
 * Regular expression pattern to match all IANA top-level domains.
 * List accurate as of 2010/02/05. List taken from:
 * http://data.iana.org/TLD/tlds-alpha-by-domain.txt
 * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
 */
 public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile(
"""
# Closes the Java string literal left open by the last bucket, then the
# Pattern.compile( call itself.
TLD_SUFFIX = '");'

# Java source emitted ahead of the generated TLD alternation inside WEB_URL:
# a Javadoc header, the optional scheme/user-info part and the named-host
# part of the regex, ending with the group that the TLD list is written into.
URL_PREFIX = r"""
 /**
 * Regular expression pattern to match RFC 1738 URLs
 * List accurate as of 2010/02/05. List taken from:
 * http://data.iana.org/TLD/tlds-alpha-by-domain.txt
 * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
 */
 public static final Pattern WEB_URL = Pattern.compile(
 "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
 + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
 + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
 + "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+" // named host
 + "(?:" // plus top level domain
"""

# Java source emitted after the generated TLD alternation inside WEB_URL:
# the dotted-quad IP alternative, the optional port, the optional path/query
# part, and the trailing word-boundary-or-end-of-input guard that closes the
# Pattern.compile( call.
URL_SUFFIX = r"""
 + "|(?:(?:25[0-5]|2[0-4]" // or ip address
 + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]"
 + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]"
 + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
 + "|[1-9][0-9]|[0-9])))"
 + "(?:\\:\\d{1,5})?)" // plus option port number
 + "(\\/(?:(?:[a-zA-Z0-9\\;\\/\\?\\:\\@\\&\\=\\#\\~" // plus option query params
 + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?"
 + "(?:\\b|$)"); // and finally, a word boundary or end of
 // input. This is to stop foo.sure from
 // matching as foo.su
"""

class Bucket:
    """Entries sharing one first letter, rendered as one regex alternative.

    Two-character entries are stored as just their second character so they
    can be emitted compactly as ``<base>[...]``; every other entry is stored
    whole.
    """

    def __init__(self, baseLetter):
        self.base = baseLetter  # first letter shared by the entries
        self.words = []         # entries kept whole (length other than 2)
        self.letters = []       # second characters of two-character entries

    def dump(self, isWebUrl=False, isFirst=False, isLast=False):
        """Return this bucket as one piece of a Java string concatenation.

        isWebUrl -- group the words with a non-capturing ``(?:`` instead of
                    ``(``; a first bucket then starts with ``+ "`` rather
                    than ``"(``.
        isFirst  -- emit the opening form instead of the ``+ "|`` continuation.
        isLast   -- leave the Java string literal open (no closing quote and
                    newline) so the caller can append the closing text.

        Returns '' when the bucket holds nothing.  The stored words and
        letters are sorted in place as a side effect.
        """
        if not self.words and not self.letters:
            return ''

        self.words.sort()
        self.letters.sort()

        if isFirst:
            opener = '+ "' if isWebUrl else '"('
        else:
            opener = '+ "|'
        pieces = [' ', opener]

        # The group is only needed when whole words are present.
        if self.words:
            pieces.append('(?:' if isWebUrl else '(')

        # Each '-' is preceded by two backslash characters so that it is
        # still escaped after the Java compiler unescapes the literal.
        alternatives = [word.replace('-', '\\\\-') for word in self.words]

        if len(self.letters) == 1:
            alternatives.append('%c%c' % (self.base, self.letters[0]))
        elif self.letters:
            alternatives.append('%c[%s]' % (self.base, ''.join(self.letters)))

        pieces.append('|'.join(alternatives))

        if self.words:
            pieces.append(')')

        if not isLast:
            pieces.append('"\n')

        return ''.join(pieces)

    def add(self, line):
        """Record one entry; empty lines and lines starting with '#' are ignored."""
        if not line or line.startswith('#'):
            return

        if len(line) == 2:
            # Only the second character is kept; dump() re-adds self.base.
            self.letters.append(line[1])
        else:
            self.words.append(line)

def getBucket(buckets, line):
    """Return the Bucket keyed by the first character of line.

    buckets -- dict mapping a single character to its Bucket; a new Bucket is
               created and stored when the character has none yet (or maps
               to None).
    line    -- non-empty string; indexing an empty string raises IndexError.
    """
    letter = line[0]
    bucket = buckets.get(letter)

    if bucket is None:
        bucket = buckets[letter] = Bucket(letter)

    return bucket

def makePattern(prefix, suffix, buckets, isWebUrl=False):
    """Print (and return) prefix + the a-z bucket alternation + suffix.

    prefix, suffix -- literal text placed before and after the alternation.
    buckets        -- dict mapping a letter to a Bucket-like object with
                      ``words``, ``letters`` and ``dump()``; it is not
                      modified.
    isWebUrl       -- passed through to every dump(); also selects the
                      closing text (``))"`` instead of ``)``).

    The first and last *non-empty* buckets receive isFirst / isLast, so the
    opening and closing forms are still emitted when no entry starts with
    'a' or 'z'.
    """
    populated = []
    for code in range(ord('a'), ord('z') + 1):
        bucket = buckets.get(chr(code))
        if bucket is not None and (bucket.words or bucket.letters):
            populated.append(bucket)

    pieces = [prefix]
    lastIndex = len(populated) - 1
    for index, bucket in enumerate(populated):
        pieces.append(bucket.dump(isWebUrl=isWebUrl,
                                  isFirst=(index == 0),
                                  isLast=(index == lastIndex)))

    # The last bucket leaves the Java string literal open; close the groups
    # (and, for the web URL form, the literal) here.
    pieces.append('))"' if isWebUrl else ')')
    pieces.append(suffix)
    output = ''.join(pieces)

    # Single-argument call form: valid under both Python 2 and Python 3.
    print(output)
    return output

if __name__ == "__main__":
    # Fetch the IANA list; the handle is closed even if the read fails.
    f = urlopen('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')
    try:
        domains = f.readlines()
    finally:
        f.close()

    buckets = {}

    for domain in domains:
        # Under Python 3 urlopen yields bytes; work on text throughout.
        if not isinstance(domain, str):
            domain = domain.decode('utf-8')

        # Strip before choosing the bucket so the key is the entry's real
        # first character, and skip blank/comment lines without creating
        # throwaway buckets for them.
        domain = domain.strip().lower()

        if domain and not domain.startswith('#'):
            getBucket(buckets, domain[0]).add(domain)

    makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
    makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)