Shimeng (Simon) Wang | 56811ab | 2010-02-10 11:22:01 -0800 | [diff] [blame^] | 1 | #!/usr/bin/env python |
| 2 | |
| 3 | from urllib2 import urlopen |
| 4 | |
# Java source emitted ahead of the generated TLD alternation: a Javadoc header
# followed by the opening of the TOP_LEVEL_DOMAIN Pattern.compile( call.
# Kept as a raw string so the text is written out exactly as typed.
TLD_PREFIX = r"""
/**
* Regular expression pattern to match all IANA top-level domains.
* List accurate as of 2010/02/05. List taken from:
* http://data.iana.org/TLD/tlds-alpha-by-domain.txt
* This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
*/
public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile(
"""
# Emitted after the alternation: ends the Java string literal and closes the
# Pattern.compile( call opened by TLD_PREFIX.
TLD_SUFFIX = '");'
| 15 | |
# Java source emitted ahead of the generated TLD alternation for WEB_URL: a
# Javadoc header, the scheme / user-info part of the regex, and the start of
# the host group that the alternation is spliced into.  Kept as a raw string
# so the doubled backslashes reach the Java source unchanged.
URL_PREFIX = r"""
/**
* Regular expression pattern to match RFC 1738 URLs
* List accurate as of 2010/02/05. List taken from:
* http://data.iana.org/TLD/tlds-alpha-by-domain.txt
* This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
*/
public static final Pattern WEB_URL = Pattern.compile(
"((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
+ "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
+ "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
+ "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+" // named host
+ "(?:" // plus top level domain
"""
| 30 | |
# Java source emitted after the generated TLD alternation for WEB_URL.  Per
# the inline Java comments it supplies the IP-address alternative to a named
# host, the port, the path/query part, and a trailing word boundary, and it
# closes the Pattern.compile( call.  Kept as a raw string so the doubled
# backslashes reach the Java source unchanged.
URL_SUFFIX = r"""
+ "|(?:(?:25[0-5]|2[0-4]" // or ip address
+ "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]"
+ "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]"
+ "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
+ "|[1-9][0-9]|[0-9])))"
+ "(?:\\:\\d{1,5})?)" // plus option port number
+ "(\\/(?:(?:[a-zA-Z0-9\\;\\/\\?\\:\\@\\&\\=\\#\\~" // plus option query params
+ "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?"
+ "(?:\\b|$)"); // and finally, a word boundary or end of
// input. This is to stop foo.sure from
// matching as foo.su
"""
| 44 | |
class Bucket(object):
    """All top-level domains that share one leading letter.

    Two-character domains are stored as just their second character in
    ``letters`` so they can be emitted compactly as ``<base>[...]``; every
    other domain is stored whole in ``words``.
    """

    def __init__(self, baseLetter):
        """Create an empty bucket for domains starting with *baseLetter*."""
        self.base = baseLetter
        self.words = []    # complete domains whose length is not 2
        self.letters = []  # second characters of the two-character domains

    def dump(self, isWebUrl=False, isFirst=False, isLast=False):
        """Return this bucket as one line of a Java string concatenation.

        Args:
            isWebUrl: emit a non-capturing ``(?:`` group instead of ``(`` and,
                for the first line, omit the extra opening parenthesis.
            isFirst: this is the first line of the alternation, so it is not
                prefixed with the ``|`` separator.
            isLast: leave the Java string literal open (no closing quote and
                no newline) so the caller can append the closing text.

        Returns:
            The formatted line, or ``''`` when the bucket holds no domains.
            ``words`` and ``letters`` are sorted in place as a side effect.
        """
        if not self.words and not self.letters:
            return ''

        self.words.sort()
        self.letters.sort()

        if not isFirst:
            opener = '+ "|'
        elif isWebUrl:
            opener = '+ "'
        else:
            opener = '"('

        # A '-' is preceded by two backslash characters so that the Java
        # string literal yields a regex-escaped hyphen.
        alternatives = [word.replace('-', '\\\\-') for word in self.words]

        if len(self.letters) == 1:
            alternatives.append(self.base + self.letters[0])
        elif self.letters:
            alternatives.append(
                '%s[%s]' % (self.base, ''.join(self.letters)))

        body = '|'.join(alternatives)

        # Grouping parentheses are only emitted when whole words are present.
        if self.words:
            if isWebUrl:
                body = '(?:' + body + ')'
            else:
                body = '(' + body + ')'

        # NOTE(review): the leading indent literal is reproduced as a single
        # space; confirm it matches the indentation wanted in the Java output.
        output = ' ' + opener + body

        if not isLast:
            output += '"\n'

        return output

    def add(self, line):
        """Record one domain; empty lines and ``#`` comment lines are ignored.

        *line* is expected to be already stripped and to start with this
        bucket's base letter (the caller selects the bucket by first letter).
        """
        if not line or line.startswith('#'):
            return

        if len(line) == 2:
            self.letters.append(line[1])
        else:
            self.words.append(line)
| 116 | |
def getBucket(buckets, line):
    """Return the bucket keyed by the first character of *line*.

    *buckets* maps a single character to its Bucket.  When no bucket is
    registered for that character yet, a new one is created, stored in
    *buckets* and returned.
    """
    initial = line[0]
    existing = buckets.get(initial)

    if existing is not None:
        return existing

    created = Bucket(initial)
    buckets[initial] = created
    return created
| 126 | |
def makePattern(prefix, suffix, buckets, isWebUrl=False):
    """Print one generated Java pattern declaration to standard output.

    The output is *prefix*, then one line per non-empty bucket for the
    letters 'a' through 'z', then the closing parentheses, then *suffix*.

    Args:
        prefix: Java source emitted before the TLD alternation.
        suffix: Java source emitted after the TLD alternation.
        buckets: dict mapping a first letter to its Bucket; a missing letter
            gets an empty bucket added by getBucket().
        isWebUrl: format the alternation for the WEB_URL pattern
            (non-capturing groups, two closing parentheses) rather than for
            TOP_LEVEL_DOMAIN.
    """
    candidates = [getBucket(buckets, chr(code))
                  for code in range(ord('a'), ord('z') + 1)]

    # Only buckets that will actually produce a line take part, so the
    # first/last markers land on real output even if 'a' or 'z' is empty.
    populated = [bucket for bucket in candidates
                 if bucket.words or bucket.letters]

    pieces = [prefix]
    final = len(populated) - 1

    for position, bucket in enumerate(populated):
        pieces.append(bucket.dump(isWebUrl=isWebUrl,
                                  isFirst=(position == 0),
                                  isLast=(position == final)))

    # The last bucket line leaves its string literal open; close the groups
    # here (the WEB_URL form also ends the literal before URL_SUFFIX).
    if isWebUrl:
        pieces.append('))"')
    else:
        pieces.append(')')

    pieces.append(suffix)

    # Single-argument call form behaves the same under Python 2 and 3.
    print(''.join(pieces))
| 145 | |
if __name__ == "__main__":
    # Download the current IANA TLD list, one domain per line.
    f = urlopen('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')
    try:
        domains = f.readlines()
    finally:
        # Release the connection even if the read fails.
        f.close()

    buckets = {}

    for domain in domains:
        # Strip before testing for emptiness: raw lines still carry their
        # newline, so an unstripped line is never empty.
        domain = domain.strip().lower()

        if not domain or domain.startswith('#'):
            continue

        getBucket(buckets, domain[0]).add(domain)

    makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
    makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)