Blame - common/tools/make-iana-tld-pattern.py - platform_frameworks_native

blob: ece4dcfea17e0a7365873246b9e2a6062b48df85 [file] [log] [blame]

Shimeng (Simon) Wang	56811ab	2010-02-10 11:22:01 -0800	[diff] [blame^]	1	#!/usr/bin/env python
				2
try:
    from urllib2 import urlopen  # Python 2
except ImportError:
    from urllib.request import urlopen  # Python 3
				4
				5	TLD_PREFIX = r"""
				6	/**
				7	* Regular expression pattern to match all IANA top-level domains.
				8	* List accurate as of 2010/02/05. List taken from:
				9	* http://data.iana.org/TLD/tlds-alpha-by-domain.txt
				10	* This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
				11	*/
				12	public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile(
				13	"""
				14	TLD_SUFFIX = '");'
				15
				16	URL_PREFIX = r"""
				17	/**
				18	* Regular expression pattern to match RFC 1738 URLs
				19	* List accurate as of 2010/02/05. List taken from:
				20	* http://data.iana.org/TLD/tlds-alpha-by-domain.txt
				21	* This pattern is auto-generated by frameworkds/base/common/tools/make-iana-tld-pattern.py
				22	*/
				23	public static final Pattern WEB_URL = Pattern.compile(
				24	"((?:(http\|https\|Http\|Https\|rtsp\|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\$\$"
				25	+ "\\,\\;\\?\\&\\=]\|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
				26	+ "\\.\\+\\!\\*\\'\$\$\\,\\;\\?\\&\\=]\|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
				27	+ "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+" // named host
				28	+ "(?:" // plus top level domain
				29	"""
				30
				31	URL_SUFFIX = r"""
				32	+ "\|(?:(?:25[0-5]\|2[0-4]" // or ip address
				33	+ "[0-9]\|[0-1][0-9]{2}\|[1-9][0-9]\|[1-9])\\.(?:25[0-5]\|2[0-4][0-9]"
				34	+ "\|[0-1][0-9]{2}\|[1-9][0-9]\|[1-9]\|0)\\.(?:25[0-5]\|2[0-4][0-9]\|[0-1]"
				35	+ "[0-9]{2}\|[1-9][0-9]\|[1-9]\|0)\\.(?:25[0-5]\|2[0-4][0-9]\|[0-1][0-9]{2}"
				36	+ "\|[1-9][0-9]\|[0-9])))"
				37	+ "(?:\\:\\d{1,5})?)" // plus option port number
				38	+ "(\\/(?:(?:[a-zA-Z0-9\\;\\/\\?\\:\\@\\&\\=\\#\\~" // plus option query params
				39	+ "\\-\\.\\+\\!\\\\'\$\$\\,\\_])\|(?:\\%[a-fA-F0-9]{2})))?"
				40	+ "(?:\\b\|$)"); // and finally, a word boundary or end of
				41	// input. This is to stop foo.sure from
				42	// matching as foo.su
				43	"""
				44
				45	class Bucket:
				46	def __init__(self, baseLetter):
				47	self.base=baseLetter
				48	self.words=[]
				49	self.letters=[]
				50
				51	def dump(self, isWebUrl=False, isFirst=False, isLast=False):
				52	if (len(self.words) == 0) and (len(self.letters) == 0):
				53	return ''
				54
				55	self.words.sort()
				56	self.letters.sort()
				57
				58	output = ' ';
				59
				60	if isFirst:
				61	if isWebUrl:
				62	output += '+ "'
				63	else:
				64	output += '"('
				65	else:
				66	output += '+ "\|'
				67
				68	if len(self.words) != 0:
				69	output += '('
				70
				71	if isWebUrl:
				72	output += '?:'
				73
				74	firstWord = 1
				75	for word in self.words:
				76	if firstWord == 0:
				77	output += '\|'
				78	firstWord = 0
				79	for letter in word:
				80	if letter == '-':
				81	output += '\\\\' # escape the '-' character.
				82	output += letter
				83
				84	if len(self.words) > 0 and len(self.letters) > 0:
				85	output += '\|'
				86
				87	if len(self.letters) == 1:
				88	output += '%c%c' % (self.base, self.letters[0])
				89	elif len(self.letters) > 0:
				90	output += '%c[' % self.base
				91
				92	for letter in self.letters:
				93	output += letter
				94
				95	output += ']'
				96
				97	if len(self.words) != 0:
				98	output += ')'
				99
				100	if not isLast:
				101	output += '"'
				102	output += '\n'
				103
				104	return output;
				105
				106	def add(self, line):
				107	length = len(line)
				108
				109	if line.startswith('#') or (length == 0):
				110	return;
				111
				112	if length == 2:
				113	self.letters.append(line[1:2])
				114	else:
				115	self.words.append(line)
				116
				117	def getBucket(buckets, line):
				118	letter = line[0]
				119	bucket = buckets.get(letter)
				120
				121	if bucket is None:
				122	bucket = Bucket(letter)
				123	buckets[letter] = bucket
				124
				125	return bucket
				126
				127	def makePattern(prefix, suffix, buckets, isWebUrl=False):
				128	output = prefix
				129
				130	output += getBucket(buckets, 'a').dump(isFirst=True, isWebUrl=isWebUrl)
				131
				132	for letter in range(ord('b'), ord('z')):
				133	output += getBucket(buckets, chr(letter)).dump(isWebUrl=isWebUrl)
				134
				135	output += getBucket(buckets, 'z').dump(isLast=True, isWebUrl=isWebUrl)
				136
				137	if isWebUrl:
				138	output += '))"'
				139	else:
				140	output += ')'
				141
				142	output += suffix
				143
				144	print output
				145
				146	if __name__ == "__main__":
				147	f = urlopen('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')
				148	domains = f.readlines()
				149	f.close()
				150
				151	buckets = {}
				152
				153	for domain in domains:
				154	domain = domain.lower()
				155
				156	if len(domain) > 0:
				157	getBucket(buckets, domain[0]).add(domain.strip())
				158
				159	makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
				160	makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)