Added the 7 new top level domains, and reworded the nameorgs output.

Not sure this is better in all cases.

parse(): Fixed a bug in the output; the dict is referred to in the
code as `countries' not `country'.  Also exempted the string "U.S."
from case normalization, since the Virgin Islands name no longer wraps
it in parentheses.

main(): Fixed the argument parsing to agree with the docstring, i.e.
--outputdict instead of --output.

In the module docstring:

- updated my email address
- we don't need to explain about Python 1.5 regexps <wink>

We also don't need to wrap the import of re with a try/except.

Other style fixes:

- untabification
- revert to the <> inequality operator everywhere (and consistently)
This commit is contained in:
Barry Warsaw 2002-06-07 15:48:52 +00:00
parent 9e9d4f8ed8
commit aef8371acb
1 changed file with 109 additions and 112 deletions

View File

@ -3,7 +3,7 @@
"""world -- Print mappings between country names and DNS country codes. """world -- Print mappings between country names and DNS country codes.
Contact: Barry Warsaw Contact: Barry Warsaw
Email: bwarsaw@python.org Email: barry@python.org
Version: %(__version__)s Version: %(__version__)s
This script will take a list of Internet addresses and print out where in the This script will take a list of Internet addresses and print out where in the
@ -14,9 +14,9 @@ code found in the address. Addresses can be in any of the following forms:
host.domain.xx -- any Internet host or network name host.domain.xx -- any Internet host or network name
somebody@where.xx -- an Internet email address somebody@where.xx -- an Internet email address
If no match is found, the address is interpreted as a regular expression [*] If no match is found, the address is interpreted as a regular expression and a
and a reverse lookup is attempted. This script will search the country names reverse lookup is attempted. This script will search the country names and
and print a list of matching entries. You can force reverse mappings with the print a list of matching entries. You can force reverse mappings with the
`-r' flag (see below). `-r' flag (see below).
For example: For example:
@ -34,10 +34,6 @@ For example:
tz: Tanzania, United Republic of tz: Tanzania, United Republic of
gb: United Kingdom gb: United Kingdom
[*] Note that regular expressions must conform to Python 1.5's re.py module
syntax. The comparison is done with the search() method.
Country codes are maintained by the RIPE Network Coordination Centre, Country codes are maintained by the RIPE Network Coordination Centre,
in coordination with the ISO 3166 Maintenance Agency at DIN Berlin. The in coordination with the ISO 3166 Maintenance Agency at DIN Berlin. The
authoritative source of country code mappings is: authoritative source of country code mappings is:
@ -69,7 +65,7 @@ Usage: %(PROGRAM)s [-d] [-p file] [-o] [-h] addr [addr ...]
When used in conjunction with the `-p' option, output is in the form When used in conjunction with the `-p' option, output is in the form
of a Python dictionary, and country names are normalized of a Python dictionary, and country names are normalized
w.r.t. capitalization. This makes it appropriate for cutting and w.r.t. capitalization. This makes it appropriate for cutting and
pasting back into this file. pasting back into this file. Output is always to standard out.
--reverse --reverse
-r -r
@ -82,18 +78,13 @@ Usage: %(PROGRAM)s [-d] [-p file] [-o] [-h] addr [addr ...]
-h -h
--help --help
Print this message. Print this message.
""" """
__version__ = '$Revision$' __version__ = '$Revision$'
import sys import sys
import getopt import getopt
try: import re
import re
except ImportError:
print sys.argv[0], 'requires Python 1.5'
sys.exit(1)
PROGRAM = sys.argv[0] PROGRAM = sys.argv[0]
@ -110,22 +101,18 @@ def usage(code, msg=''):
def resolve(rawaddr): def resolve(rawaddr):
parts = rawaddr.split('.') parts = rawaddr.split('.')
if not len(parts): if not len(parts):
# no top level domain found, bounce it to the next step # no top level domain found, bounce it to the next step
return rawaddr return rawaddr
addr = parts[-1] addr = parts[-1]
if nameorgs.has_key(addr): if nameorgs.has_key(addr):
if nameorgs[addr][0].lower() in 'aeiou': print rawaddr, 'is in the', nameorgs[addr], 'top level domain'
ana = 'an' return None
else:
ana = 'a'
print rawaddr, 'is from', ana, nameorgs[addr], 'organization'
return None
elif countries.has_key(addr): elif countries.has_key(addr):
print rawaddr, 'originated from', countries[addr] print rawaddr, 'originated from', countries[addr]
return None return None
else: else:
# Not resolved, bounce it to the next step # Not resolved, bounce it to the next step
return rawaddr return rawaddr
@ -133,82 +120,83 @@ def reverse(regexp):
matches = [] matches = []
cre = re.compile(regexp, re.IGNORECASE) cre = re.compile(regexp, re.IGNORECASE)
for code, country in all.items(): for code, country in all.items():
mo = cre.search(country) mo = cre.search(country)
if mo: if mo:
matches.append(code) matches.append(code)
# print results # print results
if not matches: if not matches:
# not resolved, bounce it to the next step # not resolved, bounce it to the next step
return regexp return regexp
if len(matches) == 1: if len(matches) == 1:
code = matches[0] code = matches[0]
print regexp, "matches code `%s', %s" % (code, all[code]) print regexp, "matches code `%s', %s" % (code, all[code])
else: else:
print regexp, 'matches %d countries:' % len(matches) print regexp, 'matches %d countries:' % len(matches)
for code in matches: for code in matches:
print " %s: %s" % (code, all[code]) print " %s: %s" % (code, all[code])
return None return None
def parse(file, normalize): def parse(file, normalize):
try: try:
fp = open(file) fp = open(file)
except IOError, (err, msg): except IOError, (err, msg):
print msg, ':', file print msg, ':', file
cre = re.compile('(.*?)[ \t]+([A-Z]{2})[ \t]+[A-Z]{3}[ \t]+[0-9]{3}') cre = re.compile('(.*?)[ \t]+([A-Z]{2})[ \t]+[A-Z]{3}[ \t]+[0-9]{3}')
scanning = 0 scanning = 0
if normalize: if normalize:
print 'country = {' print 'countries = {'
while 1: while 1:
line = fp.readline() line = fp.readline()
if line == '': if line == '':
break # EOF break # EOF
if scanning: if scanning:
mo = cre.match(line) mo = cre.match(line)
if not mo: if not mo:
line = line.strip() line = line.strip()
if not line: if not line:
continue continue
elif line[0] == '-': elif line[0] == '-':
break break
else: else:
print 'Could not parse line:', line print 'Could not parse line:', line
continue continue
country, code = mo.group(1, 2) country, code = mo.group(1, 2)
if normalize: if normalize:
words = country.split() words = country.split()
for i in range(len(words)): for i in range(len(words)):
w = words[i] w = words[i]
# XXX special cases # XXX special cases
if w in ('AND', 'OF', 'OF)', 'name:', 'METROPOLITAN'): if w in ('AND', 'OF', 'OF)', 'name:', 'METROPOLITAN'):
words[i] = w.lower() words[i] = w.lower()
elif w == 'THE' and i != 1: elif w == 'THE' and i <> 1:
words[i] = w.lower() words[i] = w.lower()
elif len(w) > 3 and w[1] == "'": elif len(w) > 3 and w[1] == "'":
words[i] = w[0:3].upper() + w[3:].lower() words[i] = w[0:3].upper() + w[3:].lower()
elif w == '(U.S.)': elif w in ('(U.S.)', 'U.S.'):
pass pass
elif w[0] == '(' and w != '(local': elif w[0] == '(' and w <> '(local':
words[i] = '(' + w[1:].capitalize() words[i] = '(' + w[1:].capitalize()
elif w.find('-') != -1: elif w.find('-') <> -1:
words[i] = '-'.join([s.capitalize() for s in w.split('-')]) words[i] = '-'.join(
else: [s.capitalize() for s in w.split('-')])
words[i] = w.capitalize() else:
code = code.lower() words[i] = w.capitalize()
country = ' '.join(words) code = code.lower()
print ' "%s": "%s",' % (code, country) country = ' '.join(words)
else: print ' "%s": "%s",' % (code, country)
print code, country else:
print code, country
elif line[0] == '-':
scanning = 1 elif line[0] == '-':
scanning = 1
if normalize: if normalize:
print ' }' print ' }'
def main(): def main():
@ -228,53 +216,62 @@ def main():
usage(1, msg) usage(1, msg)
for opt, arg in opts: for opt, arg in opts:
if opt in ('-h', '--help'): if opt in ('-h', '--help'):
help = 1 help = 1
elif opt in ('-d', '--dump'): elif opt in ('-d', '--dump'):
dump = 1 dump = 1
elif opt in ('-p', '--parse'): elif opt in ('-p', '--parse'):
parsefile = arg parsefile = arg
elif opt in ('-o', '--output'): elif opt in ('-o', '--outputdict'):
normalize = 1 normalize = 1
elif opt in ('-r', '--reverse'): elif opt in ('-r', '--reverse'):
forcerev = 1 forcerev = 1
if help: if help:
usage(status) usage(status)
if dump: if dump:
print 'Non-geographic domains:' print 'Non-geographic domains:'
codes = nameorgs.keys() codes = nameorgs.keys()
codes.sort() codes.sort()
for code in codes: for code in codes:
print ' %4s:' % code, nameorgs[code] print ' %4s:' % code, nameorgs[code]
print '\nCountry coded domains:' print '\nCountry coded domains:'
codes = countries.keys() codes = countries.keys()
codes.sort() codes.sort()
for code in codes: for code in codes:
print ' %2s:' % code, countries[code] print ' %2s:' % code, countries[code]
elif parsefile: elif parsefile:
parse(parsefile, normalize) parse(parsefile, normalize)
else: else:
if not forcerev: if not forcerev:
args = filter(None, map(resolve, args)) args = filter(None, map(resolve, args))
args = filter(None, map(reverse, args)) args = filter(None, map(reverse, args))
for arg in args: for arg in args:
print 'Where in the world is %s?' % arg print 'Where in the world is %s?' % arg
# The mappings # The mappings
nameorgs = { nameorgs = {
# New top level domains as described by ICANN
# http://www.icann.org/tlds/
"aero": "air-transport industry",
"arpa": "Arpanet", "arpa": "Arpanet",
"biz": "business",
"com": "commercial", "com": "commercial",
"coop": "cooperatives",
"edu": "educational", "edu": "educational",
"gov": "government", "gov": "government",
"info": "unrestricted `info'",
"int": "international",
"mil": "military", "mil": "military",
"museum": "museums",
"name": "`name' (for registration by individuals)",
"net": "networking", "net": "networking",
"org": "non-commercial", "org": "non-commercial",
"int": "international", "pro": "professionals",
# This isn't in the same class as those above, but is included here # This isn't in the same class as those above, but is included here
# because `uk' is the common practice country code for the United Kingdom. # because `uk' is the common practice country code for the United Kingdom.
# AFAICT, the official `gb' code is routinely ignored! # AFAICT, the official `gb' code is routinely ignored!
@ -525,7 +522,7 @@ countries = {
"ve": "Venezuela", "ve": "Venezuela",
"vn": "Viet Nam", "vn": "Viet Nam",
"vg": "Virgin Islands, British", "vg": "Virgin Islands, British",
"vi": "Virgin Islands, U.s.", "vi": "Virgin Islands, U.S.",
"wf": "Wallis and Futuna", "wf": "Wallis and Futuna",
"eh": "Western Sahara", "eh": "Western Sahara",
"ye": "Yemen", "ye": "Yemen",