cpython/Tools/world/world

#! /usr/bin/env python

"""Print mappings between country names and DNS country codes.

This script will take a list of Internet addresses and print out where in the
world those addresses originate from, based on the top-level domain country
code found in the address.  Addresses can be in any of the following forms:

    xx                -- just the country code or top-level domain identifier
    host.domain.xx    -- any Internet host or network name
    somebody@where.xx -- an Internet email address

If no match is found, the address is interpreted as a regular expression [*]
and a reverse lookup is attempted.  This script will search the country names
and print a list of matching entries.  You can force reverse mappings with
the `-r' flag (see below).

For example:

    %% world tz us
    tz originated from Tanzania, United Republic of
    us originated from United States

    %% world united
    united matches 6 countries:
        ae: United Arab Emirates
        uk: United Kingdom (common practice)
        um: United States Minor Outlying Islands
        us: United States
        tz: Tanzania, United Republic of
        gb: United Kingdom


 [*] Note that regular expressions must conform to Python 1.5's re.py module
 syntax.  The comparison is done with the search() method.

Country codes are maintained by the RIPE Network Coordination Centre,
in coordination with the ISO 3166 Maintenance Agency at DIN Berlin.  The
authoritative source of country code mappings is:

    <url:ftp://info.ripe.net/iso3166-countrycodes>

The latest known change to this information was:

    Thu Aug  7 17:59:51 MET DST 1997

This script also knows about non-geographic top-level domains.


Usage: %s [-d] [-p file [-o]] [-r] [-h] addr [addr ...]

    --dump
    -d
        Print mapping of all top-level domains.

    --parse file
    -p file
        Parse an iso3166-countrycodes file extracting the two letter country
        code followed by the country name.  Note that the three letter country
        code and number, which are also provided in the standard format file,
        are ignored.

    --outputdict
    -o
        When used in conjunction with the `-p' option, output is in the form
        of a Python dictionary, and country names are normalized
        w.r.t. capitalization.  This makes it appropriate for cutting and
        pasting back into this file.

    --reverse
    -r
        Force reverse lookup.  In this mode the address can be any Python
        regular expression; this is matched against all country names and a
        list of matching mappings is printed.  In normal mode (i.e. without
        this flag), reverse lookup is performed on addresses if no matching
        country code is found.

    -h
    --help
        Print this message.

"""
# Module metadata.  The version string is a literal `$Revision$' keyword
# (presumably meant to be expanded by the version control system -- confirm).
__version__ = '$Revision$'
__author__ = 'Barry Warsaw <bwarsaw@python.org>'
__source__ = '<url:http://www.python.org/~bwarsaw/pyware/>'


import sys
import string
import getopt
try:
    import re
except ImportError:
    print sys.argv[0], 'requires Python 1.5'
    sys.exit(1)


def usage(status=0):
    """Print the module docstring as help text, then exit.

    The docstring carries one `%s' placeholder, which is filled in with the
    program name from sys.argv[0].

    status -- exit code handed to sys.exit(); defaults to 0.
    """
    helptext = __doc__ % sys.argv[0]
    print(helptext)
    sys.exit(status)


def resolve(rawaddr):
    """Report where an address comes from, based on its top-level domain.

    rawaddr may be a bare code (`xx'), a host or network name
    (`host.domain.xx') or an email address (`somebody@where.xx').  Only the
    text after the last dot is looked up, first in `nameorgs' and then in
    `countries'.

    On a hit one descriptive line is printed and None is returned.  On a
    miss nothing is printed and rawaddr is returned unchanged, so that the
    caller can bounce it to the next lookup step.
    """
    # Splitting always yields at least one piece, so for an address without
    # any dot the last piece is the address itself.  Domain names are
    # case-insensitive while both tables are keyed in lower case, hence the
    # lower() before the lookup.
    addr = rawaddr.split('.')[-1].lower()
    if addr in nameorgs:
        org = nameorgs[addr]
        # Choose the indefinite article that fits the description.
        if org[0].lower() in 'aeiou':
            ana = 'an'
        else:
            ana = 'a'
        print('%s is from %s %s organization' % (rawaddr, ana, org))
        return None
    if addr in countries:
        print('%s originated from %s' % (rawaddr, countries[addr]))
        return None
    # Not resolved, bounce it to the next step
    return rawaddr


def reverse(regexp):
    """Search all known names for a regular expression and print the hits.

    regexp is compiled case-insensitively and matched with search() against
    every value of the module-level `all' table (non-geographic domains
    plus country codes).

    If exactly one entry matches, a single line naming the code is printed;
    if several match, a header line and one indented line per entry are
    printed.  In both cases None is returned.  If nothing matches, or if
    regexp is not a valid regular expression, nothing is printed and regexp
    is returned so that the caller can report it as unresolved.
    """
    try:
        cre = re.compile(regexp, re.IGNORECASE)
    except re.error:
        # A pattern that does not compile cannot match anything; treat it as
        # unresolved instead of aborting the whole run with a traceback.
        return regexp
    matches = [code for code, country in all.items() if cre.search(country)]
    if not matches:
        # not resolved, bounce it to the next step
        return regexp
    if len(matches) == 1:
        code = matches[0]
        print("%s matches code `%s', %s" % (regexp, code, all[code]))
    else:
        print('%s matches %d countries:' % (regexp, len(matches)))
        for code in matches:
            print("    %s: %s" % (code, all[code]))
    return None


def _normalize_country(country):
    """Return a country name re-capitalized word by word for the tables."""
    words = country.split()
    for i, w in enumerate(words):
        # XXX special cases
        if w in ('AND', 'OF', 'OF)', 'name:', 'METROPOLITAN'):
            words[i] = w.lower()
        elif w == 'THE' and i != 1:
            # As the second word (e.g. "CONGO, THE ...") it stays
            # capitalized; anywhere else it is lower-cased.
            words[i] = w.lower()
        elif len(w) > 3 and w[1] == "'":
            # e.g. D'IVOIRE -> D'Ivoire
            words[i] = w[0:3].upper() + w[3:].lower()
        elif w == '(U.S.)':
            pass
        elif w[0] == '(' and w != '(local':
            words[i] = '(' + w[1:].capitalize()
        else:
            # Capitalize each hyphen-separated part.  For a word without a
            # hyphen this is the same as capitalizing the whole word.
            words[i] = '-'.join([part.capitalize() for part in w.split('-')])
    return ' '.join(words)


def parse(file, normalize):
    """Extract country codes and names from an iso3166-countrycodes file.

    file      -- name of the file to read.
    normalize -- if true, print the entries as the body of a Python
                 dictionary with lower-cased codes and re-capitalized
                 names; otherwise print one `code country' pair per line.

    Everything up to the first line starting with `-' is skipped.  After
    that, each line is expected to hold a country name, a two letter code,
    a three letter code and a three digit number; only the first two are
    used.  Blank lines are ignored, other lines that do not fit are
    reported with a `Could not parse line:' message, and a second line
    starting with `-' ends the scan.

    If the file cannot be opened, the error text is printed and the
    function returns without producing any other output.
    """
    try:
        fp = open(file)
    except IOError:
        exc = sys.exc_info()[1]
        print('%s : %s' % (exc.strerror, file))
        # Nothing to scan; carrying on would only fail on the missing file
        # object.
        return

    cre = re.compile('(.*?)[ \t]+([A-Z]{2})[ \t]+[A-Z]{3}[ \t]+[0-9]{3}')
    scanning = 0

    if normalize:
        print('country = {')

    try:
        for line in fp:
            if not scanning:
                # Still in the preamble: wait for the first dashed line.
                if line[:1] == '-':
                    scanning = 1
                continue
            mo = cre.match(line)
            if not mo:
                line = line.strip()
                if not line:
                    continue
                if line[0] == '-':
                    # Closing dashed line: end of the table.
                    break
                print('Could not parse line: %s' % line)
                continue
            country, code = mo.group(1, 2)
            if normalize:
                print('    "%s": "%s",'
                      % (code.lower(), _normalize_country(country)))
            else:
                print('%s %s' % (code, country))
    finally:
        fp.close()

    if normalize:
        print('    }')


def _still_unresolved(lookup, addrs):
    """Run lookup on every address and return the values it handed back."""
    leftover = []
    for addr in addrs:
        result = lookup(addr)
        if result:
            leftover.append(result)
    return leftover


def main():
    """Command line entry point.

    Options are read from sys.argv[1:]:

        -h, --help        print the help text and exit
        -d, --dump        print every known top-level domain, sorted by code
        -p, --parse FILE  parse an iso3166-countrycodes file
        -o, --outputdict  together with -p, emit a Python dictionary
        -r, --reverse     skip the direct lookup and go straight to the
                          regular expression search

    Without -d or -p, every remaining argument is passed to resolve()
    (unless -r was given) and whatever is left over to reverse(); arguments
    that neither step could place are reported with a `Where in the world
    is ...?' line.  An invalid command line prints the getopt error message
    followed by the help text and exits with status 1.
    """
    show_help = 0
    dump = 0
    parsefile = None
    normalize = 0
    forcerev = 0

    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'p:rohd',
            # The trailing `=' marks --parse as taking an argument, like the
            # `p:' in the short option string.
            ['parse=', 'reverse', 'outputdict', 'help', 'dump'])
    except getopt.error:
        print(sys.exc_info()[1])
        usage(1)

    for opt, val in opts:
        if opt in ('-h', '--help'):
            show_help = 1
        elif opt in ('-d', '--dump'):
            dump = 1
        elif opt in ('-p', '--parse'):
            parsefile = val
        elif opt in ('-o', '--outputdict'):
            # getopt always reports the full long option name, even when an
            # abbreviation such as --output was typed.
            normalize = 1
        elif opt in ('-r', '--reverse'):
            forcerev = 1

    if show_help:
        usage(0)

    if dump:
        print('Non-geographic domains:')
        codes = list(nameorgs.keys())
        codes.sort()
        for code in codes:
            print('    %4s: %s' % (code, nameorgs[code]))

        print('\nCountry coded domains:')
        codes = list(countries.keys())
        codes.sort()
        for code in codes:
            print('    %2s: %s' % (code, countries[code]))
    elif parsefile:
        parse(parsefile, normalize)
    else:
        # Each step returns the addresses it could not place, which are
        # then handed to the next step.
        if not forcerev:
            args = _still_unresolved(resolve, args)
        args = _still_unresolved(reverse, args)
        for arg in args:
            print('Where in the world is %s?' % arg)


# The mappings
# Non-geographic top-level domains, mapped to a short description.  resolve()
# consults this table before `countries' and prints a hit as
# "<addr> is from a/an <description> organization".
nameorgs = {
    "arpa": "Arpanet",
    "com": "commercial",
    "edu": "educational",
    "gov": "government",
    "mil": "military",
    "net": "networking",
    "org": "non-commercial",
    "int": "international",
    # this isn't in the same class as those above, but for some reason (of
    # which I am not aware), `uk' is the common practice country code for the
    # United Kingdom.  AFAICT, the official `gb' code is routinely ignored!
    # If you have a clue as to why this is, or if there is an official
    # document describing this, please let me know.  ISO3166 certainly makes
    # no mention of this.
    #
    # See <url:http://www.ripe.net/docs/ripe-159.html#222123>
    #
    # NOTE(review): because this entry lives in `nameorgs' rather than in
    # `countries', resolve() reports `uk' with the "is from a ...
    # organization" wording instead of "originated from ...".
    "uk": "United Kingdom (common practice)",
    }


# Two letter country codes (lower case), mapped to the country name.
# resolve() looks codes up here after `nameorgs', and the whole table is
# merged into `all' for the reverse (name -> code) search.
countries = {
    "af": "Afghanistan",
    "al": "Albania",
    "dz": "Algeria",
    "as": "American Samoa",
    "ad": "Andorra",
    "ao": "Angola",
    "ai": "Anguilla",
    "aq": "Antarctica",
    "ag": "Antigua and Barbuda",
    "ar": "Argentina",
    "am": "Armenia",
    "aw": "Aruba",
    "au": "Australia",
    "at": "Austria",
    "az": "Azerbaijan",
    "bs": "Bahamas",
    "bh": "Bahrain",
    "bd": "Bangladesh",
    "bb": "Barbados",
    "by": "Belarus",
    "be": "Belgium",
    "bz": "Belize",
    "bj": "Benin",
    "bm": "Bermuda",
    "bt": "Bhutan",
    "bo": "Bolivia",
    "ba": "Bosnia and Herzegowina",
    "bw": "Botswana",
    "bv": "Bouvet Island",
    "br": "Brazil",
    "io": "British Indian Ocean Territory",
    "bn": "Brunei Darussalam",
    "bg": "Bulgaria",
    "bf": "Burkina Faso",
    "bi": "Burundi",
    "kh": "Cambodia",
    "cm": "Cameroon",
    "ca": "Canada",
    "cv": "Cape Verde",
    "ky": "Cayman Islands",
    "cf": "Central African Republic",
    "td": "Chad",
    "cl": "Chile",
    "cn": "China",
    "cx": "Christmas Island",
    "cc": "Cocos (Keeling) Islands",
    "co": "Colombia",
    "km": "Comoros",
    "cg": "Congo",
    "cd": "Congo, The Democratic Republic of the",
    "ck": "Cook Islands",
    "cr": "Costa Rica",
    "ci": "Cote D'Ivoire",
    "hr": "Croatia (local name: Hrvatska)",
    "cu": "Cuba",
    "cy": "Cyprus",
    "cz": "Czech Republic",
    "dk": "Denmark",
    "dj": "Djibouti",
    "dm": "Dominica",
    "do": "Dominican Republic",
    "tp": "East Timor",
    "ec": "Ecuador",
    "eg": "Egypt",
    "sv": "El Salvador",
    "gq": "Equatorial Guinea",
    "er": "Eritrea",
    "ee": "Estonia",
    "et": "Ethiopia",
    "fk": "Falkland Islands (Malvinas)",
    "fo": "Faroe Islands",
    "fj": "Fiji",
    "fi": "Finland",
    "fr": "France",
    "fx": "France, metropolitan",
    "gf": "French Guiana",
    "pf": "French Polynesia",
    "tf": "French Southern Territories",
    "ga": "Gabon",
    "gm": "Gambia",
    "ge": "Georgia",
    "de": "Germany",
    "gh": "Ghana",
    "gi": "Gibraltar",
    "gr": "Greece",
    "gl": "Greenland",
    "gd": "Grenada",
    "gp": "Guadeloupe",
    "gu": "Guam",
    "gt": "Guatemala",
    "gn": "Guinea",
    "gw": "Guinea-Bissau",
    "gy": "Guyana",
    "ht": "Haiti",
    "hm": "Heard and Mc Donald Islands",
    "va": "Holy See (Vatican City State)",
    "hn": "Honduras",
    "hk": "Hong Kong",
    "hu": "Hungary",
    "is": "Iceland",
    "in": "India",
    "id": "Indonesia",
    "ir": "Iran (Islamic Republic of)",
    "iq": "Iraq",
    "ie": "Ireland",
    "il": "Israel",
    "it": "Italy",
    "jm": "Jamaica",
    "jp": "Japan",
    "jo": "Jordan",
    "kz": "Kazakhstan",
    "ke": "Kenya",
    "ki": "Kiribati",
    "kp": "Korea, Democratic People's Republic of",
    "kr": "Korea, Republic of",
    "kw": "Kuwait",
    "kg": "Kyrgyzstan",
    "la": "Lao People's Democratic Republic",
    "lv": "Latvia",
    "lb": "Lebanon",
    "ls": "Lesotho",
    "lr": "Liberia",
    "ly": "Libyan Arab Jamahiriya",
    "li": "Liechtenstein",
    "lt": "Lithuania",
    "lu": "Luxembourg",
    "mo": "Macau",
    "mk": "Macedonia, The Former Yugoslav Republic of",
    "mg": "Madagascar",
    "mw": "Malawi",
    "my": "Malaysia",
    "mv": "Maldives",
    "ml": "Mali",
    "mt": "Malta",
    "mh": "Marshall Islands",
    "mq": "Martinique",
    "mr": "Mauritania",
    "mu": "Mauritius",
    "yt": "Mayotte",
    "mx": "Mexico",
    "fm": "Micronesia, Federated States of",
    "md": "Moldova, Republic of",
    "mc": "Monaco",
    "mn": "Mongolia",
    "ms": "Montserrat",
    "ma": "Morocco",
    "mz": "Mozambique",
    "mm": "Myanmar",
    "na": "Namibia",
    "nr": "Nauru",
    "np": "Nepal",
    "nl": "Netherlands",
    "an": "Netherlands Antilles",
    "nc": "New Caledonia",
    "nz": "New Zealand",
    "ni": "Nicaragua",
    "ne": "Niger",
    "ng": "Nigeria",
    "nu": "Niue",
    "nf": "Norfolk Island",
    "mp": "Northern Mariana Islands",
    "no": "Norway",
    "om": "Oman",
    "pk": "Pakistan",
    "pw": "Palau",
    "pa": "Panama",
    "pg": "Papua New Guinea",
    "py": "Paraguay",
    "pe": "Peru",
    "ph": "Philippines",
    "pn": "Pitcairn",
    "pl": "Poland",
    "pt": "Portugal",
    "pr": "Puerto Rico",
    "qa": "Qatar",
    "re": "Reunion",
    "ro": "Romania",
    "ru": "Russian Federation",
    "rw": "Rwanda",
    "kn": "Saint Kitts and Nevis",
    "lc": "Saint Lucia",
    "vc": "Saint Vincent and the Grenadines",
    "ws": "Samoa",
    "sm": "San Marino",
    "st": "Sao Tome and Principe",
    "sa": "Saudi Arabia",
    "sn": "Senegal",
    "sc": "Seychelles",
    "sl": "Sierra Leone",
    "sg": "Singapore",
    "sk": "Slovakia (Slovak Republic)",
    "si": "Slovenia",
    "sb": "Solomon Islands",
    "so": "Somalia",
    "za": "South Africa",
    "gs": "South Georgia and the South Sandwich Islands",
    "es": "Spain",
    "lk": "Sri Lanka",
    "sh": "St. Helena",
    "pm": "St. Pierre and Miquelon",
    "sd": "Sudan",
    "sr": "Suriname",
    "sj": "Svalbard and Jan Mayen Islands",
    "sz": "Swaziland",
    "se": "Sweden",
    "ch": "Switzerland",
    "sy": "Syrian Arab Republic",
    "tw": "Taiwan, Province of China",
    "tj": "Tajikistan",
    "tz": "Tanzania, United Republic of",
    "th": "Thailand",
    "tg": "Togo",
    "tk": "Tokelau",
    "to": "Tonga",
    "tt": "Trinidad and Tobago",
    "tn": "Tunisia",
    "tr": "Turkey",
    "tm": "Turkmenistan",
    "tc": "Turks and Caicos Islands",
    "tv": "Tuvalu",
    "ug": "Uganda",
    "ua": "Ukraine",
    "ae": "United Arab Emirates",
    "gb": "United Kingdom",
    "us": "United States",
    "um": "United States Minor Outlying Islands",
    "uy": "Uruguay",
    "uz": "Uzbekistan",
    "vu": "Vanuatu",
    "ve": "Venezuela",
    "vn": "Viet Nam",
    "vg": "Virgin Islands (British)",
    "vi": "Virgin Islands (U.S.)",
    "wf": "Wallis and Futuna Islands",
    "eh": "Western Sahara",
    "ye": "Yemen",
    "yu": "Yugoslavia",
    "zm": "Zambia",
    "zw": "Zimbabwe",
    }

# Combined table searched by reverse(): a copy of the non-geographic domains
# with every country code merged in.  NOTE: on Python versions that provide a
# built-in all(), this module-level name shadows it for the rest of the file.
all = nameorgs.copy()
all.update(countries)


# Run the command line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()