From 2548c730c17d766ca04b2bf633552655f7f96cdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20v=2E=20L=C3=B6wis?= Date: Fri, 18 Apr 2003 10:39:54 +0000 Subject: [PATCH] Implement IDNA (Internationalized Domain Names in Applications). --- Doc/lib/lib.tex | 1 + Doc/lib/libcodecs.tex | 72 +++++- Doc/lib/libstringprep.tex | 134 +++++++++++ Doc/whatsnew/whatsnew23.tex | 21 ++ Lib/encodings/idna.py | 187 +++++++++++++++ Lib/encodings/punycode.py | 222 +++++++++++++++++ Lib/httplib.py | 6 +- Lib/stringprep.py | 273 +++++++++++++++++++++ Lib/test/test_codecs.py | 296 +++++++++++++++++++++++ Misc/NEWS | 5 + Modules/socketmodule.c | 30 ++- Tools/unicode/mkstringprep.py | 433 ++++++++++++++++++++++++++++++++++ 12 files changed, 1671 insertions(+), 9 deletions(-) create mode 100644 Doc/lib/libstringprep.tex create mode 100644 Lib/encodings/idna.py create mode 100644 Lib/encodings/punycode.py create mode 100644 Lib/stringprep.py create mode 100644 Tools/unicode/mkstringprep.py diff --git a/Doc/lib/lib.tex b/Doc/lib/lib.tex index 755023d0d03..5faeedcd752 100644 --- a/Doc/lib/lib.tex +++ b/Doc/lib/lib.tex @@ -112,6 +112,7 @@ and how to embed it in other applications. \input{libtextwrap} \input{libcodecs} \input{libunicodedata} +\input{libstringprep} \input{libmisc} % Miscellaneous Services \input{libpydoc} diff --git a/Doc/lib/libcodecs.tex b/Doc/lib/libcodecs.tex index caaaaf49a18..38586efcac6 100644 --- a/Doc/lib/libcodecs.tex +++ b/Doc/lib/libcodecs.tex @@ -5,7 +5,7 @@ \modulesynopsis{Encode and decode data and streams.} \moduleauthor{Marc-Andre Lemburg}{mal@lemburg.com} \sectionauthor{Marc-Andre Lemburg}{mal@lemburg.com} - +\sectionauthor{Martin v. L\"owis}{martin@v.loewis.de} \index{Unicode} \index{Codecs} @@ -809,6 +809,11 @@ listed as operand type in the table. {byte string} {Convert operand to hexadecimal representation, with two digits per byte} +\lineiv{idna} + {} + {Unicode string} + {Implements \rfc{3490}. \versionadded{2.3}. 
See also \module{encodings.idna}} + \lineiv{mbcs} {dbcs} {Unicode string} @@ -819,6 +824,11 @@ listed as operand type in the table. {Unicode string} {Encoding of PalmOS 3.5} +\lineiv{punycode} + {} + {Unicode string} + {Implements \rfc{3492}. \versionadded{2.3}} + \lineiv{quopri_codec} {quopri, quoted-printable, quotedprintable} {byte string} @@ -865,3 +875,63 @@ listed as operand type in the table. {Compress the operand using gzip} \end{tableiv} + +\subsection{\module{encodings.idna} --- + Internationalized Domain Names in Applications} + +\declaremodule{standard}{encodings.idna} +\modulesynopsis{Internationalized Domain Names implementation} +\moduleauthor{Martin v. L\"owis}{martin@v.loewis.de} + +This module implements \rfc{3490} (Internationalized Domain Names in +Applications) and \rfc{3491} (Nameprep: A Stringprep Profile for +Internationalized Domain Names (IDN)). It builds upon the +\code{punycode} encoding and \module{stringprep}. \versionadded{2.3} + +These RFCs together define a protocol to support non-ASCII characters +in domain names. A domain name containing non-ASCII characters (such +as ``www.Alliancefran\c{c}aise.nu'') is converted into an +ASCII-compatible encoding (ACE, such as +``www.xn--alliancefranaise-npb.nu''). The ACE form of the domain name +is then used in all places where arbitrary characters are not allowed +by the protocol, such as DNS queries, HTTP \code{Host:} fields, and so +on. This conversion is carried out in the application; if possible +invisible to the user: The application should transparently convert +Unicode domain labels to IDNA on the wire, and convert back ACE labels +to Unicode before presenting them to the user. + +Python supports this conversion in several ways: The \code{idna} codec +allows conversion between Unicode and the ACE. 
Furthermore, the +\module{socket} module transparently converts Unicode host names to +ACE, so that applications need not be concerned about converting host +names themselves when they pass them to the socket module. On top of +that, modules that have host names as function parameters, such as +\module{httplib} and \module{ftplib}, accept Unicode host names +(\module{httplib} then also transparently sends an IDNA hostname in +the \code{Host:} field if it sends that field at all). + +When receiving host names from the wire (such as in reverse name +lookup), no automatic conversion to Unicode is performed: Applications +wishing to present such host names to the user should decode them to +Unicode. + +The module \module{encodings.idna} also implements the nameprep +procedure, which performs certain normalizations on host names, to +achieve case-insensitivity of international domain names, and to unify +similar characters. The nameprep functions can be used directly if +desired. + +\begin{funcdesc}{nameprep}{label} +Return the nameprepped version of \var{label}. The implementation +currently assumes query strings, so \code{AllowUnassigned} is +true. +\end{funcdesc} + +\begin{funcdesc}{ToASCII}{label} +Convert a label to ASCII, as specified in \rfc{3490}. +\code{UseSTD3ASCIIRules} is assumed to be false. +\end{funcdesc} + +\begin{funcdesc}{ToUnicode}{label} +Convert a label to Unicode, as specified in \rfc{3490}. +\end{funcdesc} diff --git a/Doc/lib/libstringprep.tex b/Doc/lib/libstringprep.tex new file mode 100644 index 00000000000..3492d021fe1 --- /dev/null +++ b/Doc/lib/libstringprep.tex @@ -0,0 +1,134 @@ +\section{\module{stringprep} --- + Internet String Preparation} + +\declaremodule{standard}{stringprep} +\modulesynopsis{String preparation, as per RFC 3454} +\moduleauthor{Martin v. L\"owis}{martin@v.loewis.de} +\sectionauthor{Martin v. 
L\"owis}{martin@v.loewis.de} + +When identifying things (such as host names) on the internet, it is +often necessary to compare such identifications for +``equality''. Exactly how this comparison is executed may depend on +the application domain, e.g. whether it should be case-insensitive or +not. It may also be necessary to restrict the possible +identifications, to allow only identifications consisting of +``printable'' characters. + +\rfc{3454} defines a procedure for ``preparing'' Unicode strings in +internet protocols. Before passing strings onto the wire, they are +processed with the preparation procedure, after which they have a +certain normalized form. The RFC defines a set of tables, which can be +combined into profiles. Each profile must define which tables it uses, +and what other optional parts of the \code{stringprep} procedure are +part of the profile. One example of a \code{stringprep} profile is +\code{nameprep}, which is used for internationalized domain names. + +The module \module{stringprep} only exposes the tables from RFC +3454. As these tables would be very large if represented as +dictionaries or lists, the module uses the Unicode character database +internally. The module source code itself was generated using the +\code{mkstringprep.py} utility. + +As a result, these tables are exposed as functions, not as data +structures. There are two kinds of tables in the RFC: sets and +mappings. For a set, \module{stringprep} provides the ``characteristic +function'', i.e. a function that returns true if the parameter is part +of the set. For mappings, it provides the mapping function: given the +key, it returns the associated value. Below is a list of all functions +available in the module. + +\begin{funcdesc}{in_table_a1}{code} +Determine whether \var{code} is in table~A.1 (Unassigned code points +in Unicode 3.2). 
+\end{funcdesc} + +\begin{funcdesc}{in_table_b1}{code} +Determine whether \var{code} is in table~B.1 (Commonly mapped to +nothing). +\end{funcdesc} + +\begin{funcdesc}{map_table_b2}{code} +Return the mapped value for \var{code} according to table~B.2 +(Mapping for case-folding used with NFKC). +\end{funcdesc} + +\begin{funcdesc}{map_table_b3}{code} +Return the mapped value for \var{code} according to table~B.3 +(Mapping for case-folding used with no normalization). +\end{funcdesc} + +\begin{funcdesc}{in_table_c11}{code} +Determine whether \var{code} is in table~C.1.1 +(ASCII space characters). +\end{funcdesc} + +\begin{funcdesc}{in_table_c12}{code} +Determine whether \var{code} is in table~C.1.2 +(Non-ASCII space characters). +\end{funcdesc} + +\begin{funcdesc}{in_table_c11_c12}{code} +Determine whether \var{code} is in table~C.1 +(Space characters, union of C.1.1 and C.1.2). +\end{funcdesc} + +\begin{funcdesc}{in_table_c21}{code} +Determine whether \var{code} is in table~C.2.1 +(ASCII control characters). +\end{funcdesc} + +\begin{funcdesc}{in_table_c22}{code} +Determine whether \var{code} is in table~C.2.2 +(Non-ASCII control characters). +\end{funcdesc} + +\begin{funcdesc}{in_table_c21_c22}{code} +Determine whether \var{code} is in table~C.2 +(Control characters, union of C.2.1 and C.2.2). +\end{funcdesc} + +\begin{funcdesc}{in_table_c3}{code} +Determine whether \var{code} is in table~C.3 +(Private use). +\end{funcdesc} + +\begin{funcdesc}{in_table_c4}{code} +Determine whether \var{code} is in table~C.4 +(Non-character code points). +\end{funcdesc} + +\begin{funcdesc}{in_table_c5}{code} +Determine whether \var{code} is in table~C.5 +(Surrogate codes). +\end{funcdesc} + +\begin{funcdesc}{in_table_c6}{code} +Determine whether \var{code} is in table~C.6 +(Inappropriate for plain text). +\end{funcdesc} + +\begin{funcdesc}{in_table_c7}{code} +Determine whether \var{code} is in table~C.7 +(Inappropriate for canonical representation). 
+\end{funcdesc} + +\begin{funcdesc}{in_table_c8}{code} +Determine whether \var{code} is in table~C.8 +(Change display properties or are deprecated). +\end{funcdesc} + +\begin{funcdesc}{in_table_c9}{code} +Determine whether \var{code} is in table~C.9 +(Tagging characters). +\end{funcdesc} + +\begin{funcdesc}{in_table_d1}{code} +Determine whether \var{code} is in table~D.1 +(Characters with bidirectional property ``R'' or ``AL''). +\end{funcdesc} + +\begin{funcdesc}{in_table_d2}{code} +Determine whether \var{code} is in table~D.2 +(Characters with bidirectional property ``L''). +\end{funcdesc} + diff --git a/Doc/whatsnew/whatsnew23.tex b/Doc/whatsnew/whatsnew23.tex index d70a84951f0..38669b2bd5a 100644 --- a/Doc/whatsnew/whatsnew23.tex +++ b/Doc/whatsnew/whatsnew23.tex @@ -1791,6 +1791,27 @@ Tkinter.wantobjects = 0 Any breakage caused by this change should be reported as a bug. +\item Support for internationalized domain names (RFCs 3454, 3490, +3491, and 3492) has been added. The ``idna'' encoding can be used +to convert between a Unicode domain name and the ASCII-compatible +encoding (ACE). + +\begin{verbatim} +>>> u"www.Alliancefran\xe7aise.nu".encode("idna") +'www.xn--alliancefranaise-npb.nu' +\end{verbatim} + +In addition, the \module{socket} module has been extended to transparently +convert Unicode hostnames to the ACE before passing them to the C +library. In turn, modules that pass hostnames ``through'' (such as +\module{httplib}, \module{ftplib}) also support Unicode host names +(httplib also sends ACE Host: headers). \module{urllib} supports +Unicode URLs with non-ASCII host names as long as the \code{path} part +of the URL is ASCII only. + +To implement this change, the module \module{stringprep}, the tool +\code{mkstringprep} and the \code{punycode} encoding have been added. 
+ \end{itemize} diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py new file mode 100644 index 00000000000..7e4d04e2dde --- /dev/null +++ b/Lib/encodings/idna.py @@ -0,0 +1,187 @@ +# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep) + +import stringprep, unicodedata, re, codecs + +# IDNA section 3.1 +dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]") + +# IDNA section 5 +ace_prefix = "xn--" +uace_prefix = unicode(ace_prefix, "ascii") + +# This assumes query strings, so AllowUnassigned is true +def nameprep(label): + # Map + newlabel = [] + for c in label: + if stringprep.in_table_b1(c): + # Map to nothing + continue + newlabel.append(stringprep.map_table_b2(c)) + label = u"".join(newlabel) + + # Normalize + label = unicodedata.normalize("NFKC", label) + + # Prohibit + for c in label: + if stringprep.in_table_c12(c) or \ + stringprep.in_table_c22(c) or \ + stringprep.in_table_c3(c) or \ + stringprep.in_table_c4(c) or \ + stringprep.in_table_c5(c) or \ + stringprep.in_table_c6(c) or \ + stringprep.in_table_c7(c) or \ + stringprep.in_table_c8(c) or \ + stringprep.in_table_c9(c): + raise UnicodeError, "Invalid character %s" % repr(c) + + # Check bidi + RandAL = map(stringprep.in_table_d1, label) + for c in RandAL: + if c: + # There is a RandAL char in the string. Must perform further + # tests: + # 1) The characters in section 5.8 MUST be prohibited. + # This is table C.8, which was already checked + # 2) If a string contains any RandALCat character, the string + # MUST NOT contain any LCat character. + if filter(stringprep.in_table_d2, label): + raise UnicodeError, "Violation of BIDI requirement 2" + + # 3) If a string contains any RandALCat character, a + # RandALCat character MUST be the first character of the + # string, and a RandALCat character MUST be the last + # character of the string. 
+ if not RandAL[0] or not RandAL[-1]: + raise UnicodeError, "Violation of BIDI requirement 3" + + return label + +def ToASCII(label): + try: + # Step 1: try ASCII + label = label.encode("ascii") + except UnicodeError: + pass + else: + # Skip to step 3: UseSTD3ASCIIRules is false, so + # Skip to step 8. + if 0 < len(label) < 64: + return label + raise UnicodeError, "label too long" + + # Step 2: nameprep + label = nameprep(label) + + # Step 3: UseSTD3ASCIIRules is false + # Step 4: try ASCII + try: + label = label.encode("ascii") + except UnicodeError: + pass + else: + # Skip to step 8. + if 0 < len(label) < 64: + return label + raise UnicodeError, "label too long" + + # Step 5: Check ACE prefix + if label.startswith(uace_prefix): + raise UnicodeError, "Label starts with ACE prefix" + + # Step 6: Encode with PUNYCODE + label = label.encode("punycode") + + # Step 7: Prepend ACE prefix + label = ace_prefix + label + + # Step 8: Check size + if 0 < len(label) < 64: + return label + raise UnicodeError, "label too long" + +def ToUnicode(label): + # Step 1: Check for ASCII + if isinstance(label, str): + pure_ascii = True + else: + try: + label = label.encode("ascii") + pure_ascii = True + except UnicodeError: + pure_ascii = False + if not pure_ascii: + # Step 2: Perform nameprep + label = nameprep(label) + # It doesn't say this, but apparently, it should be ASCII now + try: + label = label.encode("ascii") + except UnicodeError: + raise UnicodeError, "Invalid character in IDN label" + # Step 3: Check for ACE prefix + if not label.startswith(ace_prefix): + return unicode(label, "ascii") + + # Step 4: Remove ACE prefix + label1 = label[len(ace_prefix):] + + # Step 5: Decode using PUNYCODE + result = label1.decode("punycode") + + # Step 6: Apply ToASCII + label2 = ToASCII(result) + + # Step 7: Compare the result of step 6 with the one of step 3 + # label2 will already be in lower case. 
+ if label.lower() != label2: + raise UnicodeError, ("IDNA does not round-trip", label, label2) + + # Step 8: return the result of step 5 + return result + +### Codec APIs + +class Codec(codecs.Codec): + def encode(self,input,errors='strict'): + + if errors != 'strict': + # IDNA is quite clear that implementations must be strict + raise UnicodeError, "unsupported error handling "+errors + + result = [] + for label in dots.split(input): + result.append(ToASCII(label)) + # Join with U+002E + return ".".join(result), len(input) + + def decode(self,input,errors='strict'): + + if errors != 'strict': + raise UnicodeError, "Unsupported error handling "+errors + + # IDNA allows decoding to operate on Unicode strings, too. + if isinstance(input, unicode): + labels = dots.split(input) + else: + # Must be ASCII string + unicode(input, "ascii") + labels = input.split(".") + + result = [] + for label in labels: + result.append(ToUnicode(label)) + + return u".".join(result), len(input) + +class StreamWriter(Codec,codecs.StreamWriter): + pass + +class StreamReader(Codec,codecs.StreamReader): + pass + +### encodings module API + +def getregentry(): + + return (Codec().encode,Codec().decode,StreamReader,StreamWriter) diff --git a/Lib/encodings/punycode.py b/Lib/encodings/punycode.py new file mode 100644 index 00000000000..e7f2d453957 --- /dev/null +++ b/Lib/encodings/punycode.py @@ -0,0 +1,222 @@ +# -*- coding: iso-8859-1 -*- +""" Codec for the Punicode encoding, as specified in RFC 3492 + +Written by Martin v. Löwis. 
+""" + +import codecs + +##################### Encoding ##################################### + +def segregate(str): + """3.1 Basic code point segregation""" + base = [] + extended = {} + for c in str: + if ord(c) < 128: + base.append(c) + else: + extended[c] = 1 + extended = extended.keys() + extended.sort() + return "".join(base).encode("ascii"),extended + +def selective_len(str, max): + """Return the length of str, considering only characters below max.""" + res = 0 + for c in str: + if ord(c) < max: + res += 1 + return res + +def selective_find(str, char, index, pos): + """Return a pair (index, pos), indicating the next occurrence of + char in str. index is the position of the character considering + only ordinals up to and including char, and pos is the position in + the full string. index/pos is the starting position in the full + string.""" + + l = len(str) + while 1: + pos += 1 + if pos == l: + return (-1, -1) + c = str[pos] + if c == char: + return index+1, pos + elif c < char: + index += 1 + +def insertion_unsort(str, extended): + """3.2 Insertion unsort coding""" + oldchar = 0x80 + result = [] + oldindex = -1 + for c in extended: + index = pos = -1 + char = ord(c) + curlen = selective_len(str, char) + delta = (curlen+1) * (char - oldchar) + while 1: + index,pos = selective_find(str,c,index,pos) + if index == -1: + break + delta += index - oldindex + result.append(delta-1) + oldindex = index + delta = 0 + oldchar = char + + return result + +def T(j, bias): + # Punycode parameters: tmin = 1, tmax = 26, base = 36 + res = 36 * (j + 1) - bias + if res < 1: return 1 + if res > 26: return 26 + return res + +digits = "abcdefghijklmnopqrstuvwxyz0123456789" +def generate_generalized_integer(N, bias): + """3.3 Generalized variable-length integers""" + result = [] + j = 0 + while 1: + t = T(j, bias) + if N < t: + result.append(digits[N]) + return result + result.append(digits[t + ((N - t) % (36 - t))]) + N = (N - t) // (36 - t) + j += 1 + +def adapt(delta, first, 
numchars): + if first: + delta //= 700 + else: + delta //= 2 + delta += delta // numchars + # ((base - tmin) * tmax) // 2 == 455 + divisions = 0 + while delta > 455: + delta = delta // 35 # base - tmin + divisions += 36 + bias = divisions + (36 * delta // (delta + 38)) + return bias + + +def generate_integers(baselen, deltas): + """3.4 Bias adaptation""" + # Punycode parameters: initial bias = 72, damp = 700, skew = 38 + result = [] + bias = 72 + for points, delta in enumerate(deltas): + s = generate_generalized_integer(delta, bias) + result.extend(s) + bias = adapt(delta, points==0, baselen+points+1) + return "".join(result) + +def punycode_encode(text): + base, extended = segregate(text) + base = base.encode("ascii") + deltas = insertion_unsort(text, extended) + extended = generate_integers(len(base), deltas) + if base: + return base + "-" + extended + return extended + +##################### Decoding ##################################### + +def decode_generalized_number(extended, extpos, bias, errors): + """3.3 Generalized variable-length integers""" + result = 0 + w = 1 + j = 0 + while 1: + try: + char = ord(extended[extpos]) + except IndexError: + if errors == "strict": + raise UnicodeError, "incomplete punicode string" + return extpos + 1, None + extpos += 1 + if 0x41 <= char <= 0x5A: # A-Z + digit = char - 0x41 + elif 0x30 <= char <= 0x39: + digit = char - 22 # 0x30-26 + elif errors == "strict": + raise UnicodeError("Invalid extended code point '%s'" + % extended[extpos]) + else: + return extpos, None + t = T(j, bias) + result += digit * w + if digit < t: + return extpos, result + w = w * (36 - t) + j += 1 + + +def insertion_sort(base, extended, errors): + """3.2 Insertion unsort coding""" + char = 0x80 + pos = -1 + bias = 72 + extpos = 0 + while extpos < len(extended): + newpos, delta = decode_generalized_number(extended, extpos, + bias, errors) + if delta is None: + # There was an error in decoding. We can't continue because + # synchronization is lost. 
+ return base + pos += delta+1 + char += pos // (len(base) + 1) + if char > 0x10FFFF: + if errors == "strict": + raise UnicodeError, ("Invalid character U+%x" % char) + char = ord('?') + pos = pos % (len(base) + 1) + base = base[:pos] + unichr(char) + base[pos:] + bias = adapt(delta, (extpos == 0), len(base)) + extpos = newpos + return base + +def punycode_decode(text, errors): + pos = text.rfind("-") + if pos == -1: + base = "" + extended = text + else: + base = text[:pos] + extended = text[pos+1:] + base = unicode(base, "ascii", errors) + extended = extended.upper() + return insertion_sort(base, extended, errors) + +### Codec APIs + +class Codec(codecs.Codec): + def encode(self,input,errors='strict'): + + res = punycode_encode(input) + return res, len(input) + + def decode(self,input,errors='strict'): + + if errors not in ('strict', 'replace', 'ignore'): + raise UnicodeError, "Unsupported error handling "+errors + res = punycode_decode(input, errors) + return res, len(input) + +class StreamWriter(Codec,codecs.StreamWriter): + pass + +class StreamReader(Codec,codecs.StreamReader): + pass + +### encodings module API + +def getregentry(): + + return (Codec().encode,Codec().decode,StreamReader,StreamWriter) diff --git a/Lib/httplib.py b/Lib/httplib.py index ca215a4f7ac..caf6ccd3e77 100644 --- a/Lib/httplib.py +++ b/Lib/httplib.py @@ -655,11 +655,11 @@ class HTTPConnection: nil, netloc, nil, nil, nil = urlsplit(url) if netloc: - self.putheader('Host', netloc) + self.putheader('Host', netloc.encode("idna")) elif self.port == HTTP_PORT: - self.putheader('Host', self.host) + self.putheader('Host', self.host.encode("idna")) else: - self.putheader('Host', "%s:%s" % (self.host, self.port)) + self.putheader('Host', "%s:%s" % (self.host.encode("idna"), self.port)) # note: we are assuming that clients will not attempt to set these # headers since *this* library must deal with the diff --git a/Lib/stringprep.py b/Lib/stringprep.py new file mode 100644 index 
00000000000..ec5b098cbdc --- /dev/null +++ b/Lib/stringprep.py @@ -0,0 +1,273 @@ +# This file is generated by mkstringprep.py. DO NOT EDIT. +"""Library that exposes various tables found in the StringPrep RFC 3454. + +There are two kinds of tables: sets, for which a member test is provided, +and mappings, for which a mapping function is provided. +""" + +import unicodedata, sets + +assert unicodedata.unidata_version == '3.2.0' + +def in_table_a1(code): + if unicodedata.category(code) != 'Cn': return False + c = ord(code) + if 0xFDD0 <= c < 0xFDF0: return False + return (c & 0xFFFF) not in (0xFFFE, 0xFFFF) + + +b1_set = sets.Set([173, 847, 6150, 6155, 6156, 6157, 8203, 8204, 8205, 8288, 65279] + range(65024,65040)) +def in_table_b1(code): + return ord(code) in b1_set + + +b3_exceptions = { +0xb5:u'\u03bc', 0xdf:u'ss', 0x130:u'i\u0307', 0x149:u'\u02bcn', +0x17f:u's', 0x1f0:u'j\u030c', 0x345:u'\u03b9', 0x37a:u' \u03b9', +0x390:u'\u03b9\u0308\u0301', 0x3b0:u'\u03c5\u0308\u0301', 0x3c2:u'\u03c3', 0x3d0:u'\u03b2', +0x3d1:u'\u03b8', 0x3d2:u'\u03c5', 0x3d3:u'\u03cd', 0x3d4:u'\u03cb', +0x3d5:u'\u03c6', 0x3d6:u'\u03c0', 0x3f0:u'\u03ba', 0x3f1:u'\u03c1', +0x3f2:u'\u03c3', 0x3f5:u'\u03b5', 0x587:u'\u0565\u0582', 0x1e96:u'h\u0331', +0x1e97:u't\u0308', 0x1e98:u'w\u030a', 0x1e99:u'y\u030a', 0x1e9a:u'a\u02be', +0x1e9b:u'\u1e61', 0x1f50:u'\u03c5\u0313', 0x1f52:u'\u03c5\u0313\u0300', 0x1f54:u'\u03c5\u0313\u0301', +0x1f56:u'\u03c5\u0313\u0342', 0x1f80:u'\u1f00\u03b9', 0x1f81:u'\u1f01\u03b9', 0x1f82:u'\u1f02\u03b9', +0x1f83:u'\u1f03\u03b9', 0x1f84:u'\u1f04\u03b9', 0x1f85:u'\u1f05\u03b9', 0x1f86:u'\u1f06\u03b9', +0x1f87:u'\u1f07\u03b9', 0x1f88:u'\u1f00\u03b9', 0x1f89:u'\u1f01\u03b9', 0x1f8a:u'\u1f02\u03b9', +0x1f8b:u'\u1f03\u03b9', 0x1f8c:u'\u1f04\u03b9', 0x1f8d:u'\u1f05\u03b9', 0x1f8e:u'\u1f06\u03b9', +0x1f8f:u'\u1f07\u03b9', 0x1f90:u'\u1f20\u03b9', 0x1f91:u'\u1f21\u03b9', 0x1f92:u'\u1f22\u03b9', +0x1f93:u'\u1f23\u03b9', 0x1f94:u'\u1f24\u03b9', 0x1f95:u'\u1f25\u03b9', 
0x1f96:u'\u1f26\u03b9', +0x1f97:u'\u1f27\u03b9', 0x1f98:u'\u1f20\u03b9', 0x1f99:u'\u1f21\u03b9', 0x1f9a:u'\u1f22\u03b9', +0x1f9b:u'\u1f23\u03b9', 0x1f9c:u'\u1f24\u03b9', 0x1f9d:u'\u1f25\u03b9', 0x1f9e:u'\u1f26\u03b9', +0x1f9f:u'\u1f27\u03b9', 0x1fa0:u'\u1f60\u03b9', 0x1fa1:u'\u1f61\u03b9', 0x1fa2:u'\u1f62\u03b9', +0x1fa3:u'\u1f63\u03b9', 0x1fa4:u'\u1f64\u03b9', 0x1fa5:u'\u1f65\u03b9', 0x1fa6:u'\u1f66\u03b9', +0x1fa7:u'\u1f67\u03b9', 0x1fa8:u'\u1f60\u03b9', 0x1fa9:u'\u1f61\u03b9', 0x1faa:u'\u1f62\u03b9', +0x1fab:u'\u1f63\u03b9', 0x1fac:u'\u1f64\u03b9', 0x1fad:u'\u1f65\u03b9', 0x1fae:u'\u1f66\u03b9', +0x1faf:u'\u1f67\u03b9', 0x1fb2:u'\u1f70\u03b9', 0x1fb3:u'\u03b1\u03b9', 0x1fb4:u'\u03ac\u03b9', +0x1fb6:u'\u03b1\u0342', 0x1fb7:u'\u03b1\u0342\u03b9', 0x1fbc:u'\u03b1\u03b9', 0x1fbe:u'\u03b9', +0x1fc2:u'\u1f74\u03b9', 0x1fc3:u'\u03b7\u03b9', 0x1fc4:u'\u03ae\u03b9', 0x1fc6:u'\u03b7\u0342', +0x1fc7:u'\u03b7\u0342\u03b9', 0x1fcc:u'\u03b7\u03b9', 0x1fd2:u'\u03b9\u0308\u0300', 0x1fd3:u'\u03b9\u0308\u0301', +0x1fd6:u'\u03b9\u0342', 0x1fd7:u'\u03b9\u0308\u0342', 0x1fe2:u'\u03c5\u0308\u0300', 0x1fe3:u'\u03c5\u0308\u0301', +0x1fe4:u'\u03c1\u0313', 0x1fe6:u'\u03c5\u0342', 0x1fe7:u'\u03c5\u0308\u0342', 0x1ff2:u'\u1f7c\u03b9', +0x1ff3:u'\u03c9\u03b9', 0x1ff4:u'\u03ce\u03b9', 0x1ff6:u'\u03c9\u0342', 0x1ff7:u'\u03c9\u0342\u03b9', +0x1ffc:u'\u03c9\u03b9', 0x20a8:u'rs', 0x2102:u'c', 0x2103:u'\xb0c', +0x2107:u'\u025b', 0x2109:u'\xb0f', 0x210b:u'h', 0x210c:u'h', +0x210d:u'h', 0x2110:u'i', 0x2111:u'i', 0x2112:u'l', +0x2115:u'n', 0x2116:u'no', 0x2119:u'p', 0x211a:u'q', +0x211b:u'r', 0x211c:u'r', 0x211d:u'r', 0x2120:u'sm', +0x2121:u'tel', 0x2122:u'tm', 0x2124:u'z', 0x2128:u'z', +0x212c:u'b', 0x212d:u'c', 0x2130:u'e', 0x2131:u'f', +0x2133:u'm', 0x213e:u'\u03b3', 0x213f:u'\u03c0', 0x2145:u'd', +0x3371:u'hpa', 0x3373:u'au', 0x3375:u'ov', 0x3380:u'pa', +0x3381:u'na', 0x3382:u'\u03bca', 0x3383:u'ma', 0x3384:u'ka', +0x3385:u'kb', 0x3386:u'mb', 0x3387:u'gb', 0x338a:u'pf', +0x338b:u'nf', 
0x338c:u'\u03bcf', 0x3390:u'hz', 0x3391:u'khz', +0x3392:u'mhz', 0x3393:u'ghz', 0x3394:u'thz', 0x33a9:u'pa', +0x33aa:u'kpa', 0x33ab:u'mpa', 0x33ac:u'gpa', 0x33b4:u'pv', +0x33b5:u'nv', 0x33b6:u'\u03bcv', 0x33b7:u'mv', 0x33b8:u'kv', +0x33b9:u'mv', 0x33ba:u'pw', 0x33bb:u'nw', 0x33bc:u'\u03bcw', +0x33bd:u'mw', 0x33be:u'kw', 0x33bf:u'mw', 0x33c0:u'k\u03c9', +0x33c1:u'm\u03c9', 0x33c3:u'bq', 0x33c6:u'c\u2215kg', 0x33c7:u'co.', +0x33c8:u'db', 0x33c9:u'gy', 0x33cb:u'hp', 0x33cd:u'kk', +0x33ce:u'km', 0x33d7:u'ph', 0x33d9:u'ppm', 0x33da:u'pr', +0x33dc:u'sv', 0x33dd:u'wb', 0xfb00:u'ff', 0xfb01:u'fi', +0xfb02:u'fl', 0xfb03:u'ffi', 0xfb04:u'ffl', 0xfb05:u'st', +0xfb06:u'st', 0xfb13:u'\u0574\u0576', 0xfb14:u'\u0574\u0565', 0xfb15:u'\u0574\u056b', +0xfb16:u'\u057e\u0576', 0xfb17:u'\u0574\u056d', 0x1d400:u'a', 0x1d401:u'b', +0x1d402:u'c', 0x1d403:u'd', 0x1d404:u'e', 0x1d405:u'f', +0x1d406:u'g', 0x1d407:u'h', 0x1d408:u'i', 0x1d409:u'j', +0x1d40a:u'k', 0x1d40b:u'l', 0x1d40c:u'm', 0x1d40d:u'n', +0x1d40e:u'o', 0x1d40f:u'p', 0x1d410:u'q', 0x1d411:u'r', +0x1d412:u's', 0x1d413:u't', 0x1d414:u'u', 0x1d415:u'v', +0x1d416:u'w', 0x1d417:u'x', 0x1d418:u'y', 0x1d419:u'z', +0x1d434:u'a', 0x1d435:u'b', 0x1d436:u'c', 0x1d437:u'd', +0x1d438:u'e', 0x1d439:u'f', 0x1d43a:u'g', 0x1d43b:u'h', +0x1d43c:u'i', 0x1d43d:u'j', 0x1d43e:u'k', 0x1d43f:u'l', +0x1d440:u'm', 0x1d441:u'n', 0x1d442:u'o', 0x1d443:u'p', +0x1d444:u'q', 0x1d445:u'r', 0x1d446:u's', 0x1d447:u't', +0x1d448:u'u', 0x1d449:u'v', 0x1d44a:u'w', 0x1d44b:u'x', +0x1d44c:u'y', 0x1d44d:u'z', 0x1d468:u'a', 0x1d469:u'b', +0x1d46a:u'c', 0x1d46b:u'd', 0x1d46c:u'e', 0x1d46d:u'f', +0x1d46e:u'g', 0x1d46f:u'h', 0x1d470:u'i', 0x1d471:u'j', +0x1d472:u'k', 0x1d473:u'l', 0x1d474:u'm', 0x1d475:u'n', +0x1d476:u'o', 0x1d477:u'p', 0x1d478:u'q', 0x1d479:u'r', +0x1d47a:u's', 0x1d47b:u't', 0x1d47c:u'u', 0x1d47d:u'v', +0x1d47e:u'w', 0x1d47f:u'x', 0x1d480:u'y', 0x1d481:u'z', +0x1d49c:u'a', 0x1d49e:u'c', 0x1d49f:u'd', 0x1d4a2:u'g', +0x1d4a5:u'j', 0x1d4a6:u'k', 
0x1d4a9:u'n', 0x1d4aa:u'o', +0x1d4ab:u'p', 0x1d4ac:u'q', 0x1d4ae:u's', 0x1d4af:u't', +0x1d4b0:u'u', 0x1d4b1:u'v', 0x1d4b2:u'w', 0x1d4b3:u'x', +0x1d4b4:u'y', 0x1d4b5:u'z', 0x1d4d0:u'a', 0x1d4d1:u'b', +0x1d4d2:u'c', 0x1d4d3:u'd', 0x1d4d4:u'e', 0x1d4d5:u'f', +0x1d4d6:u'g', 0x1d4d7:u'h', 0x1d4d8:u'i', 0x1d4d9:u'j', +0x1d4da:u'k', 0x1d4db:u'l', 0x1d4dc:u'm', 0x1d4dd:u'n', +0x1d4de:u'o', 0x1d4df:u'p', 0x1d4e0:u'q', 0x1d4e1:u'r', +0x1d4e2:u's', 0x1d4e3:u't', 0x1d4e4:u'u', 0x1d4e5:u'v', +0x1d4e6:u'w', 0x1d4e7:u'x', 0x1d4e8:u'y', 0x1d4e9:u'z', +0x1d504:u'a', 0x1d505:u'b', 0x1d507:u'd', 0x1d508:u'e', +0x1d509:u'f', 0x1d50a:u'g', 0x1d50d:u'j', 0x1d50e:u'k', +0x1d50f:u'l', 0x1d510:u'm', 0x1d511:u'n', 0x1d512:u'o', +0x1d513:u'p', 0x1d514:u'q', 0x1d516:u's', 0x1d517:u't', +0x1d518:u'u', 0x1d519:u'v', 0x1d51a:u'w', 0x1d51b:u'x', +0x1d51c:u'y', 0x1d538:u'a', 0x1d539:u'b', 0x1d53b:u'd', +0x1d53c:u'e', 0x1d53d:u'f', 0x1d53e:u'g', 0x1d540:u'i', +0x1d541:u'j', 0x1d542:u'k', 0x1d543:u'l', 0x1d544:u'm', +0x1d546:u'o', 0x1d54a:u's', 0x1d54b:u't', 0x1d54c:u'u', +0x1d54d:u'v', 0x1d54e:u'w', 0x1d54f:u'x', 0x1d550:u'y', +0x1d56c:u'a', 0x1d56d:u'b', 0x1d56e:u'c', 0x1d56f:u'd', +0x1d570:u'e', 0x1d571:u'f', 0x1d572:u'g', 0x1d573:u'h', +0x1d574:u'i', 0x1d575:u'j', 0x1d576:u'k', 0x1d577:u'l', +0x1d578:u'm', 0x1d579:u'n', 0x1d57a:u'o', 0x1d57b:u'p', +0x1d57c:u'q', 0x1d57d:u'r', 0x1d57e:u's', 0x1d57f:u't', +0x1d580:u'u', 0x1d581:u'v', 0x1d582:u'w', 0x1d583:u'x', +0x1d584:u'y', 0x1d585:u'z', 0x1d5a0:u'a', 0x1d5a1:u'b', +0x1d5a2:u'c', 0x1d5a3:u'd', 0x1d5a4:u'e', 0x1d5a5:u'f', +0x1d5a6:u'g', 0x1d5a7:u'h', 0x1d5a8:u'i', 0x1d5a9:u'j', +0x1d5aa:u'k', 0x1d5ab:u'l', 0x1d5ac:u'm', 0x1d5ad:u'n', +0x1d5ae:u'o', 0x1d5af:u'p', 0x1d5b0:u'q', 0x1d5b1:u'r', +0x1d5b2:u's', 0x1d5b3:u't', 0x1d5b4:u'u', 0x1d5b5:u'v', +0x1d5b6:u'w', 0x1d5b7:u'x', 0x1d5b8:u'y', 0x1d5b9:u'z', +0x1d5d4:u'a', 0x1d5d5:u'b', 0x1d5d6:u'c', 0x1d5d7:u'd', +0x1d5d8:u'e', 0x1d5d9:u'f', 0x1d5da:u'g', 0x1d5db:u'h', +0x1d5dc:u'i', 0x1d5dd:u'j', 
0x1d5de:u'k', 0x1d5df:u'l', +0x1d5e0:u'm', 0x1d5e1:u'n', 0x1d5e2:u'o', 0x1d5e3:u'p', +0x1d5e4:u'q', 0x1d5e5:u'r', 0x1d5e6:u's', 0x1d5e7:u't', +0x1d5e8:u'u', 0x1d5e9:u'v', 0x1d5ea:u'w', 0x1d5eb:u'x', +0x1d5ec:u'y', 0x1d5ed:u'z', 0x1d608:u'a', 0x1d609:u'b', +0x1d60a:u'c', 0x1d60b:u'd', 0x1d60c:u'e', 0x1d60d:u'f', +0x1d60e:u'g', 0x1d60f:u'h', 0x1d610:u'i', 0x1d611:u'j', +0x1d612:u'k', 0x1d613:u'l', 0x1d614:u'm', 0x1d615:u'n', +0x1d616:u'o', 0x1d617:u'p', 0x1d618:u'q', 0x1d619:u'r', +0x1d61a:u's', 0x1d61b:u't', 0x1d61c:u'u', 0x1d61d:u'v', +0x1d61e:u'w', 0x1d61f:u'x', 0x1d620:u'y', 0x1d621:u'z', +0x1d63c:u'a', 0x1d63d:u'b', 0x1d63e:u'c', 0x1d63f:u'd', +0x1d640:u'e', 0x1d641:u'f', 0x1d642:u'g', 0x1d643:u'h', +0x1d644:u'i', 0x1d645:u'j', 0x1d646:u'k', 0x1d647:u'l', +0x1d648:u'm', 0x1d649:u'n', 0x1d64a:u'o', 0x1d64b:u'p', +0x1d64c:u'q', 0x1d64d:u'r', 0x1d64e:u's', 0x1d64f:u't', +0x1d650:u'u', 0x1d651:u'v', 0x1d652:u'w', 0x1d653:u'x', +0x1d654:u'y', 0x1d655:u'z', 0x1d670:u'a', 0x1d671:u'b', +0x1d672:u'c', 0x1d673:u'd', 0x1d674:u'e', 0x1d675:u'f', +0x1d676:u'g', 0x1d677:u'h', 0x1d678:u'i', 0x1d679:u'j', +0x1d67a:u'k', 0x1d67b:u'l', 0x1d67c:u'm', 0x1d67d:u'n', +0x1d67e:u'o', 0x1d67f:u'p', 0x1d680:u'q', 0x1d681:u'r', +0x1d682:u's', 0x1d683:u't', 0x1d684:u'u', 0x1d685:u'v', +0x1d686:u'w', 0x1d687:u'x', 0x1d688:u'y', 0x1d689:u'z', +0x1d6a8:u'\u03b1', 0x1d6a9:u'\u03b2', 0x1d6aa:u'\u03b3', 0x1d6ab:u'\u03b4', +0x1d6ac:u'\u03b5', 0x1d6ad:u'\u03b6', 0x1d6ae:u'\u03b7', 0x1d6af:u'\u03b8', +0x1d6b0:u'\u03b9', 0x1d6b1:u'\u03ba', 0x1d6b2:u'\u03bb', 0x1d6b3:u'\u03bc', +0x1d6b4:u'\u03bd', 0x1d6b5:u'\u03be', 0x1d6b6:u'\u03bf', 0x1d6b7:u'\u03c0', +0x1d6b8:u'\u03c1', 0x1d6b9:u'\u03b8', 0x1d6ba:u'\u03c3', 0x1d6bb:u'\u03c4', +0x1d6bc:u'\u03c5', 0x1d6bd:u'\u03c6', 0x1d6be:u'\u03c7', 0x1d6bf:u'\u03c8', +0x1d6c0:u'\u03c9', 0x1d6d3:u'\u03c3', 0x1d6e2:u'\u03b1', 0x1d6e3:u'\u03b2', +0x1d6e4:u'\u03b3', 0x1d6e5:u'\u03b4', 0x1d6e6:u'\u03b5', 0x1d6e7:u'\u03b6', +0x1d6e8:u'\u03b7', 0x1d6e9:u'\u03b8', 
0x1d6ea:u'\u03b9', 0x1d6eb:u'\u03ba', +0x1d6ec:u'\u03bb', 0x1d6ed:u'\u03bc', 0x1d6ee:u'\u03bd', 0x1d6ef:u'\u03be', +0x1d6f0:u'\u03bf', 0x1d6f1:u'\u03c0', 0x1d6f2:u'\u03c1', 0x1d6f3:u'\u03b8', +0x1d6f4:u'\u03c3', 0x1d6f5:u'\u03c4', 0x1d6f6:u'\u03c5', 0x1d6f7:u'\u03c6', +0x1d6f8:u'\u03c7', 0x1d6f9:u'\u03c8', 0x1d6fa:u'\u03c9', 0x1d70d:u'\u03c3', +0x1d71c:u'\u03b1', 0x1d71d:u'\u03b2', 0x1d71e:u'\u03b3', 0x1d71f:u'\u03b4', +0x1d720:u'\u03b5', 0x1d721:u'\u03b6', 0x1d722:u'\u03b7', 0x1d723:u'\u03b8', +0x1d724:u'\u03b9', 0x1d725:u'\u03ba', 0x1d726:u'\u03bb', 0x1d727:u'\u03bc', +0x1d728:u'\u03bd', 0x1d729:u'\u03be', 0x1d72a:u'\u03bf', 0x1d72b:u'\u03c0', +0x1d72c:u'\u03c1', 0x1d72d:u'\u03b8', 0x1d72e:u'\u03c3', 0x1d72f:u'\u03c4', +0x1d730:u'\u03c5', 0x1d731:u'\u03c6', 0x1d732:u'\u03c7', 0x1d733:u'\u03c8', +0x1d734:u'\u03c9', 0x1d747:u'\u03c3', 0x1d756:u'\u03b1', 0x1d757:u'\u03b2', +0x1d758:u'\u03b3', 0x1d759:u'\u03b4', 0x1d75a:u'\u03b5', 0x1d75b:u'\u03b6', +0x1d75c:u'\u03b7', 0x1d75d:u'\u03b8', 0x1d75e:u'\u03b9', 0x1d75f:u'\u03ba', +0x1d760:u'\u03bb', 0x1d761:u'\u03bc', 0x1d762:u'\u03bd', 0x1d763:u'\u03be', +0x1d764:u'\u03bf', 0x1d765:u'\u03c0', 0x1d766:u'\u03c1', 0x1d767:u'\u03b8', +0x1d768:u'\u03c3', 0x1d769:u'\u03c4', 0x1d76a:u'\u03c5', 0x1d76b:u'\u03c6', +0x1d76c:u'\u03c7', 0x1d76d:u'\u03c8', 0x1d76e:u'\u03c9', 0x1d781:u'\u03c3', +0x1d790:u'\u03b1', 0x1d791:u'\u03b2', 0x1d792:u'\u03b3', 0x1d793:u'\u03b4', +0x1d794:u'\u03b5', 0x1d795:u'\u03b6', 0x1d796:u'\u03b7', 0x1d797:u'\u03b8', +0x1d798:u'\u03b9', 0x1d799:u'\u03ba', 0x1d79a:u'\u03bb', 0x1d79b:u'\u03bc', +0x1d79c:u'\u03bd', 0x1d79d:u'\u03be', 0x1d79e:u'\u03bf', 0x1d79f:u'\u03c0', +0x1d7a0:u'\u03c1', 0x1d7a1:u'\u03b8', 0x1d7a2:u'\u03c3', 0x1d7a3:u'\u03c4', +0x1d7a4:u'\u03c5', 0x1d7a5:u'\u03c6', 0x1d7a6:u'\u03c7', 0x1d7a7:u'\u03c8', +0x1d7a8:u'\u03c9', 0x1d7bb:u'\u03c3', } + +def map_table_b3(code): + r = b3_exceptions.get(ord(code)) + if r is not None: return r + return code.lower() + + +def map_table_b2(a): + al = 
map_table_b3(a) + b = unicodedata.normalize("NFKC", al) + bl = u"".join([map_table_b3(ch) for ch in b]) + c = unicodedata.normalize("NFKC", bl) + if b != c: + return c + else: + return al + + +def in_table_c11(code): + return code == u" " + + +def in_table_c12(code): + return unicodedata.category(code) == "Zs" and code != u" " + +def in_table_c11_c12(code): + return unicodedata.category(code) == "Zs" + + +def in_table_c21(code): + return ord(code) < 128 and unicodedata.category(code) == "Cc" + +c22_specials = sets.Set([1757, 1807, 6158, 8204, 8205, 8232, 8233, 65279] + range(8288,8292) + range(8298,8304) + range(65529,65533) + range(119155,119163)) +def in_table_c22(code): + c = ord(code) + if c < 128: return False + if unicodedata.category(code) == "Cc": return True + return c in c22_specials + +def in_table_c21_c22(code): + return unicodedata.category(code) == "Cc" or \ + ord(code) in c22_specials + + +def in_table_c3(code): + return unicodedata.category(code) == "Co" + + +def in_table_c4(code): + c = ord(code) + if c < 0xFDD0: return False + if c < 0xFDF0: return True + return (ord(code) & 0xFFFF) in (0xFFFE, 0xFFFF) + + +def in_table_c5(code): + return unicodedata.category(code) == "Cs" + + +c6_set = sets.Set(range(65529,65534)) +def in_table_c6(code): + return ord(code) in c6_set + + +c7_set = sets.Set(range(12272,12284)) +def in_table_c7(code): + return ord(code) in c7_set + + +c8_set = sets.Set([832, 833, 8206, 8207] + range(8234,8239) + range(8298,8304)) +def in_table_c8(code): + return ord(code) in c8_set + + +c9_set = sets.Set([917505] + range(917536,917632)) +def in_table_c9(code): + return ord(code) in c9_set + + +def in_table_d1(code): + return unicodedata.bidirectional(code) in ("R","AL") + + +def in_table_d2(code): + return unicodedata.bidirectional(code) == "L" + diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 9a4f35f2166..769a40d20aa 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -36,11 +36,307 @@ class 
RecodingTest(unittest.TestCase): # Python used to crash on this at exit because of a refcount # bug in _codecsmodule.c +# From RFC 3492 +punycode_testcases = [ + # A Arabic (Egyptian): + (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644" + u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F", + "egbpdaj6bu4bxfgehfvwxn"), + # B Chinese (simplified): + (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587", + "ihqwcrb4cv8a8dqg056pqjye"), + # C Chinese (traditional): + (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587", + "ihqwctvzc91f659drss3x8bo0yb"), + # D Czech: Proprostnemluvesky + (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074" + u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D" + u"\u0065\u0073\u006B\u0079", + "Proprostnemluvesky-uyb24dma41a"), + # E Hebrew: + (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8" + u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2" + u"\u05D1\u05E8\u05D9\u05EA", + "4dbcagdahymbxekheh6e0a7fei0b"), + # F Hindi (Devanagari): + (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D" + u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939" + u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947" + u"\u0939\u0948\u0902", + "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"), + + #(G) Japanese (kanji and hiragana): + (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092" + u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B", + "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"), + + # (H) Korean (Hangul syllables): + (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774" + u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74" + u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C", + "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j" + "psd879ccm6fea98c"), + + # (I) Russian (Cyrillic): + (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E" + u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440" + u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A" + u"\u0438", + 
"b1abfaaepdrnnbgefbaDotcwatmq2g4l"), + + # (J) Spanish: PorqunopuedensimplementehablarenEspaol + (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070" + u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070" + u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061" + u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070" + u"\u0061\u00F1\u006F\u006C", + "PorqunopuedensimplementehablarenEspaol-fmd56a"), + + # (K) Vietnamese: + # Tisaohkhngthch\ + # nitingVit + (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B" + u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068" + u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067" + u"\u0056\u0069\u1EC7\u0074", + "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"), + + + #(L) 3B + (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F", + "3B-ww4c5e180e575a65lsy2b"), + + # (M) -with-SUPER-MONKEYS + (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074" + u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D" + u"\u004F\u004E\u004B\u0045\u0059\u0053", + "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"), + + # (N) Hello-Another-Way- + (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F" + u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D" + u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240", + "Hello-Another-Way--fc4qua05auwb3674vfr0b"), + + # (O) 2 + (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032", + "2-u9tlzr9756bt3uc0v"), + + # (P) MajiKoi5 + (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059" + u"\u308B\u0035\u79D2\u524D", + "MajiKoi5-783gue6qz075azm5e"), + + # (Q) de + (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0", + "de-jg4avhby1noc0d"), + + # (R) + (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067", + "d9juau41awczczp"), + + # (S) -> $1.00 <- + (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020" + u"\u003C\u002D", + "-> $1.00 <--") + ] + +for i in punycode_testcases: + if len(i)!=2: + print repr(i) + +class PunycodeTest(unittest.TestCase): + def test_encode(self): 
+ for uni, puny in punycode_testcases: + # Need to convert both strings to lower case, since + # some of the extended encodings use upper case, but our + # code produces only lower case. Converting just puny to + # lower is also insufficient, since some of the input characters + # are upper case. + self.assertEquals(uni.encode("punycode").lower(), puny.lower()) + + def test_decode(self): + for uni, puny in punycode_testcases: + self.assertEquals(uni, puny.decode("punycode")) + +# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html +nameprep_tests = [ + # 3.1 Map to nothing. + ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar' + '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef' + '\xb8\x8f\xef\xbb\xbf', + 'foobarbaz'), + # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045. + ('CAFE', + 'cafe'), + # 3.3 Case folding 8bit U+00DF (german sharp s). + # The original test case is bogus; it says \xc3\xdf + ('\xc3\x9f', + 'ss'), + # 3.4 Case folding U+0130 (turkish capital I with dot). + ('\xc4\xb0', + 'i\xcc\x87'), + # 3.5 Case folding multibyte U+0143 U+037A. + ('\xc5\x83\xcd\xba', + '\xc5\x84 \xce\xb9'), + # 3.6 Case folding U+2121 U+33C6 U+1D7BB. + # XXX: skip this as it fails in UCS-2 mode + #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb', + # 'telc\xe2\x88\x95kg\xcf\x83'), + (None, None), + # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA. + ('j\xcc\x8c\xc2\xa0\xc2\xaa', + '\xc7\xb0 a'), + # 3.8 Case folding U+1FB7 and normalization. + ('\xe1\xbe\xb7', + '\xe1\xbe\xb6\xce\xb9'), + # 3.9 Self-reverting case folding U+01F0 and normalization. + # The original test case is bogus, it says `\xc7\xf0' + ('\xc7\xb0', + '\xc7\xb0'), + # 3.10 Self-reverting case folding U+0390 and normalization. + ('\xce\x90', + '\xce\x90'), + # 3.11 Self-reverting case folding U+03B0 and normalization. + ('\xce\xb0', + '\xce\xb0'), + # 3.12 Self-reverting case folding U+1E96 and normalization. 
+ ('\xe1\xba\x96', + '\xe1\xba\x96'), + # 3.13 Self-reverting case folding U+1F56 and normalization. + ('\xe1\xbd\x96', + '\xe1\xbd\x96'), + # 3.14 ASCII space character U+0020. + (' ', + ' '), + # 3.15 Non-ASCII 8bit space character U+00A0. + ('\xc2\xa0', + ' '), + # 3.16 Non-ASCII multibyte space character U+1680. + ('\xe1\x9a\x80', + None), + # 3.17 Non-ASCII multibyte space character U+2000. + ('\xe2\x80\x80', + ' '), + # 3.18 Zero Width Space U+200b. + ('\xe2\x80\x8b', + ''), + # 3.19 Non-ASCII multibyte space character U+3000. + ('\xe3\x80\x80', + ' '), + # 3.20 ASCII control characters U+0010 U+007F. + ('\x10\x7f', + '\x10\x7f'), + # 3.21 Non-ASCII 8bit control character U+0085. + ('\xc2\x85', + None), + # 3.22 Non-ASCII multibyte control character U+180E. + ('\xe1\xa0\x8e', + None), + # 3.23 Zero Width No-Break Space U+FEFF. + ('\xef\xbb\xbf', + ''), + # 3.24 Non-ASCII control character U+1D175. + ('\xf0\x9d\x85\xb5', + None), + # 3.25 Plane 0 private use character U+F123. + ('\xef\x84\xa3', + None), + # 3.26 Plane 15 private use character U+F1234. + ('\xf3\xb1\x88\xb4', + None), + # 3.27 Plane 16 private use character U+10F234. + ('\xf4\x8f\x88\xb4', + None), + # 3.28 Non-character code point U+8FFFE. + ('\xf2\x8f\xbf\xbe', + None), + # 3.29 Non-character code point U+10FFFF. + ('\xf4\x8f\xbf\xbf', + None), + # 3.30 Surrogate code U+DF42. + ('\xed\xbd\x82', + None), + # 3.31 Non-plain text character U+FFFD. + ('\xef\xbf\xbd', + None), + # 3.32 Ideographic description character U+2FF5. + ('\xe2\xbf\xb5', + None), + # 3.33 Display property character U+0341. + ('\xcd\x81', + '\xcc\x81'), + # 3.34 Left-to-right mark U+200E. + ('\xe2\x80\x8e', + None), + # 3.35 Deprecated U+202A. + ('\xe2\x80\xaa', + None), + # 3.36 Language tagging character U+E0001. + ('\xf3\xa0\x80\x81', + None), + # 3.37 Language tagging character U+E0042. + ('\xf3\xa0\x81\x82', + None), + # 3.38 Bidi: RandALCat character U+05BE and LCat characters. 
+ ('foo\xd6\xbebar', + None), + # 3.39 Bidi: RandALCat character U+FD50 and LCat characters. + ('foo\xef\xb5\x90bar', + None), + # 3.40 Bidi: RandALCat character U+FB38 and LCat characters. + ('foo\xef\xb9\xb6bar', + 'foo \xd9\x8ebar'), + # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031. + ('\xd8\xa71', + None), + # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628. + ('\xd8\xa71\xd8\xa8', + '\xd8\xa71\xd8\xa8'), + # 3.43 Unassigned code point U+E0002. + ('\xf3\xa0\x80\x82', + None), + # 3.44 Larger test (shrinking). + # Original test case reads \xc3\xdf + ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2' + '\xaa\xce\xb0\xe2\x80\x80', + 'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '), + # 3.45 Larger test (expanding). + # Original test case reads \xc3\x9f + ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c' + '\x80', + 'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3' + '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82' + '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88') + ] + + +class NameprepTest(unittest.TestCase): + def test_nameprep(self): + from encodings.idna import nameprep + for pos, (orig, prepped) in enumerate(nameprep_tests): + if orig is None: + # Skipped + continue + # The Unicode strings are given in UTF-8 + orig = unicode(orig, "utf-8") + if prepped is None: + # Input contains prohibited characters + self.assertRaises(UnicodeError, nameprep, orig) + else: + prepped = unicode(prepped, "utf-8") + try: + self.assertEquals(nameprep(orig), prepped) + except Exception,e: + raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e))) + def test_main(): suite = unittest.TestSuite() suite.addTest(unittest.makeSuite(UTF16Test)) suite.addTest(unittest.makeSuite(EscapeDecodeTest)) suite.addTest(unittest.makeSuite(RecodingTest)) + suite.addTest(unittest.makeSuite(PunycodeTest)) + suite.addTest(unittest.makeSuite(NameprepTest)) test_support.run_suite(suite) diff --git a/Misc/NEWS b/Misc/NEWS index 3a19a20e0d0..07a9bc6b062 
100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -118,6 +118,11 @@ Extension modules Library ------- +- Support for internationalized domain names has been added through + the 'idna' and 'punycode' encodings, the 'stringprep' module, the + 'mkstringprep' tool, and enhancements to the socket and httplib + modules. + - htmlentitydefs has two new dictionaries: name2codepoint maps HTML entity names to Unicode codepoints (as integers). codepoint2name is the reverse mapping. See SF patch #722017. diff --git a/Modules/socketmodule.c b/Modules/socketmodule.c index 9d8d6140d80..ba64cb86864 100644 --- a/Modules/socketmodule.c +++ b/Modules/socketmodule.c @@ -874,7 +874,8 @@ getsockaddrarg(PySocketSockObject *s, PyObject *args, args->ob_type->tp_name); return 0; } - if (!PyArg_ParseTuple(args, "si:getsockaddrarg", &host, &port)) + if (!PyArg_ParseTuple(args, "eti:getsockaddrarg", + "idna", &host, &port)) return 0; if (setipaddr(host, (struct sockaddr *)addr, sizeof(*addr), AF_INET) < 0) return 0; @@ -893,7 +894,8 @@ getsockaddrarg(PySocketSockObject *s, PyObject *args, int port, flowinfo, scope_id; addr = (struct sockaddr_in6*)&(s->sock_addr).in6; flowinfo = scope_id = 0; - if (!PyArg_ParseTuple(args, "si|ii", &host, &port, &flowinfo, + if (!PyArg_ParseTuple(args, "eti|ii", + "idna", &host, &port, &flowinfo, &scope_id)) { return 0; } @@ -2782,6 +2784,7 @@ socket_getaddrinfo(PyObject *self, PyObject *args) { struct addrinfo hints, *res; struct addrinfo *res0 = NULL; + PyObject *hobj = NULL; PyObject *pobj = (PyObject *)NULL; char pbuf[30]; char *hptr, *pptr; @@ -2789,12 +2792,27 @@ socket_getaddrinfo(PyObject *self, PyObject *args) int error; PyObject *all = (PyObject *)NULL; PyObject *single = (PyObject *)NULL; + PyObject *idna = NULL; family = socktype = protocol = flags = 0; family = AF_UNSPEC; - if (!PyArg_ParseTuple(args, "zO|iiii:getaddrinfo", - &hptr, &pobj, &family, &socktype, - &protocol, &flags)) { + if (!PyArg_ParseTuple(args, "OO|iiii:getaddrinfo", + &hobj, &pobj, 
&family, &socktype, + &protocol, &flags)) { + return NULL; + } + if (hobj == Py_None) { + hptr = NULL; + } else if (PyUnicode_Check(hobj)) { + idna = PyObject_CallMethod(hobj, "encode", "s", "idna"); + if (!idna) + return NULL; + hptr = PyString_AsString(idna); + } else if (PyString_Check(hobj)) { + hptr = PyString_AsString(hobj); + } else { + PyErr_SetString(PyExc_TypeError, + "getaddrinfo() argument 1 must be string or None"); return NULL; } if (PyInt_Check(pobj)) { @@ -2838,12 +2856,14 @@ socket_getaddrinfo(PyObject *self, PyObject *args) goto err; Py_XDECREF(single); } + Py_XDECREF(idna); if (res0) freeaddrinfo(res0); return all; err: Py_XDECREF(single); Py_XDECREF(all); + Py_XDECREF(idna); if (res0) freeaddrinfo(res0); return (PyObject *)NULL; diff --git a/Tools/unicode/mkstringprep.py b/Tools/unicode/mkstringprep.py new file mode 100644 index 00000000000..1dd1e04b7ed --- /dev/null +++ b/Tools/unicode/mkstringprep.py @@ -0,0 +1,433 @@ +import re, unicodedata, sys, sets +from sets import Set + +if sys.maxunicode == 65535: + raise RuntimeError, "need UCS-4 Python" + +def gen_category(cats): + for i in range(0, 0x110000): + if unicodedata.category(unichr(i)) in cats: + yield(i) + +def gen_bidirectional(cats): + for i in range(0, 0x110000): + if unicodedata.bidirectional(unichr(i)) in cats: + yield(i) + +def compact_set(l): + single = [] + tuple = [] + prev = None + span = 0 + for e in l: + if prev is None: + prev = e + span = 0 + continue + if prev+span+1 != e: + if span > 2: + tuple.append((prev,prev+span+1)) + else: + for i in range(prev, prev+span+1): + single.append(i) + prev = e + span = 0 + else: + span += 1 + if span: + tuple.append((prev,prev+span+1)) + else: + single.append(prev) + tuple = " + ".join(["range(%d,%d)" % t for t in tuple]) + if not single: + return "sets.Set(%s)" % tuple + if not tuple: + return "sets.Set(%s)" % repr(single) + return "sets.Set(%s + %s)" % (repr(single),tuple) + +############## Read the tables in the RFC 
####################### + +data = open("rfc3454.txt").readlines() + +tables = [] +curname = None +for l in data: + l = l.strip() + if not l: + continue + # Skip RFC page breaks + if l.startswith("Hoffman & Blanchet") or\ + l.startswith("RFC 3454"): + continue + # Find start/end lines + m = re.match("----- (Start|End) Table ([A-Z](.[0-9])+) -----", l) + if m: + if m.group(1) == "Start": + if curname: + raise "Double Start",(curname, l) + curname = m.group(2) + table = {} + tables.append((curname, table)) + continue + else: + if not curname: + raise "End without start", l + curname = None + continue + if not curname: + continue + # Now we are in a table + fields = l.split(";") + if len(fields) > 1: + # Drop comment field + fields = fields[:-1] + if len(fields) == 1: + fields = fields[0].split("-") + if len(fields) > 1: + # range + try: + start, end = fields + except ValueError: + raise "Unpacking problem", l + else: + start = end = fields[0] + start = int(start, 16) + end = int(end, 16) + for i in range(start, end+1): + table[i] = i + else: + code, value = fields + value = value.strip() + if value: + value = [int(v, 16) for v in value.split(" ")] + else: + # table B.1 + value = None + table[int(code, 16)] = value + +########### Generate compact Python versions of the tables ############# + +print """# This file is generated by mkstringprep.py. DO NOT EDIT. +\"\"\"Library that exposes various tables found in the StringPrep RFC 3454. + +There are two kinds of tables: sets, for which a member test is provided, +and mappings, for which a mapping function is provided. +\"\"\" + +import unicodedata, sets +""" + +print "assert unicodedata.unidata_version == %s" % repr(unicodedata.unidata_version) + +# A.1 is the table of unassigned characters +# XXX Plane 15 PUA is listed as unassigned in Python. 
+name, table = tables[0] +del tables[0] +assert name == "A.1" +table = Set(table.keys()) +Cn = Set(gen_category(["Cn"])) + +# FDD0..FDEF are process internal codes +Cn -= Set(range(0xFDD0, 0xFDF0)) +# not a character +Cn -= Set(range(0xFFFE, 0x110000, 0x10000)) +Cn -= Set(range(0xFFFF, 0x110000, 0x10000)) + +# assert table == Cn + +print """ +def in_table_a1(code): + if unicodedata.category(code) != 'Cn': return False + c = ord(code) + if 0xFDD0 <= c < 0xFDF0: return False + return (c & 0xFFFF) not in (0xFFFE, 0xFFFF) +""" + +# B.1 cannot easily be derived +name, table = tables[0] +del tables[0] +assert name == "B.1" +table = table.keys() +table.sort() +print """ +b1_set = """ + compact_set(table) + """ +def in_table_b1(code): + return ord(code) in b1_set +""" + +# B.2 and B.3 is case folding. +# It takes CaseFolding.txt into account, which is +# not available in the Python database. Since +# B.2 is derived from B.3, we process B.3 first. +# B.3 supposedly *is* CaseFolding-3.2.0.txt. + +name, table_b2 = tables[0] +del tables[0] +assert name == "B.2" + +name, table_b3 = tables[0] +del tables[0] +assert name == "B.3" + +# B.3 is mostly Python's .lower, except for a number +# of special cases, e.g. considering canonical forms. + +b3_exceptions = {} + +for k,v in table_b2.items(): + if map(ord, unichr(k).lower()) != v: + b3_exceptions[k] = u"".join(map(unichr,v)) + +b3 = b3_exceptions.items() +b3.sort() + +print """ +b3_exceptions = {""" +for i,(k,v) in enumerate(b3): + print "0x%x:%s," % (k, repr(v)), + if i % 4 == 3: + print +print "}" + +print """ +def map_table_b3(code): + r = b3_exceptions.get(ord(code)) + if r is not None: return r + return code.lower() +""" + +def map_table_b3(code): + r = b3_exceptions.get(ord(code)) + if r is not None: return r + return code.lower() + +# B.2 is case folding for NFKC. 
This is the same as B.3, +# except where NormalizeWithKC(Fold(a)) != +# NormalizeWithKC(Fold(NormalizeWithKC(Fold(a)))) + +def map_table_b2(a): + al = map_table_b3(a) + b = unicodedata.normalize("NFKC", al) + bl = u"".join([map_table_b3(ch) for ch in b]) + c = unicodedata.normalize("NFKC", bl) + if b != c: + return c + else: + return al + +specials = {} +for k,v in table_b2.items(): + if map(ord, map_table_b2(unichr(k))) != v: + specials[k] = v + +# B.3 should not add any additional special cases +assert specials == {} + +print """ +def map_table_b2(a): + al = map_table_b3(a) + b = unicodedata.normalize("NFKC", al) + bl = u"".join([map_table_b3(ch) for ch in b]) + c = unicodedata.normalize("NFKC", bl) + if b != c: + return c + else: + return al +""" + +# C.1.1 is a table with a single character +name, table = tables[0] +del tables[0] +assert name == "C.1.1" +assert table == {0x20:0x20} + +print """ +def in_table_c11(code): + return code == u" " +""" + +# C.1.2 is the rest of all space characters +name, table = tables[0] +del tables[0] +assert name == "C.1.2" + +# table = Set(table.keys()) +# Zs = Set(gen_category(["Zs"])) - Set([0x20]) +# assert Zs == table + +print """ +def in_table_c12(code): + return unicodedata.category(code) == "Zs" and code != u" " + +def in_table_c11_c12(code): + return unicodedata.category(code) == "Zs" +""" + +# C.2.1 ASCII control characters +name, table_c21 = tables[0] +del tables[0] +assert name == "C.2.1" + +Cc = Set(gen_category(["Cc"])) +Cc_ascii = Cc & Set(range(128)) +table_c21 = Set(table_c21.keys()) +assert Cc_ascii == table_c21 + +print """ +def in_table_c21(code): + return ord(code) < 128 and unicodedata.category(code) == "Cc" +""" + +# C.2.2 Non-ASCII control characters. It also includes +# a number of characters in category Cf. 
+name, table_c22 = tables[0] +del tables[0] +assert name == "C.2.2" + +Cc_nonascii = Cc - Cc_ascii +table_c22 = Set(table_c22.keys()) +assert len(Cc_nonascii - table_c22) == 0 + +specials = list(table_c22 - Cc_nonascii) +specials.sort() + +print """c22_specials = """ + compact_set(specials) + """ +def in_table_c22(code): + c = ord(code) + if c < 128: return False + if unicodedata.category(code) == "Cc": return True + return c in c22_specials + +def in_table_c21_c22(code): + return unicodedata.category(code) == "Cc" or \\ + ord(code) in c22_specials +""" + +# C.3 Private use +name, table = tables[0] +del tables[0] +assert name == "C.3" + +Co = Set(gen_category(["Co"])) +assert Set(table.keys()) == Co + +print """ +def in_table_c3(code): + return unicodedata.category(code) == "Co" +""" + +# C.4 Non-character code points, xFFFE, xFFFF +# plus process internal codes +name, table = tables[0] +del tables[0] +assert name == "C.4" + +nonchar = Set(range(0xFDD0,0xFDF0) + + range(0xFFFE,0x110000,0x10000) + + range(0xFFFF,0x110000,0x10000)) +table = Set(table.keys()) +assert table == nonchar + +print """ +def in_table_c4(code): + c = ord(code) + if c < 0xFDD0: return False + if c < 0xFDF0: return True + return (ord(code) & 0xFFFF) in (0xFFFE, 0xFFFF) +""" + +# C.5 Surrogate codes +name, table = tables[0] +del tables[0] +assert name == "C.5" + +Cs = Set(gen_category(["Cs"])) +assert Set(table.keys()) == Cs + +print """ +def in_table_c5(code): + return unicodedata.category(code) == "Cs" +""" + +# C.6 Inappropriate for plain text +name, table = tables[0] +del tables[0] +assert name == "C.6" + +table = table.keys() +table.sort() + +print """ +c6_set = """ + compact_set(table) + """ +def in_table_c6(code): + return ord(code) in c6_set +""" + +# C.7 Inappropriate for canonical representation +name, table = tables[0] +del tables[0] +assert name == "C.7" + +table = table.keys() +table.sort() + +print """ +c7_set = """ + compact_set(table) + """ +def in_table_c7(code): + return 
ord(code) in c7_set +""" + +# C.8 Change display properties or are deprecated +name, table = tables[0] +del tables[0] +assert name == "C.8" + +table = table.keys() +table.sort() + +print """ +c8_set = """ + compact_set(table) + """ +def in_table_c8(code): + return ord(code) in c8_set +""" + +# C.9 Tagging characters +name, table = tables[0] +del tables[0] +assert name == "C.9" + +table = table.keys() +table.sort() + +print """ +c9_set = """ + compact_set(table) + """ +def in_table_c9(code): + return ord(code) in c9_set +""" + +# D.1 Characters with bidirectional property "R" or "AL" +name, table = tables[0] +del tables[0] +assert name == "D.1" + +RandAL = Set(gen_bidirectional(["R","AL"])) +assert Set(table.keys()) == RandAL + +print """ +def in_table_d1(code): + return unicodedata.bidirectional(code) in ("R","AL") +""" + +# D.2 Characters with bidirectional property "L" +name, table = tables[0] +del tables[0] +assert name == "D.2" + +L = Set(gen_bidirectional(["L"])) +assert Set(table.keys()) == L + +print """ +def in_table_d2(code): + return unicodedata.bidirectional(code) == "L" +""" +