2003-04-18 07:39:54 -03:00
|
|
|
# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
|
|
|
|
|
2006-03-09 19:38:20 -04:00
|
|
|
import stringprep, re, codecs
|
2006-03-10 07:20:04 -04:00
|
|
|
from unicodedata import ucd_3_2_0 as unicodedata
|
2003-04-18 07:39:54 -03:00
|
|
|
|
|
|
|
# IDNA section 3.1: a label separator is any of FULL STOP, IDEOGRAPHIC
# FULL STOP, FULLWIDTH FULL STOP, or HALFWIDTH IDEOGRAPHIC FULL STOP.
dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5: the ACE (ASCII Compatible Encoding) prefix, kept both
# as bytes (for encoded labels) and as str (for checks before encoding).
ace_prefix = b"xn--"
sace_prefix = "xn--"
|
2003-04-18 07:39:54 -03:00
|
|
|
|
|
|
|
# This assumes query strings, so AllowUnassigned is true
def nameprep(label):
    """Apply the RFC 3491 nameprep profile of stringprep to *label*.

    Runs the mapping, normalization, prohibition and bidi steps and
    returns the prepared string.  Raises UnicodeError on any prohibited
    character or bidi violation.  Unassigned code points are accepted
    (AllowUnassigned is true, as for query strings).
    """
    # Map: drop characters mapped to nothing (table B.1) and case-fold
    # the rest (table B.2).
    mapped = [stringprep.map_table_b2(ch)
              for ch in label
              if not stringprep.in_table_b1(ch)]
    label = "".join(mapped)

    # Normalize with NFKC (using the Unicode 3.2 database).
    label = unicodedata.normalize("NFKC", label)

    # Prohibit: reject any character found in tables C.1.2, C.2.2 or
    # C.3 through C.9.
    prohibited = (
        stringprep.in_table_c12,
        stringprep.in_table_c22,
        stringprep.in_table_c3,
        stringprep.in_table_c4,
        stringprep.in_table_c5,
        stringprep.in_table_c6,
        stringprep.in_table_c7,
        stringprep.in_table_c8,
        stringprep.in_table_c9,
    )
    for ch in label:
        if any(check(ch) for check in prohibited):
            raise UnicodeError("Invalid character %r" % ch)

    # Check bidi (RFC 3454, section 6).
    RandAL = [stringprep.in_table_d1(ch) for ch in label]
    if any(RandAL):
        # There is a RandALCat char in the string.  Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        #    This is table C.8, which was already checked above.
        # 2) If a string contains any RandALCat character, the string
        #    MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(ch) for ch in label):
            raise UnicodeError("Violation of BIDI requirement 2")
        # 3) If a string contains any RandALCat character, a RandALCat
        #    character MUST be the first character of the string, and a
        #    RandALCat character MUST be the last character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label
|
|
|
|
|
|
|
|
def ToASCII(label):
    """Convert a single label to ASCII, per the RFC 3490 ToASCII operation.

    Returns a bytes object of length 1-63.  UseSTD3ASCIIRules is false.
    Raises UnicodeError if the label is empty, too long, already carries
    the ACE prefix, or contains invalid characters.
    """
    # Step 1: if the label already encodes as ASCII, skip nameprep and
    # punycode entirely and go straight to the step 8 length check.
    try:
        ascii_label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Step 3 (UseSTD3ASCIIRules is false) is a no-op; skip to step 8.
        if 0 < len(ascii_label) < 64:
            return ascii_label
        raise UnicodeError("label empty or too long")

    # Step 2: nameprep
    label = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII again now that the label has been prepared
    try:
        ascii_label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 8.
        if 0 < len(ascii_label) < 64:
            return ascii_label
        raise UnicodeError("label empty or too long")

    # Step 5: the label must not already start with the ACE prefix
    if label.startswith(sace_prefix):
        raise UnicodeError("Label starts with ACE prefix")

    # Step 6: encode with punycode
    # Step 7: prepend the ACE prefix
    ascii_label = ace_prefix + label.encode("punycode")

    # Step 8: check size
    if 0 < len(ascii_label) < 64:
        return ascii_label
    raise UnicodeError("label empty or too long")
|
2003-04-18 07:39:54 -03:00
|
|
|
|
|
|
|
def ToUnicode(label):
    """Convert a single (possibly ACE-encoded) label back to Unicode.

    Implements the RFC 3490 ToUnicode operation, including the final
    round-trip check through ToASCII.  *label* may be str or bytes.
    Raises UnicodeError on failure.
    """
    if len(label) > 1024:
        # Protection from https://github.com/python/cpython/issues/98433.
        # https://datatracker.ietf.org/doc/html/rfc5894#section-6
        # doesn't specify a label size limit prior to NAMEPREP. But having
        # one makes practical sense.
        # This leaves ample room for nameprep() to remove Nothing characters
        # per https://www.rfc-editor.org/rfc/rfc3454#section-3.1 while still
        # preventing us from wasting time decoding a big thing that'll just
        # hit the actual <= 63 length limit in Step 6.
        raise UnicodeError("label way too long")

    # Step 1: check for ASCII; bytes input is assumed to be ASCII already.
    if isinstance(label, bytes):
        pure_ascii = True
    else:
        try:
            label = label.encode("ascii")
            pure_ascii = True
        except UnicodeError:
            pure_ascii = False

    if not pure_ascii:
        # Step 2: perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")

    # Step 3: without the ACE prefix the label is returned unchanged.
    if not label.startswith(ace_prefix):
        return str(label, "ascii")

    # Step 4: remove the ACE prefix
    # Step 5: decode the remainder with punycode
    result = label[len(ace_prefix):].decode("punycode")

    # Step 6: re-encode the decoded text with ToASCII ...
    label2 = ToASCII(result)

    # Step 7: ... and require it to round-trip back to the input of
    # step 3 (label2 will already be in lower case).
    if str(label, "ascii").lower() != str(label2, "ascii"):
        raise UnicodeError("IDNA does not round-trip", label, label2)

    # Step 8: return the result of step 5
    return result
|
2003-04-24 13:02:54 -03:00
|
|
|
|
2003-04-18 07:39:54 -03:00
|
|
|
### Codec APIs
|
|
|
|
|
|
|
|
class Codec(codecs.Codec):
    """Stateless IDNA codec: encodes str domain names to ASCII bytes,
    decodes ACE-encoded byte strings back to str."""

    def encode(self, input, errors='strict'):
        # IDNA is quite clear that implementations must be strict
        if errors != 'strict':
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return b'', 0

        # Fast path: a name that is already pure ASCII needs no nameprep
        # or punycode, only the per-label length checks.
        try:
            ascii_input = input.encode('ascii')
        except UnicodeEncodeError:
            pass
        else:
            labels = ascii_input.split(b'.')
            for label in labels[:-1]:
                if not (0 < len(label) < 64):
                    raise UnicodeError("label empty or too long")
            if len(labels[-1]) >= 64:
                raise UnicodeError("label too long")
            return ascii_input, len(input)

        # Slow path: split on any IDNA dot character and run each label
        # through ToASCII, joining the results with U+002E.
        labels = dots.split(input)
        if labels and not labels[-1]:
            trailing_dot = b'.'
            del labels[-1]
        else:
            trailing_dot = b''
        encoded = bytearray()
        for label in labels:
            if encoded:
                # Join with U+002E
                encoded.extend(b'.')
            encoded.extend(ToASCII(label))
        return bytes(encoded + trailing_dot), len(input)

    def decode(self, input, errors='strict'):
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return "", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if not isinstance(input, bytes):
            # XXX obviously wrong, see #3232
            input = bytes(input)

        if ace_prefix not in input:
            # Fast path: no ACE-encoded labels anywhere, so a plain
            # ASCII name decodes to itself.
            try:
                return input.decode('ascii'), len(input)
            except UnicodeDecodeError:
                pass

        labels = input.split(b".")

        if labels and len(labels[-1]) == 0:
            trailing_dot = '.'
            del labels[-1]
        else:
            trailing_dot = ''

        decoded = [ToUnicode(label) for label in labels]

        return ".".join(decoded)+trailing_dot, len(input)
|
2003-04-18 07:39:54 -03:00
|
|
|
|
2006-04-21 07:40:58 -03:00
|
|
|
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    def _buffer_encode(self, input, errors, final):
        """Encode as much of *input* as possible; return (bytes, consumed).

        A possibly unfinished trailing label is held back until *final*
        is true, since more characters may still arrive for it.
        """
        # IDNA is quite clear that implementations must be strict
        if errors != 'strict':
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return (b'', 0)

        labels = dots.split(input)
        trailing_dot = b''
        if labels:
            if not labels[-1]:
                # Input ended on a dot: emit it and drop the empty label.
                trailing_dot = b'.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = b'.'

        output = bytearray()
        consumed = 0
        for label in labels:
            if consumed:
                # Join with U+002E
                output.extend(b'.')
                consumed += 1
            output.extend(ToASCII(label))
            consumed += len(label)

        output += trailing_dot
        consumed += len(trailing_dot)
        return (bytes(output), consumed)
|
2006-04-21 07:40:58 -03:00
|
|
|
|
|
|
|
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    def _buffer_decode(self, input, errors, final):
        """Decode as much of *input* as possible; return (str, consumed).

        A possibly unfinished trailing label is held back until *final*
        is true, since more bytes may still arrive for it.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return ("", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, str):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input, "ascii")
            labels = input.split(".")

        trailing_dot = ''
        if labels:
            if not labels[-1]:
                # Input ended on a dot: emit it and drop the empty label.
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = '.'

        decoded = []
        consumed = 0
        for label in labels:
            decoded.append(ToUnicode(label))
            if consumed:
                consumed += 1
            consumed += len(label)

        consumed += len(trailing_dot)
        return (".".join(decoded) + trailing_dot, consumed)
|
2006-04-21 06:43:23 -03:00
|
|
|
|
2003-04-18 07:39:54 -03:00
|
|
|
# Stream writer for the IDNA codec; encoding behavior comes from Codec.
class StreamWriter(Codec,codecs.StreamWriter):
    pass
|
|
|
|
|
|
|
|
# Stream reader for the IDNA codec; decoding behavior comes from Codec.
class StreamReader(Codec,codecs.StreamReader):
    pass
|
|
|
|
|
|
|
|
### encodings module API
|
|
|
|
|
|
|
|
def getregentry():
    """Return the CodecInfo record used to register the 'idna' codec."""
    return codecs.CodecInfo(
        name='idna',
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
        )
|