cpython/Lib/encodings/idna.py

388 lines
13 KiB
Python

# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
import stringprep, re, codecs
from unicodedata import ucd_3_2_0 as unicodedata
# IDNA section 3.1
dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")
# IDNA section 5
ace_prefix = b"xn--"
sace_prefix = "xn--"
# This assumes query strings, so AllowUnassigned is true
def nameprep(label): # type: (str) -> str
# Map
newlabel = []
for c in label:
if stringprep.in_table_b1(c):
# Map to nothing
continue
newlabel.append(stringprep.map_table_b2(c))
label = "".join(newlabel)
# Normalize
label = unicodedata.normalize("NFKC", label)
# Prohibit
for i, c in enumerate(label):
if stringprep.in_table_c12(c) or \
stringprep.in_table_c22(c) or \
stringprep.in_table_c3(c) or \
stringprep.in_table_c4(c) or \
stringprep.in_table_c5(c) or \
stringprep.in_table_c6(c) or \
stringprep.in_table_c7(c) or \
stringprep.in_table_c8(c) or \
stringprep.in_table_c9(c):
raise UnicodeEncodeError("idna", label, i, i+1, f"Invalid character {c!r}")
# Check bidi
RandAL = [stringprep.in_table_d1(x) for x in label]
if any(RandAL):
# There is a RandAL char in the string. Must perform further
# tests:
# 1) The characters in section 5.8 MUST be prohibited.
# This is table C.8, which was already checked
# 2) If a string contains any RandALCat character, the string
# MUST NOT contain any LCat character.
for i, x in enumerate(label):
if stringprep.in_table_d2(x):
raise UnicodeEncodeError("idna", label, i, i+1,
"Violation of BIDI requirement 2")
# 3) If a string contains any RandALCat character, a
# RandALCat character MUST be the first character of the
# string, and a RandALCat character MUST be the last
# character of the string.
if not RandAL[0]:
raise UnicodeEncodeError("idna", label, 0, 1,
"Violation of BIDI requirement 3")
if not RandAL[-1]:
raise UnicodeEncodeError("idna", label, len(label)-1, len(label),
"Violation of BIDI requirement 3")
return label
def ToASCII(label): # type: (str) -> bytes
try:
# Step 1: try ASCII
label_ascii = label.encode("ascii")
except UnicodeEncodeError:
pass
else:
# Skip to step 3: UseSTD3ASCIIRules is false, so
# Skip to step 8.
if 0 < len(label_ascii) < 64:
return label_ascii
if len(label) == 0:
raise UnicodeEncodeError("idna", label, 0, 1, "label empty")
else:
raise UnicodeEncodeError("idna", label, 0, len(label), "label too long")
# Step 2: nameprep
label = nameprep(label)
# Step 3: UseSTD3ASCIIRules is false
# Step 4: try ASCII
try:
label_ascii = label.encode("ascii")
except UnicodeEncodeError:
pass
else:
# Skip to step 8.
if 0 < len(label) < 64:
return label_ascii
if len(label) == 0:
raise UnicodeEncodeError("idna", label, 0, 1, "label empty")
else:
raise UnicodeEncodeError("idna", label, 0, len(label), "label too long")
# Step 5: Check ACE prefix
if label.lower().startswith(sace_prefix):
raise UnicodeEncodeError(
"idna", label, 0, len(sace_prefix), "Label starts with ACE prefix")
# Step 6: Encode with PUNYCODE
label_ascii = label.encode("punycode")
# Step 7: Prepend ACE prefix
label_ascii = ace_prefix + label_ascii
# Step 8: Check size
# do not check for empty as we prepend ace_prefix.
if len(label_ascii) < 64:
return label_ascii
raise UnicodeEncodeError("idna", label, 0, len(label), "label too long")
def ToUnicode(label):
if len(label) > 1024:
# Protection from https://github.com/python/cpython/issues/98433.
# https://datatracker.ietf.org/doc/html/rfc5894#section-6
# doesn't specify a label size limit prior to NAMEPREP. But having
# one makes practical sense.
# This leaves ample room for nameprep() to remove Nothing characters
# per https://www.rfc-editor.org/rfc/rfc3454#section-3.1 while still
# preventing us from wasting time decoding a big thing that'll just
# hit the actual <= 63 length limit in Step 6.
if isinstance(label, str):
label = label.encode("utf-8", errors="backslashreplace")
raise UnicodeDecodeError("idna", label, 0, len(label), "label way too long")
# Step 1: Check for ASCII
if isinstance(label, bytes):
pure_ascii = True
else:
try:
label = label.encode("ascii")
pure_ascii = True
except UnicodeEncodeError:
pure_ascii = False
if not pure_ascii:
assert isinstance(label, str)
# Step 2: Perform nameprep
label = nameprep(label)
# It doesn't say this, but apparently, it should be ASCII now
try:
label = label.encode("ascii")
except UnicodeEncodeError as exc:
raise UnicodeEncodeError("idna", label, exc.start, exc.end,
"Invalid character in IDN label")
# Step 3: Check for ACE prefix
assert isinstance(label, bytes)
if not label.lower().startswith(ace_prefix):
return str(label, "ascii")
# Step 4: Remove ACE prefix
label1 = label[len(ace_prefix):]
# Step 5: Decode using PUNYCODE
try:
result = label1.decode("punycode")
except UnicodeDecodeError as exc:
offset = len(ace_prefix)
raise UnicodeDecodeError("idna", label, offset+exc.start, offset+exc.end, exc.reason)
# Step 6: Apply ToASCII
label2 = ToASCII(result)
# Step 7: Compare the result of step 6 with the one of step 3
# label2 will already be in lower case.
if str(label, "ascii").lower() != str(label2, "ascii"):
raise UnicodeDecodeError("idna", label, 0, len(label),
f"IDNA does not round-trip, '{label!r}' != '{label2!r}'")
# Step 8: return the result of step 5
return result
### Codec APIs
class Codec(codecs.Codec):
def encode(self, input, errors='strict'):
if errors != 'strict':
# IDNA is quite clear that implementations must be strict
raise UnicodeError(f"Unsupported error handling: {errors}")
if not input:
return b'', 0
try:
result = input.encode('ascii')
except UnicodeEncodeError:
pass
else:
# ASCII name: fast path
labels = result.split(b'.')
for i, label in enumerate(labels[:-1]):
if len(label) == 0:
offset = sum(len(l) for l in labels[:i]) + i
raise UnicodeEncodeError("idna", input, offset, offset+1,
"label empty")
for i, label in enumerate(labels):
if len(label) >= 64:
offset = sum(len(l) for l in labels[:i]) + i
raise UnicodeEncodeError("idna", input, offset, offset+len(label),
"label too long")
return result, len(input)
result = bytearray()
labels = dots.split(input)
if labels and not labels[-1]:
trailing_dot = b'.'
del labels[-1]
else:
trailing_dot = b''
for i, label in enumerate(labels):
if result:
# Join with U+002E
result.extend(b'.')
try:
result.extend(ToASCII(label))
except (UnicodeEncodeError, UnicodeDecodeError) as exc:
offset = sum(len(l) for l in labels[:i]) + i
raise UnicodeEncodeError(
"idna",
input,
offset + exc.start,
offset + exc.end,
exc.reason,
)
return bytes(result+trailing_dot), len(input)
def decode(self, input, errors='strict'):
if errors != 'strict':
raise UnicodeError(f"Unsupported error handling: {errors}")
if not input:
return "", 0
# IDNA allows decoding to operate on Unicode strings, too.
if not isinstance(input, bytes):
# XXX obviously wrong, see #3232
input = bytes(input)
if ace_prefix not in input.lower():
# Fast path
try:
return input.decode('ascii'), len(input)
except UnicodeDecodeError:
pass
labels = input.split(b".")
if labels and len(labels[-1]) == 0:
trailing_dot = '.'
del labels[-1]
else:
trailing_dot = ''
result = []
for i, label in enumerate(labels):
try:
u_label = ToUnicode(label)
except (UnicodeEncodeError, UnicodeDecodeError) as exc:
offset = sum(len(x) for x in labels[:i]) + len(labels[:i])
raise UnicodeDecodeError(
"idna", input, offset+exc.start, offset+exc.end, exc.reason)
else:
result.append(u_label)
return ".".join(result)+trailing_dot, len(input)
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
def _buffer_encode(self, input, errors, final):
if errors != 'strict':
# IDNA is quite clear that implementations must be strict
raise UnicodeError(f"Unsupported error handling: {errors}")
if not input:
return (b'', 0)
labels = dots.split(input)
trailing_dot = b''
if labels:
if not labels[-1]:
trailing_dot = b'.'
del labels[-1]
elif not final:
# Keep potentially unfinished label until the next call
del labels[-1]
if labels:
trailing_dot = b'.'
result = bytearray()
size = 0
for label in labels:
if size:
# Join with U+002E
result.extend(b'.')
size += 1
try:
result.extend(ToASCII(label))
except (UnicodeEncodeError, UnicodeDecodeError) as exc:
raise UnicodeEncodeError(
"idna",
input,
size + exc.start,
size + exc.end,
exc.reason,
)
size += len(label)
result += trailing_dot
size += len(trailing_dot)
return (bytes(result), size)
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
def _buffer_decode(self, input, errors, final):
if errors != 'strict':
raise UnicodeError("Unsupported error handling: {errors}")
if not input:
return ("", 0)
# IDNA allows decoding to operate on Unicode strings, too.
if isinstance(input, str):
labels = dots.split(input)
else:
# Must be ASCII string
try:
input = str(input, "ascii")
except (UnicodeEncodeError, UnicodeDecodeError) as exc:
raise UnicodeDecodeError("idna", input,
exc.start, exc.end, exc.reason)
labels = input.split(".")
trailing_dot = ''
if labels:
if not labels[-1]:
trailing_dot = '.'
del labels[-1]
elif not final:
# Keep potentially unfinished label until the next call
del labels[-1]
if labels:
trailing_dot = '.'
result = []
size = 0
for label in labels:
try:
u_label = ToUnicode(label)
except (UnicodeEncodeError, UnicodeDecodeError) as exc:
raise UnicodeDecodeError(
"idna",
input.encode("ascii", errors="backslashreplace"),
size + exc.start,
size + exc.end,
exc.reason,
)
else:
result.append(u_label)
if size:
size += 1
size += len(label)
result = ".".join(result) + trailing_dot
size += len(trailing_dot)
return (result, size)
class StreamWriter(Codec,codecs.StreamWriter):
pass
class StreamReader(Codec,codecs.StreamReader):
pass
### encodings module API
def getregentry():
return codecs.CodecInfo(
name='idna',
encode=Codec().encode,
decode=Codec().decode,
incrementalencoder=IncrementalEncoder,
incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter,
streamreader=StreamReader,
)