# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)

import codecs
import re
import stringprep
from unicodedata import ucd_3_2_0 as unicodedata

# IDNA section 3.1
dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5
ace_prefix = b"xn--"
sace_prefix = "xn--"


# This assumes query strings, so AllowUnassigned is true
def nameprep(label):  # type: (str) -> str
    """Apply the Nameprep profile to a single label and return the result.

    The label is mapped (table B.1 characters are dropped, the rest go
    through table B.2), normalized with NFKC, checked against the
    prohibited-character tables and finally against the bidi rules.

    Raises UnicodeEncodeError (encoding "idna") pointing at the offending
    character when a prohibited character or a bidi violation is found.
    """
    # Map
    newlabel = []
    for c in label:
        if stringprep.in_table_b1(c):
            # Map to nothing
            continue
        newlabel.append(stringprep.map_table_b2(c))
    label = "".join(newlabel)

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for i, c in enumerate(label):
        if (stringprep.in_table_c12(c) or
                stringprep.in_table_c22(c) or
                stringprep.in_table_c3(c) or
                stringprep.in_table_c4(c) or
                stringprep.in_table_c5(c) or
                stringprep.in_table_c6(c) or
                stringprep.in_table_c7(c) or
                stringprep.in_table_c8(c) or
                stringprep.in_table_c9(c)):
            raise UnicodeEncodeError("idna", label, i, i+1,
                                     f"Invalid character {c!r}")

    # Check bidi
    RandAL = [stringprep.in_table_d1(x) for x in label]
    if any(RandAL):
        # There is a RandAL char in the string. Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        # This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        # MUST NOT contain any LCat character.
        for i, x in enumerate(label):
            if stringprep.in_table_d2(x):
                raise UnicodeEncodeError("idna", label, i, i+1,
                                         "Violation of BIDI requirement 2")
        # 3) If a string contains any RandALCat character, a
        # RandALCat character MUST be the first character of the
        # string, and a RandALCat character MUST be the last
        # character of the string.
        if not RandAL[0]:
            raise UnicodeEncodeError("idna", label, 0, 1,
                                     "Violation of BIDI requirement 3")
        if not RandAL[-1]:
            raise UnicodeEncodeError("idna", label,
                                     len(label)-1, len(label),
                                     "Violation of BIDI requirement 3")

    return label


def ToASCII(label):  # type: (str) -> bytes
    """Convert one label to its ASCII (possibly ACE-prefixed) bytes form.

    Pure-ASCII labels are returned unchanged as bytes; other labels are
    nameprepped, punycode-encoded and prefixed with ``xn--``.

    Raises UnicodeEncodeError (encoding "idna") when the label is empty,
    when the resulting label is 64 bytes or longer, when nameprep rejects
    it, or when a non-ASCII label already starts with the ACE prefix.
    """
    try:
        # Step 1: try ASCII
        label_ascii = label.encode("ascii")
    except UnicodeEncodeError:
        pass
    else:
        # Skip to step 3: UseSTD3ASCIIRules is false, so
        # Skip to step 8.
        if 0 < len(label_ascii) < 64:
            return label_ascii
        if len(label) == 0:
            raise UnicodeEncodeError("idna", label, 0, 1, "label empty")
        else:
            raise UnicodeEncodeError("idna", label, 0, len(label),
                                     "label too long")

    # Step 2: nameprep
    label = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII
    try:
        label_ascii = label.encode("ascii")
    except UnicodeEncodeError:
        pass
    else:
        # Skip to step 8.
        if 0 < len(label) < 64:
            return label_ascii
        if len(label) == 0:
            raise UnicodeEncodeError("idna", label, 0, 1, "label empty")
        else:
            raise UnicodeEncodeError("idna", label, 0, len(label),
                                     "label too long")

    # Step 5: Check ACE prefix
    if label.lower().startswith(sace_prefix):
        raise UnicodeEncodeError(
            "idna", label, 0, len(sace_prefix),
            "Label starts with ACE prefix")

    # Step 6: Encode with PUNYCODE
    label_ascii = label.encode("punycode")

    # Step 7: Prepend ACE prefix
    label_ascii = ace_prefix + label_ascii

    # Step 8: Check size
    # do not check for empty as we prepend ace_prefix.
    if len(label_ascii) < 64:
        return label_ascii
    raise UnicodeEncodeError("idna", label, 0, len(label), "label too long")


def ToUnicode(label):
    """Convert one label (str or bytes) to its Unicode str form.

    Labels without the ACE prefix are returned as their ASCII text.
    ACE-prefixed labels are punycode-decoded and then verified by
    re-encoding with ToASCII and comparing against the input.

    Raises UnicodeDecodeError (encoding "idna") when the label is longer
    than 1024 items, when punycode decoding fails, or when the result does
    not round-trip; raises UnicodeEncodeError when a non-ASCII label is
    still non-ASCII after nameprep. Errors raised by nameprep() and
    ToASCII() propagate unchanged.
    """
    if len(label) > 1024:
        # Protection from https://github.com/python/cpython/issues/98433.
        # https://datatracker.ietf.org/doc/html/rfc5894#section-6
        # doesn't specify a label size limit prior to NAMEPREP. But having
        # one makes practical sense.
        # This leaves ample room for nameprep() to remove Nothing characters
        # per https://www.rfc-editor.org/rfc/rfc3454#section-3.1 while still
        # preventing us from wasting time decoding a big thing that'll just
        # hit the actual <= 63 length limit in Step 6.
        if isinstance(label, str):
            label = label.encode("utf-8", errors="backslashreplace")
        raise UnicodeDecodeError("idna", label, 0, len(label),
                                 "label way too long")

    # Step 1: Check for ASCII
    if isinstance(label, bytes):
        pure_ascii = True
    else:
        try:
            label = label.encode("ascii")
            pure_ascii = True
        except UnicodeEncodeError:
            pure_ascii = False

    if not pure_ascii:
        assert isinstance(label, str)
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeEncodeError as exc:
            raise UnicodeEncodeError("idna", label, exc.start, exc.end,
                                     "Invalid character in IDN label")

    # Step 3: Check for ACE prefix
    assert isinstance(label, bytes)
    if not label.lower().startswith(ace_prefix):
        return str(label, "ascii")

    # Step 4: Remove ACE prefix
    label1 = label[len(ace_prefix):]

    # Step 5: Decode using PUNYCODE
    try:
        result = label1.decode("punycode")
    except UnicodeDecodeError as exc:
        # Report positions relative to the full label, prefix included.
        offset = len(ace_prefix)
        raise UnicodeDecodeError("idna", label,
                                 offset+exc.start, offset+exc.end,
                                 exc.reason)

    # Step 6: Apply ToASCII
    label2 = ToASCII(result)

    # Step 7: Compare the result of step 6 with the one of step 3
    # label2 will already be in lower case.
    if str(label, "ascii").lower() != str(label2, "ascii"):
        raise UnicodeDecodeError(
            "idna", label, 0, len(label),
            f"IDNA does not round-trip, '{label!r}' != '{label2!r}'")

    # Step 8: return the result of step 5
    return result


### Codec APIs

class Codec(codecs.Codec):
    """Stateless IDNA encoder/decoder working on whole domain names."""

    def encode(self, input, errors='strict'):
        """Encode a domain name to bytes; return ``(bytes, len(input))``.

        Only ``errors='strict'`` is supported; anything else raises
        UnicodeError. Invalid labels raise UnicodeEncodeError with
        positions relative to the whole input.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError(f"Unsupported error handling: {errors}")

        if not input:
            return b'', 0

        try:
            result = input.encode('ascii')
        except UnicodeEncodeError:
            pass
        else:
            # ASCII name: fast path
            labels = result.split(b'.')
            # Only the last label may be empty (a trailing dot).
            for i, label in enumerate(labels[:-1]):
                if len(label) == 0:
                    offset = sum(len(x) for x in labels[:i]) + i
                    raise UnicodeEncodeError("idna", input,
                                             offset, offset+1,
                                             "label empty")
            for i, label in enumerate(labels):
                if len(label) >= 64:
                    offset = sum(len(x) for x in labels[:i]) + i
                    raise UnicodeEncodeError("idna", input,
                                             offset, offset+len(label),
                                             "label too long")
            return result, len(input)

        result = bytearray()
        labels = dots.split(input)
        if labels and not labels[-1]:
            trailing_dot = b'.'
            del labels[-1]
        else:
            trailing_dot = b''
        for i, label in enumerate(labels):
            if result:
                # Join with U+002E
                result.extend(b'.')
            try:
                result.extend(ToASCII(label))
            except (UnicodeEncodeError, UnicodeDecodeError) as exc:
                # Translate label-relative positions to input positions:
                # lengths of the preceding labels plus one per separator.
                offset = sum(len(x) for x in labels[:i]) + i
                raise UnicodeEncodeError(
                    "idna",
                    input,
                    offset + exc.start,
                    offset + exc.end,
                    exc.reason,
                )
        return bytes(result+trailing_dot), len(input)

    def decode(self, input, errors='strict'):
        """Decode an IDNA byte string; return ``(str, len(input))``.

        Only ``errors='strict'`` is supported; anything else raises
        UnicodeError. Invalid labels raise UnicodeDecodeError with
        positions relative to the whole input.
        """
        if errors != 'strict':
            raise UnicodeError(f"Unsupported error handling: {errors}")

        if not input:
            return "", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if not isinstance(input, bytes):
            # XXX obviously wrong, see #3232
            input = bytes(input)

        if ace_prefix not in input.lower():
            # Fast path
            try:
                return input.decode('ascii'), len(input)
            except UnicodeDecodeError:
                # Fall through so the per-label loop reports the error
                # with the "idna" encoding name.
                pass

        labels = input.split(b".")

        if labels and len(labels[-1]) == 0:
            trailing_dot = '.'
            del labels[-1]
        else:
            trailing_dot = ''

        result = []
        for i, label in enumerate(labels):
            try:
                u_label = ToUnicode(label)
            except (UnicodeEncodeError, UnicodeDecodeError) as exc:
                offset = sum(len(x) for x in labels[:i]) + len(labels[:i])
                raise UnicodeDecodeError(
                    "idna", input, offset+exc.start, offset+exc.end,
                    exc.reason)
            else:
                result.append(u_label)

        return ".".join(result)+trailing_dot, len(input)


class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder that emits only completed labels."""

    def _buffer_encode(self, input, errors, final):
        """Encode the complete labels in *input*.

        Returns ``(bytes, consumed)`` where *consumed* is the number of
        input characters that were encoded. Unless *final* is true, a
        last label not yet terminated by a dot is left unconsumed.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError(f"Unsupported error handling: {errors}")

        if not input:
            return (b'', 0)

        labels = dots.split(input)
        trailing_dot = b''
        if labels:
            if not labels[-1]:
                trailing_dot = b'.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = b'.'

        result = bytearray()
        size = 0
        for label in labels:
            if size:
                # Join with U+002E
                result.extend(b'.')
                size += 1
            try:
                result.extend(ToASCII(label))
            except (UnicodeEncodeError, UnicodeDecodeError) as exc:
                raise UnicodeEncodeError(
                    "idna",
                    input,
                    size + exc.start,
                    size + exc.end,
                    exc.reason,
                )
            size += len(label)

        result += trailing_dot
        size += len(trailing_dot)
        return (bytes(result), size)


class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder that emits only completed labels."""

    def _buffer_decode(self, input, errors, final):
        """Decode the complete labels in *input*.

        Returns ``(str, consumed)`` where *consumed* is the number of
        input items that were decoded. Unless *final* is true, a last
        label not yet terminated by a dot is left unconsumed.
        """
        if errors != 'strict':
            raise UnicodeError(f"Unsupported error handling: {errors}")

        if not input:
            return ("", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, str):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            try:
                input = str(input, "ascii")
            except (UnicodeEncodeError, UnicodeDecodeError) as exc:
                raise UnicodeDecodeError("idna", input,
                                         exc.start, exc.end, exc.reason)
            labels = input.split(".")

        trailing_dot = ''
        if labels:
            if not labels[-1]:
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = '.'

        result = []
        size = 0
        for i, label in enumerate(labels):
            if i:
                # Count the separator before decoding, keyed on the label
                # index rather than on ``size``: error positions then
                # include the dot, and an empty first label no longer
                # makes the consumed count come up one short.
                size += 1
            try:
                u_label = ToUnicode(label)
            except (UnicodeEncodeError, UnicodeDecodeError) as exc:
                raise UnicodeDecodeError(
                    "idna",
                    input.encode("ascii", errors="backslashreplace"),
                    size + exc.start,
                    size + exc.end,
                    exc.reason,
                )
            else:
                result.append(u_label)
            size += len(label)

        result = ".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)


class StreamWriter(Codec, codecs.StreamWriter):
    pass


class StreamReader(Codec, codecs.StreamReader):
    pass


### encodings module API

def getregentry():
    """Return the codecs.CodecInfo describing the 'idna' codec."""
    return codecs.CodecInfo(
        name='idna',
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )