diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index b4737c806e1..b34c58bf85d 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -96,90 +96,6 @@ EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%') def quote_string(value): return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"' -# -# Accumulator for header folding -# - -class _Folded: - - def __init__(self, maxlen, policy): - self.maxlen = maxlen - self.policy = policy - self.lastlen = 0 - self.stickyspace = None - self.firstline = True - self.done = [] - self.current = [] - - def newline(self): - self.done.extend(self.current) - self.done.append(self.policy.linesep) - self.current.clear() - self.lastlen = 0 - - def finalize(self): - if self.current: - self.newline() - - def __str__(self): - return ''.join(self.done) - - def append(self, stoken): - self.current.append(stoken) - - def append_if_fits(self, token, stoken=None): - if stoken is None: - stoken = str(token) - l = len(stoken) - if self.stickyspace is not None: - stickyspace_len = len(self.stickyspace) - if self.lastlen + stickyspace_len + l <= self.maxlen: - self.current.append(self.stickyspace) - self.lastlen += stickyspace_len - self.current.append(stoken) - self.lastlen += l - self.stickyspace = None - self.firstline = False - return True - if token.has_fws: - ws = token.pop_leading_fws() - if ws is not None: - self.stickyspace += str(ws) - stickyspace_len += len(ws) - token._fold(self) - return True - if stickyspace_len and l + 1 <= self.maxlen: - margin = self.maxlen - l - if 0 < margin < stickyspace_len: - trim = stickyspace_len - margin - self.current.append(self.stickyspace[:trim]) - self.stickyspace = self.stickyspace[trim:] - stickyspace_len = trim - self.newline() - self.current.append(self.stickyspace) - self.current.append(stoken) - self.lastlen = l + stickyspace_len - self.stickyspace = None - self.firstline = False - return True - if not self.firstline: - 
self.newline() - self.current.append(self.stickyspace) - self.current.append(stoken) - self.stickyspace = None - self.firstline = False - return True - if self.lastlen + l <= self.maxlen: - self.current.append(stoken) - self.lastlen += l - return True - if l < self.maxlen: - self.newline() - self.current.append(stoken) - self.lastlen = l - return True - return False - # # TokenList and its subclasses # @@ -187,6 +103,8 @@ class _Folded: class TokenList(list): token_type = None + syntactic_break = True + ew_combine_allowed = True def __init__(self, *args, **kw): super().__init__(*args, **kw) @@ -207,84 +125,13 @@ class TokenList(list): def all_defects(self): return sum((x.all_defects for x in self), self.defects) - # - # Folding API - # - # parts(): - # - # return a list of objects that constitute the "higher level syntactic - # objects" specified by the RFC as the best places to fold a header line. - # The returned objects must include leading folding white space, even if - # this means mutating the underlying parse tree of the object. Each object - # is only responsible for returning *its* parts, and should not drill down - # to any lower level except as required to meet the leading folding white - # space constraint. - # - # _fold(folded): - # - # folded: the result accumulator. This is an instance of _Folded. - # (XXX: I haven't finished factoring this out yet, the folding code - # pretty much uses this as a state object.) When the folded.current - # contains as much text as will fit, the _fold method should call - # folded.newline. - # folded.lastlen: the current length of the test stored in folded.current. - # folded.maxlen: The maximum number of characters that may appear on a - # folded line. Differs from the policy setting in that "no limit" is - # represented by +inf, which means it can be used in the trivially - # logical fashion in comparisons. - # - # Currently no subclasses implement parts, and I think this will remain - # true. 
A subclass only needs to implement _fold when the generic version - # isn't sufficient. _fold will need to be implemented primarily when it is - # possible for encoded words to appear in the specialized token-list, since - # there is no generic algorithm that can know where exactly the encoded - # words are allowed. A _fold implementation is responsible for filling - # lines in the same general way that the top level _fold does. It may, and - # should, call the _fold method of sub-objects in a similar fashion to that - # of the top level _fold. - # - # XXX: I'm hoping it will be possible to factor the existing code further - # to reduce redundancy and make the logic clearer. - - @property - def parts(self): - klass = self.__class__ - this = [] - for token in self: - if token.startswith_fws(): - if this: - yield this[0] if len(this)==1 else klass(this) - this.clear() - end_ws = token.pop_trailing_ws() - this.append(token) - if end_ws: - yield klass(this) - this = [end_ws] - if this: - yield this[0] if len(this)==1 else klass(this) - def startswith_fws(self): return self[0].startswith_fws() - def pop_leading_fws(self): - if self[0].token_type == 'fws': - return self.pop(0) - return self[0].pop_leading_fws() - - def pop_trailing_ws(self): - if self[-1].token_type == 'cfws': - return self.pop(-1) - return self[-1].pop_trailing_ws() - @property - def has_fws(self): - for part in self: - if part.has_fws: - return True - return False - - def has_leading_comment(self): - return self[0].has_leading_comment() + def as_ew_allowed(self): + """True if all top level tokens of this part may be RFC2047 encoded.""" + return all(part.as_ew_allowed for part in self) @property def comments(self): @@ -294,69 +141,13 @@ class TokenList(list): return comments def fold(self, *, policy): - # max_line_length 0/None means no limit, ie: infinitely long. 
- maxlen = policy.max_line_length or float("+inf") - folded = _Folded(maxlen, policy) - self._fold(folded) - folded.finalize() - return str(folded) - - def as_encoded_word(self, charset): - # This works only for things returned by 'parts', which include - # the leading fws, if any, that should be used. - res = [] - ws = self.pop_leading_fws() - if ws: - res.append(ws) - trailer = self.pop(-1) if self[-1].token_type=='fws' else '' - res.append(_ew.encode(str(self), charset)) - res.append(trailer) - return ''.join(res) - - def cte_encode(self, charset, policy): - res = [] - for part in self: - res.append(part.cte_encode(charset, policy)) - return ''.join(res) - - def _fold(self, folded): - encoding = 'utf-8' if folded.policy.utf8 else 'ascii' - for part in self.parts: - tstr = str(part) - tlen = len(tstr) - try: - str(part).encode(encoding) - except UnicodeEncodeError: - if any(isinstance(x, errors.UndecodableBytesDefect) - for x in part.all_defects): - charset = 'unknown-8bit' - else: - # XXX: this should be a policy setting when utf8 is False. - charset = 'utf-8' - tstr = part.cte_encode(charset, folded.policy) - tlen = len(tstr) - if folded.append_if_fits(part, tstr): - continue - # Peel off the leading whitespace if any and make it sticky, to - # avoid infinite recursion. - ws = part.pop_leading_fws() - if ws is not None: - folded.stickyspace = str(ws) - if folded.append_if_fits(part): - continue - if part.has_fws: - part._fold(folded) - continue - # There are no fold points in this one; it is too long for a single - # line and can't be split...we just have to put it on its own line. 
- folded.append(tstr) - folded.newline() + return _refold_parse_tree(self, policy=policy) def pprint(self, indent=''): - print('\n'.join(self._pp(indent=''))) + print(self.ppstr(indent=indent)) def ppstr(self, indent=''): - return '\n'.join(self._pp(indent='')) + return '\n'.join(self._pp(indent=indent)) def _pp(self, indent=''): yield '{}{}/{}('.format( @@ -391,173 +182,11 @@ class UnstructuredTokenList(TokenList): token_type = 'unstructured' - def _fold(self, folded): - last_ew = None - encoding = 'utf-8' if folded.policy.utf8 else 'ascii' - for part in self.parts: - tstr = str(part) - is_ew = False - try: - str(part).encode(encoding) - except UnicodeEncodeError: - if any(isinstance(x, errors.UndecodableBytesDefect) - for x in part.all_defects): - charset = 'unknown-8bit' - else: - charset = 'utf-8' - if last_ew is not None: - # We've already done an EW, combine this one with it - # if there's room. - chunk = get_unstructured( - ''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset) - oldlastlen = sum(len(x) for x in folded.current[:last_ew]) - schunk = str(chunk) - lchunk = len(schunk) - if oldlastlen + lchunk <= folded.maxlen: - del folded.current[last_ew:] - folded.append(schunk) - folded.lastlen = oldlastlen + lchunk - continue - tstr = part.as_encoded_word(charset) - is_ew = True - if folded.append_if_fits(part, tstr): - if is_ew: - last_ew = len(folded.current) - 1 - continue - if is_ew or last_ew: - # It's too big to fit on the line, but since we've - # got encoded words we can use encoded word folding. - part._fold_as_ew(folded) - continue - # Peel off the leading whitespace if any and make it sticky, to - # avoid infinite recursion. - ws = part.pop_leading_fws() - if ws is not None: - folded.stickyspace = str(ws) - if folded.append_if_fits(part): - continue - if part.has_fws: - part._fold(folded) - continue - # It can't be split...we just have to put it on its own line. 
- folded.append(tstr) - folded.newline() - last_ew = None - - def cte_encode(self, charset, policy): - res = [] - last_ew = None - for part in self: - spart = str(part) - try: - spart.encode('us-ascii') - res.append(spart) - except UnicodeEncodeError: - if last_ew is None: - res.append(part.cte_encode(charset, policy)) - last_ew = len(res) - else: - tl = get_unstructured(''.join(res[last_ew:] + [spart])) - res.append(tl.as_encoded_word(charset)) - return ''.join(res) - class Phrase(TokenList): token_type = 'phrase' - def _fold(self, folded): - # As with Unstructured, we can have pure ASCII with or without - # surrogateescape encoded bytes, or we could have unicode. But this - # case is more complicated, since we have to deal with the various - # sub-token types and how they can be composed in the face of - # unicode-that-needs-CTE-encoding, and the fact that if a token a - # comment that becomes a barrier across which we can't compose encoded - # words. - last_ew = None - encoding = 'utf-8' if folded.policy.utf8 else 'ascii' - for part in self.parts: - tstr = str(part) - tlen = len(tstr) - has_ew = False - try: - str(part).encode(encoding) - except UnicodeEncodeError: - if any(isinstance(x, errors.UndecodableBytesDefect) - for x in part.all_defects): - charset = 'unknown-8bit' - else: - charset = 'utf-8' - if last_ew is not None and not part.has_leading_comment(): - # We've already done an EW, let's see if we can combine - # this one with it. The last_ew logic ensures that all we - # have at this point is atoms, no comments or quoted - # strings. So we can treat the text between the last - # encoded word and the content of this token as - # unstructured text, and things will work correctly. But - # we have to strip off any trailing comment on this token - # first, and if it is a quoted string we have to pull out - # the content (we're encoding it, so it no longer needs to - # be quoted). 
- if part[-1].token_type == 'cfws' and part.comments: - remainder = part.pop(-1) - else: - remainder = '' - for i, token in enumerate(part): - if token.token_type == 'bare-quoted-string': - part[i] = UnstructuredTokenList(token[:]) - chunk = get_unstructured( - ''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset) - schunk = str(chunk) - lchunk = len(schunk) - if last_ew + lchunk <= folded.maxlen: - del folded.current[last_ew:] - folded.append(schunk) - folded.lastlen = sum(len(x) for x in folded.current) - continue - tstr = part.as_encoded_word(charset) - tlen = len(tstr) - has_ew = True - if folded.append_if_fits(part, tstr): - if has_ew and not part.comments: - last_ew = len(folded.current) - 1 - elif part.comments or part.token_type == 'quoted-string': - # If a comment is involved we can't combine EWs. And if a - # quoted string is involved, it's not worth the effort to - # try to combine them. - last_ew = None - continue - part._fold(folded) - - def cte_encode(self, charset, policy): - res = [] - last_ew = None - is_ew = False - for part in self: - spart = str(part) - try: - spart.encode('us-ascii') - res.append(spart) - except UnicodeEncodeError: - is_ew = True - if last_ew is None: - if not part.comments: - last_ew = len(res) - res.append(part.cte_encode(charset, policy)) - elif not part.has_leading_comment(): - if part[-1].token_type == 'cfws' and part.comments: - remainder = part.pop(-1) - else: - remainder = '' - for i, token in enumerate(part): - if token.token_type == 'bare-quoted-string': - part[i] = UnstructuredTokenList(token[:]) - tl = get_unstructured(''.join(res[last_ew:] + [spart])) - res[last_ew:] = [tl.as_encoded_word(charset)] - if part.comments or (not is_ew and part.token_type == 'quoted-string'): - last_ew = None - return ''.join(res) - class Word(TokenList): token_type = 'word' @@ -567,9 +196,6 @@ class CFWSList(WhiteSpaceTokenList): token_type = 'cfws' - def has_leading_comment(self): - return bool(self.comments) - class 
Atom(TokenList): @@ -579,6 +205,7 @@ class Atom(TokenList): class Token(TokenList): token_type = 'token' + encode_as_ew = False class EncodedWord(TokenList): @@ -588,13 +215,6 @@ class EncodedWord(TokenList): charset = None lang = None - @property - def encoded(self): - if self.cte is not None: - return self.cte - _ew.encode(str(self), self.charset) - - class QuotedString(TokenList): @@ -865,6 +485,7 @@ class InvalidMailbox(TokenList): class Domain(TokenList): token_type = 'domain' + as_ew_allowed = False @property def domain(self): @@ -879,11 +500,13 @@ class DotAtom(TokenList): class DotAtomText(TokenList): token_type = 'dot-atom-text' + as_ew_allowed = True class AddrSpec(TokenList): token_type = 'addr-spec' + as_ew_allowed = False @property def local_part(self): @@ -916,11 +539,13 @@ class AddrSpec(TokenList): class ObsLocalPart(TokenList): token_type = 'obs-local-part' + as_ew_allowed = False class DisplayName(Phrase): token_type = 'display-name' + ew_combine_allowed = False @property def display_name(self): @@ -960,6 +585,7 @@ class DisplayName(Phrase): class LocalPart(TokenList): token_type = 'local-part' + as_ew_allowed = False @property def value(self): @@ -995,6 +621,7 @@ class LocalPart(TokenList): class DomainLiteral(TokenList): token_type = 'domain-literal' + as_ew_allowed = False @property def domain(self): @@ -1081,6 +708,7 @@ class Value(TokenList): class MimeParameters(TokenList): token_type = 'mime-parameters' + syntactic_break = False @property def params(self): @@ -1165,6 +793,10 @@ class MimeParameters(TokenList): class ParameterizedHeaderValue(TokenList): + # Set this false so that the value doesn't wind up on a new line even + # if it and the parameters would fit there but not on the first line. 
+ syntactic_break = False + @property def params(self): for token in reversed(self): @@ -1172,18 +804,11 @@ class ParameterizedHeaderValue(TokenList): return token.params return {} - @property - def parts(self): - if self and self[-1].token_type == 'mime-parameters': - # We don't want to start a new line if all of the params don't fit - # after the value, so unwrap the parameter list. - return TokenList(self[:-1] + self[-1]) - return TokenList(self).parts - class ContentType(ParameterizedHeaderValue): token_type = 'content-type' + as_ew_allowed = False maintype = 'text' subtype = 'plain' @@ -1191,40 +816,27 @@ class ContentType(ParameterizedHeaderValue): class ContentDisposition(ParameterizedHeaderValue): token_type = 'content-disposition' + as_ew_allowed = False content_disposition = None class ContentTransferEncoding(TokenList): token_type = 'content-transfer-encoding' + as_ew_allowed = False cte = '7bit' class HeaderLabel(TokenList): token_type = 'header-label' + as_ew_allowed = False class Header(TokenList): token_type = 'header' - def _fold(self, folded): - folded.append(str(self.pop(0))) - folded.lastlen = len(folded.current[0]) - # The first line of the header is different from all others: we don't - # want to start a new object on a new line if it has any fold points in - # it that would allow part of it to be on the first header line. - # Further, if the first fold point would fit on the new line, we want - # to do that, but if it doesn't we want to put it on the first line. - # Folded supports this via the stickyspace attribute. If this - # attribute is not None, it does the special handling. 
- folded.stickyspace = str(self.pop(0)) if self[0].token_type == 'cfws' else '' - rest = self.pop(0) - if self: - raise ValueError("Malformed Header token list") - rest._fold(folded) - # # Terminal classes and instances @@ -1232,6 +844,10 @@ class Header(TokenList): class Terminal(str): + as_ew_allowed = True + ew_combine_allowed = True + syntactic_break = True + def __new__(cls, value, token_type): self = super().__new__(cls, value) self.token_type = token_type @@ -1241,6 +857,9 @@ class Terminal(str): def __repr__(self): return "{}({})".format(self.__class__.__name__, super().__repr__()) + def pprint(self): + print(self.__class__.__name__ + '/' + self.token_type) + @property def all_defects(self): return list(self.defects) @@ -1254,29 +873,14 @@ class Terminal(str): '' if not self.defects else ' {}'.format(self.defects), )] - def cte_encode(self, charset, policy): - value = str(self) - try: - value.encode('us-ascii') - return value - except UnicodeEncodeError: - return _ew.encode(value, charset) - def pop_trailing_ws(self): # This terminates the recursion. return None - def pop_leading_fws(self): - # This terminates the recursion. 
- return None - @property def comments(self): return [] - def has_leading_comment(self): - return False - def __getnewargs__(self): return(str(self), self.token_type) @@ -1290,8 +894,6 @@ class WhiteSpaceTerminal(Terminal): def startswith_fws(self): return True - has_fws = True - class ValueTerminal(Terminal): @@ -1302,11 +904,6 @@ class ValueTerminal(Terminal): def startswith_fws(self): return False - has_fws = False - - def as_encoded_word(self, charset): - return _ew.encode(str(self), charset) - class EWWhiteSpaceTerminal(WhiteSpaceTerminal): @@ -1314,15 +911,9 @@ class EWWhiteSpaceTerminal(WhiteSpaceTerminal): def value(self): return '' - @property - def encoded(self): - return self[:] - def __str__(self): return '' - has_fws = True - # XXX these need to become classes and used as instances so # that a program can't change them in a parse tree and screw @@ -2751,7 +2342,7 @@ def get_parameter(value): if value[0] != "'": raise errors.HeaderParseError("Expected RFC2231 char/lang encoding " "delimiter, but found {!r}".format(value)) - appendto.append(ValueTerminal("'", 'RFC2231 delimiter')) + appendto.append(ValueTerminal("'", 'RFC2231-delimiter')) value = value[1:] if value and value[0] != "'": token, value = get_attrtext(value) @@ -2760,7 +2351,7 @@ def get_parameter(value): if not value or value[0] != "'": raise errors.HeaderParseError("Expected RFC2231 char/lang encoding " "delimiter, but found {}".format(value)) - appendto.append(ValueTerminal("'", 'RFC2231 delimiter')) + appendto.append(ValueTerminal("'", 'RFC2231-delimiter')) value = value[1:] if remainder is not None: # Treat the rest of value as bare quoted string content. @@ -2965,3 +2556,255 @@ def parse_content_transfer_encoding_header(value): token, value = get_phrase(value) cte_header.append(token) return cte_header + + +# +# Header folding +# +# Header folding is complex, with lots of rules and corner cases. 
The +# following code does its best to obey the rules and handle the corner +# cases, but you can be sure there are a few bugs :) +# +# This folder generally canonicalizes as it goes, preferring the stringified +# version of each token. The tokens contain information that supports the +# folder, including which tokens can be encoded in which ways. +# +# Folded text is accumulated in a simple list of strings ('lines'), each +# one of which should be no longer than policy.max_line_length ('maxlen'). +# + +def _steal_trailing_WSP_if_exists(lines): + wsp = '' + if lines and lines[-1] and lines[-1][-1] in WSP: + wsp = lines[-1][-1] + lines[-1] = lines[-1][:-1] + return wsp + +def _refold_parse_tree(parse_tree, *, policy): + """Return string of contents of parse_tree folded according to RFC rules. + + """ + # max_line_length 0/None means no limit, ie: infinitely long. + maxlen = policy.max_line_length or float("+inf") + encoding = 'utf-8' if policy.utf8 else 'us-ascii' + lines = [''] + last_ew = None + wrap_as_ew_blocked = 0 + want_encoding = False + end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked') + parts = list(parse_tree) + while parts: + part = parts.pop(0) + if part is end_ew_not_allowed: + wrap_as_ew_blocked -= 1 + continue + tstr = str(part) + try: + tstr.encode(encoding) + charset = encoding + except UnicodeEncodeError: + if any(isinstance(x, errors.UndecodableBytesDefect) + for x in part.all_defects): + charset = 'unknown-8bit' + else: + # If policy.utf8 is false this should really be taken from a + # 'charset' property on the policy. + charset = 'utf-8' + want_encoding = True + if part.token_type == 'mime-parameters': + # Mime parameter folding (using RFC2231) is extra special.
+ _fold_mime_parameters(part, lines, maxlen, encoding) + continue + if want_encoding and not wrap_as_ew_blocked: + if not part.as_ew_allowed: + want_encoding = False + last_ew = None + if part.syntactic_break: + encoded_part = part.fold(policy=policy)[:-1] # strip nl + if policy.linesep not in encoded_part: + # It fits on a single line + if len(encoded_part) > maxlen - len(lines[-1]): + # But not on this one, so start a new one. + newline = _steal_trailing_WSP_if_exists(lines) + # XXX what if encoded_part has no leading FWS? + lines.append(newline) + lines[-1] += encoded_part + continue + # Either this is not a major syntactic break, so we don't + # want it on a line by itself even if it fits, or it + # doesn't fit on a line by itself. Either way, fall through + # to unpacking the subparts and wrapping them. + if not hasattr(part, 'encode'): + # It's not a Terminal, do each piece individually. + parts = list(part) + parts + else: + # It's a terminal, wrap it as an encoded word, possibly + # combining it with previously encoded words if allowed. + last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew, + part.ew_combine_allowed, charset) + want_encoding = False + continue + if len(tstr) <= maxlen - len(lines[-1]): + lines[-1] += tstr + continue + # This part is too long to fit. The RFC wants us to break at + # "major syntactic breaks", so unless we don't consider this + # to be one, check if it will fit on the next line by itself. + if (part.syntactic_break and + len(tstr) + 1 <= maxlen): + newline = _steal_trailing_WSP_if_exists(lines) + if newline or part.startswith_fws(): + lines.append(newline + tstr) + continue + if not hasattr(part, 'encode'): + # It's not a terminal, try folding the subparts. 
+ newparts = list(part) + if not part.as_ew_allowed: + wrap_as_ew_blocked += 1 + newparts.append(end_ew_not_allowed) + parts = newparts + parts + continue + if part.as_ew_allowed and not wrap_as_ew_blocked: + # It doesn't need CTE encoding, but encode it anyway so we can + # wrap it. + parts.insert(0, part) + want_encoding = True + continue + # We can't figure out how to wrap it, so give up. + newline = _steal_trailing_WSP_if_exists(lines) + if newline or part.startswith_fws(): + lines.append(newline + tstr) + else: + # We can't fold it onto the next line either... + lines[-1] += tstr + return policy.linesep.join(lines) + policy.linesep + +def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset): + """Fold string to_encode into lines as encoded word, combining if allowed. + Return the new value for last_ew, or None if ew_combine_allowed is False. + + If there is already an encoded word in the last line of lines (indicated by + a non-None value for last_ew) and ew_combine_allowed is true, decode the + existing ew, combine it with to_encode, and re-encode. Otherwise, encode + to_encode. In either case, split to_encode as necessary so that the + encoded segments fit within maxlen. + + """ + if last_ew is not None and ew_combine_allowed: + to_encode = str( + get_unstructured(lines[-1][last_ew:] + to_encode)) + lines[-1] = lines[-1][:last_ew] + if to_encode[0] in WSP: + # We're joining this to non-encoded text, so don't encode + # the leading blank. + leading_wsp = to_encode[0] + to_encode = to_encode[1:] + if (len(lines[-1]) == maxlen): + lines.append(_steal_trailing_WSP_if_exists(lines)) + lines[-1] += leading_wsp + trailing_wsp = '' + if to_encode[-1] in WSP: + # Likewise for the trailing space.
+ trailing_wsp = to_encode[-1] + to_encode = to_encode[:-1] + new_last_ew = len(lines[-1]) if last_ew is None else last_ew + while to_encode: + remaining_space = maxlen - len(lines[-1]) + # The RFC2047 chrome takes up 7 characters plus the length + # of the charset name. + encode_as = 'utf-8' if charset == 'us-ascii' else charset + text_space = remaining_space - len(encode_as) - 7 + if text_space <= 0: + lines.append(' ') + # XXX We'll get an infinite loop here if maxlen <= len(encode_as) + 8 + continue + first_part = to_encode[:text_space] + ew = _ew.encode(first_part, charset=encode_as) + excess = len(ew) - remaining_space + if excess > 0: + # encode always chooses the shortest encoding, so this + # is guaranteed to fit at this point. + first_part = first_part[:-excess] + ew = _ew.encode(first_part) + lines[-1] += ew + to_encode = to_encode[len(first_part):] + if to_encode: + lines.append(' ') + new_last_ew = len(lines[-1]) + lines[-1] += trailing_wsp + return new_last_ew if ew_combine_allowed else None + +def _fold_mime_parameters(part, lines, maxlen, encoding): + """Fold TokenList 'part' into the 'lines' list as mime parameters. + + Using the decoded list of parameters and values, format them according to + the RFC rules, including using RFC2231 encoding if the value cannot be + expressed in 'encoding' and/or the parameter+value is too long to fit within + 'maxlen'. + + """ + # Special case for RFC2231 encoding: start from decoded values and use + # RFC2231 encoding iff needed. + # + # Note that the 1s and 2s being added to the length calculations are + # accounting for the possibly-needed spaces and semicolons we'll be adding. + # + for name, value in part.params: + # XXX What if this ';' puts us over maxlen the first time through the + # loop? We should split the header value onto a newline in that case, + # but to do that we need to recognize the need earlier or reparse the + # header, so I'm going to ignore that bug for now. It'll only put us + # one character over.
+ if not lines[-1].rstrip().endswith(';'): + lines[-1] += ';' + charset = encoding + error_handler = 'strict' + try: + value.encode(encoding) + encoding_required = False + except UnicodeEncodeError: + encoding_required = True + if utils._has_surrogates(value): + charset = 'unknown-8bit' + error_handler = 'surrogateescape' + else: + charset = 'utf-8' + if encoding_required: + encoded_value = urllib.parse.quote( + value, safe='', errors=error_handler) + tstr = "{}*={}''{}".format(name, charset, encoded_value) + else: + tstr = '{}={}'.format(name, quote_string(value)) + if len(lines[-1]) + len(tstr) + 1 < maxlen: + lines[-1] = lines[-1] + ' ' + tstr + continue + elif len(tstr) + 2 <= maxlen: + lines.append(' ' + tstr) + continue + # We need multiple sections. We are allowed to mix encoded and + # non-encoded sections, but we aren't going to. We'll encode them all. + section = 0 + extra_chrome = charset + "''" + while value: + chrome_len = len(name) + len(str(section)) + 3 + len(extra_chrome) + if maxlen <= chrome_len + 3: + # We need room for the leading blank, the trailing semicolon, + # and at least one character of the value. If we don't + # have that, we'd be stuck, so in that case fall back to + # the RFC standard width. + maxlen = 78 + splitpoint = maxchars = maxlen - chrome_len - 2 + while True: + partial = value[:splitpoint] + encoded_value = urllib.parse.quote( + partial, safe='', errors=error_handler) + if len(encoded_value) <= maxchars: + break + splitpoint -= 1 + lines.append(" {}*{}*={}{}".format( + name, section, extra_chrome, encoded_value)) + extra_chrome = '' + section += 1 + value = value[splitpoint:] + if value: + lines[-1] += ';' diff --git a/Lib/email/headerregistry.py b/Lib/email/headerregistry.py index 81fee146dcc..00652049f2f 100644 --- a/Lib/email/headerregistry.py +++ b/Lib/email/headerregistry.py @@ -245,13 +245,16 @@ class BaseHeader(str): the header name and the ': ' separator.
""" - # At some point we need to only put fws here if it was in the source. + # At some point we need to put fws here iff it was in the source. header = parser.Header([ parser.HeaderLabel([ parser.ValueTerminal(self.name, 'header-name'), parser.ValueTerminal(':', 'header-sep')]), - parser.CFWSList([parser.WhiteSpaceTerminal(' ', 'fws')]), - self._parse_tree]) + ]) + if self._parse_tree: + header.append( + parser.CFWSList([parser.WhiteSpaceTerminal(' ', 'fws')])) + header.append(self._parse_tree) return header.fold(policy=policy) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index e0ec87d2080..1667617b9e4 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -14,18 +14,7 @@ class TestTokens(TestEmailBase): self.assertEqual(x, ' \t') self.assertEqual(str(x), '') self.assertEqual(x.value, '') - self.assertEqual(x.encoded, ' \t') - - # UnstructuredTokenList - - def test_undecodable_bytes_error_preserved(self): - badstr = b"le pouf c\xaflebre".decode('ascii', 'surrogateescape') - unst = parser.get_unstructured(badstr) - self.assertDefectsEqual(unst.all_defects, [errors.UndecodableBytesDefect]) - parts = list(unst.parts) - self.assertDefectsEqual(parts[0].all_defects, []) - self.assertDefectsEqual(parts[1].all_defects, []) - self.assertDefectsEqual(parts[2].all_defects, [errors.UndecodableBytesDefect]) + self.assertEqual(x.token_type, 'fws') class TestParserMixin: @@ -139,7 +128,6 @@ class TestParser(TestParserMixin, TestEmailBase): 'first second', [], '') - self.assertEqual(ew.encoded, '=?us-ascii*jive?q?first_second?=') self.assertEqual(ew.charset, 'us-ascii') self.assertEqual(ew.lang, 'jive') @@ -150,7 +138,6 @@ class TestParser(TestParserMixin, TestEmailBase): 'first second', [], '') - self.assertEqual(ew.encoded, '=?us-ascii?q?first_second?=') self.assertEqual(ew.charset, 'us-ascii') self.assertEqual(ew.lang, '') @@ -2700,28 +2687,37
@@ class TestFolding(TestEmailBase): # and with unicode tokens in the comments. Spaces inside the quotes # currently don't do the right thing. - def test_initial_whitespace_splitting(self): + def test_split_at_whitespace_after_header_before_long_token(self): body = parser.get_unstructured(' ' + 'x'*77) header = parser.Header([ parser.HeaderLabel([parser.ValueTerminal('test:', 'atext')]), parser.CFWSList([parser.WhiteSpaceTerminal(' ', 'fws')]), body]) self._test(header, 'test: \n ' + 'x'*77 + '\n') - def test_whitespace_splitting(self): + def test_split_at_whitespace_before_long_token(self): self._test(parser.get_unstructured('xxx ' + 'y'*77), 'xxx \n ' + 'y'*77 + '\n') + def test_overlong_encodeable_is_wrapped(self): + first_token_with_whitespace = 'xxx ' + chrome_leader = '=?utf-8?q?' + len_chrome = len(chrome_leader) + 2 + len_non_y = len_chrome + len(first_token_with_whitespace) + self._test(parser.get_unstructured(first_token_with_whitespace + + 'y'*80), + first_token_with_whitespace + chrome_leader + + 'y'*(78-len_non_y) + '?=\n' + + ' ' + chrome_leader + 'y'*(80-(78-len_non_y)) + '?=\n') + def test_long_filename_attachment(self): - folded = self.policy.fold('Content-Disposition', 'attachment; filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TES.txt"') - self.assertEqual( - 'Content-Disposition: attachment;\n filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TES.txt"\n', - folded - ) - folded = self.policy.fold('Content-Disposition', 'attachment; filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_T.txt"') - self.assertEqual( - 'Content-Disposition: attachment;\n filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_T.txt"\n', - folded - ) + self._test(parser.parse_content_disposition_header( + 'attachment; filename="TEST_TEST_TEST_TEST' + '_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TES.txt"'), + "attachment;\n" + " filename*0*=us-ascii''TEST_TEST_TEST_TEST_TEST_TEST" 
+ "_TEST_TEST_TEST_TEST_TEST;\n" + " filename*1*=_TEST_TES.txt\n", + ) if __name__ == '__main__': unittest.main() diff --git a/Lib/test/test_email/test_generator.py b/Lib/test/test_email/test_generator.py index c4f182903af..c1aeaefab77 100644 --- a/Lib/test/test_email/test_generator.py +++ b/Lib/test/test_email/test_generator.py @@ -27,7 +27,6 @@ class TestGeneratorBase: None """), - # From is wrapped because wrapped it fits in 40. 40: textwrap.dedent("""\ To: whom_it_may_concern@example.com From: @@ -40,11 +39,11 @@ class TestGeneratorBase: None """), - # Neither to nor from fit even if put on a new line, - # so we leave them sticking out on the first line. 20: textwrap.dedent("""\ - To: whom_it_may_concern@example.com - From: nobody_you_want_to_know@example.com + To: + whom_it_may_concern@example.com + From: + nobody_you_want_to_know@example.com Subject: We the willing led by the unknowing are doing @@ -169,6 +168,53 @@ class TestGeneratorBase: g.flatten(msg) self.assertEqual(s.getvalue(), self.typ(self.refold_long_expected[0])) + def test_rfc2231_wrapping(self): + # This is pretty much just to make sure we don't have an infinite + # loop; I don't expect anyone to hit this in the field. + msg = self.msgmaker(self.typ(textwrap.dedent("""\ + To: nobody + Content-Disposition: attachment; + filename="afilenamelongenoghtowraphere" + + None + """))) + expected = textwrap.dedent("""\ + To: nobody + Content-Disposition: attachment; + filename*0*=us-ascii''afilename; + filename*1*=longenoghtowraphere + + None + """) + s = self.ioclass() + g = self.genclass(s, policy=self.policy.clone(max_line_length=33)) + g.flatten(msg) + self.assertEqual(s.getvalue(), self.typ(expected)) + + def test_rfc2231_wrapping_switches_to_default_len_if_too_narrow(self): + # This is just to make sure we don't have an infinite loop; I don't + # expect anyone to hit this in the field, so I'm not bothering to make + # the result optimal (the encoding isn't needed). 
+ msg = self.msgmaker(self.typ(textwrap.dedent("""\ + To: nobody + Content-Disposition: attachment; + filename="afilenamelongenoghtowraphere" + + None + """))) + expected = textwrap.dedent("""\ + To: nobody + Content-Disposition: + attachment; + filename*0*=us-ascii''afilenamelongenoghtowraphere + + None + """) + s = self.ioclass() + g = self.genclass(s, policy=self.policy.clone(max_line_length=20)) + g.flatten(msg) + self.assertEqual(s.getvalue(), self.typ(expected)) + class TestGenerator(TestGeneratorBase, TestEmailBase): diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py index af836dc9726..30ce0ba54e4 100644 --- a/Lib/test/test_email/test_headerregistry.py +++ b/Lib/test/test_email/test_headerregistry.py @@ -229,14 +229,14 @@ class TestContentTypeHeader(TestHeaderBase): defects = args[1] if l>1 else [] decoded = args[2] if l>2 and args[2] is not DITTO else source header = 'Content-Type:' + ' ' if source else '' - folded = args[3] if l>3 else header + source + '\n' + folded = args[3] if l>3 else header + decoded + '\n' h = self.make_header('Content-Type', source) self.assertEqual(h.content_type, content_type) self.assertEqual(h.maintype, maintype) self.assertEqual(h.subtype, subtype) self.assertEqual(h.params, parmdict) with self.assertRaises(TypeError): - h.params['abc'] = 'xyz' # params is read-only. + h.params['abc'] = 'xyz' # make sure params is read-only. self.assertDefectsEqual(h.defects, defects) self.assertEqual(h, decoded) self.assertEqual(h.fold(policy=policy.default), folded) @@ -373,9 +373,10 @@ class TestContentTypeHeader(TestHeaderBase): 'text/plain; Charset="utf-8"'), # Since this is pretty much the ur-mimeheader, we'll put all the tests - # that exercise the parameter parsing and formatting here. - # - # XXX: question: is minimal quoting preferred? + # that exercise the parameter parsing and formatting here. 
Note that + # when we refold we may canonicalize, so things like whitespace, + # quoting, and rfc2231 encoding may change from what was in the input + # header. 'unquoted_param_value': ( 'text/plain; title=foo', @@ -384,7 +385,8 @@ class TestContentTypeHeader(TestHeaderBase): 'plain', {'title': 'foo'}, [], - 'text/plain; title="foo"'), + 'text/plain; title="foo"', + ), 'param_value_with_tspecials': ( 'text/plain; title="(bar)foo blue"', @@ -415,7 +417,8 @@ class TestContentTypeHeader(TestHeaderBase): 'mixed', {'boundary': 'CPIMSSMTPC06p5f3tG'}, [], - 'Multipart/mixed; boundary="CPIMSSMTPC06p5f3tG"'), + 'Multipart/mixed; boundary="CPIMSSMTPC06p5f3tG"', + ), 'spaces_around_semis': ( ('image/jpeg; name="wibble.JPG" ; x-mac-type="4A504547" ; ' @@ -429,14 +432,31 @@ class TestContentTypeHeader(TestHeaderBase): [], ('image/jpeg; name="wibble.JPG"; x-mac-type="4A504547"; ' 'x-mac-creator="474B4F4E"'), - # XXX: it could be that we will eventually prefer to fold starting - # from the decoded value, in which case these spaces and similar - # spaces in other tests will be wrong. - ('Content-Type: image/jpeg; name="wibble.JPG" ; ' - 'x-mac-type="4A504547" ;\n' + ('Content-Type: image/jpeg; name="wibble.JPG";' + ' x-mac-type="4A504547";\n' ' x-mac-creator="474B4F4E"\n'), ), + 'lots_of_mime_params': ( + ('image/jpeg; name="wibble.JPG"; x-mac-type="4A504547"; ' + 'x-mac-creator="474B4F4E"; x-extrastuff="make it longer"'), + 'image/jpeg', + 'image', + 'jpeg', + {'name': 'wibble.JPG', + 'x-mac-type': '4A504547', + 'x-mac-creator': '474B4F4E', + 'x-extrastuff': 'make it longer'}, + [], + ('image/jpeg; name="wibble.JPG"; x-mac-type="4A504547"; ' + 'x-mac-creator="474B4F4E"; x-extrastuff="make it longer"'), + # In this case the whole of the MimeParameters does *not* fit + # one one line, so we break at a lower syntactic level. 
+ ('Content-Type: image/jpeg; name="wibble.JPG";' + ' x-mac-type="4A504547";\n' + ' x-mac-creator="474B4F4E"; x-extrastuff="make it longer"\n'), + ), + 'semis_inside_quotes': ( 'image/jpeg; name="Jim&&Jill"', 'image/jpeg', @@ -460,19 +480,25 @@ class TestContentTypeHeader(TestHeaderBase): [], r'image/jpeg; name="Jim \"Bob\" Jill"'), - # XXX: This test works except for the refolding of the header. I'll - # deal with that bug when I deal with the other folding bugs. - #'non_ascii_in_params': ( - # ('foo\xa7/bar; b\xa7r=two; ' - # 'baz=thr\xa7e'.encode('latin-1').decode('us-ascii', - # 'surrogateescape')), - # 'foo\uFFFD/bar', - # 'foo\uFFFD', - # 'bar', - # {'b\uFFFDr': 'two', 'baz': 'thr\uFFFDe'}, - # [errors.UndecodableBytesDefect]*3, - # 'foo�/bar; b�r="two"; baz="thr�e"', - # ), + 'non_ascii_in_params': ( + ('foo\xa7/bar; b\xa7r=two; ' + 'baz=thr\xa7e'.encode('latin-1').decode('us-ascii', + 'surrogateescape')), + 'foo\uFFFD/bar', + 'foo\uFFFD', + 'bar', + {'b\uFFFDr': 'two', 'baz': 'thr\uFFFDe'}, + [errors.UndecodableBytesDefect]*3, + 'foo�/bar; b�r="two"; baz="thr�e"', + # XXX Two bugs here: the mime type is not allowed to be an encoded + # word, and we shouldn't be emitting surrogates in the parameter + # names. But I don't know what the behavior should be here, so I'm + # punting for now. In practice this is unlikely to be encountered + # since headers with binary in them only come from a binary source + # and are almost certain to be re-emitted without refolding. + 'Content-Type: =?unknown-8bit?q?foo=A7?=/bar; b\udca7r="two";\n' + " baz*=unknown-8bit''thr%A7e\n", + ), # RFC 2231 parameter tests. @@ -494,19 +520,20 @@ class TestContentTypeHeader(TestHeaderBase): [], r'image/jpeg; bar="baz\"foobar\"baz"'), - # XXX: This test works except for the refolding of the header. I'll - # deal with that bug when I deal with the other folding bugs. 
- #'non_ascii_rfc2231_value': ( - # ('text/plain; charset=us-ascii; ' - # "title*=us-ascii'en'This%20is%20" - # 'not%20f\xa7n').encode('latin-1').decode('us-ascii', - # 'surrogateescape'), - # 'text/plain', - # 'text', - # 'plain', - # {'charset': 'us-ascii', 'title': 'This is not f\uFFFDn'}, - # [errors.UndecodableBytesDefect], - # 'text/plain; charset="us-ascii"; title="This is not f�n"'), + 'non_ascii_rfc2231_value': ( + ('text/plain; charset=us-ascii; ' + "title*=us-ascii'en'This%20is%20" + 'not%20f\xa7n').encode('latin-1').decode('us-ascii', + 'surrogateescape'), + 'text/plain', + 'text', + 'plain', + {'charset': 'us-ascii', 'title': 'This is not f\uFFFDn'}, + [errors.UndecodableBytesDefect], + 'text/plain; charset="us-ascii"; title="This is not f�n"', + 'Content-Type: text/plain; charset="us-ascii";\n' + " title*=unknown-8bit''This%20is%20not%20f%A7n\n", + ), 'rfc2231_encoded_charset': ( 'text/plain; charset*=ansi-x3.4-1968\'\'us-ascii', @@ -529,8 +556,6 @@ class TestContentTypeHeader(TestHeaderBase): {'name': 'This is ***fun*** is it not.pdf'}, [], 'text/plain; name="This is ***fun*** is it not.pdf"', - ('Content-Type: text/plain;\tname*0*=\'\'This%20is%20;\n' - '\tname*1*=%2A%2A%2Afun%2A%2A%2A%20;\tname*2="is it not.pdf"\n'), ), # Make sure we also handle it if there are spurious double quotes. 
@@ -545,9 +570,6 @@ class TestContentTypeHeader(TestHeaderBase): {'name': 'This is even more ***fun*** is it not.pdf'}, [errors.InvalidHeaderDefect]*2, 'text/plain; name="This is even more ***fun*** is it not.pdf"', - ('Content-Type: text/plain;\t' - 'name*0*="us-ascii\'\'This%20is%20even%20more%20";\n' - '\tname*1*="%2A%2A%2Afun%2A%2A%2A%20";\tname*2="is it not.pdf"\n'), ), 'rfc2231_single_quote_inside_double_quotes': ( @@ -562,9 +584,8 @@ class TestContentTypeHeader(TestHeaderBase): [errors.InvalidHeaderDefect]*2, ('text/plain; charset="us-ascii"; ' 'title="This is really ***fun*** isn\'t it!"'), - ('Content-Type: text/plain; charset=us-ascii;\n' - '\ttitle*0*="us-ascii\'en\'This%20is%20really%20";\n' - '\ttitle*1*="%2A%2A%2Afun%2A%2A%2A%20";\ttitle*2="isn\'t it!"\n'), + ('Content-Type: text/plain; charset="us-ascii";\n' + ' title="This is really ***fun*** isn\'t it!"\n'), ), 'rfc2231_single_quote_in_value_with_charset_and_lang': ( @@ -576,9 +597,6 @@ class TestContentTypeHeader(TestHeaderBase): {'name': "Frank's Document"}, [errors.InvalidHeaderDefect]*2, 'application/x-foo; name="Frank\'s Document"', - ('Content-Type: application/x-foo;\t' - 'name*0*="us-ascii\'en-us\'Frank\'s";\n' - ' name*1*=" Document"\n'), ), 'rfc2231_single_quote_in_non_encoded_value': ( @@ -590,9 +608,6 @@ class TestContentTypeHeader(TestHeaderBase): {'name': "us-ascii'en-us'Frank's Document"}, [], 'application/x-foo; name="us-ascii\'en-us\'Frank\'s Document"', - ('Content-Type: application/x-foo;\t' - 'name*0="us-ascii\'en-us\'Frank\'s";\n' - ' name*1=" Document"\n'), ), 'rfc2231_no_language_or_charset': ( @@ -615,12 +630,8 @@ class TestContentTypeHeader(TestHeaderBase): {'name': 'This is even more ***fun*** is it.pdf'}, [errors.InvalidHeaderDefect]*2, 'text/plain; name="This is even more ***fun*** is it.pdf"', - ('Content-Type: text/plain;\t' - 'name*0*="\'\'This%20is%20even%20more%20";\n' - '\tname*1*="%2A%2A%2Afun%2A%2A%2A%20";\tname*2="is it.pdf"\n'), ), - # XXX: see below...the 
first name line here should be *0 not *0*. 'rfc2231_partly_encoded': ( ("text/plain;" '\tname*0*="\'\'This%20is%20even%20more%20";' @@ -632,9 +643,6 @@ class TestContentTypeHeader(TestHeaderBase): {'name': 'This is even more ***fun*** is it.pdf'}, [errors.InvalidHeaderDefect]*2, 'text/plain; name="This is even more ***fun*** is it.pdf"', - ('Content-Type: text/plain;\t' - 'name*0*="\'\'This%20is%20even%20more%20";\n' - '\tname*1*="%2A%2A%2Afun%2A%2A%2A%20";\tname*2="is it.pdf"\n'), ), 'rfc2231_partly_encoded_2': ( @@ -647,10 +655,11 @@ class TestContentTypeHeader(TestHeaderBase): 'plain', {'name': 'This is even more %2A%2A%2Afun%2A%2A%2A%20is it.pdf'}, [errors.InvalidHeaderDefect], - 'text/plain; name="This is even more %2A%2A%2Afun%2A%2A%2A%20is it.pdf"', - ('Content-Type: text/plain;\t' - 'name*0*="\'\'This%20is%20even%20more%20";\n' - '\tname*1="%2A%2A%2Afun%2A%2A%2A%20";\tname*2="is it.pdf"\n'), + ('text/plain;' + ' name="This is even more %2A%2A%2Afun%2A%2A%2A%20is it.pdf"'), + ('Content-Type: text/plain;\n' + ' name="This is even more %2A%2A%2Afun%2A%2A%2A%20is' + ' it.pdf"\n'), ), 'rfc2231_unknown_charset_treated_as_ascii': ( @@ -669,9 +678,12 @@ class TestContentTypeHeader(TestHeaderBase): 'plain', {'charset': 'utf-8\uFFFD\uFFFD\uFFFD'}, [errors.UndecodableBytesDefect], - 'text/plain; charset="utf-8\uFFFD\uFFFD\uFFFD"'), + 'text/plain; charset="utf-8\uFFFD\uFFFD\uFFFD"', + "Content-Type: text/plain;" + " charset*=unknown-8bit''utf-8%F1%F2%F3\n", + ), - 'rfc2231_utf_8_in_supposedly_ascii_charset_parameter_value': ( + 'rfc2231_utf8_in_supposedly_ascii_charset_parameter_value': ( "text/plain; charset*=ascii''utf-8%E2%80%9D", 'text/plain', 'text', @@ -679,9 +691,11 @@ class TestContentTypeHeader(TestHeaderBase): {'charset': 'utf-8”'}, [errors.UndecodableBytesDefect], 'text/plain; charset="utf-8”"', + # XXX Should folding change the charset to utf8? Currently it just + # reproduces the original, which is arguably fine. 
+ "Content-Type: text/plain;" + " charset*=unknown-8bit''utf-8%E2%80%9D\n", ), - # XXX: if the above were *re*folded, it would get tagged as utf-8 - # instead of ascii in the param, since it now contains non-ASCII. 'rfc2231_encoded_then_unencoded_segments': ( ('application/x-foo;' @@ -694,9 +708,6 @@ class TestContentTypeHeader(TestHeaderBase): {'name': 'My Document For You'}, [errors.InvalidHeaderDefect], 'application/x-foo; name="My Document For You"', - ('Content-Type: application/x-foo;\t' - 'name*0*="us-ascii\'en-us\'My";\n' - '\tname*1=" Document";\tname*2=" For You"\n'), ), # My reading of the RFC is that this is an invalid header. The RFC @@ -713,11 +724,6 @@ class TestContentTypeHeader(TestHeaderBase): {'name': 'My Document For You'}, [errors.InvalidHeaderDefect]*3, 'application/x-foo; name="My Document For You"', - ("Content-Type: application/x-foo;\tname*0=us-ascii'en-us'My;\t" - # XXX: the newline is in the wrong place, come back and fix - # this when the rest of tests pass. - 'name*1*=" Document"\n;' - '\tname*2*=" For You"\n'), ), # XXX: I would say this one should default to ascii/en for the @@ -730,8 +736,7 @@ class TestContentTypeHeader(TestHeaderBase): # charset'lang'value pattern exactly *and* there is at least one # encoded segment. Implementing that algorithm will require some # refactoring, so I haven't done it (yet). - - 'rfc2231_qouted_unencoded_then_encoded_segments': ( + 'rfc2231_quoted_unencoded_then_encoded_segments': ( ('application/x-foo;' '\tname*0="us-ascii\'en-us\'My";' '\tname*1*=" Document";' @@ -742,9 +747,25 @@ class TestContentTypeHeader(TestHeaderBase): {'name': "us-ascii'en-us'My Document For You"}, [errors.InvalidHeaderDefect]*2, 'application/x-foo; name="us-ascii\'en-us\'My Document For You"', - ('Content-Type: application/x-foo;\t' - 'name*0="us-ascii\'en-us\'My";\n' - '\tname*1*=" Document";\tname*2*=" For You"\n'), + ), + + # Make sure our folding algorithm produces multiple sections correctly. 
+ # We could mix encoded and non-encoded segments, but we don't, we just + # make them all encoded. It might be worth fixing that, since the + # sections can get used for wrapping ascii text. + 'rfc2231_folded_segments_correctly_formatted': ( + ('application/x-foo;' + '\tname="' + "with spaces"*8 + '"'), + 'application/x-foo', + 'application', + 'x-foo', + {'name': "with spaces"*8}, + [], + 'application/x-foo; name="' + "with spaces"*8 + '"', + "Content-Type: application/x-foo;\n" + " name*0*=us-ascii''with%20spaceswith%20spaceswith%20spaceswith" + "%20spaceswith;\n" + " name*1*=%20spaceswith%20spaceswith%20spaceswith%20spaces\n" ), } @@ -827,8 +848,8 @@ class TestContentDisposition(TestHeaderBase): [], ('attachment; filename="genome.jpeg"; ' 'modification-date="Wed, 12 Feb 1997 16:29:51 -0500"'), - ('Content-Disposition: attachment; filename=genome.jpeg;\n' - ' modification-date="Wed, 12 Feb 1997 16:29:51 -0500";\n'), + ('Content-Disposition: attachment; filename="genome.jpeg";\n' + ' modification-date="Wed, 12 Feb 1997 16:29:51 -0500"\n'), ), 'no_value': ( @@ -873,7 +894,7 @@ class TestMIMEVersionHeader(TestHeaderBase): if source: source = ' ' + source self.assertEqual(h.fold(policy=policy.default), - 'MIME-Version:' + source + '\n') + 'MIME-Version:' + source + '\n') version_string_params = { @@ -1546,15 +1567,39 @@ class TestFolding(TestHeaderBase): 'singlewordthatwontfit') self.assertEqual( h.fold(policy=policy.default.clone(max_line_length=20)), - 'Subject: thisisaverylonglineconsistingofasinglewordthatwontfit\n') + 'Subject: \n' + ' =?utf-8?q?thisisa?=\n' + ' =?utf-8?q?verylon?=\n' + ' =?utf-8?q?glineco?=\n' + ' =?utf-8?q?nsistin?=\n' + ' =?utf-8?q?gofasin?=\n' + ' =?utf-8?q?gleword?=\n' + ' =?utf-8?q?thatwon?=\n' + ' =?utf-8?q?tfit?=\n' + ) def test_fold_unstructured_with_two_overlong_words(self): h = self.make_header('Subject', 'thisisaverylonglineconsistingofa' 'singlewordthatwontfit plusanotherverylongwordthatwontfit') self.assertEqual( 
h.fold(policy=policy.default.clone(max_line_length=20)), - 'Subject: thisisaverylonglineconsistingofasinglewordthatwontfit\n' - ' plusanotherverylongwordthatwontfit\n') + 'Subject: \n' + ' =?utf-8?q?thisisa?=\n' + ' =?utf-8?q?verylon?=\n' + ' =?utf-8?q?glineco?=\n' + ' =?utf-8?q?nsistin?=\n' + ' =?utf-8?q?gofasin?=\n' + ' =?utf-8?q?gleword?=\n' + ' =?utf-8?q?thatwon?=\n' + ' =?utf-8?q?tfit_pl?=\n' + ' =?utf-8?q?usanoth?=\n' + ' =?utf-8?q?erveryl?=\n' + ' =?utf-8?q?ongword?=\n' + ' =?utf-8?q?thatwon?=\n' + ' =?utf-8?q?tfit?=\n' + ) + + # XXX Need test for when max_line_length is less than the chrome size. def test_fold_unstructured_with_slightly_long_word(self): h = self.make_header('Subject', 'thislongwordislessthanmaxlinelen') @@ -1590,6 +1635,18 @@ class TestFolding(TestHeaderBase): self.assertEqual(h.fold(policy=policy.default), 'Date: Sat, 02 Feb 2002 17:00:06 -0800\n') + def test_fold_overlong_words_using_RFC2047(self): + h = self.make_header( + 'X-Report-Abuse', + '') + self.assertEqual( + h.fold(policy=policy.default), + 'X-Report-Abuse: =?utf-8?q?=3Chttps=3A//www=2Emailitapp=2E' + 'com/report=5F?=\n' + ' =?utf-8?q?abuse=2Ephp=3Fmid=3Dxxx-xxx-xxxx' + 'xxxxxxxxxxxxxxxxxxxx=3D=3D-xxx-?=\n' + ' =?utf-8?q?xx-xx=3E?=\n') if __name__ == '__main__': diff --git a/Misc/NEWS.d/next/Library/2017-12-02-16-06-00.bpo-27240.Kji34M.rst b/Misc/NEWS.d/next/Library/2017-12-02-16-06-00.bpo-27240.Kji34M.rst new file mode 100644 index 00000000000..c933ee7d916 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2017-12-02-16-06-00.bpo-27240.Kji34M.rst @@ -0,0 +1,3 @@ +The header folding algorithm for the new email policies has been rewritten, +which also fixes bpo-30788, bpo-31831, and bpo-32182. In particular, RFC2231 +folding is now done correctly.