More RFC 2231 improvements for the email 4.0 package. As Mark Sapiro rightly

points out there are really two types of continued headers defined in this
RFC (i.e. "encoded" parameters with the form "name*0*=" and unencoded
parameters with the form "name*0="), but we were handling them both the
same way and that isn't correct.

This patch should be much more RFC compliant in that only encoded params are
%-decoded and the charset/language information is only extracted if there are
any encoded params in the segments.  If there are no encoded params then the
RFC says that there will be no charset/language parts.

Note however that this will change the return value for Message.get_param() in
some cases.  For example, whereas before if you had all unencoded param
continuations you would have still gotten a 3-tuple back from this method
(with charset and language == None), you will now get just a string.  I don't
believe this is a backward incompatible change though because the
documentation for this method already indicates that either return value is
possible and that you must do an isinstance(val, tuple) check to discriminate
between the two.  (Yeah that API kind of sucks but we can't change /that/
without breaking code.)

Test cases, some documentation updates, and a NEWS item accompany this patch.
This commit is contained in:
Barry Warsaw 2006-07-21 14:51:07 +00:00
parent d12bd012a6
commit b110bad2d9
5 changed files with 299 additions and 42 deletions

View File

@ -105,7 +105,7 @@ of the package.
\lineiii{4.0}{Python 2.5}{Python 2.3 to 2.5}
\end{tableiii}
Here are the major differences between \module{email} verson 4 and version 3:
Here are the major differences between \module{email} version 4 and version 3:
\begin{itemize}
\item All modules have been renamed according to \pep{8} standards. For
@ -126,6 +126,15 @@ Here are the major differences between \module{email} verson 4 and version 3:
\item Methods that were deprecated in version 3 have been removed. These
include \method{Generator.__call__()}, \method{Message.get_type()},
\method{Message.get_main_type()}, \method{Message.get_subtype()}.
\item Fixes have been added for \rfc{2231} support which can change some of
the return types for \function{Message.get_param()} and friends. Under
some circumstances, values which used to return a 3-tuple now return
simple strings (specifically, if all extended parameter segments were
unencoded, there is no language and charset designation expected, so the
return type is now a simple string). Also, \%-decoding used to be done
for both encoded and unencoded segments; this decoding is now done only
for encoded segments.
\end{itemize}
Here are the major differences between \module{email} version 3 and version 2:

View File

@ -3005,26 +3005,67 @@ Content-Type: text/html; NAME*0=file____C__DOCUMENTS_20AND_20SETTINGS_FABIEN_LOC
'''
msg = email.message_from_string(m)
self.assertEqual(msg.get_param('NAME'),
(None, None, 'file____C__DOCUMENTS_20AND_20SETTINGS_FABIEN_LOCAL_20SETTINGS_TEMP_nsmail.htm'))
param = msg.get_param('NAME')
self.failIf(isinstance(param, tuple))
self.assertEqual(
param,
'file____C__DOCUMENTS_20AND_20SETTINGS_FABIEN_LOCAL_20SETTINGS_TEMP_nsmail.htm')
def test_rfc2231_no_language_or_charset_in_filename(self):
m = '''\
Content-Disposition: inline;
\tfilename*0*="''This%20is%20even%20more%20";
\tfilename*1*="%2A%2A%2Afun%2A%2A%2A%20";
\tfilename*2="is it not.pdf"
'''
msg = email.message_from_string(m)
self.assertEqual(msg.get_filename(),
'This is even more ***fun*** is it not.pdf')
def test_rfc2231_no_language_or_charset_in_filename_encoded(self):
m = '''\
Content-Disposition: inline;
\tfilename*0*="''This%20is%20even%20more%20";
\tfilename*1*="%2A%2A%2Afun%2A%2A%2A%20";
\tfilename*2="is it not.pdf"
'''
msg = email.message_from_string(m)
self.assertEqual(msg.get_filename(),
'This is even more ***fun*** is it not.pdf')
def test_rfc2231_partly_encoded(self):
m = '''\
Content-Disposition: inline;
\tfilename*0="''This%20is%20even%20more%20";
\tfilename*1*="%2A%2A%2Afun%2A%2A%2A%20";
\tfilename*2="is it not.pdf"
'''
msg = email.message_from_string(m)
self.assertEqual(
msg.get_filename(),
'This%20is%20even%20more%20***fun*** is it not.pdf')
def test_rfc2231_partly_nonencoded(self):
m = '''\
Content-Disposition: inline;
\tfilename*0="This%20is%20even%20more%20";
\tfilename*1="%2A%2A%2Afun%2A%2A%2A%20";
\tfilename*2="is it not.pdf"
'''
msg = email.message_from_string(m)
self.assertEqual(msg.get_filename(),
'This is even more ***fun*** is it not.pdf')
self.assertEqual(
msg.get_filename(),
'This%20is%20even%20more%20%2A%2A%2Afun%2A%2A%2A%20is it not.pdf')
def test_rfc2231_no_language_or_charset_in_boundary(self):
m = '''\
Content-Type: multipart/alternative;
\tboundary*0="This%20is%20even%20more%20";
\tboundary*1="%2A%2A%2Afun%2A%2A%2A%20";
\tboundary*0*="''This%20is%20even%20more%20";
\tboundary*1*="%2A%2A%2Afun%2A%2A%2A%20";
\tboundary*2="is it not.pdf"
'''
@ -3036,8 +3077,8 @@ Content-Type: multipart/alternative;
# This is a nonsensical charset value, but tests the code anyway
m = '''\
Content-Type: text/plain;
\tcharset*0="This%20is%20even%20more%20";
\tcharset*1="%2A%2A%2Afun%2A%2A%2A%20";
\tcharset*0*="This%20is%20even%20more%20";
\tcharset*1*="%2A%2A%2Afun%2A%2A%2A%20";
\tcharset*2="is it not.pdf"
'''
@ -3048,12 +3089,98 @@ Content-Type: text/plain;
def test_rfc2231_unknown_encoding(self):
m = """\
Content-Transfer-Encoding: 8bit
Content-Disposition: inline; filename*0=X-UNKNOWN''myfile.txt
Content-Disposition: inline; filename*=X-UNKNOWN''myfile.txt
"""
msg = email.message_from_string(m)
self.assertEqual(msg.get_filename(), 'myfile.txt')
def test_rfc2231_single_tick_in_filename_extended(self):
eq = self.assertEqual
m = """\
Content-Type: application/x-foo;
\tname*0*=\"Frank's\"; name*1*=\" Document\"
"""
msg = email.message_from_string(m)
charset, language, s = msg.get_param('name')
eq(charset, None)
eq(language, None)
eq(s, "Frank's Document")
def test_rfc2231_single_tick_in_filename(self):
m = """\
Content-Type: application/x-foo; name*0=\"Frank's\"; name*1=\" Document\"
"""
msg = email.message_from_string(m)
param = msg.get_param('name')
self.failIf(isinstance(param, tuple))
self.assertEqual(param, "Frank's Document")
def test_rfc2231_tick_attack_extended(self):
eq = self.assertEqual
m = """\
Content-Type: application/x-foo;
\tname*0*=\"us-ascii'en-us'Frank's\"; name*1*=\" Document\"
"""
msg = email.message_from_string(m)
charset, language, s = msg.get_param('name')
eq(charset, 'us-ascii')
eq(language, 'en-us')
eq(s, "Frank's Document")
def test_rfc2231_tick_attack(self):
m = """\
Content-Type: application/x-foo;
\tname*0=\"us-ascii'en-us'Frank's\"; name*1=\" Document\"
"""
msg = email.message_from_string(m)
param = msg.get_param('name')
self.failIf(isinstance(param, tuple))
self.assertEqual(param, "us-ascii'en-us'Frank's Document")
def test_rfc2231_no_extended_values(self):
eq = self.assertEqual
m = """\
Content-Type: application/x-foo; name=\"Frank's Document\"
"""
msg = email.message_from_string(m)
eq(msg.get_param('name'), "Frank's Document")
def test_rfc2231_encoded_then_unencoded_segments(self):
eq = self.assertEqual
m = """\
Content-Type: application/x-foo;
\tname*0*=\"us-ascii'en-us'My\";
\tname*1=\" Document\";
\tname*2*=\" For You\"
"""
msg = email.message_from_string(m)
charset, language, s = msg.get_param('name')
eq(charset, 'us-ascii')
eq(language, 'en-us')
eq(s, 'My Document For You')
def test_rfc2231_unencoded_then_encoded_segments(self):
eq = self.assertEqual
m = """\
Content-Type: application/x-foo;
\tname*0=\"us-ascii'en-us'My\";
\tname*1*=\" Document\";
\tname*2*=\" For You\"
"""
msg = email.message_from_string(m)
charset, language, s = msg.get_param('name')
eq(charset, 'us-ascii')
eq(language, 'en-us')
eq(s, 'My Document For You')
def _testclasses():

View File

@ -3011,26 +3011,67 @@ Content-Type: text/html; NAME*0=file____C__DOCUMENTS_20AND_20SETTINGS_FABIEN_LOC
'''
msg = email.message_from_string(m)
self.assertEqual(msg.get_param('NAME'),
(None, None, 'file____C__DOCUMENTS_20AND_20SETTINGS_FABIEN_LOCAL_20SETTINGS_TEMP_nsmail.htm'))
param = msg.get_param('NAME')
self.failIf(isinstance(param, tuple))
self.assertEqual(
param,
'file____C__DOCUMENTS_20AND_20SETTINGS_FABIEN_LOCAL_20SETTINGS_TEMP_nsmail.htm')
def test_rfc2231_no_language_or_charset_in_filename(self):
m = '''\
Content-Disposition: inline;
\tfilename*0*="''This%20is%20even%20more%20";
\tfilename*1*="%2A%2A%2Afun%2A%2A%2A%20";
\tfilename*2="is it not.pdf"
'''
msg = email.message_from_string(m)
self.assertEqual(msg.get_filename(),
'This is even more ***fun*** is it not.pdf')
def test_rfc2231_no_language_or_charset_in_filename_encoded(self):
m = '''\
Content-Disposition: inline;
\tfilename*0*="''This%20is%20even%20more%20";
\tfilename*1*="%2A%2A%2Afun%2A%2A%2A%20";
\tfilename*2="is it not.pdf"
'''
msg = email.message_from_string(m)
self.assertEqual(msg.get_filename(),
'This is even more ***fun*** is it not.pdf')
def test_rfc2231_partly_encoded(self):
m = '''\
Content-Disposition: inline;
\tfilename*0="''This%20is%20even%20more%20";
\tfilename*1*="%2A%2A%2Afun%2A%2A%2A%20";
\tfilename*2="is it not.pdf"
'''
msg = email.message_from_string(m)
self.assertEqual(
msg.get_filename(),
'This%20is%20even%20more%20***fun*** is it not.pdf')
def test_rfc2231_partly_nonencoded(self):
m = '''\
Content-Disposition: inline;
\tfilename*0="This%20is%20even%20more%20";
\tfilename*1="%2A%2A%2Afun%2A%2A%2A%20";
\tfilename*2="is it not.pdf"
'''
msg = email.message_from_string(m)
self.assertEqual(msg.get_filename(),
'This is even more ***fun*** is it not.pdf')
self.assertEqual(
msg.get_filename(),
'This%20is%20even%20more%20%2A%2A%2Afun%2A%2A%2A%20is it not.pdf')
def test_rfc2231_no_language_or_charset_in_boundary(self):
m = '''\
Content-Type: multipart/alternative;
\tboundary*0="This%20is%20even%20more%20";
\tboundary*1="%2A%2A%2Afun%2A%2A%2A%20";
\tboundary*0*="''This%20is%20even%20more%20";
\tboundary*1*="%2A%2A%2Afun%2A%2A%2A%20";
\tboundary*2="is it not.pdf"
'''
@ -3042,8 +3083,8 @@ Content-Type: multipart/alternative;
# This is a nonsensical charset value, but tests the code anyway
m = '''\
Content-Type: text/plain;
\tcharset*0="This%20is%20even%20more%20";
\tcharset*1="%2A%2A%2Afun%2A%2A%2A%20";
\tcharset*0*="This%20is%20even%20more%20";
\tcharset*1*="%2A%2A%2Afun%2A%2A%2A%20";
\tcharset*2="is it not.pdf"
'''
@ -3054,16 +3095,17 @@ Content-Type: text/plain;
def test_rfc2231_unknown_encoding(self):
m = """\
Content-Transfer-Encoding: 8bit
Content-Disposition: inline; filename*0=X-UNKNOWN''myfile.txt
Content-Disposition: inline; filename*=X-UNKNOWN''myfile.txt
"""
msg = email.message_from_string(m)
self.assertEqual(msg.get_filename(), 'myfile.txt')
def test_rfc2231_single_tick_in_filename(self):
def test_rfc2231_single_tick_in_filename_extended(self):
eq = self.assertEqual
m = """\
Content-Type: application/x-foo; name*0=\"Frank's\"; name*1=\" Document\"
Content-Type: application/x-foo;
\tname*0*=\"Frank's\"; name*1*=\" Document\"
"""
msg = email.message_from_string(m)
@ -3072,11 +3114,21 @@ Content-Type: application/x-foo; name*0=\"Frank's\"; name*1=\" Document\"
eq(language, None)
eq(s, "Frank's Document")
def test_rfc2231_tick_attack(self):
def test_rfc2231_single_tick_in_filename(self):
m = """\
Content-Type: application/x-foo; name*0=\"Frank's\"; name*1=\" Document\"
"""
msg = email.message_from_string(m)
param = msg.get_param('name')
self.failIf(isinstance(param, tuple))
self.assertEqual(param, "Frank's Document")
def test_rfc2231_tick_attack_extended(self):
eq = self.assertEqual
m = """\
Content-Type: application/x-foo;
\tname*0=\"us-ascii'en-us'Frank's\"; name*1=\" Document\"
\tname*0*=\"us-ascii'en-us'Frank's\"; name*1*=\" Document\"
"""
msg = email.message_from_string(m)
@ -3085,6 +3137,17 @@ Content-Type: application/x-foo;
eq(language, 'en-us')
eq(s, "Frank's Document")
def test_rfc2231_tick_attack(self):
m = """\
Content-Type: application/x-foo;
\tname*0=\"us-ascii'en-us'Frank's\"; name*1=\" Document\"
"""
msg = email.message_from_string(m)
param = msg.get_param('name')
self.failIf(isinstance(param, tuple))
self.assertEqual(param, "us-ascii'en-us'Frank's Document")
def test_rfc2231_no_extended_values(self):
eq = self.assertEqual
m = """\
@ -3094,6 +3157,36 @@ Content-Type: application/x-foo; name=\"Frank's Document\"
msg = email.message_from_string(m)
eq(msg.get_param('name'), "Frank's Document")
def test_rfc2231_encoded_then_unencoded_segments(self):
eq = self.assertEqual
m = """\
Content-Type: application/x-foo;
\tname*0*=\"us-ascii'en-us'My\";
\tname*1=\" Document\";
\tname*2*=\" For You\"
"""
msg = email.message_from_string(m)
charset, language, s = msg.get_param('name')
eq(charset, 'us-ascii')
eq(language, 'en-us')
eq(s, 'My Document For You')
def test_rfc2231_unencoded_then_encoded_segments(self):
eq = self.assertEqual
m = """\
Content-Type: application/x-foo;
\tname*0=\"us-ascii'en-us'My\";
\tname*1*=\" Document\";
\tname*2*=\" For You\"
"""
msg = email.message_from_string(m)
charset, language, s = msg.get_param('name')
eq(charset, 'us-ascii')
eq(language, 'en-us')
eq(s, 'My Document For You')
def _testclasses():

View File

@ -25,6 +25,7 @@ import time
import base64
import random
import socket
import urllib
import warnings
from cStringIO import StringIO
@ -231,16 +232,14 @@ def unquote(str):
# RFC2231-related functions - parameter encoding and decoding
def decode_rfc2231(s):
"""Decode string according to RFC 2231"""
import urllib
parts = s.split(TICK, 2)
if len(parts) <= 2:
return None, None, urllib.unquote(s)
return None, None, s
if len(parts) > 3:
charset, language = parts[:2]
s = TICK.join(parts[2:])
else:
charset, language, s = parts
return charset, language, urllib.unquote(s)
return charset, language, s
return parts
def encode_rfc2231(s, charset=None, language=None):
@ -264,37 +263,54 @@ rfc2231_continuation = re.compile(r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$')
def decode_params(params):
"""Decode parameters list according to RFC 2231.
params is a sequence of 2-tuples containing (content type, string value).
params is a sequence of 2-tuples containing (param name, string value).
"""
# Copy params so we don't mess with the original
params = params[:]
new_params = []
# maps parameter's name to a list of continuations
# Map parameter's name to a list of continuations. The values are a
# 3-tuple of the continuation number, the string value, and a flag
# specifying whether a particular segment is %-encoded.
rfc2231_params = {}
# params is a sequence of 2-tuples containing (content_type, string value)
name, value = params[0]
name, value = params.pop(0)
new_params.append((name, value))
# Cycle through each of the rest of the parameters.
for name, value in params[1:]:
while params:
name, value = params.pop(0)
if name.endswith('*'):
encoded = True
else:
encoded = False
value = unquote(value)
mo = rfc2231_continuation.match(name)
if mo:
name, num = mo.group('name', 'num')
if num is not None:
num = int(num)
rfc2231_param1 = rfc2231_params.setdefault(name, [])
rfc2231_param1.append((num, value))
rfc2231_params.setdefault(name, []).append((num, value, encoded))
else:
new_params.append((name, '"%s"' % quote(value)))
if rfc2231_params:
for name, continuations in rfc2231_params.items():
value = []
extended = False
# Sort by number
continuations.sort()
# And now append all values in num order
for num, continuation in continuations:
value.append(continuation)
charset, language, value = decode_rfc2231(EMPTYSTRING.join(value))
new_params.append(
(name, (charset, language, '"%s"' % quote(value))))
# And now append all values in numerical order, converting
# %-encodings for the encoded segments. If any of the
# continuation names ends in a *, then the entire string, after
# decoding segments and concatenating, must have the charset and
# language specifiers at the beginning of the string.
for num, s, encoded in continuations:
if encoded:
s = urllib.unquote(s)
extended = True
value.append(s)
value = quote(EMPTYSTRING.join(value))
if extended:
charset, language, value = decode_rfc2231(value)
new_params.append((name, (charset, language, '"%s"' % value)))
else:
new_params.append((name, '"%s"' % value))
return new_params
def collapse_rfc2231_value(value, errors='replace',

View File

@ -44,6 +44,18 @@ Library
- Patch #1220874: Update the binhex module for Mach-O.
- The email package has improved RFC 2231 support, specifically for
recognizing the difference between encoded (name*0*=<blah>) and non-encoded
(name*0=<blah>) parameter continuations. This may change the types of
values returned from email.message.Message.get_param() and friends.
Specifically in some cases where non-encoded continuations were used,
get_param() used to return a 3-tuple of (None, None, string) whereas now it
will just return the string (since non-encoded continuations don't have
charset and language parts).
Also, whereas % values were decoded in all parameter continuations, they are
now only decoded in encoded parameter parts.
Extension Modules
-----------------