Added a feature suggested by Martin v. Loewis: a new header encoding
flag, SHORTEST, means to use whichever of base64 and quoted-printable (qp)
produces the shorter encoding.  This is used for the header_enc for utf-8.
SHORTEST isn't legal for body_enc.

Also some code cleanup:

- use True/False everywhere
- use == instead of `is' in a few places
- added _isunicode() and made the "is unicode" checks consistent
- update docstrings
This commit is contained in:
Barry Warsaw 2002-09-28 17:47:56 +00:00
parent 176916a989
commit 5932c9bedd
1 changed files with 55 additions and 37 deletions

View File

@ -1,26 +1,27 @@
# Copyright (C) 2001,2002 Python Software Foundation # Copyright (C) 2001,2002 Python Software Foundation
# Author: che@debian.org (Ben Gertzfield) # Author: che@debian.org (Ben Gertzfield), barry@zope.com (Barry Warsaw)
try:
unicode
except NameError:
def _is_unicode(x):
return 1==0
else:
# Use UnicodeType instead of built-in unicode for Py2.1 compatibility
from types import UnicodeType
def _is_unicode(x):
return isinstance(x, UnicodeType)
from types import UnicodeType
from email.Encoders import encode_7or8bit from email.Encoders import encode_7or8bit
import email.base64MIME import email.base64MIME
import email.quopriMIME import email.quopriMIME
def _isunicode(s):
return isinstance(s, UnicodeType)
# Python 2.2.1 and beyond has these symbols
try:
True, False
except NameError:
True = 1
False = 0
# Flags for types of header encodings # Flags for types of header encodings
QP = 1 # Quoted-Printable QP = 1 # Quoted-Printable
BASE64 = 2 # Base64 BASE64 = 2 # Base64
SHORTEST = 3 # the shorter of QP and base64, but only for headers
# In "=?charset?q?hello_world?=", the =?, ?q?, and ?= add up to 7 # In "=?charset?q?hello_world?=", the =?, ?q?, and ?= add up to 7
MISC_LEN = 7 MISC_LEN = 7
@ -41,7 +42,7 @@ CHARSETS = {
'shift_jis': (BASE64, None, 'iso-2022-jp'), 'shift_jis': (BASE64, None, 'iso-2022-jp'),
'iso-2022-jp': (BASE64, None, None), 'iso-2022-jp': (BASE64, None, None),
'koi8-r': (BASE64, BASE64, None), 'koi8-r': (BASE64, BASE64, None),
'utf-8': (BASE64, BASE64, 'utf-8'), 'utf-8': (SHORTEST, BASE64, 'utf-8'),
} }
# Aliases for other commonly-used names for character sets. Map # Aliases for other commonly-used names for character sets. Map
@ -90,9 +91,11 @@ def add_charset(charset, header_enc=None, body_enc=None, output_charset=None):
character set. character set.
Optional header_enc and body_enc is either Charset.QP for Optional header_enc and body_enc is either Charset.QP for
quoted-printable, Charset.BASE64 for base64 encoding, or None for no quoted-printable, Charset.BASE64 for base64 encoding, Charset.SHORTEST for
encoding. It describes how message headers and message bodies in the the shortest of qp or base64 encoding, or None for no encoding. SHORTEST
input charset are to be encoded. Default is no encoding. is only valid for header_enc. It describes how message headers and
message bodies in the input charset are to be encoded. Default is no
encoding.
Optional output_charset is the character set that the output should be Optional output_charset is the character set that the output should be
in. Conversions will proceed from input charset, to Unicode, to the in. Conversions will proceed from input charset, to Unicode, to the
@ -104,6 +107,8 @@ def add_charset(charset, header_enc=None, body_enc=None, output_charset=None):
to add codecs the module does not know about. See the codec module's to add codecs the module does not know about. See the codec module's
documentation for more information. documentation for more information.
""" """
if body_enc == SHORTEST:
raise ValueError, 'SHORTEST not allowed for body_enc'
CHARSETS[charset] = (header_enc, body_enc, output_charset) CHARSETS[charset] = (header_enc, body_enc, output_charset)
@ -147,12 +152,14 @@ class Charset:
header_encoding: If the character set must be encoded before it can be header_encoding: If the character set must be encoded before it can be
used in an email header, this attribute will be set to used in an email header, this attribute will be set to
Charset.QP (for quoted-printable) or Charset.BASE64 (for Charset.QP (for quoted-printable), Charset.BASE64 (for
base64 encoding). Otherwise, it will be None. base64 encoding), or Charset.SHORTEST for the shortest of
QP or BASE64 encoding. Otherwise, it will be None.
body_encoding: Same as header_encoding, but describes the encoding for the body_encoding: Same as header_encoding, but describes the encoding for the
mail message's body, which indeed may be different than the mail message's body, which indeed may be different than the
header encoding. header encoding. Charset.SHORTEST is not allowed for
body_encoding.
output_charset: Some character sets must be converted before they can be output_charset: Some character sets must be converted before they can be
used in email headers or bodies. If the input_charset is used in email headers or bodies. If the input_charset is
@ -175,7 +182,7 @@ class Charset:
# charset_map dictionary. Try that first, but let the user override # charset_map dictionary. Try that first, but let the user override
# it. # it.
henc, benc, conv = CHARSETS.get(self.input_charset, henc, benc, conv = CHARSETS.get(self.input_charset,
(BASE64, BASE64, None)) (SHORTEST, SHORTEST, None))
# Set the attributes, allowing the arguments to override the default. # Set the attributes, allowing the arguments to override the default.
self.header_encoding = henc self.header_encoding = henc
self.body_encoding = benc self.body_encoding = benc
@ -209,6 +216,7 @@ class Charset:
Returns "base64" if self.body_encoding is BASE64. Returns "base64" if self.body_encoding is BASE64.
Returns "7bit" otherwise. Returns "7bit" otherwise.
""" """
assert self.body_encoding <> SHORTEST
if self.body_encoding == QP: if self.body_encoding == QP:
return 'quoted-printable' return 'quoted-printable'
elif self.body_encoding == BASE64: elif self.body_encoding == BASE64:
@ -236,7 +244,7 @@ class Charset:
Characters that could not be converted to Unicode will be replaced Characters that could not be converted to Unicode will be replaced
with the Unicode replacement character U+FFFD. with the Unicode replacement character U+FFFD.
""" """
if _is_unicode(s) or self.input_codec is None: if _isunicode(s) or self.input_codec is None:
return s return s
try: try:
return unicode(s, self.input_codec, 'replace') return unicode(s, self.input_codec, 'replace')
@ -245,7 +253,7 @@ class Charset:
# string unchanged. # string unchanged.
return s return s
def from_splittable(self, ustr, to_output=1): def from_splittable(self, ustr, to_output=True):
"""Convert a splittable string back into an encoded string. """Convert a splittable string back into an encoded string.
Uses the proper codec to try and convert the string from Uses the proper codec to try and convert the string from
@ -256,15 +264,14 @@ class Charset:
Characters that could not be converted from Unicode will be replaced Characters that could not be converted from Unicode will be replaced
with an appropriate character (usually '?'). with an appropriate character (usually '?').
If to_output is true, uses output_codec to convert to an encoded If to_output is True (the default), uses output_codec to convert to an
format. If to_output is false, uses input_codec. to_output defaults encoded format. If to_output is False, uses input_codec.
to 1.
""" """
if to_output: if to_output:
codec = self.output_codec codec = self.output_codec
else: else:
codec = self.input_codec codec = self.input_codec
if not _is_unicode(ustr) or codec is None: if not _isunicode(ustr) or codec is None:
return ustr return ustr
try: try:
return ustr.encode(codec, 'replace') return ustr.encode(codec, 'replace')
@ -284,22 +291,26 @@ class Charset:
"""Return the length of the encoded header string.""" """Return the length of the encoded header string."""
cset = self.get_output_charset() cset = self.get_output_charset()
# The len(s) of a 7bit encoding is len(s) # The len(s) of a 7bit encoding is len(s)
if self.header_encoding is BASE64: if self.header_encoding == BASE64:
return email.base64MIME.base64_len(s) + len(cset) + MISC_LEN return email.base64MIME.base64_len(s) + len(cset) + MISC_LEN
elif self.header_encoding is QP: elif self.header_encoding == QP:
return email.quopriMIME.header_quopri_len(s) + len(cset) + MISC_LEN return email.quopriMIME.header_quopri_len(s) + len(cset) + MISC_LEN
elif self.header_encoding == SHORTEST:
lenb64 = email.base64MIME.base64_len(s)
lenqp = email.quopriMIME.header_quopri_len(s)
return min(lenb64, lenqp) + len(cset) + MISC_LEN
else: else:
return len(s) return len(s)
def header_encode(self, s, convert=0): def header_encode(self, s, convert=False):
"""Header-encode a string, optionally converting it to output_charset. """Header-encode a string, optionally converting it to output_charset.
If convert is true, the string will be converted from the input If convert is True, the string will be converted from the input
charset to the output charset automatically. This is not useful for charset to the output charset automatically. This is not useful for
multibyte character sets, which have line length issues (multibyte multibyte character sets, which have line length issues (multibyte
characters must be split on a character, not a byte boundary); use the characters must be split on a character, not a byte boundary); use the
high-level Header class to deal with these issues. convert defaults high-level Header class to deal with these issues. convert defaults
to 0. to False.
The type of encoding (base64 or quoted-printable) will be based on The type of encoding (base64 or quoted-printable) will be based on
self.header_encoding. self.header_encoding.
@ -308,17 +319,24 @@ class Charset:
if convert: if convert:
s = self.convert(s) s = self.convert(s)
# 7bit/8bit encodings return the string unchanged (modulo conversions) # 7bit/8bit encodings return the string unchanged (modulo conversions)
if self.header_encoding is BASE64: if self.header_encoding == BASE64:
return email.base64MIME.header_encode(s, cset) return email.base64MIME.header_encode(s, cset)
elif self.header_encoding is QP: elif self.header_encoding == QP:
return email.quopriMIME.header_encode(s, cset)
elif self.header_encoding == SHORTEST:
lenb64 = email.base64MIME.base64_len(s)
lenqp = email.quopriMIME.header_quopri_len(s)
if lenb64 < lenqp:
return email.base64MIME.header_encode(s, cset)
else:
return email.quopriMIME.header_encode(s, cset) return email.quopriMIME.header_encode(s, cset)
else: else:
return s return s
def body_encode(self, s, convert=1): def body_encode(self, s, convert=True):
"""Body-encode a string and convert it to output_charset. """Body-encode a string and convert it to output_charset.
If convert is true (the default), the string will be converted from If convert is True (the default), the string will be converted from
the input charset to output charset automatically. Unlike the input charset to output charset automatically. Unlike
header_encode(), there are no issues with byte boundaries and header_encode(), there are no issues with byte boundaries and
multibyte charsets in email bodies, so this is usually pretty safe. multibyte charsets in email bodies, so this is usually pretty safe.