mirror of https://github.com/python/cpython
Issue #4163: Use unicode-friendly word splitting in the textwrap functions when given a unicode string.
This commit is contained in:
parent
9f35070a6b
commit
74af3bbfbd
|
@ -174,7 +174,7 @@ What a mess!
|
|||
text = ("Python 1.0.0 was released on 1994-01-26. Python 1.0.1 was\n"
|
||||
"released on 1994-02-15.")
|
||||
|
||||
self.check_wrap(text, 30, ['Python 1.0.0 was released on',
|
||||
self.check_wrap(text, 35, ['Python 1.0.0 was released on',
|
||||
'1994-01-26. Python 1.0.1 was',
|
||||
'released on 1994-02-15.'])
|
||||
self.check_wrap(text, 40, ['Python 1.0.0 was released on 1994-01-26.',
|
||||
|
@ -353,6 +353,14 @@ What a mess!
|
|||
otext = self.wrapper.fill(text)
|
||||
assert isinstance(otext, unicode)
|
||||
|
||||
def test_no_split_at_umlaut(self):
|
||||
text = u"Die Empf\xe4nger-Auswahl"
|
||||
self.check_wrap(text, 13, [u"Die", u"Empf\xe4nger-", u"Auswahl"])
|
||||
|
||||
def test_umlaut_followed_by_dash(self):
|
||||
text = u"aa \xe4\xe4-\xe4\xe4"
|
||||
self.check_wrap(text, 7, [u"aa \xe4\xe4-", u"\xe4\xe4"])
|
||||
|
||||
def test_split(self):
|
||||
# Ensure that the standard _split() method works as advertised
|
||||
# in the comments
|
||||
|
|
|
@ -84,16 +84,16 @@ class TextWrapper:
|
|||
# splits into
|
||||
# Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
|
||||
# (after stripping out empty strings).
|
||||
wordsep_re = re.compile(
|
||||
wordsep_re = (
|
||||
r'(\s+|' # any whitespace
|
||||
r'[^\s\w]*\w+[a-zA-Z]-(?=\w+[a-zA-Z])|' # hyphenated words
|
||||
r'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|' # hyphenated words
|
||||
r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))') # em-dash
|
||||
|
||||
# This less funky little regex just splits on recognized spaces. E.g.
|
||||
# "Hello there -- you goof-ball, use the -b option!"
|
||||
# splits into
|
||||
# Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
|
||||
wordsep_simple_re = re.compile(r'(\s+)')
|
||||
wordsep_simple_re = r'(\s+)'
|
||||
|
||||
# XXX this is not locale- or charset-aware -- string.lowercase
|
||||
# is US-ASCII only (and therefore English-only)
|
||||
|
@ -160,10 +160,12 @@ class TextWrapper:
|
|||
'use', ' ', 'the', ' ', '-b', ' ', 'option!'
|
||||
otherwise.
|
||||
"""
|
||||
if self.break_on_hyphens is True:
|
||||
chunks = self.wordsep_re.split(text)
|
||||
flags = re.UNICODE if isinstance(text, unicode) else 0
|
||||
if self.break_on_hyphens:
|
||||
pat = self.wordsep_re
|
||||
else:
|
||||
chunks = self.wordsep_simple_re.split(text)
|
||||
pat = self.wordsep_simple_re
|
||||
chunks = re.compile(pat, flags).split(text)
|
||||
chunks = filter(None, chunks) # remove empty chunks
|
||||
return chunks
|
||||
|
||||
|
|
|
@ -74,6 +74,9 @@ Core and Builtins
|
|||
Library
|
||||
-------
|
||||
|
||||
- Issue #4163: Use unicode-friendly word splitting in the textwrap functions
|
||||
when given a unicode string.
|
||||
|
||||
- Issue #4616: TarFile.utime(): Restore directory times on Windows.
|
||||
|
||||
- Issue #4084: Fix max, min, max_mag and min_mag Decimal methods to
|
||||
|
|
Loading…
Reference in New Issue