diff --git a/Lib/test/test_textwrap.py b/Lib/test/test_textwrap.py index 1eab90cfc44..c91e242dfdb 100644 --- a/Lib/test/test_textwrap.py +++ b/Lib/test/test_textwrap.py @@ -174,7 +174,7 @@ What a mess! text = ("Python 1.0.0 was released on 1994-01-26. Python 1.0.1 was\n" "released on 1994-02-15.") - self.check_wrap(text, 30, ['Python 1.0.0 was released on', + self.check_wrap(text, 35, ['Python 1.0.0 was released on', '1994-01-26. Python 1.0.1 was', 'released on 1994-02-15.']) self.check_wrap(text, 40, ['Python 1.0.0 was released on 1994-01-26.', @@ -353,6 +353,14 @@ What a mess! otext = self.wrapper.fill(text) assert isinstance(otext, unicode) + def test_no_split_at_umlaut(self): + text = u"Die Empf\xe4nger-Auswahl" + self.check_wrap(text, 13, [u"Die", u"Empf\xe4nger-", u"Auswahl"]) + + def test_umlaut_followed_by_dash(self): + text = u"aa \xe4\xe4-\xe4\xe4" + self.check_wrap(text, 7, [u"aa \xe4\xe4-", u"\xe4\xe4"]) + def test_split(self): # Ensure that the standard _split() method works as advertised # in the comments diff --git a/Lib/textwrap.py b/Lib/textwrap.py index 53f2f1bac7c..192b43b1df9 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -84,16 +84,16 @@ class TextWrapper: # splits into # Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option! # (after stripping out empty strings). - wordsep_re = re.compile( + wordsep_re = ( r'(\s+|' # any whitespace - r'[^\s\w]*\w+[a-zA-Z]-(?=\w+[a-zA-Z])|' # hyphenated words + r'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|' # hyphenated words r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))') # em-dash # This less funky little regex just split on recognized spaces. E.g. # "Hello there -- you goof-ball, use the -b option!" 
# splits into # Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/ - wordsep_simple_re = re.compile(r'(\s+)') + wordsep_simple_re = r'(\s+)' # XXX this is not locale- or charset-aware -- string.lowercase # is US-ASCII only (and therefore English-only) @@ -160,10 +160,12 @@ class TextWrapper: 'use', ' ', 'the', ' ', '-b', ' ', option!' otherwise. """ - if self.break_on_hyphens is True: - chunks = self.wordsep_re.split(text) + flags = re.UNICODE if isinstance(text, unicode) else 0 + if self.break_on_hyphens: + pat = self.wordsep_re else: - chunks = self.wordsep_simple_re.split(text) + pat = self.wordsep_simple_re + chunks = re.compile(pat, flags).split(text) chunks = filter(None, chunks) # remove empty chunks return chunks diff --git a/Misc/NEWS b/Misc/NEWS index 9a3e33813f2..1ebcfe32466 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -74,6 +74,9 @@ Core and Builtins Library ------- +- Issue #4163: Use unicode-friendly word splitting in the textwrap functions + when given a unicode string. + - Issue #4616: TarFile.utime(): Restore directory times on Windows. - Issue #4084: Fix max, min, max_mag and min_mag Decimal methods to