diff --git a/Lib/test/test_textwrap.py b/Lib/test/test_textwrap.py index a44184fbc77..5a33c151642 100644 --- a/Lib/test/test_textwrap.py +++ b/Lib/test/test_textwrap.py @@ -444,6 +444,37 @@ What a mess! text = "aa \xe4\xe4-\xe4\xe4" self.check_wrap(text, 7, ["aa \xe4\xe4-", "\xe4\xe4"]) + def test_non_breaking_space(self): + text = 'This is a sentence with non-breaking\N{NO-BREAK SPACE}space.' + + self.check_wrap(text, 20, + ['This is a sentence', + 'with non-', + 'breaking\N{NO-BREAK SPACE}space.'], + break_on_hyphens=True) + + self.check_wrap(text, 20, + ['This is a sentence', + 'with', + 'non-breaking\N{NO-BREAK SPACE}space.'], + break_on_hyphens=False) + + def test_narrow_non_breaking_space(self): + text = ('This is a sentence with non-breaking' + '\N{NARROW NO-BREAK SPACE}space.') + + self.check_wrap(text, 20, + ['This is a sentence', + 'with non-', + 'breaking\N{NARROW NO-BREAK SPACE}space.'], + break_on_hyphens=True) + + self.check_wrap(text, 20, + ['This is a sentence', + 'with', + 'non-breaking\N{NARROW NO-BREAK SPACE}space.'], + break_on_hyphens=False) + class MaxLinesTestCase(BaseTestCase): text = "Hello there, how are you this fine day? I'm glad to hear it!" diff --git a/Lib/textwrap.py b/Lib/textwrap.py index 05e030673a6..0c18dc582e1 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -10,13 +10,8 @@ import re __all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten'] # Hardcode the recognized whitespace characters to the US-ASCII -# whitespace characters. The main reason for doing this is that in -# ISO-8859-1, 0xa0 is non-breaking whitespace, so in certain locales -# that character winds up in string.whitespace. Respecting -# string.whitespace in those cases would 1) make textwrap treat 0xa0 the -# same as any other whitespace char, which is clearly wrong (it's a -# *non-breaking* space), 2) possibly cause problems with Unicode, -# since 0xa0 is not in range(128). +# whitespace characters. The main reason for doing this is that +# some Unicode spaces (like \u00a0) are non-breaking whitespaces. _whitespace = '\t\n\x0b\x0c\r ' class TextWrapper: @@ -81,29 +76,34 @@ class TextWrapper: # (after stripping out empty strings). word_punct = r'[\w!"\'&.,?]' letter = r'[^\d\W]' + whitespace = r'[%s]' % re.escape(_whitespace) + nowhitespace = '[^' + whitespace[1:] wordsep_re = re.compile(r''' ( # any whitespace - \s+ + %(ws)s+ | # em-dash between words (?<=%(wp)s) -{2,} (?=\w) | # word, possibly hyphenated - \S+? (?: + %(nws)s+? (?: # hyphenated word -(?: (?<=%(lt)s{2}-) | (?<=%(lt)s-%(lt)s-)) (?= %(lt)s -? %(lt)s) | # end of word - (?=\s|\Z) + (?=%(ws)s|\Z) | # em-dash (?<=%(wp)s) (?=-{2,}\w) ) - )''' % {'wp': word_punct, 'lt': letter}, re.VERBOSE) - del word_punct, letter + )''' % {'wp': word_punct, 'lt': letter, + 'ws': whitespace, 'nws': nowhitespace}, + re.VERBOSE) + del word_punct, letter, nowhitespace # This less funky little regex just split on recognized spaces. E.g. # "Hello there -- you goof-ball, use the -b option!" # splits into # Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/ - wordsep_simple_re = re.compile(r'(\s+)') + wordsep_simple_re = re.compile(r'(%s+)' % whitespace) + del whitespace # XXX this is not locale- or charset-aware -- string.lowercase # is US-ASCII only (and therefore English-only) @@ -112,7 +112,6 @@ class TextWrapper: r'[\"\']?' # optional end-of-quote r'\Z') # end of chunk - def __init__(self, width=70, initial_indent="", diff --git a/Misc/NEWS b/Misc/NEWS index ea90cbb9da6..c4405716347 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -113,6 +113,9 @@ Core and Builtins Library ------- +- Issue #20491: The textwrap.TextWrapper class now honors non-breaking spaces. + Based on patch by Kaarle Ritvanen. + - Issue #28353: os.fwalk() no longer fails on broken links. - Issue #25464: Fixed HList.header_exists() in tkinter.tix module by addin