Issue #20491: The textwrap.TextWrapper class now honors non-breaking spaces.

Based on patch by Kaarle Ritvanen.
This commit is contained in:
Serhiy Storchaka 2016-10-25 14:44:54 +03:00
parent 42bababba6
commit f3ebc9fe3f
3 changed files with 47 additions and 14 deletions

View File

@ -444,6 +444,37 @@ What a mess!
text = "aa \xe4\xe4-\xe4\xe4" text = "aa \xe4\xe4-\xe4\xe4"
self.check_wrap(text, 7, ["aa \xe4\xe4-", "\xe4\xe4"]) self.check_wrap(text, 7, ["aa \xe4\xe4-", "\xe4\xe4"])
def test_non_breaking_space(self):
    # Wrap a sentence containing U+00A0 at width 20 and verify that the
    # no-break space stays inside its chunk in both hyphen modes.
    text = 'This is a sentence with non-breaking\N{NO-BREAK SPACE}space.'

    # (break_on_hyphens value, expected wrapped lines), checked in order.
    cases = (
        (True, ['This is a sentence',
                'with non-',
                'breaking\N{NO-BREAK SPACE}space.']),
        (False, ['This is a sentence',
                 'with',
                 'non-breaking\N{NO-BREAK SPACE}space.']),
    )
    for hyphen_mode, expected_lines in cases:
        self.check_wrap(text, 20, expected_lines,
                        break_on_hyphens=hyphen_mode)
def test_narrow_non_breaking_space(self):
    # Same scenario as the NO-BREAK SPACE case, but with U+202F
    # (NARROW NO-BREAK SPACE): wrapping at width 20 must keep the
    # character inside its chunk in both hyphen modes.
    text = ('This is a sentence with non-breaking'
            '\N{NARROW NO-BREAK SPACE}space.')

    # (break_on_hyphens value, expected wrapped lines), checked in order.
    cases = (
        (True, ['This is a sentence',
                'with non-',
                'breaking\N{NARROW NO-BREAK SPACE}space.']),
        (False, ['This is a sentence',
                 'with',
                 'non-breaking\N{NARROW NO-BREAK SPACE}space.']),
    )
    for hyphen_mode, expected_lines in cases:
        self.check_wrap(text, 20, expected_lines,
                        break_on_hyphens=hyphen_mode)
class MaxLinesTestCase(BaseTestCase): class MaxLinesTestCase(BaseTestCase):
text = "Hello there, how are you this fine day? I'm glad to hear it!" text = "Hello there, how are you this fine day? I'm glad to hear it!"

View File

@ -10,13 +10,8 @@ import re
__all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten'] __all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten']
# Hardcode the recognized whitespace characters to the US-ASCII # Hardcode the recognized whitespace characters to the US-ASCII
# whitespace characters. The main reason for doing this is that in # whitespace characters. The main reason for doing this is that
# ISO-8859-1, 0xa0 is non-breaking whitespace, so in certain locales # some Unicode spaces (like \u00a0) are non-breaking whitespaces.
# that character winds up in string.whitespace. Respecting
# string.whitespace in those cases would 1) make textwrap treat 0xa0 the
# same as any other whitespace char, which is clearly wrong (it's a
# *non-breaking* space), 2) possibly cause problems with Unicode,
# since 0xa0 is not in range(128).
_whitespace = '\t\n\x0b\x0c\r ' _whitespace = '\t\n\x0b\x0c\r '
class TextWrapper: class TextWrapper:
@ -81,29 +76,34 @@ class TextWrapper:
# (after stripping out empty strings). # (after stripping out empty strings).
word_punct = r'[\w!"\'&.,?]' word_punct = r'[\w!"\'&.,?]'
letter = r'[^\d\W]' letter = r'[^\d\W]'
whitespace = r'[%s]' % re.escape(_whitespace)
nowhitespace = '[^' + whitespace[1:]
wordsep_re = re.compile(r''' wordsep_re = re.compile(r'''
( # any whitespace ( # any whitespace
\s+ %(ws)s+
| # em-dash between words | # em-dash between words
(?<=%(wp)s) -{2,} (?=\w) (?<=%(wp)s) -{2,} (?=\w)
| # word, possibly hyphenated | # word, possibly hyphenated
\S+? (?: %(nws)s+? (?:
# hyphenated word # hyphenated word
-(?: (?<=%(lt)s{2}-) | (?<=%(lt)s-%(lt)s-)) -(?: (?<=%(lt)s{2}-) | (?<=%(lt)s-%(lt)s-))
(?= %(lt)s -? %(lt)s) (?= %(lt)s -? %(lt)s)
| # end of word | # end of word
(?=\s|\Z) (?=%(ws)s|\Z)
| # em-dash | # em-dash
(?<=%(wp)s) (?=-{2,}\w) (?<=%(wp)s) (?=-{2,}\w)
) )
)''' % {'wp': word_punct, 'lt': letter}, re.VERBOSE) )''' % {'wp': word_punct, 'lt': letter,
del word_punct, letter 'ws': whitespace, 'nws': nowhitespace},
re.VERBOSE)
del word_punct, letter, nowhitespace
# This less funky little regex just splits on recognized spaces. E.g. # This less funky little regex just splits on recognized spaces. E.g.
# "Hello there -- you goof-ball, use the -b option!" # "Hello there -- you goof-ball, use the -b option!"
# splits into # splits into
# Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/ # Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
wordsep_simple_re = re.compile(r'(\s+)') wordsep_simple_re = re.compile(r'(%s+)' % whitespace)
del whitespace
# XXX this is not locale- or charset-aware -- string.lowercase # XXX this is not locale- or charset-aware -- string.lowercase
# is US-ASCII only (and therefore English-only) # is US-ASCII only (and therefore English-only)
@ -112,7 +112,6 @@ class TextWrapper:
r'[\"\']?' # optional end-of-quote r'[\"\']?' # optional end-of-quote
r'\Z') # end of chunk r'\Z') # end of chunk
def __init__(self, def __init__(self,
width=70, width=70,
initial_indent="", initial_indent="",

View File

@ -113,6 +113,9 @@ Core and Builtins
Library Library
------- -------
- Issue #20491: The textwrap.TextWrapper class now honors non-breaking spaces.
Based on patch by Kaarle Ritvanen.
- Issue #28353: os.fwalk() no longer fails on broken links. - Issue #28353: os.fwalk() no longer fails on broken links.
- Issue #25464: Fixed HList.header_exists() in tkinter.tix module by adding - Issue #25464: Fixed HList.header_exists() in tkinter.tix module by adding