Issue #20491: The textwrap.TextWrapper class now honors non-breaking spaces.
Based on patch by Kaarle Ritvanen.
This commit is contained in:
parent
42bababba6
commit
f3ebc9fe3f
|
@ -444,6 +444,37 @@ What a mess!
|
||||||
text = "aa \xe4\xe4-\xe4\xe4"
|
text = "aa \xe4\xe4-\xe4\xe4"
|
||||||
self.check_wrap(text, 7, ["aa \xe4\xe4-", "\xe4\xe4"])
|
self.check_wrap(text, 7, ["aa \xe4\xe4-", "\xe4\xe4"])
|
||||||
|
|
||||||
|
def test_non_breaking_space(self):
|
||||||
|
text = 'This is a sentence with non-breaking\N{NO-BREAK SPACE}space.'
|
||||||
|
|
||||||
|
self.check_wrap(text, 20,
|
||||||
|
['This is a sentence',
|
||||||
|
'with non-',
|
||||||
|
'breaking\N{NO-BREAK SPACE}space.'],
|
||||||
|
break_on_hyphens=True)
|
||||||
|
|
||||||
|
self.check_wrap(text, 20,
|
||||||
|
['This is a sentence',
|
||||||
|
'with',
|
||||||
|
'non-breaking\N{NO-BREAK SPACE}space.'],
|
||||||
|
break_on_hyphens=False)
|
||||||
|
|
||||||
|
def test_narrow_non_breaking_space(self):
|
||||||
|
text = ('This is a sentence with non-breaking'
|
||||||
|
'\N{NARROW NO-BREAK SPACE}space.')
|
||||||
|
|
||||||
|
self.check_wrap(text, 20,
|
||||||
|
['This is a sentence',
|
||||||
|
'with non-',
|
||||||
|
'breaking\N{NARROW NO-BREAK SPACE}space.'],
|
||||||
|
break_on_hyphens=True)
|
||||||
|
|
||||||
|
self.check_wrap(text, 20,
|
||||||
|
['This is a sentence',
|
||||||
|
'with',
|
||||||
|
'non-breaking\N{NARROW NO-BREAK SPACE}space.'],
|
||||||
|
break_on_hyphens=False)
|
||||||
|
|
||||||
|
|
||||||
class MaxLinesTestCase(BaseTestCase):
|
class MaxLinesTestCase(BaseTestCase):
|
||||||
text = "Hello there, how are you this fine day? I'm glad to hear it!"
|
text = "Hello there, how are you this fine day? I'm glad to hear it!"
|
||||||
|
|
|
@ -10,13 +10,8 @@ import re
|
||||||
__all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten']
|
__all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten']
|
||||||
|
|
||||||
# Hardcode the recognized whitespace characters to the US-ASCII
|
# Hardcode the recognized whitespace characters to the US-ASCII
|
||||||
# whitespace characters. The main reason for doing this is that in
|
# whitespace characters. The main reason for doing this is that
|
||||||
# ISO-8859-1, 0xa0 is non-breaking whitespace, so in certain locales
|
# some Unicode spaces (like \u00a0) are non-breaking whitespaces.
|
||||||
# that character winds up in string.whitespace. Respecting
|
|
||||||
# string.whitespace in those cases would 1) make textwrap treat 0xa0 the
|
|
||||||
# same as any other whitespace char, which is clearly wrong (it's a
|
|
||||||
# *non-breaking* space), 2) possibly cause problems with Unicode,
|
|
||||||
# since 0xa0 is not in range(128).
|
|
||||||
_whitespace = '\t\n\x0b\x0c\r '
|
_whitespace = '\t\n\x0b\x0c\r '
|
||||||
|
|
||||||
class TextWrapper:
|
class TextWrapper:
|
||||||
|
@ -81,29 +76,34 @@ class TextWrapper:
|
||||||
# (after stripping out empty strings).
|
# (after stripping out empty strings).
|
||||||
word_punct = r'[\w!"\'&.,?]'
|
word_punct = r'[\w!"\'&.,?]'
|
||||||
letter = r'[^\d\W]'
|
letter = r'[^\d\W]'
|
||||||
|
whitespace = r'[%s]' % re.escape(_whitespace)
|
||||||
|
nowhitespace = '[^' + whitespace[1:]
|
||||||
wordsep_re = re.compile(r'''
|
wordsep_re = re.compile(r'''
|
||||||
( # any whitespace
|
( # any whitespace
|
||||||
\s+
|
%(ws)s+
|
||||||
| # em-dash between words
|
| # em-dash between words
|
||||||
(?<=%(wp)s) -{2,} (?=\w)
|
(?<=%(wp)s) -{2,} (?=\w)
|
||||||
| # word, possibly hyphenated
|
| # word, possibly hyphenated
|
||||||
\S+? (?:
|
%(nws)s+? (?:
|
||||||
# hyphenated word
|
# hyphenated word
|
||||||
-(?: (?<=%(lt)s{2}-) | (?<=%(lt)s-%(lt)s-))
|
-(?: (?<=%(lt)s{2}-) | (?<=%(lt)s-%(lt)s-))
|
||||||
(?= %(lt)s -? %(lt)s)
|
(?= %(lt)s -? %(lt)s)
|
||||||
| # end of word
|
| # end of word
|
||||||
(?=\s|\Z)
|
(?=%(ws)s|\Z)
|
||||||
| # em-dash
|
| # em-dash
|
||||||
(?<=%(wp)s) (?=-{2,}\w)
|
(?<=%(wp)s) (?=-{2,}\w)
|
||||||
)
|
)
|
||||||
)''' % {'wp': word_punct, 'lt': letter}, re.VERBOSE)
|
)''' % {'wp': word_punct, 'lt': letter,
|
||||||
del word_punct, letter
|
'ws': whitespace, 'nws': nowhitespace},
|
||||||
|
re.VERBOSE)
|
||||||
|
del word_punct, letter, nowhitespace
|
||||||
|
|
||||||
# This less funky little regex just splits on recognized spaces. E.g.
|
# This less funky little regex just splits on recognized spaces. E.g.
|
||||||
# "Hello there -- you goof-ball, use the -b option!"
|
# "Hello there -- you goof-ball, use the -b option!"
|
||||||
# splits into
|
# splits into
|
||||||
# Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
|
# Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
|
||||||
wordsep_simple_re = re.compile(r'(\s+)')
|
wordsep_simple_re = re.compile(r'(%s+)' % whitespace)
|
||||||
|
del whitespace
|
||||||
|
|
||||||
# XXX this is not locale- or charset-aware -- string.lowercase
|
# XXX this is not locale- or charset-aware -- string.lowercase
|
||||||
# is US-ASCII only (and therefore English-only)
|
# is US-ASCII only (and therefore English-only)
|
||||||
|
@ -112,7 +112,6 @@ class TextWrapper:
|
||||||
r'[\"\']?' # optional end-of-quote
|
r'[\"\']?' # optional end-of-quote
|
||||||
r'\Z') # end of chunk
|
r'\Z') # end of chunk
|
||||||
|
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
width=70,
|
width=70,
|
||||||
initial_indent="",
|
initial_indent="",
|
||||||
|
|
|
@ -113,6 +113,9 @@ Core and Builtins
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- Issue #20491: The textwrap.TextWrapper class now honors non-breaking spaces.
|
||||||
|
Based on patch by Kaarle Ritvanen.
|
||||||
|
|
||||||
- Issue #28353: os.fwalk() no longer fails on broken links.
|
- Issue #28353: os.fwalk() no longer fails on broken links.
|
||||||
|
|
||||||
- Issue #25464: Fixed HList.header_exists() in tkinter.tix module by adding
|
- Issue #25464: Fixed HList.header_exists() in tkinter.tix module by adding
|
||||||
|
|
Loading…
Reference in New Issue