From 6f95ae55b151f366396e7d99b829c27f27259a89 Mon Sep 17 00:00:00 2001 From: Georg Brandl Date: Sun, 11 May 2008 10:42:28 +0000 Subject: [PATCH] #2659: add ``break_on_hyphens`` to TextWrapper. --- Doc/library/textwrap.rst | 18 +++++++++++++++++- Lib/test/test_textwrap.py | 8 ++++++++ Lib/textwrap.py | 23 +++++++++++++++++++++-- Misc/NEWS | 2 ++ 4 files changed, 48 insertions(+), 3 deletions(-) diff --git a/Doc/library/textwrap.rst b/Doc/library/textwrap.rst index 7a52eb9ba43..a2db567fb5b 100644 --- a/Doc/library/textwrap.rst +++ b/Doc/library/textwrap.rst @@ -41,6 +41,10 @@ instance and calling a single method on it. That instance is not reused, so for applications that wrap/fill many text strings, it will be more efficient for you to create your own :class:`TextWrapper` object. +Text is preferably wrapped on whitespaces and right after the hyphens in +hyphenated words; only then will long words be broken if necessary, unless +:attr:`TextWrapper.break_long_words` is set to false. + An additional utility function, :func:`dedent`, is provided to remove indentation from strings that have unwanted whitespace to the left of the text. @@ -174,10 +178,22 @@ indentation from strings that have unwanted whitespace to the left of the text. than :attr:`width`. (Long words will be put on a line by themselves, in order to minimize the amount by which :attr:`width` is exceeded.) + + .. attribute:: break_on_hyphens + + (default: ``True``) If true, wrapping will occur preferably on whitespaces + and right after hyphens in compound words, as is customary in English. + If false, only whitespaces will be considered as potentially good places + for line breaks, but you need to set :attr:`break_long_words` to false if + you want truly unbreakable words. Default behaviour in previous versions + was to always allow breaking hyphenated words. + + .. versionadded:: 2.6 + + :class:`TextWrapper` also provides two public methods, analogous to the module-level convenience functions: - .. 
method:: wrap(text) Wraps the single paragraph in *text* (a string) so every line is at most diff --git a/Lib/test/test_textwrap.py b/Lib/test/test_textwrap.py index c1c09f6a3ee..1eab90cfc44 100644 --- a/Lib/test/test_textwrap.py +++ b/Lib/test/test_textwrap.py @@ -364,6 +364,14 @@ What a mess! ["Hello", " ", "there", " ", "--", " ", "you", " ", "goof-", "ball,", " ", "use", " ", "the", " ", "-b", " ", "option!"]) + def test_break_on_hyphens(self): + # Ensure that the break_on_hyphens attribute works + text = "yaba daba-doo" + self.check_wrap(text, 10, ["yaba daba-", "doo"], + break_on_hyphens=True) + self.check_wrap(text, 10, ["yaba", "daba-doo"], + break_on_hyphens=False) + def test_bad_width(self): # Ensure that width <= 0 is caught. text = "Whatever, it doesn't matter." diff --git a/Lib/textwrap.py b/Lib/textwrap.py index ffbb9d16341..7cd05971c0e 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -63,6 +63,10 @@ class TextWrapper: break_long_words (default: true) Break words longer than 'width'. If false, those words will not be broken, and some lines might be longer than 'width'. + break_on_hyphens (default: true) + Allow breaking hyphenated words. If true, wrapping will occur + preferably on whitespaces and right after hyphens that are part of + compound words. drop_whitespace (default: true) Drop leading and trailing whitespace from lines. """ @@ -85,6 +89,12 @@ class TextWrapper: r'[^\s\w]*\w+[a-zA-Z]-(?=\w+[a-zA-Z])|' # hyphenated words r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))') # em-dash + # This less funky little regex just splits on recognized spaces. E.g. + # "Hello there -- you goof-ball, use the -b option!" 
+ # splits into + # Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/ + wordsep_simple_re = re.compile(r'(\s+)') + # XXX this is not locale- or charset-aware -- string.lowercase # is US-ASCII only (and therefore English-only) sentence_end_re = re.compile(r'[%s]' # lowercase letter @@ -102,7 +112,8 @@ class TextWrapper: replace_whitespace=True, fix_sentence_endings=False, break_long_words=True, - drop_whitespace=True): + drop_whitespace=True, + break_on_hyphens=True): self.width = width self.initial_indent = initial_indent self.subsequent_indent = subsequent_indent @@ -111,6 +122,7 @@ class TextWrapper: self.fix_sentence_endings = fix_sentence_endings self.break_long_words = break_long_words self.drop_whitespace = drop_whitespace + self.break_on_hyphens = break_on_hyphens # -- Private methods ----------------------------------------------- @@ -143,8 +155,15 @@ class TextWrapper: breaks into the following chunks: 'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ', 'use', ' ', 'the', ' ', '-b', ' ', 'option!' + if break_on_hyphens is True, or in: + 'Look,', ' ', 'goof-ball', ' ', '--', ' ', + 'use', ' ', 'the', ' ', '-b', ' ', 'option!' + otherwise. """ - chunks = self.wordsep_re.split(text) + if self.break_on_hyphens is True: + chunks = self.wordsep_re.split(text) + else: + chunks = self.wordsep_simple_re.split(text) chunks = filter(None, chunks) # remove empty chunks return chunks diff --git a/Misc/NEWS b/Misc/NEWS index d796735e2c3..9007462f9a5 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -23,6 +23,8 @@ Extension Modules Library ------- +- #2659: Added ``break_on_hyphens`` option to textwrap TextWrapper class. + - The mhlib module has been deprecated for removal in Python 3.0. - The linuxaudiodev module has been deprecated for removal in Python 3.0.