bpo-36502: Correct documentation of str.isspace() (GH-15019)
The documented definition was much broader than the real one: there are tons of characters with general category "Other", and we don't (and shouldn't) treat most of them as whitespace. Rewrite the definition to agree with the comment on _PyUnicode_IsWhitespace, and with the logic in makeunicodedata.py, which is what generates that function and so ultimately governs. Add suitable breadcrumbs so that a reader who wants to pin down exactly what this definition means (what's a "bidirectional class" of "B"?) can do so. The `unicodedata` module documentation is an appropriate central place for our references to Unicode's own copious documentation, so point there. Also add to the isspace() test a thorough check that the implementation agrees with the intended definition.
This commit is contained in:
parent
077af8c2c9
commit
6bccbe7dfb
|
@ -1763,9 +1763,13 @@ expression support in the :mod:`re` module).
|
|||
.. method:: str.isspace()
|
||||
|
||||
Return true if there are only whitespace characters in the string and there is
|
||||
at least one character, false otherwise. Whitespace characters are those
|
||||
characters defined in the Unicode character database as "Other" or "Separator"
|
||||
and those with bidirectional property being one of "WS", "B", or "S".
|
||||
at least one character, false otherwise.
|
||||
|
||||
A character is *whitespace* if in the Unicode character database
|
||||
(see :mod:`unicodedata`), either its general category is ``Zs``
|
||||
("Separator, space"), or its bidirectional class is one of ``WS``,
|
||||
``B``, or ``S``.
|
||||
|
||||
|
||||
.. method:: str.istitle()
|
||||
|
||||
|
|
|
@ -12,6 +12,7 @@ import operator
|
|||
import struct
|
||||
import sys
|
||||
import textwrap
|
||||
import unicodedata
|
||||
import unittest
|
||||
import warnings
|
||||
from test import support, string_tests
|
||||
|
@ -617,11 +618,21 @@ class UnicodeTest(string_tests.CommonTest,
|
|||
self.checkequalnofix(True, '\u2000', 'isspace')
|
||||
self.checkequalnofix(True, '\u200a', 'isspace')
|
||||
self.checkequalnofix(False, '\u2014', 'isspace')
|
||||
# apparently there are no non-BMP spaces chars in Unicode 6
|
||||
# There are no non-BMP whitespace chars as of Unicode 12.
|
||||
for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
|
||||
'\U0001F40D', '\U0001F46F']:
|
||||
self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch))
|
||||
|
||||
@support.requires_resource('cpu')
def test_isspace_invariant(self):
    # Compare str.isspace() with the documented definition for every
    # code point up to sys.maxunicode: a character is whitespace if its
    # general category is "Zs" or its bidirectional class is one of
    # "WS", "B" or "S" (see the unicodedata module).
    whitespace_bidi_classes = ('WS', 'B', 'S')
    for codepoint in range(sys.maxunicode + 1):
        char = chr(codepoint)
        bidirectional = unicodedata.bidirectional(char)
        category = unicodedata.category(char)
        expected = (bidirectional in whitespace_bidi_classes
                    or category == 'Zs')
        actual = char.isspace()
        if actual != expected:
            # Format the message only on a mismatch: a bare
            # "True != False" would not say which of the many code
            # points disagreed, and building a message on every
            # iteration would slow the loop for no benefit.
            self.fail(
                'U+{:04X}: isspace() returned {!r}, expected {!r} '
                '(bidirectional class {!r}, general category {!r})'
                .format(codepoint, actual, expected,
                        bidirectional, category))
|
||||
|
||||
def test_isalnum(self):
|
||||
super().test_isalnum()
|
||||
for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
|
||||
|
|
Loading…
Reference in New Issue