Issue #1285086: Speed up urllib.parse functions: quote, quote_from_bytes, unquote, unquote_to_bytes.

Recorded merge of revisions 81265 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r81265 | florent.xicluna | 2010-05-17 15:35:09 +0200 (lun, 17 mai 2010) | 2 lines Issue #1285086: Speed up urllib.quote and urllib.unquote for simple cases. ........
2010-05-17 17:33:07 +00:00 · 2010-05-17 17:33:07 +00:00 · c7b8e8682e
parent 757445bee0
commit c7b8e8682e
2 changed files with 56 additions and 40 deletions
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@ -41,7 +41,7 @@ uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
 uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
-               'svn', 'svn+ssh', 'sftp', 'nfs',' git', 'git+ssh']
+               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
 non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
 uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
@ -307,17 +307,20 @@ def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
+    if not string:
+        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    res = string.split(b'%')
-    res[0] = res[0]
-    for i in range(1, len(res)):
-        item = res[i]
+    if len(res) == 1:
+        return string
+    string = res[0]
+    for item in res[1:]:
        try:
-            res[i] = bytes([int(item[:2], 16)]) + item[2:]
+            string += bytes([int(item[:2], 16)]) + item[2:]
        except ValueError:
-            res[i] = b'%' + item
-    return b''.join(res)
+            string += b'%' + item
+    return string

 def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
@ -329,36 +332,39 @@ def unquote(string, encoding='utf-8', errors='replace'):

    unquote('abc%20def') -> 'abc def'.
    """
-    if encoding is None: encoding = 'utf-8'
-    if errors is None: errors = 'replace'
-    # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
-    # (list of single-byte bytes objects)
-    pct_sequence = []
+    if not string:
+        return string
    res = string.split('%')
-    for i in range(1, len(res)):
-        item = res[i]
+    if len(res) == 1:
+        return string
+    if encoding is None:
+        encoding = 'utf-8'
+    if errors is None:
+        errors = 'replace'
+    # pct_sequence: contiguous sequence of percent-encoded bytes
+    pct_sequence = b''
+    string = res[0]
+    for item in res[1:]:
        try:
-            if not item: raise ValueError
-            pct_sequence.append(bytes.fromhex(item[:2]))
+            if not item:
+                raise ValueError
+            pct_sequence += bytes.fromhex(item[:2])
            rest = item[2:]
+            if not rest:
+                # This segment was just a single percent-encoded character.
+                # May be part of a sequence of code units, so delay decoding.
+                # (Stored in pct_sequence).
+                continue
        except ValueError:
            rest = '%' + item
-        if not rest:
-            # This segment was just a single percent-encoded character.
-            # May be part of a sequence of code units, so delay decoding.
-            # (Stored in pct_sequence).
-            res[i] = ''
-        else:
-            # Encountered non-percent-encoded characters. Flush the current
-            # pct_sequence.
-            res[i] = b''.join(pct_sequence).decode(encoding, errors) + rest
-            pct_sequence = []
+        # Encountered non-percent-encoded characters. Flush the current
+        # pct_sequence.
+        string += pct_sequence.decode(encoding, errors) + rest
+        pct_sequence = b''
    if pct_sequence:
        # Flush the final pct_sequence
-        # res[-1] will always be empty if pct_sequence != []
-        assert not res[-1], "string=%r, res=%r" % (string, res)
-        res[-1] = b''.join(pct_sequence).decode(encoding, errors)
-    return ''.join(res)
+        string += pct_sequence.decode(encoding, errors)
+    return string

 def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query given as a string argument.
@ -439,7 +445,8 @@ _ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-')
-_safe_quoters= {}
+_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
+_safe_quoters = {}

 class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.
@ -451,7 +458,7 @@ class Quoter(collections.defaultdict):
    # of cached keys don't call Python code at all).
    def __init__(self, safe):
        """safe: bytes object."""
-        self.safe = _ALWAYS_SAFE.union(c for c in safe if c < 128)
+        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this, will just display as a defaultdict
@ -459,7 +466,7 @@ class Quoter(collections.defaultdict):

    def __missing__(self, b):
        # Handle a cache miss. Store quoted string in cache and return.
-        res = b in self.safe and chr(b) or ('%%%02X' % b)
+        res = chr(b) if b in self.safe else '%{:02X}'.format(b)
        self[b] = res
        return res

@ -493,6 +500,8 @@ def quote(string, safe='/', encoding=None, errors=None):
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if isinstance(string, str):
+        if not string:
+            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
@ -527,18 +536,22 @@ def quote_from_bytes(bs, safe='/'):
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'
    """
+    if not isinstance(bs, (bytes, bytearray)):
+        raise TypeError("quote_from_bytes() expected bytes")
+    if not bs:
+        return ''
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
-    cachekey = bytes(safe)  # In case it was a bytearray
-    if not (isinstance(bs, bytes) or isinstance(bs, bytearray)):
-        raise TypeError("quote_from_bytes() expected a bytes")
+    else:
+        safe = bytes([c for c in safe if c < 128])
+    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
+        return bs.decode()
    try:
-        quoter = _safe_quoters[cachekey]
+        quoter = _safe_quoters[safe]
    except KeyError:
-        quoter = Quoter(safe)
-        _safe_quoters[cachekey] = quoter
-    return ''.join([quoter[char] for char in bs])
+        _safe_quoters[safe] = quoter = Quoter(safe).__getitem__
+    return ''.join([quoter(char) for char in bs])

 def urlencode(query, doseq=False):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -366,6 +366,9 @@ C-API
 Library
 -------

+- Issue #1285086: Speed up urllib.parse functions: quote, quote_from_bytes,
+  unquote, unquote_to_bytes.
+
 - Issue #8688: Distutils now recalculates MANIFEST everytime.

 - Issue #8477: ssl.RAND_egd() and ssl._test_decode_cert() support str with