mirror of https://github.com/python/cpython
Issue #1285086: Speed up urllib.parse functions: quote, quote_from_bytes, unquote, unquote_to_bytes.
Recorded merge of revisions 81265 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk

........
  r81265 | florent.xicluna | 2010-05-17 15:35:09 +0200 (lun, 17 mai 2010) | 2 lines

  Issue #1285086: Speed up urllib.quote and urllib.unquote for simple cases.
........
This commit is contained in:
parent
757445bee0
commit
c7b8e8682e
|
@ -41,7 +41,7 @@ uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
|
||||||
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
|
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
|
||||||
'imap', 'wais', 'file', 'mms', 'https', 'shttp',
|
'imap', 'wais', 'file', 'mms', 'https', 'shttp',
|
||||||
'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
|
'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
|
||||||
'svn', 'svn+ssh', 'sftp', 'nfs',' git', 'git+ssh']
|
'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
|
||||||
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
|
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
|
||||||
'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
|
'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
|
||||||
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
|
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
|
||||||
|
@ -307,17 +307,20 @@ def unquote_to_bytes(string):
|
||||||
"""unquote_to_bytes('abc%20def') -> b'abc def'."""
|
"""unquote_to_bytes('abc%20def') -> b'abc def'."""
|
||||||
# Note: strings are encoded as UTF-8. This is only an issue if it contains
|
# Note: strings are encoded as UTF-8. This is only an issue if it contains
|
||||||
# unescaped non-ASCII characters, which URIs should not.
|
# unescaped non-ASCII characters, which URIs should not.
|
||||||
|
if not string:
|
||||||
|
return b''
|
||||||
if isinstance(string, str):
|
if isinstance(string, str):
|
||||||
string = string.encode('utf-8')
|
string = string.encode('utf-8')
|
||||||
res = string.split(b'%')
|
res = string.split(b'%')
|
||||||
res[0] = res[0]
|
if len(res) == 1:
|
||||||
for i in range(1, len(res)):
|
return string
|
||||||
item = res[i]
|
string = res[0]
|
||||||
|
for item in res[1:]:
|
||||||
try:
|
try:
|
||||||
res[i] = bytes([int(item[:2], 16)]) + item[2:]
|
string += bytes([int(item[:2], 16)]) + item[2:]
|
||||||
except ValueError:
|
except ValueError:
|
||||||
res[i] = b'%' + item
|
string += b'%' + item
|
||||||
return b''.join(res)
|
return string
|
||||||
|
|
||||||
def unquote(string, encoding='utf-8', errors='replace'):
|
def unquote(string, encoding='utf-8', errors='replace'):
|
||||||
"""Replace %xx escapes by their single-character equivalent. The optional
|
"""Replace %xx escapes by their single-character equivalent. The optional
|
||||||
|
@ -329,36 +332,39 @@ def unquote(string, encoding='utf-8', errors='replace'):
|
||||||
|
|
||||||
unquote('abc%20def') -> 'abc def'.
|
unquote('abc%20def') -> 'abc def'.
|
||||||
"""
|
"""
|
||||||
if encoding is None: encoding = 'utf-8'
|
if not string:
|
||||||
if errors is None: errors = 'replace'
|
return string
|
||||||
# pct_sequence: contiguous sequence of percent-encoded bytes, decoded
|
|
||||||
# (list of single-byte bytes objects)
|
|
||||||
pct_sequence = []
|
|
||||||
res = string.split('%')
|
res = string.split('%')
|
||||||
for i in range(1, len(res)):
|
if len(res) == 1:
|
||||||
item = res[i]
|
return string
|
||||||
|
if encoding is None:
|
||||||
|
encoding = 'utf-8'
|
||||||
|
if errors is None:
|
||||||
|
errors = 'replace'
|
||||||
|
# pct_sequence: contiguous sequence of percent-encoded bytes
|
||||||
|
pct_sequence = b''
|
||||||
|
string = res[0]
|
||||||
|
for item in res[1:]:
|
||||||
try:
|
try:
|
||||||
if not item: raise ValueError
|
if not item:
|
||||||
pct_sequence.append(bytes.fromhex(item[:2]))
|
raise ValueError
|
||||||
|
pct_sequence += bytes.fromhex(item[:2])
|
||||||
rest = item[2:]
|
rest = item[2:]
|
||||||
|
if not rest:
|
||||||
|
# This segment was just a single percent-encoded character.
|
||||||
|
# May be part of a sequence of code units, so delay decoding.
|
||||||
|
# (Stored in pct_sequence).
|
||||||
|
continue
|
||||||
except ValueError:
|
except ValueError:
|
||||||
rest = '%' + item
|
rest = '%' + item
|
||||||
if not rest:
|
# Encountered non-percent-encoded characters. Flush the current
|
||||||
# This segment was just a single percent-encoded character.
|
# pct_sequence.
|
||||||
# May be part of a sequence of code units, so delay decoding.
|
string += pct_sequence.decode(encoding, errors) + rest
|
||||||
# (Stored in pct_sequence).
|
pct_sequence = b''
|
||||||
res[i] = ''
|
|
||||||
else:
|
|
||||||
# Encountered non-percent-encoded characters. Flush the current
|
|
||||||
# pct_sequence.
|
|
||||||
res[i] = b''.join(pct_sequence).decode(encoding, errors) + rest
|
|
||||||
pct_sequence = []
|
|
||||||
if pct_sequence:
|
if pct_sequence:
|
||||||
# Flush the final pct_sequence
|
# Flush the final pct_sequence
|
||||||
# res[-1] will always be empty if pct_sequence != []
|
string += pct_sequence.decode(encoding, errors)
|
||||||
assert not res[-1], "string=%r, res=%r" % (string, res)
|
return string
|
||||||
res[-1] = b''.join(pct_sequence).decode(encoding, errors)
|
|
||||||
return ''.join(res)
|
|
||||||
|
|
||||||
def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
|
def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
|
||||||
"""Parse a query given as a string argument.
|
"""Parse a query given as a string argument.
|
||||||
|
@ -439,7 +445,8 @@ _ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
|
||||||
b'abcdefghijklmnopqrstuvwxyz'
|
b'abcdefghijklmnopqrstuvwxyz'
|
||||||
b'0123456789'
|
b'0123456789'
|
||||||
b'_.-')
|
b'_.-')
|
||||||
_safe_quoters= {}
|
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
|
||||||
|
_safe_quoters = {}
|
||||||
|
|
||||||
class Quoter(collections.defaultdict):
|
class Quoter(collections.defaultdict):
|
||||||
"""A mapping from bytes (in range(0,256)) to strings.
|
"""A mapping from bytes (in range(0,256)) to strings.
|
||||||
|
@ -451,7 +458,7 @@ class Quoter(collections.defaultdict):
|
||||||
# of cached keys don't call Python code at all).
|
# of cached keys don't call Python code at all).
|
||||||
def __init__(self, safe):
|
def __init__(self, safe):
|
||||||
"""safe: bytes object."""
|
"""safe: bytes object."""
|
||||||
self.safe = _ALWAYS_SAFE.union(c for c in safe if c < 128)
|
self.safe = _ALWAYS_SAFE.union(safe)
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
# Without this, will just display as a defaultdict
|
# Without this, will just display as a defaultdict
|
||||||
|
@ -459,7 +466,7 @@ class Quoter(collections.defaultdict):
|
||||||
|
|
||||||
def __missing__(self, b):
|
def __missing__(self, b):
|
||||||
# Handle a cache miss. Store quoted string in cache and return.
|
# Handle a cache miss. Store quoted string in cache and return.
|
||||||
res = b in self.safe and chr(b) or ('%%%02X' % b)
|
res = chr(b) if b in self.safe else '%{:02X}'.format(b)
|
||||||
self[b] = res
|
self[b] = res
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
@ -493,6 +500,8 @@ def quote(string, safe='/', encoding=None, errors=None):
|
||||||
errors='strict' (unsupported characters raise a UnicodeEncodeError).
|
errors='strict' (unsupported characters raise a UnicodeEncodeError).
|
||||||
"""
|
"""
|
||||||
if isinstance(string, str):
|
if isinstance(string, str):
|
||||||
|
if not string:
|
||||||
|
return string
|
||||||
if encoding is None:
|
if encoding is None:
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
if errors is None:
|
if errors is None:
|
||||||
|
@ -527,18 +536,22 @@ def quote_from_bytes(bs, safe='/'):
|
||||||
not perform string-to-bytes encoding. It always returns an ASCII string.
|
not perform string-to-bytes encoding. It always returns an ASCII string.
|
||||||
quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'
|
quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'
|
||||||
"""
|
"""
|
||||||
|
if not isinstance(bs, (bytes, bytearray)):
|
||||||
|
raise TypeError("quote_from_bytes() expected bytes")
|
||||||
|
if not bs:
|
||||||
|
return ''
|
||||||
if isinstance(safe, str):
|
if isinstance(safe, str):
|
||||||
# Normalize 'safe' by converting to bytes and removing non-ASCII chars
|
# Normalize 'safe' by converting to bytes and removing non-ASCII chars
|
||||||
safe = safe.encode('ascii', 'ignore')
|
safe = safe.encode('ascii', 'ignore')
|
||||||
cachekey = bytes(safe) # In case it was a bytearray
|
else:
|
||||||
if not (isinstance(bs, bytes) or isinstance(bs, bytearray)):
|
safe = bytes([c for c in safe if c < 128])
|
||||||
raise TypeError("quote_from_bytes() expected a bytes")
|
if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
|
||||||
|
return bs.decode()
|
||||||
try:
|
try:
|
||||||
quoter = _safe_quoters[cachekey]
|
quoter = _safe_quoters[safe]
|
||||||
except KeyError:
|
except KeyError:
|
||||||
quoter = Quoter(safe)
|
_safe_quoters[safe] = quoter = Quoter(safe).__getitem__
|
||||||
_safe_quoters[cachekey] = quoter
|
return ''.join([quoter(char) for char in bs])
|
||||||
return ''.join([quoter[char] for char in bs])
|
|
||||||
|
|
||||||
def urlencode(query, doseq=False):
|
def urlencode(query, doseq=False):
|
||||||
"""Encode a sequence of two-element tuples or dictionary into a URL query string.
|
"""Encode a sequence of two-element tuples or dictionary into a URL query string.
|
||||||
|
|
|
@ -366,6 +366,9 @@ C-API
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- Issue #1285086: Speed up urllib.parse functions: quote, quote_from_bytes,
|
||||||
|
unquote, unquote_to_bytes.
|
||||||
|
|
||||||
- Issue #8688: Distutils now recalculates MANIFEST everytime.
|
- Issue #8688: Distutils now recalculates MANIFEST everytime.
|
||||||
|
|
||||||
- Issue #8477: ssl.RAND_egd() and ssl._test_decode_cert() support str with
|
- Issue #8477: ssl.RAND_egd() and ssl._test_decode_cert() support str with
|
||||||
|
|
Loading…
Reference in New Issue