Issue #1285086: Get rid of the refcounting hack and speed up urllib.unquote().

Serhiy Storchaka 2013-03-14 21:31:09 +02:00
parent 2556c8388c
commit 923baea9f9
3 changed files with 58 additions and 18 deletions
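The rewrite replaces the old accumulation style, repeated s += ... concatenation (fast on CPython only when the string can be resized in place, an optimization gated on reference counts, presumably the "refcounting hack" the title refers to), with collecting pieces in a list and joining once. Below is a rough micro-benchmark sketch, not from the commit, contrasting the two idioms on a synthetic input; the chr(int(..., 16)) call is a simplified stand-in for the _hextochr lookup table.

import timeit

setup = "s = 'a%20b%41c' * 1000"

# Old style: build the result by repeated string concatenation.
concat = '''
res = s.split('%')
out = res[0]
for item in res[1:]:
    out += chr(int(item[:2], 16)) + item[2:]
'''

# New style: append pieces to a list and join once at the end.
join = '''
bits = s.split('%')
res = [bits[0]]
append = res.append
for item in bits[1:]:
    append(chr(int(item[:2], 16)))
    append(item[2:])
out = ''.join(res)
'''

print(timeit.timeit(concat, setup, number=200))
print(timeit.timeit(join, setup, number=200))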

Lib/urllib.py

@@ -28,6 +28,7 @@ import os
 import time
 import sys
 import base64
+import re
 
 from urlparse import urljoin as basejoin
@@ -1198,22 +1199,35 @@ def splitvalue(attr):
 _hexdig = '0123456789ABCDEFabcdef'
 _hextochr = dict((a + b, chr(int(a + b, 16)))
                  for a in _hexdig for b in _hexdig)
+_asciire = re.compile('([\x00-\x7f]+)')
 
 def unquote(s):
     """unquote('abc%20def') -> 'abc def'."""
-    res = s.split('%')
+    if _is_unicode(s):
+        if '%' not in s:
+            return s
+        bits = _asciire.split(s)
+        res = [bits[0]]
+        append = res.append
+        for i in range(1, len(bits), 2):
+            append(unquote(str(bits[i])).decode('latin1'))
+            append(bits[i + 1])
+        return ''.join(res)
+
+    bits = s.split('%')
     # fastpath
-    if len(res) == 1:
+    if len(bits) == 1:
         return s
-    s = res[0]
-    for item in res[1:]:
+    res = [bits[0]]
+    append = res.append
+    for item in bits[1:]:
         try:
-            s += _hextochr[item[:2]] + item[2:]
+            append(_hextochr[item[:2]])
+            append(item[2:])
         except KeyError:
-            s += '%' + item
-        except UnicodeDecodeError:
-            s += unichr(int(item[:2], 16)) + item[2:]
-    return s
+            append('%')
+            append(item)
+    return ''.join(res)
 
 def unquote_plus(s):
     """unquote('%7e/abc+def') -> '~/abc def'"""

Lib/urlparse.py

@@ -28,6 +28,8 @@ test_urlparse.py provides a good indicator of parsing behavior.
 """
 
+import re
+
 __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
            "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]
@@ -311,6 +313,15 @@ def urldefrag(url):
     else:
         return url, ''
 
+try:
+    unicode
+except NameError:
+    def _is_unicode(x):
+        return 0
+else:
+    def _is_unicode(x):
+        return isinstance(x, unicode)
+
 # unquote method for parse_qs and parse_qsl
 # Cannot use directly from urllib as it would create a circular reference
 # because urllib uses urlparse methods (urljoin). If you update this function,
@@ -319,22 +330,35 @@ def urldefrag(url):
 _hexdig = '0123456789ABCDEFabcdef'
 _hextochr = dict((a+b, chr(int(a+b,16)))
                  for a in _hexdig for b in _hexdig)
+_asciire = re.compile('([\x00-\x7f]+)')
 
 def unquote(s):
     """unquote('abc%20def') -> 'abc def'."""
-    res = s.split('%')
+    if _is_unicode(s):
+        if '%' not in s:
+            return s
+        bits = _asciire.split(s)
+        res = [bits[0]]
+        append = res.append
+        for i in range(1, len(bits), 2):
+            append(unquote(str(bits[i])).decode('latin1'))
+            append(bits[i + 1])
+        return ''.join(res)
+
+    bits = s.split('%')
     # fastpath
-    if len(res) == 1:
+    if len(bits) == 1:
         return s
-    s = res[0]
-    for item in res[1:]:
+    res = [bits[0]]
+    append = res.append
+    for item in bits[1:]:
         try:
-            s += _hextochr[item[:2]] + item[2:]
+            append(_hextochr[item[:2]])
+            append(item[2:])
         except KeyError:
-            s += '%' + item
-        except UnicodeDecodeError:
-            s += unichr(int(item[:2], 16)) + item[2:]
-    return s
+            append('%')
+            append(item)
+    return ''.join(res)
 
 def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
     """Parse a query given as a string argument.

Misc/NEWS

@@ -214,6 +214,8 @@ Core and Builtins
 Library
 -------
 
+- Issue #1285086: Get rid of the refcounting hack and speed up urllib.unquote().
+
 - Issue #17368: Fix an off-by-one error in the Python JSON decoder that caused
   a failure while decoding empty object literals when object_pairs_hook was
   specified.