From b713ec2531b61568dd2e768455e37e160b578d76 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Tue, 23 May 2006 18:45:30 +0000 Subject: [PATCH] Bug #1334662 / patch #1335972: int(string, base) wrong answers. In rare cases of strings specifying true values near sys.maxint, and oddball bases (not decimal or a power of 2), int(string, base) could deliver insane answers. This repairs all such problems, and also speeds string->int significantly. On my box, here are % speedups for decimal strings of various lengths: length speedup ------ ------- 1 12.4% 2 15.7% 3 20.6% 4 28.1% 5 33.2% 6 37.5% 7 41.9% 8 46.3% 9 51.2% 10 19.5% 11 19.9% 12 23.9% 13 23.7% 14 23.3% 15 24.9% 16 25.3% 17 28.3% 18 27.9% 19 35.7% Note that the difference between 9 and 10 is the difference between short and long Python ints on a 32-bit box. The patch doesn't actually do anything to speed conversion to long: the speedup is due to detecting "unsigned long" overflow more quickly. This is a bugfix candidate, but it's a non-trivial patch and it would be painful to separate the "bug fix" from the "speed up" parts. --- Lib/test/test_builtin.py | 78 ++++++++++++ Misc/ACKS | 1 + Misc/NEWS | 5 + Python/mystrtoul.c | 266 ++++++++++++++++++++++++++------------- 4 files changed, 262 insertions(+), 88 deletions(-) diff --git a/Lib/test/test_builtin.py b/Lib/test/test_builtin.py index 121da24b5bc..48d50d88be2 100644 --- a/Lib/test/test_builtin.py +++ b/Lib/test/test_builtin.py @@ -709,6 +709,84 @@ class BuiltinTest(unittest.TestCase): self.assertEqual(int('0123', 0), 83) self.assertEqual(int('0x123', 16), 291) + # SF bug 1334662: int(string, base) wrong answers + # Various representations of 2**32 evaluated to 0 + # rather than 2**32 in previous versions + + self.assertEqual(int('100000000000000000000000000000000', 2), 4294967296L) + self.assertEqual(int('102002022201221111211', 3), 4294967296L) + self.assertEqual(int('10000000000000000', 4), 4294967296L) + self.assertEqual(int('32244002423141', 5), 4294967296L) + self.assertEqual(int('1550104015504', 6), 4294967296L) + self.assertEqual(int('211301422354', 7), 4294967296L) + self.assertEqual(int('40000000000', 8), 4294967296L) + self.assertEqual(int('12068657454', 9), 4294967296L) + self.assertEqual(int('4294967296', 10), 4294967296L) + self.assertEqual(int('1904440554', 11), 4294967296L) + self.assertEqual(int('9ba461594', 12), 4294967296L) + self.assertEqual(int('535a79889', 13), 4294967296L) + self.assertEqual(int('2ca5b7464', 14), 4294967296L) + self.assertEqual(int('1a20dcd81', 15), 4294967296L) + self.assertEqual(int('100000000', 16), 4294967296L) + self.assertEqual(int('a7ffda91', 17), 4294967296L) + self.assertEqual(int('704he7g4', 18), 4294967296L) + self.assertEqual(int('4f5aff66', 19), 4294967296L) + self.assertEqual(int('3723ai4g', 20), 4294967296L) + self.assertEqual(int('281d55i4', 21), 4294967296L) + self.assertEqual(int('1fj8b184', 22), 4294967296L) + self.assertEqual(int('1606k7ic', 23), 4294967296L) + self.assertEqual(int('mb994ag', 24), 4294967296L) + self.assertEqual(int('hek2mgl', 25), 4294967296L) + self.assertEqual(int('dnchbnm', 26), 4294967296L) + self.assertEqual(int('b28jpdm', 27), 4294967296L) + self.assertEqual(int('8pfgih4', 28), 4294967296L) + self.assertEqual(int('76beigg', 29), 4294967296L) + self.assertEqual(int('5qmcpqg', 30), 4294967296L) + self.assertEqual(int('4q0jto4', 31), 4294967296L) + self.assertEqual(int('4000000', 32), 4294967296L) + self.assertEqual(int('3aokq94', 33), 4294967296L) + self.assertEqual(int('2qhxjli', 34), 4294967296L) + self.assertEqual(int('2br45qb', 35), 4294967296L) + self.assertEqual(int('1z141z4', 36), 4294967296L) + + # SF bug 1334662: int(string, base) wrong answers + # Checks for proper evaluation of 2**32 + 1 + self.assertEqual(int('100000000000000000000000000000001', 2), 4294967297L) + self.assertEqual(int('102002022201221111212', 3), 4294967297L) + self.assertEqual(int('10000000000000001', 4), 4294967297L) + self.assertEqual(int('32244002423142', 5), 4294967297L) + self.assertEqual(int('1550104015505', 6), 4294967297L) + self.assertEqual(int('211301422355', 7), 4294967297L) + self.assertEqual(int('40000000001', 8), 4294967297L) + self.assertEqual(int('12068657455', 9), 4294967297L) + self.assertEqual(int('4294967297', 10), 4294967297L) + self.assertEqual(int('1904440555', 11), 4294967297L) + self.assertEqual(int('9ba461595', 12), 4294967297L) + self.assertEqual(int('535a7988a', 13), 4294967297L) + self.assertEqual(int('2ca5b7465', 14), 4294967297L) + self.assertEqual(int('1a20dcd82', 15), 4294967297L) + self.assertEqual(int('100000001', 16), 4294967297L) + self.assertEqual(int('a7ffda92', 17), 4294967297L) + self.assertEqual(int('704he7g5', 18), 4294967297L) + self.assertEqual(int('4f5aff67', 19), 4294967297L) + self.assertEqual(int('3723ai4h', 20), 4294967297L) + self.assertEqual(int('281d55i5', 21), 4294967297L) + self.assertEqual(int('1fj8b185', 22), 4294967297L) + self.assertEqual(int('1606k7id', 23), 4294967297L) + self.assertEqual(int('mb994ah', 24), 4294967297L) + self.assertEqual(int('hek2mgm', 25), 4294967297L) + self.assertEqual(int('dnchbnn', 26), 4294967297L) + self.assertEqual(int('b28jpdn', 27), 4294967297L) + self.assertEqual(int('8pfgih5', 28), 4294967297L) + self.assertEqual(int('76beigh', 29), 4294967297L) + self.assertEqual(int('5qmcpqh', 30), 4294967297L) + self.assertEqual(int('4q0jto5', 31), 4294967297L) + self.assertEqual(int('4000001', 32), 4294967297L) + self.assertEqual(int('3aokq95', 33), 4294967297L) + self.assertEqual(int('2qhxjlj', 34), 4294967297L) + self.assertEqual(int('2br45qc', 35), 4294967297L) + self.assertEqual(int('1z141z5', 36), 4294967297L) + def test_intconversion(self): # Test __int__() class Foo0: diff --git a/Misc/ACKS b/Misc/ACKS index fe4896aba52..e30cc6147cb 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -390,6 +390,7 @@ Fredrik Lundh Mark Lutz Jim Lynch Mikael Lyngvig +Alan McIntyre Andrew I MacIntyre Tim MacKenzie Nick Maclaren diff --git a/Misc/NEWS b/Misc/NEWS index 0a9f343c285..6cd40ea0519 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -12,6 +12,11 @@ What's New in Python 2.5 alpha 3? Core and builtins ----------------- +- Bug #1334662: ``int(string, base)`` could deliver a wrong answer + when ``base`` was not 2, 4, 8, 10, 16 or 32, and ``string`` represented + an integer close to ``sys.maxint``. This was repaired by patch + #1335972, which also gives a nice speedup. + - Patch #1337051: reduced size of frame objects. - PyErr_NewException now accepts a tuple of base classes as its diff --git a/Python/mystrtoul.c b/Python/mystrtoul.c index 8e60c0c0726..272f827c8cf 100644 --- a/Python/mystrtoul.c +++ b/Python/mystrtoul.c @@ -15,6 +15,94 @@ /* strtol and strtoul, renamed to avoid conflicts */ + +#include +#ifndef DONT_HAVE_ERRNO_H +#include +#endif + +/* Static overflow check values for bases 2 through 36. + * smallmax[base] is the largest unsigned long i such that + * i * base doesn't overflow unsigned long. + */ +static unsigned long smallmax[] = { + 0, /* bases 0 and 1 are invalid */ + 0, + ULONG_MAX / 2, + ULONG_MAX / 3, + ULONG_MAX / 4, + ULONG_MAX / 5, + ULONG_MAX / 6, + ULONG_MAX / 7, + ULONG_MAX / 8, + ULONG_MAX / 9, + ULONG_MAX / 10, + ULONG_MAX / 11, + ULONG_MAX / 12, + ULONG_MAX / 13, + ULONG_MAX / 14, + ULONG_MAX / 15, + ULONG_MAX / 16, + ULONG_MAX / 17, + ULONG_MAX / 18, + ULONG_MAX / 19, + ULONG_MAX / 20, + ULONG_MAX / 21, + ULONG_MAX / 22, + ULONG_MAX / 23, + ULONG_MAX / 24, + ULONG_MAX / 25, + ULONG_MAX / 26, + ULONG_MAX / 27, + ULONG_MAX / 28, + ULONG_MAX / 29, + ULONG_MAX / 30, + ULONG_MAX / 31, + ULONG_MAX / 32, + ULONG_MAX / 33, + ULONG_MAX / 34, + ULONG_MAX / 35, + ULONG_MAX / 36, +}; + +/* maximum digits that can't ever overflow for bases 2 through 36, + * calculated by [int(math.floor(math.log(2**32, i))) for i in range(2, 37)]. + * Note that this is pessimistic if sizeof(long) > 4. + */ +static int digitlimit[] = { + 0, 0, 32, 20, 16, 13, 12, 11, 10, 10, /* 0 - 9 */ + 9, 9, 8, 8, 8, 8, 8, 7, 7, 7, /* 10 - 19 */ + 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, /* 20 - 29 */ + 6, 6, 6, 6, 6, 6, 6}; /* 30 - 36 */ + +/* char-to-digit conversion for bases 2-36; all non-digits are 37 */ +static int digitlookup[] = { + 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, + 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, + 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 37, 37, 37, 37, 37, 37, + 37, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 37, 37, 37, 37, 37, + 37, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 37, 37, 37, 37, 37, + 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, + 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, + 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, + 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, + 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, + 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, + 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, + 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, + 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, + 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, + 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, + 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, + 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, + 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, + 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, + 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37 +}; + /* ** strtoul ** This is a general purpose routine for converting @@ -28,98 +116,100 @@ ** Errors due to bad pointers will probably result in ** exceptions - we don't check for them. */ - -#include -#ifndef DONT_HAVE_ERRNO_H -#include -#endif - unsigned long PyOS_strtoul(register char *str, char **ptr, int base) { - register unsigned long result; /* return value of the function */ - register int c; /* current input character */ - register unsigned long temp; /* used in overflow testing */ - int ovf; /* true if overflow occurred */ + register unsigned long result = 0; /* return value of the function */ + register int c; /* current input character */ + register int ovlimit; /* required digits to overflow */ - result = 0; - ovf = 0; + /* skip leading white space */ + while (*str && isspace(Py_CHARMASK(*str))) + ++str; -/* catch silly bases */ - if (base != 0 && (base < 2 || base > 36)) - { + /* check for leading 0 or 0x for auto-base or base 16 */ + switch (base) { + case 0: /* look for leading 0, 0x or 0X */ + if (*str == '0') { + ++str; + if (*str == 'x' || *str == 'X') { + ++str; + base = 16; + } + else + base = 8; + } + else + base = 10; + break; + + case 16: /* skip leading 0x or 0X */ + if (*str == '0') { + ++str; + if (*str == 'x' || *str == 'X') + ++str; + } + break; + } + + /* catch silly bases */ + if (base < 2 || base > 36) { + if (ptr) + *ptr = str; + return 0; + } + + /* skip leading zeroes */ + while (*str == '0') + ++str; + + /* base is guaranteed to be in [2, 36] at this point */ + ovlimit = digitlimit[base]; + + /* do the conversion until non-digit character encountered */ + while ((c = digitlookup[Py_CHARMASK(*str)]) < base) { + if (ovlimit > 0) /* no overflow check required */ + result = result * base + c; + else { /* requires overflow check */ + register unsigned long temp_result; + + if (ovlimit < 0) /* guaranteed overflow */ + goto overflowed; + + /* there could be an overflow */ + /* check overflow just from shifting */ + if (result > smallmax[base]) + goto overflowed; + + result *= base; + + /* check overflow from the digit's value */ + temp_result = result + c; + if (temp_result < result) + goto overflowed; + + result = temp_result; + } + + ++str; + --ovlimit; + } + + /* set pointer to point to the last character scanned */ if (ptr) - *ptr = str; - return 0; - } + *ptr = str; -/* skip leading white space */ - while (*str && isspace(Py_CHARMASK(*str))) - str++; + return result; -/* check for leading 0 or 0x for auto-base or base 16 */ - switch (base) - { - case 0: /* look for leading 0, 0x or 0X */ - if (*str == '0') - { - str++; - if (*str == 'x' || *str == 'X') - { - str++; - base = 16; - } - else - base = 8; +overflowed: + if (ptr) { + /* spool through remaining digit characters */ + while (digitlookup[Py_CHARMASK(*str)] < base) + ++str; + *ptr = str; } - else - base = 10; - break; - - case 16: /* skip leading 0x or 0X */ - if (*str == '0' && (*(str+1) == 'x' || *(str+1) == 'X')) - str += 2; - break; - } - -/* do the conversion */ - while ((c = Py_CHARMASK(*str)) != '\0') - { - if (isdigit(c) && c - '0' < base) - c -= '0'; - else - { - if (isupper(c)) - c = tolower(c); - if (c >= 'a' && c <= 'z') - c -= 'a' - 10; - else /* non-"digit" character */ - break; - if (c >= base) /* non-"digit" character */ - break; - } - temp = result; - result = result * base + c; - if(base == 10) { - if(((long)(result - c) / base != (long)temp)) /* overflow */ - ovf = 1; - } - else { - if ((result - c) / base != temp) /* overflow */ - ovf = 1; - } - str++; - } - -/* set pointer to point to the last character scanned */ - if (ptr) - *ptr = str; - if (ovf) - { - result = (unsigned long) ~0L; errno = ERANGE; - } - return result; + return (unsigned long)-1; } long @@ -127,25 +217,25 @@ PyOS_strtol(char *str, char **ptr, int base) { long result; char sign; - + while (*str && isspace(Py_CHARMASK(*str))) str++; - + sign = *str; if (sign == '+' || sign == '-') str++; - + result = (long) PyOS_strtoul(str, ptr, base); - + /* Signal overflow if the result appears negative, except for the largest negative integer */ if (result < 0 && !(sign == '-' && result == -result)) { errno = ERANGE; result = 0x7fffffff; } - + if (sign == '-') result = -result; - + return result; }