bpo-42663: Support full range of allowed transition hours in zipinfo.

Also refactor parsing of numbers and times.
This commit is contained in:
Serhiy Storchaka 2020-12-17 22:57:49 +02:00
parent 074ad5123f
commit cf67d218e8
1 changed file with 143 additions and 211 deletions

View File

@ -58,7 +58,7 @@ typedef struct {
uint8_t month; uint8_t month;
uint8_t week; uint8_t week;
uint8_t day; uint8_t day;
int8_t hour; int16_t hour;
int8_t minute; int8_t minute;
int8_t second; int8_t second;
} CalendarRule; } CalendarRule;
@ -66,8 +66,8 @@ typedef struct {
typedef struct { typedef struct {
TransitionRuleType base; TransitionRuleType base;
uint8_t julian; uint8_t julian;
unsigned int day; uint16_t day;
int8_t hour; int16_t hour;
int8_t minute; int8_t minute;
int8_t second; int8_t second;
} DayRule; } DayRule;
@ -118,15 +118,14 @@ ts_to_local(size_t *trans_idx, int64_t *trans_utc, long *utcoff,
static int static int
parse_tz_str(PyObject *tz_str_obj, _tzrule *out); parse_tz_str(PyObject *tz_str_obj, _tzrule *out);
static Py_ssize_t static int
parse_abbr(const char *const p, PyObject **abbr); parse_abbr(const char **p, PyObject **abbr);
static Py_ssize_t static int
parse_tz_delta(const char *const p, long *total_seconds); parse_tz_delta(const char **p, long *total_seconds);
static Py_ssize_t static int
parse_transition_time(const char *const p, int8_t *hour, int8_t *minute, parse_transition_time(const char **p, int *hour, int *minute, int *second);
int8_t *second); static int
static Py_ssize_t parse_transition_rule(const char **p, TransitionRuleType **out);
parse_transition_rule(const char *const p, TransitionRuleType **out);
static _ttinfo * static _ttinfo *
find_tzrule_ttinfo(_tzrule *rule, int64_t ts, unsigned char fold, int year); find_tzrule_ttinfo(_tzrule *rule, int64_t ts, unsigned char fold, int year);
@ -1188,14 +1187,14 @@ calendarrule_year_to_timestamp(TransitionRuleType *base_self, int year)
} }
int64_t ordinal = ymd_to_ord(year, self->month, month_day) - EPOCHORDINAL; int64_t ordinal = ymd_to_ord(year, self->month, month_day) - EPOCHORDINAL;
return ((ordinal * 86400) + (int64_t)(self->hour * 3600) + return ((ordinal * 86400L) + (int64_t)(self->hour * 3600L) +
(int64_t)(self->minute * 60) + (int64_t)(self->second)); (int64_t)(self->minute * 60) + (int64_t)(self->second));
} }
/* Constructor for CalendarRule. */ /* Constructor for CalendarRule. */
int int
calendarrule_new(uint8_t month, uint8_t week, uint8_t day, int8_t hour, calendarrule_new(int month, int week, int day, int hour,
int8_t minute, int8_t second, CalendarRule *out) int minute, int second, CalendarRule *out)
{ {
// These bounds come from the POSIX standard, which describes an Mm.n.d // These bounds come from the POSIX standard, which describes an Mm.n.d
// rule as: // rule as:
@ -1204,33 +1203,36 @@ calendarrule_new(uint8_t month, uint8_t week, uint8_t day, int8_t hour,
// 5, 1 <= m <= 12, where week 5 means "the last d day in month m" which // 5, 1 <= m <= 12, where week 5 means "the last d day in month m" which
// may occur in either the fourth or the fifth week). Week 1 is the first // may occur in either the fourth or the fifth week). Week 1 is the first
// week in which the d'th day occurs. Day zero is Sunday. // week in which the d'th day occurs. Day zero is Sunday.
if (month <= 0 || month > 12) { if (month < 1 || month > 12) {
PyErr_Format(PyExc_ValueError, "Month must be in (0, 12]"); PyErr_Format(PyExc_ValueError, "Month must be in [1, 12]");
return -1; return -1;
} }
if (week <= 0 || week > 5) { if (week < 1 || week > 5) {
PyErr_Format(PyExc_ValueError, "Week must be in (0, 5]"); PyErr_Format(PyExc_ValueError, "Week must be in [1, 5]");
return -1; return -1;
} }
// If the 'day' parameter type is changed to a signed type, if (day < 0 || day > 6) {
// "day < 0" check must be added.
if (/* day < 0 || */ day > 6) {
PyErr_Format(PyExc_ValueError, "Day must be in [0, 6]"); PyErr_Format(PyExc_ValueError, "Day must be in [0, 6]");
return -1; return -1;
} }
if (hour < -167 || hour > 167) {
PyErr_Format(PyExc_ValueError, "Hour must be in [0, 167]");
return -1;
}
TransitionRuleType base = {&calendarrule_year_to_timestamp}; TransitionRuleType base = {&calendarrule_year_to_timestamp};
CalendarRule new_offset = { CalendarRule new_offset = {
.base = base, .base = base,
.month = month, .month = Py_SAFE_DOWNCAST(month, int, uint8_t),
.week = week, .week = Py_SAFE_DOWNCAST(week, int, uint8_t),
.day = day, .day = Py_SAFE_DOWNCAST(day, int, uint8_t),
.hour = hour, .hour = Py_SAFE_DOWNCAST(hour, int, int16_t),
.minute = minute, .minute = Py_SAFE_DOWNCAST(minute, int, int8_t),
.second = second, .second = Py_SAFE_DOWNCAST(second, int, int8_t),
}; };
*out = new_offset; *out = new_offset;
@ -1270,40 +1272,45 @@ dayrule_year_to_timestamp(TransitionRuleType *base_self, int year)
// always transitions on a given calendar day (other than February 29th), // always transitions on a given calendar day (other than February 29th),
// you would use a Julian day, e.g. J91 always refers to April 1st and J365 // you would use a Julian day, e.g. J91 always refers to April 1st and J365
// always refers to December 31st. // always refers to December 31st.
unsigned int day = self->day; uint16_t day = self->day;
if (self->julian && day >= 59 && is_leap_year(year)) { if (self->julian && day >= 59 && is_leap_year(year)) {
day += 1; day += 1;
} }
return ((days_before_year + day) * 86400) + (self->hour * 3600) + return ((days_before_year + day) * 86400L) + (self->hour * 3600L) +
(self->minute * 60) + self->second; (self->minute * 60) + self->second;
} }
/* Constructor for DayRule. */ /* Constructor for DayRule. */
static int static int
dayrule_new(uint8_t julian, unsigned int day, int8_t hour, int8_t minute, dayrule_new(int julian, int day, int hour, int minute,
int8_t second, DayRule *out) int second, DayRule *out)
{ {
// The POSIX standard specifies that Julian days must be in the range (1 <= // The POSIX standard specifies that Julian days must be in the range (1 <=
// n <= 365) and that non-Julian (they call it "0-based Julian") days must // n <= 365) and that non-Julian (they call it "0-based Julian") days must
// be in the range (0 <= n <= 365). // be in the range (0 <= n <= 365).
if (day < julian || day > 365) { if (day < julian || day > 365) {
PyErr_Format(PyExc_ValueError, "day must be in [%u, 365], not: %u", PyErr_Format(PyExc_ValueError, "day must be in [%d, 365], not: %d",
julian, day); julian, day);
return -1; return -1;
} }
if (hour < -167 || hour > 167) {
PyErr_Format(PyExc_ValueError, "Hour must be in [0, 167]");
return -1;
}
TransitionRuleType base = { TransitionRuleType base = {
&dayrule_year_to_timestamp, &dayrule_year_to_timestamp,
}; };
DayRule tmp = { DayRule tmp = {
.base = base, .base = base,
.julian = julian, .julian = Py_SAFE_DOWNCAST(julian, int, uint8_t),
.day = day, .day = Py_SAFE_DOWNCAST(day, int, int16_t),
.hour = hour, .hour = Py_SAFE_DOWNCAST(hour, int, int16_t),
.minute = minute, .minute = Py_SAFE_DOWNCAST(minute, int, int8_t),
.second = second, .second = Py_SAFE_DOWNCAST(second, int, int8_t),
}; };
*out = tmp; *out = tmp;
@ -1453,28 +1460,25 @@ parse_tz_str(PyObject *tz_str_obj, _tzrule *out)
long std_offset = 1 << 20; long std_offset = 1 << 20;
long dst_offset = 1 << 20; long dst_offset = 1 << 20;
char *tz_str = PyBytes_AsString(tz_str_obj); const char *tz_str = PyBytes_AsString(tz_str_obj);
if (tz_str == NULL) { if (tz_str == NULL) {
return -1; return -1;
} }
char *p = tz_str; const char *p = tz_str;
// Read the `std` abbreviation, which must be at least 3 characters long. // Read the `std` abbreviation, which must be at least 3 characters long.
Py_ssize_t num_chars = parse_abbr(p, &std_abbr); if (parse_abbr(&p, &std_abbr)) {
if (num_chars < 1) { if (!PyErr_Occurred()) {
PyErr_Format(PyExc_ValueError, "Invalid STD format in %R", tz_str_obj); PyErr_Format(PyExc_ValueError, "Invalid STD format in %R", tz_str_obj);
}
goto error; goto error;
} }
p += num_chars;
// Now read the STD offset, which is required // Now read the STD offset, which is required
num_chars = parse_tz_delta(p, &std_offset); if (parse_tz_delta(&p, &std_offset)) {
if (num_chars < 0) {
PyErr_Format(PyExc_ValueError, "Invalid STD offset in %R", tz_str_obj); PyErr_Format(PyExc_ValueError, "Invalid STD offset in %R", tz_str_obj);
goto error; goto error;
} }
p += num_chars;
// If the string ends here, there is no DST, otherwise we must parse the // If the string ends here, there is no DST, otherwise we must parse the
// DST abbreviation and start and end dates and times. // DST abbreviation and start and end dates and times.
@ -1482,12 +1486,12 @@ parse_tz_str(PyObject *tz_str_obj, _tzrule *out)
goto complete; goto complete;
} }
num_chars = parse_abbr(p, &dst_abbr); if (parse_abbr(&p, &dst_abbr)) {
if (num_chars < 1) { if (!PyErr_Occurred()) {
PyErr_Format(PyExc_ValueError, "Invalid DST format in %R", tz_str_obj); PyErr_Format(PyExc_ValueError, "Invalid DST format in %R", tz_str_obj);
}
goto error; goto error;
} }
p += num_chars;
if (*p == ',') { if (*p == ',') {
// From the POSIX standard: // From the POSIX standard:
@ -1497,14 +1501,11 @@ parse_tz_str(PyObject *tz_str_obj, _tzrule *out)
dst_offset = std_offset + 3600; dst_offset = std_offset + 3600;
} }
else { else {
num_chars = parse_tz_delta(p, &dst_offset); if (parse_tz_delta(&p, &dst_offset)) {
if (num_chars < 0) {
PyErr_Format(PyExc_ValueError, "Invalid DST offset in %R", PyErr_Format(PyExc_ValueError, "Invalid DST offset in %R",
tz_str_obj); tz_str_obj);
goto error; goto error;
} }
p += num_chars;
} }
TransitionRuleType **transitions[2] = {&start, &end}; TransitionRuleType **transitions[2] = {&start, &end};
@ -1517,14 +1518,12 @@ parse_tz_str(PyObject *tz_str_obj, _tzrule *out)
} }
p++; p++;
num_chars = parse_transition_rule(p, transitions[i]); if (parse_transition_rule(&p, transitions[i])) {
if (num_chars < 0) {
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"Malformed transition rule in TZ string: %R", "Malformed transition rule in TZ string: %R",
tz_str_obj); tz_str_obj);
goto error; goto error;
} }
p += num_chars;
} }
if (*p != '\0') { if (*p != '\0') {
@ -1558,21 +1557,24 @@ error:
} }
static int static int
parse_uint(const char *const p, uint8_t *value) parse_digits(const char **p, int min, int max, int *value)
{ {
if (!isdigit(*p)) { *value = 0;
return -1; for (int i = 0; i < max; i++, (*p)++) {
if (!isdigit(**p)) {
return (i < min) ? -1 : 0;
}
*value *= 10;
*value += (**p) - '0';
} }
*value = (*p) - '0';
return 0; return 0;
} }
/* Parse the STD and DST abbreviations from a TZ string. */ /* Parse the STD and DST abbreviations from a TZ string. */
static Py_ssize_t static int
parse_abbr(const char *const p, PyObject **abbr) parse_abbr(const char **p, PyObject **abbr)
{ {
const char *ptr = p; const char *ptr = *p;
char buff = *ptr; char buff = *ptr;
const char *str_start; const char *str_start;
const char *str_end; const char *str_end;
@ -1601,7 +1603,7 @@ parse_abbr(const char *const p, PyObject **abbr)
ptr++; ptr++;
} }
else { else {
str_start = p; str_start = ptr;
// From the POSIX standard: // From the POSIX standard:
// //
// In the unquoted form, all characters in these fields shall be // In the unquoted form, all characters in these fields shall be
@ -1611,6 +1613,9 @@ parse_abbr(const char *const p, PyObject **abbr)
ptr++; ptr++;
} }
str_end = ptr; str_end = ptr;
if (str_end == str_start) {
return -1;
}
} }
*abbr = PyUnicode_FromStringAndSize(str_start, str_end - str_start); *abbr = PyUnicode_FromStringAndSize(str_start, str_end - str_start);
@ -1618,12 +1623,13 @@ parse_abbr(const char *const p, PyObject **abbr)
return -1; return -1;
} }
return ptr - p; *p = ptr;
return 0;
} }
/* Parse a UTC offset from a TZ str. */ /* Parse a UTC offset from a TZ str. */
static Py_ssize_t static int
parse_tz_delta(const char *const p, long *total_seconds) parse_tz_delta(const char **p, long *total_seconds)
{ {
// From the POSIX spec: // From the POSIX spec:
// //
@ -1638,75 +1644,30 @@ parse_tz_delta(const char *const p, long *total_seconds)
// The POSIX spec says that the values for `hour` must be between 0 and 24 // The POSIX spec says that the values for `hour` must be between 0 and 24
// hours, but RFC 8536 §3.3.1 specifies that the hours part of the // hours, but RFC 8536 §3.3.1 specifies that the hours part of the
// transition times may be signed and range from -167 to 167. // transition times may be signed and range from -167 to 167.
long sign = -1; int hours = 0;
long hours = 0; int minutes = 0;
long minutes = 0; int seconds = 0;
long seconds = 0;
const char *ptr = p; if (parse_transition_time(p, &hours, &minutes, &seconds)) {
char buff = *ptr;
if (buff == '-' || buff == '+') {
// Negative numbers correspond to *positive* offsets, from the spec:
//
// If preceded by a '-', the timezone shall be east of the Prime
// Meridian; otherwise, it shall be west (which may be indicated by
// an optional preceding '+' ).
if (buff == '-') {
sign = 1;
}
ptr++;
}
// The hour can be 1 or 2 numeric characters
for (size_t i = 0; i < 2; ++i) {
buff = *ptr;
if (!isdigit(buff)) {
if (i == 0) {
return -1;
}
else {
break;
}
}
hours *= 10;
hours += buff - '0';
ptr++;
}
if (hours > 24 || hours < 0) {
return -1; return -1;
} }
// Minutes and seconds always of the format ":dd" if (hours > 24 || hours < -24) {
long *outputs[2] = {&minutes, &seconds}; return -1;
for (size_t i = 0; i < 2; ++i) {
if (*ptr != ':') {
goto complete;
}
ptr++;
for (size_t j = 0; j < 2; ++j) {
buff = *ptr;
if (!isdigit(buff)) {
return -1;
}
*(outputs[i]) *= 10;
*(outputs[i]) += buff - '0';
ptr++;
}
} }
complete: // Negative numbers correspond to *positive* offsets, from the spec:
*total_seconds = sign * ((hours * 3600) + (minutes * 60) + seconds); //
// If preceded by a '-', the timezone shall be east of the Prime
return ptr - p; // Meridian; otherwise, it shall be west (which may be indicated by
// an optional preceding '+' ).
*total_seconds = -((hours * 3600L) + (minutes * 60) + seconds);
return 0;
} }
/* Parse the date portion of a transition rule. */ /* Parse the date portion of a transition rule. */
static Py_ssize_t static int
parse_transition_rule(const char *const p, TransitionRuleType **out) parse_transition_rule(const char **p, TransitionRuleType **out)
{ {
// The full transition rule indicates when to change back and forth between // The full transition rule indicates when to change back and forth between
// STD and DST, and has the form: // STD and DST, and has the form:
@ -1718,10 +1679,10 @@ parse_transition_rule(const char *const p, TransitionRuleType **out)
// does not include the ',' at the end of the first rule. // does not include the ',' at the end of the first rule.
// //
// The POSIX spec states that if *time* is not given, the default is 02:00. // The POSIX spec states that if *time* is not given, the default is 02:00.
const char *ptr = p; const char *ptr = *p;
int8_t hour = 2; int hour = 2;
int8_t minute = 0; int minute = 0;
int8_t second = 0; int second = 0;
// Rules come in one of three flavors: // Rules come in one of three flavors:
// //
@ -1730,44 +1691,30 @@ parse_transition_rule(const char *const p, TransitionRuleType **out)
// 3. Mm.n.d: Specifying by month, week and day-of-week. // 3. Mm.n.d: Specifying by month, week and day-of-week.
if (*ptr == 'M') { if (*ptr == 'M') {
uint8_t month, week, day; int month, week, day;
ptr++; ptr++;
if (parse_uint(ptr, &month)) {
if (parse_digits(&ptr, 1, 2, &month)) {
return -1; return -1;
} }
ptr++; if (*ptr++ != '.') {
if (*ptr != '.') { return -1;
uint8_t tmp;
if (parse_uint(ptr, &tmp)) {
return -1;
}
month *= 10;
month += tmp;
ptr++;
} }
if (parse_digits(&ptr, 1, 1, &week)) {
uint8_t *values[2] = {&week, &day}; return -1;
for (size_t i = 0; i < 2; ++i) { }
if (*ptr != '.') { if (*ptr++ != '.') {
return -1; return -1;
} }
ptr++; if (parse_digits(&ptr, 1, 1, &day)) {
return -1;
if (parse_uint(ptr, values[i])) {
return -1;
}
ptr++;
} }
if (*ptr == '/') { if (*ptr == '/') {
ptr++; ptr++;
Py_ssize_t num_chars = if (parse_transition_time(&ptr, &hour, &minute, &second)) {
parse_transition_time(ptr, &hour, &minute, &second);
if (num_chars < 0) {
return -1; return -1;
} }
ptr += num_chars;
} }
CalendarRule *rv = PyMem_Calloc(1, sizeof(CalendarRule)); CalendarRule *rv = PyMem_Calloc(1, sizeof(CalendarRule));
@ -1783,33 +1730,22 @@ parse_transition_rule(const char *const p, TransitionRuleType **out)
*out = (TransitionRuleType *)rv; *out = (TransitionRuleType *)rv;
} }
else { else {
uint8_t julian = 0; int julian = 0;
unsigned int day = 0; int day = 0;
if (*ptr == 'J') { if (*ptr == 'J') {
julian = 1; julian = 1;
ptr++; ptr++;
} }
for (size_t i = 0; i < 3; ++i) { if (parse_digits(&ptr, 1, 3, &day)) {
if (!isdigit(*ptr)) { return -1;
if (i == 0) {
return -1;
}
break;
}
day *= 10;
day += (*ptr) - '0';
ptr++;
} }
if (*ptr == '/') { if (*ptr == '/') {
ptr++; ptr++;
Py_ssize_t num_chars = if (parse_transition_time(&ptr, &hour, &minute, &second)) {
parse_transition_time(ptr, &hour, &minute, &second);
if (num_chars < 0) {
return -1; return -1;
} }
ptr += num_chars;
} }
DayRule *rv = PyMem_Calloc(1, sizeof(DayRule)); DayRule *rv = PyMem_Calloc(1, sizeof(DayRule));
@ -1824,13 +1760,13 @@ parse_transition_rule(const char *const p, TransitionRuleType **out)
*out = (TransitionRuleType *)rv; *out = (TransitionRuleType *)rv;
} }
return ptr - p; *p = ptr;
return 0;
} }
/* Parse the time portion of a transition rule (e.g. following an /) */ /* Parse the time portion of a transition rule (e.g. following an /) */
static Py_ssize_t static int
parse_transition_time(const char *const p, int8_t *hour, int8_t *minute, parse_transition_time(const char **p, int *hour, int *minute, int *second)
int8_t *second)
{ {
// From the spec: // From the spec:
// //
@ -1842,12 +1778,9 @@ parse_transition_time(const char *const p, int8_t *hour, int8_t *minute,
// h[h][:mm[:ss]] // h[h][:mm[:ss]]
// //
// RFC 8536 also allows transition times to be signed and to range from // RFC 8536 also allows transition times to be signed and to range from
// -167 to +167, but the current version only supports [0, 99]. // -167 to +167.
// const char *ptr = *p;
// TODO: Support the full range of transition hours. int sign = 1;
int8_t *components[3] = {hour, minute, second};
const char *ptr = p;
int8_t sign = 1;
if (*ptr == '-' || *ptr == '+') { if (*ptr == '-' || *ptr == '+') {
if (*ptr == '-') { if (*ptr == '-') {
@ -1856,32 +1789,31 @@ parse_transition_time(const char *const p, int8_t *hour, int8_t *minute,
ptr++; ptr++;
} }
for (size_t i = 0; i < 3; ++i) { // The hour can be 1 to 3 numeric characters
if (i > 0) { if (parse_digits(&ptr, 1, 3, hour)) {
if (*ptr != ':') { return -1;
break; }
} *hour *= sign;
ptr++;
}
uint8_t buff = 0; // Minutes and seconds always of the format ":dd"
for (size_t j = 0; j < 2; j++) { if (*ptr == ':') {
if (!isdigit(*ptr)) { ptr++;
if (i == 0 && j > 0) { if (parse_digits(&ptr, 2, 2, minute)) {
break; return -1;
} }
*minute *= sign;
if (*ptr == ':') {
ptr++;
if (parse_digits(&ptr, 2, 2, second)) {
return -1; return -1;
} }
*second *= sign;
buff *= 10;
buff += (*ptr) - '0';
ptr++;
} }
*(components[i]) = sign * buff;
} }
return ptr - p; *p = ptr;
return 0;
} }
/* Constructor for a _tzrule. /* Constructor for a _tzrule.
@ -2236,8 +2168,8 @@ get_local_timestamp(PyObject *dt, int64_t *local_ts)
} }
} }
*local_ts = (int64_t)(ord - EPOCHORDINAL) * 86400 + *local_ts = (int64_t)(ord - EPOCHORDINAL) * 86400L +
(int64_t)(hour * 3600 + minute * 60 + second); (int64_t)(hour * 3600L + minute * 60 + second);
return 0; return 0;
} }