bpo-42663: Support full range of allowed transition hours in zoneinfo.

Also refactor parsing of numbers and times.
This commit is contained in:
Serhiy Storchaka 2020-12-17 22:57:49 +02:00
parent 074ad5123f
commit cf67d218e8
1 changed file with 143 additions and 211 deletions

View File

@ -58,7 +58,7 @@ typedef struct {
uint8_t month;
uint8_t week;
uint8_t day;
int8_t hour;
int16_t hour;
int8_t minute;
int8_t second;
} CalendarRule;
@ -66,8 +66,8 @@ typedef struct {
typedef struct {
TransitionRuleType base;
uint8_t julian;
unsigned int day;
int8_t hour;
uint16_t day;
int16_t hour;
int8_t minute;
int8_t second;
} DayRule;
@ -118,15 +118,14 @@ ts_to_local(size_t *trans_idx, int64_t *trans_utc, long *utcoff,
static int
parse_tz_str(PyObject *tz_str_obj, _tzrule *out);
static Py_ssize_t
parse_abbr(const char *const p, PyObject **abbr);
static Py_ssize_t
parse_tz_delta(const char *const p, long *total_seconds);
static Py_ssize_t
parse_transition_time(const char *const p, int8_t *hour, int8_t *minute,
int8_t *second);
static Py_ssize_t
parse_transition_rule(const char *const p, TransitionRuleType **out);
static int
parse_abbr(const char **p, PyObject **abbr);
static int
parse_tz_delta(const char **p, long *total_seconds);
static int
parse_transition_time(const char **p, int *hour, int *minute, int *second);
static int
parse_transition_rule(const char **p, TransitionRuleType **out);
static _ttinfo *
find_tzrule_ttinfo(_tzrule *rule, int64_t ts, unsigned char fold, int year);
@ -1188,14 +1187,14 @@ calendarrule_year_to_timestamp(TransitionRuleType *base_self, int year)
}
int64_t ordinal = ymd_to_ord(year, self->month, month_day) - EPOCHORDINAL;
return ((ordinal * 86400) + (int64_t)(self->hour * 3600) +
return ((ordinal * 86400L) + (int64_t)(self->hour * 3600L) +
(int64_t)(self->minute * 60) + (int64_t)(self->second));
}
/* Constructor for CalendarRule. */
int
calendarrule_new(uint8_t month, uint8_t week, uint8_t day, int8_t hour,
int8_t minute, int8_t second, CalendarRule *out)
calendarrule_new(int month, int week, int day, int hour,
int minute, int second, CalendarRule *out)
{
// These bounds come from the POSIX standard, which describes an Mm.n.d
// rule as:
@ -1204,33 +1203,36 @@ calendarrule_new(uint8_t month, uint8_t week, uint8_t day, int8_t hour,
// 5, 1 <= m <= 12, where week 5 means "the last d day in month m" which
// may occur in either the fourth or the fifth week). Week 1 is the first
// week in which the d'th day occurs. Day zero is Sunday.
if (month <= 0 || month > 12) {
PyErr_Format(PyExc_ValueError, "Month must be in (0, 12]");
if (month < 1 || month > 12) {
PyErr_Format(PyExc_ValueError, "Month must be in [1, 12]");
return -1;
}
if (week <= 0 || week > 5) {
PyErr_Format(PyExc_ValueError, "Week must be in (0, 5]");
if (week < 1 || week > 5) {
PyErr_Format(PyExc_ValueError, "Week must be in [1, 5]");
return -1;
}
// If the 'day' parameter type is changed to a signed type,
// "day < 0" check must be added.
if (/* day < 0 || */ day > 6) {
if (day < 0 || day > 6) {
PyErr_Format(PyExc_ValueError, "Day must be in [0, 6]");
return -1;
}
if (hour < -167 || hour > 167) {
PyErr_Format(PyExc_ValueError, "Hour must be in [0, 167]");
return -1;
}
TransitionRuleType base = {&calendarrule_year_to_timestamp};
CalendarRule new_offset = {
.base = base,
.month = month,
.week = week,
.day = day,
.hour = hour,
.minute = minute,
.second = second,
.month = Py_SAFE_DOWNCAST(month, int, uint8_t),
.week = Py_SAFE_DOWNCAST(week, int, uint8_t),
.day = Py_SAFE_DOWNCAST(day, int, uint8_t),
.hour = Py_SAFE_DOWNCAST(hour, int, int16_t),
.minute = Py_SAFE_DOWNCAST(minute, int, int8_t),
.second = Py_SAFE_DOWNCAST(second, int, int8_t),
};
*out = new_offset;
@ -1270,40 +1272,45 @@ dayrule_year_to_timestamp(TransitionRuleType *base_self, int year)
// always transitions on a given calendar day (other than February 29th),
// you would use a Julian day, e.g. J91 always refers to April 1st and J365
// always refers to December 31st.
unsigned int day = self->day;
uint16_t day = self->day;
if (self->julian && day >= 59 && is_leap_year(year)) {
day += 1;
}
return ((days_before_year + day) * 86400) + (self->hour * 3600) +
return ((days_before_year + day) * 86400L) + (self->hour * 3600L) +
(self->minute * 60) + self->second;
}
/* Constructor for DayRule. */
static int
dayrule_new(uint8_t julian, unsigned int day, int8_t hour, int8_t minute,
int8_t second, DayRule *out)
dayrule_new(int julian, int day, int hour, int minute,
int second, DayRule *out)
{
// The POSIX standard specifies that Julian days must be in the range (1 <=
// n <= 365) and that non-Julian (they call it "0-based Julian") days must
// be in the range (0 <= n <= 365).
if (day < julian || day > 365) {
PyErr_Format(PyExc_ValueError, "day must be in [%u, 365], not: %u",
PyErr_Format(PyExc_ValueError, "day must be in [%d, 365], not: %d",
julian, day);
return -1;
}
if (hour < -167 || hour > 167) {
PyErr_Format(PyExc_ValueError, "Hour must be in [0, 167]");
return -1;
}
TransitionRuleType base = {
&dayrule_year_to_timestamp,
};
DayRule tmp = {
.base = base,
.julian = julian,
.day = day,
.hour = hour,
.minute = minute,
.second = second,
.julian = Py_SAFE_DOWNCAST(julian, int, uint8_t),
.day = Py_SAFE_DOWNCAST(day, int, int16_t),
.hour = Py_SAFE_DOWNCAST(hour, int, int16_t),
.minute = Py_SAFE_DOWNCAST(minute, int, int8_t),
.second = Py_SAFE_DOWNCAST(second, int, int8_t),
};
*out = tmp;
@ -1453,28 +1460,25 @@ parse_tz_str(PyObject *tz_str_obj, _tzrule *out)
long std_offset = 1 << 20;
long dst_offset = 1 << 20;
char *tz_str = PyBytes_AsString(tz_str_obj);
const char *tz_str = PyBytes_AsString(tz_str_obj);
if (tz_str == NULL) {
return -1;
}
char *p = tz_str;
const char *p = tz_str;
// Read the `std` abbreviation, which must be at least 3 characters long.
Py_ssize_t num_chars = parse_abbr(p, &std_abbr);
if (num_chars < 1) {
PyErr_Format(PyExc_ValueError, "Invalid STD format in %R", tz_str_obj);
if (parse_abbr(&p, &std_abbr)) {
if (!PyErr_Occurred()) {
PyErr_Format(PyExc_ValueError, "Invalid STD format in %R", tz_str_obj);
}
goto error;
}
p += num_chars;
// Now read the STD offset, which is required
num_chars = parse_tz_delta(p, &std_offset);
if (num_chars < 0) {
if (parse_tz_delta(&p, &std_offset)) {
PyErr_Format(PyExc_ValueError, "Invalid STD offset in %R", tz_str_obj);
goto error;
}
p += num_chars;
// If the string ends here, there is no DST, otherwise we must parse the
// DST abbreviation and start and end dates and times.
@ -1482,12 +1486,12 @@ parse_tz_str(PyObject *tz_str_obj, _tzrule *out)
goto complete;
}
num_chars = parse_abbr(p, &dst_abbr);
if (num_chars < 1) {
PyErr_Format(PyExc_ValueError, "Invalid DST format in %R", tz_str_obj);
if (parse_abbr(&p, &dst_abbr)) {
if (!PyErr_Occurred()) {
PyErr_Format(PyExc_ValueError, "Invalid DST format in %R", tz_str_obj);
}
goto error;
}
p += num_chars;
if (*p == ',') {
// From the POSIX standard:
@ -1497,14 +1501,11 @@ parse_tz_str(PyObject *tz_str_obj, _tzrule *out)
dst_offset = std_offset + 3600;
}
else {
num_chars = parse_tz_delta(p, &dst_offset);
if (num_chars < 0) {
if (parse_tz_delta(&p, &dst_offset)) {
PyErr_Format(PyExc_ValueError, "Invalid DST offset in %R",
tz_str_obj);
goto error;
}
p += num_chars;
}
TransitionRuleType **transitions[2] = {&start, &end};
@ -1517,14 +1518,12 @@ parse_tz_str(PyObject *tz_str_obj, _tzrule *out)
}
p++;
num_chars = parse_transition_rule(p, transitions[i]);
if (num_chars < 0) {
if (parse_transition_rule(&p, transitions[i])) {
PyErr_Format(PyExc_ValueError,
"Malformed transition rule in TZ string: %R",
tz_str_obj);
goto error;
}
p += num_chars;
}
if (*p != '\0') {
@ -1558,21 +1557,24 @@ error:
}
static int
parse_uint(const char *const p, uint8_t *value)
parse_digits(const char **p, int min, int max, int *value)
{
if (!isdigit(*p)) {
return -1;
*value = 0;
for (int i = 0; i < max; i++, (*p)++) {
if (!isdigit(**p)) {
return (i < min) ? -1 : 0;
}
*value *= 10;
*value += (**p) - '0';
}
*value = (*p) - '0';
return 0;
}
/* Parse the STD and DST abbreviations from a TZ string. */
static Py_ssize_t
parse_abbr(const char *const p, PyObject **abbr)
static int
parse_abbr(const char **p, PyObject **abbr)
{
const char *ptr = p;
const char *ptr = *p;
char buff = *ptr;
const char *str_start;
const char *str_end;
@ -1601,7 +1603,7 @@ parse_abbr(const char *const p, PyObject **abbr)
ptr++;
}
else {
str_start = p;
str_start = ptr;
// From the POSIX standard:
//
// In the unquoted form, all characters in these fields shall be
@ -1611,6 +1613,9 @@ parse_abbr(const char *const p, PyObject **abbr)
ptr++;
}
str_end = ptr;
if (str_end == str_start) {
return -1;
}
}
*abbr = PyUnicode_FromStringAndSize(str_start, str_end - str_start);
@ -1618,12 +1623,13 @@ parse_abbr(const char *const p, PyObject **abbr)
return -1;
}
return ptr - p;
*p = ptr;
return 0;
}
/* Parse a UTC offset from a TZ str. */
static Py_ssize_t
parse_tz_delta(const char *const p, long *total_seconds)
static int
parse_tz_delta(const char **p, long *total_seconds)
{
// From the POSIX spec:
//
@ -1638,75 +1644,30 @@ parse_tz_delta(const char *const p, long *total_seconds)
// The POSIX spec says that the values for `hour` must be between 0 and 24
// hours, but RFC 8536 §3.3.1 specifies that the hours part of the
// transition times may be signed and range from -167 to 167.
long sign = -1;
long hours = 0;
long minutes = 0;
long seconds = 0;
int hours = 0;
int minutes = 0;
int seconds = 0;
const char *ptr = p;
char buff = *ptr;
if (buff == '-' || buff == '+') {
// Negative numbers correspond to *positive* offsets, from the spec:
//
// If preceded by a '-', the timezone shall be east of the Prime
// Meridian; otherwise, it shall be west (which may be indicated by
// an optional preceding '+' ).
if (buff == '-') {
sign = 1;
}
ptr++;
}
// The hour can be 1 or 2 numeric characters
for (size_t i = 0; i < 2; ++i) {
buff = *ptr;
if (!isdigit(buff)) {
if (i == 0) {
return -1;
}
else {
break;
}
}
hours *= 10;
hours += buff - '0';
ptr++;
}
if (hours > 24 || hours < 0) {
if (parse_transition_time(p, &hours, &minutes, &seconds)) {
return -1;
}
// Minutes and seconds always of the format ":dd"
long *outputs[2] = {&minutes, &seconds};
for (size_t i = 0; i < 2; ++i) {
if (*ptr != ':') {
goto complete;
}
ptr++;
for (size_t j = 0; j < 2; ++j) {
buff = *ptr;
if (!isdigit(buff)) {
return -1;
}
*(outputs[i]) *= 10;
*(outputs[i]) += buff - '0';
ptr++;
}
if (hours > 24 || hours < -24) {
return -1;
}
complete:
*total_seconds = sign * ((hours * 3600) + (minutes * 60) + seconds);
return ptr - p;
// Negative numbers correspond to *positive* offsets, from the spec:
//
// If preceded by a '-', the timezone shall be east of the Prime
// Meridian; otherwise, it shall be west (which may be indicated by
// an optional preceding '+' ).
*total_seconds = -((hours * 3600L) + (minutes * 60) + seconds);
return 0;
}
/* Parse the date portion of a transition rule. */
static Py_ssize_t
parse_transition_rule(const char *const p, TransitionRuleType **out)
static int
parse_transition_rule(const char **p, TransitionRuleType **out)
{
// The full transition rule indicates when to change back and forth between
// STD and DST, and has the form:
@ -1718,10 +1679,10 @@ parse_transition_rule(const char *const p, TransitionRuleType **out)
// does not include the ',' at the end of the first rule.
//
// The POSIX spec states that if *time* is not given, the default is 02:00.
const char *ptr = p;
int8_t hour = 2;
int8_t minute = 0;
int8_t second = 0;
const char *ptr = *p;
int hour = 2;
int minute = 0;
int second = 0;
// Rules come in one of three flavors:
//
@ -1730,44 +1691,30 @@ parse_transition_rule(const char *const p, TransitionRuleType **out)
// 3. Mm.n.d: Specifying by month, week and day-of-week.
if (*ptr == 'M') {
uint8_t month, week, day;
int month, week, day;
ptr++;
if (parse_uint(ptr, &month)) {
if (parse_digits(&ptr, 1, 2, &month)) {
return -1;
}
ptr++;
if (*ptr != '.') {
uint8_t tmp;
if (parse_uint(ptr, &tmp)) {
return -1;
}
month *= 10;
month += tmp;
ptr++;
if (*ptr++ != '.') {
return -1;
}
uint8_t *values[2] = {&week, &day};
for (size_t i = 0; i < 2; ++i) {
if (*ptr != '.') {
return -1;
}
ptr++;
if (parse_uint(ptr, values[i])) {
return -1;
}
ptr++;
if (parse_digits(&ptr, 1, 1, &week)) {
return -1;
}
if (*ptr++ != '.') {
return -1;
}
if (parse_digits(&ptr, 1, 1, &day)) {
return -1;
}
if (*ptr == '/') {
ptr++;
Py_ssize_t num_chars =
parse_transition_time(ptr, &hour, &minute, &second);
if (num_chars < 0) {
if (parse_transition_time(&ptr, &hour, &minute, &second)) {
return -1;
}
ptr += num_chars;
}
CalendarRule *rv = PyMem_Calloc(1, sizeof(CalendarRule));
@ -1783,33 +1730,22 @@ parse_transition_rule(const char *const p, TransitionRuleType **out)
*out = (TransitionRuleType *)rv;
}
else {
uint8_t julian = 0;
unsigned int day = 0;
int julian = 0;
int day = 0;
if (*ptr == 'J') {
julian = 1;
ptr++;
}
for (size_t i = 0; i < 3; ++i) {
if (!isdigit(*ptr)) {
if (i == 0) {
return -1;
}
break;
}
day *= 10;
day += (*ptr) - '0';
ptr++;
if (parse_digits(&ptr, 1, 3, &day)) {
return -1;
}
if (*ptr == '/') {
ptr++;
Py_ssize_t num_chars =
parse_transition_time(ptr, &hour, &minute, &second);
if (num_chars < 0) {
if (parse_transition_time(&ptr, &hour, &minute, &second)) {
return -1;
}
ptr += num_chars;
}
DayRule *rv = PyMem_Calloc(1, sizeof(DayRule));
@ -1824,13 +1760,13 @@ parse_transition_rule(const char *const p, TransitionRuleType **out)
*out = (TransitionRuleType *)rv;
}
return ptr - p;
*p = ptr;
return 0;
}
/* Parse the time portion of a transition rule (e.g. following an /) */
static Py_ssize_t
parse_transition_time(const char *const p, int8_t *hour, int8_t *minute,
int8_t *second)
static int
parse_transition_time(const char **p, int *hour, int *minute, int *second)
{
// From the spec:
//
@ -1842,12 +1778,9 @@ parse_transition_time(const char *const p, int8_t *hour, int8_t *minute,
// h[h][:mm[:ss]]
//
// RFC 8536 also allows transition times to be signed and to range from
// -167 to +167, but the current version only supports [0, 99].
//
// TODO: Support the full range of transition hours.
int8_t *components[3] = {hour, minute, second};
const char *ptr = p;
int8_t sign = 1;
// -167 to +167.
const char *ptr = *p;
int sign = 1;
if (*ptr == '-' || *ptr == '+') {
if (*ptr == '-') {
@ -1856,32 +1789,31 @@ parse_transition_time(const char *const p, int8_t *hour, int8_t *minute,
ptr++;
}
for (size_t i = 0; i < 3; ++i) {
if (i > 0) {
if (*ptr != ':') {
break;
}
ptr++;
}
// The hour can be 1 to 3 numeric characters
if (parse_digits(&ptr, 1, 3, hour)) {
return -1;
}
*hour *= sign;
uint8_t buff = 0;
for (size_t j = 0; j < 2; j++) {
if (!isdigit(*ptr)) {
if (i == 0 && j > 0) {
break;
}
// Minutes and seconds always of the format ":dd"
if (*ptr == ':') {
ptr++;
if (parse_digits(&ptr, 2, 2, minute)) {
return -1;
}
*minute *= sign;
if (*ptr == ':') {
ptr++;
if (parse_digits(&ptr, 2, 2, second)) {
return -1;
}
buff *= 10;
buff += (*ptr) - '0';
ptr++;
*second *= sign;
}
*(components[i]) = sign * buff;
}
return ptr - p;
*p = ptr;
return 0;
}
/* Constructor for a _tzrule.
@ -2236,8 +2168,8 @@ get_local_timestamp(PyObject *dt, int64_t *local_ts)
}
}
*local_ts = (int64_t)(ord - EPOCHORDINAL) * 86400 +
(int64_t)(hour * 3600 + minute * 60 + second);
*local_ts = (int64_t)(ord - EPOCHORDINAL) * 86400L +
(int64_t)(hour * 3600L + minute * 60 + second);
return 0;
}