From 384f3c536dd15ba33ea7e8afb4087ae359d4c12e Mon Sep 17 00:00:00 2001 From: Andy Lester Date: Thu, 27 Feb 2020 20:44:52 -0600 Subject: [PATCH] closes bpo-39721: Fix constness of members of tok_state struct. (GH-18600) The function PyTokenizer_FromUTF8 from Parser/tokenizer.c had a comment: /* XXX: constify members. */ This patch addresses that. In the tok_state struct: * end and start were non-const but could be made const * str and input were const but should have been non-const Changes to support this include: * decode_str() now returns a char * since it is allocated. * PyTokenizer_FromString() and PyTokenizer_FromUTF8() each creates a new char * for an allocate string instead of reusing the input const char *. * PyTokenizer_Get() and tok_get() now take const char ** arguments. * Various local vars are const or non-const accordingly. I was able to remove five casts that cast away constness. --- Parser/parsetok.c | 4 ++-- Parser/tokenizer.c | 50 +++++++++++++++++++++++++++------------------- Parser/tokenizer.h | 10 +++++----- 3 files changed, 37 insertions(+), 27 deletions(-) diff --git a/Parser/parsetok.c b/Parser/parsetok.c index b0b1bd38a7b..554455dbc2b 100644 --- a/Parser/parsetok.c +++ b/Parser/parsetok.c @@ -240,7 +240,7 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret, #endif for (;;) { - char *a, *b; + const char *a, *b; int type; size_t len; char *str; @@ -371,7 +371,7 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret, buffer after parsing. Trailing whitespace and comments are OK. */ if (err_ret->error == E_DONE && start == single_input) { - char *cur = tok->cur; + const char *cur = tok->cur; char c = *tok->cur; for (;;) { diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 630b0aaab03..f82b1029981 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -59,7 +59,9 @@ tok_new(void) sizeof(struct tok_state)); if (tok == NULL) return NULL; - tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL; + tok->buf = tok->cur = tok->inp = NULL; + tok->start = NULL; + tok->end = NULL; tok->done = E_OK; tok->fp = NULL; tok->input = NULL; @@ -111,7 +113,9 @@ error_ret(struct tok_state *tok) /* XXX */ tok->decoding_erred = 1; if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */ PyMem_FREE(tok->buf); - tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL; + tok->buf = tok->cur = tok->inp = NULL; + tok->start = NULL; + tok->end = NULL; tok->done = E_DECODE; return NULL; /* as if it were EOF */ } @@ -664,11 +668,11 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) { Look for encoding declarations inside STR, and record them inside TOK. */ -static const char * +static char * decode_str(const char *input, int single, struct tok_state *tok) { PyObject* utf8 = NULL; - const char *str; + char *str; const char *s; const char *newl[2] = {NULL, NULL}; int lineno = 0; @@ -726,16 +730,18 @@ struct tok_state * PyTokenizer_FromString(const char *str, int exec_input) { struct tok_state *tok = tok_new(); + char *decoded; + if (tok == NULL) return NULL; - str = decode_str(str, exec_input, tok); - if (str == NULL) { + decoded = decode_str(str, exec_input, tok); + if (decoded == NULL) { PyTokenizer_Free(tok); return NULL; } - /* XXX: constify members. */ - tok->buf = tok->cur = tok->end = tok->inp = (char*)str; + tok->buf = tok->cur = tok->inp = decoded; + tok->end = decoded; return tok; } @@ -743,17 +749,18 @@ struct tok_state * PyTokenizer_FromUTF8(const char *str, int exec_input) { struct tok_state *tok = tok_new(); + char *translated; if (tok == NULL) return NULL; - tok->input = str = translate_newlines(str, exec_input, tok); - if (str == NULL) { + tok->input = translated = translate_newlines(str, exec_input, tok); + if (translated == NULL) { PyTokenizer_Free(tok); return NULL; } tok->decoding_state = STATE_RAW; tok->read_coding_spec = 1; tok->enc = NULL; - tok->str = str; + tok->str = translated; tok->encoding = (char *)PyMem_MALLOC(6); if (!tok->encoding) { PyTokenizer_Free(tok); @@ -761,8 +768,8 @@ PyTokenizer_FromUTF8(const char *str, int exec_input) } strcpy(tok->encoding, "utf-8"); - /* XXX: constify members. */ - tok->buf = tok->cur = tok->end = tok->inp = (char*)str; + tok->buf = tok->cur = tok->inp = translated; + tok->end = translated; return tok; } @@ -812,7 +819,7 @@ PyTokenizer_Free(struct tok_state *tok) if (tok->fp != NULL && tok->buf != NULL) PyMem_FREE(tok->buf); if (tok->input) - PyMem_FREE((char *)tok->input); + PyMem_FREE(tok->input); PyMem_FREE(tok); } @@ -1138,7 +1145,7 @@ tok_decimal_tail(struct tok_state *tok) /* Get next token, after space stripping etc. */ static int -tok_get(struct tok_state *tok, char **p_start, char **p_end) +tok_get(struct tok_state *tok, const char **p_start, const char **p_end) { int c; int blankline, nonascii; @@ -1321,7 +1328,7 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0])))); if (is_type_ignore) { - *p_start = (char *) ignore_end; + *p_start = ignore_end; *p_end = tok->cur; /* If this type ignore is the only thing on the line, consume the newline also. */ @@ -1331,7 +1338,7 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) } return TYPE_IGNORE; } else { - *p_start = (char *) type_start; /* after type_comment_prefix */ + *p_start = type_start; /* after type_comment_prefix */ *p_end = tok->cur; return TYPE_COMMENT; } @@ -1410,7 +1417,8 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) Look ahead one token to see if that is 'def'. */ struct tok_state ahead_tok; - char *ahead_tok_start = NULL, *ahead_tok_end = NULL; + const char *ahead_tok_start = NULL; + const char *ahead_tok_end = NULL; int ahead_tok_kind; memcpy(&ahead_tok, tok, sizeof(ahead_tok)); @@ -1798,7 +1806,7 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) } int -PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end) +PyTokenizer_Get(struct tok_state *tok, const char **p_start, const char **p_end) { int result = tok_get(tok, p_start, p_end); if (tok->decoding_erred) { @@ -1823,7 +1831,9 @@ PyTokenizer_FindEncodingFilename(int fd, PyObject *filename) { struct tok_state *tok; FILE *fp; - char *p_start =NULL , *p_end =NULL , *encoding = NULL; + const char *p_start = NULL; + const char *p_end = NULL; + char *encoding = NULL; fd = _Py_dup(fd); if (fd < 0) { diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h index 92669bfd8a1..5660ea38e94 100644 --- a/Parser/tokenizer.h +++ b/Parser/tokenizer.h @@ -26,8 +26,8 @@ struct tok_state { char *buf; /* Input buffer, or NULL; malloc'ed if fp != NULL */ char *cur; /* Next character in buffer */ char *inp; /* End of data in buffer */ - char *end; /* End of input buffer if buf != NULL */ - char *start; /* Start of current token if not NULL */ + const char *end; /* End of input buffer if buf != NULL */ + const char *start; /* Start of current token if not NULL */ int done; /* E_OK normally, E_EOF at EOF, otherwise error code */ /* NB If done != E_OK, cur must be == inp!!! */ FILE *fp; /* Rest of input; NULL if tokenizing a string */ @@ -60,8 +60,8 @@ struct tok_state { PyObject *decoding_readline; /* open(...).readline */ PyObject *decoding_buffer; const char* enc; /* Encoding for the current str. */ - const char* str; - const char* input; /* Tokenizer's newline translated copy of the string. */ + char* str; + char* input; /* Tokenizer's newline translated copy of the string. */ int type_comments; /* Whether to look for type comments */ @@ -78,7 +78,7 @@ extern struct tok_state *PyTokenizer_FromUTF8(const char *, int); extern struct tok_state *PyTokenizer_FromFile(FILE *, const char*, const char *, const char *); extern void PyTokenizer_Free(struct tok_state *); -extern int PyTokenizer_Get(struct tok_state *, char **, char **); +extern int PyTokenizer_Get(struct tok_state *, const char **, const char **); #define tok_dump _Py_tok_dump