2020-07-16 10:07:29 -03:00
|
|
|
#include <stdbool.h>
|
|
|
|
|
2020-04-22 19:29:27 -03:00
|
|
|
#include <Python.h>
|
2023-07-23 17:10:12 -03:00
|
|
|
#include "pycore_bytesobject.h" // _PyBytes_DecodeEscape()
|
2023-07-04 04:29:52 -03:00
|
|
|
#include "pycore_unicodeobject.h" // _PyUnicode_DecodeUnicodeEscapeInternal()
|
2020-04-22 19:29:27 -03:00
|
|
|
|
2023-10-11 12:14:44 -03:00
|
|
|
#include "lexer/state.h"
|
2020-04-22 19:29:27 -03:00
|
|
|
#include "pegen.h"
|
2020-06-11 13:30:46 -03:00
|
|
|
#include "string_parser.h"
|
2020-04-22 19:29:27 -03:00
|
|
|
|
|
|
|
//// STRING HANDLING FUNCTIONS ////
|
|
|
|
|
|
|
|
static int
|
2022-04-30 07:16:27 -03:00
|
|
|
warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token *t)
|
2020-04-22 19:29:27 -03:00
|
|
|
{
|
2023-10-27 00:19:34 -03:00
|
|
|
if (p->call_invalid_rules) {
|
|
|
|
// Do not report warnings if we are in the second pass of the parser
|
|
|
|
// to avoid showing the warning twice.
|
|
|
|
return 0;
|
|
|
|
}
|
2024-09-29 22:13:13 -03:00
|
|
|
unsigned char c = (unsigned char)*first_invalid_escape;
|
2023-10-12 04:34:35 -03:00
|
|
|
if ((t->type == FSTRING_MIDDLE || t->type == FSTRING_END) && (c == '{' || c == '}')) {
|
|
|
|
// in this case the tokenizer has already emitted a warning,
|
|
|
|
// see Parser/tokenizer/helpers.c:warn_invalid_escape_sequence
|
2023-06-20 09:38:46 -03:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2022-04-30 07:16:27 -03:00
|
|
|
int octal = ('4' <= c && c <= '7');
|
2020-04-22 19:29:27 -03:00
|
|
|
PyObject *msg =
|
2022-04-30 07:16:27 -03:00
|
|
|
octal
|
|
|
|
? PyUnicode_FromFormat("invalid octal escape sequence '\\%.3s'",
|
|
|
|
first_invalid_escape)
|
|
|
|
: PyUnicode_FromFormat("invalid escape sequence '\\%c'", c);
|
2020-04-22 19:29:27 -03:00
|
|
|
if (msg == NULL) {
|
|
|
|
return -1;
|
|
|
|
}
|
2022-11-03 13:53:25 -03:00
|
|
|
PyObject *category;
|
|
|
|
if (p->feature_version >= 12) {
|
|
|
|
category = PyExc_SyntaxWarning;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
category = PyExc_DeprecationWarning;
|
|
|
|
}
|
|
|
|
if (PyErr_WarnExplicitObject(category, msg, p->tok->filename,
|
2020-05-07 07:37:51 -03:00
|
|
|
t->lineno, NULL, NULL) < 0) {
|
2022-11-03 13:53:25 -03:00
|
|
|
if (PyErr_ExceptionMatches(category)) {
|
2023-06-20 09:38:46 -03:00
|
|
|
/* Replace the Syntax/DeprecationWarning exception with a SyntaxError
|
2020-04-22 19:29:27 -03:00
|
|
|
to get a more accurate error report */
|
|
|
|
PyErr_Clear();
|
2020-05-07 07:37:51 -03:00
|
|
|
|
|
|
|
/* This is needed, in order for the SyntaxError to point to the token t,
|
|
|
|
since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
|
|
|
|
error location, if p->known_err_token is not set. */
|
|
|
|
p->known_err_token = t;
|
2022-04-30 07:16:27 -03:00
|
|
|
if (octal) {
|
|
|
|
RAISE_SYNTAX_ERROR("invalid octal escape sequence '\\%.3s'",
|
|
|
|
first_invalid_escape);
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", c);
|
|
|
|
}
|
2020-04-22 19:29:27 -03:00
|
|
|
}
|
|
|
|
Py_DECREF(msg);
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
Py_DECREF(msg);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Decode the leading run of bytes that have the high bit set.

   Scanning starts at *sPtr and stops at the first byte without the high
   bit, or at end, whichever comes first.  *sPtr is advanced past the run
   and the run is handed to PyUnicode_DecodeUTF8, whose result is returned
   unchanged. */
static PyObject *
decode_utf8(const char **sPtr, const char *end)
{
    const char *start = *sPtr;
    const char *cursor = start;

    for (; cursor < end && (*cursor & 0x80); cursor++) {
        /* skip over every byte with the high bit set */
    }

    *sPtr = cursor;
    return PyUnicode_DecodeUTF8(start, cursor - start, NULL);
}
|
|
|
|
|
|
|
|
static PyObject *
|
2020-05-07 07:37:51 -03:00
|
|
|
decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
|
2020-04-22 19:29:27 -03:00
|
|
|
{
|
2020-06-15 10:23:43 -03:00
|
|
|
PyObject *v;
|
|
|
|
PyObject *u;
|
2020-04-22 19:29:27 -03:00
|
|
|
char *buf;
|
|
|
|
char *p;
|
|
|
|
const char *end;
|
|
|
|
|
|
|
|
/* check for integer overflow */
|
2024-09-29 22:13:13 -03:00
|
|
|
if (len > (size_t)PY_SSIZE_T_MAX / 6) {
|
2020-04-22 19:29:27 -03:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
/* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
|
|
|
|
"\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
|
2024-09-29 22:13:13 -03:00
|
|
|
u = PyBytes_FromStringAndSize((char *)NULL, (Py_ssize_t)len * 6);
|
2020-04-22 19:29:27 -03:00
|
|
|
if (u == NULL) {
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
p = buf = PyBytes_AsString(u);
|
2020-11-18 11:38:53 -04:00
|
|
|
if (p == NULL) {
|
|
|
|
return NULL;
|
|
|
|
}
|
2020-04-22 19:29:27 -03:00
|
|
|
end = s + len;
|
|
|
|
while (s < end) {
|
|
|
|
if (*s == '\\') {
|
|
|
|
*p++ = *s++;
|
|
|
|
if (s >= end || *s & 0x80) {
|
|
|
|
strcpy(p, "u005c");
|
|
|
|
p += 5;
|
|
|
|
if (s >= end) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (*s & 0x80) {
|
|
|
|
PyObject *w;
|
|
|
|
int kind;
|
2021-06-12 10:11:59 -03:00
|
|
|
const void *data;
|
2020-06-15 10:23:43 -03:00
|
|
|
Py_ssize_t w_len;
|
|
|
|
Py_ssize_t i;
|
2020-04-22 19:29:27 -03:00
|
|
|
w = decode_utf8(&s, end);
|
|
|
|
if (w == NULL) {
|
|
|
|
Py_DECREF(u);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
kind = PyUnicode_KIND(w);
|
|
|
|
data = PyUnicode_DATA(w);
|
2020-06-15 10:23:43 -03:00
|
|
|
w_len = PyUnicode_GET_LENGTH(w);
|
|
|
|
for (i = 0; i < w_len; i++) {
|
2020-04-22 19:29:27 -03:00
|
|
|
Py_UCS4 chr = PyUnicode_READ(kind, data, i);
|
|
|
|
sprintf(p, "\\U%08x", chr);
|
|
|
|
p += 10;
|
|
|
|
}
|
|
|
|
/* Should be impossible to overflow */
|
|
|
|
assert(p - buf <= PyBytes_GET_SIZE(u));
|
|
|
|
Py_DECREF(w);
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
*p++ = *s++;
|
|
|
|
}
|
|
|
|
}
|
2024-09-29 22:13:13 -03:00
|
|
|
len = (size_t)(p - buf);
|
2020-04-22 19:29:27 -03:00
|
|
|
s = buf;
|
|
|
|
|
|
|
|
const char *first_invalid_escape;
|
2024-09-29 22:13:13 -03:00
|
|
|
v = _PyUnicode_DecodeUnicodeEscapeInternal(s, (Py_ssize_t)len, NULL, NULL, &first_invalid_escape);
|
2020-04-22 19:29:27 -03:00
|
|
|
|
2023-04-19 13:18:16 -03:00
|
|
|
// HACK: later we can simply pass the line no, since we don't preserve the tokens
|
|
|
|
// when we are decoding the string but we preserve the line numbers.
|
|
|
|
if (v != NULL && first_invalid_escape != NULL && t != NULL) {
|
2022-04-30 07:16:27 -03:00
|
|
|
if (warn_invalid_escape_sequence(parser, first_invalid_escape, t) < 0) {
|
2020-04-22 19:29:27 -03:00
|
|
|
/* We have not decref u before because first_invalid_escape points
|
|
|
|
inside u. */
|
|
|
|
Py_XDECREF(u);
|
|
|
|
Py_DECREF(v);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Py_XDECREF(u);
|
|
|
|
return v;
|
|
|
|
}
|
|
|
|
|
|
|
|
static PyObject *
|
2020-05-07 07:37:51 -03:00
|
|
|
decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
|
2020-04-22 19:29:27 -03:00
|
|
|
{
|
|
|
|
const char *first_invalid_escape;
|
|
|
|
PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
|
|
|
|
if (result == NULL) {
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (first_invalid_escape != NULL) {
|
2022-04-30 07:16:27 -03:00
|
|
|
if (warn_invalid_escape_sequence(p, first_invalid_escape, t) < 0) {
|
2020-04-22 19:29:27 -03:00
|
|
|
Py_DECREF(result);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2023-04-19 13:18:16 -03:00
|
|
|
PyObject *
|
|
|
|
_PyPegen_decode_string(Parser *p, int raw, const char *s, size_t len, Token *t)
|
|
|
|
{
|
|
|
|
if (raw) {
|
2024-09-29 22:13:13 -03:00
|
|
|
return PyUnicode_DecodeUTF8Stateful(s, (Py_ssize_t)len, NULL, NULL);
|
2023-04-19 13:18:16 -03:00
|
|
|
}
|
|
|
|
return decode_unicode_with_escapes(p, s, len, t);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* s must include the bracketing quote characters, and r, b &/or f prefixes
|
|
|
|
(if any), and embedded escape sequences (if any). (f-strings are handled by the parser)
|
|
|
|
_PyPegen_parse_string parses it, and returns the decoded Python string object. */
|
|
|
|
PyObject *
|
|
|
|
_PyPegen_parse_string(Parser *p, Token *t)
|
2020-04-22 19:29:27 -03:00
|
|
|
{
|
2020-05-07 07:37:51 -03:00
|
|
|
const char *s = PyBytes_AsString(t->bytes);
|
|
|
|
if (s == NULL) {
|
2023-04-19 13:18:16 -03:00
|
|
|
return NULL;
|
2020-05-07 07:37:51 -03:00
|
|
|
}
|
|
|
|
|
2020-04-22 19:29:27 -03:00
|
|
|
size_t len;
|
|
|
|
int quote = Py_CHARMASK(*s);
|
2023-04-19 13:18:16 -03:00
|
|
|
int bytesmode = 0;
|
|
|
|
int rawmode = 0;
|
|
|
|
|
2020-04-22 19:29:27 -03:00
|
|
|
if (Py_ISALPHA(quote)) {
|
2023-04-19 13:18:16 -03:00
|
|
|
while (!bytesmode || !rawmode) {
|
2020-04-22 19:29:27 -03:00
|
|
|
if (quote == 'b' || quote == 'B') {
|
2020-06-15 10:23:43 -03:00
|
|
|
quote =(unsigned char)*++s;
|
2023-04-19 13:18:16 -03:00
|
|
|
bytesmode = 1;
|
2020-04-22 19:29:27 -03:00
|
|
|
}
|
|
|
|
else if (quote == 'u' || quote == 'U') {
|
2020-06-15 10:23:43 -03:00
|
|
|
quote = (unsigned char)*++s;
|
2020-04-22 19:29:27 -03:00
|
|
|
}
|
|
|
|
else if (quote == 'r' || quote == 'R') {
|
2020-06-15 10:23:43 -03:00
|
|
|
quote = (unsigned char)*++s;
|
2023-04-19 13:18:16 -03:00
|
|
|
rawmode = 1;
|
2020-04-22 19:29:27 -03:00
|
|
|
}
|
|
|
|
else {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (quote != '\'' && quote != '\"') {
|
|
|
|
PyErr_BadInternalCall();
|
2023-04-19 13:18:16 -03:00
|
|
|
return NULL;
|
2020-04-22 19:29:27 -03:00
|
|
|
}
|
2024-06-25 14:40:05 -03:00
|
|
|
|
2020-04-22 19:29:27 -03:00
|
|
|
/* Skip the leading quote char. */
|
|
|
|
s++;
|
|
|
|
len = strlen(s);
|
2024-06-25 14:40:05 -03:00
|
|
|
// gh-120155: 's' contains at least the trailing quote,
|
|
|
|
// so the code '--len' below is safe.
|
|
|
|
assert(len >= 1);
|
|
|
|
|
2020-04-22 19:29:27 -03:00
|
|
|
if (len > INT_MAX) {
|
|
|
|
PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
|
2023-04-19 13:18:16 -03:00
|
|
|
return NULL;
|
2020-04-22 19:29:27 -03:00
|
|
|
}
|
|
|
|
if (s[--len] != quote) {
|
|
|
|
/* Last quote char must match the first. */
|
|
|
|
PyErr_BadInternalCall();
|
2023-04-19 13:18:16 -03:00
|
|
|
return NULL;
|
2020-04-22 19:29:27 -03:00
|
|
|
}
|
|
|
|
if (len >= 4 && s[0] == quote && s[1] == quote) {
|
|
|
|
/* A triple quoted string. We've already skipped one quote at
|
|
|
|
the start and one at the end of the string. Now skip the
|
|
|
|
two at the start. */
|
|
|
|
s += 2;
|
|
|
|
len -= 2;
|
|
|
|
/* And check that the last two match. */
|
|
|
|
if (s[--len] != quote || s[--len] != quote) {
|
|
|
|
PyErr_BadInternalCall();
|
2023-04-19 13:18:16 -03:00
|
|
|
return NULL;
|
2020-04-22 19:29:27 -03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Avoid invoking escape decoding routines if possible. */
|
2023-04-19 13:18:16 -03:00
|
|
|
rawmode = rawmode || strchr(s, '\\') == NULL;
|
|
|
|
if (bytesmode) {
|
2020-04-22 19:29:27 -03:00
|
|
|
/* Disallow non-ASCII characters. */
|
|
|
|
const char *ch;
|
|
|
|
for (ch = s; *ch; ch++) {
|
|
|
|
if (Py_CHARMASK(*ch) >= 0x80) {
|
2023-04-22 21:08:27 -03:00
|
|
|
RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
|
|
|
|
t,
|
2020-04-22 19:29:27 -03:00
|
|
|
"bytes can only contain ASCII "
|
2021-01-23 18:56:57 -04:00
|
|
|
"literal characters");
|
2023-04-19 13:18:16 -03:00
|
|
|
return NULL;
|
2020-06-15 10:23:43 -03:00
|
|
|
}
|
2020-04-22 19:29:27 -03:00
|
|
|
}
|
2023-04-19 13:18:16 -03:00
|
|
|
if (rawmode) {
|
2024-09-29 22:13:13 -03:00
|
|
|
return PyBytes_FromStringAndSize(s, (Py_ssize_t)len);
|
2020-06-15 10:23:43 -03:00
|
|
|
}
|
2024-09-29 22:13:13 -03:00
|
|
|
return decode_bytes_with_escapes(p, s, (Py_ssize_t)len, t);
|
2020-04-22 19:29:27 -03:00
|
|
|
}
|
2023-04-19 13:18:16 -03:00
|
|
|
return _PyPegen_decode_string(p, rawmode, s, len, t);
|
2020-04-22 19:29:27 -03:00
|
|
|
}
|