#include "Python.h" #include "errcode.h" #include "helpers.h" #include "../lexer/state.h" static int tok_underflow_string(struct tok_state *tok) { char *end = strchr(tok->inp, '\n'); if (end != NULL) { end++; } else { end = strchr(tok->inp, '\0'); if (end == tok->inp) { tok->done = E_EOF; return 0; } } if (tok->start == NULL) { tok->buf = tok->cur; } tok->line_start = tok->cur; ADVANCE_LINENO(); tok->inp = end; return 1; } /* Fetch a byte from TOK, using the string buffer. */ static int buf_getc(struct tok_state *tok) { return Py_CHARMASK(*tok->str++); } /* Unfetch a byte from TOK, using the string buffer. */ static void buf_ungetc(int c, struct tok_state *tok) { tok->str--; assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */ } /* Set the readline function for TOK to ENC. For the string-based tokenizer, this means to just record the encoding. */ static int buf_setreadl(struct tok_state *tok, const char* enc) { tok->enc = enc; return 1; } /* Decode a byte string STR for use as the buffer of TOK. Look for encoding declarations inside STR, and record them inside TOK. */ static char * decode_str(const char *input, int single, struct tok_state *tok, int preserve_crlf) { PyObject* utf8 = NULL; char *str; const char *s; const char *newl[2] = {NULL, NULL}; int lineno = 0; tok->input = str = _PyTokenizer_translate_newlines(input, single, preserve_crlf, tok); if (str == NULL) return NULL; tok->enc = NULL; tok->str = str; if (!_PyTokenizer_check_bom(buf_getc, buf_ungetc, buf_setreadl, tok)) return _PyTokenizer_error_ret(tok); str = tok->str; /* string after BOM if any */ assert(str); if (tok->enc != NULL) { utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc); if (utf8 == NULL) return _PyTokenizer_error_ret(tok); str = PyBytes_AsString(utf8); } for (s = str;; s++) { if (*s == '\0') break; else if (*s == '\n') { assert(lineno < 2); newl[lineno] = s; lineno++; if (lineno == 2) break; } } tok->enc = NULL; /* need to check line 1 and 2 separately since check_coding_spec assumes a single line as input */ if (newl[0]) { if (!_PyTokenizer_check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) { return NULL; } if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) { if (!_PyTokenizer_check_coding_spec(newl[0]+1, newl[1] - newl[0], tok, buf_setreadl)) return NULL; } } if (tok->enc != NULL) { assert(utf8 == NULL); utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc); if (utf8 == NULL) return _PyTokenizer_error_ret(tok); str = PyBytes_AS_STRING(utf8); } assert(tok->decoding_buffer == NULL); tok->decoding_buffer = utf8; /* CAUTION */ return str; } /* Set up tokenizer for string */ struct tok_state * _PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf) { struct tok_state *tok = _PyTokenizer_tok_new(); char *decoded; if (tok == NULL) return NULL; decoded = decode_str(str, exec_input, tok, preserve_crlf); if (decoded == NULL) { _PyTokenizer_Free(tok); return NULL; } tok->buf = tok->cur = tok->inp = decoded; tok->end = decoded; tok->underflow = &tok_underflow_string; return tok; }