cpython/Parser/tokenizer.h

#ifndef Py_TOKENIZER_H
#define Py_TOKENIZER_H
#ifdef __cplusplus
extern "C" {
#endif

#include "object.h"

/* Tokenizer interface */

#include "token.h"      /* For token types */

/* Compile-time capacities of the fixed-size arrays in struct tok_state. */
#define MAXINDENT 100   /* Max indentation level; sizes indstack[] and altindstack[] */
#define MAXLEVEL 200    /* Max parentheses level; sizes parenstack[] and parenlinenostack[] */

/* Values held by the decoding_state field of struct tok_state (part of
   the PEP 0263 bookkeeping).  The numeric values are spelled out but are
   the same ones the compiler would assign implicitly.  No trailing comma
   after the last enumerator, for the benefit of strict compilers. */
enum decoding_state {
    STATE_INIT = 0,
    STATE_RAW = 1,
    STATE_NORMAL = 2    /* have a codec associated with input */
};

/* Tokenizer state.

   Holds the input buffer pointers, the indentation and bracket-nesting
   bookkeeping, and the PEP 0263 source-encoding state for one input
   source.  The PyTokenizer_From*() functions declared in this header
   return a pointer to one of these.  Members inside "#ifndef PGEN" are
   compiled out when PGEN is defined, so the struct layout differs between
   the two builds. */
struct tok_state {
    /* Input state; buf <= cur <= inp <= end */
    /* NB an entire line is held in the buffer */
    char *buf;          /* Input buffer, or NULL; malloc'ed if fp != NULL */
    char *cur;          /* Next character in buffer */
    char *inp;          /* End of data in buffer */
    char *end;          /* End of input buffer if buf != NULL */
    char *start;        /* Start of current token if not NULL */
    int done;           /* E_OK normally, E_EOF at EOF, otherwise error code */
    /* NB If done != E_OK, cur must be == inp!!! */
    FILE *fp;           /* Rest of input; NULL if tokenizing a string */
    int tabsize;        /* Tab spacing */
    int indent;         /* Current indentation index */
    int indstack[MAXINDENT];            /* Stack of indents */
    int atbol;          /* Nonzero if at begin of new line */
    int pendin;         /* Pending indents (if > 0) or dedents (if < 0) */
    const char *prompt, *nextprompt;          /* For interactive prompting */
    int lineno;         /* Current line number */
    int level;          /* () [] {} Parentheses nesting level */
            /* Used to allow free continuations inside them */
#ifndef PGEN
    /* Two parallel arrays with one slot per nesting level (capacity
       MAXLEVEL).  NOTE(review): presumably the opening bracket character
       and the line it was seen on, kept for error reporting on unbalanced
       brackets -- confirm against the tokenizer implementation. */
    char parenstack[MAXLEVEL];
    int parenlinenostack[MAXLEVEL];
    /* pgen doesn't have access to Python codecs, it cannot decode the input
       filename. The bytes filename might be kept, but it is only used by
       indenterror() and it is not really needed: pgen only compiles one file
       (Grammar/Grammar). */
    PyObject *filename;
#endif
    /* Stuff for checking on different tab sizes */
    int altindstack[MAXINDENT];         /* Stack of alternate indents */
    /* Stuff for PEP 0263 */
    enum decoding_state decoding_state; /* One of the STATE_* values */
    int decoding_erred;         /* whether erred in decoding  */
    int read_coding_spec;       /* whether 'coding:...' has been read  */
    char *encoding;         /* Source encoding. */
    int cont_line;          /* whether we are in a continuation line. */
    const char* line_start;     /* pointer to start of current line */
#ifndef PGEN
    /* Python objects, so unavailable to pgen (see the note on filename). */
    PyObject *decoding_readline; /* open(...).readline */
    PyObject *decoding_buffer;   /* NOTE(review): purpose not stated in this
                                    header -- see the tokenizer sources */
#endif
    const char* enc;        /* Encoding for the current str. */
    const char* str;        /* The string that enc refers to; NOTE(review):
                               presumably the string being tokenized --
                               confirm */
    const char* input; /* Tokenizer's newline translated copy of the string. */
};

/* Tokenizer entry points.

   Each PyTokenizer_From*() function returns a pointer to a struct
   tok_state.  Judging by the names, the state is built from an in-memory
   string, a UTF-8 string, or a FILE stream respectively.
   NOTE(review): the parameters are unnamed in this header; consult the
   definitions for the meaning of each argument and for what is returned
   on failure. */
extern struct tok_state *PyTokenizer_FromString(const char *, int);
extern struct tok_state *PyTokenizer_FromUTF8(const char *, int);
extern struct tok_state *PyTokenizer_FromFile(FILE *, const char*,
                                              const char *, const char *);
/* Takes a tokenizer state; presumably releases it and what it owns --
   confirm the ownership rules against the definition. */
extern void PyTokenizer_Free(struct tok_state *);
/* Takes a tokenizer state and two char ** arguments and returns an int.
   NOTE(review): presumably the int is a token type from token.h and the
   two pointers receive the start and end of the token text -- TODO
   confirm against the definition. */
extern int PyTokenizer_Get(struct tok_state *, char **, char **);

#ifdef __cplusplus
}
#endif
#endif /* !Py_TOKENIZER_H */
* Added support for X11 modules. * Makefile: change location of FORMS library. * posixmodule.c: turn #if 0 into #ifdef MSDOS (stuff in unistd.h or not) * Almost all .h files: added CPP magic to avoid duplicate inclusions and to support inclusion from C++. 1993-07-28 06:05:47 -03:00			`#ifndef Py_TOKENIZER_H`
			`#define Py_TOKENIZER_H`
			`#ifdef __cplusplus`
			`extern "C" {`
			`#endif`

Patch #534304: Implement phase 1 of PEP 263. 2002-08-04 14:29:52 -03:00			`#include "object.h"`
Added copyright notice. 1991-02-19 08:39:46 -04:00
Initial revision 1990-10-14 09:07:46 -03:00			`/* Tokenizer interface */`

Recorded merge of revisions 81029 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r81029 \| antoine.pitrou \| 2010-05-09 16:46:46 +0200 (dim., 09 mai 2010) \| 3 lines Untabify C files. Will watch buildbots. ........ 2010-05-09 12:52:27 -03:00			`#include "token.h" /* For token types */`
Initial revision 1990-10-14 09:07:46 -03:00
Recorded merge of revisions 81029 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r81029 \| antoine.pitrou \| 2010-05-09 16:46:46 +0200 (dim., 09 mai 2010) \| 3 lines Untabify C files. Will watch buildbots. ........ 2010-05-09 12:52:27 -03:00			`#define MAXINDENT 100 /* Max indentation level */`
bpo-33306: Improve SyntaxError messages for unbalanced parentheses. (GH-6516) 2018-12-17 11:34:14 -04:00			`#define MAXLEVEL 200 /* Max parentheses level */`
Initial revision 1990-10-14 09:07:46 -03:00
Use an enum for decoding_state. It makes the code a little more understandable. 2007-09-21 17:50:26 -03:00			`enum decoding_state {`
Recorded merge of revisions 81029 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r81029 \| antoine.pitrou \| 2010-05-09 16:46:46 +0200 (dim., 09 mai 2010) \| 3 lines Untabify C files. Will watch buildbots. ........ 2010-05-09 12:52:27 -03:00			`STATE_INIT,`
			`STATE_RAW,`
#10222: fix for overzealous AIX compiler. 2010-10-29 01:54:13 -03:00			`STATE_NORMAL /* have a codec associated with input */`
Use an enum for decoding_state. It makes the code a little more understandable. 2007-09-21 17:50:26 -03:00			`};`

Initial revision 1990-10-14 09:07:46 -03:00			`/* Tokenizer state */`
			`struct tok_state {`
Recorded merge of revisions 81029 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r81029 \| antoine.pitrou \| 2010-05-09 16:46:46 +0200 (dim., 09 mai 2010) \| 3 lines Untabify C files. Will watch buildbots. ........ 2010-05-09 12:52:27 -03:00			`/* Input state; buf <= cur <= inp <= end */`
			`/* NB an entire line is held in the buffer */`
			`char *buf; /* Input buffer, or NULL; malloc'ed if fp != NULL */`
			`char *cur; /* Next character in buffer */`
			`char *inp; /* End of data in buffer */`
			`char *end; /* End of input buffer if buf != NULL */`
			`char *start; /* Start of current token if not NULL */`
			`int done; /* E_OK normally, E_EOF at EOF, otherwise error code */`
			`/* NB If done != E_OK, cur must be == inp!!! */`
			`FILE *fp; /* Rest of input; NULL if tokenizing a string */`
			`int tabsize; /* Tab spacing */`
			`int indent; /* Current indentation index */`
			`int indstack[MAXINDENT]; /* Stack of indents */`
			`int atbol; /* Nonzero if at begin of new line */`
			`int pendin; /* Pending indents (if > 0) or dedents (if < 0) */`
Issue #1772673: The type of `char*` arguments now changed to `const char*`. 2013-10-19 15:03:34 -03:00			`const char *prompt, *nextprompt; /* For interactive prompting */`
Recorded merge of revisions 81029 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r81029 \| antoine.pitrou \| 2010-05-09 16:46:46 +0200 (dim., 09 mai 2010) \| 3 lines Untabify C files. Will watch buildbots. ........ 2010-05-09 12:52:27 -03:00			`int lineno; /* Current line number */`
			`int level; /* () [] {} Parentheses nesting level */`
			`/* Used to allow free continuations inside them */`
Issue #10785: Store the filename as Unicode in the Python parser. 2011-04-04 19:39:01 -03:00			`#ifndef PGEN`
bpo-33306: Improve SyntaxError messages for unbalanced parentheses. (GH-6516) 2018-12-17 11:34:14 -04:00			`char parenstack[MAXLEVEL];`
			`int parenlinenostack[MAXLEVEL];`
Issue #10785: Store the filename as Unicode in the Python parser. 2011-04-04 19:39:01 -03:00			`/* pgen doesn't have access to Python codecs, it cannot decode the input`
			`filename. The bytes filename might be kept, but it is only used by`
			`indenterror() and it is not really needed: pgen only compiles one file`
			`(Grammar/Grammar). */`
			`PyObject *filename;`
			`#endif`
bpo-33306: Improve SyntaxError messages for unbalanced parentheses. (GH-6516) 2018-12-17 11:34:14 -04:00			`/* Stuff for checking on different tab sizes */`
Recorded merge of revisions 81029 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r81029 \| antoine.pitrou \| 2010-05-09 16:46:46 +0200 (dim., 09 mai 2010) \| 3 lines Untabify C files. Will watch buildbots. ........ 2010-05-09 12:52:27 -03:00			`int altindstack[MAXINDENT]; /* Stack of alternate indents */`
			`/* Stuff for PEP 0263 */`
			`enum decoding_state decoding_state;`
			`int decoding_erred; /* whether erred in decoding */`
			`int read_coding_spec; /* whether 'coding:...' has been read */`
			`char *encoding; /* Source encoding. */`
			`int cont_line; /* whether we are in a continuation line. */`
			`const char* line_start; /* pointer to start of current line */`
Make pgen compile with pydebug. Duplicate normalized names, as it may be longer than the old string. 2002-08-04 17:10:29 -03:00			`#ifndef PGEN`
Issue #10095: fp_setreadl() doesn't reopen the file, reuse instead the file descriptor. 2010-10-14 09:04:34 -03:00			`PyObject *decoding_readline; /* open(...).readline */`
Recorded merge of revisions 81029 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r81029 \| antoine.pitrou \| 2010-05-09 16:46:46 +0200 (dim., 09 mai 2010) \| 3 lines Untabify C files. Will watch buildbots. ........ 2010-05-09 12:52:27 -03:00			`PyObject *decoding_buffer;`
Make pgen compile with pydebug. Duplicate normalized names, as it may be longer than the old string. 2002-08-04 17:10:29 -03:00			`#endif`
Recorded merge of revisions 81029 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r81029 \| antoine.pitrou \| 2010-05-09 16:46:46 +0200 (dim., 09 mai 2010) \| 3 lines Untabify C files. Will watch buildbots. ........ 2010-05-09 12:52:27 -03:00			`const char* enc; /* Encoding for the current str. */`
			`const char* str;`
			`const char* input; /* Tokenizer's newline translated copy of the string. */`
Initial revision 1990-10-14 09:07:46 -03:00			`};`

Merged revisions 76230 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r76230 \| benjamin.peterson \| 2009-11-12 17:39:44 -0600 (Thu, 12 Nov 2009) \| 2 lines fix several compile() issues by translating newlines in the tokenizer ........ 2009-11-12 20:17:59 -04:00			`extern struct tok_state *PyTokenizer_FromString(const char *, int);`
			`extern struct tok_state *PyTokenizer_FromUTF8(const char *, int);`
Issue #1772673: The type of `char*` arguments now changed to `const char*`. 2013-10-19 15:03:34 -03:00			`extern struct tok_state *PyTokenizer_FromFile(FILE *, const char*,`
			`const char *, const char *);`
Nuke all remaining occurrences of Py_PROTO and Py_FPROTO. 2000-07-09 00:09:57 -03:00			`extern void PyTokenizer_Free(struct tok_state *);`
			`extern int PyTokenizer_Get(struct tok_state *, char **, char **);`
* Added support for X11 modules. * Makefile: change location of FORMS library. * posixmodule.c: turn #if 0 into #ifdef MSDOS (stuff in unistd.h or not) * Almost all .h files: added CPP magic to avoid duplicate inclusions and to support inclusion from C++. 1993-07-28 06:05:47 -03:00
			`#ifdef __cplusplus`
			`}`
			`#endif`
			`#endif /* !Py_TOKENIZER_H */`