cpython/Parser/tokenizer.c

491 lines
9.3 KiB
C

/* Tokenizer implementation */
/* XXX This is rather old, should be restructured perhaps */
/* XXX Need a better interface to report errors than writing to stderr */
#include <stdio.h>
#include <ctype.h>
#include "string.h"
#include "PROTO.h"
#include "malloc.h"
#include "tokenizer.h"
#include "errcode.h"
/* Default tab stop width, copied into tok->tabsize by tok_new() and used
   by tok_get() to turn leading tabs into an indentation column.
   THINK_C builds use 4; otherwise 8 is used unless TABSIZE has already
   been defined. */
#ifdef THINK_C
#define TABSIZE 4
#endif
#ifndef TABSIZE
#define TABSIZE 8
#endif
/* Printable token names.  tok_dump() indexes this table with a token
   type, so the order of the entries is significant; the number in each
   trailing comment is the index of the first name on that line. */
char *tok_name[] = {
	"ENDMARKER", "NAME", "NUMBER", "STRING",	/*  0 */
	"NEWLINE", "INDENT", "DEDENT",			/*  4 */
	"LPAR", "RPAR", "LSQB", "RSQB",			/*  7 */
	"COLON", "COMMA", "SEMI",			/* 11 */
	"PLUS", "MINUS", "STAR", "SLASH",		/* 14 */
	"VBAR", "AMPER",				/* 18 */
	"LESS", "GREATER", "EQUAL",			/* 20 */
	"DOT", "PERCENT", "BACKQUOTE",			/* 23 */
	"LBRACE", "RBRACE",				/* 26 */
	"OP",						/* 28 */
	"<ERRORTOKEN>",					/* 29 */
	"<N_TOKENS>"					/* 30 */
};
/* Allocate a tok_state and put it in its initial condition: no buffer,
   no input file, no prompts, line number 0, positioned at the beginning
   of a line, with an indentation stack that holds only column 0.
   Returns NULL when the allocation fails. */
static struct tok_state *
tok_new()
{
	struct tok_state *ts;

	ts = NEW(struct tok_state, 1);
	if (ts != NULL) {
		/* Buffer pointers: nothing to read from yet */
		ts->inp = NULL;
		ts->end = NULL;
		ts->cur = NULL;
		ts->buf = NULL;

		/* Input source and interactive prompts */
		ts->fp = NULL;
		ts->nextprompt = NULL;
		ts->prompt = NULL;

		/* Scanner status */
		ts->done = E_OK;
		ts->lineno = 0;
		ts->atbol = 1;

		/* Indentation tracking */
		ts->tabsize = TABSIZE;
		ts->indent = 0;
		ts->indstack[0] = 0;
		ts->pendin = 0;
	}
	return ts;
}
/* Set up a tokenizer that reads from the NUL-terminated string str.
   The string is scanned in place (no copy is made), so it must stay
   valid for as long as the tokenizer is in use.
   Returns NULL if the tokenizer state cannot be allocated. */
struct tok_state *
tok_setups(str)
	char *str;
{
	struct tok_state *ts;
	char *terminator;

	ts = tok_new();
	if (ts != NULL) {
		/* The whole string is available at once: both the
		   "end of data" and "end of buffer" marks sit on
		   the terminating NUL. */
		terminator = strchr(str, '\0');
		ts->buf = str;
		ts->cur = str;
		ts->inp = terminator;
		ts->end = terminator;
	}
	return ts;
}
/* Set up a tokenizer that reads from the open file fp.
   A line buffer of BUFSIZ characters is allocated for it.  ps1 is the
   prompt shown before the first line is read and ps2 the prompt used
   after that; a NULL prompt means no prompting.
   Returns NULL if either allocation fails. */
struct tok_state *
tok_setupf(fp, ps1, ps2)
	FILE *fp;
	char *ps1, *ps2;
{
	struct tok_state *ts;
	char *linebuf;

	ts = tok_new();
	if (ts == NULL)
		return NULL;

	linebuf = NEW(char, BUFSIZ);
	ts->buf = linebuf;
	if (linebuf == NULL) {
		/* Don't leave a half-built state behind */
		DEL(ts);
		return NULL;
	}

	/* The buffer starts out empty: nothing read, nothing consumed */
	ts->inp = linebuf;
	ts->cur = linebuf;
	ts->end = linebuf + BUFSIZ;

	ts->fp = fp;
	ts->prompt = ps1;
	ts->nextprompt = ps2;
	return ts;
}
/* Release a tok_state structure.
   The buffer is released too, but only when the tokenizer reads from a
   file: in that case the buffer was allocated by the tokenizer, whereas
   a string tokenizer scans the caller's string in place. */
void
tok_free(tok)
	struct tok_state *tok;
{
	/* XXX really need a separate flag to say 'my buffer' */
	int reads_file = (tok->fp != NULL);

	if (reads_file && tok->buf != NULL) {
		DEL(tok->buf);
	}
	DEL(tok);
}
/* Get next char, updating state; error code goes into tok->done.

   Returns the next input character as a non-negative int (the byte is
   converted through unsigned char), or EOF when the input is exhausted
   or an error occurred.  Once tok->done is no longer E_OK every further
   call returns EOF.

   When the buffered data has been used up and the tokenizer reads from
   a file, another line is fetched:
   - if the buffered data ends in a newline, the buffer is reused from
     its start; otherwise the new data is appended to the partial line;
   - a full buffer is doubled in size first;
   - with USE_READLINE and a prompt set, the line comes from readline()
     and replaces the buffer altogether;
   - otherwise the prompt (if any) is written to stderr and the line is
     read with fgets_intr(), whose result is stored in tok->done. */
static int
tok_nextc(tok)
	register struct tok_state *tok;
{
	if (tok->done != E_OK)
		return EOF;
	for (;;) {
		if (tok->cur < tok->inp) {
			/* Convert through unsigned char: with a signed
			   plain char the byte 0xFF would compare equal
			   to EOF, and any byte >= 0x80 would reach the
			   <ctype.h> tests in tok_get() as a negative
			   value, which is undefined behavior. */
			return (unsigned char) *tok->cur++;
		}
		if (tok->fp == NULL) {
			/* String input: nothing more to fetch */
			tok->done = E_EOF;
			return EOF;
		}
		/* A complete line has been consumed: reuse the buffer */
		if (tok->inp > tok->buf && tok->inp[-1] == '\n')
			tok->inp = tok->buf;
		if (tok->inp == tok->end) {
			/* Buffer full (line longer than buffer): double it */
			int n = tok->end - tok->buf;
			char *new = tok->buf;
			RESIZE(new, char, n+n);
			if (new == NULL) {
				fprintf(stderr, "tokenizer out of mem\n");
				tok->done = E_NOMEM;
				return EOF;
			}
			tok->buf = new;
			tok->inp = tok->buf + n;
			tok->end = tok->inp + n;
		}
#ifdef USE_READLINE
		if (tok->prompt != NULL) {
			extern char *readline PROTO((char *prompt));
			static int been_here;
			if (!been_here) {
				/* Force rebind of TAB to insert-tab */
				extern int rl_insert();
				rl_bind_key('\t', rl_insert);
				been_here++;
			}
			/* readline() hands back a fresh buffer each time */
			if (tok->buf != NULL)
				free(tok->buf);
			tok->buf = readline(tok->prompt);
			(void) intrcheck(); /* Clear pending interrupt */
			if (tok->nextprompt != NULL)
				tok->prompt = tok->nextprompt;
			/* XXX different semantics w/o readline()! */
			if (tok->buf == NULL) {
				tok->done = E_EOF;
			}
			else {
				unsigned int n = strlen(tok->buf);
				char *grown;
				if (n > 0)
					add_history(tok->buf);
				/* Append the '\n' that readline()
				   doesn't give us, for the tokenizer... */
				grown = realloc(tok->buf, n+2);
				if (grown == NULL) {
					/* A failed realloc() leaves the
					   original block allocated;
					   release it instead of leaking
					   it by overwriting the only
					   pointer to it. */
					free(tok->buf);
					tok->buf = NULL;
					tok->done = E_NOMEM;
				}
				else {
					tok->buf = grown;
					tok->end = tok->buf + n;
					*tok->end++ = '\n';
					*tok->end = '\0';
					tok->inp = tok->end;
					tok->cur = tok->buf;
				}
			}
		}
		else
#endif
		{
			tok->cur = tok->inp;
			/* Prompt only at the start of a fresh line, not
			   when continuing a partially read one */
			if (tok->prompt != NULL && tok->inp == tok->buf) {
				fprintf(stderr, "%s", tok->prompt);
				tok->prompt = tok->nextprompt;
			}
			tok->done = fgets_intr(tok->inp,
				(int)(tok->end - tok->inp), tok->fp);
		}
		if (tok->done != E_OK) {
			if (tok->prompt != NULL)
				fprintf(stderr, "\n");
			return EOF;
		}
		/* Advance the end-of-data mark past what was just read */
		tok->inp = strchr(tok->inp, '\0');
	}
}
/* Push the character c back onto the input by stepping the read
   position back one place.  EOF is never pushed back.  If the buffer
   holds a different character at that place it is overwritten with c.
   Backing up past the start of the buffer is a fatal internal error. */
static void
tok_backup(tok, c)
	register struct tok_state *tok;
	register int c;
{
	if (c == EOF)
		return;

	tok->cur--;
	if (tok->cur < tok->buf) {
		fprintf(stderr, "tok_backup: begin of buffer\n");
		abort();
	}

	/* Only write when needed, so an unchanged buffer is not touched */
	if (*tok->cur != c) {
		*tok->cur = c;
	}
}
/* Map a single punctuation character to its token type.
   Characters that have no token type of their own yield OP. */
int
tok_1char(c)
	int c;
{
	int type = OP;	/* Result for any character not listed below */

	switch (c) {
	/* Brackets */
	case '(': type = LPAR; break;
	case ')': type = RPAR; break;
	case '[': type = LSQB; break;
	case ']': type = RSQB; break;
	case '{': type = LBRACE; break;
	case '}': type = RBRACE; break;
	/* Separators */
	case ':': type = COLON; break;
	case ',': type = COMMA; break;
	case ';': type = SEMI; break;
	case '.': type = DOT; break;
	case '`': type = BACKQUOTE; break;
	/* Arithmetic and bitwise operators */
	case '+': type = PLUS; break;
	case '-': type = MINUS; break;
	case '*': type = STAR; break;
	case '/': type = SLASH; break;
	case '%': type = PERCENT; break;
	case '|': type = VBAR; break;
	case '&': type = AMPER; break;
	/* Comparison and assignment */
	case '<': type = LESS; break;
	case '>': type = GREATER; break;
	case '=': type = EQUAL; break;
	default:
		break;
	}
	return type;
}
/* Get next token, after space stripping etc.

   Returns the token type.  *p_start and *p_end are set to point to the
   start and one past the end of the token text inside the tokenizer's
   buffer; for NEWLINE the '\n' itself is left out.  They are NOT set
   when an indentation error ("excessive indent" or "inconsistent
   dedent") is reported.

   At the beginning of a line the indentation column is computed first
   (a tab advances to the next multiple of tok->tabsize) and compared
   with the top of the indentation stack; the resulting INDENT/DEDENT
   tokens are handed out one per call before anything else on the line.

   ENDMARKER is returned at end of input.  ERRORTOKEN is returned on
   errors, with the error code left in tok->done. */
int
tok_get(tok, p_start, p_end)
	register struct tok_state *tok; /* In/out: tokenizer state */
	char **p_start, **p_end; /* Out: point to start/end of token */
{
	register int c;

	/* Get indentation level */
	if (tok->atbol) {
		register int col = 0;
		tok->atbol = 0;
		tok->lineno++;
		for (;;) {
			c = tok_nextc(tok);
			if (c == ' ')
				col++;
			else if (c == '\t')
				col = (col/tok->tabsize + 1) * tok->tabsize;
			else
				break;
		}
		tok_backup(tok, c);
		if (col == tok->indstack[tok->indent]) {
			/* No change */
		}
		else if (col > tok->indstack[tok->indent]) {
			/* Indent -- always one */
			if (tok->indent+1 >= MAXINDENT) {
				fprintf(stderr, "excessive indent\n");
				tok->done = E_TOKEN;
				return ERRORTOKEN;
			}
			tok->pendin++;
			tok->indstack[++tok->indent] = col;
		}
		else /* col < tok->indstack[tok->indent] */ {
			/* Dedent -- any number, must be consistent */
			while (tok->indent > 0 &&
				col < tok->indstack[tok->indent]) {
				tok->indent--;
				tok->pendin--;
			}
			if (col != tok->indstack[tok->indent]) {
				fprintf(stderr, "inconsistent dedent\n");
				tok->done = E_TOKEN;
				return ERRORTOKEN;
			}
		}
	}

	*p_start = *p_end = tok->cur;

	/* Return pending indents/dedents */
	if (tok->pendin != 0) {
		if (tok->pendin < 0) {
			tok->pendin++;
			return DEDENT;
		}
		else {
			tok->pendin--;
			return INDENT;
		}
	}

 again:
	/* Skip spaces */
	do {
		c = tok_nextc(tok);
	} while (c == ' ' || c == '\t');

	/* Set start of current token */
	*p_start = tok->cur - 1;

	/* Skip comment */
	if (c == '#') {
		/* Hack to allow overriding the tabsize in the file.
		   This is also recognized by vi, when it occurs near the
		   beginning or end of the file.  (Will vi never die...?) */
		int x;
		if (sscanf(tok->cur, " vi:set tabsize=%d:", &x) == 1 &&
						x >= 1 && x <= 40) {
			fprintf(stderr, "# vi:set tabsize=%d:\n", x);
			tok->tabsize = x;
		}
		do {
			c = tok_nextc(tok);
		} while (c != EOF && c != '\n');
	}

	/* Check for EOF and errors now */
	if (c == EOF)
		return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;

	/* Identifier (most frequent token!) */
	if (isalpha(c) || c == '_') {
		do {
			c = tok_nextc(tok);
		} while (isalnum(c) || c == '_');
		tok_backup(tok, c);
		*p_end = tok->cur;
		return NAME;
	}

	/* Newline */
	if (c == '\n') {
		tok->atbol = 1;
		*p_end = tok->cur - 1; /* Leave '\n' out of the string */
		return NEWLINE;
	}

	/* Number */
	if (isdigit(c)) {
		if (c == '0') {
			/* Hex or octal */
			c = tok_nextc(tok);
			if (c == '.')
				goto fraction;
			if (c == 'x' || c == 'X') {
				/* Hex */
				do {
					c = tok_nextc(tok);
				} while (isxdigit(c));
			}
			else {
				/* Octal; c is first char of it */
				/* There's no 'isoctdigit' macro, sigh */
				while ('0' <= c && c < '8') {
					c = tok_nextc(tok);
				}
			}
		}
		else {
			/* Decimal */
			do {
				c = tok_nextc(tok);
			} while (isdigit(c));
			/* Accept floating point numbers.
			   XXX This accepts incomplete things like 12e or 1e+;
				worry about that at run-time.
			   XXX Doesn't accept numbers starting with a dot */
			if (c == '.') {
	fraction:
				/* Fraction */
				do {
					c = tok_nextc(tok);
				} while (isdigit(c));
			}
			if (c == 'e' || c == 'E') {
				/* Exponent part */
				c = tok_nextc(tok);
				if (c == '+' || c == '-')
					c = tok_nextc(tok);
				while (isdigit(c)) {
					c = tok_nextc(tok);
				}
			}
		}
		tok_backup(tok, c);
		*p_end = tok->cur;
		return NUMBER;
	}

	/* String */
	if (c == '\'') {
		for (;;) {
			c = tok_nextc(tok);
			if (c == '\n' || c == EOF) {
				/* Unterminated string */
				tok->done = E_TOKEN;
				return ERRORTOKEN;
			}
			if (c == '\\') {
				/* Skip the escaped character, so that an
				   escaped quote does not end the string */
				c = tok_nextc(tok);
				*p_end = tok->cur;
				if (c == '\n' || c == EOF) {
					tok->done = E_TOKEN;
					return ERRORTOKEN;
				}
				continue;
			}
			if (c == '\'')
				break;
		}
		*p_end = tok->cur;
		return STRING;
	}

	/* Line continuation */
	if (c == '\\') {
		c = tok_nextc(tok);
		if (c != '\n') {
			tok->done = E_TOKEN;
			return ERRORTOKEN;
		}
		/* The continuation swallowed a physical line without
		   going through the beginning-of-line code above, so
		   count it here; otherwise every later line number
		   would be one too low per continuation. */
		tok->lineno++;
		goto again; /* Read next line */
	}

	/* Punctuation character */
	*p_end = tok->cur;
	return tok_1char(c);
}
#ifdef DEBUG
/* Debugging aid: print the name of token type `type' on stdout.  For
   NAME, NUMBER, STRING and OP tokens the token text, delimited by
   start and end, follows in parentheses.  No newline is printed. */
void
tok_dump(type, start, end)
	int type;
	char *start, *end;
{
	int show_text;

	printf("%s", tok_name[type]);

	show_text = (type == NAME || type == NUMBER ||
		     type == STRING || type == OP);
	if (!show_text)
		return;

	/* The text is not NUL-terminated at end; print by length */
	printf("(%.*s)", (int)(end - start), start);
}
#endif