cpython/Modules/regexmodule.c

696 lines
15 KiB
C
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
XXX support range parameter on search
XXX support mstop parameter on search
*/
/***********************************************************
Copyright (c) 2000, BeOpen.com.
Copyright (c) 1995-2000, Corporation for National Research Initiatives.
Copyright (c) 1990-1995, Stichting Mathematisch Centrum.
All rights reserved.
See the file "Misc/COPYRIGHT" for information on usage and
redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES.
******************************************************************/
/* Regular expression objects */
/* This uses Tatu Ylonen's copyleft-free reimplementation of
GNU regular expressions */
#include "Python.h"
#include <ctype.h>
#include "regexpr.h"
/* Module exception object.  initregex() creates it as "regex.error" and
   stores it here; functions in this file raise it for regex-specific
   failures (bad offsets, compile errors, invalid group references). */
static PyObject *RegexError; /* Exception */
/* Per-instance state of a compiled regex object.
   The PyObject* members are owned references (released in reg_dealloc());
   any of them may be NULL. */
typedef struct {
PyObject_HEAD
struct re_pattern_buffer re_patbuf; /* The compiled expression */
struct re_registers re_regs; /* The registers from the last match */
char re_fastmap[256]; /* Storage for fastmap; re_patbuf.fastmap points here */
PyObject *re_translate; /* String object for translate table, or NULL if none given */
PyObject *re_lastok; /* Subject of the last successful match/search, else NULL */
PyObject *re_groupindex; /* Group name to index dictionary (NULL for plain compile()) */
PyObject *re_givenpat; /* Pattern with symbolic groups */
PyObject *re_realpat; /* Pattern without symbolic groups */
} regexobject;
/* Regex object methods */
/* Destructor for regex objects: releases the compiled pattern buffer,
   drops every owned object reference, then frees the object itself. */
static void
reg_dealloc(regexobject *re)
{
    /* free(NULL) is defined to do nothing, so no guard is required. */
    free(re->re_patbuf.buffer);

    Py_XDECREF(re->re_translate);
    Py_XDECREF(re->re_lastok);
    Py_XDECREF(re->re_groupindex);
    Py_XDECREF(re->re_givenpat);
    Py_XDECREF(re->re_realpat);

    PyObject_Del(re);
}
/* Build the value of the "regs" attribute: a tuple of RE_NREGS
   (start, end) pairs taken from the given register set.
   Registers whose start and end are both -1 share one cached (-1, -1)
   tuple.  Returns a new reference, or NULL with an exception set. */
static PyObject *
makeresult(struct re_registers *regs)
{
    /* Lazily created and then kept for the life of the process. */
    static PyObject *filler = NULL;
    PyObject *result;
    int idx;

    if (filler == NULL) {
        filler = Py_BuildValue("(ii)", -1, -1);
        if (filler == NULL)
            return NULL;
    }

    result = PyTuple_New(RE_NREGS);
    if (result == NULL)
        return NULL;

    for (idx = 0; idx < RE_NREGS; idx++) {
        PyObject *span;
        int first = regs->start[idx];
        int last = regs->end[idx];

        if (first != -1 || last != -1) {
            span = Py_BuildValue("(ii)", first, last);
        }
        else {
            /* Unused register: hand out another reference to the filler. */
            span = filler;
            Py_INCREF(span);
        }

        if (span == NULL || PyTuple_SetItem(result, idx, span) < 0) {
            Py_DECREF(result);
            return NULL;
        }
    }
    return result;
}
/* regex.match(string[, offset]) method.
   Tries to match the compiled pattern at exactly `offset` in the subject.
   Returns the integer produced by _Py_re_match() (the match length, or -1
   when there is no match).  On success the subject is remembered in
   re_lastok so that group() and the "regs"/"last" attributes work.
   Raises RegexError for an out-of-range offset or an internal failure. */
static PyObject *
regobj_match(regexobject *re, PyObject *args)
{
    PyObject *subject;
    char *text;
    int length;
    int start = 0;
    int outcome;

    if (!PyArg_ParseTuple(args, "O|i:match", &subject, &start))
        return NULL;
    if (!PyArg_Parse(subject, "t#", &text, &length))
        return NULL;

    if (!(start >= 0 && start <= length)) {
        PyErr_SetString(RegexError, "match offset out of range");
        return NULL;
    }

    /* Forget the previous subject before attempting a new match. */
    Py_XDECREF(re->re_lastok);
    re->re_lastok = NULL;

    outcome = _Py_re_match(&re->re_patbuf, (unsigned char *)text,
                           length, start, &re->re_regs);

    if (outcome >= 0) {
        Py_INCREF(subject);
        re->re_lastok = subject;
    }
    else if (outcome < -1) {
        /* Serious failure of some sort; if the matcher did not
           set an exception, raise a generic error */
        if (!PyErr_Occurred())
            PyErr_SetString(RegexError, "match failure");
        return NULL;
    }

    return PyInt_FromLong((long)outcome); /* Length of the match or -1 */
}
/* regex.search(string[, offset]) method.
   Searches the subject for the compiled pattern, starting at `offset`.
   Returns the integer produced by _Py_re_search() (the match position, or
   -1 when nothing matches).  On success the subject is remembered in
   re_lastok.  Raises RegexError for an out-of-range offset or an internal
   failure. */
static PyObject *
regobj_search(regexobject *re, PyObject *args)
{
    PyObject *subject;
    char *text;
    int length;
    int start = 0;
    int outcome;

    if (!PyArg_ParseTuple(args, "O|i:search", &subject, &start))
        return NULL;
    if (!PyArg_Parse(subject, "t#:search", &text, &length))
        return NULL;

    if (!(start >= 0 && start <= length)) {
        PyErr_SetString(RegexError, "search offset out of range");
        return NULL;
    }

    /* Forget the previous subject before attempting a new search. */
    Py_XDECREF(re->re_lastok);
    re->re_lastok = NULL;

    /* NB: In Emacs 18.57, the documentation for re_search[_2] and
       the implementation don't match: the documentation states that
       |range| positions are tried, while the code tries |range|+1
       positions.  It seems more productive to believe the code!
       Hence the range passed below is simply (length - start). */
    outcome = _Py_re_search(&re->re_patbuf, (unsigned char *)text,
                            length, start, length - start,
                            &re->re_regs);

    if (outcome >= 0) {
        Py_INCREF(subject);
        re->re_lastok = subject;
    }
    else if (outcome < -1) {
        /* Serious failure of some sort; if the searcher did not
           set an exception, raise a generic error */
        if (!PyErr_Occurred())
            PyErr_SetString(RegexError, "match failure");
        return NULL;
    }

    return PyInt_FromLong((long)outcome); /* Position of the match or -1 */
}
/* Return one group of the last successful match/search.
   `index` is either a string (a symbolic group name, resolved through
   re_groupindex) or an integer register number in [0, RE_NREGS).
   Returns a new string with the group's text, None if that group did not
   take part in the match, or NULL with an exception set.
*/
static PyObject*
group_from_index(regexobject *re, PyObject *index)
{
    int regno, lo, hi;
    char *text;

    if (PyString_Check(index)) {
        /* Translate a group name into its register number.
           PyDict_GetItem() hands back a borrowed reference. */
        PyObject *mapped = NULL;

        if (re->re_groupindex != NULL)
            mapped = PyDict_GetItem(re->re_groupindex, index);
        if (mapped == NULL) {
            PyErr_SetString(RegexError,
                            "group() group name doesn't exist");
            return NULL;
        }
        index = mapped;
    }

    regno = PyInt_AsLong(index);
    if (regno == -1 && PyErr_Occurred())
        return NULL;

    if (!(regno >= 0 && regno < RE_NREGS)) {
        PyErr_SetString(RegexError, "group() index out of range");
        return NULL;
    }
    if (re->re_lastok == NULL) {
        PyErr_SetString(RegexError,
                        "group() only valid after successful match/search");
        return NULL;
    }

    lo = re->re_regs.start[regno];
    hi = re->re_regs.end[regno];
    if (lo < 0 || hi < 0) {
        /* A negative register bound means: no text for this group. */
        Py_INCREF(Py_None);
        return Py_None;
    }

    text = PyString_AsString(re->re_lastok);
    if (text == NULL)
        return NULL;
    return PyString_FromStringAndSize(text + lo, hi - lo);
}
/* regex.group(index, ...) method.
   With one argument, returns what group_from_index() returns for it.
   With several arguments, returns a tuple holding the result for each
   argument in order.  Raises TypeError when called with no arguments. */
static PyObject *
regobj_group(regexobject *re, PyObject *args)
{
    int count = PyTuple_Size(args);
    int pos;
    PyObject *groups;

    if (count < 0)
        return NULL;
    if (count == 0) {
        PyErr_SetString(PyExc_TypeError, "not enough arguments");
        return NULL;
    }

    if (count == 1) {
        /* return value is a single string */
        PyObject *only = PyTuple_GetItem(args, 0);
        return only != NULL ? group_from_index(re, only) : NULL;
    }

    /* return value is a tuple */
    groups = PyTuple_New(count);
    if (groups == NULL)
        return NULL;

    for (pos = 0; pos < count; pos++) {
        PyObject *key = PyTuple_GetItem(args, pos);
        PyObject *value = NULL;

        if (key != NULL)
            value = group_from_index(re, key);
        if (value == NULL || PyTuple_SetItem(groups, pos, value) < 0) {
            Py_DECREF(groups);
            return NULL;
        }
    }
    return groups;
}
/* Method table for regex objects.  regobj_getattr() hands this table to
   Py_FindMethod() for any attribute name that is not one of the data
   attributes it serves itself.
   The third field is the calling-convention flag; the value 1 is
   presumably METH_VARARGS (all three functions take an argument tuple)
   -- TODO confirm against the headers in use. */
static struct PyMethodDef reg_methods[] = {
{"match", (PyCFunction)regobj_match, 1},
{"search", (PyCFunction)regobj_search, 1},
{"group", (PyCFunction)regobj_group, 1},
{NULL, NULL} /* sentinel */
};
/* NULL-terminated list of the data attribute names of a regex object.
   regobj_getattr() returns these names as a list for "__members__";
   each name listed here has a matching strcmp() branch in that function. */
static char* members[] = {
"last", "regs", "translate",
"groupindex", "realpat", "givenpat",
NULL
};
/* Attribute lookup for regex objects.
   Serves the data attributes named in members[]:
     regs        -> tuple built by makeresult(), or None before any
                    successful match/search
     last, translate, groupindex, realpat, givenpat
                 -> the stored object, or None when it is unset
     __members__ -> list of the names in members[]
   Every other name is looked up in reg_methods.
   Returns a new reference, or NULL with an exception set. */
static PyObject *
regobj_getattr(regexobject *re, char *name)
{
    PyObject *attr;

    if (strcmp(name, "regs") == 0) {
        if (re->re_lastok != NULL)
            return makeresult(&re->re_regs);
        Py_INCREF(Py_None);
        return Py_None;
    }

    if (strcmp(name, "__members__") == 0) {
        /* okay, so it's unlikely this list will change that often.
           still, it's easier to change it in just one place.
        */
        int count = 0;
        int k;
        PyObject *names;

        while (members[count])
            count++;
        names = PyList_New(count);
        if (names == NULL)
            return NULL;
        for (k = 0; k < count; k++) {
            PyObject *item = PyString_FromString(members[k]);
            if (item == NULL || PyList_SetItem(names, k, item) < 0) {
                Py_DECREF(names);
                return NULL;
            }
        }
        return names;
    }

    /* Plain object-valued attributes: choose the field, then share the
       "unset means None" handling below. */
    if (strcmp(name, "last") == 0)
        attr = re->re_lastok;
    else if (strcmp(name, "translate") == 0)
        attr = re->re_translate;
    else if (strcmp(name, "groupindex") == 0)
        attr = re->re_groupindex;
    else if (strcmp(name, "realpat") == 0)
        attr = re->re_realpat;
    else if (strcmp(name, "givenpat") == 0)
        attr = re->re_givenpat;
    else
        return Py_FindMethod(reg_methods, (PyObject *)re, name);

    if (attr == NULL)
        attr = Py_None;
    Py_INCREF(attr);
    return attr;
}
/* Type object for compiled regex objects (instances are regexobject).
   Only the deallocation and getattr slots are filled in; the slots
   listed as 0 are unused and the remaining ones are left to default
   zero-initialization. */
static PyTypeObject Regextype = {
PyObject_HEAD_INIT(&PyType_Type)
0, /*ob_size*/
"regex", /*tp_name*/
sizeof(regexobject), /*tp_size*/
0, /*tp_itemsize*/
/* methods */
(destructor)reg_dealloc, /*tp_dealloc*/
0, /*tp_print*/
(getattrfunc)regobj_getattr, /*tp_getattr*/
0, /*tp_setattr*/
0, /*tp_compare*/
0, /*tp_repr*/
};
/* Create a compiled regex object.

   pattern    - pattern text actually compiled (stored as re_realpat)
   translate  - optional 256-byte translation table string, or NULL
   givenpat   - pattern as supplied by the user (stored as re_givenpat)
   groupindex - optional group-name -> index dictionary, or NULL

   reference counting invariants:
   pattern: borrowed
   translate: borrowed
   givenpat: borrowed
   groupindex: transferred (consumed on every path, including failure)

   Returns a new reference, or NULL with an exception set.
*/
static PyObject *
newregexobject(PyObject *pattern, PyObject *translate, PyObject *givenpat, PyObject *groupindex)
{
    regexobject *re;
    char *pat;
    char *error;
    int size;

    /* Failures before the object exists must still release the
       transferred groupindex reference, otherwise it leaks. */
    if (!PyArg_Parse(pattern, "t#", &pat, &size)) {
        Py_XDECREF(groupindex);
        return NULL;
    }
    if (translate != NULL && PyString_Size(translate) != 256) {
        PyErr_SetString(RegexError,
                        "translation table must be 256 bytes");
        Py_XDECREF(groupindex);
        return NULL;
    }
    re = PyObject_New(regexobject, &Regextype);
    if (re == NULL) {
        Py_XDECREF(groupindex);
        return NULL;
    }

    /* PyObject_New() does not clear the new object.  Put every field that
       reg_dealloc() inspects into a valid state before anything below
       can fail, so that Py_DECREF(re) is always safe. */
    re->re_patbuf.buffer = NULL;
    re->re_patbuf.allocated = 0;
    re->re_patbuf.fastmap = (unsigned char *)re->re_fastmap;
    re->re_patbuf.translate = NULL;
    re->re_translate = NULL;
    re->re_lastok = NULL;
    re->re_groupindex = groupindex;   /* ownership now lies with re */
    Py_INCREF(pattern);
    re->re_realpat = pattern;
    Py_INCREF(givenpat);
    re->re_givenpat = givenpat;

    if (translate != NULL) {
        unsigned char *table =
            (unsigned char *)PyString_AsString(translate);
        if (table == NULL)
            goto finally;
        /* The pattern buffer points into the string's storage, so keep
           the string alive for as long as the regex object exists. */
        re->re_patbuf.translate = table;
        Py_INCREF(translate);
        re->re_translate = translate;
    }

    error = _Py_re_compile_pattern((unsigned char *)pat, size, &re->re_patbuf);
    if (error != NULL) {
        PyErr_SetString(RegexError, error);
        goto finally;
    }
    return (PyObject *)re;

finally:
    Py_DECREF(re);
    return NULL;
}
/* regex.compile(pattern[, translate]) module function.
   Both arguments must be string objects.  The pattern is compiled as
   given, so it is used for both the "real" and the "given" pattern and
   no group-name dictionary is attached. */
static PyObject *
regex_compile(PyObject *self, PyObject *args)
{
    PyObject *pattern = NULL;
    PyObject *table = NULL;

    if (PyArg_ParseTuple(args, "S|S:compile", &pattern, &table))
        return newregexobject(pattern, table, pattern, NULL);
    return NULL;
}
/* Strip symbolic group names from a pattern.

   Scans `pattern` for group openers immediately followed by "<name>"
   (name made of alphanumerics and '_').  Each such name is removed from
   the pattern text and recorded in `gdict` as name -> group number.
   Whether the opening parenthesis must be backslash-escaped depends on
   the RE_NO_BK_PARENS bit of the current re_syntax.

   Returns a new reference to the rewritten pattern (or to `pattern`
   itself when it is empty), or NULL with an exception set.  Raises
   RegexError when a group name does not fit the internal name buffer. */
static PyObject *
symcomp(PyObject *pattern, PyObject *gdict)
{
    char *opat, *oend, *o, *n, *g, *v;
    int group_count = 0;
    int sz;
    int escaped = 0;
    char name_buf[128];
    PyObject *npattern;
    int require_escape = re_syntax & RE_NO_BK_PARENS ? 0 : 1;

    if (!(opat = PyString_AsString(pattern)))
        return NULL;
    if ((sz = PyString_Size(pattern)) < 0)
        return NULL;

    oend = opat + sz;
    o = opat;

    if (oend == opat) {
        Py_INCREF(pattern);
        return pattern;
    }

    /* The output is never longer than the input: names are only removed. */
    if (!(npattern = PyString_FromStringAndSize((char*)NULL, sz)))
        return NULL;
    if (!(n = PyString_AsString(npattern))) {
        Py_DECREF(npattern);    /* do not leak the fresh buffer */
        return NULL;
    }
    v = n;                      /* start of the output buffer */

    /* NOTE(review): a backslash always sets `escaped`, even when it is
       itself escaped ("\\\\("); left as in the original -- confirm whether
       it should toggle instead. */
    while (o < oend) {
        if (*o == '(' && escaped == require_escape) {
            char *backtrack;
            escaped = 0;
            ++group_count;
            *n++ = *o;
            if (++o >= oend || *o != '<')
                continue;
            /* *o == '<' */
            if (o+1 < oend && *(o+1) == '>')
                continue;
            backtrack = o;
            g = name_buf;
            for (++o; o < oend;) {
                if (*o == '>') {
                    PyObject *group_name = NULL;
                    PyObject *group_index = NULL;
                    *g++ = '\0';
                    group_name = PyString_FromString(name_buf);
                    group_index = PyInt_FromLong(group_count);
                    if (group_name == NULL ||
                        group_index == NULL ||
                        PyDict_SetItem(gdict, group_name,
                                       group_index) != 0)
                    {
                        Py_XDECREF(group_name);
                        Py_XDECREF(group_index);
                        Py_XDECREF(npattern);
                        return NULL;
                    }
                    Py_DECREF(group_name);
                    Py_DECREF(group_index);
                    ++o; /* eat the '>' */
                    break;
                }
                if (!isalnum(Py_CHARMASK(*o)) && *o != '_') {
                    /* Not a group name after all: copy it verbatim. */
                    o = backtrack;
                    break;
                }
                /* Keep one byte free for the terminating NUL; without
                   this check a long name overruns name_buf. */
                if (g >= name_buf + sizeof(name_buf) - 1) {
                    PyErr_SetString(RegexError,
                                    "symcomp() group name too long");
                    Py_DECREF(npattern);
                    return NULL;
                }
                *g++ = *o++;
            }
        }
        else if (*o == '[' && !escaped) {
            /* Copy a bracket expression up to and including the first
               ']' without interpreting its contents.  Stop at the end
               of the input so the string terminator is never read or
               copied for an unterminated bracket. */
            *n++ = *o++;
            while (o < oend) {
                char c = *o++;
                *n++ = c;
                if (c == ']')
                    break;
            }
        }
        else if (*o == '\\') {
            escaped = 1;
            *n++ = *o;
            ++o;
        }
        else {
            escaped = 0;
            *n++ = *o;
            ++o;
        }
    }

    /* _PyString_Resize() decrements npattern on failure */
    if (_PyString_Resize(&npattern, n - v) == 0)
        return npattern;
    return NULL;
}
/* regex.symcomp(pattern[, translate]) module function.
   Like compile(), but the pattern may contain symbolic group names.
   symcomp() strips the names and fills a name -> index dictionary, which
   becomes the "groupindex" attribute of the resulting regex object.
   Returns a new regex object, or NULL with an exception set. */
static PyObject *
regex_symcomp(PyObject *self, PyObject *args)
{
    PyObject *pattern;
    PyObject *tran = NULL;
    PyObject *gdict = NULL;
    PyObject *npattern;
    PyObject *retval = NULL;

    /* pattern and tran are borrowed from args; never DECREF them here. */
    if (!PyArg_ParseTuple(args, "S|S:symcomp", &pattern, &tran))
        return NULL;

    gdict = PyDict_New();
    if (gdict == NULL)
        return NULL;

    npattern = symcomp(pattern, gdict);
    if (npattern == NULL) {
        Py_DECREF(gdict);
        return NULL;
    }

    /* Our reference to gdict is handed over to newregexobject(), so it
       must not be released here whatever the outcome. */
    retval = newregexobject(npattern, tran, pattern, gdict);
    Py_DECREF(npattern);
    return retval;
}
/* One-entry cache used by the module-level match() and search():
   cache_pat is the most recently compiled pattern object and cache_prog
   the regex object compiled from it.  Both are owned references or NULL. */
static PyObject *cache_pat;
static PyObject *cache_prog;

/* Make cache_prog hold the compiled form of `pat`.
   The cache is keyed on object identity: a different pattern object
   always causes a recompile.  Returns 0 on success, -1 with an exception
   set on failure (the cache is left empty in that case). */
static int
update_cache(PyObject *pat)
{
    PyObject *call_args = Py_BuildValue("(O)", pat);
    int rc = 0;

    if (call_args == NULL)
        return -1;

    if (pat != cache_pat) {
        /* Throw the old entry away before compiling the new one. */
        Py_XDECREF(cache_pat);
        cache_pat = NULL;
        Py_XDECREF(cache_prog);
        cache_prog = regex_compile((PyObject *)NULL, call_args);
        if (cache_prog != NULL) {
            Py_INCREF(pat);
            cache_pat = pat;
        }
        else {
            rc = -1;
        }
    }

    Py_DECREF(call_args);
    return rc;
}
/* regex.match(pattern, string) module function.
   Compiles `pattern` through the one-entry cache and then calls the
   match() method of the cached regex object on `string`. */
static PyObject *
regex_match(PyObject *self, PyObject *args)
{
    PyObject *pat, *string;
    PyObject *call_args;
    PyObject *result;

    if (!PyArg_Parse(args, "(SS)", &pat, &string))
        return NULL;
    if (update_cache(pat) < 0)
        return NULL;

    /* regobj_match() expects an argument tuple. */
    call_args = Py_BuildValue("(S)", string);
    if (call_args == NULL)
        return NULL;

    result = regobj_match((regexobject *)cache_prog, call_args);
    Py_DECREF(call_args);
    return result;
}
/* regex.search(pattern, string) module function.
   Compiles `pattern` through the one-entry cache and then calls the
   search() method of the cached regex object on `string`. */
static PyObject *
regex_search(PyObject *self, PyObject *args)
{
    PyObject *pat, *string;
    PyObject *call_args;
    PyObject *result;

    if (!PyArg_Parse(args, "(SS)", &pat, &string))
        return NULL;
    if (update_cache(pat) < 0)
        return NULL;

    /* regobj_search() expects an argument tuple. */
    call_args = Py_BuildValue("(S)", string);
    if (call_args == NULL)
        return NULL;

    result = regobj_search((regexobject *)cache_prog, call_args);
    Py_DECREF(call_args);
    return result;
}
/* regex.set_syntax(flags) module function.
   Passes the integer to re_set_syntax() and returns that call's result
   as a Python integer.  The one-entry pattern cache is emptied because a
   program compiled under the old syntax may no longer be valid. */
static PyObject *
regex_set_syntax(PyObject *self, PyObject *args)
{
    int requested;
    int result;

    if (!PyArg_Parse(args, "i", &requested))
        return NULL;
    result = re_set_syntax(requested);

    /* wipe the global pattern cache */
    Py_XDECREF(cache_pat);
    cache_pat = NULL;
    Py_XDECREF(cache_prog);
    cache_prog = NULL;

    return PyInt_FromLong((long)result);
}
/* regex.get_syntax() module function.
   Takes no arguments; returns the current value of re_syntax as a
   Python integer. */
static PyObject *
regex_get_syntax(PyObject *self, PyObject *args)
{
    if (PyArg_Parse(args, ""))
        return PyInt_FromLong((long)re_syntax);
    return NULL;
}
/* Module-level function table passed to Py_InitModule() in initregex().
   The third field is the calling-convention flag: compile and symcomp
   use 1 and parse their arguments with PyArg_ParseTuple(); the other
   entries use 0 and parse with PyArg_Parse(). */
static struct PyMethodDef regex_global_methods[] = {
{"compile", regex_compile, 1},
{"symcomp", regex_symcomp, 1},
{"match", regex_match, 0},
{"search", regex_search, 0},
{"set_syntax", regex_set_syntax, 0},
{"get_syntax", regex_get_syntax, 0},
{NULL, NULL} /* sentinel */
};
/* Module initialization for "regex".
   Registers the module functions and adds two names to the module
   dictionary:
     error    - the regex.error exception class (also kept in RegexError)
     casefold - a 256-byte string mapping upper-case characters to lower
                case, usable as a translation table
   On failure the function returns early and leaves any exception
   pending for the caller. */
DL_EXPORT(void)
initregex()
{
    PyObject *m, *d, *v;
    int i;
    char *s;

    m = Py_InitModule("regex", regex_global_methods);
    if (m == NULL)
        return;     /* nothing to populate; do not dereference NULL */
    d = PyModule_GetDict(m);
    if (d == NULL)
        return;

    /* Initialize regex.error exception.  The reference is kept in the
       static RegexError for the lifetime of the module. */
    v = RegexError = PyErr_NewException("regex.error", NULL, NULL);
    if (v == NULL || PyDict_SetItemString(d, "error", v) != 0)
        return;

    /* Initialize regex.casefold constant */
    v = PyString_FromStringAndSize((char *)NULL, 256);
    if (v == NULL)
        return;
    s = PyString_AsString(v);
    if (s == NULL) {
        Py_DECREF(v);
        return;
    }
    for (i = 0; i < 256; i++) {
        if (isupper(i))
            s[i] = (char)tolower(i);
        else
            s[i] = (char)i;
    }

    /* The dictionary takes its own reference on success, so ours is
       released in either case; a failure leaves its exception set. */
    (void)PyDict_SetItemString(d, "casefold", v);
    Py_DECREF(v);
}