595 lines
14 KiB
C
595 lines
14 KiB
C
/*
|
|
XXX support range parameter on search
|
|
XXX support mstop parameter on search
|
|
*/
|
|
|
|
/***********************************************************
|
|
Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam,
|
|
The Netherlands.
|
|
|
|
All Rights Reserved
|
|
|
|
Permission to use, copy, modify, and distribute this software and its
|
|
documentation for any purpose and without fee is hereby granted,
|
|
provided that the above copyright notice appear in all copies and that
|
|
both that copyright notice and this permission notice appear in
|
|
supporting documentation, and that the names of Stichting Mathematisch
|
|
Centrum or CWI not be used in advertising or publicity pertaining to
|
|
distribution of the software without specific, written prior permission.
|
|
|
|
STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO
|
|
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
|
FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE
|
|
FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
|
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
|
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
|
|
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
|
|
|
******************************************************************/
|
|
|
|
/* Regular expression objects */
|
|
/* This uses Tatu Ylonen's copyleft-free reimplementation of
|
|
GNU regular expressions */
|
|
|
|
#include "Python.h"
|
|
|
|
#include "regexpr.h"
|
|
|
|
static PyObject *RegexError; /* Exception */
|
|
|
|
typedef struct {
|
|
PyObject_HEAD
|
|
struct re_pattern_buffer re_patbuf; /* The compiled expression */
|
|
struct re_registers re_regs; /* The registers from the last match */
|
|
char re_fastmap[256]; /* Storage for fastmap */
|
|
PyObject *re_translate; /* String object for translate table */
|
|
PyObject *re_lastok; /* String object last matched/searched */
|
|
PyObject *re_groupindex; /* Group name to index dictionary */
|
|
PyObject *re_givenpat; /* Pattern with symbolic groups */
|
|
PyObject *re_realpat; /* Pattern without symbolic groups */
|
|
} regexobject;
|
|
|
|
/* Regex object methods */
|
|
|
|
static void
|
|
reg_dealloc(re)
|
|
regexobject *re;
|
|
{
|
|
PyMem_XDEL(re->re_patbuf.buffer);
|
|
Py_XDECREF(re->re_translate);
|
|
Py_XDECREF(re->re_lastok);
|
|
Py_XDECREF(re->re_groupindex);
|
|
Py_XDECREF(re->re_givenpat);
|
|
Py_XDECREF(re->re_realpat);
|
|
PyMem_DEL(re);
|
|
}
|
|
|
|
static PyObject *
|
|
makeresult(regs)
|
|
struct re_registers *regs;
|
|
{
|
|
PyObject *v = PyTuple_New(RE_NREGS);
|
|
if (v != NULL) {
|
|
int i;
|
|
for (i = 0; i < RE_NREGS; i++) {
|
|
PyObject *w;
|
|
w = Py_BuildValue("(ii)", regs->start[i], regs->end[i]);
|
|
if (w == NULL) {
|
|
Py_XDECREF(v);
|
|
v = NULL;
|
|
break;
|
|
}
|
|
PyTuple_SetItem(v, i, w);
|
|
}
|
|
}
|
|
return v;
|
|
}
|
|
|
|
static PyObject *
|
|
reg_match(re, args)
|
|
regexobject *re;
|
|
PyObject *args;
|
|
{
|
|
PyObject *argstring;
|
|
char *buffer;
|
|
int size;
|
|
int offset;
|
|
int result;
|
|
if (PyArg_Parse(args, "S", &argstring)) {
|
|
offset = 0;
|
|
}
|
|
else {
|
|
PyErr_Clear();
|
|
if (!PyArg_Parse(args, "(Si)", &argstring, &offset))
|
|
return NULL;
|
|
}
|
|
buffer = PyString_AsString(argstring);
|
|
size = PyString_Size(argstring);
|
|
if (offset < 0 || offset > size) {
|
|
PyErr_SetString(RegexError, "match offset out of range");
|
|
return NULL;
|
|
}
|
|
Py_XDECREF(re->re_lastok);
|
|
re->re_lastok = NULL;
|
|
result = re_match(&re->re_patbuf, buffer, size, offset, &re->re_regs);
|
|
if (result < -1) {
|
|
/* Failure like stack overflow */
|
|
PyErr_SetString(RegexError, "match failure");
|
|
return NULL;
|
|
}
|
|
if (result >= 0) {
|
|
Py_INCREF(argstring);
|
|
re->re_lastok = argstring;
|
|
}
|
|
return PyInt_FromLong((long)result); /* Length of the match or -1 */
|
|
}
|
|
|
|
static PyObject *
|
|
reg_search(re, args)
|
|
regexobject *re;
|
|
PyObject *args;
|
|
{
|
|
PyObject *argstring;
|
|
char *buffer;
|
|
int size;
|
|
int offset;
|
|
int range;
|
|
int result;
|
|
|
|
if (PyArg_Parse(args, "S", &argstring)) {
|
|
offset = 0;
|
|
}
|
|
else {
|
|
PyErr_Clear();
|
|
if (!PyArg_Parse(args, "(Si)", &argstring, &offset))
|
|
return NULL;
|
|
}
|
|
buffer = PyString_AsString(argstring);
|
|
size = PyString_Size(argstring);
|
|
if (offset < 0 || offset > size) {
|
|
PyErr_SetString(RegexError, "search offset out of range");
|
|
return NULL;
|
|
}
|
|
/* NB: In Emacs 18.57, the documentation for re_search[_2] and
|
|
the implementation don't match: the documentation states that
|
|
|range| positions are tried, while the code tries |range|+1
|
|
positions. It seems more productive to believe the code! */
|
|
range = size - offset;
|
|
Py_XDECREF(re->re_lastok);
|
|
re->re_lastok = NULL;
|
|
result = re_search(&re->re_patbuf, buffer, size, offset, range,
|
|
&re->re_regs);
|
|
if (result < -1) {
|
|
/* Failure like stack overflow */
|
|
PyErr_SetString(RegexError, "match failure");
|
|
return NULL;
|
|
}
|
|
if (result >= 0) {
|
|
Py_INCREF(argstring);
|
|
re->re_lastok = argstring;
|
|
}
|
|
return PyInt_FromLong((long)result); /* Position of the match or -1 */
|
|
}
|
|
|
|
static PyObject *
|
|
reg_group(re, args)
|
|
regexobject *re;
|
|
PyObject *args;
|
|
{
|
|
int i, a, b;
|
|
if (args != NULL && PyTuple_Check(args)) {
|
|
int n = PyTuple_Size(args);
|
|
PyObject *res = PyTuple_New(n);
|
|
if (res == NULL)
|
|
return NULL;
|
|
for (i = 0; i < n; i++) {
|
|
PyObject *v = reg_group(re, PyTuple_GetItem(args, i));
|
|
if (v == NULL) {
|
|
Py_DECREF(res);
|
|
return NULL;
|
|
}
|
|
PyTuple_SetItem(res, i, v);
|
|
}
|
|
return res;
|
|
}
|
|
if (!PyArg_Parse(args, "i", &i)) {
|
|
PyObject *n;
|
|
PyErr_Clear();
|
|
if (!PyArg_Parse(args, "S", &n))
|
|
return NULL;
|
|
else {
|
|
PyObject *index;
|
|
if (re->re_groupindex == NULL)
|
|
index = NULL;
|
|
else
|
|
index = PyDict_GetItem(re->re_groupindex, n);
|
|
if (index == NULL) {
|
|
PyErr_SetString(RegexError, "group() group name doesn't exist");
|
|
return NULL;
|
|
}
|
|
i = PyInt_AsLong(index);
|
|
}
|
|
}
|
|
if (i < 0 || i >= RE_NREGS) {
|
|
PyErr_SetString(RegexError, "group() index out of range");
|
|
return NULL;
|
|
}
|
|
if (re->re_lastok == NULL) {
|
|
PyErr_SetString(RegexError,
|
|
"group() only valid after successful match/search");
|
|
return NULL;
|
|
}
|
|
a = re->re_regs.start[i];
|
|
b = re->re_regs.end[i];
|
|
if (a < 0 || b < 0) {
|
|
Py_INCREF(Py_None);
|
|
return Py_None;
|
|
}
|
|
return PyString_FromStringAndSize(PyString_AsString(re->re_lastok)+a, b-a);
|
|
}
|
|
|
|
static struct PyMethodDef reg_methods[] = {
|
|
{"match", (PyCFunction)reg_match},
|
|
{"search", (PyCFunction)reg_search},
|
|
{"group", (PyCFunction)reg_group},
|
|
{NULL, NULL} /* sentinel */
|
|
};
|
|
|
|
static PyObject *
|
|
reg_getattr(re, name)
|
|
regexobject *re;
|
|
char *name;
|
|
{
|
|
if (strcmp(name, "regs") == 0) {
|
|
if (re->re_lastok == NULL) {
|
|
Py_INCREF(Py_None);
|
|
return Py_None;
|
|
}
|
|
return makeresult(&re->re_regs);
|
|
}
|
|
if (strcmp(name, "last") == 0) {
|
|
if (re->re_lastok == NULL) {
|
|
Py_INCREF(Py_None);
|
|
return Py_None;
|
|
}
|
|
Py_INCREF(re->re_lastok);
|
|
return re->re_lastok;
|
|
}
|
|
if (strcmp(name, "translate") == 0) {
|
|
if (re->re_translate == NULL) {
|
|
Py_INCREF(Py_None);
|
|
return Py_None;
|
|
}
|
|
Py_INCREF(re->re_translate);
|
|
return re->re_translate;
|
|
}
|
|
if (strcmp(name, "groupindex") == 0) {
|
|
if (re->re_groupindex == NULL) {
|
|
Py_INCREF(Py_None);
|
|
return Py_None;
|
|
}
|
|
Py_INCREF(re->re_groupindex);
|
|
return re->re_groupindex;
|
|
}
|
|
if (strcmp(name, "realpat") == 0) {
|
|
if (re->re_realpat == NULL) {
|
|
Py_INCREF(Py_None);
|
|
return Py_None;
|
|
}
|
|
Py_INCREF(re->re_realpat);
|
|
return re->re_realpat;
|
|
}
|
|
if (strcmp(name, "givenpat") == 0) {
|
|
if (re->re_givenpat == NULL) {
|
|
Py_INCREF(Py_None);
|
|
return Py_None;
|
|
}
|
|
Py_INCREF(re->re_givenpat);
|
|
return re->re_givenpat;
|
|
}
|
|
if (strcmp(name, "__members__") == 0) {
|
|
PyObject *list = PyList_New(6);
|
|
if (list) {
|
|
PyList_SetItem(list, 0, PyString_FromString("last"));
|
|
PyList_SetItem(list, 1, PyString_FromString("regs"));
|
|
PyList_SetItem(list, 2, PyString_FromString("translate"));
|
|
PyList_SetItem(list, 3, PyString_FromString("groupindex"));
|
|
PyList_SetItem(list, 4, PyString_FromString("realpat"));
|
|
PyList_SetItem(list, 5, PyString_FromString("givenpat"));
|
|
if (PyErr_Occurred()) {
|
|
Py_DECREF(list);
|
|
list = NULL;
|
|
}
|
|
}
|
|
return list;
|
|
}
|
|
return Py_FindMethod(reg_methods, (PyObject *)re, name);
|
|
}
|
|
|
|
static PyTypeObject Regextype = {
|
|
PyObject_HEAD_INIT(&PyType_Type)
|
|
0, /*ob_size*/
|
|
"regex", /*tp_name*/
|
|
sizeof(regexobject), /*tp_size*/
|
|
0, /*tp_itemsize*/
|
|
/* methods */
|
|
(destructor)reg_dealloc, /*tp_dealloc*/
|
|
0, /*tp_print*/
|
|
(getattrfunc)reg_getattr, /*tp_getattr*/
|
|
0, /*tp_setattr*/
|
|
0, /*tp_compare*/
|
|
0, /*tp_repr*/
|
|
};
|
|
|
|
static PyObject *
|
|
newregexobject(pattern, translate, givenpat, groupindex)
|
|
PyObject *pattern;
|
|
PyObject *translate;
|
|
PyObject *givenpat;
|
|
PyObject *groupindex;
|
|
{
|
|
regexobject *re;
|
|
char *pat = PyString_AsString(pattern);
|
|
int size = PyString_Size(pattern);
|
|
|
|
if (translate != NULL && PyString_Size(translate) != 256) {
|
|
PyErr_SetString(RegexError,
|
|
"translation table must be 256 bytes");
|
|
return NULL;
|
|
}
|
|
re = PyObject_NEW(regexobject, &Regextype);
|
|
if (re != NULL) {
|
|
char *error;
|
|
re->re_patbuf.buffer = NULL;
|
|
re->re_patbuf.allocated = 0;
|
|
re->re_patbuf.fastmap = re->re_fastmap;
|
|
if (translate)
|
|
re->re_patbuf.translate = PyString_AsString(translate);
|
|
else
|
|
re->re_patbuf.translate = NULL;
|
|
Py_XINCREF(translate);
|
|
re->re_translate = translate;
|
|
re->re_lastok = NULL;
|
|
re->re_groupindex = groupindex;
|
|
Py_INCREF(pattern);
|
|
re->re_realpat = pattern;
|
|
Py_INCREF(givenpat);
|
|
re->re_givenpat = givenpat;
|
|
error = re_compile_pattern(pat, size, &re->re_patbuf);
|
|
if (error != NULL) {
|
|
PyErr_SetString(RegexError, error);
|
|
Py_DECREF(re);
|
|
re = NULL;
|
|
}
|
|
}
|
|
return (PyObject *)re;
|
|
}
|
|
|
|
static PyObject *
|
|
regex_compile(self, args)
|
|
PyObject *self;
|
|
PyObject *args;
|
|
{
|
|
PyObject *pat = NULL;
|
|
PyObject *tran = NULL;
|
|
if (!PyArg_Parse(args, "S", &pat)) {
|
|
PyErr_Clear();
|
|
if (!PyArg_Parse(args, "(SS)", &pat, &tran))
|
|
return NULL;
|
|
}
|
|
return newregexobject(pat, tran, pat, NULL);
|
|
}
|
|
|
|
static PyObject *
|
|
symcomp(pattern, gdict)
|
|
PyObject *pattern;
|
|
PyObject *gdict;
|
|
{
|
|
char *opat = PyString_AsString(pattern);
|
|
char *oend = opat + PyString_Size(pattern);
|
|
int group_count = 0;
|
|
int escaped = 0;
|
|
char *o = opat;
|
|
char *n;
|
|
char name_buf[128];
|
|
char *g;
|
|
PyObject *npattern;
|
|
int require_escape = re_syntax & RE_NO_BK_PARENS ? 0 : 1;
|
|
|
|
if (oend == opat) {
|
|
Py_INCREF(pattern);
|
|
return pattern;
|
|
}
|
|
|
|
npattern = PyString_FromStringAndSize((char*)NULL, PyString_Size(pattern));
|
|
if (npattern == NULL)
|
|
return NULL;
|
|
n = PyString_AsString(npattern);
|
|
|
|
while (o < oend) {
|
|
if (*o == '(' && escaped == require_escape) {
|
|
char *backtrack;
|
|
escaped = 0;
|
|
++group_count;
|
|
*n++ = *o;
|
|
if (++o >= oend || *o != '<')
|
|
continue;
|
|
/* *o == '<' */
|
|
if (o+1 < oend && *(o+1) == '>')
|
|
continue;
|
|
backtrack = o;
|
|
g = name_buf;
|
|
for (++o; o < oend;) {
|
|
if (*o == '>') {
|
|
PyObject *group_name = NULL;
|
|
PyObject *group_index = NULL;
|
|
*g++ = '\0';
|
|
group_name = PyString_FromString(name_buf);
|
|
group_index = PyInt_FromLong(group_count);
|
|
if (group_name == NULL || group_index == NULL
|
|
|| PyDict_SetItem(gdict, group_name, group_index) != 0) {
|
|
Py_XDECREF(group_name);
|
|
Py_XDECREF(group_index);
|
|
Py_XDECREF(npattern);
|
|
return NULL;
|
|
}
|
|
++o; /* eat the '>' */
|
|
break;
|
|
}
|
|
if (!isalnum(Py_CHARMASK(*o)) && *o != '_') {
|
|
o = backtrack;
|
|
break;
|
|
}
|
|
*g++ = *o++;
|
|
}
|
|
}
|
|
if (*o == '[' && !escaped) {
|
|
*n++ = *o;
|
|
++o; /* eat the char following '[' */
|
|
*n++ = *o;
|
|
while (o < oend && *o != ']') {
|
|
++o;
|
|
*n++ = *o;
|
|
}
|
|
if (o < oend)
|
|
++o;
|
|
}
|
|
else if (*o == '\\') {
|
|
escaped = 1;
|
|
*n++ = *o;
|
|
++o;
|
|
}
|
|
else {
|
|
escaped = 0;
|
|
*n++ = *o;
|
|
++o;
|
|
}
|
|
}
|
|
|
|
if (_PyString_Resize(&npattern, n - PyString_AsString(npattern)) == 0)
|
|
return npattern;
|
|
else {
|
|
return NULL;
|
|
}
|
|
|
|
}
|
|
|
|
static PyObject *
|
|
regex_symcomp(self, args)
|
|
PyObject *self;
|
|
PyObject *args;
|
|
{
|
|
PyObject *pattern;
|
|
PyObject *tran = NULL;
|
|
PyObject *gdict = NULL;
|
|
PyObject *npattern;
|
|
if (!PyArg_Parse(args, "S", &pattern)) {
|
|
PyErr_Clear();
|
|
if (!PyArg_Parse(args, "(SS)", &pattern, &tran))
|
|
return NULL;
|
|
}
|
|
gdict = PyDict_New();
|
|
if (gdict == NULL
|
|
|| (npattern = symcomp(pattern, gdict)) == NULL) {
|
|
Py_DECREF(gdict);
|
|
Py_DECREF(pattern);
|
|
return NULL;
|
|
}
|
|
return newregexobject(npattern, tran, pattern, gdict);
|
|
}
|
|
|
|
|
|
static PyObject *cache_pat;
|
|
static PyObject *cache_prog;
|
|
|
|
static int
|
|
update_cache(pat)
|
|
PyObject *pat;
|
|
{
|
|
if (pat != cache_pat) {
|
|
Py_XDECREF(cache_pat);
|
|
cache_pat = NULL;
|
|
Py_XDECREF(cache_prog);
|
|
cache_prog = regex_compile((PyObject *)NULL, pat);
|
|
if (cache_prog == NULL)
|
|
return -1;
|
|
cache_pat = pat;
|
|
Py_INCREF(cache_pat);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
static PyObject *
|
|
regex_match(self, args)
|
|
PyObject *self;
|
|
PyObject *args;
|
|
{
|
|
PyObject *pat, *string;
|
|
if (!PyArg_Parse(args, "(SS)", &pat, &string))
|
|
return NULL;
|
|
if (update_cache(pat) < 0)
|
|
return NULL;
|
|
return reg_match((regexobject *)cache_prog, string);
|
|
}
|
|
|
|
static PyObject *
|
|
regex_search(self, args)
|
|
PyObject *self;
|
|
PyObject *args;
|
|
{
|
|
PyObject *pat, *string;
|
|
if (!PyArg_Parse(args, "(SS)", &pat, &string))
|
|
return NULL;
|
|
if (update_cache(pat) < 0)
|
|
return NULL;
|
|
return reg_search((regexobject *)cache_prog, string);
|
|
}
|
|
|
|
static PyObject *
|
|
regex_set_syntax(self, args)
|
|
PyObject *self, *args;
|
|
{
|
|
int syntax;
|
|
if (!PyArg_Parse(args, "i", &syntax))
|
|
return NULL;
|
|
syntax = re_set_syntax(syntax);
|
|
return PyInt_FromLong((long)syntax);
|
|
}
|
|
|
|
static struct PyMethodDef regex_global_methods[] = {
|
|
{"compile", regex_compile, 0},
|
|
{"symcomp", regex_symcomp, 0},
|
|
{"match", regex_match, 0},
|
|
{"search", regex_search, 0},
|
|
{"set_syntax", regex_set_syntax, 0},
|
|
{NULL, NULL} /* sentinel */
|
|
};
|
|
|
|
initregex()
|
|
{
|
|
PyObject *m, *d, *v;
|
|
|
|
m = Py_InitModule("regex", regex_global_methods);
|
|
d = PyModule_GetDict(m);
|
|
|
|
/* Initialize regex.error exception */
|
|
RegexError = PyString_FromString("regex.error");
|
|
if (RegexError == NULL || PyDict_SetItemString(d, "error", RegexError) != 0)
|
|
Py_FatalError("can't define regex.error");
|
|
|
|
/* Initialize regex.casefold constant */
|
|
v = PyString_FromStringAndSize((char *)NULL, 256);
|
|
if (v != NULL) {
|
|
int i;
|
|
char *s = PyString_AsString(v);
|
|
for (i = 0; i < 256; i++) {
|
|
if (isupper(i))
|
|
s[i] = tolower(i);
|
|
else
|
|
s[i] = i;
|
|
}
|
|
PyDict_SetItemString(d, "casefold", v);
|
|
Py_DECREF(v);
|
|
}
|
|
}
|