New "re" regular expression support.

This code is written by Philip Hazel and Andrew Kuchling. It requires a new "re.py" module, too.
1997-10-06 14:43:11 +00:00 · 1997-10-06 14:43:11 +00:00 · 51b3aa3d38
parent 04ac894189
commit 51b3aa3d38
4 changed files with 5135 additions and 0 deletions
--- a/Modules/pcre-internal.h
+++ b/Modules/pcre-internal.h
@ -0,0 +1,226 @@
+/*************************************************
+*      Perl-Compatible Regular Expressions       *
+*************************************************/
+
+
+#define PCRE_VERSION       "0.95 23-Sep-1997"
+
+
+/* This is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language. See
+the file Tech.Notes for some information on the internals.
+
+Written by: Philip Hazel <ph10@cam.ac.uk>
+
+           Copyright (c) 1997 University of Cambridge
+
+-----------------------------------------------------------------------------
+Permission is granted to anyone to use this software for any purpose on any
+computer system, and to redistribute it freely, subject to the following
+restrictions:
+
+1. This software is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+2. The origin of this software must not be misrepresented, either by
+   explicit claim or by omission.
+
+3. Altered versions must be plainly marked as such, and must not be
+   misrepresented as being the original software.
+-----------------------------------------------------------------------------
+*/
+
+/* This header contains definitions that are shared between the different
+modules, but which are not relevant to the outside. */
+
+/* Standard C headers plus the external interface definition */
+
+#include <ctype.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pcre.h"
+
+/* Private options flags start at the most significant end of the byte. The
+public options defined in pcre.h start at the least significant end. Make sure
+they don't overlap! */
+
+#define PCRE_FIRSTSET  0x80          /* first_char is set */
+#define PCRE_STARTLINE 0x40          /* start after \n for multiline */
+
+/* Options for the "extra" block produced by pcre_study(). */
+
+#define PCRE_STUDY_CASELESS 0x01     /* study was caseless */
+#define PCRE_STUDY_MAPPED   0x20     /* a map of starting chars exists */
+
+/* Masks for identifying the public options: all permitted at compile time,
+only some permitted at run or study time. */
+
+#ifdef FOR_PYTHON
+#define PUBLIC_OPTIONS \
+  (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE|PCRE_DOTALL)
+#else
+#define PUBLIC_OPTIONS \
+  (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE)
+#endif
+#define PUBLIC_EXEC_OPTIONS (PCRE_CASELESS|PCRE_ANCHORED|PCRE_MULTILINE)
+#define PUBLIC_STUDY_OPTIONS (PCRE_CASELESS)
+
+/* Magic number to provide a small check against being handed junk. */
+
+#define MAGIC_NUMBER  0x50435245   /* 'PCRE' */
+
+/* Miscellaneous definitions */
+
+typedef int BOOL;
+
+#define FALSE   0
+#define TRUE    1
+
+/* Flags for character classes - see also class_ops table below. */
+
+#define CLASS_DIGITS         0x01
+#define CLASS_NOT_DIGITS     0x02
+#define CLASS_WHITESPACE     0x04
+#define CLASS_NOT_WHITESPACE 0x08
+#define CLASS_WORD           0x10
+#define CLASS_NOT_WORD       0x20
+
+/* These are escaped items that aren't just an encoding of a particular data
+value such as \n. They must have non-zero values, as check_escape() returns
+their negation. Also, they must appear in the same order as in the opcode
+definitions below, up to ESC_Z. The final one must be ESC_REF as subsequent
+values are used for \1, \2, \3, etc. There is a test in the code for an escape
+greater than ESC_b and less than ESC_Z to detect the types that may be
+repeated. If any new escapes are put in-between that don't consume a character,
+that code will have to change. */
+
+enum { ESC_A = 1, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W, ESC_w,
+       ESC_Z, ESC_REF };   /* \A \B \b \D \d \S \s \W \w \Z; ESC_REF marks where back references begin */
+
+/* Opcode table: OP_BRA must be last, as all values >= it are used for brackets
+that extract substrings. Starting from 1 (i.e. after OP_END), the values up to
+OP_EOD must correspond in order to the list of escapes immediately above. */
+
+enum {
+  OP_END,            /* End of pattern */
+
+  /* Values corresponding to backslashed metacharacters */
+
+  OP_SOD,            /* Start of data: \A */
+  OP_NOT_WORD_BOUNDARY,  /* \B */
+  OP_WORD_BOUNDARY,      /* \b */
+  OP_NOT_DIGIT,          /* \D */
+  OP_DIGIT,              /* \d */
+  OP_NOT_WHITESPACE,     /* \S */
+  OP_WHITESPACE,         /* \s */
+  OP_NOT_WORDCHAR,       /* \W */
+  OP_WORDCHAR,           /* \w */
+  OP_EOD,            /* End of data: \Z. This must always be the last
+                        of the backslashed meta values. */
+
+  OP_CIRC,           /* Start of line - varies with multiline switch */
+  OP_DOLL,           /* End of line - varies with multiline switch */
+  OP_ANY,            /* Match any character */
+  OP_CHARS,          /* Match string of characters */
+
+  OP_STAR,           /* The maximizing and minimizing versions of */
+  OP_MINSTAR,        /* all these opcodes must come in pairs, with */
+  OP_PLUS,           /* the minimizing one second. */
+  OP_MINPLUS,        /* This first set applies to single characters */
+  OP_QUERY,
+  OP_MINQUERY,
+  OP_UPTO,           /* From 0 to n matches. */
+  OP_MINUPTO,
+  OP_EXACT,          /* Exactly n matches. */
+
+  OP_TYPESTAR,       /* The maximizing and minimizing versions of */
+  OP_TYPEMINSTAR,    /* all these opcodes must come in pairs, with */
+  OP_TYPEPLUS,       /* the minimizing one second. These codes must */
+  OP_TYPEMINPLUS,    /* be in exactly the same order as those above. */
+  OP_TYPEQUERY,      /* This set applies to character types such as \d */
+  OP_TYPEMINQUERY,
+  OP_TYPEUPTO,
+  OP_TYPEMINUPTO,
+  OP_TYPEEXACT,
+
+  OP_CRSTAR,         /* The maximizing and minimizing versions of */
+  OP_CRMINSTAR,      /* all these opcodes must come in pairs, with */
+  OP_CRPLUS,         /* the minimizing one second. These codes must */
+  OP_CRMINPLUS,      /* be in exactly the same order as those above. */
+  OP_CRQUERY,        /* These are for character classes and back refs */
+  OP_CRMINQUERY,
+  OP_CRRANGE,        /* These are different to the two sets above. */
+  OP_CRMINRANGE,
+
+  OP_CLASS,          /* Match a character class */
+  OP_NEGCLASS,       /* Don't match a character class */
+  OP_REF,            /* Match a back reference */
+
+  OP_ALT,            /* Start of alternation */
+  OP_KET,            /* End of group that doesn't have an unbounded repeat */
+  OP_KETRMAX,        /* These two must remain together and in this */
+  OP_KETRMIN,        /* order. They are for groups that repeat for ever. */
+
+  OP_ASSERT,
+  OP_ASSERT_NOT,
+
+  OP_BRAZERO,        /* These two must remain together and in this */
+  OP_BRAMINZERO,     /* order. */
+
+  OP_BRA             /* This and greater values are used for brackets that
+                        extract substrings. */
+};
+
+/* The highest extraction number. This is limited by the number of opcodes
+left after OP_BRA, i.e. 255 - OP_BRA. We actually set it somewhat lower. */
+
+#define EXTRACT_MAX  99
+
+/* All character handling must be done as unsigned characters. Otherwise there
+are problems with top-bit-set characters and functions such as isspace().
+However, we leave the interface to the outside world as char *, because that
+should make things easier for callers. We define a short type for unsigned char
+to save lots of typing. I tried "uchar", but it causes problems on Digital
+Unix, where it is defined in sys/types, so use "uschar" instead. */
+
+typedef unsigned char uschar;
+
+/* The real format of the start of the pcre block; the actual code vector
+runs on as long as necessary after the end. */
+
+typedef struct real_pcre {
+  unsigned int  magic_number;   /* presumably holds MAGIC_NUMBER so junk can be rejected - confirm in pypcre.c */
+  unsigned char options;        /* one byte of option bits: public PCRE_xxx flags at the low end,
+                                   private PCRE_FIRSTSET / PCRE_STARTLINE at the high end */
+  unsigned char top_bracket;    /* NOTE(review): looks like the highest extracting bracket number - confirm */
+  unsigned char first_char;     /* meaningful when PCRE_FIRSTSET is set in options */
+  unsigned char code[1];        /* first byte of the code vector, which runs on past the end of the struct */
+} real_pcre;
+
+/* The real format of the extra block returned by pcre_study(). */
+
+typedef struct real_pcre_extra {
+  unsigned char options;         /* PCRE_STUDY_xxx flags */
+  unsigned char start_bits[32];  /* 32 bytes = 256 bits; presumably one bit per possible starting
+                                    character, valid when PCRE_STUDY_MAPPED is set - confirm */
+} real_pcre_extra;
+
+/* Global tables from pcre-chartables.c */
+
+extern uschar pcre_lcc[];
+extern uschar pcre_ucc[];
+extern uschar pcre_ctypes[];
+
+/* Bit definitions for entries in pcre_ctypes[]. */
+
+#define ctype_space   0x01
+#define ctype_digit   0x02
+#define ctype_xdigit  0x04
+#define ctype_word    0x08   /* alphameric or '_' */
+#ifdef FOR_PYTHON
+#define ctype_odigit  0x10   /* Octal digits */
+#endif
+#define ctype_meta    0x80   /* regexp meta char or zero (end pattern) */
+
+/* End of pcre-internal.h */
--- a/Modules/pcre.h
+++ b/Modules/pcre.h
@ -0,0 +1,59 @@
+/*************************************************
+*       Perl-Compatible Regular Expressions      *
+*************************************************/
+
+/* Copyright (c) 1997 University of Cambridge */
+
+/* Have to include stdlib.h in order to ensure that size_t is defined;
+it is needed in there for malloc. */
+
+#ifndef PCRE_H
+#define PCRE_H
+
+#include <stdlib.h>
+#ifdef FOR_PYTHON
+#include "Python.h"
+#endif
+
+/* Options */
+
+#define PCRE_CASELESS     0x01
+#define PCRE_EXTENDED     0x02
+#define PCRE_ANCHORED     0x04
+#define PCRE_MULTILINE    0x08
+#define PCRE_DOTALL       0x10
+
+/* Exec-time error codes */
+
+#define PCRE_ERROR_NOMATCH        (-1)
+#define PCRE_ERROR_BADREF         (-2)
+#define PCRE_ERROR_NULL           (-3)
+#define PCRE_ERROR_BADOPTION      (-4)
+#define PCRE_ERROR_BADMAGIC       (-5)
+#define PCRE_ERROR_UNKNOWN_NODE   (-6)
+
+/* Types */
+
+typedef void pcre;
+typedef void pcre_extra;
+
+/* Store get and free functions. These can be set to alternative malloc/free
+functions if required. */
+
+extern void *(*pcre_malloc)(size_t);
+extern void  (*pcre_free)(void *);
+
+/* Functions */
+
+#ifdef FOR_PYTHON
+/* Arguments as passed by pcremodule.c: pattern, options, address for an error
+message, address for an error offset, and (Python build only) a dictionary
+object. pcremodule.c treats a NULL result as a compile failure. */
+extern pcre *pcre_compile(char *, int, char **, int *, PyObject *);
+#else
+extern pcre *pcre_compile(char *, int, char **, int *);
+#endif
+/* Arguments as passed by pcremodule.c: compiled pattern, study data, subject,
+subject length, options, offsets vector, number of ints in that vector. The
+result is compared against the PCRE_ERROR_xxx codes above. */
+extern int pcre_exec(pcre *, pcre_extra *, char *, int, int, int *, int);
+/* pcremodule.c passes NULL for both int pointers, keeps the result as its
+group count and treats a negative result as an error. */
+extern int pcre_info(pcre *, int *, int *);
+/* Arguments as passed by pcremodule.c: compiled pattern, options, address for
+an error message. A NULL result together with a non-NULL message is treated
+there as a failure. */
+extern pcre_extra *pcre_study(pcre *, int, char **);
+extern char *pcre_version(void);
+
+#endif /* ifndef PCRE_H */
+/* End of pcre.h */
--- a/Modules/pcremodule.c
+++ b/Modules/pcremodule.c
@ -0,0 +1,775 @@
+/***********************************************************
+Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam,
+The Netherlands.
+
+                        All Rights Reserved
+
+Permission to use, copy, modify, and distribute this software and its
+documentation for any purpose and without fee is hereby granted,
+provided that the above copyright notice appear in all copies and that
+both that copyright notice and this permission notice appear in
+supporting documentation, and that the names of Stichting Mathematisch
+Centrum or CWI or Corporation for National Research Initiatives or
+CNRI not be used in advertising or publicity pertaining to
+distribution of the software without specific, written prior
+permission.
+
+While CWI is the initial source for this software, a modified version
+is made available by the Corporation for National Research Initiatives
+(CNRI) at the Internet address ftp://ftp.python.org.
+
+STICHTING MATHEMATISCH CENTRUM AND CNRI DISCLAIM ALL WARRANTIES WITH
+REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH
+CENTRUM OR CNRI BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL
+DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
+PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THIS SOFTWARE.
+
+******************************************************************/
+
+/* Pcre objects */
+
+#include "Python.h"
+
+#ifndef Py_eval_input
+/* For Python 1.4, graminit.h has to be explicitly included */
+#include "graminit.h"
+#define Py_eval_input eval_input
+#endif
+
+#ifndef FOR_PYTHON
+#define FOR_PYTHON
+#endif
+
+#include "pcre.h"
+#include "pcre-internal.h"
+
+static PyObject *ErrorObject;
+
+/* Python object wrapping one compiled regular expression. */
+typedef struct {
+	PyObject_HEAD
+	pcre *regex;			/* result of pcre_compile(); NULL until compiled */
+	pcre_extra *regex_extra;	/* result of pcre_study(); may stay NULL */
+        int num_groups;			/* value returned by pcre_info() for this regex */
+} PcreObject;
+
+staticforward PyTypeObject Pcre_Type;
+
+#define PcreObject_Check(v)	((v)->ob_type == &Pcre_Type)
+/* Contexts in which an escape can be parsed; PyPcre_expand_escape() compares
+   its "context" against these. */
+#define NORMAL			0
+#define CHARCLASS		1
+#define REPLACEMENT		2
+
+/* Result-type codes: PyPcre_expand_escape() puts one of these first in the
+   tuple it returns.  SYNTAX, NOT_SYNTAX and SET are not produced by the code
+   in pcremodule.c. */
+#define CHAR 			0
+#define MEMORY_REFERENCE 	1
+#define SYNTAX 			2
+#define NOT_SYNTAX 		3
+#define SET			4
+#define WORD_BOUNDARY		5
+#define NOT_WORD_BOUNDARY	6
+#define BEGINNING_OF_BUFFER	7
+#define END_OF_BUFFER		8
+
+
+/* Allocate a new, empty Pcre object.  The "arg" parameter is not used.
+   Returns NULL if PyObject_NEW fails.  Every field is given a defined
+   value so the object is safe to inspect or deallocate before a pattern
+   has been compiled into it. */
+static PcreObject *
+newPcreObject(arg)
+	PyObject *arg;
+{
+	PcreObject *self;
+	self = PyObject_NEW(PcreObject, &Pcre_Type);
+	if (self == NULL)
+		return NULL;
+	self->regex = NULL;
+	self->regex_extra = NULL;
+	self->num_groups = 0;
+	return self;
+}
+
+/* Pcre methods */
+
+/* Destructor for Pcre objects: release the compiled pattern and the study
+   data, then the object itself.
+
+   pcre.h exposes pcre_malloc/pcre_free as the library's replaceable
+   allocator pair, so the blocks are handed back through pcre_free rather
+   than plain free().  This assumes pcre_compile() and pcre_study() obtain
+   their memory through pcre_malloc -- confirm against pypcre.c.  The NULL
+   guards are kept because a replacement pcre_free need not accept NULL. */
+static void
+PyPcre_dealloc(self)
+	PcreObject *self;
+{
+	if (self->regex) (pcre_free)(self->regex);
+	if (self->regex_extra) (pcre_free)(self->regex_extra);
+	self->regex=NULL;
+	self->regex_extra=NULL;
+	PyMem_DEL(self);
+}
+
+
+/* match(string [, pos [, options]])
+
+   Run the compiled pattern against "string", starting at offset "pos".
+   Returns None when pcre_exec() reports PCRE_ERROR_NOMATCH; otherwise a
+   list of num_groups+1 (start, end) tuples whose offsets are relative to
+   the start of "string".  Groups that took no part in the match are
+   reported as (-1, -1).  Any other negative result from pcre_exec() raises
+   the module's error object with ("Regex error", code).
+
+   "pos" is clamped to the range [0, len(string)] so that the subject
+   pointer and length handed to pcre_exec() always stay inside the string. */
+static PyObject *
+PyPcre_exec(self, args)
+	PcreObject *self;
+	PyObject *args;
+{
+        unsigned char *string;
+	int stringlen, pos=0, options=0, i, count;
+	int offsets[100*2]; /* XXX must this be fixed? */
+	int max_pairs = sizeof(offsets)/sizeof(int)/2;
+	PyObject *list;
+
+	if (!PyArg_ParseTuple(args, "s#|ii", &string, &stringlen, &pos, &options))
+		return NULL;
+	/* An out-of-range start would make pcre_exec read outside the buffer */
+	if (pos<0) pos=0;
+	if (pos>stringlen) pos=stringlen;
+	count = pcre_exec(self->regex, self->regex_extra, 
+			  (char *)string+pos, stringlen-pos, options,
+			  offsets, sizeof(offsets)/sizeof(int) );
+	if (count==PCRE_ERROR_NOMATCH) {Py_INCREF(Py_None); return Py_None;}
+	if (count<0)
+	  {
+	    /* PyErr_SetObject does not take over our reference, so drop it */
+	    PyObject *errval = Py_BuildValue("si", "Regex error", count);
+	    if (errval!=NULL)
+	      {
+		PyErr_SetObject(ErrorObject, errval);
+		Py_DECREF(errval);
+	      }
+	    return NULL;
+	  }
+	
+	list=PyList_New(self->num_groups+1);
+	if (list==NULL) return NULL;
+	for(i=0; i<=self->num_groups; i++)
+	  {
+	    PyObject *v;
+	    int start=-1, end=-1;
+	    /* Only the first "count" pairs were filled in, and only pairs that
+	       fit in the vector exist at all; everything else, and any group
+	       with a negative start, is reported as -1, -1 */
+	    if (i<count && i<max_pairs && offsets[i*2]>=0)
+	      {start=offsets[i*2]+pos; end=offsets[i*2+1]+pos;}
+	    v=Py_BuildValue("ii", start, end);
+	    if (v==NULL) {Py_DECREF(list); return NULL;}
+	    PyList_SetItem(list, i, v);
+	  }
+	return list;
+}
+
+/* Methods of Pcre objects, looked up by PyPcre_getattr().  The flag value 1
+   is presumably METH_VARARGS (arguments arrive as a tuple) -- confirm. */
+static PyMethodDef Pcre_methods[] = {
+	{"match",	(PyCFunction)PyPcre_exec,	1},
+	{NULL,		NULL}		/* sentinel */
+};
+
+/* Attribute lookup for Pcre objects: resolve "name" against the
+   Pcre_methods table and hand back whatever Py_FindMethod produces. */
+static PyObject *
+PyPcre_getattr(self, name)
+	PcreObject *self;
+	char *name;
+{
+	PyObject *found;
+
+	found = Py_FindMethod(Pcre_methods, (PyObject *)self, name);
+	return found;
+}
+
+
+/* Type object for compiled patterns.  Only deallocation and attribute
+   lookup are provided; every other slot is left empty.
+   NOTE(review): this definition reuses "staticforward", the same keyword as
+   the forward declaration earlier in the file -- confirm that is intended. */
+staticforward PyTypeObject Pcre_Type = {
+	PyObject_HEAD_INIT(&PyType_Type)
+	0,			/*ob_size*/
+	"Pcre",			/*tp_name*/
+	sizeof(PcreObject),	/*tp_basicsize*/
+	0,			/*tp_itemsize*/
+	/* methods */
+	(destructor)PyPcre_dealloc, /*tp_dealloc*/
+	0,			/*tp_print*/
+	PyPcre_getattr,                      /*tp_getattr*/
+	0,                      /*tp_setattr*/
+	0,			/*tp_compare*/
+	0,			/*tp_repr*/
+	0,			/*tp_as_number*/
+	0,			/*tp_as_sequence*/
+	0,			/*tp_as_mapping*/
+	0,			/*tp_hash*/
+};
+/* --------------------------------------------------------------------- */
+
+/* Raise the module's error object with the value (message, value).
+   PyErr_SetObject keeps its own reference, so ours is released here. */
+static void
+PyPcre_set_error(message, value)
+	char *message;
+	int value;
+{
+	PyObject *errval = Py_BuildValue("si", message, value);
+	if (errval != NULL) {
+		PyErr_SetObject(ErrorObject, errval);
+		Py_DECREF(errval);
+	}
+}
+
+/* pcre_compile(pattern, options, dictionary)
+
+   Compile "pattern" with pcre_compile(), study it with pcre_study() and
+   record the group count reported by pcre_info().  Returns a new Pcre
+   object.  On failure the half-built object is released with Py_DECREF, so
+   that its destructor frees whatever had already been attached to it, and
+   NULL is returned with an exception set: MemoryError if the scratch
+   pattern cannot be allocated, otherwise the module's error object (unless
+   pcre_compile() itself already raised something). */
+static PyObject *
+PyPcre_compile(self, args)
+	PyObject *self; /* Not used */
+	PyObject *args;
+{
+	PcreObject *rv;
+	PyObject *dictionary;
+	unsigned char *pattern, *newpattern;
+	char *error;
+	int num_zeros, i, j;
+	
+	int patternlen, options, erroroffset;
+	if (!PyArg_ParseTuple(args, "s#iO!", &pattern, &patternlen, &options,
+			      &PyDict_Type, &dictionary))
+		return NULL;
+	rv = newPcreObject(args);
+	if ( rv == NULL )
+	    return NULL;
+
+	/* PCRE doesn't like having null bytes in its pattern, so each zero
+	   byte is replaced by the four characters \000.  All three octal
+	   digits are written so that a digit following the null byte in the
+	   original pattern cannot be absorbed into the escape.  Each zero
+	   therefore adds 3 bytes, and 1 more is needed for the terminator. */
+	num_zeros=0;
+	for(i=0; i<patternlen; i++) {
+	  if (pattern[i]==0) num_zeros++;
+	}
+	newpattern=malloc(patternlen + 3*num_zeros + 1);
+	if (newpattern==NULL) {
+	  Py_DECREF(rv);
+	  PyErr_SetString(PyExc_MemoryError, "can't allocate memory for new pattern");
+	  return NULL;
+	}
+	for (i=j=0; i<patternlen; i++)
+	  {
+	    if (pattern[i]!=0) newpattern[j++]=pattern[i];
+	    else {
+	      newpattern[j++]='\\';
+	      newpattern[j++]='0';
+	      newpattern[j++]='0';
+	      newpattern[j++]='0';
+	    }
+	  }
+	newpattern[j]='\0';
+
+	rv->regex = pcre_compile((char *)newpattern, options, 
+				 &error, &erroroffset, dictionary);
+	free(newpattern);
+	if (rv->regex==NULL) 
+	  {
+	    Py_DECREF(rv);
+	    if (!PyErr_Occurred())
+	      PyPcre_set_error(error, erroroffset);
+	    return NULL;
+	  }
+	/* Start from a known value so the failure test below is well defined
+	   even if pcre_study() leaves the message untouched on success */
+	error=NULL;
+	rv->regex_extra=pcre_study(rv->regex, 0, &error);
+	if (rv->regex_extra==NULL && error!=NULL) 
+	  {
+	    Py_DECREF(rv);
+	    PyPcre_set_error(error, 0);
+	    return NULL;
+	  }
+        rv->num_groups = pcre_info(rv->regex, NULL, NULL);
+	if (rv->num_groups<0) 
+	  {
+	    PyPcre_set_error("Regex error", rv->num_groups);
+	    Py_DECREF(rv);
+	    return NULL;
+	  }
+	return (PyObject *)rv;
+}
+
+/* Decode one backslash escape.
+
+   "args" is (pattern, index), where "index" is the position of the
+   character that follows the backslash (PyPcre_expand passes it that way).
+   The result is a 3-tuple (type, value, next_index): "type" is one of the
+   result-type codes defined near the top of this file, "value" is the
+   decoded character, group number or group name, and "next_index" is the
+   position of the first character after the escape.
+
+   The context is fixed at REPLACEMENT, so the branches that test for
+   NORMAL are never taken and the CHARCLASS tests never fire.  Malformed
+   escapes raise the module's error object; NULL is returned in that case. */
+static PyObject *
+PyPcre_expand_escape(self, args)
+	PyObject *self;
+	PyObject *args;
+{
+  unsigned char c, *pattern;
+  int index, pattern_len;
+  const int context=REPLACEMENT;
+
+  if (!PyArg_ParseTuple(args, "s#i", &pattern, &pattern_len, &index)) 
+    return NULL;
+  if (pattern_len<=index)
+    {
+      PyErr_SetString(ErrorObject, "escape ends too soon");
+      return NULL;
+    }
+  c=pattern[index]; index++;
+  switch (c)
+    {
+    /* Simple control-character escapes */
+    case('t'):
+      return Py_BuildValue("ici", CHAR, (char)9, index);
+      break;
+    case('n'):
+      return Py_BuildValue("ici", CHAR, (char)10, index);
+      break;
+    case('v'):
+      return Py_BuildValue("ici", CHAR, (char)11, index);
+      break;
+    case('r'):
+      return Py_BuildValue("ici", CHAR, (char)13, index);
+      break;
+    case('f'):
+      return Py_BuildValue("ici", CHAR, (char)12, index);
+      break;
+    case('a'):
+      return Py_BuildValue("ici", CHAR, (char)7, index);
+      break;
+    case('x'):
+      {
+	int end, length;
+	unsigned char *string;
+	PyObject *v, *result;
+
+	/* Find the end of the run of hex digits that follows the x */
+	end=index; 
+	while (end<pattern_len && 
+	       ( pcre_ctypes[ pattern[end] ] & ctype_xdigit ) )
+	  end++;
+	if (end==index)
+	  {
+	    PyErr_SetString(ErrorObject, "\\x must be followed by hex digits");
+	    return NULL;
+	  }
+	length=end-index;
+	/* Room for: quote, backslash, x, the digits, quote, terminator */
+	string=malloc(length+4+1);
+	if (string==NULL)
+	  {
+	    PyErr_SetString(PyExc_MemoryError, "can't allocate memory for \\x string");
+	    return NULL;
+	  }
+	/* Create a string containing "\x<hexdigits>", which will be
+	   passed to eval() */
+	string[0]=string[length+3]='"';
+	string[1]='\\';
+	string[length+4]='\0';
+	/* index-1 is the x itself, so this copies x plus the digits */
+	memcpy(string+2, pattern+index-1, length+1);
+	v=PyRun_String((char *)string, Py_eval_input, 
+		       PyEval_GetGlobals(), PyEval_GetLocals());
+	free(string);
+	/* The evaluation raised an exception */
+	if (v==NULL) return NULL;
+	result=Py_BuildValue("iOi", CHAR, v, end);
+	Py_DECREF(v);
+	return result;
+      }
+      break;
+
+    /* Outside the NORMAL context \b is a backspace and \B, \A, \Z are
+       plain letters */
+    case('b'):
+      if (context!=NORMAL)
+	return Py_BuildValue("ici", CHAR, (char)8, index);
+      else 
+	{
+	  unsigned char empty_string[1];
+	  empty_string[0]='\0';
+	  return Py_BuildValue("isi", WORD_BOUNDARY, empty_string, index);
+	}
+      break;
+    case('B'):
+      if (context!=NORMAL)
+	return Py_BuildValue("ici", CHAR, 'B', index);
+      else 
+	{
+	  unsigned char empty_string[1];
+	  empty_string[0]='\0';
+	  return Py_BuildValue("isi", NOT_WORD_BOUNDARY, empty_string, index);
+	}
+      break;
+    case('A'):
+      if (context!=NORMAL)
+	return Py_BuildValue("ici", CHAR, 'A', index);
+      else 
+	{
+	  unsigned char empty_string[1];
+	  empty_string[0]='\0';
+	  return Py_BuildValue("isi", BEGINNING_OF_BUFFER, empty_string, index);
+	}
+      break;
+    case('Z'):
+      if (context!=NORMAL)
+	return Py_BuildValue("ici", CHAR, 'Z', index);
+      else 
+	{
+	  unsigned char empty_string[1];
+	  empty_string[0]='\0';
+	  return Py_BuildValue("isi", END_OF_BUFFER, empty_string, index);
+	}
+      break;
+    /* These escapes are rejected outright */
+    case('E'):    case('G'):    case('L'):    case('Q'):
+    case('U'):    case('l'):    case('u'):
+      {
+	char message[50];
+	sprintf(message, "\\%c is not allowed", c);
+	PyErr_SetString(ErrorObject, message);
+	return NULL;
+      }
+
+    /* The character-type escapes come back as the bare letter */
+    case ('w'):
+      return Py_BuildValue("ici", CHAR, 'w', index);
+      break;
+    case ('W'):
+      return Py_BuildValue("ici", CHAR, 'W', index);
+      break;
+    case ('s'):
+	return Py_BuildValue("ici", CHAR, 's', index);
+      break;
+    case ('S'):
+	return Py_BuildValue("ici", CHAR, 'S', index);
+      break;
+
+    case ('d'):
+	return Py_BuildValue("ici", CHAR, 'd', index);
+      break;
+    case ('D'):
+	return Py_BuildValue("ici", CHAR, 'D', index);
+      break;
+
+    /* \g<name>: symbolic group reference; the value returned is the name */
+    case('g'):
+      {
+	int end, valid, i;
+	if (pattern_len<=index)
+	  {
+	    PyErr_SetString(ErrorObject, "unfinished symbolic reference");
+	    return NULL;
+	  }
+	if (pattern[index]!='<')
+	  {
+	    PyErr_SetString(ErrorObject, "missing < in symbolic reference");
+	    return NULL;
+	  }
+	index++;
+	end=index;
+	while (end<pattern_len && pattern[end]!='>')
+	  end++;
+	if (end==pattern_len)
+	  {
+	    PyErr_SetString(ErrorObject, "unfinished symbolic reference");
+	    return NULL;
+	  }
+	valid=1;
+	if (index==end		/* Zero-length name */
+	    || !(pcre_ctypes[pattern[index]] & ctype_word) /* First char. not alphanumeric */
+	    || (pcre_ctypes[pattern[index]] & ctype_digit) ) /* First char. a digit */
+	  valid=0;
+
+	for(i=index+1; i<end; i++)
+	  {
+	    if (!(pcre_ctypes[pattern[i]] & ctype_word) )
+	      valid=0;
+	  }	
+	if (!valid)
+	  {
+	    /* XXX should include the text of the reference */
+	    PyErr_SetString(ErrorObject, "illegal symbolic reference");
+	    return NULL;
+	  }
+	    
+	/* end+1 steps over the closing > */
+	return Py_BuildValue("is#i", MEMORY_REFERENCE, 
+			             pattern+index, end-index, 
+			             end+1);
+      }
+    break;
+
+    case('0'):
+      {
+	/* \0 always indicates an octal escape, so we consume up to 3
+	   characters, as long as they're all octal digits */
+	int octval=0, i;
+	index--;	/* back up so the leading 0 is counted as a digit */
+	for(i=index;
+	    i<=index+2 && i<pattern_len 
+	      && (pcre_ctypes[ pattern[i] ] & ctype_odigit );
+	    i++)
+	  {
+	    octval = octval * 8 + pattern[i] - '0';
+	  }
+	if (octval>255)
+	  {
+	    PyErr_SetString(ErrorObject, "octal value out of range");
+	    return NULL;
+	  }
+	return Py_BuildValue("ici", CHAR, (unsigned char)octval, i);
+      }
+      break;
+    case('1'):    case('2'):    case('3'):    case('4'):
+    case('5'):    case('6'):    case('7'):    case('8'):
+    case('9'):
+      {
+	/* Handle \?, where ? is from 1 through 9 */
+	int value=0;
+	index--;	/* back up to the first digit */
+	/* If it's at least a two-digit reference, like \34, it might
+           either be a 3-digit octal escape (\123) or a 2-digit
+           decimal memory reference (\34) */
+
+	if ( (index+1) <pattern_len && 
+	    (pcre_ctypes[ pattern[index+1] ] & ctype_digit) )
+	  {
+	    if ( (index+2) <pattern_len && 
+		(pcre_ctypes[ pattern[index+2] ] & ctype_odigit) &&
+		(pcre_ctypes[ pattern[index+1] ] & ctype_odigit) &&
+		(pcre_ctypes[ pattern[index  ] ] & ctype_odigit)
+		)
+	      {
+		/* 3 octal digits */
+		value= 8*8*(pattern[index  ]-'0') +
+		         8*(pattern[index+1]-'0') +
+		           (pattern[index+2]-'0');
+		if (value>255)
+		  {
+		    PyErr_SetString(ErrorObject, "octal value out of range");
+		    return NULL;
+		  }
+		return Py_BuildValue("ici", CHAR, (unsigned char)value, index+3);
+	      }
+	    else
+	      {
+		/* 2-digit form, so it's a memory reference */
+		if (context==CHARCLASS)
+		  {
+		    PyErr_SetString(ErrorObject, "cannot reference a register "
+				    "from inside a character class");
+		    return NULL;
+		  }
+		value= 10*(pattern[index  ]-'0') +
+		          (pattern[index+1]-'0');
+		/* NOTE(review): this rejects value == EXTRACT_MAX as well;
+		   confirm that is intended */
+		if (value<1 || EXTRACT_MAX<=value)
+		  {
+		    PyErr_SetString(ErrorObject, "memory reference out of range");
+		    return NULL;
+		  }
+		return Py_BuildValue("iii", MEMORY_REFERENCE, 
+				     value, index+2);
+	      }
+	  }
+	else 
+	  {
+	    /* Single-digit form, like \2, so it's a memory reference */
+	    if (context==CHARCLASS)
+	      {
+		PyErr_SetString(ErrorObject, "cannot reference a register "
+				"from inside a character class");
+		return NULL;
+	      }
+	    return Py_BuildValue("iii", MEMORY_REFERENCE, 
+				 pattern[index]-'0', index+1);
+	  }
+      }
+      break;
+
+    /* Any other escaped character stands for itself */
+    default:
+	return Py_BuildValue("ici", CHAR, c, index);
+	break;
+    }
+}
+
+/* Append the "len" bytes at "data" to "list" as a new string object.
+   PyList_Append takes its own reference, so the temporary one is dropped.
+   Returns 0 on success and -1, with an exception set, on failure. */
+static int
+PyPcre_append_literal(list, data, len)
+	PyObject *list;
+	char *data;
+	int len;
+{
+  PyObject *piece;
+  int status;
+
+  piece=PyString_FromStringAndSize(data, len);
+  if (piece==NULL) return -1;
+  status=PyList_Append(list, piece);
+  Py_DECREF(piece);
+  return status;
+}
+
+/* pcre_expand(match_obj, repl) -> string
+
+   Expand the replacement template "repl".  Literal text is copied through;
+   each backslash escape is decoded by PyPcre_expand_escape().  A CHAR
+   result is inserted directly, and a MEMORY_REFERENCE result is replaced by
+   whatever match_obj.group(value) returns.
+
+   Raises the module's error object if a referenced group is None or the
+   escape is of a kind that makes no sense in a replacement, TypeError if
+   group() returns something other than a string, and passes on any
+   exception raised while decoding an escape or calling group(). */
+static PyObject *
+PyPcre_expand(self, args)
+	PyObject *self;
+	PyObject *args;
+{
+  PyObject *results, *match_obj;
+  PyObject *repl_obj, *newstring;
+  PyObject *t=NULL;	/* tuple from PyPcre_expand_escape; owned while non-NULL */
+  unsigned char *repl;
+  int size, total_len, i, start, pos, num_pieces;
+
+  if (!PyArg_ParseTuple(args, "OS", &match_obj, &repl_obj)) 
+    return NULL;
+
+  repl=(unsigned char *)PyString_AsString(repl_obj);
+  size=PyString_Size(repl_obj);
+  results=PyList_New(0);
+  if (results==NULL) return NULL;
+  for(start=total_len=i=0; i<size; i++)
+    {
+      PyObject *escape_args, *value;
+      int escape_type;
+
+      if (repl[i]!='\\') continue;
+
+      /* Flush the literal text that precedes this backslash */
+      if (start!=i)
+	{
+	  if (PyPcre_append_literal(results, (char *)repl+start, i-start)<0)
+	    goto error;
+	  total_len += i-start;
+	}
+      i++;
+      escape_args=Py_BuildValue("Oi", repl_obj, i);
+      if (escape_args==NULL) goto error;
+      t=PyPcre_expand_escape(NULL, escape_args);
+      Py_DECREF(escape_args);
+      /* PyPcre_expand_escape triggered an exception of some sort,
+	 so just return */
+      if (t==NULL) goto error;
+
+      value=PyTuple_GetItem(t, 1);	/* borrowed; lives as long as t */
+      escape_type=PyInt_AsLong(PyTuple_GetItem(t, 0));
+      switch (escape_type)
+	{
+	case (CHAR):
+	  if (PyList_Append(results, value)<0) goto error;
+	  total_len += PyString_Size(value);
+	  break;
+	case(MEMORY_REFERENCE):
+	  {
+	    PyObject *group_method, *call_args, *result;
+
+	    group_method=PyObject_GetAttrString(match_obj, "group");
+	    if (group_method==NULL) goto error;
+	    call_args=PyTuple_New(1);
+	    if (call_args==NULL)
+	      {
+		Py_DECREF(group_method);
+		goto error;
+	      }
+	    /* PyTuple_SetItem takes over a reference, so give it one */
+	    Py_INCREF(value);
+	    PyTuple_SetItem(call_args, 0, value);
+	    result=PyEval_CallObject(group_method, call_args);
+	    Py_DECREF(group_method); Py_DECREF(call_args);
+	    /* The group() method triggered an exception of some sort */
+	    if (result==NULL) goto error;
+	    if (result==Py_None)
+	      {
+		/* Big enough for the text plus any value of a long */
+		char message[80];
+		sprintf(message, 
+			"group %li did not contribute to the match",
+			PyInt_AsLong(value));
+		PyErr_SetString(ErrorObject, 
+				message);
+		Py_DECREF(result);
+		goto error;
+	      }
+	    /* The pieces are copied with memcpy below, so they must
+	       really be strings */
+	    if (!PyString_Check(result))
+	      {
+		PyErr_SetString(PyExc_TypeError, 
+				"group() must return a string");
+		Py_DECREF(result);
+		goto error;
+	      }
+	    if (PyList_Append(results, result)<0)
+	      {
+		Py_DECREF(result);
+		goto error;
+	      }
+	    total_len += PyString_Size(result);
+	    Py_DECREF(result);
+	  }
+	  break;
+	default:
+	  PyErr_SetString(ErrorObject, 
+			  "bad escape in replacement");
+	  goto error;
+	}
+      /* Resume scanning just after the escape */
+      i=start=PyInt_AsLong(PyTuple_GetItem(t, 2));
+      i--; /* Decrement now, because the 'for' loop will increment it */
+      Py_DECREF(t);
+      t=NULL;
+    }
+
+  /* Trailing literal text after the last escape */
+  if (start!=i)
+    {
+      if (PyPcre_append_literal(results, (char *)repl+start, i-start)<0)
+	goto error;
+      total_len += i-start;
+    }
+
+  /* Whew!  Now we've constructed a list containing various pieces of
+     strings that will make up our final result.  So, iterate over 
+     the list concatenating them.  A new string measuring total_len
+     bytes is allocated and filled in. */
+     
+  newstring=PyString_FromStringAndSize(NULL, total_len);
+  if (newstring==NULL) goto error;
+
+  repl=(unsigned char *)PyString_AsString(newstring);
+  num_pieces=PyList_Size(results);
+  for (pos=i=0; i<num_pieces; i++)
+    {
+      PyObject *item=PyList_GetItem(results, i);
+      memcpy(repl+pos, PyString_AsString(item), PyString_Size(item) );
+      pos += PyString_Size(item);
+    }
+  Py_DECREF(results);
+  return newstring;
+
+error:
+  /* Common failure exit: an exception has already been set */
+  Py_XDECREF(t);
+  Py_DECREF(results);
+  return NULL;
+}
+
+
+/* List of functions defined in the module */
+
+/* Module-level functions, registered by initpcre().  The flag value 1 is
+   presumably METH_VARARGS (arguments arrive as a tuple) -- confirm. */
+static PyMethodDef pcre_methods[] = {
+	{"pcre_compile",		PyPcre_compile,		1},
+	{"pcre_expand",		PyPcre_expand,		1},
+	{NULL,		NULL}		/* sentinel */
+};
+
+
+/*
+ * Convenience routine to export an integer value.
+ * For simplicity, errors (which are unlikely anyway) are ignored.
+ */
+
+static void
+insint(d, name, value)
+	PyObject * d;
+	char * name;
+	int value;
+{
+	PyObject *number;
+
+	number = PyInt_FromLong((long) value);
+	if (number != NULL) {
+		/* Store it under "name"; the dictionary keeps its own reference */
+		PyDict_SetItemString(d, name, number);
+		Py_DECREF(number);
+		return;
+	}
+	/* Could not build the integer: don't bother reporting this error */
+	PyErr_Clear();
+}
+
+
+/* Initialization function for the module (*must* be called initpcre) */
+
+/* Create the "pcre" module: register the module functions, publish the
+   "error" object, and export the option flags and every opcode value as
+   integer constants.  Any error left pending at the end is fatal. */
+void
+initpcre()
+{
+	PyObject *m, *d;
+
+	/* Create the module and add the functions */
+	m = Py_InitModule("pcre", pcre_methods);
+
+	/* Add some symbolic constants to the module */
+	d = PyModule_GetDict(m);
+	ErrorObject = PyString_FromString("pcre.error");
+	/* If the string could not be created the pending exception is
+	   picked up by the check at the end of this function */
+	if (ErrorObject != NULL)
+		PyDict_SetItemString(d, "error", ErrorObject);
+
+	/* Insert the flags */
+	insint(d, "IGNORECASE", PCRE_CASELESS);
+	insint(d, "ANCHORED", PCRE_ANCHORED);
+	insint(d, "MULTILINE", PCRE_MULTILINE);
+	insint(d, "DOTALL", PCRE_DOTALL);
+	insint(d, "VERBOSE", PCRE_EXTENDED);
+	
+	/* Insert the opcodes, in the order of the enum in pcre-internal.h */
+	insint(d, "OP_END", OP_END);
+	insint(d, "OP_SOD", OP_SOD);
+	insint(d, "OP_NOT_WORD_BOUNDARY", OP_NOT_WORD_BOUNDARY);
+	insint(d, "OP_WORD_BOUNDARY", OP_WORD_BOUNDARY);
+	insint(d, "OP_NOT_DIGIT", OP_NOT_DIGIT);
+	insint(d, "OP_DIGIT", OP_DIGIT);
+	insint(d, "OP_NOT_WHITESPACE", OP_NOT_WHITESPACE);
+	insint(d, "OP_WHITESPACE", OP_WHITESPACE);
+	insint(d, "OP_NOT_WORDCHAR", OP_NOT_WORDCHAR);
+	insint(d, "OP_WORDCHAR", OP_WORDCHAR);
+	insint(d, "OP_EOD", OP_EOD);
+	insint(d, "OP_CIRC", OP_CIRC);
+	insint(d, "OP_DOLL", OP_DOLL);
+	insint(d, "OP_ANY", OP_ANY);
+	insint(d, "OP_CHARS", OP_CHARS);
+
+	insint(d, "OP_STAR", OP_STAR);
+	insint(d, "OP_MINSTAR", OP_MINSTAR);
+	insint(d, "OP_PLUS", OP_PLUS);
+	insint(d, "OP_MINPLUS", OP_MINPLUS);
+	insint(d, "OP_QUERY", OP_QUERY);
+	insint(d, "OP_MINQUERY", OP_MINQUERY);
+	insint(d, "OP_UPTO", OP_UPTO);
+	insint(d, "OP_MINUPTO", OP_MINUPTO);
+	insint(d, "OP_EXACT", OP_EXACT);
+
+	insint(d, "OP_TYPESTAR", OP_TYPESTAR);
+	insint(d, "OP_TYPEMINSTAR", OP_TYPEMINSTAR);
+	insint(d, "OP_TYPEPLUS", OP_TYPEPLUS);
+	insint(d, "OP_TYPEMINPLUS", OP_TYPEMINPLUS);
+	insint(d, "OP_TYPEQUERY", OP_TYPEQUERY);
+	insint(d, "OP_TYPEMINQUERY", OP_TYPEMINQUERY);
+	insint(d, "OP_TYPEUPTO", OP_TYPEUPTO);
+	insint(d, "OP_TYPEMINUPTO", OP_TYPEMINUPTO);
+	insint(d, "OP_TYPEEXACT", OP_TYPEEXACT);
+
+	insint(d, "OP_CRSTAR", OP_CRSTAR);
+	insint(d, "OP_CRMINSTAR", OP_CRMINSTAR);
+	insint(d, "OP_CRPLUS", OP_CRPLUS);
+	insint(d, "OP_CRMINPLUS", OP_CRMINPLUS);
+	insint(d, "OP_CRQUERY", OP_CRQUERY);
+	insint(d, "OP_CRMINQUERY", OP_CRMINQUERY);
+	insint(d, "OP_CRRANGE", OP_CRRANGE);
+	insint(d, "OP_CRMINRANGE", OP_CRMINRANGE);
+
+	insint(d, "OP_CLASS", OP_CLASS);
+	insint(d, "OP_NEGCLASS", OP_NEGCLASS);
+	insint(d, "OP_REF", OP_REF);
+
+	insint(d, "OP_ALT", OP_ALT);
+	insint(d, "OP_KET", OP_KET);
+	insint(d, "OP_KETRMAX", OP_KETRMAX);
+	insint(d, "OP_KETRMIN", OP_KETRMIN);
+
+	insint(d, "OP_ASSERT", OP_ASSERT);
+	insint(d, "OP_ASSERT_NOT", OP_ASSERT_NOT);
+
+	insint(d, "OP_BRAZERO", OP_BRAZERO);
+	insint(d, "OP_BRAMINZERO", OP_BRAMINZERO);
+	insint(d, "OP_BRA", OP_BRA);
+	
+	/* Check for errors */
+	if (PyErr_Occurred())
+		Py_FatalError("can't initialize module pcre");
+}
+
--- a/Modules/pypcre.c
+++ b/Modules/pypcre.c