cpython/Objects/stringlib/string_format.h

/*
    string_format.h -- implementation of string.format().

    It uses the Objects/stringlib conventions, so that it can be
    compiled for both unicode and string objects.
*/


/* Defines for more efficiently reallocating the string buffer.
   output_initialize() starts every OutputString's size_increment at
   INITIAL_SIZE_INCREMENT; each output_extend() call adds the current
   size_increment as slack and then multiplies it by SIZE_MULTIPLIER,
   for as long as it is still below MAX_SIZE_INCREMENT. */
#define INITIAL_SIZE_INCREMENT 100
#define SIZE_MULTIPLIER 2
#define MAX_SIZE_INCREMENT  3200


/************************************************************************/
/***********   Global data structures and forward declarations  *********/
/************************************************************************/

/*
   A SubString consists of the characters between two string or
   unicode pointers, i.e. the half-open range [ptr, end).  The parsing
   functions in this file consume input by advancing ptr towards end.
*/
typedef struct {
    STRINGLIB_CHAR *ptr;    /* first character of the range */
    STRINGLIB_CHAR *end;    /* one past the last character of the range */
} SubString;


/* forward declaration for recursion: output_markup() calls
   build_string() to expand a nested format_spec, and build_string()
   reaches output_markup() again through do_markup() */
static PyObject *
build_string(SubString *input, PyObject *args, PyObject *kwargs,
             int *recursion_level);


/************************************************************************/
/**************************  Utility  functions  ************************/
/************************************************************************/

/* Point str at the len characters that start at p.  When p is NULL
   both ends of the SubString are set to NULL and len is ignored. */
Py_LOCAL_INLINE(void)
SubString_init(SubString *str, STRINGLIB_CHAR *p, Py_ssize_t len)
{
    str->ptr = p;
    str->end = (p != NULL) ? p + len : NULL;
}

/* Create a string/unicode object from the characters of str by handing
   its start pointer and length to STRINGLIB_NEW.  The result of
   STRINGLIB_NEW is returned unchanged (callers treat NULL as failure). */
Py_LOCAL_INLINE(PyObject *)
SubString_new_object(SubString *str)
{
    Py_ssize_t length = str->end - str->ptr;

    return STRINGLIB_NEW(str->ptr, length);
}

/************************************************************************/
/***********      Error handling and exception generation  **************/
/************************************************************************/

/*
    Raise ValueError("<s> in format string") and hand back NULL.

    Most of our errors are value errors, because to Python, the
    format string is a "value".  Returning NULL lets callers that
    return pointers write "return SetError(...)" directly.

    XXX: need better error handling, per PEP 3101.
*/
static void *
SetError(const char *s)
{
    void *always_null;

    /* PyErr_Format always returns NULL */
    always_null = PyErr_Format(PyExc_ValueError, "%s in format string", s);
    return always_null;
}

/*
    check_input tells whether any characters are left in the input
    string.  It returns 1 if so; otherwise it sets a ValueError
    ("unterminated replacement field") and returns 0.

    XXX: make this function go away when better error handling is
    implemented.
*/
Py_LOCAL_INLINE(int)
check_input(SubString *input)
{
    if (input->ptr >= input->end) {
        /* ran off the end of the field */
        PyErr_SetString(PyExc_ValueError,
                        "unterminated replacement field");
        return 0;
    }
    return 1;
}

/************************************************************************/
/***********    Output string management functions       ****************/
/************************************************************************/

/* Growable output buffer that the formatted result is written into.
   It is set up by output_initialize(), grown by output_extend() and
   appended to by output_data(). */
typedef struct {
    STRINGLIB_CHAR *ptr;        /* next position to write to inside obj */
    STRINGLIB_CHAR *end;        /* one past the last usable position in obj */
    PyObject *obj;              /* string/unicode object providing the buffer */
    Py_ssize_t size_increment;  /* extra room added by the next output_extend() */
} OutputString;

/* Initialize an OutputString object, reserving size characters.
   Returns 1 on success.  Returns 0 if the buffer object could not be
   created; output->obj is then NULL and the other fields are not set. */
static int
output_initialize(OutputString *output, Py_ssize_t size)
{
    PyObject *buffer = STRINGLIB_NEW(NULL, size);

    output->obj = buffer;
    if (buffer == NULL)
        return 0;

    /* writing starts at the beginning of the fresh buffer */
    output->ptr = STRINGLIB_STR(buffer);
    output->end = output->ptr + STRINGLIB_LEN(buffer);
    output->size_increment = INITIAL_SIZE_INCREMENT;
    return 1;
}

/*
    output_extend reallocates the output string buffer so that it can
    hold what has been written so far, plus count more characters,
    plus the current size_increment of slack.

    It returns a status:  0 for a failed reallocation,
    1 for success.
*/

static int
output_extend(OutputString *output, Py_ssize_t count)
{
    Py_ssize_t used = output->ptr - STRINGLIB_STR(output->obj);
    Py_ssize_t newsize = used + count + output->size_increment;
    STRINGLIB_CHAR *base;

    if (STRINGLIB_RESIZE(&output->obj, newsize) < 0)
        return 0;

    /* re-read the buffer address after the resize and rebase both
       pointers on it */
    base = STRINGLIB_STR(output->obj);
    output->ptr = base + used;
    output->end = base + newsize;

    /* grow the slack for the next reallocation, up to the cap */
    if (output->size_increment < MAX_SIZE_INCREMENT)
        output->size_increment *= SIZE_MULTIPLIER;
    return 1;
}

/*
    output_data appends count characters starting at s to the output
    string buffer, growing the buffer first via output_extend when
    there is not enough room left.

    It returns a status:  0 for a failed reallocation,
    1 for success.
*/
static int
output_data(OutputString *output, const STRINGLIB_CHAR *s, Py_ssize_t count)
{
    Py_ssize_t room = output->end - output->ptr;

    if (count > room) {
        if (!output_extend(output, count))
            return 0;
    }
    memcpy(output->ptr, s, count * sizeof(STRINGLIB_CHAR));
    output->ptr += count;
    return 1;
}

/************************************************************************/
/***********  Format string parsing -- integers and identifiers *********/
/************************************************************************/

/*
    end_identifier returns 1 if a character marks the end of an
    identifier string ('.', '[' or ']'), and 0 otherwise.

    Although the PEP specifies that identifiers are
    numbers or valid Python identifiers, we just let
    getattr/getitem handle that, so the implementation
    is more flexible than the PEP would indicate.
*/
Py_LOCAL_INLINE(int)
end_identifier(STRINGLIB_CHAR c)
{
    return c == '.' || c == '[' || c == ']';
}

/*
    get_integer consumes 0 or more decimal digit characters from an
    input string, updates *result with the corresponding non-negative
    integer, and returns the number of digits consumed.  *ptr is
    advanced past the digits that were consumed; scanning stops at end
    or at the first character for which STRINGLIB_TODECIMAL reports a
    negative value.

    returns -1 on error (the value does not fit in a Py_ssize_t); a
    ValueError is set and *result is left untouched.
*/
static int
get_integer(STRINGLIB_CHAR **ptr, STRINGLIB_CHAR *end,
                  Py_ssize_t *result)
{
    /* Largest value a Py_ssize_t can hold, assuming it is the signed
       counterpart of size_t (this is how CPython's pyport.h derives
       PY_SSIZE_T_MAX). */
    const Py_ssize_t maxval = (Py_ssize_t)(((size_t)-1) >> 1);
    Py_ssize_t accumulator = 0;
    Py_ssize_t digitval;
    int numdigits = 0;

    for (; *ptr < end; (*ptr)++, numdigits++) {
        digitval = STRINGLIB_TODECIMAL(**ptr);
        if (digitval < 0)
            break;
        /*
           Check the range *before* doing the arithmetic.  Multiplying
           first and inspecting the result afterwards relies on signed
           overflow, which is undefined behaviour in C and may be
           optimized away by the compiler.
        */
        if (accumulator > (maxval - digitval) / 10) {
            PyErr_Format(PyExc_ValueError,
                         "Too many decimal digits in format string");
            return -1;
        }
        accumulator = accumulator * 10 + digitval;
    }
    *result = accumulator;
    return numdigits;
}

/*
    get_identifier is a bit of a misnomer.  It returns a value for use
    with getattr or getindex.  This value will be a string/unicode
    object.  The input cannot be zero length.  Scanning continues until
    the end of input, or until end_identifier() returns true;
    input->ptr is left on the character that stopped the scan.
*/
static PyObject *
get_identifier(SubString *input)
{
    STRINGLIB_CHAR *start = input->ptr;

    while (input->ptr < input->end && !end_identifier(*input->ptr))
        input->ptr++;

    /*
        We might want to add code here to check for invalid Python
        identifiers.  All identifiers are eventually passed to getattr
        or getitem, so there is a check when used.  However, we might
        want to remove (or not) the ability to have strings like
        "a/b" or " ab" or "-1" (which is not parsed as a number).
        For now, this is left as an exercise for the first disgruntled
        user...  Such a check would release the new object, set a
        ValueError and return NULL.
    */
    return STRINGLIB_NEW(start, input->ptr - start);
}

/************************************************************************/
/******** Functions to get field objects and specification strings ******/
/************************************************************************/

/* get_field_and_spec is the main function in this section.  It parses
   the format string well enough to return a field object to render along
   with a field specification string.
*/

/*
    look up key in our keyword arguments.  Returns a new reference to
    the value, or NULL when kwargs is NULL or PyDict_GetItem finds
    nothing.  This function does not set an exception itself.
*/
static PyObject *
key_lookup(PyObject *kwargs, PyObject *key)
{
    PyObject *value;

    if (kwargs == NULL)
        return NULL;

    value = PyDict_GetItem(kwargs, key);
    if (value == NULL)
        return NULL;

    /* take our own reference before handing the value to the caller */
    Py_INCREF(value);
    return value;
}

/*
    get_field_object returns the object inside {}, before the
    format_spec.  It handles getindex and getattr lookups and consumes
    the entire input string.

    The walk starts with args as the current object.  The first
    component is the argument itself: a decimal number indexes into
    args, anything else is looked up in kwargs.  Every following
    component is either ".name" (PyObject_GetAttr) or "[key]"
    (PySequence_GetItem for a numeric key on a sequence, otherwise
    PyObject_GetItem), applied to the object found so far.

    Returns a new reference, or NULL with an exception set.
*/
static PyObject *
get_field_object(SubString *input, PyObject *args, PyObject *kwargs)
{
    PyObject *myobj, *subobj, *newobj;
    STRINGLIB_CHAR c;
    Py_ssize_t index;
    int isindex, isnumeric, isargument;

    index = isnumeric = 0;  /* Just to shut up the compiler warnings */

    /* myobj always holds an owned reference to the current object */
    myobj = args;
    Py_INCREF(myobj);

    for (isindex=1, isargument=1;;) {
        if (!check_input(input))
            break;
        if (!isindex) {
            /* ".name": attribute lookup */
            if ((subobj = get_identifier(input)) == NULL)
                break;
            newobj = PyObject_GetAttr(myobj, subobj);
            Py_DECREF(subobj);
        } else {
            isnumeric = (STRINGLIB_ISDECIMAL(*input->ptr));
            if (isnumeric &&
                get_integer(&input->ptr, input->end, &index) < 0)
                /* The number does not fit in a Py_ssize_t.  myobj is
                   still non-NULL here, so the code after the loop
                   releases it and returns NULL with get_integer's
                   ValueError left in place. */
                break;

            if (isnumeric && PySequence_Check(myobj))
                newobj = PySequence_GetItem(myobj, index);
            else {
                /* XXX -- do we need PyLong_FromLongLong?
                                   Using ssizet, not int... */
                subobj = isnumeric ?
                          PyInt_FromLong(index) :
                          get_identifier(input);
                if (subobj == NULL)
                    break;
                if (isargument) {
                    newobj = key_lookup(kwargs, subobj);
                } else {
                    newobj = PyObject_GetItem(myobj, subobj);
                }
                Py_DECREF(subobj);
            }
        }
        /* step down: the object just found becomes the current one */
        Py_DECREF(myobj);
        myobj = newobj;
        if (myobj == NULL)
            break;
        if (!isargument && isindex)
            if  ((!check_input(input)) || (*(input->ptr++) != ']')) {
                SetError("Expected ]");
                break;
            }

        /* if at the end of input, return with myobj */
        if (input->ptr >= input->end)
            return myobj;

        /* the next character selects the kind of the next component */
        c = *input->ptr;
        input->ptr++;
        isargument = 0;
        isindex = (c == '[');
        if (!isindex && (c != '.')) {
           SetError("Expected ., [, :, !, or }");
           break;
        }
    }
    if ((myobj == NULL) && isargument) {
        /* The very first lookup (into args or kwargs) failed: replace
           whatever exception the lookup raised with our own message. */
        /* XXX: include more useful error information, like which
         * keyword not found or which index missing */
       PyErr_Clear();
       return SetError(isnumeric
            ? "Not enough positional arguments"
            : "Keyword argument not found");
    }
    Py_XDECREF(myobj);
    return NULL;
}

/************************************************************************/
/*****************  Field rendering functions  **************************/
/************************************************************************/

/*
    render_field() is the main function in this section.  It takes the
    field object and the field specification string, and renders the
    field into the output string.

    format() does the actual calling of the object's __format__ method.
*/


/* returns fieldobj.__format__(format_spec).

   The result is a new reference, or NULL with an exception set.  A
   TypeError is raised if the type of fieldobj has no __format__, or
   if __format__ returns something that fails STRINGLIB_CHECK. */
static PyObject *
format(PyObject *fieldobj, SubString *format_spec)
{
    static PyObject *format_str = NULL;
    PyObject *meth;
    PyObject *spec;
    PyObject *result;

    /* Create the cached "__format__" name on first use; it is the
       static variable needed by _PyType_Lookup */
    if (format_str == NULL) {
        format_str = PyUnicode_FromString("__format__");
        if (format_str == NULL)
            return NULL;
    }

    /* Make sure the type is initialized.  float gets initialized late */
    if (Py_Type(fieldobj)->tp_dict == NULL &&
        PyType_Ready(Py_Type(fieldobj)) < 0)
        return NULL;

    /* we need to create an object out of the pointers we have */
    spec = SubString_new_object(format_spec);
    if (spec == NULL)
        return NULL;

    /* Find the (unbound!) __format__ method (a borrowed reference) */
    meth = _PyType_Lookup(Py_Type(fieldobj), format_str);
    if (meth == NULL) {
        PyErr_Format(PyExc_TypeError,
                     "Type %.100s doesn't define __format__",
                     Py_Type(fieldobj)->tp_name);
        Py_DECREF(spec);
        return NULL;
    }

    /* And call it, binding it to the value */
    result = PyObject_CallFunctionObjArgs(meth, fieldobj, spec, NULL);
    if (result != NULL && !STRINGLIB_CHECK(result)) {
        /* wrong return type: discard it and report the problem */
        PyErr_SetString(PyExc_TypeError,
                        "__format__ method did not return "
                        STRINGLIB_TYPE_NAME);
        Py_DECREF(result);
        result = NULL;
    }

    Py_DECREF(spec);
    return result;
}

/*
    render_field calls the fieldobj.__format__(format_spec) method (via
    format()) and appends the resulting string to the output.

    Returns 0 if formatting or appending failed, otherwise the status
    reported by output_data.
*/
static int
render_field(PyObject *fieldobj, SubString *format_spec, OutputString *output)
{
    int ok;
    PyObject *formatted = format(fieldobj, format_spec);

    if (formatted == NULL)
        return 0;

    ok = output_data(output,
                     STRINGLIB_STR(formatted), STRINGLIB_LEN(formatted));
    Py_DECREF(formatted);
    return ok;
}

/*
    parse_field splits the contents of a replacement field,

        field_name [ "!" conversion ] [ ":" format_spec ]

    into its parts.  str is consumed (str->ptr is advanced) while
    scanning.  On return field_name spans the name, *conversion is the
    conversion character or '\0' if there is none, and format_spec
    spans the specification (both of its pointers are NULL when the
    field has neither '!' nor ':').

    Returns 1 on success, 0 with a ValueError set on a malformed
    conversion.
*/
static int
parse_field(SubString *str, SubString *field_name, SubString *format_spec,
            STRINGLIB_CHAR *conversion)
{
    STRINGLIB_CHAR c = 0;

    /* initialize these, as they may be empty */
    *conversion = '\0';
    SubString_init(format_spec, NULL, 0);

    /* search for the field name.  it's terminated by the end of the
       string, or a ':' or '!' */
    field_name->ptr = str->ptr;
    while (str->ptr < str->end) {
        c = *(str->ptr++);
        if (c == ':' || c == '!')
            break;
    }

    if (c != '!' && c != ':') {
        /* end of string, there's no format_spec or conversion */
        field_name->end = str->ptr;
        return 1;
    }

    /* we have a format specifier and/or a conversion; the field name
       does not include the terminating character */
    field_name->end = str->ptr-1;

    /* the format specifier is the rest of the string */
    format_spec->ptr = str->ptr;
    format_spec->end = str->end;

    if (c == ':')
        return 1;

    /* c is '!': a conversion specifier follows, and there must be
       another character present */
    if (format_spec->ptr >= format_spec->end) {
        PyErr_SetString(PyExc_ValueError,
                        "end of format while looking for conversion "
                        "specifier");
        return 0;
    }
    *conversion = *(format_spec->ptr++);

    /* if there is another character, it must be a colon */
    if (format_spec->ptr < format_spec->end) {
        c = *(format_spec->ptr++);
        if (c != ':') {
            PyErr_SetString(PyExc_ValueError,
                            "expected ':' after format specifier");
            return 0;
        }
    }
    return 1;
}

/************************************************************************/
/******* Output string allocation and escape-to-markup processing  ******/
/************************************************************************/

/* MarkupIterator breaks the string into pieces of either literal
   text, or things inside {} that need to be marked up.  It is
   designed to make it easy to wrap a Python iterator around it, for
   use with the Formatter class.  It is set up with
   MarkupIterator_init() and advanced with MarkupIterator_next(). */

typedef struct {
    SubString str;      /* input that has not been consumed yet */
    int in_markup;      /* nonzero if the next piece is the inside of {} */
} MarkupIterator;

/* Start iterating over the len characters at ptr.  The iterator begins
   in literal-text mode.  Always returns 1. */
static int
MarkupIterator_init(MarkupIterator *self, STRINGLIB_CHAR *ptr, Py_ssize_t len)
{
    self->in_markup = 0;
    SubString_init(&self->str, ptr, len);
    return 1;
}

/* Advance the iterator by one piece.

   returns 0 on error (with a ValueError set), 1 on non-error
   termination, and 2 if it got a string (or something to be expanded).

   When 2 is returned, *is_markup tells which kind of piece it is:
     - 0: literal text; literal spans the characters to copy to the
       output ("{{" and "}}" are reduced to a single brace).
       field_name, format_spec and conversion are not touched.
     - nonzero: the inside of a {} replacement field; field_name,
       format_spec and *conversion have been filled in by parse_field().
       literal is used as scratch space here and has been consumed by
       parse_field().
   *format_spec_needs_expanding is set to 1 if a '{' was seen inside
   the replacement field, 0 otherwise.

   When 1 is returned, *is_markup is not written. */
static int
MarkupIterator_next(MarkupIterator *self, int *is_markup, SubString *literal,
                    SubString *field_name, SubString *format_spec,
                    STRINGLIB_CHAR *conversion,
                    int *format_spec_needs_expanding)
{
    int at_end;
    STRINGLIB_CHAR c = 0;
    STRINGLIB_CHAR *start;
    int count;
    Py_ssize_t len;

    *format_spec_needs_expanding = 0;

    /* no more input, end of iterator */
    if (self->str.ptr >= self->str.end)
        return 1;

    /* the kind of this piece was decided by the previous call */
    *is_markup = self->in_markup;
    start = self->str.ptr;

    if (self->in_markup) {

        /* prepare for next iteration */
        self->in_markup = 0;

        /* this is markup, find the end of the string by counting nested
           braces.  note that this prohibits escaped braces, so that
           format_specs cannot have braces in them. */
        count = 1;

        /* we know we can't have a zero length string, so don't worry
           about that case */
        while (self->str.ptr < self->str.end) {
            switch (c = *(self->str.ptr++)) {
            case '{':
                /* the format spec needs to be recursively expanded.
                   this is an optimization, and not strictly needed */
                *format_spec_needs_expanding = 1;
                count++;
                break;
            case '}':
                count--;
                if (count <= 0) {
                    /* we're done.  parse and get out.  the field runs
                       from start up to, but not including, the
                       closing '}' that was just consumed */
                    literal->ptr = start;
                    literal->end = self->str.ptr-1;

                    if (parse_field(literal, field_name, format_spec,
                                    conversion) == 0)
                        return 0;

                    /* success */
                    return 2;
                }
                break;
            }
        }
        /* end of string while searching for matching '}' */
        PyErr_SetString(PyExc_ValueError, "unmatched '{' in format");
        return 0;

    } else {
        /* literal text, read until the end of string, an escaped { or },
           or an unescaped { */
        while (self->str.ptr < self->str.end) {
            switch (c = *(self->str.ptr++)) {
            case '{':
            case '}':
                /* tentatively assume a replacement field starts here;
                   this is undone below if the brace turns out to be
                   escaped */
                self->in_markup = 1;
                break;
            default:
                continue;
            }
            break;
        }

        /* c is the last character consumed; len counts everything
           consumed, including a brace that ended the scan */
        at_end = self->str.ptr >= self->str.end;
        len = self->str.ptr - start;

        /* a '}' in literal text is only legal as the first half of "}}" */
        if ((c == '}') && (at_end || (c != *self->str.ptr))) {
            SetError("Single } encountered");
            return 0;
        }
        /* a '{' as the very last character opens nothing */
        if (at_end && c == '{') {
            SetError("Single { encountered");
            return 0;
        }
        if (!at_end) {
            if (c == *self->str.ptr) {
                /* escaped } or {, skip it in the input.  the first
                   brace stays in the literal (len still counts it) */
                self->str.ptr++;
                self->in_markup = 0;
            } else
                /* unescaped '{': it opens a field and is not part of
                   the literal text */
                len--;
        }

        /* this is just plain text, return it */
        literal->ptr = start;
        literal->end = start + len;
        return 2;
    }
}


/* do the !r or !s conversion on obj.

   'r' returns PyObject_Repr(obj) and 's' returns PyObject_Unicode(obj).
   Any other conversion character sets a ValueError and returns NULL. */
static PyObject *
do_conversion(PyObject *obj, STRINGLIB_CHAR conversion)
{
    /* XXX in pre-3.0, do we need to convert this to unicode, since it
       might have returned a string? */
    switch (conversion) {
    case 'r':
        return PyObject_Repr(obj);
    case 's':
        return PyObject_Unicode(obj);
    default:
        /* %c consumes an int from the argument list; STRINGLIB_CHAR
           may be a wider or unsigned type, so convert explicitly */
        PyErr_Format(PyExc_ValueError,
                     "Unknown conversion specifier %c",
                     (int)conversion);
        return NULL;
    }
}

/* given:

   {field_name!conversion:format_spec}

   compute the result and write it to output.
   format_spec_needs_expanding is an optimization.  if it's false,
   format_spec is used as it is, otherwise it is first expanded
   recursively with build_string.

   returns 1 on success, 0 on failure. */

static int
output_markup(SubString *field_name, SubString *format_spec,
              int format_spec_needs_expanding, STRINGLIB_CHAR conversion,
              OutputString *output, PyObject *args, PyObject *kwargs,
              int *recursion_level)
{
    PyObject *fieldobj;
    PyObject *converted;
    PyObject *expanded = NULL;
    SubString expanded_format_spec;
    SubString *spec_to_use = format_spec;
    int result = 0;

    /* convert field_name to an object */
    fieldobj = get_field_object(field_name, args, kwargs);
    if (fieldobj == NULL)
        return 0;

    if (conversion != '\0') {
        converted = do_conversion(fieldobj, conversion);
        if (converted == NULL)
            goto done;

        /* the converted object replaces the original field object */
        Py_DECREF(fieldobj);
        fieldobj = converted;
    }

    /* if needed, recursively compute the format_spec */
    if (format_spec_needs_expanding) {
        expanded = build_string(format_spec, args, kwargs, recursion_level);
        if (expanded == NULL)
            goto done;

        /* expanded_format_spec points into expanded, so expanded must
           be kept around until after the call to render_field. */
        SubString_init(&expanded_format_spec,
                       STRINGLIB_STR(expanded), STRINGLIB_LEN(expanded));
        spec_to_use = &expanded_format_spec;
    }

    if (render_field(fieldobj, spec_to_use, output))
        result = 1;

done:
    Py_DECREF(fieldobj);
    Py_XDECREF(expanded);

    return result;
}

/*
    do_markup is the top-level loop for the format() function.  It
    walks the format string with a MarkupIterator, copies literal text
    to the output with output_data, and hands replacement fields to
    output_markup.

    Returns 0 as soon as any step fails; otherwise returns the final
    status of MarkupIterator_next (1 for a clean end of input, 0 for a
    parse error).
*/
static int
do_markup(SubString *input, PyObject *args, PyObject *kwargs,
          OutputString *output, int *recursion_level)
{
    MarkupIterator iter;
    SubString literal;
    SubString field_name;
    SubString format_spec;
    STRINGLIB_CHAR conversion;
    int is_markup;
    int format_spec_needs_expanding;
    int status;

    MarkupIterator_init(&iter, input->ptr, input->end - input->ptr);
    for (;;) {
        status = MarkupIterator_next(&iter, &is_markup, &literal,
                                     &field_name, &format_spec, &conversion,
                                     &format_spec_needs_expanding);
        /* anything other than 2 means the iteration is over */
        if (status != 2)
            return status;

        if (is_markup) {
            if (!output_markup(&field_name, &format_spec,
                               format_spec_needs_expanding, conversion, output,
                               args, kwargs, recursion_level))
                return 0;
        }
        else if (!output_data(output, literal.ptr, literal.end-literal.ptr))
            return 0;
    }
}


/*
    build_string allocates the output string and then
    calls do_markup to do the heavy lifting.

    *recursion_level is decremented on entry and restored on exit; if
    it drops below zero a ValueError is raised.  Returns the new
    string object, or NULL on failure.
*/
static PyObject *
build_string(SubString *input, PyObject *args, PyObject *kwargs,
             int *recursion_level)
{
    OutputString output;
    PyObject *result = NULL;
    Py_ssize_t used;

    output.obj = NULL; /* needed so cleanup code always works */

    /* check the recursion level */
    if (--(*recursion_level) < 0) {
        PyErr_SetString(PyExc_ValueError,
                        "Max string recursion exceeded");
        goto done;
    }

    /* initial size is the length of the format string, plus the size
       increment.  seems like a reasonable default */
    if (!output_initialize(&output,
                           input->end - input->ptr +
                           INITIAL_SIZE_INCREMENT))
        goto done;

    if (!do_markup(input, args, kwargs, &output, recursion_level))
        goto done;

    /* trim the buffer down to what was actually written */
    used = output.ptr - STRINGLIB_STR(output.obj);
    if (STRINGLIB_RESIZE(&output.obj, used) < 0)
        goto done;

    /* transfer ownership to result */
    result = output.obj;
    output.obj = NULL;

done:
    (*recursion_level)++;
    Py_XDECREF(output.obj);
    return result;
}

/************************************************************************/
/*********** main routine ***********************************************/
/************************************************************************/

/* this is the main entry point: format self (the format string) with
   the given positional and keyword arguments and return the result of
   build_string */
static PyObject *
do_string_format(PyObject *self, PyObject *args, PyObject *kwargs)
{
    /* PEP 3101 says only 2 levels, so that
       "{0:{1}}".format('abc', 's')            # works
       "{0:{1:{2}}}".format('abc', 's', '')    # fails
    */
    int recursion_level = 2;
    SubString input;

    /* the whole of self is the input to parse */
    SubString_init(&input, STRINGLIB_STR(self), STRINGLIB_LEN(self));

    return build_string(&input, args, kwargs, &recursion_level);
}