# Regular expression patterns for C syntax.
#
# None of these patterns has any capturing.  However, a number of them
# have capturing markers compatible with utils.set_capture_groups().
#
# All patterns are written for re.VERBOSE: whitespace inside them is
# insignificant and "#" starts a comment that runs to the end of the line.
# Because of that, every in-pattern comment MUST be followed by a newline;
# joining pattern lines silently turns the rest of the pattern into a comment.
#
# NOTE(review): in this copy the marked groups below carry only a bare "#"
# after "(?:" (and sometimes after ")?"); the marker names that
# utils.set_capture_groups() presumably keys on are not visible here.
# Confirm against that helper before relying on capture-group conversion.

import textwrap


def _ind(text: str, level: int = 1, edges: str = 'both') -> str:
    """Indent *text* so it can be embedded inside another verbose pattern.

    Every line of *text* is indented by *level* copies of the indent unit.

    *edges* controls how the ends of the result are normalized:

    * 'pre'  - strip leading whitespace and start the result with a newline
               followed by the indent.
    * 'post' - strip trailing whitespace and end the result with a newline
               followed by the indent of the enclosing level (level - 1).
    * 'both' - do both (the default).

    Any other value leaves the edges untouched.

    The leading/trailing newlines matter for correctness, not just looks:
    they terminate any "#" comment that precedes the embedded pattern or
    that ends it.
    """
    indent = ' ' * level
    text = textwrap.indent(text, indent)
    if edges in ('pre', 'both'):
        text = '\n' + indent + text.lstrip()
    if edges in ('post', 'both'):
        text = text.rstrip() + '\n' + ' ' * (level - 1)
    return text


#######################################
# general

# A single hexadecimal digit (used for '\xHH' character escapes).
HEX = r'(?: [0-9a-fA-F] )'

# A character literal or a (double-quoted) string literal.
STRING_LITERAL = textwrap.dedent(rf'''
    (?:
        # character literal
        (?:
            ['] [^'] [']
            |
            ['] \\ . [']
            |
            ['] \\x{HEX}{HEX} [']
            |
            ['] \\0\d\d [']
            |
            (?:
                ['] \\o[01]\d\d [']
                |
                ['] \\o2[0-4]\d [']
                |
                ['] \\o25[0-5] [']
             )
         )
        |
        # string literal
        (?:
            ["] (?: [^"\\]* \\ . )* [^"\\]* ["]
         )
        # end string literal
     )
    ''')

_KEYWORD = textwrap.dedent(r'''
    (?:
        \b
        (?:
            auto |
            extern |
            register |
            static |
            typedef |

            const |
            volatile |

            signed |
            unsigned |
            char |
            short |
            int |
            long |
            float |
            double |
            void |

            struct |
            union |
            enum |

            goto |
            return |
            sizeof |
            break |
            continue |
            if |
            else |
            for |
            do |
            while |
            switch |
            case |
            default |
            entry
         )
        \b
     )
    ''')
# The public, commented form keeps the multi-line layout...
KEYWORD = rf'''
    # keyword
    {_KEYWORD}
    # end keyword
    '''
# ...while the private form is collapsed so it can be embedded in the
# single-line patterns below.  This must happen after KEYWORD is built.
_KEYWORD = ''.join(_KEYWORD.split())

IDENTIFIER = r'(?: [a-zA-Z_][a-zA-Z0-9_]* )'
# We use a negative lookahead to filter out keywords.
STRICT_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} \b )'
ANON_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} (?: - \d+ )? \b )'


#######################################
# types

SIMPLE_TYPE = textwrap.dedent(rf'''
    # simple type
    (?:
        \b
        (?:
            void
            |
            (?: signed | unsigned )  # implies int
            |
            (?:
                (?: (?: signed | unsigned ) \s+ )?
                (?: (?: long | short ) \s+ )?
                (?: char | short | int | long | float | double )
             )
         )
        \b
     )
    # end simple type
    ''')

COMPOUND_TYPE_KIND = r'(?: \b (?: struct | union | enum ) \b )'


#######################################
# variable declarations

_STORAGE = 'auto register static extern'.split()
STORAGE_CLASS = rf'(?: \b (?: {" | ".join(_STORAGE)} ) \b )'
TYPE_QUALIFIER = r'(?: \b (?: const | volatile ) \b )'
PTR_QUALIFIER = rf'(?: [*] (?: \s* {TYPE_QUALIFIER} )? )'

TYPE_SPEC = textwrap.dedent(rf'''
    # type spec
    (?:
        {_ind(SIMPLE_TYPE, 2)}
        |
        (?:
            [_]*typeof[_]*
            \s* [(]
            (?: \s* [*&] )*
            \s* {STRICT_IDENTIFIER}
            \s* [)]
         )
        |
        # reference to a compound type
        (?:
            {COMPOUND_TYPE_KIND}
            (?: \s* {ANON_IDENTIFIER} )?
         )
        |
        # reference to a typedef
        {STRICT_IDENTIFIER}
     )
    # end type spec
    ''')

DECLARATOR = textwrap.dedent(rf'''
    # declarator  (possibly abstract)
    (?:
        (?: {PTR_QUALIFIER} \s* )*
        (?:
            (?:
                (?:  #
                    {STRICT_IDENTIFIER}
                 )
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )*  # arrays
             )
            |
            (?:
                [(] \s*
                (?:  #
                    {STRICT_IDENTIFIER}
                 )
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )*  # arrays
                \s* [)]
             )
            |
            # func ptr
            (?:
                [(] (?: \s* {PTR_QUALIFIER} )? \s*
                (?:  #
                    {STRICT_IDENTIFIER}
                 )
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )*  # arrays
                \s* [)]
                # We allow for a single level of paren nesting in parameters.
                \s* [(] (?: [^()]* [(] [^)]* [)] )* [^)]* [)]
             )
         )
     )
    # end declarator
    ''')

VAR_DECL = textwrap.dedent(rf'''
    # var decl (and typedef and func return type)
    (?:
        (?:
            (?:  #
                {STORAGE_CLASS}
             )
            \s*
         )?
        (?:
            (?:  #
                {TYPE_QUALIFIER}
             )
            \s*
         )?
        (?:
            (?:  #
                {_ind(TYPE_SPEC, 4)}
             )
         )
        \s*
        (?:
            (?:  #
                {_ind(DECLARATOR, 4)}
             )
         )
     )
    # end var decl
    ''')

INITIALIZER = textwrap.dedent(rf'''
    # initializer
    (?:
        (?:
            [(]
            # no nested parens (e.g. func ptr)
            [^)]*
            [)]
            \s*
         )?
        (?:
            # a string literal
            (?:
                (?: {_ind(STRING_LITERAL, 4)} \s* )*
                {_ind(STRING_LITERAL, 4)}
             )
            |
            # a simple initializer
            (?:
                (?:
                    [^'",;{{]*
                    {_ind(STRING_LITERAL, 4)}
                 )*
                [^'",;{{]*
             )
            |
            # a struct/array literal
            (?:
                # We only expect compound initializers with
                # single-variable declarations.
                {{
                (?:
                    [^'";]*?
                    {_ind(STRING_LITERAL, 5)}
                 )*
                [^'";]*?
                }}
                (?= \s* ; )  # Note this lookahead.
             )
         )
     )
    # end initializer
    ''')


#######################################
# compound type declarations

STRUCT_MEMBER_DECL = textwrap.dedent(rf'''
    (?:
        # inline compound type decl
        (?:
            (?:  #
                {COMPOUND_TYPE_KIND}
             )
            (?:
                \s+
                (?:  #
                    {STRICT_IDENTIFIER}
                 )
             )?
            \s* {{
         )
        |
        (?:
            # typed member
            (?:
                # Technically it doesn't have to have a type...
                (?:  #
                    (?: {TYPE_QUALIFIER} \s* )?
                    {_ind(TYPE_SPEC, 5)}
                 )
                (?:
                    # If it doesn't have a declarator then it will have
                    # a size and vice versa.
                    \s*
                    (?:  #
                        {_ind(DECLARATOR, 6)}
                     )
                 )?
             )

            # sized member
            (?:
                \s* [:] \s*
                (?:  #
                    \d+
                 )
             )?
            \s*
            (?:  #
                [,;]
             )
         )
        |
        (?:
            \s*
            (?:  #
                }}
             )
         )
     )
    ''')

ENUM_MEMBER_DECL = textwrap.dedent(rf'''
    (?:
        (?:
            \s*
            (?:  #
                }}
             )
         )
        |
        (?:
            \s*
            (?:  #
                {IDENTIFIER}
             )
            (?:
                \s* = \s*
                (?:  #
                    {_ind(STRING_LITERAL, 4)}
                    |
                    [^'",}}]+
                 )
             )?
            \s*
            (?:  #
                , | }}
             )
         )
     )
    ''')


#######################################
# statements

SIMPLE_STMT_BODY = textwrap.dedent(rf'''
    # simple statement body
    (?:
        (?:
            [^'"{{}};]*
            {_ind(STRING_LITERAL, 3)}
         )*
        [^'"{{}};]*
        #(?= [;{{] )  # Note this lookahead.
     )
    # end simple statement body
    ''')

SIMPLE_STMT = textwrap.dedent(rf'''
    # simple statement
    (?:
        (?:  #
            # stmt-inline "initializer"
            (?:
                return \b
                (?:
                    \s*
                    {_ind(INITIALIZER, 5)}
                 )?
             )
            |
            # variable assignment
            (?:
                (?: [*] \s* )?
                (?:
                    {STRICT_IDENTIFIER} \s*
                    # member access; the dot is bracketed so that it is
                    # a literal "." rather than "any character"
                    (?: [.] | -> ) \s*
                 )*
                {STRICT_IDENTIFIER}
                (?: \s* \[ \s* \d+ \s* \] )?
                \s* = \s*
                {_ind(INITIALIZER, 4)}
             )
            |
            # catchall return statement
            (?:
                return \b
                (?:
                    (?:
                        [^'";]*
                        {_ind(STRING_LITERAL, 6)}
                     )*
                    \s* [^'";]*
                 )?
             )
            |
            # simple statement
            (?:
                {_ind(SIMPLE_STMT_BODY, 4)}
             )
         )
        \s*
        (?:  #
            ;
         )
     )
    # end simple statement
    ''')

COMPOUND_STMT = textwrap.dedent(rf'''
    # compound statement
    (?:
        \b
        (?:
            (?:
                (?:  #
                    else | do
                 )
                \b
             )
            |
            (?:
                (?:  #
                    (?:
                        case \b
                        (?:
                            [^'":]*
                            {_ind(STRING_LITERAL, 7)}
                         )*
                        \s* [^'":]*
                     )
                    |
                    default
                    |
                    {STRICT_IDENTIFIER}
                 )
                \s* [:]
             )
            |
            (?:
                (?:  #
                    for | while | if | switch
                 )
                \s* (?= [(] )  # Note this lookahead.
             )
         )
        \s*
     )
    # end compound statement
    ''')


#######################################
# function bodies

LOCAL = textwrap.dedent(rf'''
    (?:
        # an empty statement
        (?:  #
            ;
         )
        |
        # inline type decl
        (?:
            (?:
                (?:  #
                    [^;{{}}]+?
                 )
                \s*
             )?
            (?:  #
                (?: {STORAGE_CLASS} \s* )?
                (?: {TYPE_QUALIFIER} \s* )?
             )?  #
            (?:  #
                {COMPOUND_TYPE_KIND}
             )
            (?:
                \s+
                (?:  #
                    {STRICT_IDENTIFIER}
                 )
             )?
            \s* {{
         )
        |
        # var decl
        (?:
            (?:  #
                {STORAGE_CLASS}
             )?  #
            (?:
                \s*
                (?:  #
                    {_ind(VAR_DECL, 5)}
                 )
             )
            (?:
                (?:
                    # initializer
                    # We expect only basic initializers.
                    \s* = \s*
                    (?:  #
                        {_ind(INITIALIZER, 6)}
                     )
                 )?
                (?:
                    \s*
                    (?:  #
                        [,;]
                     )
                 )
             )
         )
        |
        {_ind(COMPOUND_STMT, 2)}
        |
        # start-of-block
        (?:
            (?:  #
                (?:
                    [^'"{{}};]*
                    {_ind(STRING_LITERAL, 5)}
                 )*
                [^'"{{}};]*
                # Presumably we will not see "== {{".
                [^\s='"{{}});]
                \s*
             )?  #
            (?:  #
                {{
             )
         )
        |
        {_ind(SIMPLE_STMT, 2)}
        |
        # end-of-block
        (?:  #
            }}
         )
     )
    ''')

LOCAL_STATICS = textwrap.dedent(rf'''
    (?:
        # inline type decl
        (?:
            (?:
                (?:  #
                    [^;{{}}]+?
                 )
                \s*
             )?
            (?:  #
                (?: {STORAGE_CLASS} \s* )?
                (?: {TYPE_QUALIFIER} \s* )?
             )?
            (?:  #
                {COMPOUND_TYPE_KIND}
             )
            (?:
                \s+
                (?:  #
                    {STRICT_IDENTIFIER}
                 )
             )?
            \s* {{
         )
        |
        # var decl
        (?:
            # We only look for static variables.
            (?:  #
                static \b
                (?: \s* {TYPE_QUALIFIER} )?
                \s*
                {_ind(TYPE_SPEC, 4)}
                \s*
                {_ind(DECLARATOR, 4)}
             )
            \s*
            (?:
                (?:  #
                    = \s*
                    {_ind(INITIALIZER, 4)}
                    \s*
                    [,;{{]
                 )
                |
                (?:  #
                    [,;]
                 )
             )
         )
        |
        # everything else
        (?:
            (?:  #
                (?:
                    [^'"{{}};]*
                    {_ind(STRING_LITERAL, 4)}
                 )*
                \s* [^'"{{}};]*
             )
            (?:
                (?:  #
                    {{
                 )
                |
                (?:  #
                    }}
                 )
                |
                (?:  #
                    ;
                 )
             )
         )
     )
    ''')


#######################################
# global declarations

GLOBAL = textwrap.dedent(rf'''
    (?:
        # an empty statement
        (?:  #
            ;
         )
        |
        # compound type decl (maybe inline)
        (?:
            (?:
                (?:  #
                    [^;{{}}]+?
                 )
                \s*
             )?
            (?:  #
                {COMPOUND_TYPE_KIND}
             )
            (?:
                \s+
                (?:  #
                    {STRICT_IDENTIFIER}
                 )
             )?
            \s* {{
         )
        |
        # bogus inline decl artifact
        # This simplifies resolving the relative syntactic ambiguity of
        # inline structs.
        (?:
            (?:  #
                {COMPOUND_TYPE_KIND}
             )
            \s*
            (?:  #
                {ANON_IDENTIFIER}
             )
            (?:  #
                [^=,;({{[*\]]*
                [=,;({{]
             )
         )
        |
        # typedef
        (?:
            \b typedef \b \s*
            (?:  #
                {_ind(VAR_DECL, 4)}
             )
            (?:
                # We expect no inline type definitions in the parameters.
                \s* [(] \s*
                (?:  #
                    [^{{;]*
                 )
                \s* [)]
             )?
            \s* ;
         )
        |
        # func decl/definition & var decls
        # XXX dedicated pattern for funcs (more restricted)?
        (?:
            (?:
                (?:  #
                    {STORAGE_CLASS}
                 )
                \s*
             )?
            (?:
                (?:  #
                    \b inline \b
                 )
                \s*
             )?
            (?:  #
                {_ind(VAR_DECL, 4)}
             )
            (?:
                # func decl / definition
                (?:
                    (?:
                        # We expect no inline type definitions in the parameters.
                        \s* [(] \s*
                        (?:  #
                            [^{{;]*
                         )
                        \s* [)] \s*
                        (?:  #
                            [{{;]
                         )
                     )
                    |
                    (?:
                        # This is some old-school syntax!
                        \s* [(] \s*
                        # We throw away the bare names:
                        {STRICT_IDENTIFIER}
                        (?: \s* , \s* {STRICT_IDENTIFIER} )*
                        \s* [)] \s*

                        # We keep the trailing param declarations:
                        (?:  #
                            # There's at least one!
                            (?: {TYPE_QUALIFIER} \s* )?
                            {_ind(TYPE_SPEC, 7)}
                            \s*
                            {_ind(DECLARATOR, 7)}
                            \s* ;
                            (?:
                                \s*
                                (?: {TYPE_QUALIFIER} \s* )?
                                {_ind(TYPE_SPEC, 8)}
                                \s*
                                {_ind(DECLARATOR, 8)}
                                \s* ;
                             )*
                         )
                        \s* {{
                     )
                 )
                |
                # var / typedef
                (?:
                    (?:
                        # initializer
                        # We expect only basic initializers.
                        \s* = \s*
                        (?:  #
                            {_ind(INITIALIZER, 6)}
                         )
                     )?
                    \s*
                    (?:  #
                        [,;]
                     )
                 )
             )
         )
     )
    ''')