cpython/Mac/Tools/IDE/PyFontify.py

"""Module to analyze Python source code; for syntax coloring tools.

Interface:
	tags = fontify(pytext, searchfrom, searchto)

The 'pytext' argument is a string containing Python source code.
The (optional) arguments 'searchfrom' and 'searchto' may contain a slice in pytext. 
The returned value is a list of tuples, formatted like this:
	[('keyword', 0, 6, None), ('keyword', 11, 17, None), ('comment', 23, 53, None), etc. ]
The tuple contents are always like this:
	(tag, startindex, endindex, sublist)
tag is one of 'keyword', 'string', 'comment' or 'identifier'
sublist is not used, hence always None. 
"""

# Based on FontText.py by Mitchell S. Chapman,
# which was modified by Zachary Roadhouse,
# then un-Tk'd by Just van Rossum.
# Many thanks for regular expression debugging & authoring are due to:
#	Tim (the-incredib-ly y'rs) Peters and Cristian Tismer
# So, who owns the copyright? ;-) How about this:
# Copyright 1996-2001: 
#	Mitchell S. Chapman,
#	Zachary Roadhouse,
#	Tim Peters,
#	Just van Rossum

__version__ = "0.4"

import string
import re

# First a little helper, since I don't like to repeat things. (Tismer speaking)
import string
def replace(where, what, with):
	return string.join(string.split(where, what), with)

# This list of keywords is taken from ref/node13.html of the
# Python 1.3 HTML documentation. ("access" is intentionally omitted.)
keywordsList = [
	"assert", "exec",
	"del", "from", "lambda", "return",
	"and", "elif", "global", "not", "try",
	"break", "else", "if", "or", "while",
	"class", "except", "import", "pass",
	"continue", "finally", "in", "print",
	"def", "for", "is", "raise", "yield"]

# Build up a regular expression which will match anything
# interesting, including multi-line triple-quoted strings.
commentPat = r"#[^\n]*"

pat = r"q[^\\q\n]*(\\[\000-\377][^\\q\n]*)*q"
quotePat = replace(pat, "q", "'") + "|" + replace(pat, 'q', '"')

# Way to go, Tim!
pat = r"""
	qqq
	[^\\q]*
	(
		(	\\[\000-\377]
		|	q
			(	\\[\000-\377]
			|	[^\q]
			|	q
				(	\\[\000-\377]
				|	[^\\q]
				)
			)
		)
		[^\\q]*
	)*
	qqq
"""
pat = string.join(string.split(pat), '')	# get rid of whitespace
tripleQuotePat = replace(pat, "q", "'") + "|" + replace(pat, 'q', '"')

# Build up a regular expression which matches all and only
# Python keywords. This will let us skip the uninteresting
# identifier references.
# nonKeyPat identifies characters which may legally precede
# a keyword pattern.
nonKeyPat = r"(^|[^a-zA-Z0-9_.\"'])"

keyPat = nonKeyPat + "(" + "|".join(keywordsList) + ")" + nonKeyPat

matchPat = commentPat + "|" + keyPat + "|" + tripleQuotePat + "|" + quotePat
matchRE = re.compile(matchPat)

idKeyPat = "[ \t]*[A-Za-z_][A-Za-z_0-9.]*"	# Ident w. leading whitespace.
idRE = re.compile(idKeyPat)


def fontify(pytext, searchfrom = 0, searchto = None):
	if searchto is None:
		searchto = len(pytext)
	# Cache a few attributes for quicker reference.
	search = matchRE.search
	idSearch = idRE.search
	
	tags = []
	tags_append = tags.append
	commentTag = 'comment'
	stringTag = 'string'
	keywordTag = 'keyword'
	identifierTag = 'identifier'
	
	start = 0
	end = searchfrom
	while 1:
		m = search(pytext, end)
		if m is None:
			break	# EXIT LOOP
		start = m.start()
		if start >= searchto:
			break	# EXIT LOOP
		match = m.group(0)
		end = start + len(match)
		c = match[0]
		if c not in "#'\"":
			# Must have matched a keyword.
			if start <> searchfrom:
				# there's still a redundant char before and after it, strip!
				match = match[1:-1]
				start = start + 1
			else:
				# this is the first keyword in the text.
				# Only a space at the end.
				match = match[:-1]
			end = end - 1
			tags_append((keywordTag, start, end, None))
			# If this was a defining keyword, look ahead to the
			# following identifier.
			if match in ["def", "class"]:
				m = idSearch(pytext, end)
				if m is not None:
					start = m.start()
					if start == end:
						match = m.group(0)
						end = start + len(match)
						tags_append((identifierTag, start, end, None))
		elif c == "#":
			tags_append((commentTag, start, end, None))
		else:
			tags_append((stringTag, start, end, None))
	return tags


def test(path):
	f = open(path)
	text = f.read()
	f.close()
	tags = fontify(text)
	for tag, start, end, sublist in tags:
		print tag, `text[start:end]`
First Checked In. 1999-01-30 18:39:17 -04:00			`"""Module to analyze Python source code; for syntax coloring tools.`

			`Interface:`
			`tags = fontify(pytext, searchfrom, searchto)`

			`The 'pytext' argument is a string containing Python source code.`
			`The (optional) arguments 'searchfrom' and 'searchto' may contain a slice in pytext.`
			`The returned value is a list of tuples, formatted like this:`
			`[('keyword', 0, 6, None), ('keyword', 11, 17, None), ('comment', 23, 53, None), etc. ]`
			`The tuple contents are always like this:`
			`(tag, startindex, endindex, sublist)`
			`tag is one of 'keyword', 'string', 'comment' or 'identifier'`
			`sublist is not used, hence always None.`
			`"""`

			`# Based on FontText.py by Mitchell S. Chapman,`
			`# which was modified by Zachary Roadhouse,`
			`# then un-Tk'd by Just van Rossum.`
			`# Many thanks for regular expression debugging & authoring are due to:`
			`# Tim (the-incredib-ly y'rs) Peters and Cristian Tismer`
			`# So, who owns the copyright? ;-) How about this:`
- fixed some re usage, partly so it'll still work when re uses pre instead of sre, and partly fixing re -> regex porting oversights - fixed PyFontify.py so it actually works again.. 2001-07-10 16:25:40 -03:00			`# Copyright 1996-2001:`
First Checked In. 1999-01-30 18:39:17 -04:00			`# Mitchell S. Chapman,`
			`# Zachary Roadhouse,`
			`# Tim Peters,`
			`# Just van Rossum`

- fixed some re usage, partly so it'll still work when re uses pre instead of sre, and partly fixing re -> regex porting oversights - fixed PyFontify.py so it actually works again.. 2001-07-10 16:25:40 -03:00			`__version__ = "0.4"`
First Checked In. 1999-01-30 18:39:17 -04:00
- fixed some re usage, partly so it'll still work when re uses pre instead of sre, and partly fixing re -> regex porting oversights - fixed PyFontify.py so it actually works again.. 2001-07-10 16:25:40 -03:00			`import string`
			`import re`
First Checked In. 1999-01-30 18:39:17 -04:00
			`# First a little helper, since I don't like to repeat things. (Tismer speaking)`
			`import string`
			`def replace(where, what, with):`
			`return string.join(string.split(where, what), with)`

			`# This list of keywords is taken from ref/node13.html of the`
			`# Python 1.3 HTML documentation. ("access" is intentionally omitted.)`
			`keywordsList = [`
oops, keyword list missed exec... doh! thanks to Maik Roeder (jvr) 2000-04-09 16:44:13 -03:00			`"assert", "exec",`
First Checked In. 1999-01-30 18:39:17 -04:00			`"del", "from", "lambda", "return",`
			`"and", "elif", "global", "not", "try",`
			`"break", "else", "if", "or", "while",`
			`"class", "except", "import", "pass",`
			`"continue", "finally", "in", "print",`
- fixed some re usage, partly so it'll still work when re uses pre instead of sre, and partly fixing re -> regex porting oversights - fixed PyFontify.py so it actually works again.. 2001-07-10 16:25:40 -03:00			`"def", "for", "is", "raise", "yield"]`
First Checked In. 1999-01-30 18:39:17 -04:00
			`# Build up a regular expression which will match anything`
			`# interesting, including multi-line triple-quoted strings.`
- fixed some re usage, partly so it'll still work when re uses pre instead of sre, and partly fixing re -> regex porting oversights - fixed PyFontify.py so it actually works again.. 2001-07-10 16:25:40 -03:00			`commentPat = r"#[^\n]*"`
First Checked In. 1999-01-30 18:39:17 -04:00
- fixed some re usage, partly so it'll still work when re uses pre instead of sre, and partly fixing re -> regex porting oversights - fixed PyFontify.py so it actually works again.. 2001-07-10 16:25:40 -03:00			`pat = r"q[^\\q\n](\\[\000-\377][^\\q\n])*q"`
			`quotePat = replace(pat, "q", "'") + "\|" + replace(pat, 'q', '"')`
First Checked In. 1999-01-30 18:39:17 -04:00
			`# Way to go, Tim!`
- fixed some re usage, partly so it'll still work when re uses pre instead of sre, and partly fixing re -> regex porting oversights - fixed PyFontify.py so it actually works again.. 2001-07-10 16:25:40 -03:00			`pat = r"""`
First Checked In. 1999-01-30 18:39:17 -04:00			`qqq`
			`[^\\q]*`
- fixed some re usage, partly so it'll still work when re uses pre instead of sre, and partly fixing re -> regex porting oversights - fixed PyFontify.py so it actually works again.. 2001-07-10 16:25:40 -03:00			`(`
			`( \\[\000-\377]`
			`\| q`
			`( \\[\000-\377]`
			`\| [^\q]`
			`\| q`
			`( \\[\000-\377]`
			`\| [^\\q]`
			`)`
			`)`
			`)`
First Checked In. 1999-01-30 18:39:17 -04:00			`[^\\q]*`
- fixed some re usage, partly so it'll still work when re uses pre instead of sre, and partly fixing re -> regex porting oversights - fixed PyFontify.py so it actually works again.. 2001-07-10 16:25:40 -03:00			`)*`
First Checked In. 1999-01-30 18:39:17 -04:00			`qqq`
			`"""`
			`pat = string.join(string.split(pat), '') # get rid of whitespace`
- fixed some re usage, partly so it'll still work when re uses pre instead of sre, and partly fixing re -> regex porting oversights - fixed PyFontify.py so it actually works again.. 2001-07-10 16:25:40 -03:00			`tripleQuotePat = replace(pat, "q", "'") + "\|" + replace(pat, 'q', '"')`
First Checked In. 1999-01-30 18:39:17 -04:00
			`# Build up a regular expression which matches all and only`
			`# Python keywords. This will let us skip the uninteresting`
			`# identifier references.`
			`# nonKeyPat identifies characters which may legally precede`
			`# a keyword pattern.`
- fixed some re usage, partly so it'll still work when re uses pre instead of sre, and partly fixing re -> regex porting oversights - fixed PyFontify.py so it actually works again.. 2001-07-10 16:25:40 -03:00			`nonKeyPat = r"(^\|[^a-zA-Z0-9_.\"'])"`
First Checked In. 1999-01-30 18:39:17 -04:00
- fixed some re usage, partly so it'll still work when re uses pre instead of sre, and partly fixing re -> regex porting oversights - fixed PyFontify.py so it actually works again.. 2001-07-10 16:25:40 -03:00			`keyPat = nonKeyPat + "(" + "\|".join(keywordsList) + ")" + nonKeyPat`
First Checked In. 1999-01-30 18:39:17 -04:00
- fixed some re usage, partly so it'll still work when re uses pre instead of sre, and partly fixing re -> regex porting oversights - fixed PyFontify.py so it actually works again.. 2001-07-10 16:25:40 -03:00			`matchPat = commentPat + "\|" + keyPat + "\|" + tripleQuotePat + "\|" + quotePat`
Use re in stead of regex, so we get rid of the annoying warning during startup. 2001-02-21 09:54:31 -04:00			`matchRE = re.compile(matchPat)`
First Checked In. 1999-01-30 18:39:17 -04:00
			`idKeyPat = "[ \t][A-Za-z_][A-Za-z_0-9.]" # Ident w. leading whitespace.`
Use re in stead of regex, so we get rid of the annoying warning during startup. 2001-02-21 09:54:31 -04:00			`idRE = re.compile(idKeyPat)`
First Checked In. 1999-01-30 18:39:17 -04:00

			`def fontify(pytext, searchfrom = 0, searchto = None):`
			`if searchto is None:`
			`searchto = len(pytext)`
			`# Cache a few attributes for quicker reference.`
			`search = matchRE.search`
			`idSearch = idRE.search`

			`tags = []`
			`tags_append = tags.append`
			`commentTag = 'comment'`
			`stringTag = 'string'`
			`keywordTag = 'keyword'`
			`identifierTag = 'identifier'`

			`start = 0`
			`end = searchfrom`
			`while 1:`
Use re in stead of regex, so we get rid of the annoying warning during startup. 2001-02-21 09:54:31 -04:00			`m = search(pytext, end)`
- fixed some re usage, partly so it'll still work when re uses pre instead of sre, and partly fixing re -> regex porting oversights - fixed PyFontify.py so it actually works again.. 2001-07-10 16:25:40 -03:00			`if m is None:`
			`break # EXIT LOOP`
			`start = m.start()`
			`if start >= searchto:`
First Checked In. 1999-01-30 18:39:17 -04:00			`break # EXIT LOOP`
Use re in stead of regex, so we get rid of the annoying warning during startup. 2001-02-21 09:54:31 -04:00			`match = m.group(0)`
First Checked In. 1999-01-30 18:39:17 -04:00			`end = start + len(match)`
			`c = match[0]`
			`if c not in "#'\"":`
			`# Must have matched a keyword.`
			`if start <> searchfrom:`
			`# there's still a redundant char before and after it, strip!`
			`match = match[1:-1]`
			`start = start + 1`
			`else:`
			`# this is the first keyword in the text.`
			`# Only a space at the end.`
			`match = match[:-1]`
			`end = end - 1`
			`tags_append((keywordTag, start, end, None))`
			`# If this was a defining keyword, look ahead to the`
			`# following identifier.`
			`if match in ["def", "class"]:`
Use re in stead of regex, so we get rid of the annoying warning during startup. 2001-02-21 09:54:31 -04:00			`m = idSearch(pytext, end)`
- fixed some re usage, partly so it'll still work when re uses pre instead of sre, and partly fixing re -> regex porting oversights - fixed PyFontify.py so it actually works again.. 2001-07-10 16:25:40 -03:00			`if m is not None:`
			`start = m.start()`
			`if start == end:`
			`match = m.group(0)`
			`end = start + len(match)`
			`tags_append((identifierTag, start, end, None))`
First Checked In. 1999-01-30 18:39:17 -04:00			`elif c == "#":`
			`tags_append((commentTag, start, end, None))`
			`else:`
			`tags_append((stringTag, start, end, None))`
			`return tags`


			`def test(path):`
			`f = open(path)`
			`text = f.read()`
			`f.close()`
			`tags = fontify(text)`
			`for tag, start, end, sublist in tags:`
			print tag, `text[start:end]`