"""A parser for HTML and XHTML.""" # This file is based on sgmllib.py, but the API is slightly different. # XXX There should be a way to distinguish between PCDATA (parsed # character data -- the normal case), RCDATA (replaceable character # data -- only char and entity references and end tags are special) # and CDATA (character data -- only end tags are special). import re import string # Regular expressions used for parsing interesting_normal = re.compile('[&<]') interesting_cdata = re.compile(r'<(/|\Z)') incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*|#[0-9]*)?') entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]') starttagopen = re.compile('<[a-zA-Z]') piopen = re.compile(r'<\?') piclose = re.compile('>') endtagopen = re.compile(']*>') commentopen = re.compile('