1 __all__ = ['findHTMLMeta', 'MetaNotFound']
2
3 from HTMLParser import HTMLParser, HTMLParseError
4 import htmlentitydefs
5 import re
6
7 from openid.yadis.constants import YADIS_HEADER_NAME
8
9
10
# 16 KiB (16384). The consumer of this constant is not visible in this
# section -- presumably the read size used when streaming a document into
# the parser; confirm against the caller.
CHUNK_SIZE = 16 * 1024
12
class ParseDone(Exception):
    """Exception raised to stop parsing and carry the result out.

    The first argument holds the URI that was located when the parse
    finished.  If the parse finishes without finding the URI, the first
    argument is None.
    """
17
21
# Flags for the entity pattern: case-insensitive matching, Unicode-aware
# \w and \d, and verbose layout (whitespace inside the pattern is ignored).
re_flags = re.VERBOSE | re.UNICODE | re.IGNORECASE

# One HTML character reference: "&", then a hexadecimal reference
# (group "hex"), a decimal reference (group "dec") or a named entity
# (group "word"), then the terminating ";".
ent_pat = r'''
&

(?: \#x (?P<hex> [a-f0-9]+ )
| \# (?P<dec> \d+ )
| (?P<word> \w+ )
)

;'''

# Compiled once at import time.
ent_re = re.compile(ent_pat, flags=re_flags)
34
def substituteMO(mo):
    """Return the replacement text for one character-reference match.

    mo is a match object whose last matched group is named 'hex', 'dec'
    or 'word' (hexadecimal reference, decimal reference, or entity name).

    Returns the single character the reference stands for.  If the
    entity name is unknown, or the numeric value is not a code point
    that unichr() can represent, the matched text is returned unchanged.
    """
    kind = mo.lastgroup
    if kind == 'hex':
        codepoint = int(mo.group('hex'), 16)
    elif kind == 'dec':
        codepoint = int(mo.group('dec'))
    else:
        assert kind == 'word'
        codepoint = htmlentitydefs.name2codepoint.get(mo.group('word'))

    if codepoint is None:
        # Unknown entity name: leave the text as it appeared.
        return mo.group()

    try:
        return unichr(codepoint)
    except (ValueError, OverflowError):
        # The document is untrusted input; a reference such as
        # "&#99999999999;" must not abort the parse, so it is left
        # unsubstituted just like an unknown name.
        return mo.group()
48
def substituteEntities(s):
    """Return s with every character reference matched by ent_re
    replaced by the result of substituteMO for that match."""
    return ent_re.sub(substituteMO, s)
51
"""Parser that finds a meta http-equiv tag in the head of an HTML
document.

When feeding in data, if the tag is matched or it will never be
found, the parser will raise ParseDone with the URI as the first
argument (None if the tag was not found).

Parsing state diagram
=====================

Any unlisted input does not affect the state::

            1, 2, 5                       8
           +--------------------------+  +-+
           |                          |  | |
        4  |    3       1, 2, 5, 7    v  | v
    TOP -> HTML -> HEAD ----------> TERMINATED
    | |            ^  |               ^  ^
    | | 3          |  |               |  |
    | +------------+  +-> FOUND ------+  |
    |                  6         8       |
    | 1, 2                               |
    +------------------------------------+

1. any of </body>, </html>, </head> -> TERMINATE
2. <body> -> TERMINATE
3. <head> -> HEAD
4. <html> -> HTML
5. <html> -> TERMINATE
6. <meta http-equiv='X-XRDS-Location'> -> FOUND
7. <head> -> TERMINATE
8. Any input -> TERMINATE
"""
# Parser phases, numbered 0..4 in this order:
#   TOP        - initial phase, nothing relevant seen yet
#   HTML       - an <html> start tag has been seen
#   HEAD       - a <head> start tag has been seen; <meta> tags are examined
#   FOUND      - the matching <meta> tag was found
#   TERMINATED - parsing ended without a result
TOP, HTML, HEAD, FOUND, TERMINATED = range(5)
91
def __init__(self):
    """Initialise the underlying HTMLParser and start in the TOP phase."""
    HTMLParser.__init__(self)
    self.phase = self.TOP
95
def _terminate(self):
    """Record that parsing is over without a result and stop the parse.

    Sets the phase to TERMINATED and always raises ParseDone(None).
    """
    self.phase = self.TERMINATED
    raise ParseDone(None)
99
101
102
def handle_endtag(self, tag):
    """Handle an end tag reported by HTMLParser.

    Closing any of </head>, </body> or </html> means the tag being
    searched for can no longer appear, so the parse is terminated.
    Every other end tag is ignored.
    """
    if tag in ('head', 'body', 'html'):
        self._terminate()
105
107
108
109
def handle_starttag(self, tag, attrs):
    """Handle a start tag reported by HTMLParser and advance the phase.

    tag is the tag name; attrs is a sequence of (name, value) pairs and
    is only inspected for <meta> tags seen in the HEAD phase.

    Raises ParseDone with the entity-substituted content value when a
    <meta> tag whose http-equiv equals YADIS_HEADER_NAME (compared
    case-insensitively) is found in the HEAD phase.  Tags that make the
    search pointless (<body>, a repeated <html> or <head>) end the parse
    through self._terminate().
    """
    # <body> ends the search no matter which phase we are in.
    if tag == 'body':
        self._terminate()

    if self.phase == self.TOP:
        # At the top level, <head> or <html> moves us forward.
        if tag == 'head':
            self.phase = self.HEAD
        elif tag == 'html':
            self.phase = self.HTML

    elif self.phase == self.HTML:
        # Inside <html>: <head> moves forward, a second <html> is fatal.
        if tag == 'head':
            self.phase = self.HEAD
        elif tag == 'html':
            self._terminate()

    elif self.phase == self.HEAD:
        if tag == 'meta':
            attrs_d = dict(attrs)
            # The attribute may be present without a value (None);
            # treat that the same as a missing attribute.
            http_equiv = (attrs_d.get('http-equiv') or '').lower()
            if http_equiv == YADIS_HEADER_NAME.lower():
                raw_attr = attrs_d.get('content')
                # A matching <meta> without a content value carries no
                # location; skip it and keep scanning rather than
                # passing None to the entity substitution.
                if raw_attr is not None:
                    yadis_loc = substituteEntities(raw_attr)
                    self.phase = self.FOUND
                    raise ParseDone(yadis_loc)

        elif tag in ('head', 'html'):
            # A nested <head> or <html> inside the head is fatal.
            self._terminate()
150
def feed(self, chars):
    """Feed more document text to the parser.

    If a previous call already finished the search (phase FOUND or
    TERMINATED), no further input is parsed and self._terminate() is
    called instead.  Otherwise the data is handed to HTMLParser.feed and
    its result is returned.
    """
    if self.phase in (self.TERMINATED, self.FOUND):
        self._terminate()

    return HTMLParser.feed(self, chars)
157
198