Package openid :: Package yadis :: Module parsehtml
[frames | no frames]

Source Code for Module openid.yadis.parsehtml

  1  __all__ = ['findHTMLMeta', 'MetaNotFound'] 
  2   
  3  from HTMLParser import HTMLParser, HTMLParseError 
  4  import htmlentitydefs 
  5  import re 
  6   
  7  from openid.yadis.constants import YADIS_HEADER_NAME 
  8   
  9  # Size of the chunks to search at a time (also the amount that gets 
 10  # read at a time) 
 11  CHUNK_SIZE = 1024 * 16 # 16 KB 
 12   
class ParseDone(Exception):
    """Signal that parsing has finished.

    The first exception argument carries the URI that was located.  It
    is None when parsing finished without finding a URI.
    """
17
class MetaNotFound(Exception):
    """Raised when the appropriate <meta> tag was not found.

    The first exception argument carries the content of the page that
    was searched.
    """
# Flags for the entity pattern: case-insensitive, Unicode-aware
# character classes, and verbose layout (whitespace in the pattern is
# ignored, which is why the literal '#' characters are escaped).
re_flags = re.VERBOSE | re.UNICODE | re.IGNORECASE

# One entity reference: '&', then a hexadecimal reference, a decimal
# reference or a named entity, then the terminating ';'.  Exactly one
# of the named groups 'hex', 'dec' or 'word' takes part in a match.
ent_pat = r'''
&

(?: \#x (?P<hex> [a-f0-9]+ )
|   \#  (?P<dec> \d+ )
|       (?P<word> \w+ )
)

;'''

ent_re = re.compile(ent_pat, re_flags)
def substituteMO(mo):
    """Return the replacement text for one matched entity reference.

    @param mo: A match produced by the entity pattern; exactly one of
        its named groups C{hex}, C{dec} or C{word} took part in the
        match.

    @return: The character the reference stands for, or the matched
        text unchanged when the reference cannot be resolved (an
        unknown entity name, or a numeric reference that does not
        denote a representable code point).
    """
    kind = mo.lastgroup
    if kind == 'hex':
        codepoint = int(mo.group('hex'), 16)
    elif kind == 'dec':
        codepoint = int(mo.group('dec'))
    else:
        assert kind == 'word'
        # Unknown entity names yield None here.
        codepoint = htmlentitydefs.name2codepoint.get(mo.group('word'))

    if codepoint is None:
        return mo.group()

    try:
        return unichr(codepoint)
    except (ValueError, OverflowError):
        # Numeric references are attacker-controlled and may be out of
        # range (e.g. '&#x7fffffff;').  Leave them as written, exactly
        # like unknown named entities, instead of aborting the parse.
        return mo.group()
48
def substituteEntities(s):
    """Expand the entity references found in a string.

    Every match of the module's entity pattern in C{s} is passed to
    L{substituteMO}, and the match is replaced by what that returns.

    @param s: The text to process

    @return: The text with the replacements applied
    """
    expanded = ent_re.sub(substituteMO, s)
    return expanded
51
class YadisHTMLParser(HTMLParser):
    """Parser that finds a meta http-equiv tag in the head of a html
    document.

    When feeding in data, if the tag is matched or it will never be
    found, the parser will raise ParseDone with the uri as the first
    attribute.

    A matching meta tag that carries no content value is ignored and
    scanning continues; a meta tag whose http-equiv attribute has no
    value never matches.

    Parsing state diagram
    =====================

    Any unlisted input does not affect the state::

                1, 2, 5                       8
               +--------------------------+  +-+
               |                          |  | |
            4  |    3       1, 2, 5, 7    v  | v
        TOP -> HTML -> HEAD ----------> TERMINATED
        | |            ^  |               ^  ^
        | | 3          |  |               |  |
        | +------------+  +-> FOUND ------+  |
        |                  6         8       |
        | 1, 2                               |
        +------------------------------------+

      1. any of </body>, </html>, </head> -> TERMINATE
      2. <body> -> TERMINATE
      3. <head> -> HEAD
      4. <html> -> HTML
      5. <html> -> TERMINATE
      6. <meta http-equiv='X-XRDS-Location'> -> FOUND
      7. <head> -> TERMINATE
      8. Any input -> TERMINATE
    """
    # Parser phases; see the state diagram in the class docstring.
    TOP = 0
    HTML = 1
    HEAD = 2
    FOUND = 3
    TERMINATED = 4

    def __init__(self):
        """Initialise the underlying HTMLParser and start in the TOP phase."""
        # Explicit base-class call rather than super(), matching feed()
        # below.
        HTMLParser.__init__(self)
        self.phase = self.TOP

    def _terminate(self):
        """Enter the TERMINATED phase and stop parsing.

        @raises ParseDone: always, with None as the located URI.
        """
        self.phase = self.TERMINATED
        raise ParseDone(None)

    def handle_endtag(self, tag):
        """Stop parsing at the first closing head, body or html tag.

        @raises ParseDone: with None, for any of those three tags.
        """
        # If we ever see an end of head, body, or html, bail out right away.
        # [1]
        if tag in ['head', 'body', 'html']:
            self._terminate()

    def handle_starttag(self, tag, attrs):
        """Advance the state machine for one start tag.

        @param tag: The tag name as reported by HTMLParser
        @param attrs: List of (name, value) pairs for the tag

        @raises ParseDone: with the located URI when the wanted meta
            tag is seen in the HEAD phase, or with None when the tag
            shows that the meta tag can no longer appear.
        """
        # if we ever see a start body tag, bail out right away, since
        # we want to prevent the meta tag from appearing in the body
        # [2]
        if tag == 'body':
            self._terminate()

        if self.phase == self.TOP:
            # At the top level, allow a html tag or a head tag to move
            # to the head or html phase
            if tag == 'head':
                # [3]
                self.phase = self.HEAD
            elif tag == 'html':
                # [4]
                self.phase = self.HTML

        elif self.phase == self.HTML:
            # if we are in the html tag, allow a head tag to move to
            # the HEAD phase. If we get another html tag, then bail
            # out
            if tag == 'head':
                # [3]
                self.phase = self.HEAD
            elif tag == 'html':
                # [5]
                self._terminate()

        elif self.phase == self.HEAD:
            # If we are in the head phase, look for the appropriate
            # meta tag. If we get a head or body tag, bail out.
            if tag == 'meta':
                attrs_d = dict(attrs)
                # An attribute written without a value is reported as
                # None, so a plain .get(name, '') default is not
                # enough to make .lower() safe.
                http_equiv = (attrs_d.get('http-equiv') or '').lower()
                if http_equiv == YADIS_HEADER_NAME.lower():
                    raw_attr = attrs_d.get('content')
                    if raw_attr is None:
                        # Missing or value-less content: there is no
                        # location to report, so skip this malformed
                        # tag and keep looking.
                        return
                    yadis_loc = substituteEntities(raw_attr)
                    # [6]
                    self.phase = self.FOUND
                    raise ParseDone(yadis_loc)

            elif tag in ['head', 'html']:
                # [5], [7]
                self._terminate()

    def feed(self, chars):
        """Feed more text to the parser.

        @raises ParseDone: with None if the parser had already reached
            the FOUND or TERMINATED phase before this call; otherwise
            whatever the tag handlers raise while parsing C{chars}.
        """
        # [8]
        if self.phase in [self.TERMINATED, self.FOUND]:
            self._terminate()

        return HTMLParser.feed(self, chars)
157
158 -def findHTMLMeta(stream):
159 """Look for a meta http-equiv tag with the YADIS header name. 160 161 @param stream: Source of the html text 162 @type stream: Object that implements a read() method that works 163 like file.read 164 165 @return: The URI from which to fetch the XRDS document 166 @rtype: str 167 168 @raises MetaNotFound: raised with the content that was 169 searched as the first parameter. 170 """ 171 parser = YadisHTMLParser() 172 chunks = [] 173 174 while 1: 175 chunk = stream.read(CHUNK_SIZE) 176 if not chunk: 177 # End of file 178 break 179 180 chunks.append(chunk) 181 try: 182 parser.feed(chunk) 183 except HTMLParseError, why: 184 # HTML parse error, so bail 185 chunks.append(stream.read()) 186 break 187 except ParseDone, why: 188 uri = why[0] 189 if uri is None: 190 # Parse finished, but we may need the rest of the file 191 chunks.append(stream.read()) 192 break 193 else: 194 return uri 195 196 content = ''.join(chunks) 197 raise MetaNotFound(content)
198