"""
This module implements a VERY limited parser that finds <link> tags in
the head of HTML or XHTML documents and parses out their attributes
according to the OpenID spec. It is a liberal parser, but it requires
these things from the data in order to work:

 - There must be an open <html> tag

 - There must be an open <head> tag inside of the <html> tag

 - Only <link>s that are found inside of the <head> tag are parsed
   (this is by design)

 - The parser follows the OpenID specification in resolving the
   attributes of the link tags. This means that the attributes DO NOT
   get resolved as they would by an XML or HTML parser. In particular,
   only certain entities get replaced, and href attributes do not get
   resolved relative to a base URL.

From http://openid.net/specs.bml#linkrel:

 - The openid.server URL MUST be an absolute URL. OpenID consumers
   MUST NOT attempt to resolve relative URLs.

 - The openid.server URL MUST NOT include entities other than &amp;,
   &lt;, &gt;, and &quot;.

The parser ignores SGML comments and <![CDATA[blocks]]>. Both kinds of
quoting are allowed for attributes.

The parser deals with invalid markup in these ways:

 - Tag names are not case-sensitive

 - The <html> tag is accepted even when it is not at the top level

 - The <head> tag is accepted even when it is not a direct child of
   the <html> tag, but a <html> tag must be an ancestor of the <head>
   tag

 - <link> tags are accepted even when they are not direct children of
   the <head> tag, but a <head> tag must be an ancestor of the <link>
   tag

 - If there is no closing tag for an open <html> or <head> tag, the
   remainder of the document is viewed as being inside of the tag. If
   there is no closing tag for a <link> tag, the link tag is treated
   as a short tag. Exceptions to this rule are that <html> closes
   <html> and <body> or <head> closes <head>

 - Attributes of the <link> tag are not required to be quoted.

 - In the case of duplicated attribute names, the attribute coming
   last in the tag will be the value returned.

 - Any text that does not parse as an attribute within a link tag will
   be ignored. (e.g. <link pumpkin rel='openid.server' /> will ignore
   pumpkin)

 - If there is more than one <html> or <head> tag, the parser only
   looks inside of the first one.

 - The contents of <script> tags are ignored entirely, unless the
   <script> tag is unclosed. An unclosed <script> tag is itself
   ignored, and the markup after it is parsed normally.

 - Any other invalid markup is ignored, including unclosed SGML
   comments and unclosed <![CDATA[blocks.
"""
 69   
 70  __all__ = ['parseLinkAttrs'] 
 71   
 72  import re 
 73   
 74  flags = ( re.DOTALL  
 75          | re.IGNORECASE 
 76          | re.VERBOSE  
 77          | re.UNICODE  
 78          ) 
 79   
 80   
 81  removed_re = re.compile(r''' 
 82    # Comments 
 83    <!--.*?--> 
 84   
 85    # CDATA blocks 
 86  | <!\[CDATA\[.*?\]\]> 
 87   
 88    # script blocks 
 89  | <script\b 
 90   
 91    # make sure script is not an XML namespace 
 92    (?!:) 
 93   
 94    [^>]*>.*?</script> 
 95   
 96  ''', flags) 
 97   
 98  tag_expr = r''' 
 99  # Starts with the tag name at a word boundary, where the tag name is 
100  # not a namespace 
101  <%(tag_name)s\b(?!:) 
102   
103  # All of the stuff up to a ">", hopefully attributes. 
104  (?P<attrs>[^>]*?) 
105   
106  (?: # Match a short tag 
107      /> 
108   
109  |   # Match a full tag 
110      > 
111   
112      (?P<contents>.*?) 
113   
114      # Closed by 
115      (?: # One of the specified close tags 
116          </?%(closers)s\s*> 
117   
118          # End of the string 
119      |   \Z 
120   
121      ) 
122   
123  ) 
124  ''' 
125   
def tagMatcher(tag_name, *close_tags):
    """Compile a regular expression that matches one tag_name element.

    The names are interpolated into tag_expr verbatim (they are not
    regex-escaped).

    @param tag_name: the name of the tag to match
    @param close_tags: names of additional tags that, like tag_name
        itself, end the element when they appear as a start or end tag

    @return: the compiled pattern; it has the groups 'attrs' and
        'contents' defined by tag_expr
    """
    if close_tags:
        options = '|'.join((tag_name,) + close_tags)
        closers = '(?:%s)' % (options,)
    else:
        closers = tag_name

    # Name the template fields explicitly rather than formatting with
    # locals(), so the inputs of the template are visible here.
    expr = tag_expr % {'tag_name': tag_name, 'closers': closers}
    return re.compile(expr, flags)
 135   
136   
# Pre-compiled matchers used by parseLinkAttrs.
# An <html> element is ended by another html start or end tag.
html_find = tagMatcher('html')
# A <head> element is ended by a head or body start or end tag.
head_find = tagMatcher('head', 'body')
# Matches only the opening "<link" of a link tag (and not a namespaced
# "<link:"); the attributes that follow are scanned with attr_find.
link_find = re.compile(r'<link\b(?!:)', flags)
140   
# Scans the text that follows "<link". Each match is either
#   - one name=value attribute: group attr_name plus q_val (quoted value,
#     quotes not included) or unq_val (unquoted value), or
#   - a single "<" or ">" in group end_link, which marks the end of the
#     link tag.
# Text that matches neither alternative is skipped by the search.
attr_find = re.compile(r'''
# Must start with a sequence of word-characters, followed by an equals sign
(?P<attr_name>\w+)=

# Then either a quoted or unquoted attribute
(?:

 # Match everything that\'s between matching quote marks
 (?P<qopen>["\'])(?P<q_val>.*?)(?P=qopen)
|

 # If the value is not quoted, match up to whitespace
 (?P<unq_val>(?:[^\s<>/]|/(?!>))+)
)

|

(?P<end_link>[<>])
''', flags)
160   
161   
# The only entity names that are decoded in attribute values, mapped to
# the characters they stand for.
replacements = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
}

# Matches exactly the entities named in replacements, e.g. "&amp;".
# No flags are passed, so the entity names are case-sensitive.
ent_replace = re.compile(r'&(%s);' % '|'.join(replacements))


def replaceEnt(mo):
    """Replace the entities that are specified by OpenID.

    Intended as the replacement callback for ent_replace.sub().

    @param mo: a match object whose group 1 is the entity name

    @return: the character for that entity name, or the whole matched
        text unchanged if the name is not in replacements
    """
    return replacements.get(mo.group(1), mo.group())
 173   
def parseLinkAttrs(html):
    """Find all link tags in a string representing a HTML document and
    return a list of their attributes.

    Attribute names are stored exactly as written in the document, and
    when a name is repeated within one tag the last value wins. Entities
    in values are decoded with ent_replace / replaceEnt only.

    @param html: the text to parse
    @type html: str or unicode

    @return: A list of dictionaries of attributes, one for each link tag.
        The list is empty when no usable <html> or <head> tag is found.
    @rtype: [{type(html): type(html)}]
    """
    stripped = removed_re.sub('', html)

    html_mo = html_find.search(stripped)
    # start('contents') is -1 when the group took no part in the match,
    # which is the case for a short tag such as <html/>.
    if html_mo is None or html_mo.start('contents') == -1:
        return []

    # Only look for the head inside the contents of the html tag.
    start, end = html_mo.span('contents')
    head_mo = head_find.search(stripped, start, end)
    if head_mo is None or head_mo.start('contents') == -1:
        return []

    # Link tags are searched for across the whole span matched for the
    # head element.
    link_mos = link_find.finditer(stripped, head_mo.start(), head_mo.end())

    matches = []
    for link_mo in link_mos:
        link_attrs = {}
        # Scan from just past "<link" until the first "<" or ">".
        for attr_mo in attr_find.finditer(stripped, link_mo.end()):
            if attr_mo.lastgroup == 'end_link':
                break

            # Only one of q_val / unq_val takes part in a match. unq_val
            # is never empty when present, so "or" picks it and otherwise
            # falls back to q_val (which may be the empty string).
            attr_name, q_val, unq_val = attr_mo.group(
                'attr_name', 'q_val', 'unq_val')
            attr_val = ent_replace.sub(replaceEnt, unq_val or q_val)

            link_attrs[attr_name] = attr_val

        matches.append(link_attrs)

    return matches
 216   
def relMatches(rel_attr, target_rel):
    """Does this target_rel appear in the rel_str?

    @param rel_attr: the value of a rel attribute, a whitespace-separated
        list of relationship names
    @param target_rel: the relationship to look for. It is compared as
        given against the lower-cased names, so it should be lower case.

    @return: True if one of the names equals target_rel, otherwise False
    @rtype: bool
    """
    return any(rel.lower() == target_rel for rel in rel_attr.split())


def linkHasRel(link_attrs, target_rel):
    """Does this link have target_rel as a relationship?

    @param link_attrs: the attribute dictionary of one link tag
    @param target_rel: the relationship to look for

    @return: False when the link has no (or an empty) 'rel' attribute,
        otherwise the result of relMatches for that attribute
    @rtype: bool
    """
    rel_attr = link_attrs.get('rel')
    return bool(rel_attr) and relMatches(rel_attr, target_rel)


def findLinksRel(link_attrs_list, target_rel):
    """Filter the list of link attributes on whether it has target_rel
    as a relationship.

    @param link_attrs_list: a list of link attribute dictionaries
    @param target_rel: the relationship to look for

    @return: a new list holding the matching dictionaries, in their
        original order
    @rtype: list
    """
    # Build a real list: on Python 3 filter() returns a lazy iterator,
    # which is always truthy and cannot be indexed.
    return [attrs for attrs in link_attrs_list
            if linkHasRel(attrs, target_rel)]


# NOTE(review): the def line of this function was missing from the
# listing this file was recovered from. The name findFirstHref is taken
# from upstream python-openid -- confirm against callers.
def findFirstHref(link_attrs_list, target_rel):
    """Return the value of the href attribute for the first link tag
    in the list that has target_rel as a relationship.

    @param link_attrs_list: a list of link attribute dictionaries
    @param target_rel: the relationship to look for

    @return: the 'href' value of the first matching link, or None when
        no link matches or the first match has no 'href' attribute
    """
    matches = findLinksRel(link_attrs_list, target_rel)
    if not matches:
        return None
    return matches[0].get('href')