Class: HTMLTokenizer

A class that tokenizes HTML, letting a caller step through a page's tags
and the text between them.

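Example (each getTag call returns the next tag whose name matches one of the
given names, ignoring case; markup inside an HTML comment is skipped):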

  page = "<HTML>
  <HEAD>
  <TITLE>This is the title</TITLE>
  </HEAD>
   <!-- Here comes the <a href=\"missing.link\">blah</a>
   comment body
    -->
   <BODY>
     <H1>This is the header</H1>
     <P>
       This is the paragraph, it contains
       <a href=\"link.html\">links</a>,
       <img src=\"blah.gif\" optional alt='images
       are
       really cool'>.  Ok, here is some more text and
       <A href=\"http://another.link.com/\" target=\"_blank\">another link</A>.
     </P>
   </body>
   </HTML>
   "
   toke = HTMLTokenizer.new(page)

   assert("<h1>" == toke.getTag("h1", "h2", "h3").to_s.downcase)
   assert(HTMLTag.new("<a href=\"link.html\">") == toke.getTag("IMG", "A"))
   assert("links" == toke.getTrimmedText)
   assert(toke.getTag("IMG", "A").attr_hash['optional'])
   assert("_blank" == toke.getTag("IMG", "A").attr_hash['target'])