class HTML::Tokenizer
:nodoc:
end
p token
while token = tokenizer.next
tokenizer = HTML::Tokenizer.new(text)
Usage:
This currently assumes valid XHTML, which means no free < or > characters.
A simple HTML tokenizer. It simply breaks a stream of text into tokens, where each
token is a string. Each string represents either “text”, or an HTML element.
def consume_quoted_regions
Skips over quoted strings, so that less-than and greater-than characters within the strings are ignored.
# Consumes input from the scanner's current position and returns it as a
# String. Scanning ends after the first ">" or just before the first "<"
# that lies outside a quoted region; the "<" itself is left unconsumed.
# Regions delimited by ' or " are copied through, so "<" and ">" inside
# them do not end the scan. Inside a quoted region a backslash causes the
# character after it to be copied without being examined.
#
# If the remaining input contains none of ' " < >, scanning stops and
# whatever has been collected so far (possibly "") is returned.
def consume_quoted_regions
  consumed = +""

  while (chunk = @scanner.scan_until(/['"<>]/))
    delim = @scanner.matched

    case delim
    when "<"
      # Step back one position so the "<" stays in the input, and drop it
      # from the chunk that was just scanned.
      @scanner.pos -= 1
      consumed << chunk.chop
      break
    when ">"
      consumed << chunk
      break
    end

    # delim is a quote character here: keep the chunk (including the
    # opening quote) and copy everything up to the matching close quote.
    consumed << chunk
    close_or_escape = /[\\#{delim}]/
    while (piece = @scanner.scan_until(close_or_escape))
      consumed << piece
      break if @scanner.matched == delim || @scanner.eos?

      consumed << @scanner.getch # the character following the backslash
    end
  end

  consumed
end
def initialize(text)
# Creates a tokenizer that reads from +text+.
#
# text - the String to tokenize. It is transcoded with String#encode into a
#        separate copy, so the caller's string is not modified and frozen
#        strings are accepted (String#encode! would raise FrozenError on
#        them).
#
# Sets up the scanner and the bookkeeping state: @position and @line start
# at 0, and @current_line starts at 1.
def initialize(text)
  @scanner = StringScanner.new(text.encode)
  @position = 0
  @line = 0
  @current_line = 1
end
def next
Returns the next token in the sequence, or +nil+ if there are no more tokens in the stream.
# Produces the next token, or nil when the scanner is at the end of input.
#
# Before scanning, the scanner's offset is stored in @position and the
# current line counter in @line. Input that starts with "<" followed by a
# non-whitespace character is scanned with scan_tag; anything else is
# scanned with scan_text. The scanned token is passed to
# update_current_line, and that call's result is the return value.
def next
  return if @scanner.eos?

  @position = @scanner.pos
  @line = @current_line

  token = @scanner.check(/<\S/) ? scan_tag : scan_text
  update_current_line(token)
end
def scan_tag
Treat the text at the current position as a tag, and scan it. Supports
comments, doctype tags, and regular tags, and ignores less-than and
greater-than characters within quoted strings.
# Scans one tag starting at the scanner's current position and returns it
# as a String. The first character is taken unconditionally, then:
#
# * "!--" (comment): everything through the next "--", optional
#   whitespace and ">" is included, or the remainder matched up to /\Z/
#   when no terminator exists.
# * "![CDATA[": everything through the next "]]>" is included, with the
#   same fallback.
# * anything else, including a leading "!" (doctype): the rest of the tag
#   comes from consume_quoted_regions.
def scan_tag
  tag = @scanner.getch

  # Reads through the given terminator, falling back to /\Z/ when the
  # terminator never appears.
  read_through = lambda do |terminator|
    @scanner.scan_until(terminator) || @scanner.scan_until(/\Z/)
  end

  if (opener = @scanner.scan(/!--/))
    tag << opener << read_through.call(/--\s*>/)
  elsif (opener = @scanner.scan(/!\[CDATA\[/))
    tag << opener << read_through.call(/\]\]>/)
  else
    bang = @scanner.scan(/!/) # present for doctype-style tags
    tag << bang if bang
    tag << consume_quoted_regions
  end

  tag
end
def scan_text
# Scans a run of plain text and returns it as a String: the character at
# the current position (whatever it is, including "<") followed by every
# subsequent character up to, but not including, the next "<".
def scan_text
  lead = @scanner.getch
  tail = @scanner.scan(/[^<]*/)
  "#{lead}#{tail}"
end
def update_current_line(text)
Counts the number of newlines in the text and updates the current line accordingly.
# Advances @current_line by the number of line breaks ("\n" or "\r\n")
# found in +text+ and returns +text+ itself.
#
# @current_line is left untouched when +text+ contains no line break.
def update_current_line(text)
  breaks = text.scan(/\r?\n/).size
  @current_line += breaks unless breaks.zero?
  text
end