# lib/hpricot/parse.rb



require 'hpricot/htmlinfo'

# Convenience entry point: <tt>Hpricot(input, opts) { ... }</tt> hands its
# arguments and block, unchanged, to Hpricot.parse and returns that result.
def Hpricot(input = nil, opts = {}, &block)
  Hpricot.parse(input, opts, &block)
end

module Hpricot
  # Raised when a document's character encoding cannot be handled: either the
  # requested encoding is not supported or its support library is not loaded.
  class EncodingError < StandardError
  end

  # Hpricot.parse parses <i>input</i> (or evaluates the given block) and
  # returns the resulting document tree as an Hpricot::Doc.
  # <i>opts</i> is handed to both the node builder and the Doc constructor.
  def Hpricot.parse(input = nil, opts = {}, &blk)
    nodes = make(input, opts, &blk)
    Doc.new(nodes, opts)
  end

  # Hpricot::XML parses <i>input</i>, disregarding all the HTML rules,
  # and returns a document tree (an Hpricot::Doc).
  #
  # The <tt>:xml => true</tt> flag is added to a copy of <i>opts</i>, so the
  # hash supplied by the caller is left untouched and may be frozen or reused
  # for a later, non-XML parse.
  def Hpricot.XML(input = nil, opts = {}, &blk)
    xml_opts = opts.merge(:xml => true)
    Doc.new(make(input, xml_opts, &blk), xml_opts)
  end

  # :stopdoc:

  # Builds the list of top-level nodes for a document.
  #
  # With +input+, the tokens yielded by Hpricot.scan are assembled into a
  # nested structure using a stack of open elements, and each top-level
  # structure is then converted with build_node. Without +input+, the nodes
  # are the children of whatever Hpricot.build produces from the block.
  #
  # Options read here:
  # * <tt>:xml</tt>          - keep names as written and skip the HTML element rules.
  # * <tt>:encoding</tt>     - only +nil+ and 'utf-8' are accepted.
  # * <tt>:fixup_tags</tt>   - (default false) relocate elements that the
  #   ElementContent tables say may not appear inside their open ancestors.
  # * <tt>:xhtml_strict</tt> - switches :fixup_tags on and turns elements that
  #   are unknown to ElementContent into <tt>div</tt>.
  #
  # Raises ArgumentError when neither +input+ nor a block is given, and
  # EncodingError for an :encoding that cannot be handled.
  def Hpricot.make(input = nil, opts = {}, &blk)
    # Work on a copy so the :fixup_tags assignment below never reaches the
    # caller's hash.
    opts = {:fixup_tags => false}.merge(opts)
    unless input or blk
      raise ArgumentError, "An Hpricot document must be built from an input source (a String) or a block."
    end

    # Name of the String method used to normalize tag names: unchanged for
    # XML, lowercased otherwise.
    conv = opts[:xml] ? :to_s : :downcase

    # `fragment` is never read afterwards; the value of this if/elsif
    # expression is what the method returns.
    fragment =
    if input
      case opts[:encoding]
      when nil
      when 'utf-8'
        unless defined? Encoding::Character::UTF8
          raise EncodingError, "The ruby-character-encodings library could not be found for utf-8 mode."
        end
      else
        raise EncodingError, "No encoding option `#{opts[:encoding]}' is available."
      end

      if opts[:xhtml_strict]
        opts[:fixup_tags] = true
      end

      # Stack of open elements. Each entry is
      #   [tagname, start-tag token, children, excluded_tags, included_tags, uncontainable_tags]
      # (see the push at the end of the :stag branch). The first entry is a
      # nameless root whose children become the top-level nodes.
      stack = [[nil, nil, [], [], [], []]]
      Hpricot.scan(input) do |token|
        # Inside an element whose content is :CDATA, everything except
        # processing instructions, comments, CDATA sections and the element's
        # own end tag is demoted to text. token[3], when present, replaces the
        # text payload (presumably the raw source of the token - confirm
        # against Hpricot.scan).
        if stack.last[5] == :CDATA and ![:procins, :comment, :cdata].include?(token[0]) and
            !(token[0] == :etag and token[1].casecmp(stack.last[0]).zero?)
          token[0] = :text
          token[1] = token[3] if token[3]
        end

        # In HTML mode a self-closing tag only stays an empty tag when
        # ElementContent declares the element :EMPTY; otherwise it is handled
        # as an ordinary start tag.
        if !opts[:xml] and token[0] == :emptytag
          token[1] = token[1].send(conv)
          if ElementContent[token[1].downcase] != :EMPTY
            token[0] = :stag
          end
        end

        # Attribute names are lowercased for non-XML input.
        # TODO: downcase instead when parsing attributes?
        if !opts[:xml] and token[2].is_a?(Hash)
          token[2] = token[2].inject({}) { |hsh,(k,v)| hsh[k.downcase] = v; hsh }
        end

        case token[0]
        when :stag
          # NOTE(review): u() is not defined in this file - presumably provided
          # by the character-encodings library checked for above. Non-String
          # token fields become nil here because the block has no else value.
          case opts[:encoding] when 'utf-8'
            token.map! { |str| u(str) if str.is_a? String }
          end

          # From here on token[0] holds the normalized tag name instead of :stag.
          stagname = token[0] = token[1] = token[1].send(conv)
          if ElementContent[stagname] == :EMPTY and !opts[:xml]
            # Elements declared :EMPTY never go on the stack; they are stored
            # directly as a child of the current element.
            token[0] = :emptytag
            stack.last[2] << token
          else
            unless opts[:xml]
              if opts[:fixup_tags]
                # obey the tag rules set up by the current element
                if ElementContent.has_key? stagname
                  # Walk outwards from the innermost open element for as long
                  # as each one lists this tag as uncontainable; trans ends up
                  # as the stack index of the outermost such element.
                  trans = nil
                  (stack.length-1).downto(0) do |i|
                    untags = stack[i][5]
                    break unless untags.include? stagname
                    # puts "** ILLEGAL #{stagname} IN #{stack[i][0]}"
                    trans = i
                  end
                  # nil.to_i is 0, so nothing happens when no ancestor objected
                  # or when the offending run reaches stack index 0 or 1.
                  # Otherwise the entries from trans upwards are taken off the
                  # stack and stored as children of the element below them.
                  if trans.to_i > 1
                    eles = stack.slice!(trans..-1)
                    stack.last[2] += eles
                    # puts "** TRANSPLANTED #{stagname} TO #{stack.last[0]}"
                  end
                elsif opts[:xhtml_strict]
                  # Unknown element: becomes a div whose class is the original
                  # tag name. Any attributes it had are replaced.
                  token[2] = {'class' => stagname}
                  stagname = token[0] = "div"
                end
              end

              # setup tag rules for inside this element
              if ElementContent[stagname] == :CDATA
                uncontainable_tags = :CDATA
              elsif opts[:fixup_tags]
                possible_tags = ElementContent[stagname]
                # Exclusions and inclusions are inherited from the enclosing
                # element and extended with this element's own.
                excluded_tags, included_tags = stack.last[3..4]
                if possible_tags
                  excluded_tags = excluded_tags | (ElementExclusions[stagname] || [])
                  included_tags = included_tags | (ElementInclusions[stagname] || [])
                  containable_tags = (possible_tags | included_tags) - excluded_tags
                  uncontainable_tags = ElementContent.keys - containable_tags
                else
                  # If the tagname is unknown, it is assumed that any element
                  # except excluded can be contained.
                  uncontainable_tags = excluded_tags
                end
              end
            end
            # NOTE(review): attribute names were already lowercased near the
            # top of the loop, so this second pass looks redundant.
            unless opts[:xml]
              case token[2] when Hash
                token[2] = token[2].inject({}) { |hsh,(k,v)| hsh[k.downcase] = v; hsh }
              end
            end
            # The three rule variables are nil when no branch above assigned
            # them (XML mode, or :fixup_tags off for a non-CDATA element).
            stack << [stagname, token, [], excluded_tags, included_tags, uncontainable_tags]
          end
        when :etag
          etagname = token[0] = token[1].send(conv)
          # Mirrors the start-tag handling: unknown elements were opened as
          # div in strict mode, so their end tags must close a div.
          if opts[:xhtml_strict] and not ElementContent.has_key? etagname
            etagname = token[0] = "div"
          end
          # Look for the innermost open element with this name.
          matched_elem = nil
          (stack.length-1).downto(0) do |i|
            stagname, = stack[i]
            if stagname == etagname
              matched_elem = stack[i]
              # Append the end-tag token's fields to the start-tag token;
              # build_node later reads the end tag's raw text from that tail.
              stack[i][1] += token
              # Elements opened after the match were never closed; they are
              # taken off the stack and stored as children of the match.
              eles = stack.slice!((i+1)..-1)
              stack.last[2] += eles if eles
              break
            end
          end
          unless matched_elem
            # No open element matches: keep the end tag as a bogus node.
            stack.last[2] << [:bogus_etag, token.first, token.last]
          else
            # Close the matched element by moving it into its parent's children.
            ele = stack.pop
            stack.last[2] << ele
          end
        when :text
          # Adjacent text tokens are merged into one. Note that += builds a new
          # String, so the previous text object itself is not modified.
          l = stack.last[2].last
          if l and l[0] == :text
            l[1] += token[1]
          else
            stack.last[2] << token
          end
        else
          # Every other token type is stored as a child as-is.
          stack.last[2] << token
        end
      end

      # End of input: close whatever is still open, innermost first.
      while 1 < stack.length
        ele = stack.pop
        stack.last[2] << ele
      end

      structure_list = stack[0][2]
      structure_list.map {|s| build_node(s, opts) }
    elsif blk
      Hpricot.build(&blk).children
    end
  end

  # Converts one entry of the structure list assembled by Hpricot.make into a
  # node object, recursing into the children of elements.
  #
  # +structure+ is an Array whose first item selects the node type: a String
  # (the tag name) marks an element that was pushed on the parse stack, and
  # the symbols handled below mark the remaining token types.
  # +opts+ is the parser options hash; only <tt>:xhtml_strict</tt> is read here.
  #
  # Raises RuntimeError for a structure whose type is not recognized.
  def Hpricot.build_node(structure, opts = {})
    case structure[0]
    when String
      # structure[1] is the start-tag token, followed by the fields of the
      # end-tag token when a matching end tag was seen; eraw is therefore nil
      # for an element that was never explicitly closed.
      tagname, _, attrs, sraw, _, _, _, eraw = structure[1]
      children = structure[2]
      etag = eraw && ETag.parse(tagname, eraw)
      stag = STag.parse(tagname, attrs, sraw, true)
      if !children.empty? || etag
        Elem.new(stag,
                  children.map {|c| build_node(c, opts) },
                  etag)
      else
        Elem.new(stag)
      end
    when :text
      Text.parse_pcdata(structure[1])
    when :emptytag
      Elem.new(STag.parse(structure[1], structure[2], structure[3], false))
    when :bogus_etag
      BogusETag.parse(structure[1], structure[2])
    when :xmldecl
      XMLDecl.parse(structure[2], structure[3])
    when :doctype
      if opts[:xhtml_strict]
        # DocType.parse allows for a missing attribute hash, so the slot may
        # be nil here; create the hash before forcing the strict identifiers
        # instead of failing with NoMethodError on nil.
        structure[2] ||= {}
        structure[2]['system_id'] = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
        structure[2]['public_id'] = "-//W3C//DTD XHTML 1.0 Strict//EN"
      end
      DocType.parse(structure[1], structure[2], structure[3])
    when :procins
      ProcIns.parse(structure[1])
    when :comment
      Comment.parse(structure[1])
    when :cdata_content
      Text.parse_cdata_content(structure[1])
    when :cdata
      Text.parse_cdata_section(structure[1])
    else
      raise "[bug] unknown structure: #{structure.inspect}"
    end
  end

  # Creates the STag for +qname+ with attributes +attrs+ and records
  # +raw_string+ on it. +is_stag+ is accepted but not used by this method.
  def STag.parse(qname, attrs, raw_string, is_stag)
    stag = STag.new(qname, attrs)
    stag.raw_string = raw_string
    stag
  end

  # Creates an end-tag object for +qname+ (an instance of the receiving
  # class) and records +raw_string+ on it.
  def ETag.parse(qname, raw_string)
    etag = new(qname)
    etag.raw_string = raw_string
    etag
  end

  # Creates the node for an end tag that matched no open element: an instance
  # of the receiving class for +qname+ with +raw_string+ recorded on it.
  def BogusETag.parse(qname, raw_string)
    bogus = new(qname)
    bogus.raw_string = raw_string
    bogus
  end

  # Wraps +raw_string+ in a Text node.
  def Text.parse_pcdata(raw_string)
    Text.new(raw_string)
  end

  # Wraps +raw_string+ in a CData node.
  def Text.parse_cdata_content(raw_string)
    CData.new(raw_string)
  end

  # Wraps the +content+ of a CDATA section in a CData node.
  def Text.parse_cdata_section(content)
    CData.new(content)
  end

  # Creates an XMLDecl from the declaration's attributes.
  #
  # +attrs+ may be nil, in which case every field is nil. The 'standalone'
  # attribute maps 'yes' to true and 'no' to false; any other value (or its
  # absence) gives nil. +raw_string+ is recorded on the result.
  def XMLDecl.parse(attrs, raw_string)
    attrs ||= {}
    standalone =
      case attrs['standalone']
      when 'yes' then true
      when 'no' then false
      else nil
      end

    decl = XMLDecl.new(attrs['version'], attrs['encoding'], standalone)
    decl.raw_string = raw_string
    decl
  end

  # Creates a DocType for the lowercased +root_element_name+.
  #
  # The public and system identifiers are read from the 'public_id' and
  # 'system_id' entries of +attrs+; both are nil when +attrs+ is missing.
  # +raw_string+ is recorded on the result.
  def DocType.parse(root_element_name, attrs, raw_string)
    public_identifier = attrs ? attrs['public_id'] : nil
    system_identifier = attrs ? attrs['system_id'] : nil

    doctype = DocType.new(root_element_name.downcase, public_identifier, system_identifier)
    doctype.raw_string = raw_string
    doctype
  end

  # Creates a ProcIns from the raw text of a processing instruction.
  #
  # The target is the run of non-whitespace characters right after the
  # leading "<?"; the content is everything after the whitespace that follows
  # it, up to the end of +raw_string+. Both are nil when +raw_string+ does not
  # have that shape.
  def ProcIns.parse(raw_string)
    found = raw_string.match(/\A<\?(\S+)\s+(.+)/m)
    target = found ? found[1] : nil
    content = found ? found[2] : nil
    ProcIns.new(target, content)
  end

  # Wraps +content+ in a Comment node.
  def Comment.parse(content)
    Comment.new(content)
  end

  # Regular-expression fragments for names. None of them is anchored, so
  # callers must add \A / \z themselves when a whole string has to match.
  # NOTE(review): the constant names mirror the XML grammar productions of the
  # same names; these look like ASCII-only approximations - confirm before
  # relying on them for non-ASCII names.
  module Pat
    # One name character: hyphen, ASCII letter, digit, dot, underscore or colon.
    NameChar = /[-A-Za-z0-9._:]/
    # A name: an ASCII letter, underscore or colon followed by any number of
    # name characters.
    Name = /[A-Za-z_:]#{NameChar}*/
    # A name token: one or more name characters.
    Nmtoken = /#{NameChar}+/
  end

  # :startdoc:
end