# -*- coding: utf-8 -*-
#
#--
# Copyright (C) 2009 Thomas Leitner <t_leitner@gmx.at>
#
# This file is part of kramdown.
#
# kramdown is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#++
#

require 'rexml/parsers/baseparser'

module Kramdown

  module Parser

    class Kramdown

      #:stopdoc:
      # The following regexps are based on the ones used by REXML, with some slight modifications.
      #:startdoc:
      HTML_COMMENT_RE = /<!--(.*?)-->/m
      HTML_INSTRUCTION_RE = /<\?(.*?)\?>/m
      # Captures: 1 = attribute name, 2 = quote character, 3 = attribute value.
      HTML_ATTRIBUTE_RE = /\s*(#{REXML::Parsers::BaseParser::UNAME_STR})\s*=\s*(["'])(.*?)\2/m
      # Captures: 1 = tag name, 2 = raw attribute string, 3 = quote character of the last
      # attribute, 4 = the slash of a self-closing tag (nil if absent).
      HTML_TAG_RE = /<((?>#{REXML::Parsers::BaseParser::UNAME_STR}))\s*((?>\s+#{REXML::Parsers::BaseParser::UNAME_STR}\s*=\s*(["']).*?\3)*)\s*(\/)?>/m
      # Captures: 1 = tag name.
      HTML_TAG_CLOSE_RE = /<\/(#{REXML::Parsers::BaseParser::NAME_STR})\s*>/m

      # Default content parse mode per HTML element name.
      HTML_PARSE_AS_BLOCK = %w{applet button blockquote colgroup dd div dl fieldset form iframe li
                               map noscript object ol table tbody td th thead tfoot tr ul}
      HTML_PARSE_AS_SPAN  = %w{a abbr acronym address b bdo big cite caption code del dfn dt em
                               h1 h2 h3 h4 h5 h6 i ins kbd label legend optgroup p pre q rb rbc
                               rp rt rtc ruby samp select small span strong sub sup tt var}
      HTML_PARSE_AS_RAW   = %w{script math option textarea}

      # Maps an element name to :block, :span or :raw. Names not listed above default to :raw
      # (the default block also stores that answer in the hash).
      HTML_PARSE_AS = Hash.new {|h, k| h[k] = :raw}
      HTML_PARSE_AS_BLOCK.each {|i| HTML_PARSE_AS[i] = :block}
      HTML_PARSE_AS_SPAN.each {|i| HTML_PARSE_AS[i] = :span}
      HTML_PARSE_AS_RAW.each {|i| HTML_PARSE_AS[i] = :raw}

      #:stopdoc:
      # Some HTML elements, like script, belong to both categories (i.e. they are valid in block
      # and in span HTML) and therefore appear in neither of the following two lists.
      #:startdoc:
      HTML_SPAN_ELEMENTS = %w{a abbr acronym b big bdo br button cite code del dfn em i img input
                              ins kbd label option q rb rbc rp rt rtc ruby samp select small span
                              strong sub sup textarea tt var}
      HTML_BLOCK_ELEMENTS = %w{address applet button blockquote caption col colgroup dd div dl dt fieldset
                               form h1 h2 h3 h4 h5 h6 hr iframe legend li map ol optgroup p pre table tbody
                               td th thead tfoot tr ul}
      HTML_ELEMENTS_WITHOUT_BODY = %w{area br col hr img input}

      HTML_BLOCK_START = /^#{OPT_SPACE}<(#{REXML::Parsers::BaseParser::UNAME_STR}|\?|!--|\/)/
      HTML_RAW_START = /(?=<(#{REXML::Parsers::BaseParser::UNAME_STR}|\/))/

      # Start and end tag at the beginning of a line, optionally indented by OPT_SPACE. Built once
      # here so that parse_block_html does not recompile an interpolated regexp on every call.
      HTML_BLOCK_TAG_RE = /^#{OPT_SPACE}#{HTML_TAG_RE}/
      HTML_BLOCK_TAG_CLOSE_RE = /^#{OPT_SPACE}#{HTML_TAG_CLOSE_RE}/

      # Parse the HTML at the current position as block level HTML.
      #
      # Returns +true+ if something was consumed and +false+ if nothing at the current position
      # could be handled as block HTML. When the end tag of the HTML element currently being
      # parsed (<tt>@tree</tt>) is found, <tt>:stop_block_parsing</tt> is thrown with the value
      # <tt>:found</tt> instead of returning.
      def parse_block_html
        if (result = @src.scan(HTML_COMMENT_RE))
          @tree.children << Element.new(:xml_comment, result, :type => :block)
          @src.scan(/[ \t]*\n/)
          true
        elsif (result = @src.scan(HTML_INSTRUCTION_RE))
          @tree.children << Element.new(:xml_pi, result, :type => :block)
          @src.scan(/[ \t]*\n/)
          true
        elsif @src.check(HTML_BLOCK_TAG_RE) && !HTML_SPAN_ELEMENTS.include?(@src[1])
          # Only checked (not scanned) above so that span level tags are left untouched; now
          # that the tag is accepted, consume it manually.
          @src.pos += @src.matched_size
          handle_html_start_tag
          true
        elsif @src.check(HTML_BLOCK_TAG_CLOSE_RE) && !HTML_SPAN_ELEMENTS.include?(@src[1])
          @src.pos += @src.matched_size
          name = @src[1]

          if @tree.type == :html_element && @tree.value == name
            throw :stop_block_parsing, :found
          else
            warning("Found invalidly used HTML closing tag for '#{name}' - ignoring it")
            true
          end
        else
          false
        end
      end
      define_parser(:block_html, HTML_BLOCK_START)

      # Return the HTML parse type defined by the string +val+ (the value of a +markdown+
      # attribute): <tt>:raw</tt> for "0", <tt>:default</tt> for "1", <tt>:span</tt> for "span"
      # and <tt>:block</tt> for "block".
      #
      # Returns +nil+ if +val+ is +nil+. For any other value a warning is issued and +nil+ is
      # returned, too. Callers treat +nil+ as "keep the mode already chosen".
      def get_parse_type(val)
        case val
        when "0" then :raw
        when "1" then :default
        when "span" then :span
        when "block" then :block
        when nil then nil
        else
          warning("Invalid markdown attribute val '#{val}', using default")
          nil
        end
      end

      # Process the HTML start tag that has already been scanned/checked.
      #
      # Relies on the match data of the last HTML_TAG_RE match on <tt>@src</tt>. A new
      # <tt>:html_element</tt> is appended to <tt>@tree</tt> and, unless the tag is self-closed or
      # cannot have a body, its content is parsed as block, span or raw content.
      def handle_html_start_tag
        curpos = @src.pos
        name = @src[1]
        closed = !@src[4].nil?
        attrs = {}
        @src[2].scan(HTML_ATTRIBUTE_RE).each {|attr, _sep, value| attrs[attr] = value}

        parent_is_raw = (@tree.type == :html_element && @tree.options[:parse_type] == :raw)

        # Inside a raw parent everything stays raw; otherwise the element's default mode applies
        # only if block HTML parsing is enabled in the document options.
        parse_type = if !parent_is_raw && @doc.options[:parse_block_html]
                       HTML_PARSE_AS[name]
                     else
                       :raw
                     end
        # An explicit markdown attribute overrides the mode chosen above (and is removed from the
        # attributes of the resulting element).
        if (val = get_parse_type(attrs.delete('markdown')))
          parse_type = (val == :default ? HTML_PARSE_AS[name] : val)
        end

        @src.scan(/[ \t]*\n/) if parse_type == :block

        el = Element.new(:html_element, name, :attr => attrs, :type => :block, :parse_type => parse_type)
        el.options[:outer_element] = true if @tree.type != :html_element
        el.options[:parent_is_raw] = true if parent_is_raw
        @tree.children << el

        if !closed && HTML_ELEMENTS_WITHOUT_BODY.include?(el.value)
          warning("The HTML tag '#{el.value}' cannot have any content - auto-closing it")
        elsif !closed
          if parse_type == :block
            end_tag_found = parse_blocks(el)
            if !end_tag_found
              warning("Found no end tag for '#{el.value}' - auto-closing it")
            end
          elsif parse_type == :span
            # The tag name is escaped because it may contain regexp meta characters such as '.'.
            if @src.scan_until(/(?=<\/#{Regexp.escape(el.value)}\s*>)/m)
              add_text(extract_string(curpos...@src.pos), el)
              @src.scan(HTML_TAG_CLOSE_RE)
            else
              add_text(@src.scan(/.*/m), el)
              warning("Found no end tag for '#{el.value}' - auto-closing it")
            end
          else
            parse_raw_html(el)
          end
          # Swallow the rest of the line after the end tag, except inside a raw parent.
          # @tree is deliberately re-read here, after the content has been parsed.
          @src.scan(/[ \t]*\n/) unless (@tree.type == :html_element && @tree.options[:parse_type] == :raw)
        end
      end

      # Parse raw HTML until the matching end tag for +el+ is found or until the end of the
      # document.
      #
      # While parsing, +el+ is the current tree; the previous tree is restored before returning.
      # Returns the source position at which the last tag candidate was found, or +nil+ if there
      # was none.
      def parse_raw_html(el)
        @stack.push(@tree)
        @tree = el

        done = false
        endpos = nil
        until @src.eos? || done
          if (result = @src.scan_until(HTML_RAW_START))
            endpos = @src.pos
            add_text(result, @tree, :html_text)
            if @src.scan(HTML_TAG_RE)
              handle_html_start_tag
            elsif @src.scan(HTML_TAG_CLOSE_RE)
              if @tree.value == @src[1]
                done = true
              else
                warning("Found invalidly used HTML closing tag for '#{@src[1]}' - ignoring it")
              end
            else
              # Looked like a tag but is not a well-formed one: consume the '<' as text.
              add_text(@src.scan(/./), @tree, :html_text)
            end
          else
            result = @src.scan(/.*/m)
            add_text(result, @tree, :html_text)
            warning("Found no end tag for '#{@tree.value}' - auto-closing it")
            done = true
          end
        end

        @tree = @stack.pop
        endpos
      end

      HTML_SPAN_START = /<(#{REXML::Parsers::BaseParser::UNAME_STR}|\?|!--|\/)/

      # Parse the HTML at the current position as span level HTML.
      def parse_span_html
        if (result = @src.scan(HTML_COMMENT_RE))
          @tree.children << Element.new(:xml_comment, result, :type => :span)
        elsif (result = @src.scan(HTML_INSTRUCTION_RE))
          @tree.children << Element.new(:xml_pi, result, :type => :span)
        elsif @src.scan(HTML_TAG_CLOSE_RE)
          warning("Found invalidly used HTML closing tag for '#{@src[1]}' - ignoring it")
        elsif @src.scan(HTML_TAG_RE)
          tag_name = @src[1]
          closed = @src[4]

          # NOTE(review): the tag has already been consumed by the scan above and nothing is added
          # to the tree here, so the text of a block level start tag is dropped - confirm that
          # this is intended.
          return if HTML_BLOCK_ELEMENTS.include?(tag_name)

          attrs = {}
          @src[2].scan(HTML_ATTRIBUTE_RE).each {|attr, _sep, value| attrs[attr] = value.gsub(/\n+/, ' ')}

          raw_content = HTML_PARSE_AS_RAW.include?(tag_name)
          do_parsing = (raw_content ? false : @doc.options[:parse_span_html])
          # An explicit markdown attribute overrides the document option.
          case get_parse_type(attrs.delete('markdown'))
          when :block
            warning("Cannot use block level parsing in span level HTML tag - using default mode")
          when :span
            do_parsing = true
          when :default
            # markdown="1": parse the content unless the element only allows raw content.
            do_parsing = !raw_content
          when :raw
            do_parsing = false
          end

          el = Element.new(:html_element, tag_name, :attr => attrs, :type => :span)
          stop_re = /<\/#{Regexp.escape(tag_name)}\s*>/
          if closed
            @tree.children << el
          elsif HTML_ELEMENTS_WITHOUT_BODY.include?(el.value)
            warning("The HTML tag '#{el.value}' cannot have any content - auto-closing it")
            @tree.children << el
          else
            # When the content is not parsed, the span_html parser is still enabled so that
            # nested tags are recognized, and the text is stored as :html_text.
            if parse_spans(el, stop_re, (do_parsing ? nil : [:span_html]), (do_parsing ? :text : :html_text))
              @src.scan(stop_re)
            else
              warning("Found no end tag for '#{el.value}' - auto-closing it")
              # The remaining text belongs to the auto-closed element, not to its parent.
              add_text(@src.scan(/.*/m), el)
            end
            @tree.children << el
          end
        else
          add_text(@src.scan(/./))
        end
      end
      define_parser(:span_html, HTML_SPAN_START, '<')

    end

  end

end