class MaRuKu::In::Markdown::SpanLevelParser::HTMLHelper
This class helps me read and sanitize HTML blocks
def close_script_style
Finish script or style tag content, wrapping it in CDATA if necessary, and append it to the buffer that start_script_style saved in @before_already.
# Finish the content collected for the innermost script/style element and
# merge it back into the buffer that was set aside in @before_already.
#
# The content is wrapped in a commented-out CDATA section (with a newline
# guaranteed on each side of it) when it contains "<", "&", "]]>" or "--".
# Afterwards @already refers to the merged buffer again.
def close_script_style
  tag = @tag_stack.last
  content = @already

  # See http://www.w3.org/TR/xhtml1/#C_4 for character sequences not allowed
  # within an element body.
  if content =~ /<|&|\]\]>|--/
    lead = content.start_with?("\n") ? "" : "\n"
    trail = content.end_with?("\n") ? "" : "\n"
    content = script_style_cdata_start(tag) + lead + content + trail + script_style_cdata_end(tag)
  end

  @before_already << content
  @already = @before_already
end
def debug_state(note)
# Send a one-line trace to my_debug made of the current @state, the given
# +note+ and the inspected text of the last match (@m).
def debug_state(note)
  matched_text = @m.to_s.inspect
  my_debug "#{@state}: #{note}: #{matched_text}"
end
def eat_this(line)
# Consume +line+, advancing the reader's state machine.
#
# @rest becomes +line+ followed by the previous @rest, and is then consumed
# piece by piece. Recognised text is appended to @already (directly or via
# handle_tag). The loop stops when @rest is empty, or as soon as
# is_finished? is true after at least one comment start or tag was read
# during this call.
#
# Calls error (which raises) when the input cannot be matched while in the
# :inside_element state, and raises on an unknown state.
def eat_this(line)
  @rest = line + @rest
  # Counts comment starts and tags seen in this call; used by the exit
  # condition at the bottom of the loop.
  things_read = 0
  until @rest.empty?
    case self.state
    when :inside_comment
      if @m = CommentEnd.match(@rest)
        debug_state 'Comment End'
        # Workaround for https://bugs.ruby-lang.org/issues/9277 and another bug in 1.9.2 where even a
        # single dash in a comment will cause REXML to error.
        # The gsub puts a space after every dash that is followed by another
        # dash or by the end of the string.
        @already << @m.pre_match.gsub(/-(?![^\-])/, '- ') << @m.to_s
        @rest = @m.post_match
        self.state = :inside_element
      else
        # No comment terminator yet: take everything and stay in the comment.
        @already << @rest.gsub(/-(?![^\-])/, '- ') # Workaround for https://bugs.ruby-lang.org/issues/9277
        @rest = ""
        self.state = :inside_comment
      end
    when :inside_element
      # The patterns are tried in this fixed order; the first match wins.
      if @m = CommentStart.match(@rest)
        debug_state 'Comment'
        things_read += 1
        @already << @m.pre_match << @m.to_s
        @rest = @m.post_match
        self.state = :inside_comment
      elsif @m = Tag.match(@rest)
        debug_state 'Tag'
        things_read += 1
        self.state = :inside_element
        handle_tag
      elsif @m = CData.match(@rest)
        debug_state 'CDATA'
        @already << @m.pre_match
        # A CDATA section inside script/style ends the captured content; the
        # capture is restarted when the CDATA section closes (see :inside_cdata).
        close_script_style if script_style?
        @already << @m.to_s
        @rest = @m.post_match
        self.state = :inside_cdata
      elsif @m = PartialTag.match(@rest)
        debug_state 'PartialTag'
        @already << @m.pre_match
        @rest = @m.post_match
        # Hold the matched text back in @partial_tag; it is completed in the
        # :inside_tag state.
        @partial_tag = @m.to_s
        self.state = :inside_tag
      elsif @m = EverythingElse.match(@rest)
        debug_state 'EverythingElse'
        @already << @m.pre_match << @m.to_s
        @rest = @m.post_match
        self.state = :inside_element
      else
        error "Malformed HTML: not complete: #{@rest.inspect}"
      end
    when :inside_tag
      if @m = /^[^>]*>/.match(@rest)
        # The closing ">" has arrived: put the completed tag text back at the
        # front of @rest and try to handle it as a tag.
        @partial_tag << @m.to_s
        @rest = @partial_tag + @m.post_match
        @partial_tag = nil
        self.state = :inside_element
        if @m = Tag.match(@rest)
          things_read += 1
          handle_tag
        end
      else
        # Still no ">": keep accumulating the partial tag.
        @partial_tag << @rest
        @rest = ""
        self.state = :inside_tag
      end
    when :inside_cdata
      if @m = CDataEnd.match(@rest)
        self.state = :inside_element
        @already << @m.pre_match << @m.to_s
        @rest = @m.post_match
        start_script_style if script_style?
      else
        @already << @rest
        @rest = ""
        self.state = :inside_cdata
      end
    else
      raise "Bug bug: state = #{self.state.inspect}"
    end
    break if is_finished? && things_read > 0
  end
end
def error(s)
# Raise a RuntimeError for a malformed-input condition.
#
# The message is "Error: <s> \n" followed by the reader state as rendered by
# #inspect; the backtrace is set to the caller's backtrace so the failure
# points at the code that detected the problem.
def error(s)
  # The exception class must be given explicitly. `raise "message", caller`
  # treats the String as the exception object and fails with
  # "TypeError: exception class/object expected", losing the real message.
  raise RuntimeError, "Error: #{s} \n" + inspect, caller
end
def handle_tag
# Process the tag currently held in @m (the MatchData of the last match).
#
# Text before the tag is appended to @already and the text after it becomes
# the new @rest. The match groups are read as: [1] set for a closing tag,
# [2] the tag name, [3] the attribute text (a trailing "/" marks the tag as
# self-closing).
#
# * Tags listed in TO_SANITIZE are always re-emitted in self-closing form.
# * A closing tag must match the innermost open tag, otherwise error is
#   called (which raises). Script/style content is finished before the
#   closing tag is emitted.
# * Any other tag is emitted verbatim and, unless self-closing, pushed on
#   @tag_stack.
def handle_tag
  @already << @m.pre_match
  @rest = @m.post_match

  is_closing = !!@m[1]
  tag = @m[2]
  @first_tag ||= tag
  attributes = @m[3].to_s

  is_single = attributes.end_with?('/')
  attributes = attributes[0...-1] if is_single

  if TO_SANITIZE.include?(tag)
    attributes = attributes.strip
    if attributes.empty?
      @already << '<%s />' % [tag]
    else
      @already << '<%s %s />' % [tag, attributes]
    end
  elsif is_closing
    if @tag_stack.empty?
      error "Malformed: closing tag #{tag.inspect} in empty list"
    elsif @tag_stack.last != tag
      error "Malformed: tag <#{tag}> closes <#{@tag_stack.last}>"
    end

    close_script_style if script_style?

    @already << @m.to_s
    @tag_stack.pop
  else
    @already << @m.to_s

    unless is_single
      @tag_stack.push(tag)
      # Only begin capturing script/style content when a tag was actually
      # pushed. A self-closing tag seen while script/style is the innermost
      # open element must not restart the capture: doing so would overwrite
      # @before_already and drop everything buffered before the element.
      start_script_style if script_style?
    end
  end
end
def initialize
# Set up an empty reader: nothing consumed, nothing pending, no open tags,
# no current match, and the state machine in :inside_element.
def initialize
  @already = ""
  @rest = ""
  @tag_stack = []
  @m = nil
  self.state = :inside_element
end
def inspect
# Multi-line dump of the reader for error messages and debugging: current
# state, last match, open-tag stack, then the consumed buffer (@already) and
# the pending input (@rest), each line of which is prefixed with "|".
def inspect
  consumed = @already.gsub(/^/, '|')
  pending = @rest.gsub(/^/, '|')

  [
    "HTML READER\n state=#{state} match=#{@m.to_s.inspect}",
    "Tag stack = #{@tag_stack.inspect} ",
    "Before:",
    consumed,
    "After:",
    pending,
    ""
  ].join("\n")
end
def is_finished?
# True when the state machine is in :inside_element and no tag is left open
# on @tag_stack.
def is_finished?
  return false unless state == :inside_element

  @tag_stack.empty?
end
def my_debug(s)
# Debug hook; currently a no-op that returns nil.
# Re-enable the line below to print the reader state together with +s+.
def my_debug(s)
  # puts "---" * 10 + "\n" + inspect + "\t>>>\t" + s
end
def script_style?
# True when the innermost open tag (top of @tag_stack) is "script" or "style".
def script_style?
  case @tag_stack.last
  when 'script', 'style' then true
  else false
  end
end
def script_style_cdata_end(tag)
# Closing CDATA marker, hidden inside a comment of the embedded language:
# a "//" line comment for 'script', a "/* */" block comment for any other tag.
def script_style_cdata_end(tag)
  if tag == 'script'
    "//]]>"
  else
    "/*]]>*/"
  end
end
def script_style_cdata_start(tag)
# Opening CDATA marker, hidden inside a comment of the embedded language:
# a "//" line comment for 'script', a "/* */" block comment for any other tag.
#
# A new String is returned on every call; close_script_style relies on being
# free to build on the result.
def script_style_cdata_start(tag)
  if tag == 'script'
    "//<![CDATA["
  else
    "/*<![CDATA[*/"
  end
end
def start_script_style
Save our @already buffer elsewhere (in @before_already), and switch to using @already for the contents of the script or style tag.
# Set the current output buffer aside in @before_already and continue with a
# fresh, empty @already that will collect the script/style content.
def start_script_style
  outer_buffer = @already
  @before_already, @already = outer_buffer, ""
end
def stuff_you_read
# Returns the buffer of everything consumed so far (the @already String
# itself, not a copy).
def stuff_you_read
  @already
end