lib/simple_po_parser/parser.rb



# encoding: utf-8

module SimplePoParser
  # Fast parser built directly on Ruby's StringScanner (strscan).
  #
  # Important notes about StringScanner#scan:
  # * scan returns nil if there is no match. A pattern using the * (zero or more) quantifier
  #   never fails: when nothing matches, scan returns an empty string, because the empty
  #   string qualifies as a match of the regex (zero repetitions). We make use of this "trick".
  # * the start-of-line anchor ^ is unnecessary, as scan only matches at the current scan
  #   position.
  # * in Ruby regular expressions "." does not match a newline by default, therefore a pattern
  #   such as /.*/ only matches until the next newline is hit (unless multi-line mode is
  #   explicitly enabled).
  class Parser
    require_relative 'error'
    require 'strscan'

    # Parses a single message of the PO format.
    #
    # Surrounding whitespace is stripped and Windows line endings (CRLF) are
    # normalized to "\n" before scanning, because the scanning methods only
    # treat "\n" as a line terminator.
    #
    # @param message [String] a single PO message
    # @return [Hash] parsed PO message information
    # @raise [ParserError] if the message cannot be parsed. The error message
    #   contains the original error text and the partial result collected so far;
    #   the backtrace is reduced to the frames inside lib/simple_po_parser.
    def parse(message)
      @result = {}
      text = message.strip
      # only pay for the copy when the message really contains CRLF line endings
      text = text.gsub("\r\n", "\n") if text.include?("\r\n")
      @scanner = StringScanner.new(text)
      lines
      @result
    rescue ParserError => pe
      error_msg = "SimplePoParser::ParserError: #{pe.message}"
      error_msg += "\nParsing result before error: '#{@result}'"
      error_msg += "\nSimplePoParser filtered backtrace: SimplePoParser::ParserError"
      backtrace = pe.backtrace.select { |frame| frame =~ /lib\/simple_po_parser/ }.join("\n\tfrom ")
      raise ParserError, error_msg, backtrace
    end

    private

    #########################################
    ###            branching              ###
    #########################################

    # Dispatches on the next line of a PO message, which is either a comment
    # (starts with '#') or the beginning of the message itself.
    #
    # Message parsing always starts with the msgctxt check, because the content
    # is expected in msgctxt -> msgid -> msgid_plural -> msgstr order.
    #
    # @raise [ParserError] when a PoSyntaxError surfaces from the dispatched
    #   method; converting it to ParserError breaks the rescue chain of the
    #   recursive callers
    def lines
      @scanner.scan(/#/) ? comment : msgctxt
    rescue PoSyntaxError => syntax_error
      raise ParserError, "Syntax error in lines\n#{syntax_error.message}", syntax_error.backtrace
    end

    # Parses one comment line. Called after the leading '#' has been consumed.
    #
    # The character following the '#' selects the comment type:
    # * ' '  translator comment
    # * '.'  extracted comment
    # * ':'  reference
    # * ','  flag
    # * '|'  previous msgctxt / msgid / msgid_plural (see #previous_comments)
    # * "\n" empty translator comment
    # * '~'  obsolete line; all remaining lines are then parsed by #obsoletes
    #
    # After the comment is stored, parsing continues with #lines
    # (or #obsoletes for obsolete entries).
    #
    # @raise [PoSyntaxError] on an unknown comment type, or when an obsolete
    #   line follows previous-comment entries that were not marked obsolete
    def comment
      # The caller consumed the one-byte '#'. Remember where the line started so the
      # error excerpt below is right even when the next character is multibyte or
      # missing (getch consumes a whole character, or nothing at end of input).
      line_start = @scanner.pos - 1
      case @scanner.getch
      when ' '
        skip_whitespace
        add_result(:translator_comment, comment_text)
        lines
      when '.'
        skip_whitespace
        add_result(:extracted_comment, comment_text)
        lines
      when ':'
        skip_whitespace
        add_result(:reference, comment_text)
        lines
      when ','
        skip_whitespace
        add_result(:flag, comment_text)
        lines
      when '|'
        skip_whitespace
        previous_comments
        lines
      when "\n"
        add_result(:translator_comment, "") # empty comment line
        lines
      when '~'
        if @result[:previous_msgctxt] || @result[:previous_msgid] || @result[:previous_msgid_plural]
          raise PoSyntaxError, "Previous comment entries need to be marked obsolete too in obsolete message entries. But already got: #{@result}"
        end
        skip_whitespace
        add_result(:obsolete, comment_text)
        obsoletes
      else
        @scanner.pos = line_start
        raise PoSyntaxError, "Unknown comment type #{@scanner.peek(10).inspect}"
      end
    rescue PoSyntaxError => pe
      raise PoSyntaxError, "Syntax error in comment\n" + pe.message, pe.backtrace
    end

    # Handles the optional msgctxt entry and then hands over to #msgid.
    #
    # When the line starts with 'msgctxt' its text is stored under :msgctxt,
    # including continuation lines if the next line starts with a double quote.
    # In every case parsing continues with the msgid entry.
    #
    # @raise [PoSyntaxError] re-raised with a "Syntax error in msgctxt" prefix
    def msgctxt
      if @scanner.scan(/msgctxt/)
        skip_whitespace
        add_result(:msgctxt, message_line)
        next_char = @scanner.peek(1)
        message_multiline(:msgctxt) if next_char == '"'
      end
      msgid
    rescue PoSyntaxError => syntax_error
      raise PoSyntaxError, "Syntax error in msgctxt\n#{syntax_error.message}", syntax_error.backtrace
    end

    # Parses the required msgid entry and continues with the translation.
    #
    # The msgid text (including continuation lines) is stored under :msgid.
    # Afterwards the optional msgid_plural is checked: if it is present parsing
    # continues with #msgstr_plural, otherwise with #msgstr.
    #
    # @raise [PoSyntaxError] if the line does not start with 'msgid', or when
    #   one of the called methods reports a syntax error (re-raised with a
    #   "Syntax error in msgid" prefix)
    def msgid
      unless @scanner.scan(/msgid/)
        raise PoSyntaxError, "Message without msgid is not allowed. " \
          "Line started unexpectedly with #{@scanner.peek(10).inspect}."
      end
      skip_whitespace
      add_result(:msgid, message_line)
      message_multiline(:msgid) if @scanner.peek(1) == '"'
      msgid_plural ? msgstr_plural : msgstr
    rescue PoSyntaxError => pe
      raise PoSyntaxError, "Syntax error in msgid\n" + pe.message, pe.backtrace
    end

    # Parses the optional msgid_plural entry.
    #
    # When the line starts with 'msgid_plural' its text (including continuation
    # lines) is stored under :msgid_plural.
    #
    # @return [Boolean] true if msgid_plural is present, false otherwise
    # @raise [PoSyntaxError] re-raised with a "Syntax error in msgid_plural" prefix
    def msgid_plural
      return false unless @scanner.scan(/msgid_plural/)

      skip_whitespace
      add_result(:msgid_plural, message_line)
      message_multiline(:msgid_plural) if @scanner.peek(1) == '"'
      true
    rescue PoSyntaxError => pe
      raise PoSyntaxError, "Syntax error in msgid_plural\n" + pe.message, pe.backtrace
    end

    # Parses the msgstr entry of a singular translation.
    #
    # msgstr is required for singular messages. Its text (including
    # continuation lines) is stored under :msgstr, and nothing but blanks may
    # follow it before the end of the message.
    #
    # @raise [PoSyntaxError] if msgstr is missing or content follows it;
    #   re-raised with a "Syntax error in msgstr" prefix
    def msgstr
      unless @scanner.scan(/msgstr/)
        raise PoSyntaxError, "Singular message without msgstr is not allowed. Line started unexpectedly with #{@scanner.peek(10).inspect}."
      end

      skip_whitespace
      add_result(:msgstr, message_line)
      message_multiline(:msgstr) if @scanner.peek(1) == '"'
      skip_whitespace
      return if @scanner.eos?

      raise PoSyntaxError, "Unexpected content after expected message end #{@scanner.peek(10).inspect}"
    rescue PoSyntaxError => syntax_error
      raise PoSyntaxError, "Syntax error in msgstr\n#{syntax_error.message}", syntax_error.backtrace
    end

    # Parses the msgstr[N] entries of a plural translation.
    #
    # Plural entries are used when the message has a msgid_plural. They have the
    # form msgstr[N], where N counts up from zero, one entry per plural form of
    # the target language. Each entry is stored under its literal key
    # (e.g. "msgstr[0]"). The method calls itself for the next index until no
    # further msgstr[N] line follows.
    #
    # @param num [Integer] the plural index expected on the current line
    # @raise [PoSyntaxError] if msgstr[0] is missing, an index is out of order,
    #   or other content follows the last entry; every recursion level prefixes
    #   the message with "Syntax error in msgstr_plural"
    def msgstr_plural(num = 0)
      plural_key = @scanner.scan(/msgstr\[\d\]/) # matches 'msgstr[0]' to 'msgstr[9]'
      unless plural_key
        if num.zero?
          raise PoSyntaxError, "Plural message without msgstr[0] is not allowed. Line started unexpectedly with #{@scanner.peek(10).inspect}."
        end
        return if @scanner.eos?

        raise PoSyntaxError, "End of message was expected, but line started unexpectedly with #{@scanner.peek(10).inspect}"
      end

      # msgstr plurals must come in 0-based index in order
      found_index = plural_key[/\d/].to_i
      raise PoSyntaxError, "Bad 'msgstr[index]' index." unless found_index == num

      skip_whitespace
      add_result(plural_key, message_line)
      message_multiline(plural_key) if @scanner.peek(1) == '"'
      msgstr_plural(num + 1)
    rescue PoSyntaxError => syntax_error
      raise PoSyntaxError, "Syntax error in msgstr_plural\n#{syntax_error.message}", syntax_error.backtrace
    end

    # Parses a "previous" comment line, which provides additional information
    # on fuzzy matching. Called after the leading '#|' and blanks were consumed.
    #
    # Supported entries and the keys they are stored under:
    # * #| msgctxt       -> :previous_msgctxt
    # * #| msgid         -> :previous_msgid
    # * #| msgid_plural  -> :previous_msgid_plural
    #
    # Continuation lines (#| "...") are handled by #previous_multiline.
    #
    # @raise [PoSyntaxError] if the entry is not one of the three types above;
    #   re-raised with a "Syntax error in previous_comments" prefix
    def previous_comments
      # next part must be msgctxt, msgid or msgid_plural
      unless @scanner.scan(/msg/)
        raise PoSyntaxError, "Previous comments must start with '#| msg'. #{@scanner.peek(10).inspect} unknown."
      end

      entry_key =
        if @scanner.scan(/id/)
          @scanner.scan(/_plural/) ? :previous_msgid_plural : :previous_msgid
        elsif @scanner.scan(/ctxt/)
          :previous_msgctxt
        else
          raise PoSyntaxError, "Previous comment type #{("msg" + @scanner.peek(10)).inspect} unknown."
        end

      skip_whitespace
      add_result(entry_key, message_line)
      previous_multiline(entry_key) if @scanner.match?(/#\|\p{Blank}*"/)
    rescue PoSyntaxError => syntax_error
      raise PoSyntaxError, "Syntax error in previous_comments\n#{syntax_error.message}", syntax_error.backtrace
    end

    # Parses the continuation lines of a previous comment (lines of the form
    # #| "text") and appends each text to the given key.
    #
    # The pattern /#\|\p{Blank}*"/ includes the double quote to make sure a
    # continuation line is hit and not another line type. The method calls
    # itself until the next line is no longer a continuation line.
    #
    # @param key [Symbol] the result key of the previous comment being continued
    # @raise [PoSyntaxError] re-raised with a "Syntax error in previous_multiline"
    #   prefix on every recursion level
    def previous_multiline(key)
      return unless @scanner.scan(/#\|\p{Blank}*"/)

      # step back onto the double quote so that message_line can be reused
      @scanner.pos -= 1
      add_result(key, message_line)
      previous_multiline(key)
    rescue PoSyntaxError => syntax_error
      raise PoSyntaxError, "Syntax error in previous_multiline\n#{syntax_error.message}", syntax_error.backtrace
    end

    # Parses the continuation lines of a multiline message and appends each
    # line's text to the given key.
    #
    # Multiline messages are usually indicated by an empty string on the first
    # line, followed by more lines starting with the double quote character.
    # According to the PO file standard the first line may contain content too.
    # Leading blanks are skipped; the method calls itself as long as the next
    # line starts with a double quote.
    #
    # @param key [Symbol, String] the result key of the entry being continued
    # @raise [PoSyntaxError] re-raised with a "Syntax error in message_multiline"
    #   prefix on every recursion level
    def message_multiline(key)
      skip_whitespace
      return unless @scanner.check(/"/)

      add_result(key, message_line)
      message_multiline(key)
    rescue PoSyntaxError => syntax_error
      raise PoSyntaxError, "Syntax error in message_multiline with key '#{key}'\n#{syntax_error.message}", syntax_error.backtrace
    end

    # Parses one quoted message line and returns its text.
    #
    # The line must consist of a double-quoted text, optionally followed by
    # blanks, and then end with a newline or the end of the input. The scanner
    # is left at the beginning of the next line.
    #
    # @return [String] the text between the double quotes (escape sequences are
    #   kept as written)
    # @raise [PoSyntaxError] if the opening or closing double quote is missing,
    #   or if anything but blanks follows the closing quote; re-raised with a
    #   "Syntax error in message_line" prefix
    def message_line
      # scan (instead of getch plus a manual rewind) consumes nothing on a mismatch,
      # so the excerpt in the error message is correct for multibyte characters and
      # at the end of the input as well
      unless @scanner.scan(/"/)
        err_msg = "A message text needs to start with the double quote character '\"',"
        err_msg += " but this was found: #{@scanner.peek(10).inspect}"
        raise PoSyntaxError, err_msg
      end

      text = message_text
      # message_text stops in front of an unescaped quote or at the end of the input
      unless @scanner.scan(/"/)
        err_msg = "The message text '#{text}' must be finished with the double quote character '\"'."
        raise PoSyntaxError, err_msg
      end

      skip_whitespace
      unless end_of_line
        err_msg = "There should be only whitespace until the end of line"
        err_msg += " after the double quote character of a message text."
        raise PoSyntaxError, err_msg
      end
      text
    rescue PoSyntaxError => pe
      raise PoSyntaxError, "Syntax error in message_line\n" + pe.message, pe.backtrace
    end

    # Parses all remaining obsolete lines (lines starting with '#~') and stores
    # the text of each one under :obsolete.
    #
    # An obsolete message may only contain obsolete lines, so the input has to
    # end right after the last of them.
    #
    # @raise [PoSyntaxError] if a line that is not obsolete follows
    def obsoletes
      while @scanner.scan(/#~/)
        skip_whitespace
        add_result(:obsolete, comment_text)
      end
      return if @scanner.eos?

      raise PoSyntaxError, "All lines must be obsolete after the first obsolete line, but got #{@scanner.peek(10).inspect}."
    end

    #########################################
    ###             scanning              ###
    #########################################

    # Returns the text of a comment: everything up to the end of the current
    # line with trailing whitespace removed. The scanner is advanced to the
    # beginning of the next line.
    #
    # @return [String] text
    # @raise [PoSyntaxError] if the scanner is neither at the beginning of a
    #   line nor at the end of the input afterwards; re-raised with a
    #   "Syntax error in comment_text" prefix
    def comment_text
      text = @scanner.scan(/.*/) # everything until newline
      text.rstrip! # benchmarked faster to rstrip the string in place
      raise PoSyntaxError, "Comment text should advance to next line or stop at eos" unless end_of_line
      text
    rescue PoSyntaxError => pe
      raise PoSyntaxError, "Syntax error in comment_text\n" + pe.message, pe.backtrace
    end

    # Returns the text of a message line: everything from the current position
    # up to (not including) the next unescaped double quote, or up to the end
    # of the input if there is none. Escaped quotes (\") and escaped
    # backslashes (\\) are part of the text.
    #
    # @return [String] text
    def message_text
      until_unescaped_quote = /(\\(\\|")|[^"])*/
      @scanner.scan_until(until_unescaped_quote)
    end

    # Advances the scanner past any blanks at the current position.
    # Only characters matched by \p{Blank} are skipped, so newlines are never
    # consumed.
    #
    # @return [Integer, nil] the length of the skipped blanks, nil if there were none
    def skip_whitespace
      blanks = /\p{Blank}+/
      @scanner.skip(blanks)
    end

    # Consumes a newline if one is next and reports whether the scanner now
    # stands at the beginning of a line or at the end of the string.
    #
    # @return [Boolean] true if scanner at beginning of line or eos
    def end_of_line
      @scanner.skip(/\n/)
      return true if @scanner.eos?

      @scanner.bol?
    end

    # Stores text under the given key of the result hash.
    #
    # The first text for a key is stored as is. When the key already holds a
    # value, the values are collected in an Array in the order they were added.
    #
    # @param key [Symbol, String] result key
    # @param text [String] text to store
    def add_result(key, text)
      case (current = @result[key])
      when nil, false
        @result[key] = text
      when Array
        current.push(text)
      else
        @result[key] = [current, text]
      end
    end
  end
end