class SimplePoParser::Parser

the next newline is hit (unless multi-line mode is explicitly enabled)
* Ruby's regex is by default in single-line mode, therefore scan will only match until
* the start of line anchor ^ is obsolete as scan will only match start of line.
a match of the regex (zero times). We make use of this “trick”
let scan return an empty string if there is “no match” as the empty string qualifies as
* scan will return nil if there is no match. Using the regex * (zero or more) quantifier will
Important notes about StringScanner.scan:
Fast parser directly using Ruby's powerful StringScanner (strscan)

def add_result(key, text)

creates an array if the given key already has a result
adds text to the given key in results
# Records +text+ under +key+ in @result.
#
# The first value for a key is stored as-is. A second value converts the entry
# into a two-element array, and any further values are appended to that array.
def add_result(key, text)
  current = @result[key]
  return @result[key] = text unless current

  case current
  when Array then current.push(text)
  else @result[key] = [current, text]
  end
end

def comment

Calls `lines` again once the comment line has been parsed
match a comment line. called on lines starting with '#'.
# Parses one comment line. The caller (see +lines+) has already consumed the
# leading '#', so the next character decides the comment type:
#
#   ' '  translator comment      '.'  extracted comment
#   ':'  reference               ','  flag
#   '|'  previous msgctxt/msgid  '~'  obsolete entry
#   "\n" empty translator comment
#
# After a regular comment, parsing continues with +lines+; after an obsolete
# line it continues with +obsoletes+.
#
# Raises PoSyntaxError for an unknown comment type, or when an obsolete line
# follows previous-comments that were already collected in @result.
def comment
  # Remember where the type character starts. Rewinding by a fixed byte count
  # would be wrong for multi-byte characters and at end of input, where getch
  # returns nil without advancing.
  type_pos = @scanner.pos
  case @scanner.getch
  when ' '
    skip_whitespace
    add_result(:translator_comment, comment_text)
    lines
  when '.'
    skip_whitespace
    add_result(:extracted_comment, comment_text)
    lines
  when ':'
    skip_whitespace
    add_result(:reference, comment_text)
    lines
  when ','
    skip_whitespace
    add_result(:flag, comment_text)
    lines
  when '|'
    skip_whitespace
    previous_comments
    lines
  when "\n"
    add_result(:translator_comment, "") # empty comment line
    lines
  when '~'
    if @result[:previous_msgctxt] || @result[:previous_msgid] || @result[:previous_msgid_plural]
      raise PoSyntaxError, "Previous comment entries need to be marked obsolete too in obsolete message entries. But already got: #{@result}"
    end
    skip_whitespace
    add_result(:obsolete, comment_text)
    obsoletes
  else
    # Step back onto the '#' so the error shows the offending line start.
    @scanner.pos = [type_pos - 1, 0].max
    raise PoSyntaxError, "Unknown comment type #{@scanner.peek(10).inspect}"
  end
rescue PoSyntaxError => pe
  raise PoSyntaxError, "Syntax error in comment\n" + pe.message, pe.backtrace
end

def comment_text

Returns:
  • (String) - text
# Reads the rest of the current line as comment text.
#
# Returns the text with trailing whitespace removed. The scanner is advanced
# past the line break (see +end_of_line+).
#
# Raises PoSyntaxError if the scanner is not at a line start or at the end of
# input afterwards.
def comment_text
  text = @scanner.scan(/.*/) # everything until newline; "" (never nil) on an empty line
  text.rstrip! # stripping in place was benchmarked as faster
  raise PoSyntaxError, "Comment text should advance to next line or stop at eos" unless end_of_line
  text
rescue PoSyntaxError => pe
  raise PoSyntaxError, "Syntax error in comment_text\n" + pe.message, pe.backtrace
end

def end_of_line

Returns:
  • (Boolean) - true if scanner at beginning of line or eos
# Consumes a single newline if one is next, then reports whether the scanner
# sits at the end of input or at the beginning of a line.
#
# Returns true or false.
def end_of_line
  @scanner.skip(/\n/)
  return true if @scanner.eos?

  @scanner.bol?
end

def lines

msgctxt -> msgid -> msgid_plural -> msgstr order
message parsing is always started with checking for msgctxt as content is expected in
arbitrary line of a PO message. Can be comment or message
# Entry point for one line of a PO message: a line starting with '#' is
# handed to +comment+, anything else starts message parsing at +msgctxt+.
#
# A PoSyntaxError from either path is re-raised as a ParserError, which stops
# the error from being re-wrapped further up the recursion.
def lines
  @scanner.scan(/#/) ? comment : msgctxt
rescue PoSyntaxError => pe
  # throw a normal ParserError to break the recursion
  raise ParserError, "Syntax error in lines\n" + pe.message, pe.backtrace
end

def message_line

Returns:
  • (String) - message_text
# Parses one double-quoted message text and requires that only whitespace
# follows it up to the end of the line.
#
# Returns the text between the quotes (escape sequences are left untouched).
#
# Raises PoSyntaxError if the opening or closing quote is missing, or if
# anything other than whitespace follows the closing quote.
def message_line
  # Remember the start so a failed match can be undone exactly. getch moves by
  # a whole character, and not at all at end of input, so stepping back a
  # fixed byte would show the wrong text in the error.
  start_pos = @scanner.pos
  unless @scanner.getch == '"'
    @scanner.pos = start_pos
    err_msg = "A message text needs to start with the double quote character '\"',"
    err_msg += " but this was found: #{@scanner.peek(10).inspect}"
    raise PoSyntaxError, err_msg
  end

  text = message_text
  unless @scanner.getch == '"'
    raise PoSyntaxError, "The message text '#{text}' must be finished with the double quote character '\"'."
  end

  skip_whitespace
  unless end_of_line
    err_msg = "There should be only whitespace until the end of line"
    err_msg += " after the double quote character of a message text."
    raise PoSyntaxError, err_msg
  end
  text
rescue PoSyntaxError => pe
  raise PoSyntaxError, "Syntax error in message_line\n" + pe.message, pe.backtrace
end

def message_multiline(key)

However, according to the PO file standard, the first line can also contain content.

followed by more lines starting with the double quote character.
Multiline messages are usually indicated by an empty string as the first line,

parses a multiline message
# Collects continuation lines of a multiline message under +key+.
#
# Leading blanks are skipped; as long as the next character is a double quote,
# the line is parsed with +message_line+, stored via +add_result+, and the
# method calls itself for the following line. Returns nil once no further
# quoted line follows.
#
# Raises PoSyntaxError (prefixed once per nesting level) if a line is malformed.
def message_multiline(key)
  skip_whitespace
  return unless @scanner.check(/"/)

  add_result(key, message_line)
  message_multiline(key)
rescue PoSyntaxError => pe
  raise PoSyntaxError, "Syntax error in message_multiline with key '#{key}'\n" + pe.message, pe.backtrace
end

def message_text

Returns:
  • (String) - text
# Reads message text up to, but not including, the next unescaped double quote.
#
# The pattern accepts the escape pairs \\ and \" as well as any character that
# is not a double quote. Because it can match zero characters, the result is
# an empty string rather than nil when a quote follows immediately.
def message_text
  until_unescaped_quote = /(\\(\\|")|[^"])*/
  @scanner.scan_until(until_unescaped_quote)
end

def msgctxt

msgctxt is optional

matches the msgctxt line and will continue to check for msgid afterwards
# Parses the optional msgctxt entry and then always continues with +msgid+.
#
# When the line starts with "msgctxt", its text (plus any continuation lines)
# is stored under :msgctxt. Returns whatever +msgid+ returns.
#
# Raises PoSyntaxError, prefixed with this method's name, on malformed input.
def msgctxt
  if @scanner.scan(/msgctxt/)
    skip_whitespace
    add_result(:msgctxt, message_line)
    # a line starting with a quote continues the msgctxt text
    message_multiline(:msgctxt) if @scanner.peek(1) == '"'
  end
  msgid
rescue PoSyntaxError => pe
  raise PoSyntaxError, "Syntax error in msgctxt\n" + pe.message, pe.backtrace
end

def msgid

msgid is required

Will advance to msgstr or msgstr_plural based on msgid_plural
matches the msgid line. Will check for optional msgid_plural.
# Parses the required msgid entry, including continuation lines, and stores
# it under :msgid. Afterwards it checks for an optional msgid_plural and
# continues with +msgstr_plural+ if one was found, otherwise with +msgstr+.
#
# Raises PoSyntaxError if the line does not start with "msgid" or if any of
# the following parts are malformed.
def msgid
  unless @scanner.scan(/msgid/)
    err_msg = "Message without msgid is not allowed."
    # leading space keeps the two sentences from running together
    err_msg += " The Line started unexpectedly with #{@scanner.peek(10).inspect}."
    raise PoSyntaxError, err_msg
  end

  skip_whitespace
  add_result(:msgid, message_line)
  message_multiline(:msgid) if @scanner.peek(1) == '"'
  msgid_plural ? msgstr_plural : msgstr
rescue PoSyntaxError => pe
  raise PoSyntaxError, "Syntax error in msgid\n" + pe.message, pe.backtrace
end

def msgid_plural

Returns:
  • (boolean) - true if msgid_plural is present, false otherwise
# Parses the optional msgid_plural entry, including continuation lines, and
# stores it under :msgid_plural.
#
# Returns true if a msgid_plural entry was parsed, false if the line does not
# start with "msgid_plural" (the scanner is left untouched in that case).
#
# Raises PoSyntaxError, prefixed with this method's name, on malformed input.
def msgid_plural
  return false unless @scanner.scan(/msgid_plural/)

  skip_whitespace
  add_result(:msgid_plural, message_line)
  message_multiline(:msgid_plural) if @scanner.peek(1) == '"'
  true
rescue PoSyntaxError => pe
  # name this method in the trace (it previously reported "msgid")
  raise PoSyntaxError, "Syntax error in msgid_plural\n" + pe.message, pe.backtrace
end

def msgstr

msgstr is required in singular translations

parses the msgstr singular line
# Parses the msgstr entry of a singular message, including continuation
# lines, and stores it under :msgstr. Nothing but blanks may follow it.
#
# Returns nil on success.
#
# Raises PoSyntaxError if the line does not start with "msgstr" or if
# non-blank content follows the entry.
def msgstr
  unless @scanner.scan(/msgstr/)
    raise PoSyntaxError, "Singular message without msgstr is not allowed. Line started unexpectedly with #{@scanner.peek(10).inspect}."
  end

  skip_whitespace
  add_result(:msgstr, message_line)
  message_multiline(:msgstr) if @scanner.peek(1) == '"'
  skip_whitespace
  return if @scanner.eos?

  raise PoSyntaxError, "Unexpected content after expected message end #{@scanner.peek(10).inspect}"
rescue PoSyntaxError => pe
  raise PoSyntaxError, "Syntax error in msgstr\n" + pe.message, pe.backtrace
end

def msgstr_plural(num = 0)

but there are languages with more plurals
English language only have two plural forms (singular and plural),
the plural number as specified in the headers "Plural-Forms" entry. Most languages, like the
They have the format msgstr[N] where N is incremental number starting from zero representing
msgstr plural lines are used when there is msgid_plural.

parses the msgstr plural lines
# Parses the msgstr[N] entries of a plural message, one per recursive call.
#
# num - the index expected next; entries must appear in order starting at 0.
#
# Each entry (with its continuation lines) is stored under its literal key,
# e.g. "msgstr[0]". Indices are not limited to a single digit.
#
# Raises PoSyntaxError if msgstr[0] is missing, if an index is out of order,
# or if anything other than the end of the message follows the last entry.
def msgstr_plural(num = 0)
  msgstr_key = @scanner.scan(/msgstr\[\d+\]/) # matches 'msgstr[0]', 'msgstr[1]', ... 'msgstr[10]', ...
  if msgstr_key
    # msgstr plurals must come in 0-based index in order; comparing the whole
    # key also rejects non-canonical spellings such as 'msgstr[00]'
    raise PoSyntaxError, "Bad 'msgstr[index]' index." if msgstr_key != "msgstr[#{num}]"

    skip_whitespace
    add_result(msgstr_key, message_line)
    message_multiline(msgstr_key) if @scanner.peek(1) == '"'
    msgstr_plural(num + 1)
  elsif num == 0 # not even the first plural form is present
    raise PoSyntaxError, "Plural message without msgstr[0] is not allowed. Line started unexpectedly with #{@scanner.peek(10).inspect}."
  else
    raise PoSyntaxError, "End of message was expected, but line started unexpectedly with #{@scanner.peek(10).inspect}" unless @scanner.eos?
  end
rescue PoSyntaxError => pe
  raise PoSyntaxError, "Syntax error in msgstr_plural\n" + pe.message, pe.backtrace
end

def obsoletes

An obsolete message may only contain obsolete lines
parses all obsolete lines.
# Parses the remaining lines of an obsolete entry, one per recursive call.
#
# Every remaining line has to start with '#~'; its text is stored under
# :obsolete. Returns nil once the end of the message is reached.
#
# Raises PoSyntaxError if a line that is not marked obsolete follows.
def obsoletes
  unless @scanner.scan(/#~/)
    return if @scanner.eos?

    raise PoSyntaxError, "All lines must be obsolete after the first obsolete line, but got #{@scanner.peek(10).inspect}."
  end

  skip_whitespace
  add_result(:obsolete, comment_text)
  obsoletes
end

def parse(message)

Returns:
  • (Hash) - parsed PO message information in Hash format

Parameters:
  • message () -- a single PO message in String format without leading or trailing whitespace
# Parses a single PO message.
#
# message - the PO message as a String; surrounding whitespace is stripped
#           before parsing.
#
# Returns the Hash of parsed entries collected in @result.
#
# Raises ParserError if the message is malformed. The error message contains
# the original error, the entries parsed so far, and the backtrace is reduced
# to the frames located under lib/simple_po_parser.
def parse(message)
  @result = {}
  @scanner = StringScanner.new(message.strip)
  begin
    lines
  rescue ParserError => pe
    error_msg = "SimplePoParser::ParserError: #{pe.message}" \
                "\nParsing result before error: '#{@result}'" \
                "\nSimplePoParser filtered backtrace: SimplePoParser::ParserError"
    # keep only the parser's own frames, joined into one backtrace string
    backtrace = pe.backtrace.grep(%r{lib/simple_po_parser}).join("\n\tfrom ")
    raise ParserError, error_msg, backtrace
  end
  @result
end

def previous_comments

* #| msgid_plural
* #| msgid
* #| msgctxt
previous comments are:

parses previous comments, which provide additional information on fuzzy matching
# Parses the body of a previous-comment line (the part after '#|').
#
# The line must continue with msgctxt, msgid or msgid_plural; its text is
# stored under :previous_msgctxt, :previous_msgid or :previous_msgid_plural.
# Continuation lines ('#|' followed by a quoted text) are handed to
# +previous_multiline+.
#
# Raises PoSyntaxError if the line does not name one of the three entry types
# or if the text is malformed.
def previous_comments
  # next part must be msgctxt, msgid or msgid_plural
  unless @scanner.scan(/msg/)
    raise PoSyntaxError, "Previous comments must start with '#| msg'. #{@scanner.peek(10).inspect} unknown."
  end

  key =
    if @scanner.scan(/id/)
      @scanner.scan(/_plural/) ? :previous_msgid_plural : :previous_msgid
    elsif @scanner.scan(/ctxt/)
      :previous_msgctxt
    else
      # "msg" was already consumed, so put it back in front for the report
      raise PoSyntaxError, "Previous comment type #{("msg" + @scanner.peek(10)).inspect} unknown."
    end

  skip_whitespace
  add_result(key, message_line)
  previous_multiline(key) if @scanner.match?(/#\|\p{Blank}*"/)
rescue PoSyntaxError => pe
  raise PoSyntaxError, "Syntax error in previous_comments\n" + pe.message, pe.backtrace
end

def previous_multiline(key)

parses the multiline messages of the previous comment lines
# Collects continuation lines of a previous comment under +key+, one line per
# recursive call.
#
# A continuation line is '#|', optional blanks and a double quote. The quote
# is part of the match so that other line types are not mistaken for a
# continuation. Returns nil once no continuation line follows.
#
# Raises PoSyntaxError (prefixed once per nesting level) on malformed text.
def previous_multiline(key)
  return unless @scanner.scan(/#\|\p{Blank}*"/)

  # step back onto the quote so message_line can parse the text as usual
  @scanner.pos -= 1
  add_result(key, message_line)
  previous_multiline(key) # keep going until no further continuation line is hit
rescue PoSyntaxError => pe
  raise PoSyntaxError, "Syntax error in previous_multiline\n" + pe.message, pe.backtrace
end

def skip_whitespace

Does not match newlines. See WHITESPACE_REGEX constant
advances the scanner until the next non whitespace position.
# Moves the scanner past a run of blank characters (\p{Blank}) at the current
# position; a newline is not skipped.
#
# Returns the result of StringScanner#skip: the number of bytes skipped, or
# nil if no blank character is next.
def skip_whitespace
  blank_run = /\p{Blank}+/
  @scanner.skip(blank_run)
end