lib/kleene/dsl.rb



# Most of the machines constructed here are based on section 2.5 of the Ragel User Guide (http://www.colm.net/files/ragel/ragel-guide-6.10.pdf)

module Kleene
  module DSL
    extend self

    ############### The following methods create FSAs given a stream of input tokens #################

    # given a string with N characters in it:
    # N+1 states: start state and N other states
    # structure: start state -> transition for first character in the string -> state for having observed first character in the string ->
    #                           transition for second character in the string -> state for having observed second character in the string ->
    #                           ...
    #                           transition for last character in the string -> state for having observed last character in the string (marked final)
    def literal(token_stream, alphabet = DEFAULT_ALPHABET)
      start = current_state = State.new
      nfa = NFA.new(start, alphabet)
      token_stream.each_char do |token|
        next_state = State.new
        nfa.add_transition(token, current_state, next_state)
        current_state = next_state
      end
      current_state.final = true
      nfa.update_final_states
      nfa.set_regex_pattern(token_stream)
      nfa
    end

    # two states: start state and final state
    # structure: start state -> transitions for each token in the token collection -> final state
    def any(token_collection, alphabet = DEFAULT_ALPHABET)
      start = State.new
      nfa = NFA.new(start, alphabet)
      final = State.new(true)
      token_collection.each {|token| nfa.add_transition(token, start, final) }
      nfa.update_final_states
      regex_pattern = "[#{token_collection.join("")}]"
      nfa.set_regex_pattern(regex_pattern)
      nfa
    end

    # two states: start state and final state
    # structure: start state -> transitions for every token in the alphabet -> final state
    def dot(alphabet = DEFAULT_ALPHABET)
      any(alphabet, alphabet).set_regex_pattern(".")
    end

    # This implements a character class, and is specifically for use with matching strings
    def range(c_begin, c_end, alphabet = DEFAULT_ALPHABET)
      any((c_begin..c_end).to_a, alphabet).set_regex_pattern("[#{c_begin}-#{c_end}]")
    end

    ############### The following methods create FSAs given other FSAs #################

    # always clones the given nfa and returns a new nfa with a non-final error state
    def with_err(nfa, alphabet = nfa.alphabet)
      with_err!(nfa.deep_clone, alphabet)
    end

    # adds and error state to the NFA, create error transitions from all non-error states to the error state on any unhandled token.
    # the error state transitions to itself on any token.
    def with_err!(nfa, alphabet = nfa.alphabet)
      error_state = nfa.states.find(&:error?)
      return nfa if error_state

      error_state = State.new_error_state
      nfa.add_state(error_state)

      nfa.states.each do |state|
        tokens_on_outbound_transitions = nfa.transitions_from(state).map(&:token)
        missing_tokens = alphabet - tokens_on_outbound_transitions
        missing_tokens.each do |token|
          nfa.add_transition(token, state, error_state)
        end
      end

      nfa.remove_state(error_state) if nfa.all_transitions.none? {|transition| transition.from == error_state || transition.to == error_state }

      nfa.set_regex_pattern("/#{nfa.regex_pattern}/E")
    end

    # always clones the given nfa and returns a new nfa with a non-final error state
    def with_err_dead_end(nfa, alphabet = nfa.alphabet)
      with_err_dead_end!(nfa.deep_clone, alphabet)
    end

    # adds and error state to the NFA, create error transitions from all non-error states to the error state on any unhandled token.
    # the error state doesn't have any outbound transitions.
    def with_err_dead_end!(nfa, alphabet = nfa.alphabet)
      error_state = nfa.states.find(&:error?)
      return nfa if error_state

      error_state = State.new_error_state
      nfa.add_state(error_state)

      nfa.states.each do |state|
        unless state.error?
          tokens_on_outbound_transitions = nfa.transitions_from(state).map(&:token).to_set
          only_outbound_transition_is_epsilon_transition = tokens_on_outbound_transitions.size == 1 && tokens_on_outbound_transitions.first == NFATransition::Epsilon
          unless only_outbound_transition_is_epsilon_transition
            missing_tokens = (alphabet - Set[NFATransition::Epsilon]) - tokens_on_outbound_transitions
            missing_tokens.each do |token|
              nfa.add_transition(token, state, error_state)
            end
          end
        end
      end

      # remove the error state if it has no inbound or outbound transitions
      nfa.remove_state(error_state) if nfa.all_transitions.none? {|transition| transition.from == error_state || transition.to == error_state }

      nfa.set_regex_pattern("/#{nfa.regex_pattern}/DE")
    end

    # Append b onto a
    # Appending produces a machine that matches all the strings in machine a followed by all the strings in machine b.
    # This differs from `seq` in that the composite machine's final states are the union of machine a's final states and machine b's final states.
    def append(a, b)
      a = a.deep_clone
      b = b.deep_clone
      append!(a, b)
    end

    # Destructively append b onto a
    # Appending produces a machine that matches all the strings in machine a followed by all the strings in machine b.
    # This differs from `seq` in that the composite machine's final states are the union of machine a's final states and machine b's final states.
    def append!(a, b)
      a.alphabet = a.alphabet | b.alphabet

      # add an epsilon transition from each final state of machine a to the start state of maachine b.
      a.final_states.each do |final_state|
        a.add_transition(NFATransition::Epsilon, final_state, b.start_state)
      end

      # add all of machine b's transitions to machine a
      b.all_transitions.each {|transition| a.add_transition(transition.token, transition.from, transition.to) }
      # a.final_states = a.final_states | b.final_states
      a.update_final_states

      a
    end

    def seq(*nfas)
      nfas.flatten.reduce {|memo_nfa, nfa| seq2(memo_nfa, nfa) }
    end

    # Implements concatenation, as defined in the Ragel manual in section 2.5.5 of http://www.colm.net/files/ragel/ragel-guide-6.10.pdf:
    # Seq produces a machine that matches all the strings in machine `a` followed by all the strings in machine `b`.
    # Seq draws epsilon transitions from the final states of thefirst machine to the start state of the second machine.
    # The final states of the first machine lose their final state status, unless the start state of the second machine is final as well.
    def seq2(a, b)
      a = a.deep_clone
      b = b.deep_clone

      a = append!(a, b)

      # make sure that b's final states are the only final states in a after we have appended b onto a
      a.states.each {|state| state.final = b.final_states.includes?(state) }
      a.update_final_states

      a.set_regex_pattern("#{a.regex_pattern}#{b.regex_pattern}")
    end

    # Build a new machine consisting of a new start state with epsilon transitions to the start state of all the given NFAs in `nfas`.
    # The resulting machine's final states are the set of final states from all the NFAs in `nfas`.
    #
    # Implements Union, as defined in the Ragel manual in section 2.5.1 of http://www.colm.net/files/ragel/ragel-guide-6.10.pdf:
    # The union operation produces a machine that matches any string in machine one or machine two.
    # The operation first creates a new start state.
    # Epsilon transitions are drawn from the new start state to the start states of both input machines.
    # The resulting machine has a final state setequivalent to the union of the final state sets of both input machines.
    def union(*nfas)
      nfas.flatten!
      nfas = nfas.map(&:deep_clone)
      union!(nfas)
    end

    # same as union, but doesn't deep clone the constituent nfas
    def union!(nfas)
      start = State.new
      composite_alphabet = nfas.map(&:alphabet).reduce {|memo, alphabet| memo | alphabet }
      new_nfa = NFA.new(start, composite_alphabet)

      # add epsilon transitions from the start state of the new machine to the start state of machines a and b
      nfas.each do |nfa|
        new_nfa.add_states(nfa.states)
        new_nfa.add_transition(NFATransition::Epsilon, start, nfa.start_state)
        nfa.all_transitions.each {|t| new_nfa.add_transition(t.token, t.from, t.to) }
      end

      new_nfa.update_final_states

      new_nfa.set_regex_pattern("#{nfas.map(&:regex_pattern).join("|")}")
    end

    # Implements Kleene Star, as defined in the Ragel manual in section 2.5.6 of http://www.colm.net/files/ragel/ragel-guide-6.10.pdf:
    # The machine resulting from the Kleene Star operator will match zero or more repetitions of the machine it is applied to.
    # It creates a new start state and an additional final state.
    # Epsilon transitions are drawn between the new start state and the old start state,
    # between the new start state and the new final state, and between the final states of the machine and the new start state.
    def kleene(machine)
      machine = machine.deep_clone
      start = State.new
      final = State.new(true)

      nfa = NFA.new(start, machine.alphabet)
      nfa.add_states(machine.states)
      nfa.add_transition(NFATransition::Epsilon, start, final)
      nfa.add_transition(NFATransition::Epsilon, start, machine.start_state)
      machine.final_states.each do |final_state|
        nfa.add_transition(NFATransition::Epsilon, final_state, start)
        final_state.final = false
      end

      # add all of machine's transitions to the new machine
      (machine.all_transitions).each {|t| nfa.add_transition(t.token, t.from, t.to) }
      nfa.update_final_states

      nfa.set_regex_pattern("#{machine.regex_pattern}*")
    end

    def plus(machine)
      seq(machine, kleene(machine)).set_regex_pattern("#{machine.regex_pattern}+")
    end

    def optional(machine)
      empty = NFA.new(State.new(true), machine.alphabet).set_regex_pattern("")
      union(machine, empty).set_regex_pattern("#{machine.regex_pattern}?")
    end

    # def repeat(machine, min, max = nil)
    #   max ||= min
    #   m = NFA.new(State.new(true), machine.alphabet)
    #   min.times { m = seq(m, machine) }
    #   (max - min).times { m = append(m, machine) }
    #   if min != max
    #     m.set_regex_pattern("#{machine.regex_pattern}{#{min},#{max}}")
    #   else
    #     m.set_regex_pattern("#{machine.regex_pattern}{#{min}}")
    #   end
    # end

    # def negate(machine)
    #   machine = machine.to_dfa

    #   # invert the final flag of every state
    #   machine.states.each {|state| state.final = !state.final? }
    #   machine.update_final_states

    #   machine.to_nfa.set_regex_pattern("(!#{machine.regex_pattern})")
    # end

    # # a - b == a && !b
    # def difference(a, b)
    #   intersection(a, negate(b))
    # end

    # # By De Morgan's Law: !(!a || !b) = a && b
    # def intersection(a, b)
    #   negate(union(negate(a), negate(b)))
    # end
  end
end