# Lexes `source` with Prism and converts the resulting token stream into
# Ripper-shaped tokens of the form [[lineno, column], event, value, lex_state],
# reordering heredoc tokens along the way so that the output can be compared
# against Ripper's.
#
# Returns a Result holding the converted tokens (with the trailing EOF token
# dropped and the list sorted by location) together with the comments, magic
# comments, data_loc, errors and warnings taken from the Prism lex result, and
# a Source built from `source`.
def result
# The converted tokens, in the order they will be handed back (before the
# final sort).
tokens = [] #: Array[LexCompat::Token]
# Heredoc state machine: one of :default, :heredoc_opened, :heredoc_closed.
state = :default
# One list of heredocs per nesting level; a heredoc declared inside the body
# of another heredoc pushes a new list onto this stack.
heredoc_stack = [[]] #: Array[Array[Heredoc::PlainHeredoc | Heredoc::DashHeredoc | Heredoc::DedentingHeredoc]]
result = Prism.lex(source, **options)
result_value = result.value
# The (possibly substituted) lex state of the token handled on the previous
# iteration; consumed by the on_regexp_end branch below.
previous_state = nil #: Ripper::Lexer::State?
# End offset of the most recently seen heredoc end token; consumed by the
# on_eof branch below.
last_heredoc_end = nil #: Integer?
# In previous versions of Ruby, Ripper wouldn't flush the bom before the
# first token, so we had to have a hack in place to account for that. This
# checks for that behavior.
# ([0][0][1] is the column of the first token that Ripper reports.)
bom_flushed = Ripper.lex("\xEF\xBB\xBF# test")[0][0][1] == 0
bom = source.byteslice(0..2) == "\xEF\xBB\xBF"
result_value.each_with_index do |(token, lex_state), index|
lineno = token.location.start_line
column = token.location.start_column
# If there's a UTF-8 byte-order mark at the start of the file, then for
# certain tokens ripper sets the first token back by 3 bytes. It also
# keeps the byte order mark in the first token's value. This is weird,
# and I don't want to mirror that in our parser. So instead, we'll match
# up the columns and values here.
if bom && lineno == 1
column -= 3
if index == 0 && column == 0 && !bom_flushed
flushed =
case token.type
when :BACK_REFERENCE, :INSTANCE_VARIABLE, :CLASS_VARIABLE,
:GLOBAL_VARIABLE, :NUMBERED_REFERENCE, :PERCENT_LOWER_I,
:PERCENT_LOWER_X, :PERCENT_LOWER_W, :PERCENT_UPPER_I,
:PERCENT_UPPER_W, :STRING_BEGIN
true
when :REGEXP_BEGIN, :SYMBOL_BEGIN
token.value.start_with?("%")
else
false
end
unless flushed
column -= 3
# NOTE(review): `value` is rebound from token.value again a few lines
# below, so this prepend only takes effect because it mutates the string
# in place. This assumes token.value returns the same String object on
# every call -- confirm.
value = token.value
value.prepend(String.new("\xEF\xBB\xBF", encoding: value.encoding))
end
end
end
# Translate the Prism token type into the Ripper event name.
event = RIPPER.fetch(token.type)
value = token.value
lex_state = Ripper::Lexer::State.new(lex_state)
# From here on `token` is rebound to the Ripper-compatible token. The
# branches below that still call token.location run before the assignment
# completes, so they are reading the original Prism token.
token =
case event
when :on___end__
EndContentToken.new([[lineno, column], event, value, lex_state])
when :on_comment
IgnoreStateToken.new([[lineno, column], event, value, lex_state])
when :on_heredoc_end
# Heredoc end tokens can be emitted in an odd order, so we don't
# want to bother comparing the state on them.
last_heredoc_end = token.location.end_offset
IgnoreStateToken.new([[lineno, column], event, value, lex_state])
when :on_ident
if lex_state == Ripper::EXPR_END
# If we have an identifier that follows a method name like:
#
# def foo bar
#
# then Ripper will mark bar as END|LABEL if there is a local in a
# parent scope named bar because it hasn't pushed the local table
# yet. We do this more accurately, so we need to allow comparing
# against both END and END|LABEL.
ParamToken.new([[lineno, column], event, value, lex_state])
elsif lex_state == Ripper::EXPR_END | Ripper::EXPR_LABEL
# In the event that we're comparing identifiers, we're going to
# allow a little divergence. Ripper doesn't account for local
# variables introduced through named captures in regexes, and we
# do, which accounts for this difference.
IdentToken.new([[lineno, column], event, value, lex_state])
else
Token.new([[lineno, column], event, value, lex_state])
end
when :on_embexpr_end
IgnoreStateToken.new([[lineno, column], event, value, lex_state])
when :on_ignored_nl
# Ignored newlines can occasionally have a LABEL state attached to
# them which doesn't actually impact anything. We don't mirror that
# state so we ignore it.
IgnoredNewlineToken.new([[lineno, column], event, value, lex_state])
when :on_regexp_end
# On regex end, Ripper scans and then sets end state, so the ripper
# lexed output is begin, when it should be end. prism sets lex state
# correctly to end state, but we want to be able to compare against
# Ripper's lexed state. So here, if it's a regexp end token, we
# output the state as the previous state, solely for the sake of
# comparison.
previous_token = result_value[index - 1][0]
lex_state =
if RIPPER.fetch(previous_token.type) == :on_embexpr_end
# If the previous token is embexpr_end, then we have to do even
# more processing. The end of an embedded expression sets the
# state to the state that it had at the beginning of the
# embedded expression. So we have to go and find that state and
# set it here.
#
# Walk backwards from the embexpr_end, counting +1 for every
# further embexpr_end and -1 for every embexpr_beg, until the
# count reaches zero at the matching embexpr_beg.
counter = 1
current_index = index - 1
until counter == 0
current_index -= 1
current_event = RIPPER.fetch(result_value[current_index][0].type)
counter += { on_embexpr_beg: -1, on_embexpr_end: 1 }[current_event] || 0
end
# Use the lex state that Prism recorded alongside that embexpr_beg.
Ripper::Lexer::State.new(result_value[current_index][1])
else
previous_state
end
Token.new([[lineno, column], event, value, lex_state])
when :on_eof
previous_token = result_value[index - 1][0]
# If we're at the end of the file and the previous token was a
# comment and there is still whitespace after the comment, then
# Ripper will append an on_nl token (even though there isn't
# necessarily a newline). We mirror that here.
if previous_token.type == :COMMENT
# If the comment is at the start of a heredoc: <<HEREDOC # comment
# then the comment's end_offset is up near the heredoc_beg.
# This is not the correct offset to use for figuring out if
# there is trailing whitespace after the last token.
# Use the greater offset of the two to determine the start of
# the trailing whitespace.
start_offset = [previous_token.location.end_offset, last_heredoc_end].compact.max
end_offset = token.location.start_offset
if start_offset < end_offset
# NOTE(review): both offsets are shifted by 3 before slicing `source`
# when a BOM is present; presumably the Prism offsets do not count the
# BOM bytes while `source` still contains them -- confirm.
if bom
start_offset += 3
end_offset += 3
end
# The synthetic on_nl is placed at column 0 of the EOF token's line and
# carries the trailing bytes as its value.
tokens << Token.new([[lineno, 0], :on_nl, source.byteslice(start_offset...end_offset), lex_state])
end
end
# The EOF token itself still goes through the heredoc handling below; the
# last token in the list is dropped after the loop.
Token.new([[lineno, column], event, value, lex_state])
else
Token.new([[lineno, column], event, value, lex_state])
end
# Remember this token's state (after any substitution made above) for the
# next iteration.
previous_state = lex_state
# The order in which tokens appear in our lexer is different from the
# order that they appear in Ripper. When we hit the declaration of a
# heredoc in prism, we skip forward and lex the rest of the content of
# the heredoc before going back and lexing at the end of the heredoc
# identifier.
#
# To match up to ripper, we keep a small state variable around here to
# track whether we're in the middle of a heredoc or not. In this way we
# can shuffle around the token to match Ripper's output.
case state
when :default
# The default state is when there are no heredocs at all. In this
# state we can append the token to the list of tokens and move on.
tokens << token
# If we get the declaration of a heredoc, then we open a new heredoc
# and move into the heredoc_opened state.
if event == :on_heredoc_beg
state = :heredoc_opened
heredoc_stack.last << Heredoc.build(token)
end
when :heredoc_opened
# The heredoc_opened state is when we've seen the declaration of a
# heredoc and are now lexing the body of the heredoc. In this state we
# push tokens onto the most recently created heredoc.
heredoc_stack.last.last << token
case event
when :on_heredoc_beg
# If we receive a heredoc declaration while lexing the body of a
# heredoc, this means we have nested heredocs. In this case we'll
# push a new heredoc onto the stack and stay in the heredoc_opened
# state since we're now lexing the body of the new heredoc.
heredoc_stack << [Heredoc.build(token)]
when :on_heredoc_end
# If we receive the end of a heredoc, then we're done lexing the
# body of the heredoc. In this case we now have a completed heredoc
# but need to wait for the next newline to push it into the token
# stream.
state = :heredoc_closed
end
when :heredoc_closed
# The heredoc_closed state is when the most recent heredoc has been
# terminated but its tokens have not yet been moved to where they belong.
# What happens next depends on the incoming token and on whether we are
# inside a nested heredoc.
if %i[on_nl on_ignored_nl on_comment].include?(event) || (event == :on_tstring_content && value.end_with?("\n"))
# A newline, a comment, or string content ending in a newline. If we are
# nested, pop the inner level and splice first this token and then all of
# the inner heredocs' tokens into the enclosing heredoc, then go back to
# lexing the enclosing heredoc's body. If we are not nested, fall through
# to the flush at the bottom of this branch.
if heredoc_stack.size > 1
flushing = heredoc_stack.pop
heredoc_stack.last.last << token
flushing.each do |heredoc|
heredoc.to_a.each do |flushed_token|
heredoc_stack.last.last << flushed_token
end
end
state = :heredoc_opened
next
end
elsif event == :on_heredoc_beg
# Another heredoc is declared before the pending ones were flushed. Emit
# the declaration, add the new heredoc to the current level and start
# collecting its body.
tokens << token
state = :heredoc_opened
heredoc_stack.last << Heredoc.build(token)
next
elsif heredoc_stack.size > 1
# Any other token while nested is attached to the most recent heredoc of
# the enclosing level; we stay in heredoc_closed.
heredoc_stack[-2].last << token
next
end
# Reached only at the outermost level: move every pending heredoc's tokens
# into the output, reset to the default state and then emit the current
# token.
heredoc_stack.last.each do |heredoc|
tokens.concat(heredoc.to_a)
end
heredoc_stack.last.clear
state = :default
tokens << token
end
end
# Drop the EOF token from the list
tokens = tokens[0...-1]
# We sort by location to compare against Ripper's output
tokens.sort_by!(&:location)
Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, Source.for(source))
end