class Prism::Translation::Parser::Lexer
Accepts a list of prism tokens and converts them into the expected format for the parser gem.
Attributes
An array of two-element tuples, each containing a prism token and the lexer state that was active when that token was lexed.
A hash that maps offsets in bytes to offsets in characters.
The Parser::Source::Buffer that the tokens were lexed from.
Public Class Methods
Initialize the lexer with the given source buffer, prism tokens, and offset cache.
# File lib/prism/translation/parser/lexer.rb, line 229
#
# Build a lexer around the Parser::Source::Buffer the tokens came from,
# the prism (token, state) pairs lexed from it, and a cache mapping byte
# offsets to character offsets.
def initialize(source_buffer, lexed, offset_cache)
  # Stash all three collaborators in a single parallel assignment.
  @source_buffer, @lexed, @offset_cache = source_buffer, lexed, offset_cache
end
Public Instance Methods
Convert the prism tokens into the expected format for the parser gem.
# File lib/prism/translation/parser/lexer.rb, line 239
#
# Convert the prism tokens into the expected format for the parser gem:
# an array of [type, [value, location]] pairs. Walks the lexed (token,
# state) pairs once, translating token types via TYPES and massaging
# values/locations branch by branch to match the parser gem's quirks
# (heredocs, percent arrays, comments, line continuations, etc.).
#
# NOTE(review): the only code change from the original is in the :tNL
# branch, where a redundant doubled assignment
# (`next_token = next_token = ...`) was collapsed to a single assignment.
def to_a
  tokens = []

  index = 0
  length = lexed.length

  # Stacks tracking the currently-open heredocs and quote styles, since
  # string-content handling depends on the innermost enclosing literal.
  heredoc_stack = []
  quote_stack = []

  # The parser gem emits the newline tokens for comments out of order. This saves
  # that token location to emit at a later time to properly line everything up.
  # https://github.com/whitequark/parser/issues/1025
  comment_newline_location = nil

  while index < length
    token, state = lexed[index]
    index += 1
    next if TYPES_ALWAYS_SKIP.include?(token.type)

    type = TYPES.fetch(token.type)
    value = token.value
    location = range(token.location.start_offset, token.location.end_offset)

    case type
    when :kDO
      # `do` after a lambda arrow is a different token kind in the parser gem.
      nearest_lambda_token = tokens.reverse_each.find do |token|
        LAMBDA_TOKEN_TYPES.include?(token.first)
      end

      if nearest_lambda_token&.first == :tLAMBDA
        type = :kDO_LAMBDA
      end
    when :tCHARACTER
      value.delete_prefix!("?")
      # Character literals behave similar to double-quoted strings. We can use the same escaping mechanism.
      value = unescape_string(value, "?")
    when :tCOMMENT
      if token.type == :EMBDOC_BEGIN
        # Collapse an =begin/=end documentation block into one comment token.
        while !((next_token = lexed[index][0]) && next_token.type == :EMBDOC_END) && (index < length - 1)
          value += next_token.value
          index += 1
        end

        value += next_token.value
        location = range(token.location.start_offset, lexed[index][0].location.end_offset)
        index += 1
      else
        is_at_eol = value.chomp!.nil?
        location = range(token.location.start_offset, token.location.end_offset + (is_at_eol ? 0 : -1))

        prev_token = lexed[index - 2][0] if index - 2 >= 0
        next_token = lexed[index][0]

        is_inline_comment = prev_token&.location&.start_line == token.location.start_line
        if is_inline_comment && !is_at_eol && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
          tokens << [:tCOMMENT, [value, location]]

          nl_location = range(token.location.end_offset - 1, token.location.end_offset)
          tokens << [:tNL, [nil, nl_location]]
          next
        elsif is_inline_comment && next_token&.type == :COMMENT
          comment_newline_location = range(token.location.end_offset - 1, token.location.end_offset)
        elsif comment_newline_location && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
          tokens << [:tCOMMENT, [value, location]]
          tokens << [:tNL, [nil, comment_newline_location]]
          comment_newline_location = nil
          next
        end
      end
    when :tNL
      next_token = lexed[index][0]
      # Newlines after comments are emitted out of order.
      if next_token&.type == :COMMENT
        comment_newline_location = location
        next
      end

      value = nil
    when :tFLOAT
      value = parse_float(value)
    when :tIMAGINARY
      value = parse_complex(value)
    when :tINTEGER
      # The parser gem splits a leading "+" off into its own unary token.
      if value.start_with?("+")
        tokens << [:tUNARY_NUM, ["+", range(token.location.start_offset, token.location.start_offset + 1)]]
        location = range(token.location.start_offset + 1, token.location.end_offset)
      end

      value = parse_integer(value)
    when :tLABEL
      value.chomp!(":")
    when :tLABEL_END
      value.chomp!(":")
    when :tLCURLY
      type = :tLBRACE if state == EXPR_BEG | EXPR_LABEL
    when :tLPAREN2
      type = :tLPAREN if tokens.empty? || LPAREN_CONVERSION_TOKEN_TYPES.include?(tokens.dig(-1, 0))
    when :tNTH_REF
      value = parse_integer(value.delete_prefix("$"))
    when :tOP_ASGN
      value.chomp!("=")
    when :tRATIONAL
      value = parse_rational(value)
    when :tSPACE
      location = range(token.location.start_offset, token.location.start_offset + percent_array_leading_whitespace(value))
      value = nil
    when :tSTRING_BEG
      next_token = lexed[index][0]
      next_next_token = lexed[index + 1][0]
      basic_quotes = value == '"' || value == "'"

      if basic_quotes && next_token&.type == :STRING_END
        # An empty basic-quoted string collapses to a single :tSTRING token.
        next_location = token.location.join(next_token.location)
        type = :tSTRING
        value = ""
        location = range(next_location.start_offset, next_location.end_offset)
        index += 1
      elsif value.start_with?("'", '"', "%")
        if next_token&.type == :STRING_CONTENT && next_next_token&.type == :STRING_END
          string_value = next_token.value
          if simplify_string?(string_value, value)
            # Simple one-part strings are collapsed to a single :tSTRING token.
            next_location = token.location.join(next_next_token.location)
            if percent_array?(value)
              value = percent_array_unescape(string_value)
            else
              value = unescape_string(string_value, value)
            end
            type = :tSTRING
            location = range(next_location.start_offset, next_location.end_offset)
            index += 2
            tokens << [type, [value, location]]

            next
          end
        end

        quote_stack.push(value)
      elsif token.type == :HEREDOC_START
        quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2]
        heredoc_type = value[2] == "-" || value[2] == "~" ? value[2] : ""
        heredoc = HeredocData.new(
          identifier: value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier],
          common_whitespace: 0,
        )

        if quote == "`"
          type = :tXSTRING_BEG
        end

        # The parser gem trims whitespace from squiggly heredocs. We must record
        # the most common whitespace to later remove.
        if heredoc_type == "~" || heredoc_type == "`"
          heredoc.common_whitespace = calculate_heredoc_whitespace(index)
        end

        if quote == "'" || quote == '"' || quote == "`"
          value = "<<#{quote}"
        else
          value = '<<"'
        end

        heredoc_stack.push(heredoc)
        quote_stack.push(value)
      end
    when :tSTRING_CONTENT
      is_percent_array = percent_array?(quote_stack.last)

      if (lines = token.value.lines).one?
        # Prism usually emits a single token for strings with line continuations.
        # For squiggly heredocs they are not joined so we do that manually here.
        current_string = +""
        current_length = 0
        start_offset = token.location.start_offset
        while token.type == :STRING_CONTENT
          current_length += token.value.bytesize
          # Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
          is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
          # The parser gem only removes indentation when the heredoc is not nested
          not_nested = heredoc_stack.size == 1
          if is_percent_array
            value = percent_array_unescape(token.value)
          elsif is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
            value = trim_heredoc_whitespace(token.value, current_heredoc)
          end

          current_string << unescape_string(value, quote_stack.last)
          # An even number of trailing backslashes (or no interpolation) means
          # there is no line continuation to join; emit and stop accumulating.
          if (backslash_count = token.value[/(\\{1,})\n/, 1]&.length).nil? || backslash_count.even? || !interpolation?(quote_stack.last)
            tokens << [:tSTRING_CONTENT, [current_string, range(start_offset, start_offset + current_length)]]
            break
          end
          token = lexed[index][0]
          index += 1
        end
      else
        # When the parser gem encounters a line continuation inside of a multiline string,
        # it emits a single string node. The backslash (and remaining newline) is removed.
        current_line = +""
        adjustment = 0
        start_offset = token.location.start_offset
        emit = false

        lines.each.with_index do |line, index|
          chomped_line = line.chomp
          backslash_count = chomped_line[/\\{1,}\z/]&.length || 0
          is_interpolation = interpolation?(quote_stack.last)

          if backslash_count.odd? && (is_interpolation || is_percent_array)
            if is_percent_array
              current_line << percent_array_unescape(line)
              adjustment += 1
            else
              chomped_line.delete_suffix!("\\")
              current_line << chomped_line
              adjustment += 2
            end
            # If the string ends with a line continuation emit the remainder
            emit = index == lines.count - 1
          else
            current_line << line
            emit = true
          end

          if emit
            end_offset = start_offset + current_line.bytesize + adjustment
            tokens << [:tSTRING_CONTENT, [unescape_string(current_line, quote_stack.last), range(start_offset, end_offset)]]
            start_offset = end_offset
            current_line = +""
            adjustment = 0
          end
        end
      end
      next
    when :tSTRING_DVAR
      value = nil
    when :tSTRING_END
      if token.type == :HEREDOC_END && value.end_with?("\n")
        newline_length = value.end_with?("\r\n") ? 2 : 1
        value = heredoc_stack.pop.identifier
        location = range(token.location.start_offset, token.location.end_offset - newline_length)
      elsif token.type == :REGEXP_END
        # Only the closing "/" belongs to the end token; flags are emitted
        # separately as :tREGEXP_OPT below.
        value = value[0]
        location = range(token.location.start_offset, token.location.start_offset + 1)
      end

      if percent_array?(quote_stack.pop)
        prev_token = lexed[index - 2][0] if index - 2 >= 0
        empty = %i[PERCENT_LOWER_I PERCENT_LOWER_W PERCENT_UPPER_I PERCENT_UPPER_W].include?(prev_token&.type)
        ends_with_whitespace = prev_token&.type == :WORDS_SEP
        # parser always emits a space token after content in a percent array, even if no actual whitespace is present.
        if !empty && !ends_with_whitespace
          tokens << [:tSPACE, [nil, range(token.location.start_offset, token.location.start_offset)]]
        end
      end
    when :tSYMBEG
      # A plain symbol (no string content/interpolation) merges with the
      # following token into a single :tSYMBOL.
      if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR && next_token.type != :STRING_END
        next_location = token.location.join(next_token.location)
        type = :tSYMBOL
        value = next_token.value
        value = { "~@" => "~", "!@" => "!" }.fetch(value, value)
        location = range(next_location.start_offset, next_location.end_offset)
        index += 1
      else
        quote_stack.push(value)
      end
    when :tFID
      if !tokens.empty? && tokens.dig(-1, 0) == :kDEF
        type = :tIDENTIFIER
      end
    when :tXSTRING_BEG
      if (next_token = lexed[index][0]) && !%i[STRING_CONTENT STRING_END EMBEXPR_BEGIN].include?(next_token.type)
        # self.`()
        type = :tBACK_REF2
      end
      quote_stack.push(value)
    when :tSYMBOLS_BEG, :tQSYMBOLS_BEG, :tWORDS_BEG, :tQWORDS_BEG
      if (next_token = lexed[index][0]) && next_token.type == :WORDS_SEP
        index += 1
      end
      quote_stack.push(value)
    when :tREGEXP_BEG
      quote_stack.push(value)
    end

    tokens << [type, [value, location]]

    if token.type == :REGEXP_END
      tokens << [:tREGEXP_OPT, [token.value[1..], range(token.location.start_offset + 1, token.location.end_offset)]]
    end
  end

  tokens
end