class Prism::Translation::Parser::Lexer

Accepts a list of prism tokens and converts them into the expected format for the parser gem.
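
A minimal sketch of driving the lexer by hand, assuming both the prism and parser gems are available. Prism::Translation::Parser wires this up internally, so the exact entry points differ; the identity offset cache below also assumes ASCII-only source.

require "prism"
require "parser"

source = "foo = 1 + 2\n"

# The parser gem's buffer for the source being lexed.
buffer = Parser::Source::Buffer.new("(example)", source: source)

# Prism.lex returns [token, lex_state] pairs, the format this lexer expects.
lexed = Prism.lex(source).value

# An identity mapping suffices for ASCII source; multibyte source needs a real
# byte-offset-to-character-offset cache (see the sketch after ::new below).
offset_cache = (0..source.bytesize).to_h { |offset| [offset, offset] }

tokens = Prism::Translation::Parser::Lexer.new(buffer, lexed, offset_cache).to_a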

Attributes

lexed

An array of tuples that contain prism tokens and their associated lex state when they were lexed.

offset_cache

A hash that maps offsets in bytes to offsets in characters.

source_buffer

The Parser::Source::Buffer that the tokens were lexed from.
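
Each entry in lexed pairs a Prism::Token with the integer lex state that was active when the token was produced. For example (return values are illustrative):

token, state = Prism.lex("foo = 1").value.first
token.type  # => :IDENTIFIER
token.value # => "foo"
state       # => an Integer encoding lex state flags such as EXPR_BEG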

Public Class Methods

Initialize the lexer with the given source buffer, prism tokens, and offset cache.

# File lib/prism/translation/parser/lexer.rb, line 229
def initialize(source_buffer, lexed, offset_cache)
  @source_buffer = source_buffer
  @lexed = lexed
  @offset_cache = offset_cache
end
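
Prism reports byte offsets while the parser gem works in character offsets; the offset cache bridges the two. A hypothetical helper for building one (not part of this class) could look like:

# Map every byte offset in the source to the character offset containing it.
def build_offset_cache(source)
  offset_cache = {}
  byte_offset = 0

  source.each_char.with_index do |char, char_offset|
    char.bytesize.times do
      offset_cache[byte_offset] = char_offset
      byte_offset += 1
    end
  end

  # The offset just past the last byte maps to the character length.
  offset_cache[byte_offset] = source.length
  offset_cache
end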

Public Instance Methods

Convert the prism tokens into the expected format for the parser gem.

# File lib/prism/translation/parser/lexer.rb, line 239
def to_a
  tokens = []

  index = 0
  length = lexed.length

  heredoc_stack = []
  quote_stack = []

  # The parser gem emits the newline tokens for comments out of order. This saves
  # that token location to emit at a later time to properly line everything up.
  # https://github.com/whitequark/parser/issues/1025
  comment_newline_location = nil

  while index < length
    token, state = lexed[index]
    index += 1
    next if TYPES_ALWAYS_SKIP.include?(token.type)

    type = TYPES.fetch(token.type)
    value = token.value
    location = range(token.location.start_offset, token.location.end_offset)

    case type
    when :kDO
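      # A `do` that closes a lambda (`-> do ... end`) must become kDO_LAMBDA for
      # the parser gem, so scan backwards for the nearest lambda-related token.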
      nearest_lambda_token = tokens.reverse_each.find do |token|
        LAMBDA_TOKEN_TYPES.include?(token.first)
      end

      if nearest_lambda_token&.first == :tLAMBDA
        type = :kDO_LAMBDA
      end
    when :tCHARACTER
      value.delete_prefix!("?")
      # Character literals behave similarly to double-quoted strings. We can use the same escaping mechanism.
      value = unescape_string(value, "?")
    when :tCOMMENT
      if token.type == :EMBDOC_BEGIN

        while !((next_token = lexed[index][0]) && next_token.type == :EMBDOC_END) && (index < length - 1)
          value += next_token.value
          index += 1
        end

        value += next_token.value
        location = range(token.location.start_offset, lexed[index][0].location.end_offset)
        index += 1
      else
        is_at_eol = value.chomp!.nil?
        location = range(token.location.start_offset, token.location.end_offset + (is_at_eol ? 0 : -1))

        prev_token = lexed[index - 2][0] if index - 2 >= 0
        next_token = lexed[index][0]

        is_inline_comment = prev_token&.location&.start_line == token.location.start_line
        if is_inline_comment && !is_at_eol && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
          tokens << [:tCOMMENT, [value, location]]

          nl_location = range(token.location.end_offset - 1, token.location.end_offset)
          tokens << [:tNL, [nil, nl_location]]
          next
        elsif is_inline_comment && next_token&.type == :COMMENT
          comment_newline_location = range(token.location.end_offset - 1, token.location.end_offset)
        elsif comment_newline_location && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
          tokens << [:tCOMMENT, [value, location]]
          tokens << [:tNL, [nil, comment_newline_location]]
          comment_newline_location = nil
          next
        end
      end
    when :tNL
      next_token = lexed[index][0]
      # Newlines after comments are emitted out of order.
      if next_token&.type == :COMMENT
        comment_newline_location = location
        next
      end

      value = nil
    when :tFLOAT
      value = parse_float(value)
    when :tIMAGINARY
      value = parse_complex(value)
    when :tINTEGER
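      # The parser gem emits a leading `+` as a separate tUNARY_NUM token, so
      # split it off here and shrink the integer's own location accordingly.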
      if value.start_with?("+")
        tokens << [:tUNARY_NUM, ["+", range(token.location.start_offset, token.location.start_offset + 1)]]
        location = range(token.location.start_offset + 1, token.location.end_offset)
      end

      value = parse_integer(value)
    when :tLABEL
      value.chomp!(":")
    when :tLABEL_END
      value.chomp!(":")
    when :tLCURLY
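      # In the EXPR_BEG|EXPR_LABEL state this brace opens a hash literal, which
      # the parser gem represents as tLBRACE rather than tLCURLY.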
      type = :tLBRACE if state == EXPR_BEG | EXPR_LABEL
    when :tLPAREN2
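      # The parser gem uses tLPAREN for a grouping parenthesis that starts an
      # expression and tLPAREN2 for a method call's argument list.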
      type = :tLPAREN if tokens.empty? || LPAREN_CONVERSION_TOKEN_TYPES.include?(tokens.dig(-1, 0))
    when :tNTH_REF
      value = parse_integer(value.delete_prefix("$"))
    when :tOP_ASGN
      value.chomp!("=")
    when :tRATIONAL
      value = parse_rational(value)
    when :tSPACE
      location = range(token.location.start_offset, token.location.start_offset + percent_array_leading_whitespace(value))
      value = nil
    when :tSTRING_BEG
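      # Empty strings and simple single-part strings collapse into one tSTRING
      # token; otherwise the opening delimiter is pushed onto the quote stack
      # (and heredoc openers also onto the heredoc stack) so later content
      # tokens know how to unescape their values.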
      next_token = lexed[index][0]
      next_next_token = lexed[index + 1][0]
      basic_quotes = value == '"' || value == "'"

      if basic_quotes && next_token&.type == :STRING_END
        next_location = token.location.join(next_token.location)
        type = :tSTRING
        value = ""
        location = range(next_location.start_offset, next_location.end_offset)
        index += 1
      elsif value.start_with?("'", '"', "%")
        if next_token&.type == :STRING_CONTENT && next_next_token&.type == :STRING_END
          string_value = next_token.value
          if simplify_string?(string_value, value)
            next_location = token.location.join(next_next_token.location)
            if percent_array?(value)
              value = percent_array_unescape(string_value)
            else
              value = unescape_string(string_value, value)
            end
            type = :tSTRING
            location = range(next_location.start_offset, next_location.end_offset)
            index += 2
            tokens << [type, [value, location]]

            next
          end
        end

        quote_stack.push(value)
      elsif token.type == :HEREDOC_START
        quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2]
        heredoc_type = value[2] == "-" || value[2] == "~" ? value[2] : ""
        heredoc = HeredocData.new(
          identifier: value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier],
          common_whitespace: 0,
        )

        if quote == "`"
          type = :tXSTRING_BEG
        end

        # The parser gem trims whitespace from squiggly heredocs. We must record
        # the most common whitespace to later remove.
        if heredoc_type == "~" || heredoc_type == "`"
          heredoc.common_whitespace = calculate_heredoc_whitespace(index)
        end

        if quote == "'" || quote == '"' || quote == "`"
          value = "<<#{quote}"
        else
          value = '<<"'
        end

        heredoc_stack.push(heredoc)
        quote_stack.push(value)
      end
    when :tSTRING_CONTENT
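      # String content needs the most work: escapes are resolved against the
      # opening delimiter on the quote stack, squiggly heredoc indentation is
      # trimmed, and line continuations are joined or split to match the parser
      # gem's output.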
      is_percent_array = percent_array?(quote_stack.last)

      if (lines = token.value.lines).one?
        # Prism usually emits a single token for strings with line continuations.
        # For squiggly heredocs they are not joined so we do that manually here.
        current_string = +""
        current_length = 0
        start_offset = token.location.start_offset
        while token.type == :STRING_CONTENT
          current_length += token.value.bytesize
          # Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
          is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
          # The parser gem only removes indentation when the heredoc is not nested
          not_nested = heredoc_stack.size == 1
          if is_percent_array
            value = percent_array_unescape(token.value)
          elsif is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
            value = trim_heredoc_whitespace(token.value, current_heredoc)
          end

          current_string << unescape_string(value, quote_stack.last)
          if (backslash_count = token.value[/(\\{1,})\n/, 1]&.length).nil? || backslash_count.even? || !interpolation?(quote_stack.last)
            tokens << [:tSTRING_CONTENT, [current_string, range(start_offset, start_offset + current_length)]]
            break
          end
          token = lexed[index][0]
          index += 1
        end
      else
        # When the parser gem encounters a line continuation inside a multiline string,
        # it emits a single string node. The backslash (and remaining newline) is removed.
        current_line = +""
        adjustment = 0
        start_offset = token.location.start_offset
        emit = false

        lines.each.with_index do |line, index|
          chomped_line = line.chomp
          backslash_count = chomped_line[/\\{1,}\z/]&.length || 0
          is_interpolation = interpolation?(quote_stack.last)

          if backslash_count.odd? && (is_interpolation || is_percent_array)
            if is_percent_array
              current_line << percent_array_unescape(line)
              adjustment += 1
            else
              chomped_line.delete_suffix!("\\")
              current_line << chomped_line
              adjustment += 2
            end
            # If the string ends with a line continuation, emit the remainder
            emit = index == lines.count - 1
          else
            current_line << line
            emit = true
          end

          if emit
            end_offset = start_offset + current_line.bytesize + adjustment
            tokens << [:tSTRING_CONTENT, [unescape_string(current_line, quote_stack.last), range(start_offset, end_offset)]]
            start_offset = end_offset
            current_line = +""
            adjustment = 0
          end
        end
      end
      next
    when :tSTRING_DVAR
      value = nil
    when :tSTRING_END
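      # Prism includes the trailing newline in a heredoc terminator, so trim it
      # and restore the identifier; for a regexp only the closing delimiter
      # belongs here (the flags are emitted separately as tREGEXP_OPT below).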
      if token.type == :HEREDOC_END && value.end_with?("\n")
        newline_length = value.end_with?("\r\n") ? 2 : 1
        value = heredoc_stack.pop.identifier
        location = range(token.location.start_offset, token.location.end_offset - newline_length)
      elsif token.type == :REGEXP_END
        value = value[0]
        location = range(token.location.start_offset, token.location.start_offset + 1)
      end

      if percent_array?(quote_stack.pop)
        prev_token = lexed[index - 2][0] if index - 2 >= 0
        empty = %i[PERCENT_LOWER_I PERCENT_LOWER_W PERCENT_UPPER_I PERCENT_UPPER_W].include?(prev_token&.type)
        ends_with_whitespace = prev_token&.type == :WORDS_SEP
        # The parser gem always emits a space token after content in a percent array, even if no actual whitespace is present.
        if !empty && !ends_with_whitespace
          tokens << [:tSPACE, [nil, range(token.location.start_offset, token.location.start_offset)]]
        end
      end
    when :tSYMBEG
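      # A plain symbol like :foo collapses the opening colon and the name into a
      # single tSYMBOL token; dynamic symbols keep tSYMBEG and push the opener
      # onto the quote stack instead.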
      if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR && next_token.type != :STRING_END
        next_location = token.location.join(next_token.location)
        type = :tSYMBOL
        value = next_token.value
        value = { "~@" => "~", "!@" => "!" }.fetch(value, value)
        location = range(next_location.start_offset, next_location.end_offset)
        index += 1
      else
        quote_stack.push(value)
      end
    when :tFID
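      # A name ending in ? or ! is lexed as tFID, but immediately after `def`
      # the parser gem expects a plain tIDENTIFIER.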
      if !tokens.empty? && tokens.dig(-1, 0) == :kDEF
        type = :tIDENTIFIER
      end
    when :tXSTRING_BEG
      if (next_token = lexed[index][0]) && !%i[STRING_CONTENT STRING_END EMBEXPR_BEGIN].include?(next_token.type)
        # self.`()
        type = :tBACK_REF2
      end
      quote_stack.push(value)
    when :tSYMBOLS_BEG, :tQSYMBOLS_BEG, :tWORDS_BEG, :tQWORDS_BEG
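      # The parser gem does not emit a separator immediately after a
      # %w/%W/%i/%I opener, so skip prism's leading WORDS_SEP when present.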
      if (next_token = lexed[index][0]) && next_token.type == :WORDS_SEP
        index += 1
      end

      quote_stack.push(value)
    when :tREGEXP_BEG
      quote_stack.push(value)
    end

    tokens << [type, [value, location]]

    if token.type == :REGEXP_END
      tokens << [:tREGEXP_OPT, [token.value[1..], range(token.location.start_offset + 1, token.location.end_offset)]]
    end
  end

  tokens
end
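
The result is an array of [type, [value, range]] tuples in the parser gem's format, where each range is a Parser::Source::Range into the source buffer. For example, the source "foo = 1" lexes to roughly the following (ranges abbreviated; a trailing newline token may also appear):

[
  [:tIDENTIFIER, ["foo", #<Parser::Source::Range (example) 0...3>]],
  [:tEQL,        ["=",   #<Parser::Source::Range (example) 4...5>]],
  [:tINTEGER,    [1,     #<Parser::Source::Range (example) 6...7>]]
]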