class Prism::Translation::Parser::Lexer

Accepts a list of prism tokens and converts them into the expected format for the parser gem.
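
A minimal sketch of driving the lexer by hand, assuming both the prism and parser gems are available. Prism::Translation::Parser wires this up internally, so the exact entry points differ; the identity offset cache below also assumes ASCII-only source.

require "prism"
require "parser"

source = "foo = 1 + 2\n"

# The parser gem's buffer for the source being lexed.
buffer = Parser::Source::Buffer.new("(example)", source: source)

# Prism.lex returns [token, lex_state] pairs, the format this lexer expects.
lexed = Prism.lex(source).value

# An identity mapping suffices for ASCII source; multibyte source needs a real
# byte-offset-to-character-offset cache (see the sketch after ::new below).
offset_cache = (0..source.bytesize).to_h { |offset| [offset, offset] }

tokens = Prism::Translation::Parser::Lexer.new(buffer, lexed, offset_cache).to_a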

Attributes

lexed

An array of tuples that contain prism tokens and their associated lex state when they were lexed.

offset_cache

A hash that maps offsets in bytes to offsets in characters.

source_buffer

The Parser::Source::Buffer that the tokens were lexed from.
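
Each entry in lexed pairs a Prism::Token with the integer lex state that was active when the token was produced. For example (return values are illustrative):

token, state = Prism.lex("foo = 1").value.first
token.type  # => :IDENTIFIER
token.value # => "foo"
state       # => an Integer encoding lex state flags such as EXPR_BEG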

Public Class Methods

Initialize the lexer with the given source buffer, prism tokens, and offset cache.

# File lib/prism/translation/parser/lexer.rb, line 229
def initialize(source_buffer, lexed, offset_cache)
  @source_buffer = source_buffer
  @lexed = lexed
  @offset_cache = offset_cache
end
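
Prism reports byte offsets while the parser gem works in character offsets; the offset cache bridges the two. A hypothetical helper for building one (not part of this class) could look like:

# Map every byte offset in the source to the character offset containing it.
def build_offset_cache(source)
  offset_cache = {}
  byte_offset = 0

  source.each_char.with_index do |char, char_offset|
    char.bytesize.times do
      offset_cache[byte_offset] = char_offset
      byte_offset += 1
    end
  end

  # The offset just past the last byte maps to the character length.
  offset_cache[byte_offset] = source.length
  offset_cache
end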

Public Instance Methods

Convert the prism tokens into the expected format for the parser gem.

# File lib/prism/translation/parser/lexer.rb, line 239
def to_a
  tokens = []

  index = 0
  length = lexed.length

  heredoc_stack = []
  quote_stack = []

  # The parser gem emits the newline tokens for comments out of order. This saves
  # that token location to emit at a later time to properly line everything up.
  # https://github.com/whitequark/parser/issues/1025
  comment_newline_location = nil

  while index < length
    token, state = lexed[index]
    index += 1
    next if TYPES_ALWAYS_SKIP.include?(token.type)

    type = TYPES.fetch(token.type)
    value = token.value
    location = range(token.location.start_offset, token.location.end_offset)

    case type
    when :kDO
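      # A `do` that closes a lambda (`-> do ... end`) must become kDO_LAMBDA for
      # the parser gem, so scan backwards for the nearest lambda-related token.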
      nearest_lambda_token = tokens.reverse_each.find do |token|
        LAMBDA_TOKEN_TYPES.include?(token.first)
      end

      if nearest_lambda_token&.first == :tLAMBDA
        type = :kDO_LAMBDA
      end
    when :tCHARACTER
      value.delete_prefix!("?")
      # Character literals behave similarly to double-quoted strings. We can use the same escaping mechanism.
      value = unescape_string(value, "?")
    when :tCOMMENT
      if token.type == :EMBDOC_BEGIN

        while !((next_token = lexed[index][0]) && next_token.type == :EMBDOC_END) && (index < length - 1)
          value += next_token.value
          index += 1
        end

        value += next_token.value
        location = range(token.location.start_offset, lexed[index][0].location.end_offset)
        index += 1
      else
        is_at_eol = value.chomp!.nil?
        location = range(token.location.start_offset, token.location.end_offset + (is_at_eol ? 0 : -1))

        prev_token = lexed[index - 2][0] if index - 2 >= 0
        next_token = lexed[index][0]

        is_inline_comment = prev_token&.location&.start_line == token.location.start_line
        if is_inline_comment && !is_at_eol && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
          tokens << [:tCOMMENT, [value, location]]

          nl_location = range(token.location.end_offset - 1, token.location.end_offset)
          tokens << [:tNL, [nil, nl_location]]
          next
        elsif is_inline_comment && next_token&.type == :COMMENT
          comment_newline_location = range(token.location.end_offset - 1, token.location.end_offset)
        elsif comment_newline_location && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
          tokens << [:tCOMMENT, [value, location]]
          tokens << [:tNL, [nil, comment_newline_location]]
          comment_newline_location = nil
          next
        end
      end
    when :tNL
      next_token = lexed[index][0]
      # Newlines after comments are emitted out of order.
      if next_token&.type == :COMMENT
        comment_newline_location = location
        next
      end

      value = nil
    when :tFLOAT
      value = parse_float(value)
    when :tIMAGINARY
      value = parse_complex(value)
    when :tINTEGER
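      # The parser gem emits a leading `+` as a separate tUNARY_NUM token, so
      # split it off here and shrink the integer's own location accordingly.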
      if value.start_with?("+")
        tokens << [:tUNARY_NUM, ["+", range(token.location.start_offset, token.location.start_offset + 1)]]
        location = range(token.location.start_offset + 1, token.location.end_offset)
      end

      value = parse_integer(value)
    when :tLABEL
      value.chomp!(":")
    when :tLABEL_END
      value.chomp!(":")
    when :tLCURLY
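      # In the EXPR_BEG|EXPR_LABEL state this brace opens a hash literal, which
      # the parser gem represents as tLBRACE rather than tLCURLY.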
      type = :tLBRACE if state == EXPR_BEG | EXPR_LABEL
    when :tLPAREN2
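      # The parser gem uses tLPAREN for a grouping parenthesis that starts an
      # expression and tLPAREN2 for a method call's argument list.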
      type = :tLPAREN if tokens.empty? || LPAREN_CONVERSION_TOKEN_TYPES.include?(tokens.dig(-1, 0))
    when :tNTH_REF
      value = parse_integer(value.delete_prefix("$"))
    when :tOP_ASGN
      value.chomp!("=")
    when :tRATIONAL
      value = parse_rational(value)
    when :tSPACE
      location = range(token.location.start_offset, token.location.start_offset + percent_array_leading_whitespace(value))
      value = nil
    when :tSTRING_BEG
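      # Empty strings and simple single-part strings collapse into one tSTRING
      # token; otherwise the opening delimiter is pushed onto the quote stack
      # (and heredoc openers also onto the heredoc stack) so later content
      # tokens know how to unescape their values.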
      next_token = lexed[index][0]
      next_next_token = lexed[index + 1][0]
      basic_quotes = value == '"' || value == "'"

      if basic_quotes && next_token&.type == :STRING_END
        next_location = token.location.join(next_token.location)
        type = :tSTRING
        value = ""
        location = range(next_location.start_offset, next_location.end_offset)
        index += 1
      elsif value.start_with?("'", '"', "%")
        if next_token&.type == :STRING_CONTENT && next_next_token&.type == :STRING_END
          string_value = next_token.value
          if simplify_string?(string_value, value)
            next_location = token.location.join(next_next_token.location)
            if percent_array?(value)
              value = percent_array_unescape(string_value)
            else
              value = unescape_string(string_value, value)
            end
            type = :tSTRING
            location = range(next_location.start_offset, next_location.end_offset)
            index += 2
            tokens << [type, [value, location]]

            next
          end
        end

        quote_stack.push(value)
      elsif token.type == :HEREDOC_START
        quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2]
        heredoc_type = value[2] == "-" || value[2] == "~" ? value[2] : ""
        heredoc = HeredocData.new(
          identifier: value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier],
          common_whitespace: 0,
        )

        if quote == "`"
          type = :tXSTRING_BEG
        end

        # The parser gem trims whitespace from squiggly heredocs. We must record
        # the most common whitespace to later remove.
        if heredoc_type == "~" || heredoc_type == "`"
          heredoc.common_whitespace = calculate_heredoc_whitespace(index)
        end

        if quote == "'" || quote == '"' || quote == "`"
          value = "<<#{quote}"
        else
          value = '<<"'
        end

        heredoc_stack.push(heredoc)
        quote_stack.push(value)
      end
    when :tSTRING_CONTENT
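      # String content needs the most work: escapes are resolved against the
      # opening delimiter on the quote stack, squiggly heredoc indentation is
      # trimmed, and line continuations are joined or split to match the parser
      # gem's output.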
      is_percent_array = percent_array?(quote_stack.last)

      if (lines = token.value.lines).one?
        # Prism usually emits a single token for strings with line continuations.
        # For squiggly heredocs they are not joined so we do that manually here.
        current_string = +""
        current_length = 0
        start_offset = token.location.start_offset
        while token.type == :STRING_CONTENT
          current_length += token.value.bytesize
          # Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
          is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
          # The parser gem only removes indentation when the heredoc is not nested
          not_nested = heredoc_stack.size == 1
          if is_percent_array
            value = percent_array_unescape(token.value)
          elsif is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
            value = trim_heredoc_whitespace(token.value, current_heredoc)
          end

          current_string << unescape_string(value, quote_stack.last)
          if (backslash_count = token.value[/(\\{1,})\n/, 1]&.length).nil? || backslash_count.even? || !interpolation?(quote_stack.last)
            tokens << [:tSTRING_CONTENT, [current_string, range(start_offset, start_offset + current_length)]]
            break
          end
          token = lexed[index][0]
          index += 1
        end
      else
        # When the parser gem encounters a line continuation inside a multiline string,
        # it emits a single string node. The backslash (and remaining newline) is removed.
        current_line = +""
        adjustment = 0
        start_offset = token.location.start_offset
        emit = false

        lines.each.with_index do |line, index|
          chomped_line = line.chomp
          backslash_count = chomped_line[/\\{1,}\z/]&.length || 0
          is_interpolation = interpolation?(quote_stack.last)

          if backslash_count.odd? && (is_interpolation || is_percent_array)
            if is_percent_array
              current_line << percent_array_unescape(line)
              adjustment += 1
            else
              chomped_line.delete_suffix!("\\")
              current_line << chomped_line
              adjustment += 2
            end
            # If the string ends with a line continuation, emit the remainder
            emit = index == lines.count - 1
          else
            current_line << line
            emit = true
          end

          if emit
            end_offset = start_offset + current_line.bytesize + adjustment
            tokens << [:tSTRING_CONTENT, [unescape_string(current_line, quote_stack.last), range(start_offset, end_offset)]]
            start_offset = end_offset
            current_line = +""
            adjustment = 0
          end
        end
      end
      next
    when :tSTRING_DVAR
      value = nil
    when :tSTRING_END
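      # Prism includes the trailing newline in a heredoc terminator, so trim it
      # and restore the identifier; for a regexp only the closing delimiter
      # belongs here (the flags are emitted separately as tREGEXP_OPT below).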
      if token.type == :HEREDOC_END && value.end_with?("\n")
        newline_length = value.end_with?("\r\n") ? 2 : 1
        value = heredoc_stack.pop.identifier
        location = range(token.location.start_offset, token.location.end_offset - newline_length)
      elsif token.type == :REGEXP_END
        value = value[0]
        location = range(token.location.start_offset, token.location.start_offset + 1)
      end

      if percent_array?(quote_stack.pop)
        prev_token = lexed[index - 2][0] if index - 2 >= 0
        empty = %i[PERCENT_LOWER_I PERCENT_LOWER_W PERCENT_UPPER_I PERCENT_UPPER_W].include?(prev_token&.type)
        ends_with_whitespace = prev_token&.type == :WORDS_SEP
        # The parser gem always emits a space token after content in a percent array, even if no actual whitespace is present.
        if !empty && !ends_with_whitespace
          tokens << [:tSPACE, [nil, range(token.location.start_offset, token.location.start_offset)]]
        end
      end
    when :tSYMBEG
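      # A plain symbol like :foo collapses the opening colon and the name into a
      # single tSYMBOL token; dynamic symbols keep tSYMBEG and push the opener
      # onto the quote stack instead.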
      if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR && next_token.type != :STRING_END
        next_location = token.location.join(next_token.location)
        type = :tSYMBOL
        value = next_token.value
        value = { "~@" => "~", "!@" => "!" }.fetch(value, value)
        location = range(next_location.start_offset, next_location.end_offset)
        index += 1
      else
        quote_stack.push(value)
      end
    when :tFID
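      # A name ending in ? or ! is lexed as tFID, but immediately after `def`
      # the parser gem expects a plain tIDENTIFIER.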
      if !tokens.empty? && tokens.dig(-1, 0) == :kDEF
        type = :tIDENTIFIER
      end
    when :tXSTRING_BEG
      if (next_token = lexed[index][0]) && !%i[STRING_CONTENT STRING_END EMBEXPR_BEGIN].include?(next_token.type)
        # self.`()
        type = :tBACK_REF2
      end
      quote_stack.push(value)
    when :tSYMBOLS_BEG, :tQSYMBOLS_BEG, :tWORDS_BEG, :tQWORDS_BEG
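      # The parser gem does not emit a separator immediately after a
      # %w/%W/%i/%I opener, so skip prism's leading WORDS_SEP when present.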
      if (next_token = lexed[index][0]) && next_token.type == :WORDS_SEP
        index += 1
      end

      quote_stack.push(value)
    when :tREGEXP_BEG
      quote_stack.push(value)
    end

    tokens << [type, [value, location]]

    if token.type == :REGEXP_END
      tokens << [:tREGEXP_OPT, [token.value[1..], range(token.location.start_offset + 1, token.location.end_offset)]]
    end
  end

  tokens
end
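
The result is an array of [type, [value, range]] tuples in the parser gem's format, where each range is a Parser::Source::Range into the source buffer. For example, the source "foo = 1" lexes to roughly the following (ranges abbreviated; a trailing newline token may also appear):

[
  [:tIDENTIFIER, ["foo", #<Parser::Source::Range (example) 0...3>]],
  [:tEQL,        ["=",   #<Parser::Source::Range (example) 4...5>]],
  [:tINTEGER,    [1,     #<Parser::Source::Range (example) 6...7>]]
]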