// packages/jmespath/src/Lexer.ts

import {
  SIMPLE_TOKENS,
  START_IDENTIFIER,
  VALID_IDENTIFIER,
  VALID_NUMBER,
  WHITESPACE,
} from './constants.js';
import { EmptyExpressionError, LexerError } from './errors.js';
import type { Token } from './types.js';

/**
 * A lexer for JMESPath expressions.
 *
 * The lexer walks the expression one character (Unicode code point) at a time
 * and turns it into a sequence of tokens terminated by an `eof` token.
 *
 * Every token carries a `start` and an `end` offset. Both are indexes into the
 * code-point sequence of the expression and `end` is exclusive, so
 * `end - start` is the number of characters the token spans.
 *
 * A single instance can be reused: each call to `tokenize` resets the state.
 */
class Lexer {
  /** Index of `#current` in `#chars`; equals `#length` once the input is exhausted. */
  #position!: number;
  /** The expression split into code points (so astral characters stay whole). */
  #chars!: string[];
  /** The character under the cursor, or the empty string at end of input. */
  #current!: string;
  /** Number of entries in `#chars`. */
  #length!: number;

  /**
   * Tokenize a JMESPath expression.
   *
   * This method is a generator that yields the tokens of the given expression
   * in order, followed by a final `eof` token.
   *
   * @param expression The JMESPath expression to tokenize.
   * @throws EmptyExpressionError when `expression` is not a string.
   * @throws LexerError when a character or sequence cannot be tokenized.
   */
  public *tokenize(expression: string): Generator<Token> {
    this.#initializeForExpression(expression);
    // `#current` is the empty string once the end of the expression is reached.
    while (this.#current !== '') {
      // Look the character up once; a hit means it is a single-character token.
      const simpleTokenType = SIMPLE_TOKENS.get(this.#current);
      if (simpleTokenType !== undefined) {
        yield {
          type: simpleTokenType,
          value: this.#current,
          start: this.#position,
          end: this.#position + 1,
        };

        this.#next();
      } else if (START_IDENTIFIER.has(this.#current)) {
        yield this.#consumeIdentifier();
      } else if (WHITESPACE.has(this.#current)) {
        // Whitespace separates tokens but never produces one.
        this.#next();
      } else if (this.#current === '[') {
        yield this.#consumeSquareBracket();
      } else if (this.#current === `'`) {
        yield this.#consumeRawStringLiteral();
      } else if (this.#current === '`') {
        yield this.#consumeLiteral();
      } else if (VALID_NUMBER.has(this.#current)) {
        const start = this.#position;
        const buff = this.#consumeNumber();
        yield {
          type: 'number',
          value: Number.parseInt(buff, 10),
          start: start,
          end: start + buff.length,
        };
      } else if (this.#current === '-') {
        yield this.#consumeNegativeNumber();
      } else if (this.#current === '"') {
        yield this.#consumeQuotedIdentifier();
      } else if (['<', '>', '!', '=', '|', '&'].includes(this.#current)) {
        yield this.#consumeComparatorSigns(
          this.#current as '<' | '>' | '!' | '=' | '|' | '&'
        );
      } else {
        throw new LexerError(this.#position, this.#current);
      }
    }
    yield { type: 'eof', value: '', start: this.#length, end: this.#length };
  }

  /**
   * Consume a comparator or logical operator.
   *
   * Dispatches on the operator's first character; each operator is either a
   * one-character token or, when followed by a specific second character,
   * a two-character token.
   *
   * @param current The current character
   * @throws LexerError when a lone `=` is found (only `==` is valid).
   */
  #consumeComparatorSigns(current: '<' | '>' | '!' | '=' | '|' | '&'): Token {
    switch (current) {
      case '<':
        return this.#matchOrElse('=', 'lte', 'lt');
      case '>':
        return this.#matchOrElse('=', 'gte', 'gt');
      case '!':
        return this.#matchOrElse('=', 'ne', 'not');
      case '|':
        return this.#matchOrElse('|', 'or', 'pipe');
      case '&':
        return this.#matchOrElse('&', 'and', 'expref');
      default:
        return this.#consumeEqualSign();
    }
  }

  /**
   * Consume an equal sign.
   *
   * This method is called when the lexer encounters an equal sign.
   * It checks if the next character is also an equal sign and returns
   * the `eq` token; a single `=` is not a valid token.
   *
   * @throws LexerError positioned at the `=` when it is not followed by `=`.
   */
  #consumeEqualSign(): Token {
    // Remember where the operator begins before the cursor moves.
    const start = this.#position;
    if (this.#next() === '=') {
      this.#next();

      return {
        type: 'eq',
        value: '==',
        start,
        end: start + 2,
      };
    }
    throw new LexerError(start, '=');
  }

  /**
   * Consume an unquoted identifier.
   *
   * This method is called when the lexer encounters a character that can
   * start an identifier. It advances the lexer until it finds a character
   * that is not valid inside an identifier and returns the corresponding token.
   */
  #consumeIdentifier(): Token {
    const start = this.#position;
    let buff = this.#current;
    while (VALID_IDENTIFIER.has(this.#next())) {
      buff += this.#current;
    }

    return {
      type: 'unquoted_identifier',
      value: buff,
      start,
      end: start + buff.length,
    };
  }

  /**
   * Consume a negative number.
   *
   * This method is called when the lexer encounters a negative sign.
   * It checks if the next character is a number and returns the corresponding token.
   *
   * @throws LexerError when the `-` is not followed by at least one digit.
   */
  #consumeNegativeNumber(): Token {
    const start = this.#position;
    // `buff` starts with the `-` itself, so a length of 1 means "no digits".
    const buff = this.#consumeNumber();
    if (buff.length > 1) {
      return {
        type: 'number',
        value: Number.parseInt(buff, 10),
        start: start,
        end: start + buff.length,
      };
    }
    // If the negative sign is not followed by a number, it is an error.
    throw new LexerError(start, 'Unknown token after "-"');
  }

  /**
   * Consume a run of digits.
   *
   * The returned text begins with the current character (a digit or a
   * leading `-`) and continues until a character that is not a digit
   * is found.
   */
  #consumeNumber(): string {
    let buff = this.#current;
    while (VALID_NUMBER.has(this.#next())) {
      buff += this.#current;
    }

    return buff;
  }

  /**
   * Consume a square bracket.
   *
   * This method is called when the lexer encounters an opening square
   * bracket. It checks if the next character is a closing square bracket
   * (`[]`, flatten) or a question mark (`[?`, filter) and returns the
   * corresponding token; otherwise it returns a plain `lbracket`.
   */
  #consumeSquareBracket(): Token {
    const start = this.#position;
    const nextChar = this.#next();
    if (nextChar === ']') {
      this.#next();

      return { type: 'flatten', value: '[]', start: start, end: start + 2 };
    }
    if (nextChar === '?') {
      this.#next();

      return { type: 'filter', value: '[?', start: start, end: start + 2 };
    }
    return { type: 'lbracket', value: '[', start: start, end: start + 1 };
  }

  /**
   * Initializes the lexer for the given expression.
   *
   * We use a separate method for this instead of the constructor
   * because we want to be able to reuse the same lexer instance
   * and also because we want to be able to expose a public API
   * for tokenizing expressions like `new Lexer().tokenize(expression)`.
   *
   * @param expression The JMESPath expression to tokenize.
   * @throws EmptyExpressionError when `expression` is not a string.
   */
  #initializeForExpression(expression: string): void {
    if (typeof expression !== 'string') {
      throw new EmptyExpressionError();
    }

    this.#position = 0;
    // Split by code point so a surrogate pair is handled as one character.
    this.#chars = Array.from(expression);
    // Measure in the same unit `#position` indexes, not in UTF-16 code units.
    this.#length = this.#chars.length;
    this.#current = this.#length > 0 ? this.#chars[0] : '';
  }

  /**
   * Advance the lexer to the next character in the expression.
   *
   * Once the last character has been passed, `#position` rests at `#length`
   * and `#current` is the empty string; further calls are no-ops.
   *
   * @returns The new current character (empty string at end of input).
   */
  #next(): string {
    if (this.#position < this.#length) {
      this.#position += 1;
    }
    this.#current =
      this.#position < this.#length ? this.#chars[this.#position] : '';

    return this.#current;
  }

  /**
   * Consume until the given delimiter is reached allowing
   * for escaping of the delimiter with a backslash (`\`).
   *
   * The cursor must be on the opening delimiter when this is called and is
   * left just past the closing delimiter. Backslashes are kept in the
   * returned text; callers decide how to unescape.
   *
   * @param delimiter The delimiter to consume until.
   * @returns The raw text between the delimiters.
   * @throws LexerError when the input ends before the closing delimiter.
   */
  #consumeUntil(delimiter: string): string {
    const start = this.#position;
    let buff = '';
    this.#next();
    while (this.#current !== delimiter) {
      if (this.#current === '\\') {
        // Keep the backslash and take the following character verbatim, so
        // an escaped delimiter does not terminate the token.
        buff += '\\';
        this.#next();
      }
      if (this.#current === '') {
        // We've reached the end of the expression (EOF) before
        // we found the delimiter. This is an error.
        throw new LexerError(start, this.#chars.slice(start).join(''));
      }
      buff += this.#current;
      this.#next();
    }
    // Skip the closing delimiter
    this.#next();

    return buff;
  }

  /**
   * Process a literal.
   *
   * A literal is a JSON value that is enclosed in backticks. Every escaped
   * backtick (`` \` ``) inside it is unescaped before the JSON is parsed.
   *
   * @throws LexerError when the content is not valid JSON.
   */
  #consumeLiteral(): Token {
    const start = this.#position;
    // Global regex: a string pattern would only replace the first occurrence.
    const lexeme = this.#consumeUntil('`').replace(/\\`/g, '`');
    try {
      const parsedJson = JSON.parse(lexeme);

      return {
        type: 'literal',
        value: parsedJson,
        start,
        end: this.#position,
      };
    } catch {
      throw new LexerError(start, lexeme);
    }
  }

  /**
   * Process a quoted identifier.
   *
   * A quoted identifier is a string that is enclosed in double quotes. It is
   * decoded as a JSON string, so JSON escape sequences are honoured.
   *
   * @throws LexerError when the content is not a valid JSON string.
   */
  #consumeQuotedIdentifier(): Token {
    const start = this.#position;
    const lexeme = `"${this.#consumeUntil('"')}"`;
    try {
      return {
        type: 'quoted_identifier',
        value: JSON.parse(lexeme),
        start,
        end: this.#position,
      };
    } catch {
      // Surface malformed escapes as a lexing problem, like literals do,
      // instead of leaking a raw SyntaxError.
      throw new LexerError(start, lexeme);
    }
  }

  /**
   * Process a raw string literal.
   *
   * A raw string literal is a string that is enclosed in single quotes. The
   * only processing applied is turning every `\'` into `'`.
   */
  #consumeRawStringLiteral(): Token {
    const start = this.#position;
    // Global regex: a string pattern would only replace the first occurrence.
    const lexeme = this.#consumeUntil(`'`).replace(/\\'/g, `'`);

    return {
      type: 'literal',
      value: lexeme,
      start,
      end: this.#position,
    };
  }

  /**
   * Match the expected character and return the corresponding token type.
   *
   * The cursor must be on the operator's first character. If the following
   * character equals `expected`, both are consumed as one two-character
   * token; otherwise only the first character is consumed.
   *
   * @param expected The expected character
   * @param matchType The token type to return if the expected character is found
   * @param elseType  The token type to return if the expected character is not found
   */
  #matchOrElse(
    expected: string,
    matchType: Token['type'],
    elseType: Token['type']
  ): Token {
    const start = this.#position;
    const current = this.#current;
    const nextChar = this.#next();
    if (nextChar === expected) {
      this.#next();

      return {
        type: matchType,
        value: current + nextChar,
        start,
        end: start + 2,
      };
    }

    return {
      type: elseType,
      value: current,
      start,
      end: start + 1,
    };
  }
}

export { Lexer };