
Commit 9781563

Add fast-path for comment detection (#9808)
## Summary

When we fall through to parsing, the comment-detection rule accounts for a significant portion of lint time. This PR adds an additional fast heuristic: we bail out if a comment contains two consecutive name tokens (detected via the zero-allocation lexer), since two adjacent identifiers can never parse as valid Python. For `ctypeslib.py`, which contains a few cases that this heuristic now catches, the rule is 2.5x faster (and token-based rules are 20% faster overall).
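The gist of the fast path, as a minimal sketch mirroring the `detection.rs` diff below (the helper name `has_consecutive_names` is illustrative, not from the commit):

```rust
// Sketch of the fast path, using the APIs from the diff below. Two adjacent
// identifiers (e.g., `to print`) can never be valid Python, so the expensive
// parse can be skipped entirely.
use itertools::Itertools;
use ruff_python_trivia::{SimpleTokenKind, SimpleTokenizer};
use ruff_text_size::TextSize;

fn has_consecutive_names(line: &str) -> bool {
    SimpleTokenizer::starts_at(TextSize::default(), line)
        .skip_trivia() // ignore whitespace between tokens
        .tuple_windows()
        .any(|(first, second)| {
            first.kind == SimpleTokenKind::Name && second.kind == SimpleTokenKind::Name
        })
}
```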
1 parent 84aea7f · 8 files changed (+157, -8 lines)

crates/ruff_linter/src/rules/eradicate/detection.rs

Lines changed: 14 additions & 3 deletions
@@ -1,14 +1,16 @@
 /// See: [eradicate.py](https://github.com/myint/eradicate/blob/98f199940979c94447a461d50d27862b118b282d/eradicate.py)
 use aho_corasick::AhoCorasick;
+use itertools::Itertools;
 use once_cell::sync::Lazy;
 use regex::{Regex, RegexSet};
 
 use ruff_python_parser::parse_suite;
+use ruff_python_trivia::{SimpleTokenKind, SimpleTokenizer};
+use ruff_text_size::TextSize;
 
 static CODE_INDICATORS: Lazy<AhoCorasick> = Lazy::new(|| {
     AhoCorasick::new([
-        "(", ")", "[", "]", "{", "}", ":", "=", "%", "print", "return", "break", "continue",
-        "import",
+        "(", ")", "[", "]", "{", "}", ":", "=", "%", "return", "break", "continue", "import",
     ])
     .unwrap()
 });
@@ -44,6 +46,14 @@ pub(crate) fn comment_contains_code(line: &str, task_tags: &[String]) -> bool {
         return false;
     }
 
+    // Fast path: if the comment contains consecutive identifiers, we know it won't parse.
+    let tokenizer = SimpleTokenizer::starts_at(TextSize::default(), line).skip_trivia();
+    if tokenizer.tuple_windows().any(|(first, second)| {
+        first.kind == SimpleTokenKind::Name && second.kind == SimpleTokenKind::Name
+    }) {
+        return false;
+    }
+
     // Ignore task tag comments (e.g., "# TODO(tom): Refactor").
     if line
         .split(&[' ', ':', '('])
@@ -123,9 +133,10 @@ mod tests {
 
     #[test]
     fn comment_contains_code_with_print() {
-        assert!(comment_contains_code("#print", &[]));
         assert!(comment_contains_code("#print(1)", &[]));
 
+        assert!(!comment_contains_code("#print", &[]));
+        assert!(!comment_contains_code("#print 1", &[]));
         assert!(!comment_contains_code("#to print", &[]));
     }

crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__identifier_ending_in_non_start_char.snap

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ expression: test_case.tokens()
 ---
 [
     SimpleToken {
-        kind: Other,
+        kind: Name,
         range: 0..2,
     },
 ]

crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__identifier_starting_with_string_kind.snap

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+---
+source: crates/ruff_python_trivia/src/tokenizer.rs
+expression: test_case.tokens()
+---
+[
+    SimpleToken {
+        kind: Name,
+        range: 0..3,
+    },
+    SimpleToken {
+        kind: Whitespace,
+        range: 3..4,
+    },
+    SimpleToken {
+        kind: Name,
+        range: 4..7,
+    },
+]

crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__string_with_byte_kind.snap

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+---
+source: crates/ruff_python_trivia/src/tokenizer.rs
+expression: test_case.tokens()
+---
+[
+    SimpleToken {
+        kind: Other,
+        range: 0..2,
+    },
+    SimpleToken {
+        kind: Bogus,
+        range: 2..7,
+    },
+]

crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__string_with_invalid_kind.snap

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+---
+source: crates/ruff_python_trivia/src/tokenizer.rs
+expression: test_case.tokens()
+---
+[
+    SimpleToken {
+        kind: Name,
+        range: 0..3,
+    },
+    SimpleToken {
+        kind: Other,
+        range: 3..4,
+    },
+    SimpleToken {
+        kind: Bogus,
+        range: 4..8,
+    },
+]

crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__string_with_kind.snap

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+---
+source: crates/ruff_python_trivia/src/tokenizer.rs
+expression: test_case.tokens()
+---
+[
+    SimpleToken {
+        kind: Other,
+        range: 0..1,
+    },
+    SimpleToken {
+        kind: Bogus,
+        range: 1..6,
+    },
+]

crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__tricky_unicode.snap

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ expression: test_case.tokens()
 ---
 [
     SimpleToken {
-        kind: Other,
+        kind: Name,
         range: 0..6,
     },
 ]

crates/ruff_python_trivia/src/tokenizer.rs

Lines changed: 77 additions & 3 deletions
@@ -182,7 +182,7 @@ fn to_keyword_or_other(source: &str) -> SimpleTokenKind {
         "case" => SimpleTokenKind::Case,
         "with" => SimpleTokenKind::With,
         "yield" => SimpleTokenKind::Yield,
-        _ => SimpleTokenKind::Other, // Potentially an identifier, but only if it isn't a string prefix. We can ignore this for now https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
+        _ => SimpleTokenKind::Name, // Potentially an identifier, but only if it isn't a string prefix. The caller (SimpleTokenizer) is responsible for enforcing that constraint.
     }
 }

@@ -467,6 +467,9 @@ pub enum SimpleTokenKind {
     /// `yield`
     Yield,
 
+    /// An identifier or keyword.
+    Name,
+
     /// Any other non trivia token.
     Other,

@@ -566,10 +569,42 @@ impl<'a> SimpleTokenizer<'a> {
         let range = TextRange::at(self.offset, token_len);
         let kind = to_keyword_or_other(&self.source[range]);
 
-        if kind == SimpleTokenKind::Other {
+        // If the next character is a quote, we may be in a string prefix. For example:
+        // `f"foo`.
+        if kind == SimpleTokenKind::Name
+            && matches!(self.cursor.first(), '"' | '\'')
+            && matches!(
+                &self.source[range],
+                "B" | "BR"
+                    | "Br"
+                    | "F"
+                    | "FR"
+                    | "Fr"
+                    | "R"
+                    | "RB"
+                    | "RF"
+                    | "Rb"
+                    | "Rf"
+                    | "U"
+                    | "b"
+                    | "bR"
+                    | "br"
+                    | "f"
+                    | "fR"
+                    | "fr"
+                    | "r"
+                    | "rB"
+                    | "rF"
+                    | "rb"
+                    | "rf"
+                    | "u"
+            )
+        {
             self.bogus = true;
+            SimpleTokenKind::Other
+        } else {
+            kind
         }
-        kind
     }
 
     // Space, tab, or form feed. We ignore the true semantics of form feed, and treat it as
@@ -1153,6 +1188,45 @@ mod tests {
         test_case.assert_reverse_tokenization();
     }
 
+    #[test]
+    fn string_with_kind() {
+        let source = "f'foo'";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+
+        // note: not reversible: [other, bogus] vs [bogus, other]
+    }
+
+    #[test]
+    fn string_with_byte_kind() {
+        let source = "BR'foo'";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+
+        // note: not reversible: [other, bogus] vs [bogus, other]
+    }
+
+    #[test]
+    fn string_with_invalid_kind() {
+        let source = "abc'foo'";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+
+        // note: not reversible: [other, bogus] vs [bogus, other]
+    }
+
+    #[test]
+    fn identifier_starting_with_string_kind() {
+        let source = "foo bar";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+        test_case.assert_reverse_tokenization();
+    }
+
     #[test]
     fn ignore_word_with_only_id_continuing_chars() {
         let source = "555";

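As a rough usage sketch (not part of the commit), the snapshots above imply the following observable behavior; it assumes `SimpleTokenizer` iterates `SimpleToken` values with public `kind` fields, as the diffs suggest:

```rust
// Minimal sketch of the new tokenizer behavior, based on the snapshots above.
use ruff_python_trivia::{SimpleTokenKind, SimpleTokenizer};
use ruff_text_size::TextSize;

fn main() {
    // Plain identifiers now lex as `Name` (previously `Other`).
    let kinds: Vec<SimpleTokenKind> = SimpleTokenizer::starts_at(TextSize::default(), "foo bar")
        .map(|token| token.kind)
        .collect();
    assert_eq!(
        kinds,
        [
            SimpleTokenKind::Name,
            SimpleTokenKind::Whitespace,
            SimpleTokenKind::Name,
        ]
    );

    // A string prefix followed by a quote still lexes as `Other`, and the
    // tokenizer gives up on the rest of the input (`Bogus`).
    let kinds: Vec<SimpleTokenKind> = SimpleTokenizer::starts_at(TextSize::default(), "f'foo'")
        .map(|token| token.kind)
        .collect();
    assert_eq!(kinds, [SimpleTokenKind::Other, SimpleTokenKind::Bogus]);
}
```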