
Commit 9781563

Add fast-path for comment detection (#9808)
## Summary

When we fall through to parsing, the comment-detection rule accounts for a significant portion of lint time. This PR adds an additional fast heuristic: we bail out if a comment contains two consecutive name tokens (detected via the zero-allocation lexer), since two adjacent identifiers can never parse as valid Python. For `ctypeslib.py`, which contains a few cases that this heuristic now catches, the rule is 2.5x faster (and token-based rules are 20% faster overall).
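The gist of the fast path, as a minimal sketch mirroring the `detection.rs` diff below (the helper name `has_consecutive_names` is illustrative, not from the commit):

```rust
// Sketch of the fast path, using the APIs from the diff below. Two adjacent
// identifiers (e.g., `to print`) can never be valid Python, so the expensive
// parse can be skipped entirely.
use itertools::Itertools;
use ruff_python_trivia::{SimpleTokenKind, SimpleTokenizer};
use ruff_text_size::TextSize;

fn has_consecutive_names(line: &str) -> bool {
    SimpleTokenizer::starts_at(TextSize::default(), line)
        .skip_trivia() // ignore whitespace between tokens
        .tuple_windows()
        .any(|(first, second)| {
            first.kind == SimpleTokenKind::Name && second.kind == SimpleTokenKind::Name
        })
}
```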
1 parent 84aea7f · 8 files changed (+157, -8 lines)

crates/ruff_linter/src/rules/eradicate/detection.rs

Lines changed: 14 additions & 3 deletions
@@ -1,14 +1,16 @@
 /// See: [eradicate.py](https://github.com/myint/eradicate/blob/98f199940979c94447a461d50d27862b118b282d/eradicate.py)
 use aho_corasick::AhoCorasick;
+use itertools::Itertools;
 use once_cell::sync::Lazy;
 use regex::{Regex, RegexSet};
 
 use ruff_python_parser::parse_suite;
+use ruff_python_trivia::{SimpleTokenKind, SimpleTokenizer};
+use ruff_text_size::TextSize;
 
 static CODE_INDICATORS: Lazy<AhoCorasick> = Lazy::new(|| {
     AhoCorasick::new([
-        "(", ")", "[", "]", "{", "}", ":", "=", "%", "print", "return", "break", "continue",
-        "import",
+        "(", ")", "[", "]", "{", "}", ":", "=", "%", "return", "break", "continue", "import",
     ])
     .unwrap()
 });
@@ -44,6 +46,14 @@ pub(crate) fn comment_contains_code(line: &str, task_tags: &[String]) -> bool {
         return false;
     }
 
+    // Fast path: if the comment contains consecutive identifiers, we know it won't parse.
+    let tokenizer = SimpleTokenizer::starts_at(TextSize::default(), line).skip_trivia();
+    if tokenizer.tuple_windows().any(|(first, second)| {
+        first.kind == SimpleTokenKind::Name && second.kind == SimpleTokenKind::Name
+    }) {
+        return false;
+    }
+
     // Ignore task tag comments (e.g., "# TODO(tom): Refactor").
     if line
         .split(&[' ', ':', '('])
@@ -123,9 +133,10 @@ mod tests {
 
     #[test]
     fn comment_contains_code_with_print() {
-        assert!(comment_contains_code("#print", &[]));
         assert!(comment_contains_code("#print(1)", &[]));
 
+        assert!(!comment_contains_code("#print", &[]));
+        assert!(!comment_contains_code("#print 1", &[]));
         assert!(!comment_contains_code("#to print", &[]));
     }

crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__identifier_ending_in_non_start_char.snap

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ expression: test_case.tokens()
 ---
 [
     SimpleToken {
-        kind: Other,
+        kind: Name,
         range: 0..2,
     },
 ]

crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__identifier_starting_with_string_kind.snap

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+---
+source: crates/ruff_python_trivia/src/tokenizer.rs
+expression: test_case.tokens()
+---
+[
+    SimpleToken {
+        kind: Name,
+        range: 0..3,
+    },
+    SimpleToken {
+        kind: Whitespace,
+        range: 3..4,
+    },
+    SimpleToken {
+        kind: Name,
+        range: 4..7,
+    },
+]

crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__string_with_byte_kind.snap

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+---
+source: crates/ruff_python_trivia/src/tokenizer.rs
+expression: test_case.tokens()
+---
+[
+    SimpleToken {
+        kind: Other,
+        range: 0..2,
+    },
+    SimpleToken {
+        kind: Bogus,
+        range: 2..7,
+    },
+]

crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__string_with_invalid_kind.snap

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+---
+source: crates/ruff_python_trivia/src/tokenizer.rs
+expression: test_case.tokens()
+---
+[
+    SimpleToken {
+        kind: Name,
+        range: 0..3,
+    },
+    SimpleToken {
+        kind: Other,
+        range: 3..4,
+    },
+    SimpleToken {
+        kind: Bogus,
+        range: 4..8,
+    },
+]

crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__string_with_kind.snap

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+---
+source: crates/ruff_python_trivia/src/tokenizer.rs
+expression: test_case.tokens()
+---
+[
+    SimpleToken {
+        kind: Other,
+        range: 0..1,
+    },
+    SimpleToken {
+        kind: Bogus,
+        range: 1..6,
+    },
+]

crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__tricky_unicode.snap

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ expression: test_case.tokens()
 ---
 [
     SimpleToken {
-        kind: Other,
+        kind: Name,
         range: 0..6,
     },
 ]

crates/ruff_python_trivia/src/tokenizer.rs

Lines changed: 77 additions & 3 deletions
@@ -182,7 +182,7 @@ fn to_keyword_or_other(source: &str) -> SimpleTokenKind {
         "case" => SimpleTokenKind::Case,
         "with" => SimpleTokenKind::With,
         "yield" => SimpleTokenKind::Yield,
-        _ => SimpleTokenKind::Other, // Potentially an identifier, but only if it isn't a string prefix. We can ignore this for now https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
+        _ => SimpleTokenKind::Name, // Potentially an identifier, but only if it isn't a string prefix. The caller (SimpleTokenizer) is responsible for enforcing that constraint.
     }
 }

@@ -467,6 +467,9 @@ pub enum SimpleTokenKind {
     /// `yield`
     Yield,
 
+    /// An identifier or keyword.
+    Name,
+
     /// Any other non trivia token.
     Other,

@@ -566,10 +569,42 @@ impl<'a> SimpleTokenizer<'a> {
         let range = TextRange::at(self.offset, token_len);
         let kind = to_keyword_or_other(&self.source[range]);
 
-        if kind == SimpleTokenKind::Other {
+        // If the next character is a quote, we may be in a string prefix. For example:
+        // `f"foo`.
+        if kind == SimpleTokenKind::Name
+            && matches!(self.cursor.first(), '"' | '\'')
+            && matches!(
+                &self.source[range],
+                "B" | "BR"
+                    | "Br"
+                    | "F"
+                    | "FR"
+                    | "Fr"
+                    | "R"
+                    | "RB"
+                    | "RF"
+                    | "Rb"
+                    | "Rf"
+                    | "U"
+                    | "b"
+                    | "bR"
+                    | "br"
+                    | "f"
+                    | "fR"
+                    | "fr"
+                    | "r"
+                    | "rB"
+                    | "rF"
+                    | "rb"
+                    | "rf"
+                    | "u"
+            )
+        {
             self.bogus = true;
+            SimpleTokenKind::Other
+        } else {
+            kind
         }
-        kind
     }
 
     // Space, tab, or form feed. We ignore the true semantics of form feed, and treat it as
@@ -1153,6 +1188,45 @@ mod tests {
         test_case.assert_reverse_tokenization();
     }
 
+    #[test]
+    fn string_with_kind() {
+        let source = "f'foo'";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+
+        // note: not reversible: [other, bogus] vs [bogus, other]
+    }
+
+    #[test]
+    fn string_with_byte_kind() {
+        let source = "BR'foo'";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+
+        // note: not reversible: [other, bogus] vs [bogus, other]
+    }
+
+    #[test]
+    fn string_with_invalid_kind() {
+        let source = "abc'foo'";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+
+        // note: not reversible: [other, bogus] vs [bogus, other]
+    }
+
+    #[test]
+    fn identifier_starting_with_string_kind() {
+        let source = "foo bar";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+        test_case.assert_reverse_tokenization();
+    }
+
     #[test]
     fn ignore_word_with_only_id_continuing_chars() {
         let source = "555";

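As a rough usage sketch (not part of the commit), the snapshots above imply the following observable behavior; it assumes `SimpleTokenizer` iterates `SimpleToken` values with public `kind` fields, as the diffs suggest:

```rust
// Minimal sketch of the new tokenizer behavior, based on the snapshots above.
use ruff_python_trivia::{SimpleTokenKind, SimpleTokenizer};
use ruff_text_size::TextSize;

fn main() {
    // Plain identifiers now lex as `Name` (previously `Other`).
    let kinds: Vec<SimpleTokenKind> = SimpleTokenizer::starts_at(TextSize::default(), "foo bar")
        .map(|token| token.kind)
        .collect();
    assert_eq!(
        kinds,
        [
            SimpleTokenKind::Name,
            SimpleTokenKind::Whitespace,
            SimpleTokenKind::Name,
        ]
    );

    // A string prefix followed by a quote still lexes as `Other`, and the
    // tokenizer gives up on the rest of the input (`Bogus`).
    let kinds: Vec<SimpleTokenKind> = SimpleTokenizer::starts_at(TextSize::default(), "f'foo'")
        .map(|token| token.kind)
        .collect();
    assert_eq!(kinds, [SimpleTokenKind::Other, SimpleTokenKind::Bogus]);
}
```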