Skip to content

Commit c28d9f7

Browse files
committed
clang: Tokenize more lazily.
Instead of converting all the tokens to utf-8 before-hand, which is costly, and allocating a new vector unconditionally (on top of the one clang already allocates), just do the tokenization more lazily. There's actually only one place in the codebase which needs the utf-8 string; all the others can just work with the byte slice from clang. This should have no behavior change, other than being faster. In particular, this halves the time spent on my machine on the test case from #1465. I'm not completely sure that this is going to be enough to make it acceptable, but we should probably do it regardless.
1 parent 698758e commit c28d9f7

File tree

3 files changed

+123
-69
lines changed

3 files changed

+123
-69
lines changed

src/clang.rs

Lines changed: 114 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -507,11 +507,9 @@ impl Cursor {
507507
let mut found_attr = false;
508508
self.visit(|cur| {
509509
if cur.kind() == CXCursor_UnexposedAttr {
510-
found_attr = cur.tokens().map(|tokens| {
511-
tokens.iter().any(|t| {
512-
t.kind == CXToken_Identifier && t.spelling == attr
513-
})
514-
}).unwrap_or(false);
510+
found_attr = cur.tokens().iter().any(|t| {
511+
t.kind == CXToken_Identifier && t.spelling() == attr.as_bytes()
512+
});
515513

516514
if found_attr {
517515
return CXChildVisit_Break;
@@ -653,64 +651,124 @@ impl Cursor {
653651
}
654652

655653
/// Gets the tokens that correspond to that cursor.
656-
pub fn tokens(&self) -> Option<Vec<Token>> {
657-
let range = self.extent();
658-
let mut tokens = vec![];
659-
unsafe {
660-
let tu = clang_Cursor_getTranslationUnit(self.x);
661-
let mut token_ptr = ptr::null_mut();
662-
let mut num_tokens: c_uint = 0;
663-
clang_tokenize(tu, range, &mut token_ptr, &mut num_tokens);
664-
if token_ptr.is_null() {
665-
return None;
666-
}
654+
pub fn tokens(&self) -> RawTokens {
655+
RawTokens::new(self)
656+
}
667657

668-
let token_array =
669-
slice::from_raw_parts(token_ptr, num_tokens as usize);
670-
for &token in token_array.iter() {
671-
let kind = clang_getTokenKind(token);
672-
let spelling =
673-
cxstring_into_string(clang_getTokenSpelling(tu, token));
658+
/// Gets the tokens that correspond to that cursor as `cexpr` tokens.
659+
pub fn cexpr_tokens(self) -> Vec<cexpr::token::Token> {
660+
use cexpr::token;
674661

675-
tokens.push(Token {
676-
kind: kind,
677-
spelling: spelling,
678-
});
662+
self.tokens().iter().filter_map(|token| {
663+
let kind = match token.kind {
664+
CXToken_Punctuation => token::Kind::Punctuation,
665+
CXToken_Literal => token::Kind::Literal,
666+
CXToken_Identifier => token::Kind::Identifier,
667+
CXToken_Keyword => token::Kind::Keyword,
668+
// NB: cexpr is not too happy about comments inside
669+
// expressions, so we strip them down here.
670+
CXToken_Comment => return None,
671+
_ => {
672+
error!("Found unexpected token kind: {:?}", token);
673+
return None;
674+
}
675+
};
676+
677+
Some(token::Token {
678+
kind,
679+
raw: token.spelling().to_vec().into_boxed_slice(),
680+
})
681+
}).collect()
682+
}
683+
}
684+
685+
/// A struct that owns the tokenizer result from a given cursor.
686+
pub struct RawTokens<'a> {
687+
cursor: &'a Cursor,
688+
tu: CXTranslationUnit,
689+
tokens: *mut CXToken,
690+
token_count: c_uint,
691+
}
692+
693+
impl<'a> RawTokens<'a> {
694+
fn new(cursor: &'a Cursor) -> Self {
695+
let mut tokens = ptr::null_mut();
696+
let mut token_count = 0;
697+
let range = cursor.extent();
698+
let tu = unsafe {
699+
clang_Cursor_getTranslationUnit(cursor.x)
700+
};
701+
unsafe { clang_tokenize(tu, range, &mut tokens, &mut token_count) };
702+
Self { cursor, tu, tokens, token_count }
703+
}
704+
705+
fn as_slice(&self) -> &[CXToken] {
706+
if self.tokens.is_null() {
707+
return &[];
708+
}
709+
unsafe { slice::from_raw_parts(self.tokens, self.token_count as usize) }
710+
}
711+
712+
/// Get an iterator over these tokens.
713+
pub fn iter(&self) -> ClangTokenIterator {
714+
ClangTokenIterator {
715+
tu: self.tu,
716+
raw: self.as_slice().iter(),
717+
}
718+
}
719+
}
720+
721+
impl<'a> Drop for RawTokens<'a> {
722+
fn drop(&mut self) {
723+
if !self.tokens.is_null() {
724+
unsafe {
725+
clang_disposeTokens(self.tu, self.tokens, self.token_count as c_uint);
679726
}
680-
clang_disposeTokens(tu, token_ptr, num_tokens);
681727
}
682-
Some(tokens)
683728
}
729+
}
684730

685-
/// Gets the tokens that correspond to that cursor as `cexpr` tokens.
686-
pub fn cexpr_tokens(self) -> Option<Vec<cexpr::token::Token>> {
687-
use cexpr::token;
731+
/// A raw clang token, that exposes only the kind and spelling. This is a
732+
/// slightly more convenient version of `CXToken` which owns the spelling
733+
/// string.
734+
#[derive(Debug)]
735+
pub struct ClangToken {
736+
spelling: CXString,
737+
pub kind: CXTokenKind,
738+
}
688739

689-
self.tokens().map(|tokens| {
690-
tokens
691-
.into_iter()
692-
.filter_map(|token| {
693-
let kind = match token.kind {
694-
CXToken_Punctuation => token::Kind::Punctuation,
695-
CXToken_Literal => token::Kind::Literal,
696-
CXToken_Identifier => token::Kind::Identifier,
697-
CXToken_Keyword => token::Kind::Keyword,
698-
// NB: cexpr is not too happy about comments inside
699-
// expressions, so we strip them down here.
700-
CXToken_Comment => return None,
701-
_ => {
702-
error!("Found unexpected token kind: {:?}", token);
703-
return None;
704-
}
705-
};
706-
707-
Some(token::Token {
708-
kind: kind,
709-
raw: token.spelling.into_bytes().into_boxed_slice(),
710-
})
711-
})
712-
.collect::<Vec<_>>()
713-
})
740+
impl ClangToken {
741+
/// Get the token spelling, without being converted to utf-8.
742+
pub fn spelling(&self) -> &[u8] {
743+
let c_str = unsafe {
744+
CStr::from_ptr(clang_getCString(self.spelling) as *const _)
745+
};
746+
c_str.to_bytes()
747+
}
748+
}
749+
750+
impl Drop for ClangToken {
751+
fn drop(&mut self) {
752+
unsafe { clang_disposeString(self.spelling) }
753+
}
754+
}
755+
756+
/// An iterator over a set of Tokens.
757+
pub struct ClangTokenIterator<'a> {
758+
tu: CXTranslationUnit,
759+
raw: slice::Iter<'a, CXToken>,
760+
}
761+
762+
impl<'a> Iterator for ClangTokenIterator<'a> {
763+
type Item = ClangToken;
764+
765+
fn next(&mut self) -> Option<Self::Item> {
766+
let raw = self.raw.next()?;
767+
unsafe {
768+
let kind = clang_getTokenKind(*raw);
769+
let spelling = clang_getTokenSpelling(self.tu, *raw);
770+
Some(ClangToken { kind, spelling })
771+
}
714772
}
715773
}
716774

src/ir/context.rs

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2163,21 +2163,17 @@ If you encounter an error missing from this list, please file an issue or a PR!"
21632163

21642164
let mut module_name = None;
21652165
let spelling = cursor.spelling();
2166-
if !spelling.is_empty()
2167-
{
2166+
if !spelling.is_empty() {
21682167
module_name = Some(spelling)
21692168
}
21702169

2171-
let tokens = match cursor.tokens() {
2172-
Some(tokens) => tokens,
2173-
None => return (module_name, ModuleKind::Normal),
2174-
};
2170+
let tokens = cursor.tokens();
21752171
let mut iter = tokens.iter();
21762172
let mut kind = ModuleKind::Normal;
21772173
let mut found_namespace_keyword = false;
21782174
while let Some(token) = iter.next() {
2179-
match &*token.spelling {
2180-
"inline" => {
2175+
match token.spelling() {
2176+
b"inline" => {
21812177
assert!(!found_namespace_keyword);
21822178
assert!(kind != ModuleKind::Inline);
21832179
kind = ModuleKind::Inline;
@@ -2192,16 +2188,16 @@ If you encounter an error missing from this list, please file an issue or a PR!"
21922188
//
21932189
// Fortunately enough, inline nested namespace specifiers aren't
21942190
// a thing, and are invalid C++ :)
2195-
"namespace" | "::" => {
2191+
b"namespace" | b"::" => {
21962192
found_namespace_keyword = true;
21972193
}
2198-
"{" => {
2194+
b"{" => {
21992195
assert!(found_namespace_keyword);
22002196
break;
22012197
}
22022198
name if found_namespace_keyword => {
22032199
if module_name.is_none() {
2204-
module_name = Some(name.to_owned());
2200+
module_name = Some(String::from_utf8_lossy(name).into_owned());
22052201
}
22062202
break;
22072203
}

src/ir/var.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -309,7 +309,7 @@ fn parse_macro(
309309
) -> Option<(Vec<u8>, cexpr::expr::EvalResult)> {
310310
use cexpr::expr;
311311

312-
let mut cexpr_tokens = cursor.cexpr_tokens()?;
312+
let mut cexpr_tokens = cursor.cexpr_tokens();
313313

314314
let parser = expr::IdentifierParser::new(ctx.parsed_macros());
315315

@@ -338,7 +338,7 @@ fn parse_int_literal_tokens(cursor: &clang::Cursor) -> Option<i64> {
338338
use cexpr::expr;
339339
use cexpr::expr::EvalResult;
340340

341-
let cexpr_tokens = cursor.cexpr_tokens()?;
341+
let cexpr_tokens = cursor.cexpr_tokens();
342342

343343
// TODO(emilio): We can try to parse other kinds of literals.
344344
match expr::expr(&cexpr_tokens) {

0 commit comments

Comments (0)