
Commit eb97c14

Merge pull request #1466 from emilio/token-lazy
clang: Tokenize more lazily.
2 parents 698758e + 7109c48 commit eb97c14
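
In short: bindgen's clang token accessors stop eagerly materializing owned strings. A signature-level summary of the src/clang.rs change, derived from the diff below:

    // Before: eager and fallible; every token spelling became an owned String.
    //   pub fn tokens(&self) -> Option<Vec<Token>>
    //   pub fn cexpr_tokens(self) -> Option<Vec<cexpr::token::Token>>
    //
    // After: lazy; RawTokens is a RAII guard over the clang_tokenize buffer,
    // and ClangToken exposes its spelling as raw bytes until a String is needed.
    //   pub fn tokens(&self) -> RawTokens
    //   pub fn cexpr_tokens(self) -> Vec<cexpr::token::Token>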

3 files changed: +125 -69 lines

src/clang.rs

+116 -56

@@ -507,11 +507,9 @@ impl Cursor {
         let mut found_attr = false;
         self.visit(|cur| {
             if cur.kind() == CXCursor_UnexposedAttr {
-                found_attr = cur.tokens().map(|tokens| {
-                    tokens.iter().any(|t| {
-                        t.kind == CXToken_Identifier && t.spelling == attr
-                    })
-                }).unwrap_or(false);
+                found_attr = cur.tokens().iter().any(|t| {
+                    t.kind == CXToken_Identifier && t.spelling() == attr.as_bytes()
+                });
 
                 if found_attr {
                     return CXChildVisit_Break;
@@ -653,64 +651,126 @@ impl Cursor {
     }
 
     /// Gets the tokens that correspond to that cursor.
-    pub fn tokens(&self) -> Option<Vec<Token>> {
-        let range = self.extent();
-        let mut tokens = vec![];
-        unsafe {
-            let tu = clang_Cursor_getTranslationUnit(self.x);
-            let mut token_ptr = ptr::null_mut();
-            let mut num_tokens: c_uint = 0;
-            clang_tokenize(tu, range, &mut token_ptr, &mut num_tokens);
-            if token_ptr.is_null() {
-                return None;
-            }
+    pub fn tokens(&self) -> RawTokens {
+        RawTokens::new(self)
+    }
 
-            let token_array =
-                slice::from_raw_parts(token_ptr, num_tokens as usize);
-            for &token in token_array.iter() {
-                let kind = clang_getTokenKind(token);
-                let spelling =
-                    cxstring_into_string(clang_getTokenSpelling(tu, token));
+    /// Gets the tokens that correspond to that cursor as `cexpr` tokens.
+    pub fn cexpr_tokens(self) -> Vec<cexpr::token::Token> {
+        use cexpr::token;
 
-                tokens.push(Token {
-                    kind: kind,
-                    spelling: spelling,
-                });
+        self.tokens().iter().filter_map(|token| {
+            let kind = match token.kind {
+                CXToken_Punctuation => token::Kind::Punctuation,
+                CXToken_Literal => token::Kind::Literal,
+                CXToken_Identifier => token::Kind::Identifier,
+                CXToken_Keyword => token::Kind::Keyword,
+                // NB: cexpr is not too happy about comments inside
+                // expressions, so we strip them down here.
+                CXToken_Comment => return None,
+                _ => {
+                    error!("Found unexpected token kind: {:?}", token);
+                    return None;
+                }
+            };
+
+            Some(token::Token {
+                kind,
+                raw: token.spelling().to_vec().into_boxed_slice(),
+            })
+        }).collect()
+    }
+}
+
+/// A struct that owns the tokenizer result from a given cursor.
+pub struct RawTokens<'a> {
+    cursor: &'a Cursor,
+    tu: CXTranslationUnit,
+    tokens: *mut CXToken,
+    token_count: c_uint,
+}
+
+impl<'a> RawTokens<'a> {
+    fn new(cursor: &'a Cursor) -> Self {
+        let mut tokens = ptr::null_mut();
+        let mut token_count = 0;
+        let range = cursor.extent();
+        let tu = unsafe {
+            clang_Cursor_getTranslationUnit(cursor.x)
+        };
+        unsafe { clang_tokenize(tu, range, &mut tokens, &mut token_count) };
+        Self { cursor, tu, tokens, token_count }
+    }
+
+    fn as_slice(&self) -> &[CXToken] {
+        if self.tokens.is_null() {
+            return &[];
+        }
+        unsafe { slice::from_raw_parts(self.tokens, self.token_count as usize) }
+    }
+
+    /// Get an iterator over these tokens.
+    pub fn iter(&self) -> ClangTokenIterator {
+        ClangTokenIterator {
+            tu: self.tu,
+            raw: self.as_slice().iter(),
+        }
+    }
+}
+
+impl<'a> Drop for RawTokens<'a> {
+    fn drop(&mut self) {
+        if !self.tokens.is_null() {
+            unsafe {
+                clang_disposeTokens(self.tu, self.tokens, self.token_count as c_uint);
             }
-            clang_disposeTokens(tu, token_ptr, num_tokens);
         }
-        Some(tokens)
     }
+}
 
-    /// Gets the tokens that correspond to that cursor as `cexpr` tokens.
-    pub fn cexpr_tokens(self) -> Option<Vec<cexpr::token::Token>> {
-        use cexpr::token;
+/// A raw clang token, that exposes only the kind and spelling. This is a
+/// slightly more convenient version of `CXToken` which owns the spelling
+/// string.
+#[derive(Debug)]
+pub struct ClangToken {
+    spelling: CXString,
+    /// The kind of token, this is the same as the relevant member from
+    /// `CXToken`.
+    pub kind: CXTokenKind,
+}
 
-        self.tokens().map(|tokens| {
-            tokens
-                .into_iter()
-                .filter_map(|token| {
-                    let kind = match token.kind {
-                        CXToken_Punctuation => token::Kind::Punctuation,
-                        CXToken_Literal => token::Kind::Literal,
-                        CXToken_Identifier => token::Kind::Identifier,
-                        CXToken_Keyword => token::Kind::Keyword,
-                        // NB: cexpr is not too happy about comments inside
-                        // expressions, so we strip them down here.
-                        CXToken_Comment => return None,
-                        _ => {
-                            error!("Found unexpected token kind: {:?}", token);
-                            return None;
-                        }
-                    };
-
-                    Some(token::Token {
-                        kind: kind,
-                        raw: token.spelling.into_bytes().into_boxed_slice(),
-                    })
-                })
-                .collect::<Vec<_>>()
-        })
+impl ClangToken {
+    /// Get the token spelling, without being converted to utf-8.
+    pub fn spelling(&self) -> &[u8] {
+        let c_str = unsafe {
+            CStr::from_ptr(clang_getCString(self.spelling) as *const _)
+        };
+        c_str.to_bytes()
+    }
+}
+
+impl Drop for ClangToken {
+    fn drop(&mut self) {
+        unsafe { clang_disposeString(self.spelling) }
+    }
+}
+
+/// An iterator over a set of Tokens.
+pub struct ClangTokenIterator<'a> {
+    tu: CXTranslationUnit,
+    raw: slice::Iter<'a, CXToken>,
+}
+
+impl<'a> Iterator for ClangTokenIterator<'a> {
+    type Item = ClangToken;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let raw = self.raw.next()?;
+        unsafe {
+            let kind = clang_getTokenKind(*raw);
+            let spelling = clang_getTokenSpelling(self.tu, *raw);
+            Some(ClangToken { kind, spelling })
+        }
     }
 }
 

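For illustration, here is a minimal usage sketch of the new lazy API; the helper name print_tokens is hypothetical and assumes it lives in this module, where Cursor and the types above are in scope:

    // Sketch only: tokens() no longer builds a Vec up front. The RawTokens guard
    // owns the CXToken buffer (freed by its Drop impl), and each ClangToken hands
    // out its spelling as raw bytes.
    fn print_tokens(cursor: &Cursor) {
        for token in cursor.tokens().iter() {
            println!("{:?}: {}", token.kind, String::from_utf8_lossy(token.spelling()));
        }
    }
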
src/ir/context.rs

+7 -11

@@ -2163,21 +2163,17 @@ If you encounter an error missing from this list, please file an issue or a PR!"
 
         let mut module_name = None;
         let spelling = cursor.spelling();
-        if !spelling.is_empty()
-        {
+        if !spelling.is_empty() {
             module_name = Some(spelling)
         }
 
-        let tokens = match cursor.tokens() {
-            Some(tokens) => tokens,
-            None => return (module_name, ModuleKind::Normal),
-        };
+        let tokens = cursor.tokens();
         let mut iter = tokens.iter();
        let mut kind = ModuleKind::Normal;
         let mut found_namespace_keyword = false;
         while let Some(token) = iter.next() {
-            match &*token.spelling {
-                "inline" => {
+            match token.spelling() {
+                b"inline" => {
                     assert!(!found_namespace_keyword);
                     assert!(kind != ModuleKind::Inline);
                     kind = ModuleKind::Inline;
@@ -2192,16 +2188,16 @@ If you encounter an error missing from this list, please file an issue or a PR!"
                 //
                 // Fortunately enough, inline nested namespace specifiers aren't
                 // a thing, and are invalid C++ :)
-                "namespace" | "::" => {
+                b"namespace" | b"::" => {
                     found_namespace_keyword = true;
                 }
-                "{" => {
+                b"{" => {
                     assert!(found_namespace_keyword);
                     break;
                 }
                 name if found_namespace_keyword => {
                     if module_name.is_none() {
-                        module_name = Some(name.to_owned());
+                        module_name = Some(String::from_utf8_lossy(name).into_owned());
                     }
                     break;
                 }

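Since ClangToken::spelling() returns &[u8] rather than an owned String, the namespace parsing above switches to byte-string patterns and only converts to a String when it actually captures a name. A standalone sketch of that pattern (the helper module_name_from is hypothetical):

    // Sketch only: byte-string literals replace &str patterns; a captured
    // identifier is converted lossily only when a String is really needed.
    fn module_name_from(spelling: &[u8]) -> Option<String> {
        match spelling {
            b"inline" | b"namespace" | b"::" | b"{" => None,
            name => Some(String::from_utf8_lossy(name).into_owned()),
        }
    }
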
src/ir/var.rs

+2 -2

@@ -309,7 +309,7 @@ fn parse_macro(
 ) -> Option<(Vec<u8>, cexpr::expr::EvalResult)> {
     use cexpr::expr;
 
-    let mut cexpr_tokens = cursor.cexpr_tokens()?;
+    let mut cexpr_tokens = cursor.cexpr_tokens();
 
     let parser = expr::IdentifierParser::new(ctx.parsed_macros());
 
@@ -338,7 +338,7 @@ fn parse_int_literal_tokens(cursor: &clang::Cursor) -> Option<i64> {
     use cexpr::expr;
     use cexpr::expr::EvalResult;
 
-    let cexpr_tokens = cursor.cexpr_tokens()?;
+    let cexpr_tokens = cursor.cexpr_tokens();
 
     // TODO(emilio): We can try to parse other kinds of literals.
     match expr::expr(&cexpr_tokens) {
