Skip to content

Commit c28d9f7

Browse files
committed
clang: Tokenize more lazily.
Instead of converting all the tokens to utf-8 before-hand, which is costly, and allocating a new vector unconditionally (on top of the one clang already allocates), just do the tokenization more lazily. There's actually only one place in the codebase which needs the utf-8 string; all the others can just work with the byte slice from clang. This should have no behavior change, other than being faster. In particular, this halves the time spent on my machine on the test case from #1465. I'm not completely sure that this is going to be enough to make it acceptable, but we should probably do it regardless.
1 parent 698758e commit c28d9f7

File tree

3 files changed

+123
-69
lines changed

3 files changed

+123
-69
lines changed

src/clang.rs

Lines changed: 114 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -507,11 +507,9 @@ impl Cursor {
507507
let mut found_attr = false;
508508
self.visit(|cur| {
509509
if cur.kind() == CXCursor_UnexposedAttr {
510-
found_attr = cur.tokens().map(|tokens| {
511-
tokens.iter().any(|t| {
512-
t.kind == CXToken_Identifier && t.spelling == attr
513-
})
514-
}).unwrap_or(false);
510+
found_attr = cur.tokens().iter().any(|t| {
511+
t.kind == CXToken_Identifier && t.spelling() == attr.as_bytes()
512+
});
515513

516514
if found_attr {
517515
return CXChildVisit_Break;
@@ -653,64 +651,124 @@ impl Cursor {
653651
}
654652

655653
/// Gets the tokens that correspond to that cursor.
656-
pub fn tokens(&self) -> Option<Vec<Token>> {
657-
let range = self.extent();
658-
let mut tokens = vec![];
659-
unsafe {
660-
let tu = clang_Cursor_getTranslationUnit(self.x);
661-
let mut token_ptr = ptr::null_mut();
662-
let mut num_tokens: c_uint = 0;
663-
clang_tokenize(tu, range, &mut token_ptr, &mut num_tokens);
664-
if token_ptr.is_null() {
665-
return None;
666-
}
654+
pub fn tokens(&self) -> RawTokens {
655+
RawTokens::new(self)
656+
}
667657

668-
let token_array =
669-
slice::from_raw_parts(token_ptr, num_tokens as usize);
670-
for &token in token_array.iter() {
671-
let kind = clang_getTokenKind(token);
672-
let spelling =
673-
cxstring_into_string(clang_getTokenSpelling(tu, token));
658+
/// Gets the tokens that correspond to that cursor as `cexpr` tokens.
659+
pub fn cexpr_tokens(self) -> Vec<cexpr::token::Token> {
660+
use cexpr::token;
674661

675-
tokens.push(Token {
676-
kind: kind,
677-
spelling: spelling,
678-
});
662+
self.tokens().iter().filter_map(|token| {
663+
let kind = match token.kind {
664+
CXToken_Punctuation => token::Kind::Punctuation,
665+
CXToken_Literal => token::Kind::Literal,
666+
CXToken_Identifier => token::Kind::Identifier,
667+
CXToken_Keyword => token::Kind::Keyword,
668+
// NB: cexpr is not too happy about comments inside
669+
// expressions, so we strip them down here.
670+
CXToken_Comment => return None,
671+
_ => {
672+
error!("Found unexpected token kind: {:?}", token);
673+
return None;
674+
}
675+
};
676+
677+
Some(token::Token {
678+
kind,
679+
raw: token.spelling().to_vec().into_boxed_slice(),
680+
})
681+
}).collect()
682+
}
683+
}
684+
685+
/// A struct that owns the tokenizer result from a given cursor.
686+
pub struct RawTokens<'a> {
687+
cursor: &'a Cursor,
688+
tu: CXTranslationUnit,
689+
tokens: *mut CXToken,
690+
token_count: c_uint,
691+
}
692+
693+
impl<'a> RawTokens<'a> {
694+
fn new(cursor: &'a Cursor) -> Self {
695+
let mut tokens = ptr::null_mut();
696+
let mut token_count = 0;
697+
let range = cursor.extent();
698+
let tu = unsafe {
699+
clang_Cursor_getTranslationUnit(cursor.x)
700+
};
701+
unsafe { clang_tokenize(tu, range, &mut tokens, &mut token_count) };
702+
Self { cursor, tu, tokens, token_count }
703+
}
704+
705+
fn as_slice(&self) -> &[CXToken] {
706+
if self.tokens.is_null() {
707+
return &[];
708+
}
709+
unsafe { slice::from_raw_parts(self.tokens, self.token_count as usize) }
710+
}
711+
712+
/// Get an iterator over these tokens.
713+
pub fn iter(&self) -> ClangTokenIterator {
714+
ClangTokenIterator {
715+
tu: self.tu,
716+
raw: self.as_slice().iter(),
717+
}
718+
}
719+
}
720+
721+
impl<'a> Drop for RawTokens<'a> {
722+
fn drop(&mut self) {
723+
if !self.tokens.is_null() {
724+
unsafe {
725+
clang_disposeTokens(self.tu, self.tokens, self.token_count as c_uint);
679726
}
680-
clang_disposeTokens(tu, token_ptr, num_tokens);
681727
}
682-
Some(tokens)
683728
}
729+
}
684730

685-
/// Gets the tokens that correspond to that cursor as `cexpr` tokens.
686-
pub fn cexpr_tokens(self) -> Option<Vec<cexpr::token::Token>> {
687-
use cexpr::token;
731+
/// A raw clang token, that exposes only the kind and spelling. This is a
732+
/// slightly more convenient version of `CXToken` which owns the spelling
733+
/// string.
734+
#[derive(Debug)]
735+
pub struct ClangToken {
736+
spelling: CXString,
737+
pub kind: CXTokenKind,
738+
}
688739

689-
self.tokens().map(|tokens| {
690-
tokens
691-
.into_iter()
692-
.filter_map(|token| {
693-
let kind = match token.kind {
694-
CXToken_Punctuation => token::Kind::Punctuation,
695-
CXToken_Literal => token::Kind::Literal,
696-
CXToken_Identifier => token::Kind::Identifier,
697-
CXToken_Keyword => token::Kind::Keyword,
698-
// NB: cexpr is not too happy about comments inside
699-
// expressions, so we strip them down here.
700-
CXToken_Comment => return None,
701-
_ => {
702-
error!("Found unexpected token kind: {:?}", token);
703-
return None;
704-
}
705-
};
706-
707-
Some(token::Token {
708-
kind: kind,
709-
raw: token.spelling.into_bytes().into_boxed_slice(),
710-
})
711-
})
712-
.collect::<Vec<_>>()
713-
})
740+
impl ClangToken {
741+
/// Get the token spelling, without being converted to utf-8.
742+
pub fn spelling(&self) -> &[u8] {
743+
let c_str = unsafe {
744+
CStr::from_ptr(clang_getCString(self.spelling) as *const _)
745+
};
746+
c_str.to_bytes()
747+
}
748+
}
749+
750+
impl Drop for ClangToken {
751+
fn drop(&mut self) {
752+
unsafe { clang_disposeString(self.spelling) }
753+
}
754+
}
755+
756+
/// An iterator over a set of Tokens.
757+
pub struct ClangTokenIterator<'a> {
758+
tu: CXTranslationUnit,
759+
raw: slice::Iter<'a, CXToken>,
760+
}
761+
762+
impl<'a> Iterator for ClangTokenIterator<'a> {
763+
type Item = ClangToken;
764+
765+
fn next(&mut self) -> Option<Self::Item> {
766+
let raw = self.raw.next()?;
767+
unsafe {
768+
let kind = clang_getTokenKind(*raw);
769+
let spelling = clang_getTokenSpelling(self.tu, *raw);
770+
Some(ClangToken { kind, spelling })
771+
}
714772
}
715773
}
716774

src/ir/context.rs

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2163,21 +2163,17 @@ If you encounter an error missing from this list, please file an issue or a PR!"
21632163

21642164
let mut module_name = None;
21652165
let spelling = cursor.spelling();
2166-
if !spelling.is_empty()
2167-
{
2166+
if !spelling.is_empty() {
21682167
module_name = Some(spelling)
21692168
}
21702169

2171-
let tokens = match cursor.tokens() {
2172-
Some(tokens) => tokens,
2173-
None => return (module_name, ModuleKind::Normal),
2174-
};
2170+
let tokens = cursor.tokens();
21752171
let mut iter = tokens.iter();
21762172
let mut kind = ModuleKind::Normal;
21772173
let mut found_namespace_keyword = false;
21782174
while let Some(token) = iter.next() {
2179-
match &*token.spelling {
2180-
"inline" => {
2175+
match token.spelling() {
2176+
b"inline" => {
21812177
assert!(!found_namespace_keyword);
21822178
assert!(kind != ModuleKind::Inline);
21832179
kind = ModuleKind::Inline;
@@ -2192,16 +2188,16 @@ If you encounter an error missing from this list, please file an issue or a PR!"
21922188
//
21932189
// Fortunately enough, inline nested namespace specifiers aren't
21942190
// a thing, and are invalid C++ :)
2195-
"namespace" | "::" => {
2191+
b"namespace" | b"::" => {
21962192
found_namespace_keyword = true;
21972193
}
2198-
"{" => {
2194+
b"{" => {
21992195
assert!(found_namespace_keyword);
22002196
break;
22012197
}
22022198
name if found_namespace_keyword => {
22032199
if module_name.is_none() {
2204-
module_name = Some(name.to_owned());
2200+
module_name = Some(String::from_utf8_lossy(name).into_owned());
22052201
}
22062202
break;
22072203
}

src/ir/var.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -309,7 +309,7 @@ fn parse_macro(
309309
) -> Option<(Vec<u8>, cexpr::expr::EvalResult)> {
310310
use cexpr::expr;
311311

312-
let mut cexpr_tokens = cursor.cexpr_tokens()?;
312+
let mut cexpr_tokens = cursor.cexpr_tokens();
313313

314314
let parser = expr::IdentifierParser::new(ctx.parsed_macros());
315315

@@ -338,7 +338,7 @@ fn parse_int_literal_tokens(cursor: &clang::Cursor) -> Option<i64> {
338338
use cexpr::expr;
339339
use cexpr::expr::EvalResult;
340340

341-
let cexpr_tokens = cursor.cexpr_tokens()?;
341+
let cexpr_tokens = cursor.cexpr_tokens();
342342

343343
// TODO(emilio): We can try to parse other kinds of literals.
344344
match expr::expr(&cexpr_tokens) {

0 commit comments

Comments (0)