
Commit eb97c14

Merge pull request #1466 from emilio/token-lazy
clang: Tokenize more lazily.
2 parents 698758e + 7109c48 commit eb97c14
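
In short: bindgen's clang token accessors stop eagerly materializing owned strings. A signature-level summary of the src/clang.rs change, derived from the diff below:

    // Before: eager and fallible; every token spelling became an owned String.
    //   pub fn tokens(&self) -> Option<Vec<Token>>
    //   pub fn cexpr_tokens(self) -> Option<Vec<cexpr::token::Token>>
    //
    // After: lazy; RawTokens is a RAII guard over the clang_tokenize buffer,
    // and ClangToken exposes its spelling as raw bytes until a String is needed.
    //   pub fn tokens(&self) -> RawTokens
    //   pub fn cexpr_tokens(self) -> Vec<cexpr::token::Token>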

3 files changed: +125 -69 lines

src/clang.rs

+116 -56

@@ -507,11 +507,9 @@ impl Cursor {
         let mut found_attr = false;
         self.visit(|cur| {
             if cur.kind() == CXCursor_UnexposedAttr {
-                found_attr = cur.tokens().map(|tokens| {
-                    tokens.iter().any(|t| {
-                        t.kind == CXToken_Identifier && t.spelling == attr
-                    })
-                }).unwrap_or(false);
+                found_attr = cur.tokens().iter().any(|t| {
+                    t.kind == CXToken_Identifier && t.spelling() == attr.as_bytes()
+                });
 
                 if found_attr {
                     return CXChildVisit_Break;
@@ -653,64 +651,126 @@ impl Cursor {
     }
 
     /// Gets the tokens that correspond to that cursor.
-    pub fn tokens(&self) -> Option<Vec<Token>> {
-        let range = self.extent();
-        let mut tokens = vec![];
-        unsafe {
-            let tu = clang_Cursor_getTranslationUnit(self.x);
-            let mut token_ptr = ptr::null_mut();
-            let mut num_tokens: c_uint = 0;
-            clang_tokenize(tu, range, &mut token_ptr, &mut num_tokens);
-            if token_ptr.is_null() {
-                return None;
-            }
+    pub fn tokens(&self) -> RawTokens {
+        RawTokens::new(self)
+    }
 
-            let token_array =
-                slice::from_raw_parts(token_ptr, num_tokens as usize);
-            for &token in token_array.iter() {
-                let kind = clang_getTokenKind(token);
-                let spelling =
-                    cxstring_into_string(clang_getTokenSpelling(tu, token));
+    /// Gets the tokens that correspond to that cursor as `cexpr` tokens.
+    pub fn cexpr_tokens(self) -> Vec<cexpr::token::Token> {
+        use cexpr::token;
 
-                tokens.push(Token {
-                    kind: kind,
-                    spelling: spelling,
-                });
+        self.tokens().iter().filter_map(|token| {
+            let kind = match token.kind {
+                CXToken_Punctuation => token::Kind::Punctuation,
+                CXToken_Literal => token::Kind::Literal,
+                CXToken_Identifier => token::Kind::Identifier,
+                CXToken_Keyword => token::Kind::Keyword,
+                // NB: cexpr is not too happy about comments inside
+                // expressions, so we strip them down here.
+                CXToken_Comment => return None,
+                _ => {
+                    error!("Found unexpected token kind: {:?}", token);
+                    return None;
+                }
+            };
+
+            Some(token::Token {
+                kind,
+                raw: token.spelling().to_vec().into_boxed_slice(),
+            })
+        }).collect()
+    }
+}
+
+/// A struct that owns the tokenizer result from a given cursor.
+pub struct RawTokens<'a> {
+    cursor: &'a Cursor,
+    tu: CXTranslationUnit,
+    tokens: *mut CXToken,
+    token_count: c_uint,
+}
+
+impl<'a> RawTokens<'a> {
+    fn new(cursor: &'a Cursor) -> Self {
+        let mut tokens = ptr::null_mut();
+        let mut token_count = 0;
+        let range = cursor.extent();
+        let tu = unsafe {
+            clang_Cursor_getTranslationUnit(cursor.x)
+        };
+        unsafe { clang_tokenize(tu, range, &mut tokens, &mut token_count) };
+        Self { cursor, tu, tokens, token_count }
+    }
+
+    fn as_slice(&self) -> &[CXToken] {
+        if self.tokens.is_null() {
+            return &[];
+        }
+        unsafe { slice::from_raw_parts(self.tokens, self.token_count as usize) }
+    }
+
+    /// Get an iterator over these tokens.
+    pub fn iter(&self) -> ClangTokenIterator {
+        ClangTokenIterator {
+            tu: self.tu,
+            raw: self.as_slice().iter(),
+        }
+    }
+}
+
+impl<'a> Drop for RawTokens<'a> {
+    fn drop(&mut self) {
+        if !self.tokens.is_null() {
+            unsafe {
+                clang_disposeTokens(self.tu, self.tokens, self.token_count as c_uint);
             }
-            clang_disposeTokens(tu, token_ptr, num_tokens);
         }
-        Some(tokens)
     }
+}
 
-    /// Gets the tokens that correspond to that cursor as `cexpr` tokens.
-    pub fn cexpr_tokens(self) -> Option<Vec<cexpr::token::Token>> {
-        use cexpr::token;
+/// A raw clang token, that exposes only the kind and spelling. This is a
+/// slightly more convenient version of `CXToken` which owns the spelling
+/// string.
+#[derive(Debug)]
+pub struct ClangToken {
+    spelling: CXString,
+    /// The kind of token, this is the same as the relevant member from
+    /// `CXToken`.
+    pub kind: CXTokenKind,
+}
 
-        self.tokens().map(|tokens| {
-            tokens
-                .into_iter()
-                .filter_map(|token| {
-                    let kind = match token.kind {
-                        CXToken_Punctuation => token::Kind::Punctuation,
-                        CXToken_Literal => token::Kind::Literal,
-                        CXToken_Identifier => token::Kind::Identifier,
-                        CXToken_Keyword => token::Kind::Keyword,
-                        // NB: cexpr is not too happy about comments inside
-                        // expressions, so we strip them down here.
-                        CXToken_Comment => return None,
-                        _ => {
-                            error!("Found unexpected token kind: {:?}", token);
-                            return None;
-                        }
-                    };
-
-                    Some(token::Token {
-                        kind: kind,
-                        raw: token.spelling.into_bytes().into_boxed_slice(),
-                    })
-                })
-                .collect::<Vec<_>>()
-        })
+impl ClangToken {
+    /// Get the token spelling, without being converted to utf-8.
+    pub fn spelling(&self) -> &[u8] {
+        let c_str = unsafe {
+            CStr::from_ptr(clang_getCString(self.spelling) as *const _)
+        };
+        c_str.to_bytes()
+    }
+}
+
+impl Drop for ClangToken {
+    fn drop(&mut self) {
+        unsafe { clang_disposeString(self.spelling) }
+    }
+}
+
+/// An iterator over a set of Tokens.
+pub struct ClangTokenIterator<'a> {
+    tu: CXTranslationUnit,
+    raw: slice::Iter<'a, CXToken>,
+}
+
+impl<'a> Iterator for ClangTokenIterator<'a> {
+    type Item = ClangToken;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let raw = self.raw.next()?;
+        unsafe {
+            let kind = clang_getTokenKind(*raw);
+            let spelling = clang_getTokenSpelling(self.tu, *raw);
+            Some(ClangToken { kind, spelling })
+        }
     }
 }
 

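For illustration, here is a minimal usage sketch of the new lazy API; the helper name print_tokens is hypothetical and assumes it lives in this module, where Cursor and the types above are in scope:

    // Sketch only: tokens() no longer builds a Vec up front. The RawTokens guard
    // owns the CXToken buffer (freed by its Drop impl), and each ClangToken hands
    // out its spelling as raw bytes.
    fn print_tokens(cursor: &Cursor) {
        for token in cursor.tokens().iter() {
            println!("{:?}: {}", token.kind, String::from_utf8_lossy(token.spelling()));
        }
    }
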
src/ir/context.rs

+7 -11

@@ -2163,21 +2163,17 @@ If you encounter an error missing from this list, please file an issue or a PR!"
 
         let mut module_name = None;
         let spelling = cursor.spelling();
-        if !spelling.is_empty()
-        {
+        if !spelling.is_empty() {
             module_name = Some(spelling)
         }
 
-        let tokens = match cursor.tokens() {
-            Some(tokens) => tokens,
-            None => return (module_name, ModuleKind::Normal),
-        };
+        let tokens = cursor.tokens();
         let mut iter = tokens.iter();
        let mut kind = ModuleKind::Normal;
         let mut found_namespace_keyword = false;
         while let Some(token) = iter.next() {
-            match &*token.spelling {
-                "inline" => {
+            match token.spelling() {
+                b"inline" => {
                     assert!(!found_namespace_keyword);
                     assert!(kind != ModuleKind::Inline);
                     kind = ModuleKind::Inline;
@@ -2192,16 +2188,16 @@ If you encounter an error missing from this list, please file an issue or a PR!"
                 //
                 // Fortunately enough, inline nested namespace specifiers aren't
                 // a thing, and are invalid C++ :)
-                "namespace" | "::" => {
+                b"namespace" | b"::" => {
                     found_namespace_keyword = true;
                 }
-                "{" => {
+                b"{" => {
                     assert!(found_namespace_keyword);
                     break;
                 }
                 name if found_namespace_keyword => {
                     if module_name.is_none() {
-                        module_name = Some(name.to_owned());
+                        module_name = Some(String::from_utf8_lossy(name).into_owned());
                     }
                     break;
                 }

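Since ClangToken::spelling() returns &[u8] rather than an owned String, the namespace parsing above switches to byte-string patterns and only converts to a String when it actually captures a name. A standalone sketch of that pattern (the helper module_name_from is hypothetical):

    // Sketch only: byte-string literals replace &str patterns; a captured
    // identifier is converted lossily only when a String is really needed.
    fn module_name_from(spelling: &[u8]) -> Option<String> {
        match spelling {
            b"inline" | b"namespace" | b"::" | b"{" => None,
            name => Some(String::from_utf8_lossy(name).into_owned()),
        }
    }
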
src/ir/var.rs

+2 -2

@@ -309,7 +309,7 @@ fn parse_macro(
 ) -> Option<(Vec<u8>, cexpr::expr::EvalResult)> {
     use cexpr::expr;
 
-    let mut cexpr_tokens = cursor.cexpr_tokens()?;
+    let mut cexpr_tokens = cursor.cexpr_tokens();
 
     let parser = expr::IdentifierParser::new(ctx.parsed_macros());
 
@@ -338,7 +338,7 @@ fn parse_int_literal_tokens(cursor: &clang::Cursor) -> Option<i64> {
     use cexpr::expr;
     use cexpr::expr::EvalResult;
 
-    let cexpr_tokens = cursor.cexpr_tokens()?;
+    let cexpr_tokens = cursor.cexpr_tokens();
 
     // TODO(emilio): We can try to parse other kinds of literals.
     match expr::expr(&cexpr_tokens) {
