Performance regression in 0.44.0 for headers that (transitively) include AVX512F intrinsics #1465
I thought of a different way to go about creducing, and came up with a much smaller test case that still exhibits a substantial performance difference in the parsing step between the two variations:

#define __MMINTRIN_H
typedef long long __m64 __attribute__((__vector_size__(8)));
typedef long long __v1di __attribute__((__vector_size__(8)));
typedef int __v2si __attribute__((__vector_size__(8)));
typedef short __v4hi __attribute__((__vector_size__(8)));
typedef char __v8qi __attribute__((__vector_size__(8)));
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx")))
static __inline__ void __DEFAULT_FN_ATTRS _mm_empty(void) { __builtin_ia32_emms(); }
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtsi32_si64(int __i) { return (__m64)__builtin_ia32_vec_init_v2si(__i, 0); }
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi64_si32(__m64 __m) { return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0); }
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtsi64_m64(long long __i) { return (__m64)__i; }
long long __DEFAULT_FN_ATTRS _mm_cvtm64_si64(__m64 __m) { return (long long)__m; }
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_packs_pi16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2); }
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_packs_pi32(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2); }
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_packs_pu16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2); }
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_pi8(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2); }
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_pi16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2); }
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_pi32(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2); }
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_adds_pi8(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2); }
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_pi8(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2); }
I'll note that it could probably get smaller and still exhibit a measurable performance difference; I just set a runtime cutoff for the "good" version to keep things statistically relevant.
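In case it helps others reproduce the measurement, here is a minimal timing harness, a sketch rather than the reporter's actual setup: it assumes the reduced header above is saved as `reduced.h` and that the `bindgen` crate is available as a dependency.

```rust
use std::time::Instant;

fn main() {
    // Hypothetical path to the reduced test case above; adjust as needed.
    let header = "reduced.h";

    let start = Instant::now();
    let bindings = bindgen::Builder::default()
        .header(header)
        .generate()
        .expect("bindgen failed to generate bindings");
    let elapsed = start.elapsed();

    bindings
        .write_to_file("bindings.rs")
        .expect("failed to write bindings");

    // Wall-clock time for parsing + generation; compare across bindgen versions.
    println!("generated bindings in {:?}", elapsed);
}
```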
Hi, thanks for the report, it's great! So this means this is a regression from d34e10f. I'm not really sure we can do much better to find unexposed attributes here... Also, we could do some profiling to see if we're doing something dumb; I'd be surprised if we weren't...
FWIW, just out of curiosity I took a quick look; here's what perf has to say about our parsing times for this test on my machine:
Instead of converting all the tokens to utf-8 beforehand, which is costly, and allocating a new vector unconditionally (on top of the one clang already allocates), just do the tokenization more lazily. There's actually only one place in the codebase which needs the utf-8 string; all the others can just work with the byte slice from clang. This should have no behavior change, other than being faster. In particular, this halves the time spent on my machine on the test case from rust-lang#1465. I'm not completely sure that this is going to be enough to make it acceptable, but we should probably do it regardless.
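For readers unfamiliar with the change, here is a standalone sketch of the lazy-tokenization idea described in that commit message. The types are hypothetical and do not reflect bindgen's or clang-sys's real internals; the point is simply keeping clang's raw bytes around and paying for UTF-8 validation only at the single call site that needs a `&str`.

```rust
/// Hypothetical stand-in for a token spelling as handed back by libclang.
struct RawToken<'a> {
    bytes: &'a [u8],
}

impl<'a> RawToken<'a> {
    /// Most consumers can compare against the raw bytes directly,
    /// without any allocation or validation.
    fn is(&self, keyword: &str) -> bool {
        self.bytes == keyword.as_bytes()
    }

    /// Only the one place that really needs a `&str` pays for
    /// UTF-8 validation, and only when it asks for it.
    fn spelling_utf8(&self) -> Option<&'a str> {
        std::str::from_utf8(self.bytes).ok()
    }
}

fn main() {
    let tok = RawToken { bytes: b"__attribute__" };
    assert!(tok.is("__attribute__"));
    assert_eq!(tok.spelling_utf8(), Some("__attribute__"));
}
```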
I opened #1466 with a partial fix; can you confirm it helps on your machine? I still think it may not be good enough and we may need to put the attribute detection behind a flag, but...
@emilio, thank you for your prompt attention! I really appreciate it. While I can confirm that it does help, I can also confirm your suspicion that it's not enough. I'm not too familiar with the clang API, so I'm afraid I can't help diagnose much past this, but it's clear there are some pretty nasty asymptotics with this addition. If we could use a flag to disable it, or if it were opt-in, that would be great.
Ok, I'll merge that since it should also help for cases where we're already tokenizing, like macros and such, and put the function attribute stuff behind a flag for now.
Given it was a considerable performance hit under some workloads. Closes rust-lang#1465.
#1467 should resolve this; just waiting for CI.
Thanks, this works great!
Note that bindgen needs `.enable_function_attribute_detection()` to process `__attribute__((__warn_unused_result__))`; it is off by default because parsing attributes can be really slow in certain cases. Benchmarks were run to confirm our case doesn't hit that issue. References: rust-lang/rust-bindgen#2149, rust-lang/rust-bindgen#1465, rust-lang/rust-bindgen#1466, rust-lang/rust-bindgen#1467
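For downstream users, opting in looks roughly like the sketch below with the builder API; the header name is a placeholder, and the equivalent CLI switch is `--enable-function-attribute-detection`.

```rust
fn main() {
    let bindings = bindgen::Builder::default()
        .header("wrapper.h") // placeholder header name
        // Opt into the (potentially slow) attribute scan so that e.g.
        // __attribute__((__warn_unused_result__)) is surfaced as #[must_use].
        .enable_function_attribute_detection()
        .generate()
        .expect("bindgen failed to generate bindings");

    bindings
        .write_to_file("bindings.rs")
        .expect("failed to write bindings");
}
```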
`bindgen` is able to detect certain function attributes and annotate functions correspondingly in its output for the Rust side when the `--enable-function-attribute-detection` flag is passed. In particular, it is currently able to use `__must_check` in C (`#[must_use]` in Rust), which gives us a bunch of annotations that are nice to have to prevent possible issues in Rust abstractions, e.g.:

 extern "C" {
+    #[must_use]
     pub fn kobject_add(
         kobj: *mut kobject,
         parent: *mut kobject,
         fmt: *const core::ffi::c_char,
         ...
     ) -> core::ffi::c_int;
 }

Apparently, there are edge cases where this can make generation very slow, which is why it is behind a flag [1], but it does not seem to affect us in any major way at the moment. Thus enable it.

Link: rust-lang/rust-bindgen#1465 [1]
Link: https://lore.kernel.org/rust-for-linux/CANiq72=u5Nrz_NW3U3_VqywJkD8pECA07q2pFDd1wjtXOWdkAQ@mail.gmail.com/
Reviewed-by: Alice Ryhl <[email protected]>
Tested-by: Alice Ryhl <[email protected]>
Reviewed-by: Gary Guo <[email protected]>
Acked-by: Danilo Krummrich <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Miguel Ojeda <[email protected]>
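As a quick, self-contained illustration of what the generated `#[must_use]` annotation buys on the caller side (this is plain example code, not bindgen output, and the function name is made up):

```rust
#[must_use]
fn add_object() -> i32 {
    // Stand-in for an FFI call that returns an error code.
    0
}

fn main() {
    // This bare call triggers the unused_must_use lint:
    // "unused return value of `add_object` that must be used".
    add_object();

    // Explicitly handling the result silences the lint.
    let ret = add_object();
    assert_eq!(ret, 0);
}
```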
Input C/C++ Header
Unfortunately, this report has `#include`s, because the issue disappears when run on the preprocessed input. To hopefully counteract that a bit, I'm happy to provide more system information, but this has been observed on three different machines of different Intel generations and underlying kernels. For my machine:

Bindgen Invocation
Using either the 0.44.0 release or the current version from `master`:

Actual Results
The bindings are successfully output, with timing output:
Expected Results
If I change the definition of `clang::Cursor::has_simple_attr()` to the following:

And run the same command:
Then the bindings are successfully output, but with this very different timing output: