From 58dc611f57f3e70dff22b34371804262232aa345 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 12 Mar 2018 21:01:13 -0400 Subject: [PATCH 1/6] gitignore: add tmp dir --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 6879ab176c..8f7a426bd6 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ bench-log wiki tags examples/debug.rs +tmp/ From 9ddc0b5b0a36e24e308490383b894a7f8452ebd9 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 10 Mar 2018 19:15:07 -0500 Subject: [PATCH 2/6] teddy: port teddy searcher to std::arch This commit ports the Teddy searcher to use std::arch and moves off the portable SIMD vector API. Performance remains the same, and it looks like the codegen is identical, which is great! This also makes the `simd-accel` feature a no-op and adds a new `unstable` feature which will enable the Teddy optimization. The `-C target-feature` or `-C target-cpu` settings are no longer necessary, since this will now do runtime target feature detection. We also add a new `unstable` feature to the regex crate, which will enable this new use of std::arch. Once enabled, the Teddy optimizations becomes available automatically without any additional compile time flags. --- Cargo.toml | 16 +- bench/Cargo.toml | 2 +- build.rs | 26 ++ src/exec.rs | 2 +- src/input.rs | 2 +- src/lib.rs | 18 +- src/{literals.rs => literal/mod.rs} | 5 +- .../teddy_ssse3/fallback.rs} | 5 +- .../teddy_ssse3/imp.rs} | 285 ++++++++---------- src/literal/teddy_ssse3/mod.rs | 16 + src/prog.rs | 2 +- src/simd_accel/mod.rs | 5 - src/simd_fallback/mod.rs | 1 - src/vector/mod.rs | 2 + src/vector/ssse3.rs | 200 ++++++++++++ 15 files changed, 405 insertions(+), 182 deletions(-) create mode 100644 build.rs rename src/{literals.rs => literal/mod.rs} (99%) rename src/{simd_fallback/teddy128.rs => literal/teddy_ssse3/fallback.rs} (89%) rename src/{simd_accel/teddy128.rs => literal/teddy_ssse3/imp.rs} (83%) create mode 100644 src/literal/teddy_ssse3/mod.rs delete mode 100644 src/simd_accel/mod.rs delete mode 100644 src/simd_fallback/mod.rs create mode 100644 src/vector/mod.rs create mode 100644 src/vector/ssse3.rs diff --git a/Cargo.toml b/Cargo.toml index 020e1a0881..e3fd239099 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,8 +31,6 @@ memchr = "2.0.0" thread_local = "0.3.2" # For parsing regular expressions. regex-syntax = { path = "regex-syntax", version = "0.5.1" } -# For accelerating text search. -simd = { version = "0.2.1", optional = true } # For compiling UTF-8 decoding into automata. utf8-ranges = "1.0.0" @@ -45,10 +43,20 @@ quickcheck = { version = "0.6", default-features = false } rand = "0.4" [features] -# Enable to use the unstable pattern traits defined in std. +# We don't enable any features by default currently, but if the compiler +# supports a specific type of feature, then regex's build.rs might enable +# some default features. +default = [] +# A blanket feature that governs whether unstable features are enabled or not. +# Unstable features are disabled by default, and typically rely on unstable +# features in rustc itself. +unstable = ["pattern"] +# Enable to use the unstable pattern traits defined in std. This is enabled +# by default if the unstable feature is enabled. pattern = [] # Enable to use simd acceleration. -simd-accel = ["simd"] +# Note that this is deprecated and is a no-op. 
+simd-accel = [] [lib] # There are no benchmarks in the library code itself diff --git a/bench/Cargo.toml b/bench/Cargo.toml index 7a3eb8f8ff..0423e71c1e 100644 --- a/bench/Cargo.toml +++ b/bench/Cargo.toml @@ -18,7 +18,7 @@ libc = "0.2" onig = { version = "3", optional = true } libpcre-sys = { version = "0.2", optional = true } memmap = "0.6" -regex = { version = "0.2.0", path = "..", features = ["simd-accel"] } +regex = { version = "0.2.0", path = "..", features = ["unstable"] } regex-syntax = { version = "0.5.0", path = "../regex-syntax" } serde = "1" serde_derive = "1" diff --git a/build.rs b/build.rs new file mode 100644 index 0000000000..645d5ec309 --- /dev/null +++ b/build.rs @@ -0,0 +1,26 @@ +use std::env; +use std::ffi::OsString; +use std::process::Command; + +fn main() { + let rustc = env::var_os("RUSTC").unwrap_or(OsString::from("rustc")); + let output = Command::new(&rustc) + .arg("--version") + .output() + .unwrap() + .stdout; + let version = String::from_utf8(output).unwrap(); + + // If we're using nightly Rust, then we can enable vector optimizations. + // Note that these aren't actually activated unless the `nightly` feature + // is enabled. + // + // We also don't activate these if we've explicitly disabled auto + // optimizations. Disabling auto optimizations is intended for use in + // tests, so that we can reliably test fallback implementations. + if env::var_os("CARGO_CFG_REGEX_DISABLE_AUTO_OPTIMIZATIONS").is_none() { + if version.contains("nightly") { + println!("cargo:rustc-cfg=regex_runtime_teddy_ssse3"); + } + } +} diff --git a/src/exec.rs b/src/exec.rs index 29d614fddd..95adae575e 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -23,7 +23,7 @@ use compile::Compiler; use dfa; use error::Error; use input::{ByteInput, CharInput}; -use literals::LiteralSearcher; +use literal::LiteralSearcher; use pikevm; use prog::Program; use re_builder::RegexOptions; diff --git a/src/input.rs b/src/input.rs index e24214954e..56097bd562 100644 --- a/src/input.rs +++ b/src/input.rs @@ -16,7 +16,7 @@ use std::u32; use syntax; -use literals::LiteralSearcher; +use literal::LiteralSearcher; use prog::InstEmptyLook; use utf8::{decode_utf8, decode_last_utf8}; diff --git a/src/lib.rs b/src/lib.rs index 31ee2553a2..e783b7e36c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -520,14 +520,15 @@ another matching engine with fixed memory requirements. 
#![deny(missing_docs)] #![cfg_attr(test, deny(warnings))] #![cfg_attr(feature = "pattern", feature(pattern))] -#![cfg_attr(feature = "simd-accel", feature(cfg_target_feature))] +#![cfg_attr(feature = "unstable", feature(target_feature, stdsimd))] extern crate aho_corasick; extern crate memchr; extern crate thread_local; -#[macro_use] #[cfg(test)] extern crate quickcheck; +#[cfg(test)] +#[macro_use] +extern crate quickcheck; extern crate regex_syntax as syntax; -#[cfg(feature = "simd-accel")] extern crate simd; extern crate utf8_ranges; pub use error::Error; @@ -645,7 +646,7 @@ mod exec; mod expand; mod freqs; mod input; -mod literals; +mod literal; #[cfg(feature = "pattern")] mod pattern; mod pikevm; @@ -655,12 +656,9 @@ mod re_bytes; mod re_set; mod re_trait; mod re_unicode; -#[cfg(feature = "simd-accel")] -mod simd_accel; -#[cfg(not(feature = "simd-accel"))] -#[path = "simd_fallback/mod.rs"] -mod simd_accel; mod sparse; +#[cfg(feature = "unstable")] +mod vector; /// The `internal` module exists to support suspicious activity, such as /// testing different matching engines and supporting the `regex-debug` CLI @@ -670,6 +668,6 @@ pub mod internal { pub use compile::Compiler; pub use exec::{Exec, ExecBuilder}; pub use input::{Char, Input, CharInput, InputAt}; - pub use literals::LiteralSearcher; + pub use literal::LiteralSearcher; pub use prog::{Program, Inst, EmptyLook, InstRanges}; } diff --git a/src/literals.rs b/src/literal/mod.rs similarity index 99% rename from src/literals.rs rename to src/literal/mod.rs index 98056fa50d..69be8eab01 100644 --- a/src/literals.rs +++ b/src/literal/mod.rs @@ -16,8 +16,9 @@ use memchr::{memchr, memchr2, memchr3}; use syntax::hir::literal::{Literal, Literals}; use freqs::BYTE_FREQUENCIES; +use self::teddy_ssse3::Teddy; -use simd_accel::teddy128::{Teddy, is_teddy_128_available}; +mod teddy_ssse3; /// A prefix extracted from a compiled regular expression. /// @@ -219,7 +220,7 @@ impl Matcher { } } let is_aho_corasick_fast = sset.dense.len() == 1 && sset.all_ascii; - if is_teddy_128_available() && !is_aho_corasick_fast { + if Teddy::available() && !is_aho_corasick_fast { // Only try Teddy if Aho-Corasick can't use memchr on an ASCII // byte. Also, in its current form, Teddy doesn't scale well to // lots of literals. diff --git a/src/simd_fallback/teddy128.rs b/src/literal/teddy_ssse3/fallback.rs similarity index 89% rename from src/simd_fallback/teddy128.rs rename to src/literal/teddy_ssse3/fallback.rs index d7ecad6e1b..20524aabfe 100644 --- a/src/simd_fallback/teddy128.rs +++ b/src/literal/teddy_ssse3/fallback.rs @@ -1,9 +1,5 @@ use syntax::hir::literal::Literals; -pub fn is_teddy_128_available() -> bool { - false -} - #[derive(Debug, Clone)] pub struct Teddy(()); @@ -15,6 +11,7 @@ pub struct Match { } impl Teddy { + pub fn available() -> bool { false } pub fn new(_pats: &Literals) -> Option { None } pub fn patterns(&self) -> &[Vec] { &[] } pub fn len(&self) -> usize { 0 } diff --git a/src/simd_accel/teddy128.rs b/src/literal/teddy_ssse3/imp.rs similarity index 83% rename from src/simd_accel/teddy128.rs rename to src/literal/teddy_ssse3/imp.rs index 6a2f3fd8e1..4f3e34fed0 100644 --- a/src/simd_accel/teddy128.rs +++ b/src/literal/teddy_ssse3/imp.rs @@ -16,7 +16,7 @@ extended to substring matching. The PCMPESTRI instruction (and its relatives), for example, implements substring matching in hardware. 
It is, however, limited to substrings of length 16 bytes or fewer, but this restriction is fine in a regex engine, since we rarely care about the performance difference between -searching for a 16 byte literal and a 16 + N literal—16 is already long +searching for a 16 byte literal and a 16 + N literal; 16 is already long enough. The key downside of the PCMPESTRI instruction, on current (2016) CPUs at least, is its latency and throughput. As a result, it is often faster to do substring search with a Boyer-Moore variant and a well placed memchr to quickly @@ -87,9 +87,8 @@ better. Namely: significant because it corresponds to the number of bytes in a SIMD vector. If one used AVX2 instructions, then we could scan the haystack in 32 byte chunks. Similarly, if one used AVX512 instructions, we could scan the - haystack in 64 byte chunks. Hyperscan implements SIMD + AVX2, we only - implement SIMD for the moment. (The author doesn't have a CPU with AVX2 - support... yet.) + haystack in 64 byte chunks. Hyperscan implements SSE + AVX2, we only + implement SSE for the moment. 2. Bitwise operations are performed on each chunk to discover if any region of it matches a set of precomputed fingerprints from the patterns. If there are matches, then a verification step is performed. In this implementation, our @@ -145,8 +144,8 @@ instruction called PSHUFB. The instruction takes two SIMD vectors, `A` and `B`, and returns a third vector `C`. All vectors are treated as 16 8-bit integers. `C` is formed by `C[i] = A[B[i]]`. (This is a bit of a simplification, but true for the purposes of this algorithm. For full details, see [Intel's Intrinsics -Guide][5_u].) This essentially lets us use the values in `B` to lookup values in -`A`. +Guide][5_u].) This essentially lets us use the values in `B` to lookup values +in `A`. If we could somehow cause `B` to contain our 16 byte block from the haystack, and if `A` could contain our bitmasks, then we'd end up with something like @@ -175,7 +174,7 @@ So our map now looks like: ``` Notice that the bitsets for each nybble correspond to the union of all -fingerprints that contain that nibble. For example, both `f` and `b` have the +fingerprints that contain that nybble. For example, both `f` and `b` have the same upper 4 bits but differ on the lower 4 bits. Putting this together, we have `A0`, `A1` and `B`, where `A0` is our mask for the lower nybble, `A1` is our mask for the upper nybble and `B` is our 16 byte block from the haystack: @@ -223,7 +222,7 @@ the pattern `foo` (since `A1[0x6] = 00000111`), and that `o` is a fingerprint for all of our patterns. But if we combined `C0` and `C1` with an `AND` operation: -``` +```ignore b a ... f o ... p C = 00000110 0 00000001 0 0 ``` @@ -319,27 +318,16 @@ References [5_u]: https://software.intel.com/sites/landingpage/IntrinsicsGuide */ -// TODO: Extend this to use AVX2 instructions. -// TODO: Extend this to use AVX512 instructions. -// TODO: Make the inner loop do aligned loads. - use std::cmp; -use std::ptr; use aho_corasick::{Automaton, AcAutomaton, FullAcAutomaton}; -use simd::u8x16; -use simd::x86::sse2::Sse2Bool8ix16; -use simd::x86::ssse3::Ssse3U8x16; - use syntax::hir::literal::Literals; +use vector::ssse3::{SSSE3VectorBuilder, u8x16}; + /// Corresponds to the number of bytes read at a time in the haystack. const BLOCK_SIZE: usize = 16; -pub fn is_teddy_128_available() -> bool { - true -} - /// Match reports match information. 
#[derive(Debug, Clone)] pub struct Match { @@ -355,6 +343,8 @@ pub struct Match { /// A SIMD accelerated multi substring searcher. #[derive(Debug, Clone)] pub struct Teddy { + /// A builder for SSSE3 empowered vectors. + vb: SSSE3VectorBuilder, /// A list of substrings to match. pats: Vec>, /// An Aho-Corasick automaton of the patterns. We use this when we need to @@ -369,26 +359,28 @@ pub struct Teddy { masks: Masks, } -/// A list of masks. This has length equal to the length of the fingerprint. -/// The length of the fingerprint is always `min(3, len(smallest_substring))`. -#[derive(Debug, Clone)] -struct Masks(Vec); - -/// A single mask. -#[derive(Debug, Clone, Copy)] -struct Mask { - /// Bitsets for the low nybbles in a fingerprint. - lo: u8x16, - /// Bitsets for the high nybbles in a fingerprint. - hi: u8x16, -} - impl Teddy { + /// Returns true if and only if Teddy is supported on this platform. + /// + /// If this returns `false`, then `Teddy::new(...)` is guaranteed to + /// return `None`. + pub fn available() -> bool { + SSSE3VectorBuilder::new().is_some() + } + /// Create a new `Teddy` multi substring matcher. /// /// If a `Teddy` matcher could not be created (e.g., `pats` is empty or has /// an empty substring), then `None` is returned. pub fn new(pats: &Literals) -> Option { + let vb = match SSSE3VectorBuilder::new() { + None => return None, + Some(vb) => vb, + }; + if !Teddy::available() { + return None; + } + let pats: Vec<_> = pats.literals().iter().map(|p|p.to_vec()).collect(); let min_len = pats.iter().map(|p| p.len()).min().unwrap_or(0); // Don't allow any empty patterns and require that we have at @@ -398,7 +390,7 @@ impl Teddy { } // Pick the largest mask possible, but no larger than 3. let nmasks = cmp::min(3, min_len); - let mut masks = Masks::new(nmasks); + let mut masks = Masks::new(vb, nmasks); let mut buckets = vec![vec![]; 8]; // Assign a substring to each bucket, and add the bucket's bitfield to // the appropriate position in the mask. @@ -408,6 +400,7 @@ impl Teddy { masks.add(bucket as u8, pat); } Some(Teddy { + vb: vb, pats: pats.to_vec(), ac: AcAutomaton::new(pats.to_vec()).into_full(), buckets: buckets, @@ -433,6 +426,13 @@ impl Teddy { /// Searches `haystack` for the substrings in this `Teddy`. If a match was /// found, then it is returned. Otherwise, `None` is returned. pub fn find(&self, haystack: &[u8]) -> Option { + // This is safe because the only way we can construct a Teddy type + // is if SSSE3 is available. + unsafe { self.find_impl(haystack) } + } + + #[target_feature(enable = "ssse3")] + unsafe fn find_impl(&self, haystack: &[u8]) -> Option { // If our haystack is smaller than the block size, then fall back to // a naive brute force search. if haystack.is_empty() || haystack.len() < (BLOCK_SIZE + 2) { @@ -452,15 +452,21 @@ impl Teddy { #[inline(always)] fn find1(&self, haystack: &[u8]) -> Option { let mut pos = 0; - let zero = u8x16::splat(0); + let zero = self.vb.u8x16_splat(0); let len = haystack.len(); debug_assert!(len >= BLOCK_SIZE); while pos <= len - BLOCK_SIZE { - let h = unsafe { u8x16::load_unchecked(haystack, pos) }; + let h = unsafe { + // I tried and failed to eliminate bounds checks in safe code. + // This is safe because of our loop invariant: pos is always + // <= len-16. + let p = haystack.get_unchecked(pos..); + self.vb.u8x16_load_unchecked_unaligned(p) + }; // N.B. `res0` is our `C` in the module documentation. let res0 = self.masks.members1(h); // Only do expensive verification if there are any non-zero bits. 
- let bitfield = res0.ne(zero).move_mask(); + let bitfield = res0.ne(zero).movemask(); if bitfield != 0 { if let Some(m) = self.verify(haystack, pos, res0, bitfield) { return Some(m); @@ -477,13 +483,7 @@ impl Teddy { fn find2(&self, haystack: &[u8]) -> Option { // This is an exotic way to right shift a SIMD vector across lanes. // See below at use for more details. - let res0shuffle = u8x16::new( - 0, 0, 1, 2, - 3, 4, 5, 6, - 7, 8, 9, 10, - 11, 12, 13, 14, - ); - let zero = u8x16::splat(0); + let zero = self.vb.u8x16_splat(0); let len = haystack.len(); // The previous value of `C` (from the module documentation) for the // *first* byte in the fingerprint. On subsequent iterations, we take @@ -491,32 +491,31 @@ impl Teddy { // position of the current `C`, shifting all other bitsets to the right // one lane. This causes `C` for the first byte to line up with `C` for // the second byte, so that they can be `AND`'d together. - let mut prev0 = u8x16::splat(0xFF); + let mut prev0 = self.vb.u8x16_splat(0xFF); let mut pos = 1; debug_assert!(len >= BLOCK_SIZE); while pos <= len - BLOCK_SIZE { - let h = unsafe { u8x16::load_unchecked(haystack, pos) }; + let h = unsafe { + // I tried and failed to eliminate bounds checks in safe code. + // This is safe because of our loop invariant: pos is always + // <= len-16. + let p = haystack.get_unchecked(pos..); + self.vb.u8x16_load_unchecked_unaligned(p) + }; let (res0, res1) = self.masks.members2(h); - // The next three lines are essentially equivalent to + // Do this: // - // ```rust,ignore - // (prev0 << 15) | (res0 >> 1) - // ``` + // (prev0 << 15) | (res0 >> 1) // - // ... if SIMD vectors could shift across lanes. There is the - // `PALIGNR` instruction, but apparently LLVM doesn't expose it as - // a proper intrinsic. Thankfully, it appears the following - // sequence does indeed compile down to a `PALIGNR`. - let prev0byte0 = prev0.extract(15); - let res0shiftr8 = res0.shuffle_bytes(res0shuffle); - let res0prev0 = res0shiftr8.replace(0, prev0byte0); + // This lets us line up our C values for each byte. + let res0prev0 = res0.alignr_15(prev0); // `AND`'s our `C` values together. - let res = res0prev0 & res1; + let res = res0prev0.and(res1); prev0 = res0; - let bitfield = res.ne(zero).move_mask(); + let bitfield = res.ne(zero).movemask(); if bitfield != 0 { let pos = pos.checked_sub(1).unwrap(); if let Some(m) = self.verify(haystack, pos, res, bitfield) { @@ -539,44 +538,29 @@ impl Teddy { /// since we now need to align for three bytes. #[inline(always)] fn find3(&self, haystack: &[u8]) -> Option { - let zero = u8x16::splat(0); + let zero = self.vb.u8x16_splat(0); let len = haystack.len(); - - let res0shuffle = u8x16::new( - 0, 0, 0, 1, - 2, 3, 4, 5, - 6, 7, 8, 9, - 10, 11, 12, 13, - ); - let res1shuffle = u8x16::new( - 0, 0, 1, 2, - 3, 4, 5, 6, - 7, 8, 9, 10, - 11, 12, 13, 14, - ); - let mut prev0 = u8x16::splat(0xFF); - let mut prev1 = u8x16::splat(0xFF); + let mut prev0 = self.vb.u8x16_splat(0xFF); + let mut prev1 = self.vb.u8x16_splat(0xFF); let mut pos = 2; while pos <= len - BLOCK_SIZE { - let h = unsafe { u8x16::load_unchecked(haystack, pos) }; + let h = unsafe { + // I tried and failed to eliminate bounds checks in safe code. + // This is safe because of our loop invariant: pos is always + // <= len-16. 
+ let p = haystack.get_unchecked(pos..); + self.vb.u8x16_load_unchecked_unaligned(p) + }; let (res0, res1, res2) = self.masks.members3(h); - let prev0byte0 = prev0.extract(14); - let prev0byte1 = prev0.extract(15); - let res0shiftr16 = res0.shuffle_bytes(res0shuffle); - let res0prev0 = res0shiftr16.replace(0, prev0byte0) - .replace(1, prev0byte1); - - let prev1byte0 = prev1.extract(15); - let res1shiftr8 = res1.shuffle_bytes(res1shuffle); - let res1prev1 = res1shiftr8.replace(0, prev1byte0); - - let res = res0prev0 & res1prev1 & res2; + let res0prev0 = res0.alignr_14(prev0); + let res1prev1 = res1.alignr_15(prev1); + let res = res0prev0.and(res1prev1).and(res2); prev0 = res0; prev1 = res1; - let bitfield = res.ne(zero).move_mask(); + let bitfield = res.ne(zero).movemask(); if bitfield != 0 { let pos = pos.checked_sub(2).unwrap(); if let Some(m) = self.verify(haystack, pos, res, bitfield) { @@ -609,11 +593,11 @@ impl Teddy { while bitfield != 0 { // The next offset, relative to pos, where some fingerprint // matched. - let byte_pos = bitfield.trailing_zeros(); + let byte_pos = bitfield.trailing_zeros() as usize; bitfield &= !(1 << byte_pos); // Offset relative to the beginning of the haystack. - let start = pos + byte_pos as usize; + let start = pos + byte_pos; // The bitfield telling us which patterns had fingerprints that // match at this starting position. @@ -664,6 +648,7 @@ impl Teddy { /// /// This is used when we don't have enough bytes in the haystack for our /// block based approach. + #[inline(never)] fn slow(&self, haystack: &[u8], pos: usize) -> Option { self.ac.find(&haystack[pos..]).next().map(|m| { Match { @@ -675,23 +660,36 @@ impl Teddy { } } +/// A list of masks. This has length equal to the length of the fingerprint. +/// The length of the fingerprint is always `min(3, len(smallest_substring))`. +#[derive(Debug, Clone)] +struct Masks { + vb: SSSE3VectorBuilder, + masks: [Mask; 3], + size: usize, +} + impl Masks { /// Create a new set of masks of size `n`, where `n` corresponds to the /// number of bytes in a fingerprint. - fn new(n: usize) -> Masks { - Masks(vec![Mask::new(); n]) + fn new(vb: SSSE3VectorBuilder, n: usize) -> Masks { + Masks { + vb: vb, + masks: [Mask::new(vb), Mask::new(vb), Mask::new(vb)], + size: n, + } } /// Returns the number of masks. fn len(&self) -> usize { - self.0.len() + self.size } /// Adds the given pattern to the given bucket. The bucket should be a /// power of `2 <= 2^7`. fn add(&mut self, bucket: u8, pat: &[u8]) { - for (i, mask) in self.0.iter_mut().enumerate() { - mask.add(bucket, pat[i]); + for i in 0..self.len() { + self.masks[i].add(bucket, pat[i]); } } @@ -703,25 +701,25 @@ impl Masks { /// of a pattern in bucket `j`. #[inline(always)] fn members1(&self, haystack_block: u8x16) -> u8x16 { - let masklo = u8x16::splat(0xF); - let hlo = haystack_block & masklo; - let hhi = (haystack_block >> 4) & masklo; + let masklo = self.vb.u8x16_splat(0xF); + let hlo = haystack_block.and(masklo); + let hhi = haystack_block.bit_shift_right_4().and(masklo); - self.0[0].lo.shuffle_bytes(hlo) & self.0[0].hi.shuffle_bytes(hhi) + self.masks[0].lo.shuffle(hlo).and(self.masks[0].hi.shuffle(hhi)) } /// Like members1, but computes C for the first and second bytes in the /// fingerprint. 
#[inline(always)] fn members2(&self, haystack_block: u8x16) -> (u8x16, u8x16) { - let masklo = u8x16::splat(0xF); - let hlo = haystack_block & masklo; - let hhi = (haystack_block >> 4) & masklo; - - let res0 = self.0[0].lo.shuffle_bytes(hlo) - & self.0[0].hi.shuffle_bytes(hhi); - let res1 = self.0[1].lo.shuffle_bytes(hlo) - & self.0[1].hi.shuffle_bytes(hhi); + let masklo = self.vb.u8x16_splat(0xF); + let hlo = haystack_block.and(masklo); + let hhi = haystack_block.bit_shift_right_4().and(masklo); + + let res0 = + self.masks[0].lo.shuffle(hlo).and(self.masks[0].hi.shuffle(hhi)); + let res1 = + self.masks[1].lo.shuffle(hlo).and(self.masks[1].hi.shuffle(hhi)); (res0, res1) } @@ -729,26 +727,35 @@ impl Masks { /// in the fingerprint. #[inline(always)] fn members3(&self, haystack_block: u8x16) -> (u8x16, u8x16, u8x16) { - let masklo = u8x16::splat(0xF); - let hlo = haystack_block & masklo; - let hhi = (haystack_block >> 4) & masklo; - - let res0 = self.0[0].lo.shuffle_bytes(hlo) - & self.0[0].hi.shuffle_bytes(hhi); - let res1 = self.0[1].lo.shuffle_bytes(hlo) - & self.0[1].hi.shuffle_bytes(hhi); - let res2 = self.0[2].lo.shuffle_bytes(hlo) - & self.0[2].hi.shuffle_bytes(hhi); + let masklo = self.vb.u8x16_splat(0xF); + let hlo = haystack_block.and(masklo); + let hhi = haystack_block.bit_shift_right_4().and(masklo); + + let res0 = + self.masks[0].lo.shuffle(hlo).and(self.masks[0].hi.shuffle(hhi)); + let res1 = + self.masks[1].lo.shuffle(hlo).and(self.masks[1].hi.shuffle(hhi)); + let res2 = + self.masks[2].lo.shuffle(hlo).and(self.masks[2].hi.shuffle(hhi)); (res0, res1, res2) } } +/// A single mask. +#[derive(Debug, Clone, Copy)] +struct Mask { + /// Bitsets for the low nybbles in a fingerprint. + lo: u8x16, + /// Bitsets for the high nybbles in a fingerprint. + hi: u8x16, +} + impl Mask { /// Create a new mask with no members. - fn new() -> Mask { + fn new(vb: SSSE3VectorBuilder) -> Mask { Mask { - lo: u8x16::splat(0), - hi: u8x16::splat(0), + lo: vb.u8x16_splat(0), + hi: vb.u8x16_splat(0), } } @@ -756,39 +763,13 @@ impl Mask { fn add(&mut self, bucket: u8, byte: u8) { // Split our byte into two nybbles, and add each nybble to our // mask. - let byte_lo = (byte & 0xF) as u32; - let byte_hi = (byte >> 4) as u32; + let byte_lo = (byte & 0xF) as usize; + let byte_hi = (byte >> 4) as usize; let lo = self.lo.extract(byte_lo); - self.lo = self.lo.replace(byte_lo, ((1 << bucket) as u8) | lo); + self.lo.replace(byte_lo, ((1 << bucket) as u8) | lo); let hi = self.hi.extract(byte_hi); - self.hi = self.hi.replace(byte_hi, ((1 << bucket) as u8) | hi); - } -} - -/// UnsafeLoad permits loading data into a SIMD vector without bounds checks. -/// -/// Ideally, this would be part of the `simd` crate, or even better, we could -/// figure out how to do it without `unsafe` at all. -trait UnsafeLoad { - type Elem; - - /// load_unchecked creates a new SIMD vector from the elements in `slice` - /// starting at `offset`. `slice` must have at least the number of elements - /// required to fill a SIMD vector. 
- unsafe fn load_unchecked(slice: &[Self::Elem], offset: usize) -> Self; -} - -impl UnsafeLoad for u8x16 { - type Elem = u8; - - unsafe fn load_unchecked(slice: &[u8], offset: usize) -> u8x16 { - let mut x = u8x16::splat(0); - ptr::copy_nonoverlapping( - slice.get_unchecked(offset), - &mut x as *mut u8x16 as *mut u8, - 16); - x + self.hi.replace(byte_hi, ((1 << bucket) as u8) | hi); } } diff --git a/src/literal/teddy_ssse3/mod.rs b/src/literal/teddy_ssse3/mod.rs new file mode 100644 index 0000000000..2221159945 --- /dev/null +++ b/src/literal/teddy_ssse3/mod.rs @@ -0,0 +1,16 @@ +pub use self::imp::*; + +#[cfg(all( + feature = "unstable", + regex_runtime_teddy_ssse3, + any(target_arch = "x86", target_arch = "x86_64"), +))] +mod imp; + +#[cfg(not(all( + feature = "unstable", + regex_runtime_teddy_ssse3, + any(target_arch = "x86", target_arch = "x86_64"), +)))] +#[path = "fallback.rs"] +mod imp; diff --git a/src/prog.rs b/src/prog.rs index 6ae49f6a12..4262aa96e6 100644 --- a/src/prog.rs +++ b/src/prog.rs @@ -7,7 +7,7 @@ use std::slice; use std::sync::Arc; use input::Char; -use literals::LiteralSearcher; +use literal::LiteralSearcher; /// `InstPtr` represents the index of an instruction in a regex program. pub type InstPtr = usize; diff --git a/src/simd_accel/mod.rs b/src/simd_accel/mod.rs deleted file mode 100644 index f3c868dd88..0000000000 --- a/src/simd_accel/mod.rs +++ /dev/null @@ -1,5 +0,0 @@ -#[cfg(target_feature = "ssse3")] -pub mod teddy128; -#[cfg(not(target_feature = "ssse3"))] -#[path = "../simd_fallback/teddy128.rs"] -pub mod teddy128; diff --git a/src/simd_fallback/mod.rs b/src/simd_fallback/mod.rs deleted file mode 100644 index b7ce4b188a..0000000000 --- a/src/simd_fallback/mod.rs +++ /dev/null @@ -1 +0,0 @@ -pub mod teddy128; diff --git a/src/vector/mod.rs b/src/vector/mod.rs new file mode 100644 index 0000000000..20409f6a81 --- /dev/null +++ b/src/vector/mod.rs @@ -0,0 +1,2 @@ +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub mod ssse3; diff --git a/src/vector/ssse3.rs b/src/vector/ssse3.rs new file mode 100644 index 0000000000..0cb6701917 --- /dev/null +++ b/src/vector/ssse3.rs @@ -0,0 +1,200 @@ +#![allow(dead_code)] + +use std::arch::x86_64::*; +use std::fmt; + +/// A builder for SSSE3 empowered vectors. +/// +/// This builder represents a receipt that the SSSE3 target feature is enabled +/// on the currently running CPU. Namely, the only way to get a value of this +/// type is if the SSSE3 feature is enabled. +/// +/// This type can then be used to build vector types that use SSSE3 features +/// safely. +#[derive(Clone, Copy, Debug)] +pub struct SSSE3VectorBuilder(()); + +impl SSSE3VectorBuilder { + /// Create a new SSSE3 vector builder. + /// + /// If the SSSE3 feature is not enabled for the current target, then + /// return `None`. + pub fn new() -> Option { + if is_target_feature_detected!("ssse3") { + Some(SSSE3VectorBuilder(())) + } else { + None + } + } + + /// Create a new u8x16 SSSE3 vector where all of the bytes are set to + /// the given value. + #[inline] + pub fn u8x16_splat(self, n: u8) -> u8x16 { + // Safe because we know SSSE3 is enabled. + unsafe { u8x16::splat(n) } + } + + /// Load 16 bytes from the given slice, with bounds checks. + #[inline] + pub fn u8x16_load_unaligned(self, slice: &[u8]) -> u8x16 { + // Safe because we know SSSE3 is enabled. + unsafe { u8x16::load_unaligned(slice) } + } + + /// Load 16 bytes from the given slice, without bounds checks. 
+ #[inline] + pub unsafe fn u8x16_load_unchecked_unaligned(self, slice: &[u8]) -> u8x16 { + // Safe because we know SSSE3 is enabled, but still unsafe + // because we aren't doing bounds checks. + u8x16::load_unchecked_unaligned(slice) + } + + /// Load 16 bytes from the given slice, with bound and alignment checks. + #[inline] + pub fn u8x16_load(self, slice: &[u8]) -> u8x16 { + // Safe because we know SSSE3 is enabled. + unsafe { u8x16::load(slice) } + } + + /// Load 16 bytes from the given slice, without bound or alignment checks. + #[inline] + pub unsafe fn u8x16_load_unchecked(self, slice: &[u8]) -> u8x16 { + // Safe because we know SSSE3 is enabled, but still unsafe + // because we aren't doing bounds checks. + u8x16::load_unchecked(slice) + } +} + +// We define our union with a macro so that our code continues to compile on +// Rust 1.12. +macro_rules! defunion { + () => { + /// A u8x16 is a 128-bit vector with 16 single-byte lanes. + /// + /// It provides a safe API that uses only SSE2 or SSSE3 instructions. + /// The only way for callers to construct a value of this type is + /// through the SSSE3VectorBuilder type, and the only way to get a + /// SSSE3VectorBuilder is if the `ssse3` target feature is enabled. + /// + /// Note that generally speaking, all uses of this type should get + /// inlined, otherwise you probably have a performance bug. + #[derive(Clone, Copy)] + #[allow(non_camel_case_types)] + pub union u8x16 { + vector: __m128i, + bytes: [u8; 16], + } + } +} + +defunion!(); + +impl u8x16 { + #[inline] + unsafe fn splat(n: u8) -> u8x16 { + u8x16 { vector: _mm_set1_epi8(n as i8) } + } + + #[inline] + unsafe fn load_unaligned(slice: &[u8]) -> u8x16 { + assert!(slice.len() >= 16); + u8x16::load_unchecked(slice) + } + + #[inline] + unsafe fn load_unchecked_unaligned(slice: &[u8]) -> u8x16 { + let v = _mm_loadu_si128(slice.as_ptr() as *const u8 as *const __m128i); + u8x16 { vector: v } + } + + #[inline] + unsafe fn load(slice: &[u8]) -> u8x16 { + assert!(slice.len() >= 16); + assert!(slice.as_ptr() as usize % 16 == 0); + u8x16::load_unchecked(slice) + } + + #[inline] + unsafe fn load_unchecked(slice: &[u8]) -> u8x16 { + let v = _mm_load_si128(slice.as_ptr() as *const u8 as *const __m128i); + u8x16 { vector: v } + } + + #[inline] + pub fn extract(self, i: usize) -> u8 { + // Safe because `bytes` is always accessible. + unsafe { self.bytes[i] } + } + + #[inline] + pub fn replace(&mut self, i: usize, byte: u8) { + // Safe because `bytes` is always accessible. + unsafe { self.bytes[i] = byte; } + } + + #[inline] + pub fn shuffle(self, indices: u8x16) -> u8x16 { + // Safe because we know SSSE3 is enabled. + unsafe { + u8x16 { vector: _mm_shuffle_epi8(self.vector, indices.vector) } + } + } + + #[inline] + pub fn ne(self, other: u8x16) -> u8x16 { + // Safe because we know SSSE3 is enabled. + unsafe { + let boolv = _mm_cmpeq_epi8(self.vector, other.vector); + let ones = _mm_set1_epi8(0xFF as u8 as i8); + u8x16 { vector: _mm_andnot_si128(boolv, ones) } + } + } + + #[inline] + pub fn and(self, other: u8x16) -> u8x16 { + // Safe because we know SSSE3 is enabled. + unsafe { + u8x16 { vector: _mm_and_si128(self.vector, other.vector) } + } + } + + #[inline] + pub fn movemask(self) -> u32 { + // Safe because we know SSSE3 is enabled. + unsafe { + _mm_movemask_epi8(self.vector) as u32 + } + } + + #[inline] + pub fn alignr_14(self, other: u8x16) -> u8x16 { + // Safe because we know SSSE3 is enabled. 
+ unsafe { + u8x16 { vector: _mm_alignr_epi8(self.vector, other.vector, 14) } + } + } + + #[inline] + pub fn alignr_15(self, other: u8x16) -> u8x16 { + // Safe because we know SSSE3 is enabled. + unsafe { + u8x16 { vector: _mm_alignr_epi8(self.vector, other.vector, 15) } + } + } + + #[inline] + pub fn bit_shift_right_4(self) -> u8x16 { + // Safe because we know SSSE3 is enabled. + unsafe { + u8x16 { vector: _mm_srli_epi16(self.vector, 4) } + } + } +} + +impl fmt::Debug for u8x16 { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + // Safe because `bytes` is always accessible. + unsafe { self.bytes.fmt(f) } + } +} From 64551965b4913ab994a045a46302a4d5b87e13e8 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 11 Mar 2018 22:05:31 -0400 Subject: [PATCH 3/6] teddy: port teddy searcher to AVX2 This commit adds a copy of the Teddy searcher that works on AVX2. We don't attempt to reuse any code between them just yet, and instead just copy & paste and tweak parts of it to work on 32 bytes instead of 16. (Some parts were trickier than others. For example, @jneem figured out how to nearly compensate for the lack of a real 256-bit bytewise PALIGNR instruction, which we borrow here.) Overall, AVX2 provides a nice bump in performance. --- build.rs | 1 + src/literal/mod.rs | 53 +++- src/literal/teddy_avx2/fallback.rs | 20 ++ src/literal/teddy_avx2/imp.rs | 467 +++++++++++++++++++++++++++++ src/literal/teddy_avx2/mod.rs | 16 + src/vector/avx2.rs | 195 ++++++++++++ src/vector/mod.rs | 2 + 7 files changed, 742 insertions(+), 12 deletions(-) create mode 100644 src/literal/teddy_avx2/fallback.rs create mode 100644 src/literal/teddy_avx2/imp.rs create mode 100644 src/literal/teddy_avx2/mod.rs create mode 100644 src/vector/avx2.rs diff --git a/build.rs b/build.rs index 645d5ec309..ad27991206 100644 --- a/build.rs +++ b/build.rs @@ -21,6 +21,7 @@ fn main() { if env::var_os("CARGO_CFG_REGEX_DISABLE_AUTO_OPTIMIZATIONS").is_none() { if version.contains("nightly") { println!("cargo:rustc-cfg=regex_runtime_teddy_ssse3"); + println!("cargo:rustc-cfg=regex_runtime_teddy_avx2"); } } } diff --git a/src/literal/mod.rs b/src/literal/mod.rs index 69be8eab01..de1bd5339b 100644 --- a/src/literal/mod.rs +++ b/src/literal/mod.rs @@ -16,8 +16,10 @@ use memchr::{memchr, memchr2, memchr3}; use syntax::hir::literal::{Literal, Literals}; use freqs::BYTE_FREQUENCIES; -use self::teddy_ssse3::Teddy; +use self::teddy_avx2::{Teddy as TeddyAVX2}; +use self::teddy_ssse3::{Teddy as TeddySSSE3}; +mod teddy_avx2; mod teddy_ssse3; /// A prefix extracted from a compiled regular expression. @@ -47,7 +49,10 @@ enum Matcher { AC(FullAcAutomaton), /// A simd accelerated multiple string matcher. Used only for a small /// number of small literals. - Teddy128(Teddy), + TeddySSSE3(TeddySSSE3), + /// A simd accelerated multiple string matcher. Used only for a small + /// number of small literals. This uses 256-bit vectors. 
+ TeddyAVX2(TeddyAVX2), } impl LiteralSearcher { @@ -98,7 +103,8 @@ impl LiteralSearcher { FreqyPacked(ref s) => s.find(haystack).map(|i| (i, i + s.len())), BoyerMoore(ref s) => s.find(haystack).map(|i| (i, i + s.len())), AC(ref aut) => aut.find(haystack).next().map(|m| (m.start, m.end)), - Teddy128(ref ted) => ted.find(haystack).map(|m| (m.start, m.end)), + TeddySSSE3(ref t) => t.find(haystack).map(|m| (m.start, m.end)), + TeddyAVX2(ref t) => t.find(haystack).map(|m| (m.start, m.end)), } } @@ -136,8 +142,11 @@ impl LiteralSearcher { Matcher::FreqyPacked(ref s) => LiteralIter::Single(&s.pat), Matcher::BoyerMoore(ref s) => LiteralIter::Single(&s.pattern), Matcher::AC(ref ac) => LiteralIter::AC(ac.patterns()), - Matcher::Teddy128(ref ted) => { - LiteralIter::Teddy128(ted.patterns()) + Matcher::TeddySSSE3(ref ted) => { + LiteralIter::TeddySSSE3(ted.patterns()) + } + Matcher::TeddyAVX2(ref ted) => { + LiteralIter::TeddyAVX2(ted.patterns()) } } } @@ -166,7 +175,8 @@ impl LiteralSearcher { FreqyPacked(_) => 1, BoyerMoore(_) => 1, AC(ref aut) => aut.len(), - Teddy128(ref ted) => ted.len(), + TeddySSSE3(ref ted) => ted.len(), + TeddyAVX2(ref ted) => ted.len(), } } @@ -179,7 +189,8 @@ impl LiteralSearcher { FreqyPacked(ref single) => single.approximate_size(), BoyerMoore(ref single) => single.approximate_size(), AC(ref aut) => aut.heap_bytes(), - Teddy128(ref ted) => ted.approximate_size(), + TeddySSSE3(ref ted) => ted.approximate_size(), + TeddyAVX2(ref ted) => ted.approximate_size(), } } } @@ -220,7 +231,15 @@ impl Matcher { } } let is_aho_corasick_fast = sset.dense.len() == 1 && sset.all_ascii; - if Teddy::available() && !is_aho_corasick_fast { + if TeddyAVX2::available() && !is_aho_corasick_fast { + const MAX_TEDDY_LITERALS: usize = 32; + if lits.literals().len() <= MAX_TEDDY_LITERALS { + if let Some(ted) = TeddyAVX2::new(lits) { + return Matcher::TeddyAVX2(ted); + } + } + } + if TeddySSSE3::available() && !is_aho_corasick_fast { // Only try Teddy if Aho-Corasick can't use memchr on an ASCII // byte. Also, in its current form, Teddy doesn't scale well to // lots of literals. @@ -232,8 +251,8 @@ impl Matcher { // negating the benefit of memchr. const MAX_TEDDY_LITERALS: usize = 32; if lits.literals().len() <= MAX_TEDDY_LITERALS { - if let Some(ted) = Teddy::new(lits) { - return Matcher::Teddy128(ted); + if let Some(ted) = TeddySSSE3::new(lits) { + return Matcher::TeddySSSE3(ted); } } // Fallthrough to ol' reliable Aho-Corasick... 
@@ -248,7 +267,8 @@ pub enum LiteralIter<'a> { Bytes(&'a [u8]), Single(&'a [u8]), AC(&'a [Literal]), - Teddy128(&'a [Vec]), + TeddySSSE3(&'a [Vec]), + TeddyAVX2(&'a [Vec]), } impl<'a> Iterator for LiteralIter<'a> { @@ -284,7 +304,16 @@ impl<'a> Iterator for LiteralIter<'a> { Some(&**next) } } - LiteralIter::Teddy128(ref mut lits) => { + LiteralIter::TeddySSSE3(ref mut lits) => { + if lits.is_empty() { + None + } else { + let next = &lits[0]; + *lits = &lits[1..]; + Some(&**next) + } + } + LiteralIter::TeddyAVX2(ref mut lits) => { if lits.is_empty() { None } else { diff --git a/src/literal/teddy_avx2/fallback.rs b/src/literal/teddy_avx2/fallback.rs new file mode 100644 index 0000000000..20524aabfe --- /dev/null +++ b/src/literal/teddy_avx2/fallback.rs @@ -0,0 +1,20 @@ +use syntax::hir::literal::Literals; + +#[derive(Debug, Clone)] +pub struct Teddy(()); + +#[derive(Debug, Clone)] +pub struct Match { + pub pat: usize, + pub start: usize, + pub end: usize, +} + +impl Teddy { + pub fn available() -> bool { false } + pub fn new(_pats: &Literals) -> Option { None } + pub fn patterns(&self) -> &[Vec] { &[] } + pub fn len(&self) -> usize { 0 } + pub fn approximate_size(&self) -> usize { 0 } + pub fn find(&self, _haystack: &[u8]) -> Option { None } +} diff --git a/src/literal/teddy_avx2/imp.rs b/src/literal/teddy_avx2/imp.rs new file mode 100644 index 0000000000..0bfc71cc91 --- /dev/null +++ b/src/literal/teddy_avx2/imp.rs @@ -0,0 +1,467 @@ +/*! +This is the Teddy searcher, but ported to AVX2. + +See the module comments in the SSSE3 Teddy searcher for a more in depth +explanation of how this algorithm works. For the most part, this port is +basically the same as the SSSE3 version, but using 256-bit vectors instead of +128-bit vectors, which increases throughput. +*/ + +use std::cmp; + +use aho_corasick::{Automaton, AcAutomaton, FullAcAutomaton}; +use syntax::hir::literal::Literals; + +use vector::avx2::{AVX2VectorBuilder, u8x32}; + +/// Corresponds to the number of bytes read at a time in the haystack. +const BLOCK_SIZE: usize = 32; + +/// Match reports match information. +#[derive(Debug, Clone)] +pub struct Match { + /// The index of the pattern that matched. The index is in correspondence + /// with the order of the patterns given at construction. + pub pat: usize, + /// The start byte offset of the match. + pub start: usize, + /// The end byte offset of the match. This is always `start + pat.len()`. + pub end: usize, +} + +/// A SIMD accelerated multi substring searcher. +#[derive(Debug, Clone)] +pub struct Teddy { + /// A builder for AVX2 empowered vectors. + vb: AVX2VectorBuilder, + /// A list of substrings to match. + pats: Vec>, + /// An Aho-Corasick automaton of the patterns. We use this when we need to + /// search pieces smaller than the Teddy block size. + ac: FullAcAutomaton>, + /// A set of 8 buckets. Each bucket corresponds to a single member of a + /// bitset. A bucket contains zero or more substrings. This is useful + /// when the number of substrings exceeds 8, since our bitsets cannot have + /// more than 8 members. + buckets: Vec>, + /// Our set of masks. There's one mask for each byte in the fingerprint. + masks: Masks, +} + +impl Teddy { + /// Returns true if and only if Teddy is supported on this platform. + /// + /// If this returns `false`, then `Teddy::new(...)` is guaranteed to + /// return `None`. + pub fn available() -> bool { + AVX2VectorBuilder::new().is_some() + } + + /// Create a new `Teddy` multi substring matcher. 
+ /// + /// If a `Teddy` matcher could not be created (e.g., `pats` is empty or has + /// an empty substring), then `None` is returned. + pub fn new(pats: &Literals) -> Option { + let vb = match AVX2VectorBuilder::new() { + None => return None, + Some(vb) => vb, + }; + if !Teddy::available() { + return None; + } + + let pats: Vec<_> = pats.literals().iter().map(|p|p.to_vec()).collect(); + let min_len = pats.iter().map(|p| p.len()).min().unwrap_or(0); + // Don't allow any empty patterns and require that we have at + // least one pattern. + if min_len < 1 { + return None; + } + // Pick the largest mask possible, but no larger than 3. + let nmasks = cmp::min(3, min_len); + let mut masks = Masks::new(vb, nmasks); + let mut buckets = vec![vec![]; 8]; + // Assign a substring to each bucket, and add the bucket's bitfield to + // the appropriate position in the mask. + for (pati, pat) in pats.iter().enumerate() { + let bucket = pati % 8; + buckets[bucket].push(pati); + masks.add(bucket as u8, pat); + } + Some(Teddy { + vb: vb, + pats: pats.to_vec(), + ac: AcAutomaton::new(pats.to_vec()).into_full(), + buckets: buckets, + masks: masks, + }) + } + + /// Returns all of the substrings matched by this `Teddy`. + pub fn patterns(&self) -> &[Vec] { + &self.pats + } + + /// Returns the number of substrings in this matcher. + pub fn len(&self) -> usize { + self.pats.len() + } + + /// Returns the approximate size on the heap used by this matcher. + pub fn approximate_size(&self) -> usize { + self.pats.iter().fold(0, |a, b| a + b.len()) + } + + /// Searches `haystack` for the substrings in this `Teddy`. If a match was + /// found, then it is returned. Otherwise, `None` is returned. + pub fn find(&self, haystack: &[u8]) -> Option { + // This is safe because the only way we can construct a Teddy type + // is if AVX2 is available. + unsafe { self.find_impl(haystack) } + } + + #[target_feature(enable = "avx2")] + unsafe fn find_impl(&self, haystack: &[u8]) -> Option { + // If our haystack is smaller than the block size, then fall back to + // a naive brute force search. + if haystack.is_empty() || haystack.len() < (BLOCK_SIZE + 2) { + return self.slow(haystack, 0); + } + match self.masks.len() { + 0 => None, + 1 => self.find1(haystack), + 2 => self.find2(haystack), + 3 => self.find3(haystack), + _ => unreachable!(), + } + } + + /// `find1` is used when there is only 1 mask. This is the easy case and is + /// pretty much as described in the module documentation. + #[inline(always)] + fn find1(&self, haystack: &[u8]) -> Option { + let mut pos = 0; + let zero = self.vb.u8x32_splat(0); + let len = haystack.len(); + debug_assert!(len >= BLOCK_SIZE); + while pos <= len - BLOCK_SIZE { + let h = unsafe { + // I tried and failed to eliminate bounds checks in safe code. + // This is safe because of our loop invariant: pos is always + // <= len-32. + let p = haystack.get_unchecked(pos..); + self.vb.u8x32_load_unchecked_unaligned(p) + }; + // N.B. `res0` is our `C` in the module documentation. + let res0 = self.masks.members1(h); + // Only do expensive verification if there are any non-zero bits. + let bitfield = res0.ne(zero).movemask(); + if bitfield != 0 { + if let Some(m) = self.verify(haystack, pos, res0, bitfield) { + return Some(m); + } + } + pos += BLOCK_SIZE; + } + self.slow(haystack, pos) + } + + /// `find2` is used when there are 2 masks, e.g., the fingerprint is 2 bytes + /// long. 
+ #[inline(always)] + fn find2(&self, haystack: &[u8]) -> Option { + // This is an exotic way to right shift a SIMD vector across lanes. + // See below at use for more details. + let zero = self.vb.u8x32_splat(0); + let len = haystack.len(); + // The previous value of `C` (from the module documentation) for the + // *first* byte in the fingerprint. On subsequent iterations, we take + // the last bitset from the previous `C` and insert it into the first + // position of the current `C`, shifting all other bitsets to the right + // one lane. This causes `C` for the first byte to line up with `C` for + // the second byte, so that they can be `AND`'d together. + let mut prev0 = self.vb.u8x32_splat(0xFF); + let mut pos = 1; + debug_assert!(len >= BLOCK_SIZE); + while pos <= len - BLOCK_SIZE { + let h = unsafe { + // I tried and failed to eliminate bounds checks in safe code. + // This is safe because of our loop invariant: pos is always + // <= len-32. + let p = haystack.get_unchecked(pos..); + self.vb.u8x32_load_unchecked_unaligned(p) + }; + let (res0, res1) = self.masks.members2(h); + + // Do this: + // + // (prev0 << 15) | (res0 >> 1) + // + // This lets us line up our C values for each byte. + let res0prev0 = res0.alignr_15(prev0); + + // `AND`'s our `C` values together. + let res = res0prev0.and(res1); + prev0 = res0; + + let bitfield = res.ne(zero).movemask(); + if bitfield != 0 { + let pos = pos.checked_sub(1).unwrap(); + if let Some(m) = self.verify(haystack, pos, res, bitfield) { + return Some(m); + } + } + pos += BLOCK_SIZE; + } + // The windowing above doesn't check the last byte in the last + // window, so start the slow search at the last byte of the last + // window. + self.slow(haystack, pos.checked_sub(1).unwrap()) + } + + /// `find3` is used when there are 3 masks, e.g., the fingerprint is 3 bytes + /// long. + /// + /// N.B. This is a straight-forward extrapolation of `find2`. The only + /// difference is that we need to keep track of two previous values of `C`, + /// since we now need to align for three bytes. + #[inline(always)] + fn find3(&self, haystack: &[u8]) -> Option { + let zero = self.vb.u8x32_splat(0); + let len = haystack.len(); + let mut prev0 = self.vb.u8x32_splat(0xFF); + let mut prev1 = self.vb.u8x32_splat(0xFF); + let mut pos = 2; + + while pos <= len - BLOCK_SIZE { + let h = unsafe { + // I tried and failed to eliminate bounds checks in safe code. + // This is safe because of our loop invariant: pos is always + // <= len-32. + let p = haystack.get_unchecked(pos..); + self.vb.u8x32_load_unchecked_unaligned(p) + }; + let (res0, res1, res2) = self.masks.members3(h); + + let res0prev0 = res0.alignr_14(prev0); + let res1prev1 = res1.alignr_15(prev1); + let res = res0prev0.and(res1prev1).and(res2); + + prev0 = res0; + prev1 = res1; + + let bitfield = res.ne(zero).movemask(); + if bitfield != 0 { + let pos = pos.checked_sub(2).unwrap(); + if let Some(m) = self.verify(haystack, pos, res, bitfield) { + return Some(m); + } + } + pos += BLOCK_SIZE; + } + // The windowing above doesn't check the last two bytes in the last + // window, so start the slow search at the penultimate byte of the + // last window. + // self.slow(haystack, pos.saturating_sub(2)) + self.slow(haystack, pos.checked_sub(2).unwrap()) + } + + /// Runs the verification procedure on `res` (i.e., `C` from the module + /// documentation), where the haystack block starts at `pos` in + /// `haystack`. `bitfield` has ones in the bit positions that `res` has + /// non-zero bytes. 
+ /// + /// If a match exists, it returns the first one. + #[inline(always)] + fn verify( + &self, + haystack: &[u8], + pos: usize, + res: u8x32, + mut bitfield: u32, + ) -> Option { + while bitfield != 0 { + // The next offset, relative to pos, where some fingerprint + // matched. + let byte_pos = bitfield.trailing_zeros() as usize; + bitfield &= !(1 << byte_pos); + + // Offset relative to the beginning of the haystack. + let start = pos + byte_pos; + + // The bitfield telling us which patterns had fingerprints that + // match at this starting position. + let mut patterns = res.extract(byte_pos); + while patterns != 0 { + let bucket = patterns.trailing_zeros() as usize; + patterns &= !(1 << bucket); + + // Actual substring search verification. + if let Some(m) = self.verify_bucket(haystack, bucket, start) { + return Some(m); + } + } + } + + None + } + + /// Verifies whether any substring in the given bucket matches in haystack + /// at the given starting position. + #[inline(always)] + fn verify_bucket( + &self, + haystack: &[u8], + bucket: usize, + start: usize, + ) -> Option { + // This cycles through the patterns in the bucket in the order that + // the patterns were given. Therefore, we guarantee leftmost-first + // semantics. + for &pati in &self.buckets[bucket] { + let pat = &*self.pats[pati]; + if start + pat.len() > haystack.len() { + continue; + } + if pat == &haystack[start..start + pat.len()] { + return Some(Match { + pat: pati, + start: start, + end: start + pat.len(), + }); + } + } + None + } + + /// Slow substring search through all patterns in this matcher. + /// + /// This is used when we don't have enough bytes in the haystack for our + /// block based approach. + #[inline(never)] + fn slow(&self, haystack: &[u8], pos: usize) -> Option { + self.ac.find(&haystack[pos..]).next().map(|m| { + Match { + pat: m.pati, + start: pos + m.start, + end: pos + m.end, + } + }) + } +} + +/// A list of masks. This has length equal to the length of the fingerprint. +/// The length of the fingerprint is always `min(3, len(smallest_substring))`. +#[derive(Debug, Clone)] +struct Masks { + vb: AVX2VectorBuilder, + masks: [Mask; 3], + size: usize, +} + +impl Masks { + /// Create a new set of masks of size `n`, where `n` corresponds to the + /// number of bytes in a fingerprint. + fn new(vb: AVX2VectorBuilder, n: usize) -> Masks { + Masks { + vb: vb, + masks: [Mask::new(vb), Mask::new(vb), Mask::new(vb)], + size: n, + } + } + + /// Returns the number of masks. + fn len(&self) -> usize { + self.size + } + + /// Adds the given pattern to the given bucket. The bucket should be a + /// power of `2 <= 2^7`. + fn add(&mut self, bucket: u8, pat: &[u8]) { + for i in 0..self.len() { + self.masks[i].add(bucket, pat[i]); + } + } + + /// Finds the fingerprints that are in the given haystack block. i.e., this + /// returns `C` as described in the module documentation. + /// + /// More specifically, `for i in 0..16` and `j in 0..8, C[i][j] == 1` if and + /// only if `haystack_block[i]` corresponds to a fingerprint that is part + /// of a pattern in bucket `j`. + #[inline(always)] + fn members1(&self, haystack_block: u8x32) -> u8x32 { + let masklo = self.vb.u8x32_splat(0xF); + let hlo = haystack_block.and(masklo); + let hhi = haystack_block.bit_shift_right_4().and(masklo); + + self.masks[0].lo.shuffle(hlo).and(self.masks[0].hi.shuffle(hhi)) + } + + /// Like members1, but computes C for the first and second bytes in the + /// fingerprint. 
+ #[inline(always)] + fn members2(&self, haystack_block: u8x32) -> (u8x32, u8x32) { + let masklo = self.vb.u8x32_splat(0xF); + let hlo = haystack_block.and(masklo); + let hhi = haystack_block.bit_shift_right_4().and(masklo); + + let res0 = + self.masks[0].lo.shuffle(hlo).and(self.masks[0].hi.shuffle(hhi)); + let res1 = + self.masks[1].lo.shuffle(hlo).and(self.masks[1].hi.shuffle(hhi)); + (res0, res1) + } + + /// Like `members1`, but computes `C` for the first, second and third bytes + /// in the fingerprint. + #[inline(always)] + fn members3(&self, haystack_block: u8x32) -> (u8x32, u8x32, u8x32) { + let masklo = self.vb.u8x32_splat(0xF); + let hlo = haystack_block.and(masklo); + let hhi = haystack_block.bit_shift_right_4().and(masklo); + + let res0 = + self.masks[0].lo.shuffle(hlo).and(self.masks[0].hi.shuffle(hhi)); + let res1 = + self.masks[1].lo.shuffle(hlo).and(self.masks[1].hi.shuffle(hhi)); + let res2 = + self.masks[2].lo.shuffle(hlo).and(self.masks[2].hi.shuffle(hhi)); + (res0, res1, res2) + } +} + +/// A single mask. +#[derive(Debug, Clone, Copy)] +struct Mask { + /// Bitsets for the low nybbles in a fingerprint. + lo: u8x32, + /// Bitsets for the high nybbles in a fingerprint. + hi: u8x32, +} + +impl Mask { + /// Create a new mask with no members. + fn new(vb: AVX2VectorBuilder) -> Mask { + Mask { + lo: vb.u8x32_splat(0), + hi: vb.u8x32_splat(0), + } + } + + /// Adds the given byte to the given bucket. + fn add(&mut self, bucket: u8, byte: u8) { + // Split our byte into two nybbles, and add each nybble to our + // mask. + let byte_lo = (byte & 0xF) as usize; + let byte_hi = (byte >> 4) as usize; + + let lo = self.lo.extract(byte_lo) | ((1 << bucket) as u8); + self.lo.replace(byte_lo, lo); + self.lo.replace(byte_lo + 16, lo); + + let hi = self.hi.extract(byte_hi) | ((1 << bucket) as u8); + self.hi.replace(byte_hi, hi); + self.hi.replace(byte_hi + 16, hi); + } +} diff --git a/src/literal/teddy_avx2/mod.rs b/src/literal/teddy_avx2/mod.rs new file mode 100644 index 0000000000..78b6e20e8a --- /dev/null +++ b/src/literal/teddy_avx2/mod.rs @@ -0,0 +1,16 @@ +pub use self::imp::*; + +#[cfg(all( + feature = "unstable", + regex_runtime_teddy_avx2, + any(target_arch = "x86_64"), +))] +mod imp; + +#[cfg(not(all( + feature = "unstable", + regex_runtime_teddy_avx2, + any(target_arch = "x86_64"), +)))] +#[path = "fallback.rs"] +mod imp; diff --git a/src/vector/avx2.rs b/src/vector/avx2.rs new file mode 100644 index 0000000000..f178899844 --- /dev/null +++ b/src/vector/avx2.rs @@ -0,0 +1,195 @@ +#![allow(dead_code)] + +use std::arch::x86_64::*; +use std::fmt; + +#[derive(Clone, Copy, Debug)] +pub struct AVX2VectorBuilder(()); + +impl AVX2VectorBuilder { + pub fn new() -> Option { + if is_target_feature_detected!("avx2") { + Some(AVX2VectorBuilder(())) + } else { + None + } + } + + /// Create a new u8x32 AVX2 vector where all of the bytes are set to + /// the given value. + #[inline] + pub fn u8x32_splat(self, n: u8) -> u8x32 { + // Safe because we know AVX2 is enabled. + unsafe { u8x32::splat(n) } + } + + /// Load 32 bytes from the given slice, with bounds checks. + #[inline] + pub fn u8x32_load_unaligned(self, slice: &[u8]) -> u8x32 { + // Safe because we know AVX2 is enabled. + unsafe { u8x32::load_unaligned(slice) } + } + + /// Load 32 bytes from the given slice, without bounds checks. + #[inline] + pub unsafe fn u8x32_load_unchecked_unaligned(self, slice: &[u8]) -> u8x32 { + // Safe because we know AVX2 is enabled, but still unsafe + // because we aren't doing bounds checks. 
+ u8x32::load_unchecked_unaligned(slice) + } + + /// Load 32 bytes from the given slice, with bound and alignment checks. + #[inline] + pub fn u8x32_load(self, slice: &[u8]) -> u8x32 { + // Safe because we know AVX2 is enabled. + unsafe { u8x32::load(slice) } + } + + /// Load 32 bytes from the given slice, without bound or alignment checks. + #[inline] + pub unsafe fn u8x32_load_unchecked(self, slice: &[u8]) -> u8x32 { + // Safe because we know AVX2 is enabled, but still unsafe + // because we aren't doing bounds checks. + u8x32::load_unchecked(slice) + } +} + +// We define our union with a macro so that our code continues to compile on +// Rust 1.12. +macro_rules! defunion { + () => { + #[derive(Clone, Copy)] + #[allow(non_camel_case_types)] + pub union u8x32 { + vector: __m256i, + bytes: [u8; 32], + } + } +} + +defunion!(); + +impl u8x32 { + #[inline] + unsafe fn splat(n: u8) -> u8x32 { + u8x32 { vector: _mm256_set1_epi8(n as i8) } + } + + #[inline] + unsafe fn load_unaligned(slice: &[u8]) -> u8x32 { + assert!(slice.len() >= 32); + u8x32::load_unchecked_unaligned(slice) + } + + #[inline] + unsafe fn load_unchecked_unaligned(slice: &[u8]) -> u8x32 { + let p = slice.as_ptr() as *const u8 as *const __m256i; + u8x32 { vector: _mm256_loadu_si256(p) } + } + + #[inline] + unsafe fn load(slice: &[u8]) -> u8x32 { + assert!(slice.len() >= 32); + assert!(slice.as_ptr() as usize % 32 == 0); + u8x32::load_unchecked(slice) + } + + #[inline] + unsafe fn load_unchecked(slice: &[u8]) -> u8x32 { + let p = slice.as_ptr() as *const u8 as *const __m256i; + u8x32 { vector: _mm256_load_si256(p) } + } + + #[inline] + pub fn extract(self, i: usize) -> u8 { + // Safe because `bytes` is always accessible. + unsafe { self.bytes[i] } + } + + #[inline] + pub fn replace(&mut self, i: usize, byte: u8) { + // Safe because `bytes` is always accessible. + unsafe { self.bytes[i] = byte; } + } + + #[inline] + pub fn shuffle(self, indices: u8x32) -> u8x32 { + // Safe because we know AVX2 is enabled. + unsafe { + u8x32 { vector: _mm256_shuffle_epi8(self.vector, indices.vector) } + } + } + + #[inline] + pub fn ne(self, other: u8x32) -> u8x32 { + // Safe because we know AVX2 is enabled. + unsafe { + let boolv = _mm256_cmpeq_epi8(self.vector, other.vector); + let ones = _mm256_set1_epi8(0xFF as u8 as i8); + u8x32 { vector: _mm256_andnot_si256(boolv, ones) } + } + } + + #[inline] + pub fn and(self, other: u8x32) -> u8x32 { + // Safe because we know AVX2 is enabled. + unsafe { + u8x32 { vector: _mm256_and_si256(self.vector, other.vector) } + } + } + + #[inline] + pub fn movemask(self) -> u32 { + // Safe because we know AVX2 is enabled. + unsafe { + _mm256_movemask_epi8(self.vector) as u32 + } + } + + #[inline] + pub fn alignr_14(self, other: u8x32) -> u8x32 { + // Safe because we know AVX2 is enabled. + unsafe { + // Credit goes to jneem for figuring this out: + // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184 + // + // TL;DR avx2's PALIGNR instruction is actually just two 128-bit + // PALIGNR instructions, which is not what we want, so we need to + // do some extra shuffling. + let v = _mm256_permute2x128_si256(other.vector, self.vector, 0x21); + let v = _mm256_alignr_epi8(self.vector, v, 14); + u8x32 { vector: v } + } + } + + #[inline] + pub fn alignr_15(self, other: u8x32) -> u8x32 { + // Safe because we know AVX2 is enabled. 
+ unsafe { + // Credit goes to jneem for figuring this out: + // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184 + // + // TL;DR avx2's PALIGNR instruction is actually just two 128-bit + // PALIGNR instructions, which is not what we want, so we need to + // do some extra shuffling. + let v = _mm256_permute2x128_si256(other.vector, self.vector, 0x21); + let v = _mm256_alignr_epi8(self.vector, v, 15); + u8x32 { vector: v } + } + } + + #[inline] + pub fn bit_shift_right_4(self) -> u8x32 { + // Safe because we know AVX2 is enabled. + unsafe { + u8x32 { vector: _mm256_srli_epi16(self.vector, 4) } + } + } +} + +impl fmt::Debug for u8x32 { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + // Safe because `bytes` is always accessible. + unsafe { self.bytes.fmt(f) } + } +} diff --git a/src/vector/mod.rs b/src/vector/mod.rs index 20409f6a81..880dbc1878 100644 --- a/src/vector/mod.rs +++ b/src/vector/mod.rs @@ -1,2 +1,4 @@ +#[cfg(target_arch = "x86_64")] +pub mod avx2; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub mod ssse3; From 02962dfdfd9e754b675e119ff54621f5237554c2 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 12 Mar 2018 20:56:49 -0400 Subject: [PATCH 4/6] bench: remove RUSTFLAGS We no longer need to enable SIMD optimizations at compile time. They are automatically enabled when regex is compiled with the `unstable` feature. --- bench/compile | 3 --- bench/run | 5 ----- 2 files changed, 8 deletions(-) diff --git a/bench/compile b/bench/compile index f1289dbcfe..698dbe3918 100755 --- a/bench/compile +++ b/bench/compile @@ -1,8 +1,5 @@ #!/bin/sh -# Enable SIMD. -export RUSTFLAGS="-C target-cpu=native" - exec cargo build \ --release \ --features 're-re2 re-onig re-pcre1 re-pcre2 re-rust re-rust-bytes re-tcl re-dphobos-dmd re-dphobos-ldc' \ diff --git a/bench/run b/bench/run index 7651115848..6c960e2f4d 100755 --- a/bench/run +++ b/bench/run @@ -9,11 +9,6 @@ if [ $# = 0 ] || [ $1 = '-h' ] || [ $1 = '--help' ]; then usage fi -# Enable SIMD, unless we're in CI, then we inherit RUSTLFAGS. -if [ -z "$TRAVIS_RUST_VERSION" ]; then - export RUSTFLAGS="-C target-cpu=native" -fi - which="$1" shift case $which in From 0a4027f07fe483f055e576f855bdba05e3bd71a4 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 12 Mar 2018 21:14:14 -0400 Subject: [PATCH 5/6] ci: remove RUSTFLAGS, enable unstable This removes our compile time SIMD flags and replaces them with the `unstable` feature, which will cause CI to use whatever CPU features are available. Ideally, we would test each important CPU feature combinations, but I'd like to avoid doing that in one CI job and instead split them out into separate CI jobs to keep CI times low. That requires more work. --- ci/after_success.sh | 1 - ci/run-kcov | 2 +- ci/script.sh | 8 +------- 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/ci/after_success.sh b/ci/after_success.sh index e44a0970fa..4e11477e70 100755 --- a/ci/after_success.sh +++ b/ci/after_success.sh @@ -9,7 +9,6 @@ if [ "$TRAVIS_RUST_VERSION" != "nightly" ] || [ "$TRAVIS_PULL_REQUEST" != "false exit 0 fi -export RUSTFLAGS="-C target-feature=+ssse3" env # Install kcov. 
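Aside: the nybble-mask technique that the new `Mask::add` and `members1`/`members2`/`members3` routines implement with VPSHUFB is easier to follow in scalar form. The sketch below is purely illustrative — it is not part of this patch, and every name in it is hypothetical:

```rust
// Scalar model of the Teddy nybble-mask lookup (illustrative only).
fn main() {
    // One 8-bit bitset per nybble value: bit `b` is set if some pattern in
    // bucket `b` has a fingerprint byte with that low/high nybble.
    let mut lo = [0u8; 16];
    let mut hi = [0u8; 16];

    // Mirrors `Mask::add`: put b'f' in bucket 0 and b'b' in bucket 1.
    for &(bucket, byte) in &[(0u8, b'f'), (1u8, b'b')] {
        lo[(byte & 0xF) as usize] |= 1 << bucket;
        hi[(byte >> 4) as usize] |= 1 << bucket;
    }

    // Mirrors `members1`: a haystack byte is a candidate for bucket `b` iff
    // bit `b` survives AND-ing the two lookups. The AVX2 code performs 32 of
    // these lookups in parallel with `shuffle` (VPSHUFB) on each block.
    for &byte in b"foobar" {
        let c = lo[(byte & 0xF) as usize] & hi[(byte >> 4) as usize];
        println!("{}: bucket bitset {:08b}", byte as char, c);
    }
}
```

Because only nybbles are compared, false positives are possible (any byte that shares both nybbles with a fingerprint byte will light up), which is why Teddy always confirms candidates against the actual patterns before reporting a match.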
diff --git a/ci/run-kcov b/ci/run-kcov index 0ef842c319..78f5b96b3a 100755 --- a/ci/run-kcov +++ b/ci/run-kcov @@ -28,7 +28,7 @@ while true; do esac done -cargo test --no-run --verbose --jobs 4 +cargo test --no-run --verbose --jobs 4 --features unstable for t in ${tests[@]}; do kcov \ --verify \ diff --git a/ci/script.sh b/ci/script.sh index 6f06aa2958..baaca969e6 100755 --- a/ci/script.sh +++ b/ci/script.sh @@ -4,12 +4,6 @@ set -ex -if [ "$TRAVIS_RUST_VERSION" = "nightly" ]; then - # We set this once so that all invocations share this setting. This should - # help with build times by avoiding excessive re-compiles. - export RUSTFLAGS="-C target-feature=+ssse3" -fi - # Builds the regex crate and runs tests. cargo build --verbose cargo doc --verbose @@ -25,7 +19,7 @@ fi # Run tests. If we have nightly, then enable our nightly features. if [ "$TRAVIS_RUST_VERSION" = "nightly" ]; then - cargo test --verbose --features 'simd-accel pattern' + cargo test --verbose --features unstable else cargo test --verbose fi From 75055b6a1d97daa8af8a5806ec905edca31476de Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 12 Mar 2018 21:17:37 -0400 Subject: [PATCH 6/6] doc: note the new `unstable` feature --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index 91dd968c0b..c61ba4a644 100644 --- a/README.md +++ b/README.md @@ -188,6 +188,16 @@ assert!(!matches.matched(5)); assert!(matches.matched(6)); ``` +### Usage: enable SIMD optimizations + +This crate provides an `unstable` feature that can only be enabled on nightly +Rust. When this feature is enabled, the regex crate will use SIMD optimizations +if your CPU supports them. No additional compile time flags are required; the +regex crate will detect your CPU support at runtime. + +When `std::arch` becomes stable, then these optimizations will be enabled +automatically. + ### Usage: a regular expression parser
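For downstream users, opting in is a one-line change in `Cargo.toml`. This is a sketch only — the version number below is a placeholder, and per the README text above a nightly toolchain is required:

```toml
[dependencies]
# Placeholder version; the `unstable` feature turns on the nightly-only
# SIMD optimizations, with CPU support detected at runtime.
regex = { version = "0.2", features = ["unstable"] }
```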