diff --git a/README.md b/README.md
index 36424bf..8cc6e1d 100644
--- a/README.md
+++ b/README.md
@@ -103,6 +103,8 @@ For more details, please see [the source code for these benchmarks](benches/vari
 
 ## TODO
 * Encoding multiple values at once
+* Faster decode for two `u64` values with AVX2 (currently fairly slow)
+* Parallel ZigZag decode/encode
 * Support for ARM NEON
 * Fallback scalar implementation
 * Further optimization (I'm pretty sure I left some performance on the table)
diff --git a/benches/varint_bench/main.rs b/benches/varint_bench/main.rs
index 1b6276a..64f9fb4 100644
--- a/benches/varint_bench/main.rs
+++ b/benches/varint_bench/main.rs
@@ -481,6 +481,7 @@ pub fn criterion_benchmark(c: &mut Criterion) {
         )
     });
 
+    group.throughput(Throughput::Elements(2));
     group.bench_function("varint-simd/2x_wide/unsafe", |b| {
         b.iter_batched_ref(
             create_double_encoded_generator_wide::<u64, u64, _>(&mut rng),
diff --git a/src/decode/mod.rs b/src/decode/mod.rs
index 596ac31..c0cf607 100644
--- a/src/decode/mod.rs
+++ b/src/decode/mod.rs
@@ -315,9 +315,9 @@ unsafe fn dual_u32_stage2(comb: __m128i) -> __m128i {
     )
 }
 
-/// Decode two adjacent varints simultaneously from the input pointer. Requires AVX2. Allows for
-/// decoding a pair of `u64` values. For smaller values, the non-wide variation of this function
-/// will probably be faster.
+/// **Experimental. May have relatively poor performance.** Decode two adjacent varints
+/// simultaneously from the input pointer. Requires AVX2. Allows for decoding a pair of `u64`
+/// values. For smaller values, the non-wide variant of this function will probably be faster.
 ///
 /// Returns a tuple containing the two decoded values and the two lengths of bytes read for each
 /// value.