This post is made after further investigating odd performance behavior I mentioned in a previous post here.
Basically, an x86_64 AVX2 vertical addition sometimes seems to take more processor cycles than the equivalent pair of SSE2 additions, as measured using the x86 rdtsc instruction.
Here's my source code/test case:
use core::mem::transmute;
use core::sync::atomic;
use core::hint::black_box;
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
/// Times one 256-bit AVX2 byte-wise addition against two 128-bit SSE2 byte-wise
/// additions over the same 32 bytes of input, using `_rdtsc` readings taken
/// immediately before and after each variant, and prints both results together
/// with the difference between the two counter readings.
///
/// Each variant is executed and timed exactly once per test run.
///
/// # Panics
///
/// Panics if AVX2 is not detected on the running CPU.
///
// NOTE(review): the printed value is the raw difference of two `_rdtsc` readings;
// whether one timestamp-counter tick equals one core clock cycle on a given CPU
// is not established by this code - confirm before reading it as "cycles".
// NOTE(review): this function is not annotated with
// `#[target_feature(enable = "avx2")]`; presumably that can keep the AVX2
// intrinsic from being inlined into the timed region - verify in the generated
// assembly.
#[test]
#[inline(never)]
pub fn rdtsc_avx2_vs_sse2() {
// Runtime guard: refuse to run the AVX2 intrinsic on a CPU that lacks it.
assert!(std::is_x86_feature_detected!("avx2"));
// SAFETY: AVX2 availability is asserted above, and every `transmute` below is
// between plain 32-byte values of equal size. The SSE2 intrinsics are assumed
// to be available whenever AVX2 is - TODO confirm for 32-bit x86 targets.
unsafe {
// Operand `a`: 32 zero bytes, as one 256-bit vector and as two 128-bit halves.
// `black_box` hides the constant values from the optimizer.
let a_256: __m256i = black_box(transmute([0u8; 32]));
let a_128s: [__m128i; 2] = black_box(transmute(a_256));
// Operand `b`: 32 bytes of value 1, in the same two representations.
let b_256: __m256i = black_box(transmute([1u8; 32]));
let b_128s: [__m128i; 2] = black_box(transmute(b_256));
// --- AVX2 measurement: a single 256-bit add between two counter reads ---
// NOTE(review): `atomic::fence` is a memory-ordering fence; whether it also
// stops the CPU from reordering `rdtsc` relative to the SIMD instruction is not
// shown here - confirm against the vendor's guidance for timing with `rdtsc`.
atomic::fence(atomic::Ordering::SeqCst);
let start_avx2 = _rdtsc();
atomic::fence(atomic::Ordering::SeqCst);
let result_avx2 = _mm256_add_epi8(a_256, b_256);
atomic::fence(atomic::Ordering::SeqCst);
let end_avx2 = _rdtsc();
atomic::fence(atomic::Ordering::SeqCst);
// Printing the result also keeps the addition from being discarded as unused.
std::println!("AVX2 version got {:?} and took {} CPU cycles", transmute::<_, [u8; 32]>(result_avx2), end_avx2 - start_avx2);
// --- SSE2 measurement: two 128-bit adds (low half, then high half) ---
// This always runs after the AVX2 measurement and its `println!`, so the two
// timings are not taken under identical preceding conditions.
atomic::fence(atomic::Ordering::SeqCst);
let start_sse2 = _rdtsc();
atomic::fence(atomic::Ordering::SeqCst);
let result_sse2 = [_mm_add_epi8(a_128s[0], b_128s[0]), _mm_add_epi8(a_128s[1], b_128s[1])];
atomic::fence(atomic::Ordering::SeqCst);
let end_sse2 = _rdtsc();
atomic::fence(atomic::Ordering::SeqCst);
std::println!("SSE2 version got {:?} and took {} CPU cycles", transmute::<_, [u8; 32]>(result_sse2), end_sse2 - start_sse2);
}
}
And here are some outputs from my machine (in release mode):
> cargo test --release rdtsc_avx2_vs_sse2 -- --nocapture
Finished `release` profile [optimized] target(s) in 0.04s
Running unittests src\lib.rs (target\release\deps\x86_simd-ef4220bf1b194345.exe)
running 0 tests
test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured; 8 filtered out; finished in 0.00s
Running tests\rdtsc_avx2_vs_sse2.rs (target\release\deps\rdtsc_avx2_vs_sse2-d7b94e7d902646c6.exe)
running 1 test
AVX2 version got [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] and took 576 CPU cycles
SSE2 version got [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] and took 216 CPU cycles
test rdtsc_avx2_vs_sse2 ... ok
test result: ok. 1 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s
PS C:\Users\alfri\Documents\Projects\x86-simd> cargo test --release rdtsc_avx2_vs_sse2 -- --nocapture
Finished `release` profile [optimized] target(s) in 0.05s
Running unittests src\lib.rs (target\release\deps\x86_simd-ef4220bf1b194345.exe)
running 0 tests
test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured; 8 filtered out; finished in 0.00s
Running tests\rdtsc_avx2_vs_sse2.rs (target\release\deps\rdtsc_avx2_vs_sse2-d7b94e7d902646c6.exe)
running 1 test
AVX2 version got [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] and took 612 CPU cycles
SSE2 version got [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] and took 180 CPU cycles
test rdtsc_avx2_vs_sse2 ... ok
test result: ok. 1 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s
PS C:\Users\alfri\Documents\Projects\x86-simd> cargo test --release rdtsc_avx2_vs_sse2 -- --nocapture
Finished `release` profile [optimized] target(s) in 0.04s
Running unittests src\lib.rs (target\release\deps\x86_simd-ef4220bf1b194345.exe)
running 0 tests
test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured; 8 filtered out; finished in 0.00s
Running tests\rdtsc_avx2_vs_sse2.rs (target\release\deps\rdtsc_avx2_vs_sse2-d7b94e7d902646c6.exe)
running 1 test
AVX2 version got [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] and took 216 CPU cycles
SSE2 version got [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] and took 504 CPU cycles
test rdtsc_avx2_vs_sse2 ... ok
test result: ok. 1 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s
PS C:\Users\alfri\Documents\Projects\x86-simd> cargo test --release rdtsc_avx2_vs_sse2 -- --nocapture
Finished `release` profile [optimized] target(s) in 0.04s
Running unittests src\lib.rs (target\release\deps\x86_simd-ef4220bf1b194345.exe)
running 0 tests
test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured; 8 filtered out; finished in 0.00s
Running tests\rdtsc_avx2_vs_sse2.rs (target\release\deps\rdtsc_avx2_vs_sse2-d7b94e7d902646c6.exe)
running 1 test
AVX2 version got [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] and took 216 CPU cycles
SSE2 version got [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] and took 576 CPU cycles
test rdtsc_avx2_vs_sse2 ... ok
test result: ok. 1 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s
PS C:\Users\alfri\Documents\Projects\x86-simd> cargo test --release rdtsc_avx2_vs_sse2 -- --nocapture
Finished `release` profile [optimized] target(s) in 0.04s
Running unittests src\lib.rs (target\release\deps\x86_simd-ef4220bf1b194345.exe)
running 0 tests
test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured; 8 filtered out; finished in 0.00s
Running tests\rdtsc_avx2_vs_sse2.rs (target\release\deps\rdtsc_avx2_vs_sse2-d7b94e7d902646c6.exe)
running 1 test
AVX2 version got [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] and took 900 CPU cycles
SSE2 version got [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] and took 216 CPU cycles
test rdtsc_avx2_vs_sse2 ... ok
test result: ok. 1 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s
PS C:\Users\alfri\Documents\Projects\x86-simd> cargo test --release rdtsc_avx2_vs_sse2 -- --nocapture
Finished `release` profile [optimized] target(s) in 0.04s
Running unittests src\lib.rs (target\release\deps\x86_simd-ef4220bf1b194345.exe)
running 0 tests
test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured; 8 filtered out; finished in 0.00s
Running tests\rdtsc_avx2_vs_sse2.rs (target\release\deps\rdtsc_avx2_vs_sse2-d7b94e7d902646c6.exe)
running 1 test
AVX2 version got [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] and took 252 CPU cycles
SSE2 version got [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] and took 576 CPU cycles
test rdtsc_avx2_vs_sse2 ... ok
test result: ok. 1 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s
PS C:\Users\alfri\Documents\Projects\x86-simd> cargo test --release rdtsc_avx2_vs_sse2 -- --nocapture
Finished `release` profile [optimized] target(s) in 0.04s
Running unittests src\lib.rs (target\release\deps\x86_simd-ef4220bf1b194345.exe)
running 0 tests
test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured; 8 filtered out; finished in 0.00s
Running tests\rdtsc_avx2_vs_sse2.rs (target\release\deps\rdtsc_avx2_vs_sse2-d7b94e7d902646c6.exe)
running 1 test
AVX2 version got [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] and took 864 CPU cycles
SSE2 version got [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] and took 216 CPU cycles
test rdtsc_avx2_vs_sse2 ... ok
test result: ok. 1 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s
PS C:\Users\alfri\Documents\Projects\x86-simd> cargo test --release rdtsc_avx2_vs_sse2 -- --nocapture
Finished `release` profile [optimized] target(s) in 0.04s
Running unittests src\lib.rs (target\release\deps\x86_simd-ef4220bf1b194345.exe)
running 0 tests
test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured; 8 filtered out; finished in 0.00s
Running tests\rdtsc_avx2_vs_sse2.rs (target\release\deps\rdtsc_avx2_vs_sse2-d7b94e7d902646c6.exe)
running 1 test
AVX2 version got [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] and took 864 CPU cycles
SSE2 version got [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] and took 180 CPU cycles
test rdtsc_avx2_vs_sse2 ... ok
test result: ok. 1 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s
Is it just my specific chip? I have an AMD Ryzen 5 3600 (in the zen2 line).
It just doesn't make sense to me that a single AVX2 call would seemingly randomly take 800 cycles when 2 SSE2 calls almost always take around 200.