Before I tried to optimize it, the code was:
/// Applies one round update to the working registers using scalar wrapping arithmetic.
///
/// `rd`, `rh`, `rb` and `rf` are overwritten in place; `ra`, `rc`, `rg`, `re`, `w1` and `w2`
/// are only read. `j` is the index used to read the table `T`.
fn round_one_phase1(
    ra: &mut u32,
    rb: &mut u32,
    rc: &mut u32,
    rd: &mut u32,
    re: u32,
    rf: &mut u32,
    rg: &mut u32,
    rh: &mut u32,
    j: usize,
    w1: &mut u32,
    w2: &mut u32,
) {
    // Two intermediate words shared by both register updates below.
    let rot_a = ra.rotate_left(12);
    let mix1 = rot_a.wrapping_add(re).wrapping_add(T[j]).rotate_left(7);
    let mix2 = rot_a ^ mix1;

    // New `rd`: ff1(a, b, c) + d + mix2 + w2, all modulo 2^32.
    let d_acc = ff1(*ra, *rb, *rc).wrapping_add(*rd);
    *rd = d_acc.wrapping_add(mix2).wrapping_add(*w2);

    // Pre-permutation `rh`: gg1(e, f, g) + h + mix1 + w1, all modulo 2^32.
    // `rf` must be read here, before it is rotated below.
    let h_acc = gg1(re, *rf, *rg).wrapping_add(*rh);
    let h_sum = h_acc.wrapping_add(mix1).wrapping_add(*w1);

    *rb = rb.rotate_left(9);
    *rf = rf.rotate_left(19);
    *rh = p0(h_sum);
}
I tried to reduce the number of additions by using SIMD instructions. Here is my code:
/// Applies one round update to the working registers, doing the additions with
/// 128-bit integer SIMD intrinsics.
///
/// `rd`, `rh`, `rb` and `rf` are overwritten in place; `ra`, `rc`, `rg`, `re`, `w1` and `w2`
/// are only read. `j` is the index used to read the table `T`.
///
/// NOTE(review): this variant was measured as slower than the scalar one. Packing eight
/// scalars into vectors and extracting lanes again likely costs more than the few scalar
/// additions it replaces; confirm with a profiler or by inspecting the generated assembly.
fn round_one_phase1(
    ra: &mut u32,
    rb: &mut u32,
    rc: &mut u32,
    rd: &mut u32,
    re: u32,
    rf: &mut u32,
    rg: &mut u32,
    rh: &mut u32,
    j: usize,
    w1: &mut u32,
    w2: &mut u32,
) {
    // Two intermediate words shared by both register updates below.
    let rot_a = ra.rotate_left(12);
    let mix1 = rot_a.wrapping_add(re).wrapping_add(T[j]).rotate_left(7);
    let mix2 = rot_a ^ mix1;

    // SAFETY: these intrinsics only operate on register values and touch no memory.
    // They require SSE2 / SSE4.1 support on the executing CPU; the build is assumed to
    // enable those features (e.g. via `-Ctarget-feature`) for a machine that has them.
    let (d_new, h_sum) = unsafe {
        // Lanes: [ff1(..), gg1(..), mix2, mix1] + [rd, rh, w2, w1].
        let lhs = _mm_setr_epi32(
            ff1(*ra, *rb, *rc) as i32,
            gg1(re, *rf, *rg) as i32,
            mix2 as i32,
            mix1 as i32,
        );
        let rhs = _mm_setr_epi32(*rd as i32, *rh as i32, *w2 as i32, *w1 as i32);
        let pair_sums = _mm_add_epi32(lhs, rhs);

        // Fold lanes 2 and 3 onto lanes 0 and 1 so each of the two low lanes
        // holds the full four-term sum.
        let low_half = _mm_setr_epi32(
            _mm_extract_epi32::<0>(pair_sums),
            _mm_extract_epi32::<1>(pair_sums),
            0,
            0,
        );
        let high_half = _mm_setr_epi32(
            _mm_extract_epi32::<2>(pair_sums),
            _mm_extract_epi32::<3>(pair_sums),
            0,
            0,
        );
        let totals = _mm_add_epi32(low_half, high_half);

        (
            _mm_extract_epi32::<0>(totals) as u32,
            _mm_extract_epi32::<1>(totals) as u32,
        )
    };

    *rd = d_new;
    *rb = rb.rotate_left(9);
    *rf = rf.rotate_left(19);
    *rh = p0(h_sum);
}
Then I ran the test with the command below.
RUSTFLAGS=-Ctarget-feature=+avx2,+sse2,+sse4.1 cargo test --release sm3_time_test -p ylong_sm3 -- --nocapture
Before I changed the code, the time was 1543.606.
After I changed the code, the time was 1860.
Why does the SIMD version take longer?
I'm very confused.
Can anyone give me some advice?
Thanks!