Hi!
I want to implement convolution in Rust, but my implementation is very slow compared to the equivalent C code.
The Rust source code:
/// Computes the sliding dot product of real-valued `sample` with real-valued `coeff`.
///
/// Output element `i` is `sample[i] * coeff[0] + sample[i + 1] * coeff[1] + ...`, accumulated
/// in `f32` in increasing index order starting from `0.0`. `coeff` is applied as given; it is
/// not reversed.
///
/// # Returns
///
/// A vector of `sample.len() - coeff.len() + 1` values, one per position at which `coeff` fits
/// entirely inside `sample`. Two edge cases:
///
/// * `coeff` longer than `sample`: there is no such position, so the result is empty.
/// * `coeff` empty: every sum is empty, so the result is `sample.len() + 1` zeros.
fn re_re_conv(sample: &[f32], coeff: &[f32]) -> Vec<f32> {
    if coeff.is_empty() {
        // `windows(0)` panics, so the zero-tap case is answered directly.
        return vec![0.0; sample.len() + 1];
    }

    // `windows` yields nothing when the slice is shorter than the window, so there is no
    // `usize` subtraction that could underflow. Iterating window/coefficient pairs also
    // avoids the manual `sample[i + j]` indexing.
    sample
        .windows(coeff.len())
        .map(|window| {
            window
                .iter()
                .zip(coeff)
                .fold(0.0_f32, |acc, (&s, &c)| acc + s * c)
        })
        .collect()
}
/// Number of input samples used by the benchmark (twenty million).
const SAMPLELEN: usize = 20 * 1_000 * 1_000;
/// Number of filter coefficients used by the benchmark.
const COEFFLEN: usize = 5 * 100;
/// Benchmark driver: runs `re_re_conv` once over `SAMPLELEN` samples and `COEFFLEN`
/// coefficients, then prints the first and last output values so the computation cannot be
/// optimized away.
fn main() {
    // Zero-filled inputs instead of `set_len` on uninitialized capacity: reading
    // uninitialized `f32`s is undefined behaviour. Zero-filling is expected to be negligible
    // next to the convolution itself.
    let sample = vec![0.0_f32; SAMPLELEN];
    let coeff = vec![0.0_f32; COEFFLEN];

    let result = re_re_conv(&sample, &coeff);
    // `SAMPLELEN - COEFFLEN` is the index of the last output element.
    println!("{} {}", result[0], result[SAMPLELEN - COEFFLEN]);
}
And the C code:
/*
 * Sliding dot product of `sample` (samplelen floats) with `coeff` (coefflen floats).
 *
 * out[i] = sample[i]*coeff[0] + sample[i+1]*coeff[1] + ... for every position i at
 * which `coeff` fits entirely inside `sample`. `coeff` is applied as given (not reversed).
 *
 * out        - destination; the caller must provide room for at least
 *              samplelen - coefflen + 1 floats.
 * out_length - receives the number of floats written to `out`. It is 0 (never negative)
 *              when `coeff` is longer than `sample`.
 */
void re_re_conv(float *out, int *out_length, const float *sample, int samplelen, const float *coeff, int coefflen) {
    int outlen = samplelen - coefflen + 1;
    if (outlen < 0) {
        /* No valid position: report an empty result rather than a negative length. */
        outlen = 0;
    }

    for (int i = 0; i < outlen; i++) {
        const float *window = sample + i;
        float acc = 0.0f;
        for (int j = 0; j < coefflen; j++) {
            acc += window[j] * coeff[j];
        }
        out[i] = acc;
    }

    *out_length = outlen;
}
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>
/* Number of input samples used by the benchmark (twenty million). */
#define SAMPLELEN 20000000
/* Number of filter coefficients used by the benchmark. */
#define COEFFLEN (5*100)
/*
 * Benchmark driver: runs re_re_conv once over SAMPLELEN samples and COEFFLEN coefficients,
 * then prints the first and last output values so the computation cannot be optimized away.
 * Returns 0 on success, 1 if memory could not be allocated.
 */
int main(void) {
    /* Zero-filled inputs: reading malloc'ed floats before writing them yields
     * indeterminate values. */
    float *sample = calloc(SAMPLELEN, sizeof(float));
    float *coeff = calloc(COEFFLEN, sizeof(float));
    float *result = malloc(SAMPLELEN*sizeof(float));
    int result_len = 0;

    if (sample == NULL || coeff == NULL || result == NULL) {
        fprintf(stderr, "out of memory\n");
        free(sample);
        free(coeff);
        free(result);
        return 1;
    }

    re_re_conv(result, &result_len, sample, SAMPLELEN, coeff, COEFFLEN);
    /* SAMPLELEN - COEFFLEN is the index of the last output element. */
    printf("%f %f\n", result[0], result[SAMPLELEN - COEFFLEN]);

    free(result);
    free(coeff);
    free(sample);
    return 0;
}
Compile:
$ rustc -O test_rs.rs
$ gcc -Ofast -march=native test_c.c -o test_c
And the running times on Odroid-C2 (ARM64):
$ time ./test_rs --> 72 seconds (real)
$ time ./test_c --> 14,3 seconds (real) - 5.0x faster
x86_64 times:
$ time ./test_rs --> 10,95 seconds (real)
$ time ./test_c --> 1,61 seconds (real) - 6,8x faster
Can you help me? How can I make the Rust convolution code 4-5x faster?
Target: ARM64 & x86_64.
Update#1: clang
On Odroid-C2 (ARM64): (clang does not support '-march=native')
$ clang -Ofast conv_c.c -o conv_c_clang
$ time ./conv_c_clang --> 15,8 sec, 4,5x faster than Rust.
On x86_64:
$ clang -Ofast -march=native conv_c.c -o conv_c_clang
$ time ./conv_c_clang --> 1,51 sec (x86_64), 7x faster than Rust.