Hello everyone,
Assume you have the following code:
/// An `n` x `m` matrix of `f64` values backed by a borrowed, mutable slice.
///
/// The struct itself does not enforce a storage order; `mat_mul` addresses the
/// entry at row `i`, column `j` as `v[i + n * j]`.
struct StaticMatrix<'a> {
    /// First dimension of the `n` x `m` shape.
    n: usize,
    /// Second dimension of the `n` x `m` shape.
    m: usize,
    /// Matrix values, borrowed mutably for the lifetime `'a`.
    v: &'a mut [f64],
}
/// Computes the matrix product `a * b` and writes it into `s`.
///
/// Entries are addressed as `v[row + n * column]`, where `n` is the first
/// dimension of the matrix being indexed. The result entry at row `i`,
/// column `j` is written to `s.v[i + j * s.n]`.
///
/// The shape of `s` is not validated here: the caller must provide an output
/// buffer large enough for an `a.n` x `b.m` result.
///
/// # Panics
///
/// Panics if `a.m != b.n`, or if any of the slices is too short for the
/// indices implied by the shapes (out-of-bounds slice indexing).
fn mat_mul(a: StaticMatrix, b: StaticMatrix, s: StaticMatrix) {
    if a.m != b.n {
        panic!("Inconsistent matrix shape in multiplication");
    }
    let n_a = a.n;
    let n_b = b.n;
    let m_b = b.m;
    let n_s = s.n;
    for j in 0..m_b {
        for i in 0..n_a {
            let mut inter = 0.0_f64;
            // Sum over the shared dimension (a.m == b.n). Iterating up to b.m
            // here is only correct when b is square.
            for k in 0..n_b {
                inter += a.v[i + n_a * k] * b.v[k + n_b * j];
            }
            s.v[i + j * n_s] = inter;
        }
    }
}
/// Multiplies two zero-filled `N` x `N` matrices into a third one.
fn main() {
    const N: usize = 1000;
    // Each buffer holds N * N f64 values (8 MB for N = 1000). Keeping three of
    // them as array temporaries on the stack can exceed the default stack
    // limit, so the storage is allocated on the heap instead.
    let mut a_values = vec![0.0; N * N];
    let mut b_values = vec![0.0; N * N];
    let mut s_values = vec![0.0; N * N];
    let a = StaticMatrix {
        n: N,
        m: N,
        v: a_values.as_mut_slice(),
    };
    let b = StaticMatrix {
        n: N,
        m: N,
        v: b_values.as_mut_slice(),
    };
    let s = StaticMatrix {
        n: N,
        m: N,
        v: s_values.as_mut_slice(),
    };
    // NOTE(review): nothing reads the output buffer after this call, so an
    // optimizing build may be free to drop the multiplication entirely; read a
    // value from `s_values` afterwards if this is used for timing.
    mat_mul(a, b, s);
}
As you can see, it is a basic implementation of matrix multiplication. When I compile this with the --release flag, it runs in 0m0,043s. However, if I split this code into two files like this:
static_matrix.rs
/// An `n` x `m` matrix of `f64` values backed by a borrowed, mutable slice.
///
/// The struct itself does not enforce a storage order; `mat_mul` addresses the
/// entry at row `i`, column `j` as `v[i + n * j]`.
pub struct StaticMatrix<'a> {
    /// First dimension of the `n` x `m` shape.
    pub n: usize,
    /// Second dimension of the `n` x `m` shape.
    pub m: usize,
    /// Matrix values, borrowed mutably for the lifetime `'a`.
    pub v: &'a mut [f64],
}
/// Computes the matrix product `a * b` and writes it into `s`.
///
/// Entries are addressed as `v[row + n * column]`, where `n` is the first
/// dimension of the matrix being indexed. The result entry at row `i`,
/// column `j` is written to `s.v[i + j * s.n]`.
///
/// The shape of `s` is not validated here: the caller must provide an output
/// buffer large enough for an `a.n` x `b.m` result.
///
/// # Panics
///
/// Panics if `a.m != b.n`, or if any of the slices is too short for the
/// indices implied by the shapes (out-of-bounds slice indexing).
pub fn mat_mul(a: StaticMatrix, b: StaticMatrix, s: StaticMatrix) {
    if a.m != b.n {
        panic!("Inconsistent matrix shape in multiplication");
    }
    let n_a = a.n;
    let n_b = b.n;
    let m_b = b.m;
    let n_s = s.n;
    for j in 0..m_b {
        for i in 0..n_a {
            let mut inter = 0.0_f64;
            // Sum over the shared dimension (a.m == b.n). Iterating up to b.m
            // here is only correct when b is square.
            for k in 0..n_b {
                inter += a.v[i + n_a * k] * b.v[k + n_b * j];
            }
            s.v[i + j * n_s] = inter;
        }
    }
}
main.rs
mod static_matrix;
use crate::static_matrix::StaticMatrix;
use crate::static_matrix::mat_mul;
/// Multiplies two zero-filled `N` x `N` matrices into a third one.
fn main() {
    const N: usize = 1000;
    // Each buffer holds N * N f64 values (8 MB for N = 1000). Keeping three of
    // them as array temporaries on the stack can exceed the default stack
    // limit, so the storage is allocated on the heap instead.
    let mut a_values = vec![0.0; N * N];
    let mut b_values = vec![0.0; N * N];
    let mut s_values = vec![0.0; N * N];
    let a = StaticMatrix {
        n: N,
        m: N,
        v: a_values.as_mut_slice(),
    };
    let b = StaticMatrix {
        n: N,
        m: N,
        v: b_values.as_mut_slice(),
    };
    let s = StaticMatrix {
        n: N,
        m: N,
        v: s_values.as_mut_slice(),
    };
    // NOTE(review): nothing reads the output buffer after this call, so an
    // optimizing build may be free to drop the multiplication entirely; read a
    // value from `s_values` afterwards if this is used for timing.
    mat_mul(a, b, s);
}
First, I overflowed my stack (I solved that with the Linux ulimit command), but in addition I get an execution time of 0m9,700s, which is 200 times slower (still with the --release flag)... I just want to know why I observe this and how to split this code without a performance loss?
Thank you !