I'm implementing many solutions in Rust for Euler Problems, and I'm seeing many interesting phenomena.

All the code of this post is compiled with:

`-C opt-level=3 -C prefer-dynamic -C target-cpu=native -Z orbit`

(None of the code in this post was originally mine, I've converted it to Rust from other languages, and I've tried to improve it a little in various ways. Generally the resulting Rust code is hugely more clear).

This is a solution for Problem 154 ( #154 Exploring Pascal's pyramid - Project Euler ):

```
fn e154() -> u32 {
fn rep_div(mut n: u32, i: u32) -> u32 {
let mut result = 0;
while n % i == 0 {
result += 1;
n /= i;
}
return result;
}
const MAX: u32 = 200_000;
const UMAX: usize = MAX as usize;
let mut f5 = vec![0u32; UMAX + 1];
let mut f2 = vec![0u32; UMAX + 1];
for n in 1 .. MAX + 1 {
f5[n as usize] = f5[n as usize - 1] + rep_div(n, 5);
f2[n as usize] = f2[n as usize - 1] + rep_div(n, 2);
}
let fives = f5[UMAX] - 11;
let twos = f2[UMAX] - 11;
let mut answer = 0;
for a in 0 .. MAX / 3 {
let mut b = a;
while a + 2 * b <= MAX {
let c = MAX - a - b;
let factors5 = f5[a as usize] + f5[b as usize] + f5[c as usize];
let factors2 = f2[a as usize] + f2[b as usize] + f2[c as usize];
if factors5 < fives && factors2 < twos {
if a == c {
answer += 1;
} else if a < b && b < c {
answer += 6;
} else {
answer += 3;
}
}
b += 1;
}
}
answer
}
fn main() {
assert_eq!(e154(), 479_742_450);
}
```

This is the same code, but the answer is accumulated with an intermediate step, before the final sum():

```
#![feature(iter_arith)]
fn e154() -> u32 {
fn rep_div(mut n: u32, i: u32) -> u32 {
let mut result = 0;
while n % i == 0 {
result += 1;
n /= i;
}
return result;
}
const MAX: u32 = 200_000;
const UMAX: usize = MAX as usize;
let mut f5 = vec![0u32; UMAX + 1];
let mut f2 = vec![0u32; UMAX + 1];
for n in 1 .. MAX + 1 {
f5[n as usize] = f5[n as usize - 1] + rep_div(n, 5);
f2[n as usize] = f2[n as usize - 1] + rep_div(n, 2);
}
let fives = f5[UMAX] - 11;
let twos = f2[UMAX] - 11;
(0 .. MAX / 3)
.map(|a| {
let mut answer = 0;
let mut b = a;
while a + 2 * b <= MAX {
let c = MAX - a - b;
let factors5 = f5[a as usize] + f5[b as usize] + f5[c as usize];
let factors2 = f2[a as usize] + f2[b as usize] + f2[c as usize];
if factors5 < fives && factors2 < twos {
if a == c {
answer += 1;
} else if a < b && b < c {
answer += 6;
} else {
answer += 3;
}
}
b += 1;
}
answer
})
.sum()
}
fn main() {
assert_eq!(e154(), 479_742_450);
}
```

For me it's a surprise to see this second version is quite faster than the first (If you have multi-thread code, having a local accumulator is a significant advantage, but that's single thread code). Do you have an idea of the causes?

This is a faster solution (it's a jungle of casts, but if I use usize types to remove most of them, the code slows down):

```
fn e154() -> u32 {
const L: u32 = 200_000;
const POW10: u32 = 12;
const L3: u32 = L / 3;
let mut e2 = vec![0u32; L as usize + 1];
let mut pow: u32 = 2;
while pow <= L {
let mut mult: u32 = pow;
while mult <= L {
e2[mult as usize] += 1;
mult += pow;
}
pow *= 2;
}
let mut e5 = [0u32; L as usize + 1];
pow = 5;
while pow <= L {
let mut mult = pow;
while mult <= L {
e5[mult as usize] += 1;
mult += pow;
}
pow *= 5;
}
for i in 1u32 .. L + 1 {
e2[i as usize] = e2[i as usize] + e2[i as usize - 1];
e5[i as usize] = e5[i as usize] + e5[i as usize - 1];
}
let mut d5 = vec![0u32; L as usize + 1];
for i in 0u32 .. L + 1 {
d5[i as usize] = i - 4 * e5[i as usize];
}
// Find max val of d5.
let mut maxd: u32 = 0;
for i in 0u32 .. L + 1 {
if d5[i as usize] > maxd {
maxd = d5[i as usize];
}
}
// nid[d] is the number of i for which d[i] = d.
let mut nid = vec![0u32; maxd as usize + 1];
for i in 0u32 .. L {
nid[d5[i as usize] as usize] += 1;
}
// li[v] is the sorted list of i for which d5[i] = v.
let mut li = (0 .. maxd as usize + 1)
.map(|i| vec![0u32; nid[i] as usize + 2])
.collect::<Vec<_>>();
let mut starts = vec![0usize; maxd as usize + 1];
for x in nid.iter_mut() { *x = 0; }
for i in 0 .. L + 1 {
let d5i = d5[i as usize] as usize;
li[d5i][nid[d5i] as usize] = i;
nid[d5i] += 1;
}
for i in 0 .. maxd + 1 {
li[i as usize][nid[i as usize] as usize] = 1_000_000; // Sentinel.
}
let mut n_sols: u32 = 0;
let mut l_minus_2i: u32 = L + 2;
for i in 0 .. L3 + 1 {
let dif: u32 = 4 * POW10 + d5[L as usize] - d5[i as usize];
let mind: u32 = (dif + 1) / 2;
l_minus_2i -= 2;
for vald in mind .. maxd + 1 {
let mut start = starts[vald as usize];
while li[vald as usize][start] < i {
start += 1;
}
starts[vald as usize] = start;
for &j in li[vald as usize][start ..].iter() {
if j > l_minus_2i {
break;
}
let dif_minus_vald: u32 = dif - vald;
let k: u32 = L - i - j;
if d5[k as usize] >= dif_minus_vald &&
e2[L as usize] - e2[i as usize] - e2[j as usize] - e2[k as usize] >= POW10 {
if j == k {
n_sols += 6;
} else {
if i == j || i == k {
if d5[k as usize] >= mind && vald >= dif_minus_vald {
n_sols += 3;
} else {
n_sols += 6;
}
} else {
if d5[k as usize] >= mind && vald >= dif_minus_vald {
n_sols += 6;
} else {
n_sols += 12;
}
}
}
}
}
}
}
n_sols / 2
}
fn main() {
assert_eq!(e154(), 479_742_450);
}
```

To experiment I've replaced the array accesses in the main part of the code with .get_unchecked()/.get_unchecked_mut():

```
fn e154() -> u32 {
const L: u32 = 200_000;
const POW10: u32 = 12;
const L3: u32 = L / 3;
let mut e2 = vec![0u32; L as usize + 1];
let mut pow: u32 = 2;
while pow <= L {
let mut mult: u32 = pow;
while mult <= L {
e2[mult as usize] += 1;
mult += pow;
}
pow *= 2;
}
let mut e5 = [0u32; L as usize + 1];
pow = 5;
while pow <= L {
let mut mult = pow;
while mult <= L {
e5[mult as usize] += 1;
mult += pow;
}
pow *= 5;
}
for i in 1u32 .. L + 1 {
e2[i as usize] = e2[i as usize] + e2[i as usize - 1];
e5[i as usize] = e5[i as usize] + e5[i as usize - 1];
}
let mut d5 = vec![0u32; L as usize + 1];
for i in 0u32 .. L + 1 {
d5[i as usize] = i - 4 * e5[i as usize];
}
// Find max val of d5.
let mut maxd: u32 = 0;
for i in 0u32 .. L + 1 {
if d5[i as usize] > maxd {
maxd = d5[i as usize];
}
}
// nid[d] is the number of i for which d[i] = d.
let mut nid = vec![0u32; maxd as usize + 1];
for i in 0u32 .. L {
nid[d5[i as usize] as usize] += 1;
}
// li[v] is the sorted list of i for which d5[i] = v.
let mut li = (0 .. maxd as usize + 1)
.map(|i| vec![0u32; nid[i] as usize + 2])
.collect::<Vec<_>>();
let mut starts = vec![0usize; maxd as usize + 1];
for x in nid.iter_mut() { *x = 0; }
for i in 0 .. L + 1 {
let d5i = d5[i as usize] as usize;
li[d5i][nid[d5i] as usize] = i;
nid[d5i] += 1;
}
for i in 0 .. maxd + 1 {
li[i as usize][nid[i as usize] as usize] = 1_000_000; // Sentinel.
}
let mut n_sols: u32 = 0;
let mut l_minus_2i: u32 = L + 2;
unsafe {
for i in 0 .. L3 + 1 {
let dif: u32 = 4 * POW10 + *d5.get_unchecked(L as usize) - *d5.get_unchecked(i as usize);
let mind: u32 = (dif + 1) / 2;
l_minus_2i -= 2;
for vald in mind .. maxd + 1 {
let mut start = *starts.get_unchecked(vald as usize);
while *li.get_unchecked(vald as usize).get_unchecked(start) < i {
start += 1;
}
*starts.get_unchecked_mut(vald as usize) = start;
for &j in li.get_unchecked(vald as usize)[start ..].iter() {
if j > l_minus_2i {
break;
}
let dif_minus_vald: u32 = dif - vald;
let k: u32 = L - i - j;
if *d5.get_unchecked(k as usize) >= dif_minus_vald &&
*e2.get_unchecked(L as usize) - *e2.get_unchecked(i as usize) -
*e2.get_unchecked(j as usize) - e2.get_unchecked(k as usize) >= POW10 {
if j == k {
n_sols += 6;
} else {
if i == j || i == k {
if *d5.get_unchecked(k as usize) >= mind && vald >= dif_minus_vald {
n_sols += 3;
} else {
n_sols += 6;
}
} else {
if *d5.get_unchecked(k as usize) >= mind && vald >= dif_minus_vald {
n_sols += 6;
} else {
n_sols += 12;
}
}
}
}
}
}
}
}
n_sols / 2
}
fn main() {
assert_eq!(e154(), 479_742_450);
}
```

This code with .get_unchecked() is slower than the regular checked code, do you know why? I've seen a similar outcome in many other situations.