I got it down to 37.6 billion instructions:

```
1,446,052,975 branch-instructions:u # 235.778 M/sec (12.86%)
11,787,247 branch-misses:u # 0.82% of all branches (12.88%)
17,822,449 cache-misses:u # 42.093 % of all cache refs (12.88%)
42,341,125 cache-references:u # 6.904 M/sec (12.91%)
18,469,630,318 cpu-cycles:u # 3.011 GHz (12.99%)
37,559,336,652 instructions:u # 2.03 insn per cycle
# 0.25 stalled cycles per insn (16.27%)
17,350,946,134 ref-cycles:u # 2829.060 M/sec (19.56%)
4,650,037,949 stalled-cycles-backend:u # 25.18% backend cycles idle (19.59%)
9,450,246,769 stalled-cycles-frontend:u # 51.17% frontend cycles idle (19.60%)
0 alignment-faults:u # 0.000 K/sec
0 bpf-output:u # 0.000 K/sec
0 context-switches:u # 0.000 K/sec
6,133.10 msec cpu-clock:u # 0.999 CPUs utilized
0 major-faults:u # 0.000 K/sec
122,844 minor-faults:u # 0.020 M/sec
122,844 page-faults:u # 0.020 M/sec
6,133.13 msec task-clock:u # 0.999 CPUs utilized
6,137,839,308 ns duration_time:u # 1000.770 M/sec
134,377,834 L1-dcache-load-misses:u # 1.10% of all L1-dcache accesses (19.59%)
12,174,827,662 L1-dcache-loads:u # 1985.097 M/sec (19.51%)
316,753,907 L1-dcache-prefetch-misses:u # 51.647 M/sec (12.91%)
915,558 L1-dcache-prefetches:u # 0.149 M/sec (12.88%)
9,018,038 L1-dcache-store-misses:u # 1.470 M/sec (12.87%)
3,206,034,971 L1-dcache-stores:u # 522.742 M/sec (12.86%)
60,996,287 L1-icache-load-misses:u # 0.59% of all L1-icache accesses (12.85%)
10,371,690,334 L1-icache-loads:u # 1691.097 M/sec (12.85%)
15,493,301 LLC-load-misses:u # 58.12% of all LL-cache accesses (12.85%)
26,655,844 LLC-loads:u # 4.346 M/sec (12.85%)
42,925,831 LLC-prefetch-misses:u # 6.999 M/sec (6.42%)
98,418,713 LLC-prefetches:u # 16.047 M/sec (6.42%)
1,580,115 LLC-store-misses:u # 0.258 M/sec (6.42%)
18,946,611 LLC-stores:u # 3.089 M/sec (6.42%)
1,250,061,895 branch-load-misses:u # 203.822 M/sec (9.64%)
1,514,701,822 branch-loads:u # 246.971 M/sec (12.86%)
2,767,282 dTLB-load-misses:u # 0.02% of all dTLB cache accesses (12.86%)
12,334,413,397 dTLB-loads:u # 2011.118 M/sec (12.85%)
145,344 dTLB-store-misses:u # 0.024 M/sec (12.86%)
3,103,502,436 dTLB-stores:u # 506.024 M/sec (12.85%)
530,157 iTLB-load-misses:u # 0.00% of all iTLB cache accesses (12.87%)
38,267,760,941 iTLB-loads:u # 6239.532 M/sec (12.87%)
6.137839308 seconds time elapsed
5.753397000 seconds user
```

At this point, I think we are within 3x of OpenBLAS.

Here is the machine code currently being generated for C = A * B, where A, B, and C are all 4x4 f32 matrices:

```
│ Disassembly of section .text: ▒
│ ▒
│ 00000000000135a0 <x_tensor::blas::mat4f::Mat4f::mat4_sgemm>: ▒
│ x_tensor::fixed::pt4::Pt4f32::mut_s_add: ▒
│ ▒
│ impl Pt4f32 { ▒
│ #[inline(always)] ▒
│ pub fn mut_s_add(&mut self, alpha: f32, rhs: &Pt4f32) { ▒
│ let d = &mut self.0; ▒
│ d[0] = d[0] + rhs.0[0] * alpha; ▒
0.74 │ movss (%rsi),%xmm4 ▒
0.03 │ movss 0x4(%rsi),%xmm5 ▒
0.05 │ movss 0x8(%rsi),%xmm6 ▒
2.96 │ movss 0xc(%rsi),%xmm7 ▒
0.01 │ shufps $0x0,%xmm4,%xmm4 ▒
9.84 │ movaps (%rdx),%xmm3 ▒
0.25 │ movaps 0x10(%rdx),%xmm2 ▒
3.08 │ movaps 0x20(%rdx),%xmm1 ▒
1.94 │ movaps 0x30(%rdx),%xmm0 ▒
6.12 │ mulps %xmm3,%xmm4 ▒
7.32 │ addps (%rdi),%xmm4 ▒
0.37 │ shufps $0x0,%xmm5,%xmm5 ▒
0.02 │ mulps %xmm2,%xmm5 ▒
7.30 │ addps %xmm4,%xmm5 ▒
│ shufps $0x0,%xmm6,%xmm6 ▒
0.45 │ mulps %xmm1,%xmm6 ▒
7.43 │ addps %xmm5,%xmm6 ▒
│ shufps $0x0,%xmm7,%xmm7 ▒
0.02 │ mulps %xmm0,%xmm7 ▒
8.21 │ addps %xmm6,%xmm7 ▒
2.88 │ movaps %xmm7,(%rdi) ▒
│ movss 0x10(%rsi),%xmm4 ▒
│ movss 0x14(%rsi),%xmm5 ▒
0.04 │ movss 0x18(%rsi),%xmm6 ▒
2.76 │ movss 0x1c(%rsi),%xmm7 ▒
│ shufps $0x0,%xmm4,%xmm4 ▒
0.04 │ mulps %xmm3,%xmm4 ▒
0.17 │ addps 0x10(%rdi),%xmm4 ▒
2.63 │ shufps $0x0,%xmm5,%xmm5 ▒
0.01 │ mulps %xmm2,%xmm5 ▒
0.16 │ addps %xmm4,%xmm5 ▒
0.04 │ shufps $0x0,%xmm6,%xmm6 ▒
2.73 │ mulps %xmm1,%xmm6 ▒
0.22 │ addps %xmm5,%xmm6 ▒
│ shufps $0x0,%xmm7,%xmm7 ▒
0.03 │ mulps %xmm0,%xmm7 ▒
3.63 │ addps %xmm6,%xmm7 ▒
0.41 │ movaps %xmm7,0x10(%rdi) ▒
0.01 │ movss 0x20(%rsi),%xmm4 ▒
0.01 │ movss 0x24(%rsi),%xmm5 ▒
2.30 │ movss 0x28(%rsi),%xmm6 ▒
0.40 │ movss 0x2c(%rsi),%xmm7 ▒
│ shufps $0x0,%xmm4,%xmm4 ▒
0.02 │ mulps %xmm3,%xmm4 ▒
2.28 │ addps 0x20(%rdi),%xmm4 ▒
0.43 │ shufps $0x0,%xmm5,%xmm5 ▒
0.02 │ mulps %xmm2,%xmm5 ▒
0.06 │ addps %xmm4,%xmm5 ▒
2.32 │ shufps $0x0,%xmm6,%xmm6 ▒
0.38 │ mulps %xmm1,%xmm6 ▒
0.20 │ addps %xmm5,%xmm6 ▒
0.02 │ shufps $0x0,%xmm7,%xmm7 ▒
2.47 │ mulps %xmm0,%xmm7 ▒
0.80 │ addps %xmm6,%xmm7 ▒
0.34 │ movaps %xmm7,0x20(%rdi) ▒
│ movss 0x30(%rsi),%xmm4 ▒
2.46 │ movss 0x34(%rsi),%xmm5 ▒
0.24 │ movss 0x38(%rsi),%xmm6 ▒
0.33 │ movss 0x3c(%rsi),%xmm7 ▒
0.01 │ shufps $0x0,%xmm4,%xmm4 ▒
2.37 │ mulps %xmm3,%xmm4 ▒
0.29 │ addps 0x30(%rdi),%xmm4 ▒
0.39 │ shufps $0x0,%xmm5,%xmm5 ▒
0.01 │ mulps %xmm2,%xmm5 ▒
2.32 │ addps %xmm4,%xmm5 ▒
0.22 │ shufps $0x0,%xmm6,%xmm6 ▒
0.39 │ mulps %xmm1,%xmm6 ▒
0.16 │ addps %xmm5,%xmm6 ▒
2.26 │ shufps $0x0,%xmm7,%xmm7 ▒
0.17 │ mulps %xmm0,%xmm7 ▒
0.97 │ addps %xmm6,%xmm7 ▒
0.42 │ movaps %xmm7,0x30(%rdi) ▒
│ x_tensor::blas::mat4f::Mat4f::mat4_sgemm: ▒
│ out[3].mut_s_add(d3[2], &rr2); ▒
│ out[3].mut_s_add(d3[3], &rr3); ▒
│ ▒
│ /* ▒
│ ▒
│ */} ▒
3.08 │ ← retq ▒
▒
```

I believe the xmm registers imply SSE. This CPU does not have AVX instructions, so, AFAIK, the best we can do is four f32s at once (one 128-bit xmm register).