Note that you can move the `use` statements inside `main`:
fn main() {
use std::io::prelude::{ Read, Write };
use std::{fs, io, env, process};
On my system your C++14 version runs in about 0.16 seconds, while the Rust code runs in 0.09 or 0.10 seconds. I don't know why.
Generally I love to use iterator chains of map/filter/fold and the like, but in code full of mutations I think they are less clear than regular imperative code. So I've converted the last part of your Rust code into code more similar to the C++14 one. On my system this Rust code is equally fast, and to me it is clearer:
for &(seven, count) in &sevens {
let scores = first_part(&words, seven);
let mut out = *b".......\n";
let mut rest = seven;
let mut any = false;
for (&score, outc) in scores.iter().zip(out.iter_mut().rev().skip(1)) {
let points = score + 3 * count;
let a = if points >= 26 && points <= 32 { any = true; b'A' } else { b'a' };
*outc = a + (25 - rest.trailing_zeros()) as u8;
rest &= rest - 1;
}
if any { sink.write(&out).unwrap(); };
}
}
Then I've tried to use regular imperative code for the section that computes the scores too. But, as you have probably seen, the result is slower: it runs in
0.16 seconds (like the C++14 code):
for &(seven, count) in &sevens {
let mut scores = [0u16; 7];
for word in &words {
if word & !seven == 0 {
let mut rest = seven;
for score in scores.iter_mut() {
if (word & rest & !(rest - 1)) != 0 {
*score += 1;
}
rest &= rest - 1;
}
}
}
I don't know why that's slower. To better understand the situation I've moved that code into a function:
for &(seven, count) in &sevens {
let scores = first_part(&words, seven);
Where:
/// For each of the seven letters encoded as set bits in `seven`, counts how
/// many entries of `words` both (a) use only letters from `seven` and
/// (b) contain that particular letter.
///
/// Returns `scores`, where `scores[i]` corresponds to the i-th lowest set
/// bit of `seven`.
///
/// NOTE(review): assumes `seven` has at least 7 set bits — if it has fewer,
/// `rest` reaches 0 and `rest - 1` underflows (a panic in debug builds).
/// Confirm this invariant at the call site.
///
/// `#[inline(never)]` keeps this a distinct symbol so its generated asm can
/// be inspected in isolation.
#[inline(never)]
fn first_part(words: &[u32], seven: u32) -> [u16; 7] {
    let mut scores = [0u16; 7];
    for word in words {
        // Skip any word that uses a letter outside the `seven` set.
        if word & !seven == 0 {
            let mut rest = seven;
            for score in scores.iter_mut() {
                // `rest & !(rest - 1)` isolates the lowest set bit of `rest`;
                // bump this slot's score if `word` contains that letter.
                if (word & rest & !(rest - 1)) != 0 {
                    *score += 1;
                }
                // Clear the lowest set bit: advance to the next letter.
                rest &= rest - 1;
            }
        }
    }
    scores
}
I've also moved the original code out into a function:
/// Same computation as the loop-based `first_part`, expressed with iterator
/// combinators: `filter` keeps only the words built from letters in `seven`,
/// the outer `fold` accumulates the per-letter score array, and the inner
/// `fold` threads `rest` (the set bits of `seven` not yet consumed) through
/// the seven score slots.
#[inline(never)]
fn first_part(words: &[u32], seven: u32) -> [u16; 7] {
    words
        .iter()
        .filter(|&word| word & !seven == 0)
        .fold([0u16; 7], |mut scores, word| {
            scores
                .iter_mut()
                // The inner fold's final accumulator value is discarded;
                // only the `*score += 1` side effects on `scores` matter.
                .fold(seven, |rest, score| {
                    // `rest & !(rest - 1)` is the lowest set bit of `rest`.
                    if word & rest & !(rest - 1) != 0 { *score += 1 }
                    // `-` binds tighter than `&`, so this is
                    // `rest & (rest - 1)`: clear the lowest set bit.
                    rest & rest - 1
                });
            scores
        })
}
This is the asm of first_part with the original code (with fold):
_ZN10first_part20h0910ff8aaedb046ceaaE:
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rsi
pushq %rdi
pushq %rbp
pushq %rbx
subq $48, %rsp
leal -1(%r9), %r11d
movl %r9d, %r14d
negl %r14d
andl %r9d, %r14d
andl %r9d, %r11d
movl %r9d, %eax
notl %eax
leal -1(%r11), %r9d
movl %r11d, %ebp
negl %ebp
movl %ebp, 20(%rsp)
andl %r11d, %r9d
leal -1(%r9), %edi
movl %r9d, %ebp
negl %ebp
movl %ebp, 16(%rsp)
andl %r9d, %edi
leal -1(%rdi), %esi
movl %edi, %ebp
negl %ebp
movl %ebp, 12(%rsp)
andl %edi, %esi
leal -1(%rsi), %ebx
movl %esi, %ebp
negl %ebp
movl %ebp, 8(%rsp)
andl %esi, %ebx
movl %ebx, %ebp
negl %ebp
movl %ebp, 4(%rsp)
leal -1(%rbx), %r15d
andl %ebx, %r15d
movl %r15d, %ebp
negl %ebp
movl %ebp, (%rsp)
leaq (%rdx,%r8,4), %r8
movl $0, 44(%rsp)
movl $0, 40(%rsp)
movl $0, 36(%rsp)
movl $0, 32(%rsp)
movl $0, 28(%rsp)
movl $0, 24(%rsp)
xorl %r13d, %r13d
jmp .LBB0_1
.LBB0_3:
movl %r14d, %r10d
andl %r12d, %r10d
cmpl $1, %r10d
sbbw $-1, %r13w
movl %r12d, %ebp
andl 20(%rsp), %ebp
andl %r11d, %ebp
cmpl $1, %ebp
movl 24(%rsp), %ebp
sbbw $-1, %bp
movl %ebp, 24(%rsp)
movl %r12d, %ebp
andl 16(%rsp), %ebp
andl %r9d, %ebp
cmpl $1, %ebp
movl 28(%rsp), %ebp
sbbw $-1, %bp
movl %ebp, 28(%rsp)
movl %r12d, %ebp
andl 12(%rsp), %ebp
andl %edi, %ebp
cmpl $1, %ebp
movl 32(%rsp), %ebp
sbbw $-1, %bp
movl %ebp, 32(%rsp)
movl %r12d, %ebp
andl 8(%rsp), %ebp
andl %esi, %ebp
cmpl $1, %ebp
movl 36(%rsp), %ebp
sbbw $-1, %bp
movl %ebp, 36(%rsp)
movl %r12d, %ebp
andl 4(%rsp), %ebp
andl %ebx, %ebp
cmpl $1, %ebp
movl 40(%rsp), %ebp
sbbw $-1, %bp
movl %ebp, 40(%rsp)
andl (%rsp), %r12d
andl %r15d, %r12d
cmpl $1, %r12d
movl 44(%rsp), %ebp
sbbw $-1, %bp
movl %ebp, 44(%rsp)
.align 16, 0x90
.LBB0_1:
cmpq %rdx, %r8
je .LBB0_4
movl (%rdx), %r12d
addq $4, %rdx
testl %eax, %r12d
jne .LBB0_1
jmp .LBB0_3
.LBB0_4:
movw %r13w, (%rcx)
movl 24(%rsp), %eax
movw %ax, 2(%rcx)
movl 28(%rsp), %eax
movw %ax, 4(%rcx)
movl 32(%rsp), %eax
movw %ax, 6(%rcx)
movl 36(%rsp), %eax
movw %ax, 8(%rcx)
movl 40(%rsp), %eax
movw %ax, 10(%rcx)
movl 44(%rsp), %eax
movw %ax, 12(%rcx)
movq %rcx, %rax
addq $48, %rsp
popq %rbx
popq %rbp
popq %rdi
popq %rsi
popq %r12
popq %r13
popq %r14
popq %r15
retq
This is the asm of first_part with the modified code (with loops):
_ZN10first_part20h1a62e25daf793c42eaaE:
pushq %r14
pushq %rsi
pushq %rdi
pushq %rbp
pushq %rbx
subq $16, %rsp
movq $0, 6(%rsp)
movq $0, (%rsp)
testq %r8, %r8
je .LBB0_4
movl %r9d, %eax
notl %eax
leal -1(%r9), %ebp
movl %r9d, %r10d
negl %r10d
andl %r9d, %r10d
andl %r9d, %ebp
leal -1(%rbp), %esi
movl %ebp, %r9d
negl %r9d
andl %ebp, %r9d
andl %ebp, %esi
leal -1(%rsi), %edi
movl %esi, %r11d
negl %r11d
andl %esi, %r11d
andl %esi, %edi
leal -1(%rdi), %esi
movl %edi, %r14d
negl %r14d
andl %edi, %r14d
andl %edi, %esi
leal -1(%rsi), %ebp
movl %esi, %edi
negl %edi
andl %esi, %edi
andl %esi, %ebp
leal -1(%rbp), %esi
movl %ebp, %ebx
negl %ebx
andl %ebp, %ebx
andl %ebp, %esi
movl %esi, %ebp
negl %ebp
andl %esi, %ebp
shlq $2, %r8
.align 16, 0x90
.LBB0_2:
movl (%rdx), %esi
testl %eax, %esi
jne .LBB0_3
testl %esi, %r10d
je .LBB0_7
incw (%rsp)
movl (%rdx), %esi
.LBB0_7:
testl %esi, %r9d
je .LBB0_9
incw 2(%rsp)
movl (%rdx), %esi
.LBB0_9:
testl %esi, %r11d
je .LBB0_11
incw 4(%rsp)
movl (%rdx), %esi
.LBB0_11:
testl %esi, %r14d
je .LBB0_13
incw 6(%rsp)
movl (%rdx), %esi
.LBB0_13:
testl %esi, %edi
je .LBB0_15
incw 8(%rsp)
movl (%rdx), %esi
.LBB0_15:
testl %esi, %ebx
je .LBB0_17
incw 10(%rsp)
movl (%rdx), %esi
.LBB0_17:
testl %esi, %ebp
je .LBB0_3
incw 12(%rsp)
.LBB0_3:
addq $4, %rdx
addq $-4, %r8
jne .LBB0_2
.LBB0_4:
movq (%rsp), %rax
movq 6(%rsp), %rdx
movq %rdx, 6(%rcx)
movq %rax, (%rcx)
movq %rcx, %rax
addq $16, %rsp
popq %rbx
popq %rbp
popq %rdi
popq %rsi
popq %r14
retq
The two asm listings are quite different. But I don't yet know what causes the significant performance difference... Opinions are welcome.