It's interesting to compare the assembly of both functions. I made a small change: instead of printing out swaps or not, I either return its value or I return 1 (just to avoid all the code added for println!).
In the code that doesn't use swaps, the optimizer has doubled the code length to (1) add a specific case when len = 2, and (2) unroll the loop to process 2 comparisons by iteration, skipping the last one if the length is odd.
The loop code with swaps is almost the same, though, so there shouldn't be any noticeable difference — unless branch prediction is very unlucky in one case and not the other (it shouldn't be a limitation on the CPU side), which I'd dismiss given the symmetry.
I see you don't observe the problem any more. Have your data changed, and is/was there any particular pattern to them? Other than that, it must indeed be a measurement error or a problem in the procedure. I doubt that the optimizer will merge both functions even if there's only a small difference at the end.
left: discards swaps, right: returns swaps (thus limiting the loop unrolling)
example::sort: |
cmp rsi, 2 |
jb .LBB0_13 |
lea rcx, [rsi - 1] |
mov eax, 1 |
cmp rsi, 2 |
jne .LBB0_2 |
.LBB0_9: |
test cl, 1 |
je .LBB0_13 |
mov ecx, dword ptr [rdi + 4*rax] |
.LBB0_11: |
mov edx, dword ptr [rdi + 4*rax - 4] |
cmp edx, ecx |
jbe .LBB0_13 |
mov dword ptr [rdi + 4*rax], edx |
dec rax |
mov dword ptr [rdi + 4*rax], ecx |
jne .LBB0_11 |
.LBB0_13: | example::sort:
mov eax, 1 | cmp rsi, 2
ret | jb .LBB0_7
.LBB0_2: | mov ecx, 1
mov rdx, rcx | xor eax, eax
and rdx, -2 | jmp .LBB0_4
xor esi, esi | .LBB0_2:
jmp .LBB0_3 | mov rax, rdx
.LBB0_8: | .LBB0_3:
add rax, 2 | inc rcx
add rsi, 2 | cmp rcx, rsi
cmp rsi, rdx | je .LBB0_8
je .LBB0_9 | .LBB0_4:
.LBB0_3: | lea rdx, [rax + rcx]
mov r8d, dword ptr [rdi + 4*rax] | mov r8d, dword ptr [rdi + 4*rcx]
mov r9, rax | mov r9, rcx
.LBB0_4: | .LBB0_5:
mov r10d, dword ptr [rdi + 4*r9 - 4] | mov r10d, dword ptr [rdi + 4*r9 - 4]
cmp r10d, r8d | cmp r10d, r8d
jbe .LBB0_5 | jbe .LBB0_3
mov dword ptr [rdi + 4*r9], r10d | inc rax
dec r9 | mov dword ptr [rdi + 4*r9], r10d
mov dword ptr [rdi + 4*r9], r8d | dec r9
jne .LBB0_4 | mov dword ptr [rdi + 4*r9], r8d
.LBB0_5: | test r9, r9
mov r8d, dword ptr [rdi + 4*rax + 4] | jne .LBB0_5
mov r9, rax | jmp .LBB0_2
.LBB0_6: | .LBB0_7:
mov r10d, dword ptr [rdi + 4*r9] | xor eax, eax
cmp r10d, r8d | .LBB0_8:
jbe .LBB0_8 | ret
mov dword ptr [rdi + 4*r9 + 4], r10d |
mov dword ptr [rdi + 4*r9], r8d |
add r9, -1 |
jb .LBB0_6 |
jmp .LBB0_8 |