What is the implementation of `count_ones`?


#1

I’m trying to find out how the numeric function count_ones is implemented. In particular, I want to know whether it uses the SSE popcnt instruction.

I searched the Rust source tree, but found no fundamental function definitions.

How do I do this? Thanks!


#2

From @lambda’s post, Copy generictype + trait implementation for array needed in a separate function provides an easy way to see what assembly is actually generated for any given construct. It looks like it uses popcnt if you set -C target-cpu=native but by default is a shift-add chain. I used the following code:

// Type your code here, or load an example.
pub fn square(num: i32) -> u32 {
    num.count_ones()
}

Searching the source for count_ones turns up an implementation buried in a macro; the relevant macro call site is delegating to an intrinsic. Intrinsics in general are matched by the compiler; ctpop appears to be turned into a call to llvm.ctpop.i64.


#3

That’s because Rust doesn’t define how it works. Given:

#[allow(dead_code)]
#[inline(never)]
pub fn count_ones(v: u64) -> u32 {
    v.count_ones()
}

and rustc --crate-type lib count_ones.rs -O as the base compiler invocation, you can get the generated LLVM IR by adding --emit llvm-ir, which looks like:

; ModuleID = 'count_ones.0.rs'
target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
target triple = "i686-pc-windows-gnu"

; Function Attrs: noinline norecurse nounwind readnone uwtable
define i32 @_ZN10count_ones20hc73667123f942da6eaaE(i64) unnamed_addr #0 {
entry-block:
  %1 = tail call i64 @llvm.ctpop.i64(i64 %0) #2
  %2 = trunc i64 %1 to i32
  ret i32 %2
}

; Function Attrs: nounwind readnone
declare i64 @llvm.ctpop.i64(i64) #1

attributes #0 = { noinline norecurse nounwind readnone uwtable }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }

So it boils down to an LLVM intrinsic. If you use --emit asm, you can get the assembly which looks like:

    .text
    .def     @feat.00;
    .scl    3;
    .type    0;
    .endef
    .globl    @feat.00
@feat.00 = 1
    .def     __ZN10count_ones20hc73667123f942da6eaaE;
    .scl    2;
    .type    32;
    .endef
    .globl    __ZN10count_ones20hc73667123f942da6eaaE
    .align    16, 0x90
__ZN10count_ones20hc73667123f942da6eaaE:
    .cfi_startproc
    movl    4(%esp), %eax
    movl    8(%esp), %ecx
    movl    %ecx, %edx
    shrl    %edx
    andl    $1431655765, %edx
    subl    %edx, %ecx
    movl    %ecx, %edx
    andl    $858993459, %edx
    shrl    $2, %ecx
    andl    $858993459, %ecx
    addl    %edx, %ecx
    movl    %ecx, %edx
    shrl    $4, %edx
    addl    %ecx, %edx
    andl    $252645135, %edx
    imull    $16843009, %edx, %ecx
    shrl    $24, %ecx
    movl    %eax, %edx
    shrl    %edx
    andl    $1431655765, %edx
    subl    %edx, %eax
    movl    %eax, %edx
    andl    $858993459, %edx
    shrl    $2, %eax
    andl    $858993459, %eax
    addl    %edx, %eax
    movl    %eax, %edx
    shrl    $4, %edx
    addl    %eax, %edx
    andl    $252645135, %edx
    imull    $16843009, %edx, %eax
    shrl    $24, %eax
    addl    %ecx, %eax
    retl
    .cfi_endproc

However, if I override the target CPU and specify something more recent using --emit asm -C target-cpu=haswell:

    .text
    .def     @feat.00;
    .scl    3;
    .type    0;
    .endef
    .globl    @feat.00
@feat.00 = 1
    .def     __ZN10count_ones20hc73667123f942da6eaaE;
    .scl    2;
    .type    32;
    .endef
    .globl    __ZN10count_ones20hc73667123f942da6eaaE
    .align    16, 0x90
__ZN10count_ones20hc73667123f942da6eaaE:
    .cfi_startproc
    popcntl    8(%esp), %ecx
    popcntl    4(%esp), %eax
    addl    %ecx, %eax
    retl
    .cfi_endproc

And there’s popcntl.

So, the answer to your question is: uh, maybe; it depends.

Edit: oh, a more direct way to get it to use popcntl: add -C target-feature=+popcnt. I would have checked this before, but I got bored waiting for LLVM to install so I could list CPU features (llc -mcpu=help).


#4

I know that this is an old-thread, but I recently read a paper on this:

Using SIMD instructions can be faster than hardware pop-count functions: