[arm] Binary without branch works, binary with branch does not

Thread update: [arm] Binary without branch works, binary with branch does not - #8 by lopote3501

I'm on rustc 1.37.0-nightly. This is my code:

/// Register block that is overlaid on a memory-mapped peripheral by casting
/// its base address to `*mut Registers`.
///
/// `#[repr(C)]` keeps the fields in declaration order, so each field sits at a
/// fixed offset from the base address.
// NOTE(review): the field names and the one-word gaps between groups look like
// the BCM283x GPIO register bank (GPFSEL, GPSET, GPCLR, ...) — confirm the
// offsets against the datasheet.
#[repr(C)]
// Upper-case field names are kept on purpose; presumably they mirror the
// hardware register names.
#[allow(non_snake_case)]
struct Registers {
    // Six read/write 32-bit words.
    FSEL: [Volatile<u32>; 6],
    // `__rN` fields: one `Reserved<u32>` each. Presumably unused padding words
    // that keep the following group at the right offset — TODO confirm.
    __r0: Reserved<u32>,
    // Judging by the wrapper name, write-only — verify against `WriteVolatile`.
    SET: [WriteVolatile<u32>; 2],
    __r1: Reserved<u32>,
    // Judging by the wrapper name, write-only — verify against `WriteVolatile`.
    CLR: [WriteVolatile<u32>; 2],
    __r2: Reserved<u32>,
    // Judging by the wrapper name, read-only — verify against `ReadVolatile`.
    LEV: [ReadVolatile<u32>; 2],
    __r3: Reserved<u32>,
    EDS: [Volatile<u32>; 2],
    __r4: Reserved<u32>,
    REN: [Volatile<u32>; 2],
    __r5: Reserved<u32>,
    FEN: [Volatile<u32>; 2],
    __r6: Reserved<u32>,
    HEN: [Volatile<u32>; 2],
    __r7: Reserved<u32>,
    LEN: [Volatile<u32>; 2],
    __r8: Reserved<u32>,
    AREN: [Volatile<u32>; 2],
    __r9: Reserved<u32>,
    AFEN: [Volatile<u32>; 2],
    __r10: Reserved<u32>,
    // Note: no reserved word between PUD and PUDCLK, unlike the groups above.
    PUD: Volatile<u32>,
    PUDCLK: [Volatile<u32>; 2],
}

I have a problem when I execute this line (from Reading structures in memory via pointers - #19 by Ethindp ):

// SAFETY (NOTE(review): to be confirmed): `as_mut()` only checks for null, so
// `unwrap()` can panic only when GPIO_BASE is 0. The caller must still ensure
// that GPIO_BASE is suitably aligned, points at a live register block, and that
// no other `&mut Registers` to the same address exists.
unsafe { (GPIO_BASE as *mut Registers).as_mut().unwrap() }

I do not know precisely what happens when that line is executed, because I'm running it on a Raspberry Pi 3 and nothing after that line appears to execute.

The address is correct because reading from/writing to it with plain addresses such as const FSEL1: *mut u32 = (GPIO_BASE + 0x04) as *mut u32; works.

The Volatile types are just wrappers around read_volatile()/write_volatile() (I don't think Volatile is the problem, as I also tried with *mut [u32;...]).

What can the problem be?

No idea. But a while back I made this experiment to memory map the Pi system timer and read it in user space:

//
// Read memory mapped system timer of Raspberry Pi.
//
// See: https://users.rust-lang.org/t/memory-mapping-soc-registers/31976/15
//

use std::fs;
use memmap::{MmapOptions, MmapMut};
use volatile::Volatile; 

/// Handle for reading the system timer through a memory mapping.
pub struct SystemTimer {
    // Mutable memory map whose start is reinterpreted as the timer's
    // `Registers` block by the read methods.
    mmap: MmapMut,
}

/// Layout of the system timer registers as seen from the start of the mapping.
///
/// `#[repr(C)]` keeps the seven 32-bit fields in declaration order, so each
/// field sits at a fixed offset (0x00, 0x04, ... 0x18).
// NOTE(review): field order presumably follows the BCM283x system timer
// register map — confirm against the datasheet.
#[repr(C)] 
struct Registers {
    control_and_status: Volatile<u32>,
    // Lower 32 bits of the counter (combined with `counter_high` by
    // `read_counter_64`).
    counter_low: Volatile<u32>,
    // Upper 32 bits of the counter.
    counter_high: Volatile<u32>,
    compare_0: Volatile<u32>,
    compare_1: Volatile<u32>,
    compare_2: Volatile<u32>,
    compare_3: Volatile<u32>,
}

impl SystemTimer {
    /// Maps the system timer registers and returns a handle for reading them.
    ///
    /// # Panics
    ///
    /// Panics if `/dev/mem` cannot be opened for reading and writing, or if
    /// the register region cannot be memory-mapped.
    pub fn new() -> SystemTimer {
        SystemTimer {
            mmap: SystemTimer::map_registers(),
        }
    }

    /// Maps the 4096-byte region of `/dev/mem` that starts at the system timer.
    ///
    /// # Panics
    ///
    /// Panics with a descriptive message if opening or mapping `/dev/mem` fails.
    fn map_registers() -> MmapMut {
        // All peripherals can be described by an offset from the Peripheral Base Address, which starts at:
        // 0x20000000 on the Raspberry Pi model 1
        // 0x3F000000 on the models 2 and 3.
        const PERIPHERAL_BASE_ADDRESS: u64 = 0x3F00_0000;

        // The System Timer is a hardware clock that can be used to keep time and generate interrupts after a certain time.
        // It is located at offset 0x3000 from the peripheral base.
        const SYSTEM_TIMER_OFFSET: u64 = 0x3000;

        // Length of the mapping in bytes; comfortably larger than `Registers`.
        const MAP_LEN: usize = 4096;

        let dev_mem = fs::OpenOptions::new()
            .read(true)
            .write(true)
            .open("/dev/mem")
            .expect("failed to open /dev/mem for reading and writing");

        // SAFETY: `map_mut` is unsafe because the mapped contents can change
        // behind the program's back. That is expected for device registers,
        // and every access in this impl goes through `Volatile::read`.
        unsafe {
            MmapOptions::new()
                .offset(PERIPHERAL_BASE_ADDRESS + SYSTEM_TIMER_OFFSET)
                .len(MAP_LEN)
                .map_mut(&dev_mem)
                .expect("failed to map the system timer registers from /dev/mem")
        }
    }

    /// Returns the start of the mapping viewed as the timer register block.
    ///
    /// Producing the pointer is safe; dereferencing it is up to the caller.
    fn registers_ptr(&self) -> *const Registers {
        self.mmap.as_ptr() as *const Registers
    }

    /// Reads the lower 32 bits of the counter.
    pub fn read_counter_low(&self) -> u32 {
        // SAFETY: the pointer is the start of the 4096-byte mapping owned by
        // `self`, which outlives this call and is larger than `Registers`.
        unsafe { (*self.registers_ptr()).counter_low.read() }
    }

    /// Reads the upper 32 bits of the counter.
    pub fn read_counter_high(&self) -> u32 {
        // SAFETY: the pointer is the start of the 4096-byte mapping owned by
        // `self`, which outlives this call and is larger than `Registers`.
        unsafe { (*self.registers_ptr()).counter_high.read() }
    }

    /// Reads the full 64-bit counter as `(high << 32) | low`.
    ///
    /// The two halves cannot be read in one access, so the high word is read
    /// again after the low word. If it changed in between, the low word may
    /// have wrapped and the whole read is retried.
    pub fn read_counter_64(&self) -> u64 {
        let registers = self.registers_ptr();
        loop {
            // SAFETY: the pointer is the start of the 4096-byte mapping owned
            // by `self`, which outlives this call and is larger than
            // `Registers`.
            let (high, low, high_again) = unsafe {
                // Order matters: high, then low, then high once more.
                let first_high = (*registers).counter_high.read();
                let low_word = (*registers).counter_low.read();
                let second_high = (*registers).counter_high.read();
                (first_high, low_word, second_high)
            };
            if high == high_again {
                return (u64::from(high) << 32) | u64::from(low);
            }
        }
    }
}

It works. On a Pi 3 at least. I don't know if my use of "unsafe" is strictly correct though.

1 Like

Are you on a Linux distro? I see .open("/dev/mem").
I'm running mine bare metal

Are you sure it's not padding? You might need to add #[repr(packed)]. It might also have something to do with memory protection. I don't know how "bare metal" your environment is, but you might need to make sure you are allowed to read and write the addresses you want to access.

Ah, sorry yes, I'm running that on a regular 32 bit Pi OS.

I might have been wrong; it was not that line. When I mark all the methods I call as inline(always), it works. When I don't, it does not. The kernel is only 2 kB vs 4 kB in size. It could be that my bootloader does not load "large" kernels. I will try to find out the answer and post it here.

That certainly smells like Undefined Behavior. Do you have any unsafe code? Can you provide some code or at least some context as to what the code is supposed to be doing?

I got to the point where a small change makes a big difference: inline(always) vs inline(never). The diff of the disassemblies (of the whole binaries) is:

$ diff objdump_always objdump_never
30c30
<  4000054:	bl	#0xc <kmain::h497e3e7b019098a1>
---
>  4000054:	bl	#0x24 <kmain::h497e3e7b019098a1>
34,56c34,57
< 0000000004000060 kmain::h497e3e7b019098a1:
<  4000060:	mov	w8, #0x4
<  4000064:	movk	w8, #0x3f20, lsl #16
<  4000068:	ldr	w9, [x8]
<  400006c:	orr	w10, w9, #0x40000
<  4000070:	mov	w9, #0x27c0
<  4000074:	movk	w9, #0x9, lsl #16
<  4000078:	str	w10, [x8]
<  400007c:	ldr	w10, [x8, #0x18]
<  4000080:	orr	w11, w10, #0x10000
<  4000084:	mov	x10, x9
<  4000088:	str	w11, [x8, #0x18]
<  400008c:	subs	x10, x10, #0x1
<  4000090:	nop
<  4000094:	b.ne	#-0x8 <kmain::h497e3e7b019098a1+0x2c>
<  4000098:	ldr	w10, [x8, #0x24]
<  400009c:	orr	w11, w10, #0x10000
<  40000a0:	mov	x10, x9
<  40000a4:	str	w11, [x8, #0x24]
<  40000a8:	subs	x10, x10, #0x1
<  40000ac:	nop
<  40000b0:	b.ne	#-0x8 <kmain::h497e3e7b019098a1+0x48>
<  40000b4:	b	#-0x38 <kmain::h497e3e7b019098a1+0x1c>
---
> 0000000004000060 spin_sleep_ms::hbe40fb542a7832db.llvm.13348190869585059564:
>  4000060:	mov	w8, #0x27c0
>  4000064:	movk	w8, #0x9, lsl #16
>  4000068:	subs	x8, x8, #0x1
>  400006c:	nop
>  4000070:	b.ne	#-0x8 <spin_sleep_ms::hbe40fb542a7832db.llvm.13348190869585059564+0x8>
>  4000074:	ret
> 
> 0000000004000078 kmain::h497e3e7b019098a1:
>  4000078:	stp	x19, x30, [sp, #-0x10]!
>  400007c:	mov	w19, #0x4
>  4000080:	movk	w19, #0x3f20, lsl #16
>  4000084:	ldr	w8, [x19]
>  4000088:	orr	w8, w8, #0x40000
>  400008c:	str	w8, [x19]
>  4000090:	ldr	w8, [x19, #0x18]
>  4000094:	orr	w8, w8, #0x10000
>  4000098:	str	w8, [x19, #0x18]
>  400009c:	bl	#-0x3c <spin_sleep_ms::hbe40fb542a7832db.llvm.13348190869585059564>
>  40000a0:	ldr	w8, [x19, #0x24]
>  40000a4:	orr	w8, w8, #0x10000
>  40000a8:	str	w8, [x19, #0x24]
>  40000ac:	bl	#-0x4c <spin_sleep_ms::hbe40fb542a7832db.llvm.13348190869585059564>
>  40000b0:	b	#-0x20 <kmain::h497e3e7b019098a1+0x18>

The always version works, the never version does not.
I wonder if it's due to some of the instructions being for the wrong architecture? Running qemu works:

qemu-system-aarch64 \
    -nographic \
    -M raspi3 \
    -d in_asm \
    -serial null -serial mon:stdio \
    -kernel kernel.bin

I am running on a raspberry pi 3 a+. This is my .cargo/config:

[build]
target = "aarch64-unknown-none"

[target.aarch64-unknown-none]
runner = "./qemu.sh"
rustflags = [
    "-C", "target-cpu=cortex-a53",
    "-C", "link-arg=--script=.cargo/layout.ld",
    "-C", "link-arg=--no-dynamic-linker",

    # link to libsd.a
    "-C", "link-arg=-L.cargo",
    "-C", "link-arg=-lsd",
]

From the technical spec my rpi3a+ should be able to run this. What am I missing?

The two binaries are very small. I will post both.

This is the one with inline(always) (works):

cargo objdump -- -disassemble -no-show-raw-insn -print-imm-hex build/kernel.elf

build/kernel.elf:	file format ELF64-aarch64-little

Disassembly of section .text:
0000000004000000 _start:
 4000000:	mrs	x1, MPIDR_EL1
 4000004:	and	x1, x1, #0x3
 4000008:	cbz	x1, #0xc <_start+0x14>
 400000c:	wfe
 4000010:	b	#-0x4 <_start+0xc>
 4000014:	adr	x1, #-0x14
 4000018:	mov	sp, x1
 400001c:	bl	#0xc <kinit>
 4000020:	b	#-0x14 <_start+0xc>
 4000024:	udf	#0x0

0000000004000028 kinit:
 4000028:	adrp	x8, #0x0
 400002c:	add	x8, x8, #0xc0
 4000030:	adrp	x9, #0x0
 4000034:	add	x9, x9, #0xc0
 4000038:	cmp	x9, x8
 400003c:	b.hs	#0x18 <kinit+0x2c>
 4000040:	adrp	x9, #0x0
 4000044:	add	x9, x9, #0xc0
 4000048:	str	xzr, [x9], #0x8
 400004c:	cmp	x9, x8
 4000050:	b.lo	#-0x8 <kinit+0x20>
 4000054:	bl	#0xc <kmain::h497e3e7b019098a1>
 4000058:	brk	#0x1
 400005c:	udf	#0x0

0000000004000060 kmain::h497e3e7b019098a1:
 4000060:	mov	w8, #0x4
 4000064:	movk	w8, #0x3f20, lsl #16
 4000068:	ldr	w9, [x8]
 400006c:	orr	w10, w9, #0x40000
 4000070:	mov	w9, #0x27c0
 4000074:	movk	w9, #0x9, lsl #16
 4000078:	str	w10, [x8]
 400007c:	ldr	w10, [x8, #0x18]
 4000080:	orr	w11, w10, #0x10000
 4000084:	mov	x10, x9
 4000088:	str	w11, [x8, #0x18]
 400008c:	subs	x10, x10, #0x1
 4000090:	nop
 4000094:	b.ne	#-0x8 <kmain::h497e3e7b019098a1+0x2c>
 4000098:	ldr	w10, [x8, #0x24]
 400009c:	orr	w11, w10, #0x10000
 40000a0:	mov	x10, x9
 40000a4:	str	w11, [x8, #0x24]
 40000a8:	subs	x10, x10, #0x1
 40000ac:	nop
 40000b0:	b.ne	#-0x8 <kmain::h497e3e7b019098a1+0x48>
 40000b4:	b	#-0x38 <kmain::h497e3e7b019098a1+0x1c>

The next one is the one with inline(never) (does not work):

cargo objdump -- -disassemble -no-show-raw-insn -print-imm-hex build/kernel.elf

build/kernel.elf:	file format ELF64-aarch64-little

Disassembly of section .text:
0000000004000000 _start:
 4000000:	mrs	x1, MPIDR_EL1
 4000004:	and	x1, x1, #0x3
 4000008:	cbz	x1, #0xc <_start+0x14>
 400000c:	wfe
 4000010:	b	#-0x4 <_start+0xc>
 4000014:	adr	x1, #-0x14
 4000018:	mov	sp, x1
 400001c:	bl	#0xc <kinit>
 4000020:	b	#-0x14 <_start+0xc>
 4000024:	udf	#0x0

0000000004000028 kinit:
 4000028:	adrp	x8, #0x0
 400002c:	add	x8, x8, #0xc0
 4000030:	adrp	x9, #0x0
 4000034:	add	x9, x9, #0xc0
 4000038:	cmp	x9, x8
 400003c:	b.hs	#0x18 <kinit+0x2c>
 4000040:	adrp	x9, #0x0
 4000044:	add	x9, x9, #0xc0
 4000048:	str	xzr, [x9], #0x8
 400004c:	cmp	x9, x8
 4000050:	b.lo	#-0x8 <kinit+0x20>
 4000054:	bl	#0x24 <kmain::h497e3e7b019098a1>
 4000058:	brk	#0x1
 400005c:	udf	#0x0

0000000004000060 spin_sleep_ms::hbe40fb542a7832db.llvm.13348190869585059564:
 4000060:	mov	w8, #0x27c0
 4000064:	movk	w8, #0x9, lsl #16
 4000068:	subs	x8, x8, #0x1
 400006c:	nop
 4000070:	b.ne	#-0x8 <spin_sleep_ms::hbe40fb542a7832db.llvm.13348190869585059564+0x8>
 4000074:	ret

0000000004000078 kmain::h497e3e7b019098a1:
 4000078:	stp	x19, x30, [sp, #-0x10]!
 400007c:	mov	w19, #0x4
 4000080:	movk	w19, #0x3f20, lsl #16
 4000084:	ldr	w8, [x19]
 4000088:	orr	w8, w8, #0x40000
 400008c:	str	w8, [x19]
 4000090:	ldr	w8, [x19, #0x18]
 4000094:	orr	w8, w8, #0x10000
 4000098:	str	w8, [x19, #0x18]
 400009c:	bl	#-0x3c <spin_sleep_ms::hbe40fb542a7832db.llvm.13348190869585059564>
 40000a0:	ldr	w8, [x19, #0x24]
 40000a4:	orr	w8, w8, #0x10000
 40000a8:	str	w8, [x19, #0x24]
 40000ac:	bl	#-0x4c <spin_sleep_ms::hbe40fb542a7832db.llvm.13348190869585059564>
 40000b0:	b	#-0x20 <kmain::h497e3e7b019098a1+0x18>

The first does not have the stp instruction the second has. I don't know if that makes any difference.

I am changing the title of this thread to reflect this last post.

I think you made a mistake, @lopote3501. Those two assembly listings are completely identical. Neither of them has the stp instruction.

Thank you, edited the post above. They are correct now

1 Like

Is it helpful if I post the binaries and not only their disassemblies?
Is the .cargo/config correct for the rpi3a+?

Could you please post the Rust source instead? It would be easier for us to get started with at the higher level.

This topic was automatically closed 90 days after the last reply. We invite you to open a new topic if you have further questions or comments.