How to force inlining or avoid calls to the PLT?

I am writing a Linux dynamic linker in Rust, and I am having issues performing the initial relocations without segfaulting. I've seen how origin does it and can do the same, but I would like to make my code more readable.

The issue is that calls to the PLT (I am pretty sure it's the PLT...) will segfault because nothing has been relocated yet. I would really like to find a way to get this code working reliably:

// main.rs (called by a custom _start)
// Choose the constructor from whether a base address was supplied
// (NOTE(review): where `base` comes from is not shown in this excerpt).
let miros = if base.is_null() {
    // We are the executable:
    StaticPie::from_program_headers(&program_header_table, pseudorandom_bytes)
} else {
    // We are the dynamic linker:
    StaticPie::from_base(base, pseudorandom_bytes)
};

// `relocate` consumes the un-relocated value and returns the relocated one,
// on which the thread-local storage is then set up.
miros.relocate().allocate_tls(); // <- PLT call here

// static_pie.rs
use std::{
    arch::asm,
    marker::PhantomData,
    ptr::{null, null_mut},
    slice,
};

use crate::{
    arch::{
        exit, io,
        mmap::{mmap, MAP_ANONYMOUS, MAP_PRIVATE, PROT_READ, PROT_WRITE},
        thread_pointer::set_thread_pointer,
    },
    elf::{
        dynamic_array::{DynamicArrayItem, DynamicArrayIter, DT_RELA, DT_RELAENT, DT_RELASZ},
        header::{ElfHeader, ET_DYN},
        program_header::{ProgramHeader, PT_DYNAMIC, PT_PHDR, PT_TLS},
        relocate::{Rela, RelocationSlices},
        thread_local_storage::ThreadControlBlock,
    },
    linux::page_size,
    syscall_debug_assert,
    utils::round_up_to_boundary,
};

/// Typestate marker: a [`StaticPie`] whose relocations have not been applied yet.
pub struct Ingredients;
/// Typestate marker: a [`StaticPie`] as returned by `relocate`, i.e. with its
/// relocations applied.
pub struct Baked;

/// A struct representing a statically relocatable Position Independent Executable (PIE). 🥧
///
/// The type parameter `T` is either [`Ingredients`] (before `relocate`) or
/// [`Baked`] (after `relocate`); it is carried only through `phantom_data`.
pub struct StaticPie<T> {
    /// Address that `p_vaddr`, `r_offset` and relocation addends are added to.
    base_address: *const (),
    /// The relocation entries collected from the dynamic array.
    relocation_slices: RelocationSlices,
    /// The `PT_TLS` program header, or null when the image has none.
    tls_program_header: *const ProgramHeader,
    /// 16 bytes whose leading `usize` becomes the thread control block's canary.
    /// NOTE(review): presumably the kernel-provided random bytes — confirm against the caller.
    pseudorandom_bytes: *const [u8; 16],
    /// Zero-sized tag recording the typestate `T`.
    phantom_data: PhantomData<T>,
}

impl StaticPie<Ingredients> {
    /// Builds an un-relocated [`StaticPie`] from the address of a loaded ELF image.
    ///
    /// Reads the ELF header at `base`, walks its program header table to find the
    /// `PT_DYNAMIC` and (optional) `PT_TLS` entries and hands them to `build`.
    ///
    /// # Safety
    ///
    /// * `base` must point to the start of a readable, mapped ELF image whose
    ///   program header table lies at `base + e_phoff`.
    /// * The image must contain a `PT_DYNAMIC` segment; this is only checked by
    ///   `syscall_debug_assert!`.
    /// * `pseudorandom_bytes` is stored as-is and dereferenced later by
    ///   `allocate_tls`, so it must stay valid until then.
    pub unsafe fn from_base(
        base: *const (),
        pseudorandom_bytes: *const [u8; 16],
    ) -> StaticPie<Ingredients> {
        // ELF Header:
        let elf_header = &*(base as *const ElfHeader);
        syscall_debug_assert!(elf_header.e_type == ET_DYN);
        syscall_debug_assert!(elf_header.e_phentsize == size_of::<ProgramHeader>() as u16);

        // Program Headers:
        let program_header_table = slice::from_raw_parts(
            base.byte_add(elf_header.e_phoff) as *const ProgramHeader,
            elf_header.e_phnum as usize,
        );

        // Plain `for` loops are used on purpose: nothing has been relocated yet.
        let (mut dynamic_program_header, mut tls_program_header) = (null(), null());
        for program_header in program_header_table {
            match program_header.p_type {
                PT_DYNAMIC => dynamic_program_header = program_header,
                PT_TLS => tls_program_header = program_header,
                _ => (),
            }
        }
        syscall_debug_assert!(dynamic_program_header != null());

        Self::build(
            base,
            dynamic_program_header,
            tls_program_header,
            pseudorandom_bytes,
        )
    }

    /// Builds an un-relocated [`StaticPie`] from an already located program header table.
    ///
    /// The base address is derived from the `PT_PHDR` entry as
    /// `table address - p_vaddr`; when the table has no `PT_PHDR` entry the base
    /// stays null. `PT_DYNAMIC` and `PT_TLS` are looked up in the same pass.
    ///
    /// # Safety
    ///
    /// * `program_header_table` must be the program header table of the image
    ///   that is currently executing.
    /// * The table must contain a `PT_DYNAMIC` entry; this is only checked by
    ///   `syscall_debug_assert!`.
    /// * `pseudorandom_bytes` is stored as-is and dereferenced later by
    ///   `allocate_tls`, so it must stay valid until then.
    pub unsafe fn from_program_headers(
        program_header_table: &'static [ProgramHeader],
        pseudorandom_bytes: *const [u8; 16],
    ) -> StaticPie<Ingredients> {
        let (mut base, mut dynamic_program_header, mut tls_program_header) =
            (null(), null(), null());
        for program_header in program_header_table {
            match program_header.p_type {
                PT_PHDR => {
                    // `p_vaddr` of PT_PHDR is where the table was linked to live, so
                    // subtracting it from where the table actually is yields the base.
                    base = program_header_table
                        .as_ptr()
                        .byte_sub(program_header.p_vaddr) as *const ();
                }
                PT_DYNAMIC => dynamic_program_header = program_header,
                PT_TLS => tls_program_header = program_header,
                _ => (),
            }
        }
        syscall_debug_assert!(dynamic_program_header != null());

        Self::build(
            base,
            dynamic_program_header,
            tls_program_header,
            pseudorandom_bytes,
        )
    }

    /// Collects the `DT_RELA` relocation table from the dynamic array and
    /// assembles the [`StaticPie`].
    ///
    /// `DT_RELA` supplies the table address (relative to `base`) and `DT_RELASZ`
    /// its size in bytes, which is divided by `size_of::<Rela>()` to get the
    /// entry count. If no `DT_RELA` entry is present the relocation slice is empty.
    ///
    /// # Safety
    ///
    /// * `dynamic_program_header` must be non-null and point to a valid
    ///   `PT_DYNAMIC` program header of the image loaded at `base`.
    /// * The dynamic array at `base + p_vaddr` and the relocation table it
    ///   describes must be readable for the `'static` lifetime.
    #[must_use]
    unsafe fn build(
        base: *const (),
        dynamic_program_header: *const ProgramHeader,
        tls_program_header: *const ProgramHeader,
        pseudorandom_bytes: *const [u8; 16],
    ) -> StaticPie<Ingredients> {
        // Dynamic Array:
        let dynamic_array = DynamicArrayIter::new(
            base.byte_add((*dynamic_program_header).p_vaddr) as *const DynamicArrayItem
        );
        syscall_debug_assert!(dynamic_array.clone().count() != 0);

        let mut rela_pointer: *const Rela = null();
        let mut rela_count = 0;

        for item in dynamic_array {
            match item.d_tag {
                DT_RELA => {
                    rela_pointer = base.byte_add(item.d_un.d_ptr.addr()) as *const Rela;
                }
                DT_RELASZ => {
                    rela_count = item.d_un.d_val / core::mem::size_of::<Rela>();
                }
                #[cfg(debug_assertions)]
                DT_RELAENT => {
                    syscall_debug_assert!(item.d_un.d_val as usize == size_of::<Rela>())
                }
                _ => (),
            }
        }

        syscall_debug_assert!(rela_pointer != null());
        // `slice::from_raw_parts` requires a non-null, aligned pointer even for an
        // empty slice. The assertion above is presumably compiled out of release
        // builds, so an image without `DT_RELA` must not reach it with a null
        // pointer: fall back to an empty slice over a dangling (aligned) pointer.
        let rela_slice = if rela_pointer.is_null() {
            slice::from_raw_parts(core::ptr::NonNull::<Rela>::dangling().as_ptr(), 0)
        } else {
            slice::from_raw_parts(rela_pointer, rela_count)
        };

        StaticPie::<Ingredients> {
            base_address: base,
            relocation_slices: RelocationSlices { rela_slice },
            tls_program_header,
            pseudorandom_bytes,
            phantom_data: PhantomData,
        }
    }
}

impl StaticPie<Ingredients> {
    /// Applies every entry of the collected `Rela` table and returns the same
    /// image description tagged as [`Baked`].
    ///
    /// Only `R_X86_64_RELATIVE` and `R_X86_64_IRELATIVE` are handled; any other
    /// relocation type writes a message to `io::STD_ERR` and calls `exit::exit(3233)`.
    /// The relocation loop is compiled only for `target_arch = "x86_64"`; on any
    /// other architecture no relocation is applied.
    ///
    /// # Safety
    ///
    /// Every `r_offset + base_address` must be a writable, 8-byte location inside
    /// the image, and every `R_X86_64_IRELATIVE` addend must resolve to a callable
    /// `extern "C" fn() -> usize`.
    #[must_use]
    pub unsafe fn relocate(self) -> StaticPie<Baked> {
        #[cfg(target_arch = "x86_64")]
        for rela in self.relocation_slices.rela_slice {
            // Run-time address of the slot to patch.
            let relocate_address = rela.r_offset.wrapping_add(self.base_address.addr());

            // x86_64 assembly pointer widths:
            // byte  | 8 bits  (1 byte)
            // word  | 16 bits (2 bytes)
            // dword | 32 bits (4 bytes) | "double word"
            // qword | 64 bits (8 bytes) | "quad word"
            use crate::elf::relocate::{R_X86_64_IRELATIVE, R_X86_64_RELATIVE};
            match rela.r_type() {
                R_X86_64_RELATIVE => {
                    // The stored value is simply `base + addend`.
                    let relocate_value =
                        self.base_address.addr().wrapping_add_signed(rela.r_addend);
                    // The 8-byte store is written as inline assembly
                    // (presumably so that no compiler-generated call is involved
                    // before relocation has finished — confirm).
                    asm!(
                        "mov qword ptr [{}], {}",
                        in(reg) relocate_address,
                        in(reg) relocate_value,
                        options(nostack, preserves_flags),
                    );
                }
                R_X86_64_IRELATIVE => {
                    // `base + addend` is the address of a function; the value to
                    // store is whatever that function returns.
                    let function_pointer =
                        self.base_address.addr().wrapping_add_signed(rela.r_addend);
                    let function: extern "C" fn() -> usize = core::mem::transmute(function_pointer);
                    let relocate_value = function();
                    asm!(
                        "mov qword ptr [{}], {}",
                        in(reg) relocate_address,
                        in(reg) relocate_value,
                        options(nostack, preserves_flags),
                    );
                }
                _ => {
                    // Any other relocation type is reported and the process exits.
                    io::write(io::STD_ERR, "Unsupported Relocation");
                    exit::exit(3233);
                }
            }
        }

        // Same fields, new typestate tag.
        // NOTE(review): changing the type parameter through `..self` appears to rely
        // on the unstable `type_changing_struct_update` feature — confirm the toolchain.
        StaticPie::<Baked> {
            phantom_data: PhantomData::<Baked>,
            ..self
        }
    }
}

impl StaticPie<Baked> {
    /// Allocates the static TLS block plus the thread control block for the
    /// current thread and points the thread pointer at it.
    ///
    /// Does nothing when the image has no `PT_TLS` program header. Otherwise a
    /// fresh anonymous mapping is created, the TLS initialisation image
    /// (`p_filesz` bytes) is copied to its start, the remainder up to `p_memsz`
    /// is zeroed, and a [`ThreadControlBlock`] is written directly behind the
    /// block (at `p_memsz` rounded up to `p_align`).
    ///
    /// # Safety
    ///
    /// * Relocations must already have been applied (this is what the [`Baked`]
    ///   typestate stands for).
    /// * `tls_program_header`, if non-null, must point to a valid `PT_TLS` header
    ///   of the image loaded at `base_address`.
    /// * `pseudorandom_bytes` must point to 16 readable bytes.
    pub unsafe fn allocate_tls(self) {
        // Static Thread Local Storage [before Thread Pointer]:
        //                                         ┌---------------------┐
        //      ┌----------------------------┐  <- |    tls-offset[1]    |
        //      |      Static TLS Block      |     |---------------------|
        //      |----------------------------|  <- | Thread Pointer (TP) |
        // ┌--- | Thread Control Block (TCB) |     └---------------------┘
        // |    └----------------------------┘
        // |
        // |   ┌------------------┐
        // └-> | Null Dtv Pointer |
        //     └------------------┘
        // NOTE: I am not bothering with alignment at the first address because it's already page aligned...
        if self.tls_program_header.is_null() {
            return;
        }
        // Borrow the header instead of copying it out of the image.
        let tls_program_header = &*self.tls_program_header;

        let tls_blocks_size_and_align =
            round_up_to_boundary(tls_program_header.p_memsz, tls_program_header.p_align);
        let tcb_size = size_of::<ThreadControlBlock>();

        let required_size = tls_blocks_size_and_align + tcb_size;
        let tls_block_pointer = mmap(
            null_mut(),
            required_size,
            PROT_READ | PROT_WRITE,
            MAP_PRIVATE | MAP_ANONYMOUS,
            -1, // file descriptor (-1 for anonymous mapping)
            0,  // offset
        );
        syscall_debug_assert!(tls_block_pointer.addr() % page_size::get_page_size() == 0);

        // Initialize the TLS data from template image.
        // The template lives in the *loaded* image, so it is found at
        // `base + p_vaddr` (the same way the dynamic array is located).
        // `p_offset` is the position inside the file on disk and only happens to
        // match when a segment's file offset equals its virtual address.
        slice::from_raw_parts_mut(tls_block_pointer as *mut u8, tls_program_header.p_filesz)
            .copy_from_slice(slice::from_raw_parts(
                self.base_address.byte_add(tls_program_header.p_vaddr) as *const u8,
                tls_program_header.p_filesz,
            ));

        // Zero out TLS data beyond `p_filesz`:
        slice::from_raw_parts_mut(
            tls_block_pointer.byte_add(tls_program_header.p_filesz) as *mut u8,
            tls_program_header.p_memsz - tls_program_header.p_filesz,
        )
        .fill(0);

        // Initialize the Thread Control Block (TCB):
        let thread_control_block =
            tls_block_pointer.byte_add(tls_blocks_size_and_align) as *mut ThreadControlBlock;

        // The thread pointer refers to the TCB's `thread_pointee` field, and the
        // TCB stores that same address in `thread_pointer_register`.
        let thread_pointer_register: *mut () =
            (*thread_control_block).thread_pointee.as_mut_ptr().cast();

        *thread_control_block = ThreadControlBlock {
            thread_pointee: [],
            thread_pointer_register,
            dynamic_thread_vector: null_mut(),
            _padding: [0; 3],
            // The slice is exactly `size_of::<usize>()` bytes long, so the
            // conversion to a fixed-size array cannot fail.
            canary: usize::from_ne_bytes(
                (*self.pseudorandom_bytes)[..size_of::<usize>()]
                    .try_into()
                    .unwrap(),
            ),
        };

        // Make the thread pointer (which is fs on x86_64) point to the TCB:
        set_thread_pointer(thread_pointer_register);
    }
}

When debugging, the segfault occurs like so:

Breakpoint 1, miros::rust_main (stack_pointer=0x7fffffffe7c0) at src/main.rs:87
87	   miros.relocate().allocate_tls();
(gdb) x/15i $pc
=> 0x23800b <_ZN5miros9rust_main17hbf1c54ea22468c36E+1547>:	
    lea    0x1c8(%rsp),%rdi
   0x238013 <_ZN5miros9rust_main17hbf1c54ea22468c36E+1555>:	
    lea    0x178(%rsp),%rsi
   0x23801b <_ZN5miros9rust_main17hbf1c54ea22468c36E+1563>:	
    mov    $0x28,%edx
   0x238020 <_ZN5miros9rust_main17hbf1c54ea22468c36E+1568>:	
    call   0x2ec640
   0x238025 <_ZN5miros9rust_main17hbf1c54ea22468c36E+1573>:	
    lea    0x1a0(%rsp),%rdi
   0x23802d <_ZN5miros9rust_main17hbf1c54ea22468c36E+1581>:	
    lea    0x1c8(%rsp),%rsi
   0x238035 <_ZN5miros9rust_main17hbf1c54ea22468c36E+1589>:	
    call   0x237000 <_ZN5miros10static_pie47StaticPie$LT$miros..static_pie..Ingredients$GT$8relocate17h8991c1b754563d5fE>
   0x23803a <_ZN5miros9rust_main17hbf1c54ea22468c36E+1594>:	
    lea    0x1a0(%rsp),%rdi
   0x238042 <_ZN5miros9rust_main17hbf1c54ea22468c36E+1602>:	
    call   0x237020 <_ZN5miros10static_pie41StaticPie$LT$miros..static_pie..Baked$GT$12allocate_tls17h6659d728bf640a30E>
   0x238047 <_ZN5miros9rust_main17hbf1c54ea22468c36E+1607>:	
--Type <RET> for more, q to quit, c to continue without paging--
    lea    0x1f0(%rsp),%rdi
   0x23804f <_ZN5miros9rust_main17hbf1c54ea22468c36E+1615>:	
    call   0x23a790 <_ZN5alloc3vec12Vec$LT$T$GT$3new17h4563aef74b1e7031E>
   0x238054 <_ZN5miros9rust_main17hbf1c54ea22468c36E+1620>:	
    lea    0x1f0(%rsp),%rdi
   0x23805c <_ZN5miros9rust_main17hbf1c54ea22468c36E+1628>:	
    lea    -0x37713(%rip),%rsi        # 0x200950
   0x238063 <_ZN5miros9rust_main17hbf1c54ea22468c36E+1635>:	
    mov    $0x5,%edx
   0x238068 <_ZN5miros9rust_main17hbf1c54ea22468c36E+1640>:	
    lea    0xb58e9(%rip),%rcx        # 0x2ed958
(gdb) si
0x0000000000238013	87	   miros.relocate().allocate_tls();
(gdb) si
0x000000000023801b	87	   miros.relocate().allocate_tls();
(gdb) si
0x0000000000238020	87	   miros.relocate().allocate_tls();
(gdb) si
0x00000000002ec640 in ?? ()
   from ./target/debug/miros
(gdb) x/15i $pc
=> 0x2ec640:	
    jmp    *0x7f52(%rip)        # 0x2f4598
   0x2ec646:	push   $0x0
   0x2ec64b:	jmp    0x0
   0x2ec650:	
    jmp    *0x7f4a(%rip)        # 0x2f45a0
   0x2ec656:	push   $0x1
   0x2ec65b:	jmp    0x0
   0x2ec660:	
    jmp    *0x7f42(%rip)        # 0x2f45a8
   0x2ec666:	push   $0x2
   0x2ec66b:	jmp    0x0
   0x2ec670:	
    jmp    *0x7f3a(%rip)        # 0x2f45b0
   0x2ec676:	push   $0x3
   0x2ec67b:	jmp    0x0
   0x2ec680:	
    jmp    *0x7f32(%rip)        # 0x2f45b8
   0x2ec686:	push   $0x4
   0x2ec68b:	jmp    0x0
(gdb) si
0x0000000000000000 in ?? ()
(gdb) si

Program received signal SIGSEGV, Segmentation fault.
0x0000000000000000 in ?? ()

The table(?)/section that it jumps to, where it then proceeds to dereference a null pointer, seems to be this one:

Disassembly of section .iplt:

00000000002ec640 <.iplt>:
  2ec640:	ff 25 52 7f 00 00    	jmp    *0x7f52(%rip)        # 2f4598 <_GLOBAL_OFFSET_TABLE_+0x18>
  2ec646:	68 00 00 00 00       	push   $0x0
  2ec64b:	e9 b0 39 d1 ff       	jmp    0 <__evoke_link_warning_gethostbyname2_r>
  2ec650:	ff 25 4a 7f 00 00    	jmp    *0x7f4a(%rip)        # 2f45a0 <_GLOBAL_OFFSET_TABLE_+0x20>
  2ec656:	68 01 00 00 00       	push   $0x1
  2ec65b:	e9 a0 39 d1 ff       	jmp    0 <__evoke_link_warning_gethostbyname2_r>
  2ec660:	ff 25 42 7f 00 00    	jmp    *0x7f42(%rip)        # 2f45a8 <_GLOBAL_OFFSET_TABLE_+0x28>
  2ec666:	68 02 00 00 00       	push   $0x2
  2ec66b:	e9 90 39 d1 ff       	jmp    0 <__evoke_link_warning_gethostbyname2_r>
  2ec670:	ff 25 3a 7f 00 00    	jmp    *0x7f3a(%rip)        # 2f45b0 <_GLOBAL_OFFSET_TABLE_+0x30>
  2ec676:	68 03 00 00 00       	push   $0x3
  2ec67b:	e9 80 39 d1 ff       	jmp    0 <__evoke_link_warning_gethostbyname2_r>
  2ec680:	ff 25 32 7f 00 00    	jmp    *0x7f32(%rip)        # 2f45b8 <_GLOBAL_OFFSET_TABLE_+0x38>
  2ec686:	68 04 00 00 00       	push   $0x4
  2ec68b:	e9 70 39 d1 ff       	jmp    0 <__evoke_link_warning_gethostbyname2_r>
  2ec690:	ff 25 2a 7f 00 00    	jmp    *0x7f2a(%rip)        # 2f45c0 <_GLOBAL_OFFSET_TABLE_+0x40>
  2ec696:	68 05 00 00 00       	push   $0x5
  2ec69b:	e9 60 39 d1 ff       	jmp    0 <__evoke_link_warning_gethostbyname2_r>
  2ec6a0:	ff 25 22 7f 00 00    	jmp    *0x7f22(%rip)        # 2f45c8 <_GLOBAL_OFFSET_TABLE_+0x48>
  2ec6a6:	68 06 00 00 00       	push   $0x6
  2ec6ab:	e9 50 39 d1 ff       	jmp    0 <__evoke_link_warning_gethostbyname2_r>
  2ec6b0:	ff 25 1a 7f 00 00    	jmp    *0x7f1a(%rip)        # 2f45d0 <_GLOBAL_OFFSET_TABLE_+0x50>
  2ec6b6:	68 07 00 00 00       	push   $0x7
  2ec6bb:	e9 40 39 d1 ff       	jmp    0 <__evoke_link_warning_gethostbyname2_r>
  2ec6c0:	ff 25 12 7f 00 00    	jmp    *0x7f12(%rip)        # 2f45d8 <_GLOBAL_OFFSET_TABLE_+0x58>
  2ec6c6:	68 08 00 00 00       	push   $0x8
  2ec6cb:	e9 30 39 d1 ff       	jmp    0 <__evoke_link_warning_gethostbyname2_r>
  2ec6d0:	ff 25 0a 7f 00 00    	jmp    *0x7f0a(%rip)        # 2f45e0 <_GLOBAL_OFFSET_TABLE_+0x60>
  2ec6d6:	68 09 00 00 00       	push   $0x9
  2ec6db:	e9 20 39 d1 ff       	jmp    0 <__evoke_link_warning_gethostbyname2_r>
  2ec6e0:	ff 25 02 7f 00 00    	jmp    *0x7f02(%rip)        # 2f45e8 <_GLOBAL_OFFSET_TABLE_+0x68>
  2ec6e6:	68 0a 00 00 00       	push   $0xa
  2ec6eb:	e9 10 39 d1 ff       	jmp    0 <__evoke_link_warning_gethostbyname2_r>

I am compiling with this .cargo/config.toml

[build]
rustflags = ["-C", "target-feature=+crt-static", "-C", "link-arg=-nostartfiles"
]

and file shows this:

❯ file ./target/debug/miros                                                                     01/01/25
./target/debug/miros: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), statically linked, BuildID[sha1]=7232f6a293119031a16474c11812750c8a1a6d9b, with debug_info, not stripped, too many notes (256)

and readelf shows this:

❯ readelf -d ./target/debug/miros                                                               01/01/25

There is no dynamic section in this file.

❯ readelf -r ./target/debug/miros                                                               02/01/25

Relocation section '.rela.dyn' at offset 0x298 contains 23 entries:
  Offset          Info           Type           Sym. Value    Sym. Name + Addend
0000002f4598  000000000025 R_X86_64_IRELATIV                    2a8a60
0000002f45a0  000000000025 R_X86_64_IRELATIV                    2a9480
0000002f45a8  000000000025 R_X86_64_IRELATIV                    2a9280
0000002f45b0  000000000025 R_X86_64_IRELATIV                    2a89f0
0000002f45b8  000000000025 R_X86_64_IRELATIV                    2a89f0
0000002f45c0  000000000025 R_X86_64_IRELATIV                    2a98a0
0000002f45c8  000000000025 R_X86_64_IRELATIV                    2a9380
0000002f45d0  000000000025 R_X86_64_IRELATIV                    2a97d0
0000002f45d8  000000000025 R_X86_64_IRELATIV                    2a9910
0000002f45e0  000000000025 R_X86_64_IRELATIV                    2a9740
0000002f45e8  000000000025 R_X86_64_IRELATIV                    2a9660
0000002f45f0  000000000025 R_X86_64_IRELATIV                    2a96d0
0000002f45f8  000000000025 R_X86_64_IRELATIV                    2aa3f0
0000002f4600  000000000025 R_X86_64_IRELATIV                    2a9840
0000002f4608  000000000025 R_X86_64_IRELATIV                    2a9570
0000002f4610  000000000025 R_X86_64_IRELATIV                    2a99f0
0000002f4618  000000000025 R_X86_64_IRELATIV                    2a8970
0000002f4620  000000000025 R_X86_64_IRELATIV                    2a95e0
0000002f4628  000000000025 R_X86_64_IRELATIV                    2a9a60
0000002f4630  000000000025 R_X86_64_IRELATIV                    2aa4d0
0000002f4638  000000000025 R_X86_64_IRELATIV                    2c3630
0000002f4640  000000000025 R_X86_64_IRELATIV                    2c35c0
0000002f4648  000000000025 R_X86_64_IRELATIV                    2c4c10

❯ readelf -e ./target/debug/miros                                                               01/01/25
ELF Header:
  Magic:   7f 45 4c 46 02 01 01 00 00 00 00 00 00 00 00 00 
  Class:                             ELF64
  Data:                              2's complement, little endian
  Version:                           1 (current)
  OS/ABI:                            UNIX - System V
  ABI Version:                       0
  Type:                              EXEC (Executable file)
  Machine:                           Advanced Micro Devices X86-64
  Version:                           0x1
  Entry point address:               0x23a170
  Start of program headers:          64 (bytes into file)
  Start of section headers:          4847824 (bytes into file)
  Flags:                             0x0
  Size of this header:               64 (bytes)
  Size of program headers:           56 (bytes)
  Number of program headers:         10
  Size of section headers:           64 (bytes)
  Number of section headers:         40
  Section header string table index: 38

Section Headers:
  [Nr] Name              Type             Address           Offset
       Size              EntSize          Flags  Link  Info  Align
  [ 0]                   NULL             0000000000000000  00000000
       0000000000000000  0000000000000000           0     0     0
  [ 1] .note.gnu.bu[...] NOTE             0000000000200270  00000270
       0000000000000024  0000000000000000   A       0     0     4
  [ 2] .rela.dyn         RELA             0000000000200298  00000298
       0000000000000228  0000000000000018   A       0     0     8
  [ 3] .rodata           PROGBITS         00000000002004c0  000004c0
       0000000000020823  0000000000000000 AMS       0     0     32
  [ 4] .gcc_except_table PROGBITS         0000000000220ce4  00020ce4
       0000000000002e00  0000000000000000   A       0     0     4
  [ 5] rodata.cst32      PROGBITS         0000000000223b00  00023b00
       0000000000000060  0000000000000020  AM       0     0     32
  [ 6] .eh_frame_hdr     PROGBITS         0000000000223b60  00023b60
       0000000000002f8c  0000000000000000   A       0     0     4
  [ 7] .eh_frame         PROGBITS         0000000000226af0  00026af0
       000000000000e708  0000000000000000   A       0     0     8
  [ 8] .text             PROGBITS         0000000000236200  00035200
       00000000000b6431  0000000000000000  AX       0     0     64
  [ 9] .iplt             PROGBITS         00000000002ec640  000eb640
       0000000000000170  0000000000000000  AX       0     0     16
  [10] .tdata            PROGBITS         00000000002ed7b0  000eb7b0
       0000000000000038  0000000000000000 WAT       0     0     8
  [11] .tbss             NOBITS           00000000002ed7e8  000eb7e8
       0000000000000068  0000000000000000 WAT       0     0     8
  [12] .data.rel.ro      PROGBITS         00000000002ed800  000eb800
       0000000000006558  0000000000000000  WA       0     0     32
  [13] .init_array       INIT_ARRAY       00000000002f3d58  000f1d58
       0000000000000008  0000000000000000  WA       0     0     8
  [14] .fini_array       FINI_ARRAY       00000000002f3d60  000f1d60
       0000000000000008  0000000000000008  WA       0     0     8
  [15] .got              PROGBITS         00000000002f3d68  000f1d68
       0000000000000818  0000000000000000  WA       0     0     8
  [16] .got.plt          PROGBITS         00000000002f4580  000f2580
       00000000000000d0  0000000000000000  WA       0     0     8
  [17] .relro_padding    NOBITS           00000000002f4650  000f2650
       00000000000009b0  0000000000000000  WA       0     0     1
  [18] .data             PROGBITS         00000000002f5660  000f2660
       0000000000001a5c  0000000000000000  WA       0     0     32
  [19] .bss              NOBITS           00000000002f70c0  000f40bc
       00000000000052e8  0000000000000000  WA       0     0     32
  [20] .debug_abbrev     PROGBITS         0000000000000000  000f40bc
       0000000000003e1d  0000000000000000           0     0     1
  [21] .debug_info       PROGBITS         0000000000000000  000f7ed9
       0000000000109e41  0000000000000000           0     0     1
  [22] .debug_aranges    PROGBITS         0000000000000000  00201d1a
       0000000000007aa0  0000000000000000           0     0     1
  [23] .debug_ranges     PROGBITS         0000000000000000  002097ba
       00000000000aae60  0000000000000000           0     0     1
  [24] .debug_str        PROGBITS         0000000000000000  002b461a
       0000000000155335  0000000000000001  MS       0     0     1
  [25] .comment          PROGBITS         0000000000000000  0040994f
       00000000000000ef  0000000000000001  MS       0     0     1
  [26] .debug_frame      PROGBITS         0000000000000000  00409a40
       0000000000000ab8  0000000000000000           0     0     8
  [27] .debug_line       PROGBITS         0000000000000000  0040a4f8
       000000000006983f  0000000000000000           0     0     1
  [28] .debug_loc        PROGBITS         0000000000000000  00473d37
       0000000000000280  0000000000000000           0     0     1
  [29] .gnu.build.a[...] NOTE             0000000000000000  00473fb8
       0000000000002274  0000000000000000           0     0     4
  [30] .note.stapsdt     NOTE             0000000000000000  0047622c
       0000000000001994  0000000000000000           0     0     4
  [31] .gnu.warning[...] PROGBITS         0000000000000000  00477bc0
       0000000000000087  0000000000000000           0     0     32
  [32] .gnu.warning[...] PROGBITS         0000000000000000  00477c60
       000000000000008c  0000000000000000           0     0     32
  [33] .gnu.warning[...] PROGBITS         0000000000000000  00477d00
       0000000000000086  0000000000000000           0     0     32
  [34] .gnu.warning[...] PROGBITS         0000000000000000  00477da0
       000000000000008b  0000000000000000           0     0     32
  [35] .gnu.warning[...] PROGBITS         0000000000000000  00477e40
       0000000000000082  0000000000000000           0     0     32
  [36] .gnu.warning[...] PROGBITS         0000000000000000  00477ee0
       0000000000000083  0000000000000000           0     0     32
  [37] .symtab           SYMTAB           0000000000000000  00477f68
       0000000000010d58  0000000000000018          39   2175     8
  [38] .shstrtab         STRTAB           0000000000000000  00488cc0
       0000000000000205  0000000000000000           0     0     1
  [39] .strtab           STRTAB           0000000000000000  00488ec5
       0000000000016a04  0000000000000000           0     0     1
Key to Flags:
  W (write), A (alloc), X (execute), M (merge), S (strings), I (info),
  L (link order), O (extra OS processing required), G (group), T (TLS),
  C (compressed), x (unknown), o (OS specific), E (exclude),
  D (mbind), l (large), p (processor specific)

Program Headers:
  Type           Offset             VirtAddr           PhysAddr
                 FileSiz            MemSiz              Flags  Align
  PHDR           0x0000000000000040 0x0000000000200040 0x0000000000200040
                 0x0000000000000230 0x0000000000000230  R      0x8
  LOAD           0x0000000000000000 0x0000000000200000 0x0000000000200000
                 0x00000000000351f8 0x00000000000351f8  R      0x1000
  LOAD           0x0000000000035200 0x0000000000236200 0x0000000000236200
                 0x00000000000b65b0 0x00000000000b65b0  R E    0x1000
  LOAD           0x00000000000eb7b0 0x00000000002ed7b0 0x00000000002ed7b0
                 0x0000000000006ea0 0x0000000000007850  RW     0x1000
  LOAD           0x00000000000f2660 0x00000000002f5660 0x00000000002f5660
                 0x0000000000001a5c 0x0000000000006d48  RW     0x1000
  TLS            0x00000000000eb7b0 0x00000000002ed7b0 0x00000000002ed7b0
                 0x0000000000000038 0x00000000000000a0  R      0x8
  GNU_RELRO      0x00000000000eb7b0 0x00000000002ed7b0 0x00000000002ed7b0
                 0x0000000000006ea0 0x0000000000007850  R      0x1
  GNU_EH_FRAME   0x0000000000023b60 0x0000000000223b60 0x0000000000223b60
                 0x0000000000002f8c 0x0000000000002f8c  R      0x4
  GNU_STACK      0x0000000000000000 0x0000000000000000 0x0000000000000000
                 0x0000000000000000 0x0000000000000000  RW     0x0
  NOTE           0x0000000000000270 0x0000000000200270 0x0000000000200270
                 0x0000000000000024 0x0000000000000024  R      0x4

 Section to Segment mapping:
  Segment Sections...
   00     
   01     .note.gnu.build-id .rela.dyn .rodata .gcc_except_table rodata.cst32 .eh_frame_hdr .eh_frame 
   02     .text .iplt 
   03     .tdata .data.rel.ro .init_array .fini_array .got .got.plt .relro_padding 
   04     .data .bss 
   05     .tdata .tbss 
   06     .tdata .data.rel.ro .init_array .fini_array .got .got.plt .relro_padding 
   07     .eh_frame_hdr 
   08     
   09     .note.gnu.build-id

I know what the problem is, this instruction in the .iplt section:

jmp    *0x7f52(%rip)        # 2f4598 <_GLOBAL_OFFSET_TABLE_+0x18>

It is dereferencing the target of a relocation that has yet to be performed. This one, to be exact:

0000002f4598  000000000025 R_X86_64_IRELATIV     2a8a60

I tried using #[inline(always)] but LLVM ignores that in debug mode... apparently it's just a suggestion.
Is there another way to avoid calls to the PLT while still using pretty functions, or do I just have to write C code in rust?

Thanks in advance for any help!

There is no way to force inlining. What you could try, however, is compiling with -Clink-arg=-Wl,-Bsymbolic to force all exported symbols to have protected visibility. This avoids the need for PLT entries and thus almost certainly causes the linker to emit direct calls instead. GOT entries will also become pre-resolved. You only need to make sure to apply relative relocations immediately, before doing anything else, and to add an optimization barrier that prevents the compiler from moving code to before the point where the relative relocations are applied, e.g. by making the call to the rest of the code using inline asm. Having to resolve relative relocations is unavoidable, as data can't reference other data or functions using pc-relative addressing.

That got everything inlined perfectly, unfortunately it still didn't fix my issue.

I found the label at the indirect relocation's addend, and it is __new_memcpy_ifunc not my method. Which means that inlining was never the issue. I'm not sure why I didn't think to check that before...

I am assuming __new_memcpy_ifunc is a runtime selector for memcpy, which I seem to remember was mentioned in origin's source code (I think it might have been memset, but I digress). I expect if there was a well-known way to avoid that origin would have been written differently, but I will still look into it.

Anyway, this does answer my question. Thank you very much for the help!

P.S. I have seen you in dryads repo and your work on origin... You're kind of a celebrity to me... So yeah, thanks again!

It looks like you are linking against glibc. Glibc is tightly integrated with its dynamic linker, so even if you handle this case, thread spawning, for example, will likely be broken: your dynamic linker has no way to register the thread-local storage of all dynamic libraries with glibc, and thus glibc won't create the thread-local storage. This is kind of the inverse of the problem I had when trying to get musl's dynamic linker working with origin, where I needed to patch musl's dynamic linker to have origin handle the TLS. I switched to doing something different before I could finish that work. Maybe I will pick it back up again in the future if nobody else beats me to it.

Admittedly I have no idea how I am going to do the glibc integration, this is my first "hard" project in the low-level world, so I am just taking it one step at a time.
The main goal is to learn enough to be able to write documentation on the subject.

Is the source code for your attempted musl patch available anywhere? I would love to have some more examples to work with.

Also, I just got it working!!

I think the memcpy is copying my struct:

# C ABI x86_64 calling convention (rdi, rsi, rdx)
=> 0x7ffff7f3d36b:	lea    0x350(%rsp),%rdi
   0x7ffff7f3d373:	lea    0x300(%rsp),%rsi
   0x7ffff7f3d37b:	mov    $0x28,%edx       # 0x28 = 40 bytes
   0x7ffff7f3d380:	call   0x7ffff7feeee0

Which happens to be the size of my struct:

// Total: 8 + 16 + 8 + 8 + 0 = 40 bytes (0x28), the length passed to memcpy above.
pub struct StaticPie<T> {
    base_address: *const (), // 8 bytes
    rela_slice: &'static [Rela], // 16 bytes (pointer + length)
    tls_program_header: *const ProgramHeader, // 8 bytes
    pseudorandom_bytes: *const [u8; 16], // 8 bytes
    phantom_data: PhantomData<T>, // 0 bytes
}

If I decrease the size of my struct it seems to remove the memcpy, anything below 40 seems to work fine:

// Total: 8 + 8 + 8 + 0 = 24 bytes, below the 40-byte size that produced the memcpy.
pub struct StaticPie<T> {
    base_address: *const (), // 8 bytes
    dynamic_array: *const DynamicArrayItem, // 8 bytes
    tls_program_header: *const ProgramHeader, // 8 bytes
    phantom_data: PhantomData<T>, // 0 bytes
}

I mean, it still segfaults, but I think that's my tls_allocation not the function call.

A cleaner example: this segfaults pre-relocation:

// Five `u64` fields: 5 * 8 = 40 bytes.
struct Test(u64, u64, u64, u64, u64);
impl Test {
    pub fn new() -> Self {
        Self(0, 1, 2, 3, 4)
    }
    // Takes and returns `self` by value.
    pub fn add(self) -> Self {
        // The sum is computed and discarded.
        self.0 + self.1 + self.2 + self.3 + self.4;
        self
    }
}
Test::new().add();

but this doesn't:

// Four `u64` fields: 4 * 8 = 32 bytes.
struct Test(u64, u64, u64, u64);
impl Test {
    pub fn new() -> Self {
        Self(0, 1, 2, 3)
    }
    // Takes and returns `self` by value.
    pub fn add(self) -> Self {
        // The sum is computed and discarded.
        self.0 + self.1 + self.2 + self.3;
        self
    }
}
Test::new().add();

I expect it's just an optimization, but I am not going to dig into it too deep at the moment.

It also may be why closures are segfaulting...

GitHub - bjorn3/origin at dynlink is the origin side and the musl patch is:

diff --git a/ldso/dynlink.c b/ldso/dynlink.c
index 324aa859..7371e06d 100644
--- a/ldso/dynlink.c
+++ b/ldso/dynlink.c
@@ -30,10 +30,68 @@ static size_t ldso_page_size;
 
 #include "libc.h"
 
-#define malloc __libc_malloc
-#define calloc __libc_calloc
-#define realloc __libc_realloc
-#define free __libc_free
+__asm__(
+".text\n"
+".global __tlsdesc_static\n"
+".hidden __tlsdesc_static\n"
+".type __tlsdesc_static,@function\n"
+"__tlsdesc_static:\n"
+"      mov 8(%rax),%rax\n"
+"      ret\n"
+"\n"
+".global __tlsdesc_dynamic\n"
+".hidden __tlsdesc_dynamic\n"
+".type __tlsdesc_dynamic,@function\n"
+"__tlsdesc_dynamic:\n"
+"      mov 8(%rax),%rax\n"
+"      push %rdx\n"
+"      mov %fs:8,%rdx\n"
+"      push %rcx\n"
+"      mov (%rax),%rcx\n"
+"      mov 8(%rax),%rax\n"
+"      add (%rdx,%rcx,8),%rax\n"
+"      pop %rcx\n"
+"      sub %fs:0,%rax\n"
+"      pop %rdx\n"
+"      ret\n"
+"\n"
+"/* Copyright 2011-2012 Nicholas J. Kain, licensed under standard MIT license */\n"
+".text\n"
+".global __set_thread_area\n"
+".hidden __set_thread_area\n"
+".type __set_thread_area,@function\n"
+"__set_thread_area:\n"
+"      mov %rdi,%rsi           /* shift for syscall */\n"
+"      movl $0x1002,%edi       /* SET_FS register */\n"
+"      movl $158,%eax          /* set fs segment to */\n"
+"      syscall                 /* arch_prctl(SET_FS, arg)*/\n"
+"      ret\n"
+);
+
+#include "../src/malloc/replaced.c"
+#include "../src/internal/libc.c"
+#include "../src/thread/default_attr.c"
+#include "../src/internal/defsysinfo.c"
+#include "../src/signal/block.c"
+#include "../src/internal/version.c"
+
+hidden void __malloc_donate(char *start, char *end) {}
+
+hidden void __dl_vseterr(const char *fmt, va_list ap) {}
+
+hidden void __inhibit_ptc() {}
+hidden void __release_ptc() {}
+
+hidden void __tl_lock(void) {}
+hidden void __tl_unlock(void) {}
+
+#define __environ environ
+#define ___errno_location __errno_location
+
+int __membarrier(int cmd, int flags)
+{
+       return 0;
+}
 
 static void error_impl(const char *, ...);
 static void error_noop(const char *, ...);
@@ -128,6 +186,57 @@ static struct builtin_tls {
 } builtin_tls[1];
 #define MIN_TLS_ALIGN offsetof(struct builtin_tls, pt)
 
+// FIXME reimplement in origin
+int __init_tp(void *p)
+{
+       pthread_t td = p;
+       td->self = td;
+       int r = __set_thread_area(TP_ADJ(p));
+       if (r < 0) return -1;
+       if (!r) libc.can_do_threads = 1;
+       td->next = td->prev = td;
+       //td->dtors[0] = 0;
+       //td->dtors[1] = 0;
+       //td->dtors[2] = 0;
+       return 0;
+}
+
+// FIXME reimplement in origin
+void *__copy_tls(unsigned char *mem)
+{
+       pthread_t td;
+       struct tls_module *p;
+       size_t i;
+       uintptr_t *dtv;
+
+#ifdef TLS_ABOVE_TP
+       dtv = (uintptr_t*)(mem + libc.tls_size) - (libc.tls_cnt + 1);
+
+       mem += -((uintptr_t)mem + sizeof(struct pthread)) & (libc.tls_align-1);
+       td = (pthread_t)mem;
+       mem += sizeof(struct pthread);
+
+       for (i=1, p=libc.tls_head; p; i++, p=p->next) {
+               dtv[i] = (uintptr_t)(mem + p->offset) + DTP_OFFSET;
+               memcpy(mem + p->offset, p->image, p->len);
+       }
+#else
+       dtv = (uintptr_t *)mem;
+
+       mem += libc.tls_size - sizeof(struct pthread);
+       mem -= (uintptr_t)mem & (libc.tls_align-1);
+       td = (pthread_t)mem;
+
+       for (i=1, p=libc.tls_head; p; i++, p=p->next) {
+               dtv[i] = (uintptr_t)(mem - p->offset) + DTP_OFFSET;
+               memcpy(mem - p->offset, p->image, p->len);
+       }
+#endif
+       dtv[0] = libc.tls_cnt;
+       td->dtv = dtv;
+       return td;
+}
+
 #define ADDEND_LIMIT 4096
 static size_t *saved_addends, *apply_addends_to;
 
@@ -499,6 +608,7 @@ static void do_relocs(struct dso *dso, size_t *rel, size_t rel_size, size_t stri
                        reloc_addr[1] = def.sym ? (size_t)def.dso->got : 0;
                        break;
                case REL_DTPMOD:
+                       printf("%d %s\n", def.dso->tls_id, def.dso->name);
                        *reloc_addr = def.dso->tls_id;
                        break;
                case REL_DTPOFF:
@@ -1282,10 +1392,16 @@ static void load_direct_deps(struct dso *p)
                p->deps[cnt++] = q;
        for (i=0; p->dynv[i]; i+=2) {
                if (p->dynv[i] != DT_NEEDED) continue;
+               if (strcmp(p->strings + p->dynv[i+1], "libgcc_s.so.1") == 0) {
+                       continue;
+               }
+               if (strcmp(p->strings + p->dynv[i+1], "ld-linux-x86-64.so.2") == 0) {
+                       continue;
+               }
                struct dso *dep = load_library(p->strings + p->dynv[i+1], p);
                if (!dep) {
-                       error("Error loading shared library %s: %m (needed by %s)",
-                               p->strings + p->dynv[i+1], p->name);
+                       error("Error loading shared library %s: %s (needed by %s)",
+                               p->strings + p->dynv[i+1], strerror(errno), p->name);
                        if (runtime) longjmp(*rtld_fail, 1);
                        continue;
                }
@@ -1646,6 +1762,7 @@ static void update_tls_size()
        tls_align);
 }
 
+// FIXME reimplement in origin
 static void install_new_tls(void)
 {
        sigset_t set;
@@ -1890,7 +2007,7 @@ void __dls3(size_t *sp, size_t *auxv)
                }
                argv[-1] = (void *)(argc - (argv-argv_orig));
                if (!argv[0]) {
-                       dprintf(2, "musl libc (" LDSO_ARCH ")\n"
+                       dprintf(2, "eyra (" LDSO_ARCH ") with musl dynamic linker\n"
                                "Version %s\n"
                                "Dynamic Program Loader\n"
                                "Usage: %s [options] [--] pathname%s\n",
diff --git a/src/internal/pthread_impl.h b/src/internal/pthread_impl.h
index de2b9d8b..c045ce05 100644
--- a/src/internal/pthread_impl.h
+++ b/src/internal/pthread_impl.h
@@ -35,16 +35,20 @@ struct pthread {
        int tid;
        int errno_val;
        volatile int detach_state;
+       /*
        volatile int cancel;
        volatile unsigned char canceldisable, cancelasync;
        unsigned char tsd_used:1;
        unsigned char dlerror_flag:1;
        unsigned char *map_base;
+       */
        size_t map_size;
        void *stack;
        size_t stack_size;
        size_t guard_size;
        void *result;
+       //size_t dtors[3];
+       /*
        struct __ptcb *cancelbuf;
        void **tsd;
        struct {
@@ -57,7 +61,7 @@ struct pthread {
        locale_t locale;
        volatile int killlock[1];
        char *dlerror_buf;
-       void *stdio_locks;
+       void *stdio_locks;*/
 
        /* Part 3 -- the positions of these fields relative to
         * the end of the structure is external and internal ABI. */

But, as I said, thread-local storage is mostly broken with this.