Interesting question! Let me start by posting the numbers I get on my machine for the code you provided:
$ rustc +nightly --version
rustc 1.67.0-nightly (1eb62b123 2022-11-27)
$ cargo +nightly run --release
Compiling read_test v0.1.0 (/tmp/read_test)
Finished release [optimized + debuginfo] target(s) in 0.42s
Running `target/release/read_test`
Terrible hack: 57107µs
std::fs::read: 102822µs
Let's see what happens if we switch the order of the test cases:
#![feature(allocator_api)]
#![feature(ptr_as_uninit)]
use std::{alloc::{Layout, Allocator}, time::Instant};
use nix::{fcntl::OFlag, sys::stat::Mode};
use rand::{Rng, distributions::Uniform};
// Benchmark driver: writes a large test file, then times two ways of reading
// it back into memory — `std::fs::read` first, the hand-rolled
// allocate-and-`read(2)` path second — and finally checks both produced the
// same bytes.
fn main() {
// Build 400,000,000 random bytes; each is an offset drawn from
// `Uniform::new(0, 26)` added to b'a'.
let mut rng = rand::thread_rng();
let mut s = Vec::<u8>::new();
for _ in 0..400_000_000 {
s.push(rng.sample(Uniform::new(0, 26)) + 'a' as u8);
}
// Persist the data so both test cases read the same file.
let path = "testfile";
std::fs::write(path, s).unwrap();
// --- Test case 1: std::fs::read ---
let tic = Instant::now();
let v2 = std::fs::read(path).unwrap();
println!("std::fs::read: {:10}µs", tic.elapsed().as_micros());
// --- Test case 2: manual allocation + a single read ---
let tic = Instant::now();
// `f` is only used to obtain the file length from its metadata.
let f = std::fs::File::open(path).unwrap();
let n = f.metadata().unwrap().len() as usize;
// Allocate exactly `n` bytes (32-byte aligned) straight from the system
// allocator. Nothing in this function deallocates the buffer.
let a = std::alloc::System;
let layout = Layout::from_size_align(n, 32).unwrap();
let v = a.allocate(layout).unwrap();
// The file is opened a second time through `nix` to get a raw descriptor
// for `nix::unistd::read`.
let fd = nix::fcntl::open(path, OFlag::O_RDONLY, Mode::empty()).unwrap();
// Reinterpret the uninitialized allocation as `&mut [u8]` so it can be
// handed to `read`.
// NOTE(review): this forms a `&mut [u8]` over memory that has not been
// initialized yet; kept as-is because it is the approach being measured.
let v1 = unsafe { std::mem::transmute::<_, &mut [u8]>(v.as_uninit_slice_mut()) };
// One `read` call for the whole file; the assertion below requires that it
// returned all `n` bytes.
let x = nix::unistd::read(fd, v1).unwrap();
assert_eq!(x, n); // I know this can fail
println!("Terrible hack: {:10}µs", tic.elapsed().as_micros());
// Sanity check (outside the timed region): both approaches read identical
// contents.
assert_eq!(v1, v2);
}
Source
$ cargo +nightly run --release
Compiling read_test v0.1.0 (/tmp/read_test)
Finished release [optimized + debuginfo] target(s) in 0.41s
Running `target/release/read_test`
std::fs::read: 59099µs
Terrible hack: 96395µs
Now they've switched positions, and `std::fs::read` is significantly faster than the "terrible hack" implementation! This is a strong sign that we're not actually measuring what we think we're measuring.
Let's use `strace` to see what syscalls we actually make:
std::fs::read
clock_gettime(CLOCK_MONOTONIC, {tv_sec=97469, tv_nsec=90079100}) = 0
openat(AT_FDCWD, "testfile", O_RDONLY|O_CLOEXEC) = 3
statx(0, NULL, AT_STATX_SYNC_AS_STAT, STATX_ALL, NULL) = -1 EFAULT (Bad address)
statx(3, "", AT_STATX_SYNC_AS_STAT|AT_EMPTY_PATH, STATX_ALL, {stx_mask=STATX_ALL|0x1000, stx_attributes=0, stx_mode=S_IFREG|0644, stx_size=400000000, ...}) = 0
lseek(3, 0, SEEK_CUR) = 0
mmap(NULL, 400003072, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f2331def000
read(3, "oktktftsgibrigcysbdamxxylnyfbqhr"..., 400000000) = 400000000
read(3, "", 32) = 0
close(3) = 0
clock_gettime(CLOCK_MONOTONIC, {tv_sec=97469, tv_nsec=153213700}) = 0
write(1, "std::fs::read: 63134\302\265s\n", 29std::fs::read: 63134µs
Terrible hack
clock_gettime(CLOCK_MONOTONIC, {tv_sec=97469, tv_nsec=153860400}) = 0
openat(AT_FDCWD, "testfile", O_RDONLY|O_CLOEXEC) = 3
statx(3, "", AT_STATX_SYNC_AS_STAT|AT_EMPTY_PATH, STATX_ALL, {stx_mask=STATX_ALL|0x1000, stx_attributes=0, stx_mode=S_IFREG|0644, stx_size=400000000, ...}) = 0
mmap(NULL, 400003072, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f231a076000
openat(AT_FDCWD, "testfile", O_RDONLY) = 4
read(4, "oktktftsgibrigcysbdamxxylnyfbqhr"..., 400000000) = 400000000
clock_gettime(CLOCK_MONOTONIC, {tv_sec=97469, tv_nsec=252478000}) = 0
write(1, "Terrible hack: 98617\302\265s\n", 29Terrible hack: 98617µs
While there are some slight differences in the exact syscalls made (for a variety of reasons), both basically just boil down to:
- allocate enough memory for the file content (via `mmap`)
- read all of the file content into memory
Given that whichever test case runs first is considerably faster, and that both consist of the same core syscalls, it's reasonable to assume that the same operations simply take different amounts of time depending on when they run (the first set being faster for some reason and the second set being slower), and that this difference is what is actually being measured. The next step here would be to confirm that via `perf`, but I'm currently using WSL and it isn't available on my machine.
Note: I believe the "terrible hack" implementation is essentially equivalent to:
// Safe, stable-Rust variant of the manual read: size a zero-filled buffer from
// the file's metadata and fill it with a single `Read::read` call, timing the
// whole sequence.
// Assumes `path` is in scope from the surrounding program.
use std::{io::Read, time::Instant};
let tic = Instant::now();
let mut f = std::fs::File::open(path).unwrap();
// File length in bytes, used as the buffer size.
let n = f.metadata().unwrap().len() as usize;
let mut v3 = vec![0u8; n];
// A single `read` may return fewer than `n` bytes; the assertion below
// requires that it filled the whole buffer.
let x = f.read(&mut v3).unwrap();
assert_eq!(x, n);
println!("std::io::Read::read: {:10}µs", tic.elapsed().as_micros());
This version doesn't require third-party crates, `unsafe`, or unstable features. It produces the following syscalls:
clock_gettime(CLOCK_MONOTONIC, {tv_sec=98081, tv_nsec=272918100}) = 0
openat(AT_FDCWD, "testfile", O_RDONLY|O_CLOEXEC) = 5
statx(5, "", AT_STATX_SYNC_AS_STAT|AT_EMPTY_PATH, STATX_ALL, {stx_mask=STATX_ALL|0x1000, stx_attributes=0, stx_mode=S_IFREG|0644, stx_size=400000000, ...}) = 0
mmap(NULL, 400003072, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fddf6880000
read(5, "oktktftsgibrigcysbdamxxylnyfbqhr"..., 400000000) = 400000000
clock_gettime(CLOCK_MONOTONIC, {tv_sec=98081, tv_nsec=399405200}) = 0
write(1, "std::io::Read::read_to_end: "..., 42std::io::Read::read_to_end: 126487µs
It has similar performance characteristics to the others, depending on which implementation runs first in the program.