Which mutex to use? parking_lot or std::sync?

parking_lot has considerably more features than std's mutex, even after std's recent improvements, so use it if you need them. Discussing what performance benefits there may or may not be is pointless without benchmarks. Here's a simple one:

use std::ops::DerefMut;
use std::sync;
use std::time::Instant;
/// Times `mutex_bench` with `std::sync::Mutex` and `parking_lot::Mutex` under
/// three thread counts (one thread, one thread per unit of available
/// parallelism, and 500 threads) and prints the elapsed time of each run.
fn main() {
    // Runs `work` once and returns how long it took.
    fn timed(work: impl FnOnce()) -> std::time::Duration {
        let started = Instant::now();
        work();
        started.elapsed()
    }

    // Total number of lock acquisitions in the first two scenarios.
    const TOTAL_LOCKS: usize = 10_000_000;
    let cores: usize = std::thread::available_parallelism().unwrap().get();

    // A single thread, so the lock is never contended.
    println!(
        "std uncontended: {:?}",
        timed(|| mutex_bench::<sync::Mutex<_>>(1, TOTAL_LOCKS))
    );
    println!(
        "parking_lot uncontended: {:?}",
        timed(|| mutex_bench::<parking_lot::Mutex<_>>(1, TOTAL_LOCKS))
    );

    // `cores` threads sharing the same total amount of work.
    println!(
        "std medium contention: {:?}",
        timed(|| mutex_bench::<sync::Mutex<_>>(cores, TOTAL_LOCKS / cores))
    );
    println!(
        "parking_lot medium contention: {:?}",
        timed(|| mutex_bench::<parking_lot::Mutex<_>>(cores, TOTAL_LOCKS / cores))
    );

    // 500 threads with 20_000 acquisitions each.
    println!(
        "std high contention: {:?}",
        timed(|| mutex_bench::<sync::Mutex<_>>(500, 20_000))
    );
    println!(
        "parking_lot high contention: {:?}",
        timed(|| mutex_bench::<parking_lot::Mutex<_>>(500, 20_000))
    );
}

/// Increments one shared counter from `threads` scoped threads, each of which
/// takes the lock `count` times and adds one per acquisition.
///
/// Panics if the final counter value is not `threads * count`.
fn mutex_bench<M: Mutex<usize>>(threads: usize, count: usize) {
    let counter = M::new(0usize);
    std::thread::scope(|scope| {
        for _ in 0..threads {
            scope.spawn(|| {
                for _ in 0..count {
                    // The guard is a temporary, so it is dropped at the end of
                    // this statement.
                    *counter.lock() += 1;
                }
            });
        }
    });
    // `thread::scope` returns only after all spawned threads have been joined.
    let total = *counter.lock();
    assert_eq!(total, threads * count);
}

/// Minimal locking interface that lets the benchmark be generic over the
/// mutex implementation being measured.
trait Mutex<T>: Sync + Sized {
    /// Handle returned by `lock`; it dereferences mutably to the protected `T`.
    type Guard<'g>: DerefMut<Target = T> + 'g where Self: 'g;

    /// Creates a mutex holding `value`.
    fn new(value: T) -> Self;

    /// Acquires the lock and returns its guard.
    fn lock(&self) -> Self::Guard<'_>;
}
/// Adapts the standard library mutex to the benchmark's `Mutex` trait.
impl<T: Send> Mutex<T> for sync::Mutex<T> {
    type Guard<'g> = sync::MutexGuard<'g, T> where Self: 'g;

    fn new(value: T) -> Self {
        sync::Mutex::new(value)
    }

    fn lock(&self) -> Self::Guard<'_> {
        // Calls the inherent `std::sync::Mutex::lock`, which returns a
        // `LockResult`; `unwrap` panics if that result is an `Err`.
        let acquired = sync::Mutex::lock(self);
        acquired.unwrap()
    }
}
/// Adapts `parking_lot::Mutex` to the benchmark's `Mutex` trait.
impl<T: Send> Mutex<T> for parking_lot::Mutex<T> {
    type Guard<'g> = parking_lot::MutexGuard<'g, T> where Self: 'g;

    fn new(value: T) -> Self {
        parking_lot::Mutex::new(value)
    }

    fn lock(&self) -> Self::Guard<'_> {
        // The inherent `lock` already returns the guard, so there is nothing
        // to unwrap here.
        let guard = self.lock();
        guard
    }
}

Running it on my 4-core, 8-thread x64 Windows machine, I got this:

std uncontended: 146.9767ms
parking_lot uncontended: 120.4104ms
std medium contention: 194.4102ms
parking_lot medium contention: 526.2664ms
std high contention: 205.6215ms
parking_lot high contention: 353.115ms

On my machine, parking_lot's better if the mutex is uncontended, but std wins otherwise.

edit:
Running the same benchmark on my Raspberry Pi 4B running 32-bit Linux (4 cores, 4 threads, target armv7-unknown-linux-musleabihf), I got this:

std uncontended: 537.478562ms
parking_lot uncontended: 520.8173ms
std medium contention: 1.24173564s
parking_lot medium contention: 788.189293ms
std high contention: 1.440051111s
parking_lot high contention: 39.99078841s

On this machine, parking_lot is roughly on par with std's mutex when uncontended and clearly faster under medium contention, but it hits a pathological case with a very high number of threads (about 40 s versus 1.4 s for std).