Calculating sha1 for files using tokio::io::copy

I'm looking to add checksums for file uploads by calculating a SHA-1 hash while the data is copied with tokio::io::copy.

// `copy` borrows the reader mutably, so the binding itself has to be `mut`.
let mut src = StreamReader::new(request_body);
// `File::open` opens the file read-only, so it cannot serve as the copy destination;
// `File::create` opens it for writing, creating it or truncating an existing file.
let mut dest = File::create("foo").await?;
// Propagate a failed copy with `?`, like the line above, instead of panicking.
tokio::io::copy(&mut src, &mut dest).await?;

What is a good way to calculate the SHA-1? Files can be large, spanning several gigabytes.

One option is to create an AsyncRead wrapper that calls the underlying AsyncRead type to do the read and then uses any newly read data to update the hasher:

use pin_project_lite::pin_project;
use sha1::{Digest, Sha1};
use std::{error::Error, io::Read, task::Poll};
use tokio::{
    fs::{File, OpenOptions},
    io::AsyncRead,
};

// A reader wrapper that pairs an inner reader with a running SHA-1 state, so data
// can be hashed while it is being read.
//
// Using pin_project_lite to avoid the unsafe pin manipulation
pin_project! {
    pub struct HashRead<T> {
        // The wrapped reader. Marked `#[pin]` so that `project()` yields it as a
        // pinned reference, which is what `poll_read` on the inner reader needs.
        #[pin]
        read: T,
        // Running hash state, updated with the bytes read through this wrapper.
        //
        // CAUTION: Sha1 is considered broken, don't use it where you need strong
        // cryptographic guarantees from a hash function
        hasher: Sha1,
    }
}

impl<T> HashRead<T> {
    /// Wraps `read`, starting from an empty SHA-1 state.
    pub fn new(read: T) -> Self {
        let hasher = Sha1::new();
        Self { read, hasher }
    }

    /// Consumes the wrapper and returns the raw digest bytes of everything
    /// that has been fed to the hasher so far.
    pub fn hash(self) -> Vec<u8> {
        let digest = self.hasher.finalize();
        digest.to_vec()
    }
}

/// Reads are delegated to the wrapped reader; every byte it appends to the
/// caller's buffer is also fed into the hasher.
impl<T> AsyncRead for HashRead<T>
where
    T: AsyncRead,
{
    /// Polls the inner reader and hashes whatever that poll appended to `buf`.
    ///
    /// The inner reader's poll result is returned unchanged. The hasher is only
    /// touched when the poll completed successfully and actually added bytes.
    fn poll_read(
        self: std::pin::Pin<&mut Self>,
        cx: &mut std::task::Context<'_>,
        buf: &mut tokio::io::ReadBuf<'_>,
    ) -> std::task::Poll<std::io::Result<()>> {
        let me = self.project();
        // `buf` may already hold data from earlier reads; only bytes past this
        // offset come from the poll below.
        let start = buf.filled().len();

        let outcome = me.read.poll_read(cx, buf);

        if matches!(outcome, Poll::Ready(Ok(()))) {
            // Checked slicing: yields nothing if the filled region did not grow.
            let appended = buf.filled().get(start..).filter(|bytes| !bytes.is_empty());
            if let Some(bytes) = appended {
                me.hasher.update(bytes);
            }
        }

        outcome
    }
}

/// Copies the file named by the first command-line argument to `foo`, hashing
/// it on the fly, then checks the result against a digest computed separately.
///
/// # Errors
///
/// Returns an error if the argument is missing or if any file operation fails.
///
/// # Panics
///
/// Panics if the incrementally computed digest differs from the reference one.
#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
    // A missing argument is a user mistake, so report it rather than panic.
    let file_path = std::env::args()
        .nth(1)
        .ok_or("usage: pass the path of the file to copy as the first argument")?;

    let mut src = HashRead::new(File::open(&file_path).await?);
    let mut dest = OpenOptions::new()
        .create(true)
        .write(true)
        // Without truncation, an existing longer `foo` would keep its old tail
        // after the copy and no longer match the source.
        .truncate(true)
        .open("foo")
        .await?;
    tokio::io::copy(&mut src, &mut dest).await?;

    let hash = src.hash();
    println!("{hash:?}");

    // Check that the hash we calculated incrementally matches the hash of the
    // file read directly. The file is streamed in fixed-size chunks so that a
    // multi-gigabyte input is never held in memory at once. Blocking std I/O is
    // used here; this function spawns no other tasks that it could stall.
    let expected = {
        let mut file = std::fs::File::open(&file_path)?;
        let mut hasher = Sha1::new();
        let mut chunk = vec![0u8; 64 * 1024];
        loop {
            match file.read(&mut chunk) {
                Ok(0) => break,
                Ok(len) => hasher.update(&chunk[..len]),
                // Retry interrupted reads, as `read_to_end` would.
                Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
                Err(e) => return Err(e.into()),
            }
        }
        hasher.finalize().to_vec()
    };
    assert_eq!(hash, expected);

    Ok(())
}

You can run that with:

cargo run path/to/big/file
1 Like

This is perfect. Thanks!