Help with glob/globset, or more generally: how to get all files in a directory based on a glob pattern

Hi.

Next part of the migration from Java to rust :slight_smile:

Is the idiomatic Rust way to do file globbing the glob crate (crates.io), the globset crate (crates.io), or a directory iterator combined with a filter/match as shown in "Directory Traversal" in the Rust Cookbook?

I now need to collect all files in a log directory based on the file glob. The issue is the {...} pattern which is not supported by glob as described in this issue Several file extension pattern · Issue #163 · rust-lang/glob · GitHub . I thought that globset could help here. :person_shrugging:

The globs

/log71761/{2025-03,2025-04,2025-05,2025-06,2025-07,2025-08,2025-09,2025-10}.{gz,log}

and

/log71771/{2025-03,2025-04,2025-05,2025-06,2025-07,2025-08,2025-09,2025-10}.{gz,log}

DIRECTORY1=/log71761 \
DIRECTORY2=/log71771 \
DEST_FILE=lala.csv \
cargo run

use glob::glob;
use globset::GlobBuilder;
use jiff::{ToSpan, Zoned, fmt::strtime};
use std::{
    env,
    path::{Path, PathBuf, MAIN_SEPARATOR_STR},
    process::exit,
    str::FromStr,
};

const PATH1: &str = "/log/76/";
const PATH2: &str = "/log/77/";
// Fallback for the DEST_FILE environment variable; `main` reads it, so it
// has to be declared for this program to compile.
const DEST_FILE: &str = "lastDateAnalyse.txt";

/// Entry point: reads the two log directories (and the destination file name)
/// from the environment, falling back to the constants above, then prints
/// whatever `get_log_file_names` returns for the generated glob.
fn main() {
    println!("Hello, world!");
    // `unwrap_or_else` only allocates the fallback when the variable is unset.
    let path1 = env::var("DIRECTORY1").unwrap_or_else(|_| PATH1.to_string());
    let path2 = env::var("DIRECTORY2").unwrap_or_else(|_| PATH2.to_string());
    let _dest_file = env::var("DEST_FILE").unwrap_or_else(|_| DEST_FILE.to_string());

    // create_dir_glob creates the file glob
    println!(
        "dir files: {:#?}",
        get_log_file_names(&[path1, path2], create_dir_glob())
    );
}

/// Work-in-progress: intended to collect every file in `paths` whose name
/// matches `glob_pattern`.
///
/// As written, the function only builds (and prints) one `globset` glob per
/// directory; nothing is ever pushed into `all_files`, so the returned vector
/// is always empty. The process exits with status -2 if a glob fails to build.
fn get_log_file_names(paths: &[String], glob_pattern: String) -> Vec::<PathBuf> {

    let mut all_files= Vec::<PathBuf>::new();

    // Iterate over given Paths and collect all files in the directory
    // based on glob pattern

    for path in paths {
        println!("My path :{:#?}:", path);
        println!("glob_pattern :{:#?}:", glob_pattern);

        // combine the given directory with file glob
        let mut full_path = String::from(path);
        full_path.push_str(MAIN_SEPARATOR_STR);
        full_path.push_str(&glob_pattern);

        // here is now my issue. globset || glob || filter?
        println!("full_path :{:#?}:", full_path);
        let mut glob = GlobBuilder::new(&full_path);
        // `*` must not match across path separators
        glob.literal_separator(true);

        let my_glob = match glob.build() {
            Ok(new_glob) => {
                println!("new_glob :{}:", new_glob);
                new_glob
            }
            Err(e_glob) => {
                println!("Error at glob: {:#?}", e_glob.to_string());
                exit(-2);
            }
        };

        println!("glob :{:?}:", my_glob);
        // Earlier attempt with the `glob` crate, kept for reference; it does
        // not support the `{a,b}` alternation used in the pattern.
        //for entry in glob(&full_path) {
            /* for entry in match glob(&full_path) {
                Ok(my_entry) => {
                    println!("my_entry :{:?}", my_entry);
                    my_entry
                }
                Err(e_glob) => {
                    println!("Error at glob: {:#?}", e_glob.to_string());
                    exit(-2);
                }
            } { */
            //println!("entry :{:#?}", entry);
            // add file entry to the all_files Vector
            // all_files
        //}
    }

    //"".to_string()
    // Still empty at this point: the matching step above is not implemented yet.
    all_files
}

/// Builds the file-name glob `*{YYYY-MM,YYYY-MM,...}*.{gz,log}`.
///
/// The month list runs from "now minus 7 months" up to now, in one-month
/// steps. Any date arithmetic or formatting failure prints a message and
/// terminates the process with exit status -2.
fn create_dir_glob() -> String {
    let start: Zoned = Zoned::now();
    let start_minus_n_months = match start.checked_sub(7.months()) {
        Ok(new_months) => new_months,
        Err(e_mon) => {
            println!(
                "Error at checked_sub_months error: {:#?}",
                e_mon.to_string()
            );
            exit(-2);
        }
    };

    let it = start_minus_n_months
        .datetime()
        .series(1.month())
        .filter_map(|dt| dt.to_zoned(start.time_zone().clone()).ok())
        .take_while(|zdt| zdt <= start);

    // Collect the months first and join them afterwards: no trailing separator
    // has to be stripped, so an empty month list can never eat the opening `{`.
    let mut months: Vec<String> = Vec::new();
    for zdt in it {
        match strtime::format("%Y-%m", &zdt) {
            Ok(month) => months.push(month),
            Err(e_format) => {
                println!(
                    "Error at strtime::format error: {:#?}",
                    e_format.to_string()
                );
                exit(-2);
            }
        }
    }

    // `{{` / `}}` are literal braces in a format string.
    format!("*{{{}}}*.{{gz,log}}", months.join(","))
}

Thanks for reading and help.

The glob syntax with curly braces is fairly niche, and globs aren't really standardized in general. I'm not surprised it's not supported.

You can do the same thing with a regular expressions using the | operator, here using the regex crate.

Playground

use regex::Regex;

/// Demonstrates replacing the `{a,b}` glob alternation with a regex.
fn main() {
    // Note: The parentheses are important because of the precedence of the `|` operator in regular expressions.
    // The period also needs to be escaped as it would otherwise match any character.
    // `^` and `$` anchor the pattern: `is_match` otherwise succeeds on any substring,
    // whereas a glob always has to match the whole file name.
    let search = Regex::new(r"^(2025|2024|2023)\.(gz|log)$").unwrap();

    assert!(search.is_match("2025.gz"));
    assert!(search.is_match("2025.log"));
    assert!(!search.is_match("2025.fake"));
    assert!(!search.is_match("2022.log"));
    assert!(!search.is_match("2025@log"));
    // These only fail to match because of the anchors.
    assert!(!search.is_match("2025.gz.bak"));
    assert!(!search.is_match("x2025.gz"));
}

You can combine this with std::fs::read_dir to get the same effect as the glob. Note that this assumes that you don't need to recurse into child directories. If you do then something like WalkDir might be preferable.

Here's what that code might look like (I got a little lazy about error handling so I'm not doing explicit exits like you are elsewhere, but I assume you can replace the panics where you want explicit exit codes).

use jiff::{fmt::strtime, ToSpan, Zoned};
use regex::Regex;
use std::{env, path::PathBuf, process::exit, str::FromStr};

const PATH1: &str = "/log/76/";
const PATH2: &str = "/log/77/";

/// Entry point: resolves the two log directories from the environment (with
/// the constants as fallback) and prints every file matched by the regex.
fn main() {
    println!("Hello, world!");
    let directories = [
        env::var("DIRECTORY1").unwrap_or(PATH1.to_string()),
        env::var("DIRECTORY2").unwrap_or(PATH2.to_string()),
    ];

    // create_dir_regex builds the file-name regex
    let matching_files = get_log_file_names(&directories, create_dir_regex());
    println!("dir files: {:#?}", matching_files);
}

/// Returns the path of every entry directly inside each of `paths` whose file
/// name matches `regex`.
///
/// # Panics
///
/// Panics if a directory cannot be read or one of its entries cannot be
/// retrieved.
fn get_log_file_names(paths: &[String], regex: Regex) -> Vec<PathBuf> {
    let mut all_files = Vec::<PathBuf>::new();

    for path in paths {
        println!("My path :{:#?}:", path);
        println!("regex :{:#?}:", regex);

        let entries = match std::fs::read_dir(path) {
            Ok(entries) => entries,
            Err(_) => panic!("Failed to read directory: {path}"),
        };

        for dir_entry in entries {
            let dir_entry = dir_entry.unwrap_or_else(|e| {
                panic!("Failed to read file metadata in directory {path}: {e}")
            });

            // Match against the file NAME only, not the complete path, so the
            // path never has to be taken apart again to get at the name.
            let file_name = dir_entry.file_name();
            let Some(string) = file_name.to_str() else {
                println!("File with non-UTF name was skipped: {:?}", dir_entry.path());
                continue;
            };

            if regex.is_match(string) {
                // Store the complete path of the entry. It is only as absolute
                // as the path that was handed to `read_dir`.
                all_files.push(dir_entry.path());
            } else {
                println!("Did not match: {string}")
            }
        }
    }

    all_files
}

/// Builds the regex equivalent of the glob `*{YYYY-MM,...}*.{gz,log}`:
/// `(YYYY-MM|YYYY-MM|...).*\.(gz|log)$`.
///
/// The month list runs from "now minus 7 months" up to now, in one-month
/// steps. Date arithmetic or formatting failures print a message and exit
/// with status -2.
///
/// # Panics
///
/// Panics if the assembled pattern is not a valid regex.
fn create_dir_regex() -> Regex {
    let start: Zoned = Zoned::now();
    let start_minus_n_months = match start.checked_sub(7.months()) {
        Ok(new_months) => new_months,
        Err(e_mon) => {
            println!(
                "Error at checked_sub_months error: {:#?}",
                e_mon.to_string()
            );
            exit(-2);
        }
    };

    let it = start_minus_n_months
        .datetime()
        .series(1.month())
        .filter_map(|dt| dt.to_zoned(start.time_zone().clone()).ok())
        .take_while(|zdt| zdt <= start);

    // Collect first, join afterwards: nothing has to be popped off the end, so
    // an empty month list can never remove the opening parenthesis.
    let mut months: Vec<String> = Vec::new();
    for zdt in it {
        match strtime::format("%Y-%m", &zdt) {
            Ok(month) => months.push(month),
            Err(e_format) => {
                println!(
                    "Error at strtime::format error: {:#?}",
                    e_format.to_string()
                );
                exit(-2);
            }
        }
    }

    // - `(a|b|...)` is the alternation over the months.
    // - `.*` allows arbitrary characters between the month and the extension,
    //   like the second `*` of the glob.
    // - `\.` is a literal period (raw string, so a single backslash suffices).
    // - `$` anchors the extension at the end of the name; without it
    //   `is_match` would also accept e.g. `2025-03.gz.tmp`. No `^` is needed
    //   because the glob starts with `*`.
    let pattern = format!(r"({}).*\.(gz|log)$", months.join("|"));

    // Build the regex. This can be expensive so we don't want to do it more than once if we can help it.
    Regex::new(&pattern).expect("Invalid regex syntax")
}

You could also return Results from your functions using a custom error type implementing Termination to customize the exit code in a more Rust-native way

I'm the author of globset (and regex, and jiff and walkdir). globset supports curly braces.

If you can simplify your question with a reasonably minimal reproduction, then I'd be happy to take a look. But it's too complicated for me to understand quickly as it stands now. A simple reproduction includes at least the following:

  • A reasonably short Rust program, ideally focusing on the specific thing that isn't working for you.
  • A Cargo.toml
  • Any runtime files required to run it (like a shell script to set up a directory tree).
  • The commands you use to compile and run the program.
  • The desired or expected output.
  • The actual output that you observe.
2 Likes

Wow thank you for the great offer to help me to fix this.

That's the setup-test-env.sh script to create the test environment.

#!/bin/bash
#
# Creates (or removes) a directory with small gzip test files named
# YYYY-MM-01.gz, YYYY-MM-02.gz and YYYY-MM-03.gz for each of the last 9 months.
#
# Environment:
#   DIRECTORY1  required; directory to populate or remove
#   DEBUG       optional; any non-empty value enables `set -x`
#   CLEANUP     optional; any non-empty value removes DIRECTORY1 and exits
#   RUST        optional; any non-empty value runs `cargo run` at the end

if [ -z "${DIRECTORY1:-}" ]; then
  echo
  echo "Please define the test directory with env var DIRECTORY1"
  echo
  # 255 is the status bash reports for `exit -1`
  exit 255
fi

if [ -n "${DEBUG:-}" ] ; then
  set -x
fi

if [ -n "${CLEANUP:-}" ]; then

  echo
  echo "Delete test directory from env var DIRECTORY1"
  echo

  # Quoted and terminated with `--` so spaces or a leading dash in the
  # directory name cannot change what gets removed.
  rm -r -- "${DIRECTORY1}"
  # 254 is the status bash reports for `exit -2`
  exit 254
fi

mkdir -p -- "${DIRECTORY1}/"

file_glob='*{'

# Do the month arithmetic from the 15th: "N months ago" relative to e.g. the
# 31st can land in the wrong month when the target month is shorter.
anchor=$(date +%Y-%m-15)

for i in $(seq 1 9); do
  month=$(date --date "${anchor} -${i} months" +%Y-%m)

  echo "${i}"
  for day in 01 02 03; do
    echo "${month}-${day}.gz"
  done

  for day in 01 02 03; do
    echo "${i}" | gzip > "${DIRECTORY1}/${month}-${day}.gz"
  done

  file_glob="${file_glob}${month},"
done

# Drop the trailing ',' left by the loop, then close the alternation.
file_glob=${file_glob%,}
file_glob=${file_glob}"}*.{gz,log}"

echo "${file_glob}"
# bash issue: brace expansion happens before variable expansion, so the
# command line has to be re-parsed with eval. The directory is shell-quoted
# with %q so that only the glob part is expanded.
eval "ls -1 $(printf '%q' "${DIRECTORY1}")/${file_glob}"

# run rust command
if [ -n "${RUST:-}" ]; then
  cargo run
fi

Running DIRECTORY1=test-dir bash setup-test-env.sh creates the files in the specified DIRECTORY1.

With RUST=1 set, cargo run is additionally executed in the current directory.

Running CLEANUP=1 DIRECTORY1=test-dir bash setup-test-env.sh removes the whole DIRECTORY1.

The expected behavior is to get the list of files (as an array, a Vec, or whatever fits best in Rust), which I then iterate over and read. As these are CSV files, I will read the files one by one and analyze every single line.

The main.rs

use globset::GlobBuilder;
use std::{
    env,
    path::{MAIN_SEPARATOR_STR, PathBuf},
    process::exit,
};

const PATH1: &str = "/log/76/";

/// Minimal reproduction: builds the glob for `DIRECTORY1` and prints it.
///
/// No directory is read and nothing is matched yet, so the reported number of
/// files is always 0.
fn main() {
    let path1 = env::var("DIRECTORY1").unwrap_or(PATH1.to_string());

    let glob_pattern =
        "*{2025-03,2025-04,2025-05,2025-06,2025-07,2025-08,2025-09,2025-10}*.{gz,log}";
    let all_files: Vec<PathBuf> = Vec::new();

    // <directory><separator><file glob>
    let full_path = [path1.as_str(), MAIN_SEPARATOR_STR, glob_pattern].concat();

    println!("full_path :{:#?}:", full_path);

    // `literal_separator(true)`: `*` must not match across path separators.
    let _my_glob = match GlobBuilder::new(&full_path)
        .literal_separator(true)
        .build()
    {
        Err(e_glob) => {
            println!("Error at glob: {:#?}", e_glob.to_string());
            exit(-2);
        }
        Ok(new_glob) => {
            println!("new_glob :{}:", new_glob);
            new_glob
        }
    };

    //println!("glob :{:?}:", _my_glob);

    // add file entry to the all_files Vector
    println!("all_files :{:#?}:", all_files.len());
}

The output of all_files should be all_files :21: because the ls below shows the working glob.

The Cargo.toml

[package]
name = "jut-rs"
version = "0.1.0"
edition = "2024"

[dependencies]
glob = "0.3.3"
globset = "0.4.16"
jiff = { version = "0.2.15", features = ["logging"] }
log = "0.4.28"
mimalloc = "0.1.48"
tracing = { version = "0.1.41", features = ["max_level_debug", "max_level_error", "max_level_info", "max_level_trace", "max_level_warn"] }
tracing-subscriber = { version = "0.3.20", features = ["json", "local-time"] }

The ls output of the directory.

# ls -1 test-dir/*{2025-09,2025-08,2025-07,2025-06,2025-05,2025-04,2025-03,2025-02,2025-01}*.{gz,log}
## ls: cannot access 'test-dir/*2025-09*.log': No such file or directory
## ls: cannot access 'test-dir/*2025-08*.log': No such file or directory
## ls: cannot access 'test-dir/*2025-07*.log': No such file or directory
## ls: cannot access 'test-dir/*2025-06*.log': No such file or directory
## ls: cannot access 'test-dir/*2025-05*.log': No such file or directory
## ls: cannot access 'test-dir/*2025-04*.log': No such file or directory
## ls: cannot access 'test-dir/*2025-03*.log': No such file or directory
## ls: cannot access 'test-dir/*2025-02*.log': No such file or directory
## ls: cannot access 'test-dir/*2025-01*.log': No such file or directory
## test-dir/2025-01-01.gz
## test-dir/2025-01-02.gz
## test-dir/2025-01-03.gz
## test-dir/2025-02-01.gz
## test-dir/2025-02-02.gz
## test-dir/2025-02-03.gz
## test-dir/2025-03-01.gz
## test-dir/2025-03-02.gz
## test-dir/2025-03-03.gz
## test-dir/2025-04-01.gz
## test-dir/2025-04-02.gz
## test-dir/2025-04-03.gz
## test-dir/2025-05-01.gz
## test-dir/2025-05-02.gz
## test-dir/2025-05-03.gz
## test-dir/2025-06-01.gz
## test-dir/2025-06-02.gz
## test-dir/2025-06-03.gz
## test-dir/2025-07-01.gz
## test-dir/2025-07-02.gz
## test-dir/2025-07-03.gz
## test-dir/2025-08-01.gz
## test-dir/2025-08-02.gz
## test-dir/2025-08-03.gz
## test-dir/2025-09-01.gz
## test-dir/2025-09-02.gz
## test-dir/2025-09-03.gz

If it helps, this is the Java code for collecting the files

// dirs is "List<String> dirs"
// files is "List<Path> files = new ArrayList<>();"

		for (String dir : dirs) {
			Files.newDirectoryStream(Paths.get(dir),toGlobFiles.toString())
            .forEach(files::add);
		}

		logger.info("Found # of files {}", files.size());

That's then the Java code for the iteration file by file and line by line

		Map<String, Map<String, Map<String, Map<String, Map<String, Optional<JutMessage>>>>>> By = 
           logFiles.stream()
				.flatMap(f -> GZIPFiles.lines(f))
				.filter(line -> line.length() > 2)
                .......More java code

I appreciate the repro, but that is way over-complicated. Just distill it down to the specific glob match that isn't working. Like this:

use std::path::Path;

use globset::GlobBuilder;

/// Minimal reproduction of the mismatch: the candidate path starts with `./`
/// while the glob does not, so the assertion below fails.
fn main() -> anyhow::Result<()> {
    let pattern = "test-dir/*{2025-03,2025-04,2025-05,2025-06,2025-07,2025-08,2025-09,2025-10}*.{gz,log}";
    // `literal_separator(true)`: `*` does not match across `/`.
    let matcher = GlobBuilder::new(pattern)
        .literal_separator(true)
        .build()?
        .compile_matcher();

    let candidate = Path::new("./test-dir/2025-06-01.gz");
    assert!(matcher.is_match(candidate));

    Ok(())
}

That assertion fails. And it should be clear why: the glob doesn't have ./ but the path does. Perhaps glob matching should know to ignore the ./, but globset does not do that today. So strip the ./ prefix:

use std::path::Path;

use globset::GlobBuilder;

/// Same reproduction as before, but the leading `./` is stripped from the
/// candidate path before matching, so the assertion holds.
fn main() -> anyhow::Result<()> {
    let pattern = "test-dir/*{2025-03,2025-04,2025-05,2025-06,2025-07,2025-08,2025-09,2025-10}*.{gz,log}";
    // `literal_separator(true)`: `*` does not match across `/`.
    let matcher = GlobBuilder::new(pattern)
        .literal_separator(true)
        .build()?
        .compile_matcher();

    let candidate = Path::new("./test-dir/2025-06-01.gz");
    // Fall back to the unmodified path when there is no `./` prefix.
    let normalized = candidate.strip_prefix("./").unwrap_or(candidate);
    assert!(matcher.is_match(normalized));

    Ok(())
}

And similarly, applied to your original program (with some clean-up I did, including using anyhow):

use globset::GlobBuilder;
use std::{
    env,
    path::{PathBuf, MAIN_SEPARATOR_STR},
};

const PATH1: &str = "/log/76/";

/// Walks the directory named by `DIRECTORY1` (default: `PATH1`) and counts the
/// entries whose path, relative to that directory, matches the file glob.
///
/// # Errors
///
/// Returns an error if the glob is invalid or the directory walk fails.
fn main() -> anyhow::Result<()> {
    let path1 = env::var("DIRECTORY1").unwrap_or_else(|_| PATH1.to_string());

    let glob_pattern =
        "*{2025-03,2025-04,2025-05,2025-06,2025-07,2025-08,2025-09,2025-10}*.{gz,log}";

    // Shown for diagnostics only; matching below is done on the path relative
    // to the directory, so the directory never becomes part of the glob.
    let full_path = std::path::Path::new(&path1).join(glob_pattern);
    println!("full_path :{:#?}:", full_path);

    // `literal_separator(true)` keeps `*` from crossing `/`, so only direct
    // children of the directory can match.
    let my_glob = GlobBuilder::new(glob_pattern)
        .literal_separator(true)
        .build()?
        .compile_matcher();

    let mut all_files = Vec::<PathBuf>::new();
    // Walk the configured directory instead of a hard-coded `./test-dir`.
    for result in walkdir::WalkDir::new(&path1) {
        let dent = result?;
        let path = dent.path();
        // Strip the walk root rather than just `./`: this works the same for
        // `test-dir`, `./test-dir`, `test-dir/` and absolute directories.
        if my_glob.is_match(path.strip_prefix(&path1).unwrap_or(path)) {
            all_files.push(path.to_path_buf());
        }
    }
    // add file entry to the all_files Vector
    println!("all_files :{:#?}:", all_files.len());

    Ok(())
}

And I get (among a bunch of other junk) in the output:

all_files :21:
1 Like

Thank you soooo much for the solution.
The current file is this now below. I have also adopted the feedback from that answer Question to chrono and iterate over months - #8 by quinedot from @quinedot :slight_smile:

use mimalloc::MiMalloc;

#[global_allocator]
static GLOBAL: MiMalloc = MiMalloc;

use anyhow::bail;
use globset::GlobBuilder;
use jiff::{ToSpan, Zoned, fmt::strtime};
use log::{debug, info};
use std::{
    env,
    path::{MAIN_SEPARATOR_STR, PathBuf},
    result::Result::Ok,
};
use tracing_subscriber::EnvFilter;

const PATH1: &str = "/log/76/";
const PATH2: &str = "/log/77/";
const DEST_FILE: &str = "lastDateAnalyse.txt";

fn main() -> anyhow::Result<()> {
    let start = Zoned::now();

    tracing_subscriber::fmt()
        .with_max_level(tracing::Level::TRACE)
        .with_env_filter(EnvFilter::from_default_env())
        .with_thread_ids(true) // include the thread ID of the current thread
        .with_thread_names(true)
        //.json()
        //.with_current_span(false)
        .init();

    info!("Start.");

    let path1 = env::var("DIRECTORY1").unwrap_or(PATH1.to_string());
    let path2 = env::var("DIRECTORY2").unwrap_or(PATH2.to_string());
    let _dest_file = env::var("DEST_FILE").unwrap_or(DEST_FILE.to_string());

    //println!("dir glob: {}",create_dir_glob());
    debug!(
        "dir files: {:#?}",
        get_log_file_names(&[path1, path2], create_dir_glob()?)
    );

    info!("End. Duration {:#?}", Zoned::now().since(&start)?);

    Ok(())
}

fn create_dir_glob() -> anyhow::Result<String> {
    /*
    let mut to_glob_files = match String::from_str("*{") {
        Ok(new_str) => new_str,
        Err(e) => bail!("Error at String::from_str error: {e}"),
    };
    */

    tracing::info!("lala");
    let mut to_glob_files = "*{".to_owned();

    let start: Zoned = Zoned::now();
    let start_minus_n_months = start.saturating_sub(7.months());

    let it = start_minus_n_months
        .datetime()
        .series(1.month())
        .filter_map(|dt| dt.to_zoned(start.time_zone().clone()).ok())
        .take_while(|zdt| zdt <= start);

    for zdt in it {
        let temp = match strtime::format("%Y-%m,", &zdt) {
            Ok(new_temp) => new_temp,
            Err(e_format) => bail!("Error at strtime::format error: {e_format}"),
        };
        to_glob_files.push_str(&temp);
        //println!("* {}", zdt.strftime("%Y-%m"));
    }

    // println!("capa: {}", to_glob_files.capacity());
    //println!("len: {}", to_glob_files.len());

    // Remove last ',' from the loop above
    // to_glob_files.truncate(to_glob_files.len() - 1);
    let _ = to_glob_files.pop();

    //println!("len after -1: {}", to_glob_files.len());

    to_glob_files.push_str("}*.{gz,log}");

    debug!("to glob: '{}'", to_glob_files);

    Ok(to_glob_files)
}

fn get_log_file_names(paths: &[String], glob_pattern: String) -> anyhow::Result<Vec<PathBuf>> {
    let mut all_files = Vec::<PathBuf>::new();

    // Iterate over given Paths and collect all files in the directory
    // based on glob pattern

    for path in paths {
        debug!("My path :{:#?}:", path);
        debug!("glob_pattern :{:#?}:", glob_pattern);

        let mut full_path = String::from(path);
        full_path.push_str(MAIN_SEPARATOR_STR);
        full_path.push_str(&glob_pattern);

        debug!("full_path :{:#?}:", full_path);
        let mut glob = GlobBuilder::new(&full_path);
        glob.literal_separator(true);

        let my_glob = glob.build()?.compile_matcher();

        //debug!("glob :{:?}:", my_glob);

        for result in walkdir::WalkDir::new(path) {
            //println!("Direntry :{:#?}",result);
            let dent = result?;
            let path = dent.path();
            if my_glob.is_match(path) {
                all_files.push(path.to_path_buf());
            }
        }
    }

    info!("Found {:#?} files", all_files.len());

    Ok(all_files)
}
``