From 9ef758787c78f271254f164c42f1bbbdd4971958 Mon Sep 17 00:00:00 2001 From: Thayne McCombs Date: Tue, 13 Aug 2024 00:03:53 -0600 Subject: [PATCH] Initial attempt at adding support for pcre This also significantly refactors the code for creating the regexes to match with, and in some cases avoids using regex altogether. --- Cargo.lock | 48 ++++++++++++- Cargo.toml | 3 + src/cli.rs | 16 ++++- src/config.rs | 3 - src/filesystem.rs | 18 ----- src/main.rs | 136 +++++++++++++++++------------------ src/patterns.rs | 175 ++++++++++++++++++++++++++++++++++++++++++++++ src/walk.rs | 24 +++---- 8 files changed, 312 insertions(+), 111 deletions(-) create mode 100644 src/patterns.rs diff --git a/Cargo.lock b/Cargo.lock index 3fbdb102f..9a6e5f26a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -131,6 +131,11 @@ name = "cc" version = "1.0.96" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "065a29261d53ba54260972629f9ca6bffa69bac13cd1fed61420f7fa68b9f8bd" +dependencies = [ + "jobserver", + "libc", + "once_cell", +] [[package]] name = "cfg-if" @@ -334,9 +339,11 @@ dependencies = [ "jemallocator", "libc", "lscolors", + "memchr", "nix 0.29.0", "normpath", "nu-ansi-term", + "pcre2", "regex", "regex-syntax", "tempfile", @@ -455,6 +462,15 @@ dependencies = [ "libc", ] +[[package]] +name = "jobserver" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" +dependencies = [ + "libc", +] + [[package]] name = "js-sys" version = "0.3.69" @@ -499,9 +515,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.7.2" +version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "nix" @@ -571,6 +587,34 @@ version = "1.19.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +[[package]] +name = "pcre2" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be55c43ac18044541d58d897e8f4c55157218428953ebd39d86df3ba0286b2b" +dependencies = [ + "libc", + "log", + "pcre2-sys", +] + +[[package]] +name = "pcre2-sys" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "550f5d18fb1b90c20b87e161852c10cde77858c3900c5059b5ad2a1449f11d8a" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + +[[package]] +name = "pkg-config" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" + [[package]] name = "proc-macro2" version = "1.0.81" diff --git a/Cargo.toml b/Cargo.toml index f60835a70..5b32eb1fd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,7 +48,9 @@ etcetera = "0.8" normpath = "1.1.1" crossbeam-channel = "0.5.13" clap_complete = {version = "4.5.8", optional = true} +memchr = "2.7.4" faccess = "0.2.4" +pcre2 = {version = "0.2.9", optional = true} [dependencies.clap] version = "4.5.13" @@ -91,4 +93,5 @@ codegen-units = 1 use-jemalloc = ["jemallocator"] completions = ["clap_complete"] base = ["use-jemalloc"] +pcre = ["dep:pcre2"] default = ["use-jemalloc", "completions"] diff --git a/src/cli.rs b/src/cli.rs index 9bdbcc7d4..000d39642 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -166,6 +166,18 @@ pub struct Opts { )] pub regex: bool, + /// Use the PCRE regex engine + /// + /// This allows you to use features like backreferences and lookarounds. + #[cfg(feature = "pcre")] + #[arg( + long, + overrides_with_all(["glob", "regex"]), + conflicts_with("fixed_strings"), + long_help + )] + pub pcre: bool, + /// Treat the pattern as a literal string instead of a regular expression. 
Note /// that this also performs substring comparison. If you want to match on an /// exact filename, consider using '--glob'. @@ -605,13 +617,11 @@ pub struct Opts { /// is considered a match. If your pattern starts with a dash (-), make sure to /// pass '--' first, or it will be considered as a flag (fd -- '-foo'). #[arg( - default_value = "", - hide_default_value = true, value_name = "pattern", help = "the search pattern (a regular expression, unless '--glob' is used; optional)", long_help )] - pub pattern: String, + pub pattern: Option, /// Set the path separator to use when printing file paths. The default is /// the OS-specific separator ('/' on Unix, '\' on Windows). diff --git a/src/config.rs b/src/config.rs index 9e18120c4..0bbbe7e3d 100644 --- a/src/config.rs +++ b/src/config.rs @@ -12,9 +12,6 @@ use crate::fmt::FormatTemplate; /// Configuration options for *fd*. pub struct Config { - /// Whether the search is case-sensitive or case-insensitive. - pub case_sensitive: bool, - /// Whether to search within the full file path or just the base name (filename or directory /// name). pub search_full_path: bool, diff --git a/src/filesystem.rs b/src/filesystem.rs index 2a642edd2..44e4faf01 100644 --- a/src/filesystem.rs +++ b/src/filesystem.rs @@ -1,6 +1,4 @@ -use std::borrow::Cow; use std::env; -use std::ffi::OsStr; use std::fs; use std::io; #[cfg(any(unix, target_os = "redox"))] @@ -99,22 +97,6 @@ pub fn is_pipe(_: fs::FileType) -> bool { false } -#[cfg(any(unix, target_os = "redox"))] -pub fn osstr_to_bytes(input: &OsStr) -> Cow<[u8]> { - use std::os::unix::ffi::OsStrExt; - Cow::Borrowed(input.as_bytes()) -} - -#[cfg(windows)] -pub fn osstr_to_bytes(input: &OsStr) -> Cow<[u8]> { - let string = input.to_string_lossy(); - - match string { - Cow::Owned(string) => Cow::Owned(string.into_bytes()), - Cow::Borrowed(string) => Cow::Borrowed(string.as_bytes()), - } -} - /// Remove the `./` prefix from a path. 
pub fn strip_current_dir(path: &Path) -> &Path { path.strip_prefix(".").unwrap_or(path) diff --git a/src/main.rs b/src/main.rs index 88e6b4cb7..a0c52dba1 100644 --- a/src/main.rs +++ b/src/main.rs @@ -10,6 +10,7 @@ mod filter; mod fmt; mod hyperlink; mod output; +mod patterns; mod regex_helper; mod walk; @@ -21,9 +22,9 @@ use std::time; use anyhow::{anyhow, bail, Context, Result}; use clap::{CommandFactory, Parser}; -use globset::GlobBuilder; use lscolors::LsColors; -use regex::bytes::{Regex, RegexBuilder, RegexSetBuilder}; +use patterns::PatternType; +use regex::bytes::RegexSetBuilder; use crate::cli::{ColorWhen, HyperlinkWhen, Opts}; use crate::config::Config; @@ -33,6 +34,7 @@ use crate::filetypes::FileTypes; #[cfg(unix)] use crate::filter::OwnerFilter; use crate::filter::TimeFilter; +use crate::patterns::build_patterns; use crate::regex_helper::{pattern_has_uppercase_char, pattern_matches_strings_with_leading_dot}; // We use jemalloc for performance reasons, see https://github.com/sharkdp/fd/pull/481 @@ -70,7 +72,7 @@ fn main() { } fn run() -> Result { - let opts = Opts::parse(); + let mut opts = Opts::parse(); #[cfg(feature = "completions")] if let Some(shell) = opts.gen_completions()? { @@ -84,28 +86,24 @@ fn run() -> Result { } ensure_search_pattern_is_not_a_path(&opts)?; - let pattern = &opts.pattern; - let exprs = &opts.exprs; - let empty = Vec::new(); + let mut patterns = opts.exprs.take().unwrap_or(Vec::new()); + if let Some(pattern) = opts.pattern.take() { + patterns.push(pattern); + } - let pattern_regexps = exprs - .as_ref() - .unwrap_or(&empty) - .iter() - .chain([pattern]) - .map(|pat| build_pattern_regex(pat, &opts)) - .collect::>>()?; + let pattern_type = determine_pattern_type(&opts); + // The search will be case-sensitive if the command line flag is set or + // if any of the patterns has an uppercase character (smart case). 
+ let ignore_case = opts.ignore_case + || !(opts.case_sensitive || patterns.iter().any(|pat| pattern_has_uppercase_char(pat))); - let config = construct_config(opts, &pattern_regexps)?; + let config = construct_config(opts)?; - ensure_use_hidden_option_for_leading_dot_pattern(&config, &pattern_regexps)?; + ensure_use_hidden_option_for_leading_dot_pattern(&patterns, &config, pattern_type)?; - let regexps = pattern_regexps - .into_iter() - .map(|pat| build_regex(pat, &config)) - .collect::>>()?; + let matcher = build_patterns(patterns, pattern_type, ignore_case)?; - walk::scan(&search_paths, regexps, config) + walk::scan(&search_paths, matcher, config) } #[cfg(feature = "completions")] @@ -145,35 +143,38 @@ fn set_working_dir(opts: &Opts) -> Result<()> { /// Detect if the user accidentally supplied a path instead of a search pattern fn ensure_search_pattern_is_not_a_path(opts: &Opts) -> Result<()> { - if !opts.full_path - && opts.pattern.contains(std::path::MAIN_SEPARATOR) - && Path::new(&opts.pattern).is_dir() - { - Err(anyhow!( - "The search pattern '{pattern}' contains a path-separation character ('{sep}') \ - and will not lead to any search results.\n\n\ - If you want to search for all files inside the '{pattern}' directory, use a match-all pattern:\n\n \ - fd . '{pattern}'\n\n\ - Instead, if you want your pattern to match the full file path, use:\n\n \ - fd --full-path '{pattern}'", - pattern = &opts.pattern, - sep = std::path::MAIN_SEPARATOR, - )) - } else { - Ok(()) + if let Some(ref pattern) = opts.pattern { + if !opts.full_path + && pattern.contains(std::path::MAIN_SEPARATOR) + && Path::new(pattern).is_dir() + { + return Err(anyhow!( + "The search pattern '{pattern}' contains a path-separation character ('{sep}') \ + and will not lead to any search results.\n\n\ + If you want to search for all files inside the '{pattern}' directory, use a match-all pattern:\n\n \ + fd . 
'{pattern}'\n\n\ + Instead, if you want your pattern to match the full file path, use:\n\n \ + fd --full-path '{pattern}'", + pattern = pattern, + sep = std::path::MAIN_SEPARATOR, + )); + } } + Ok(()) } -fn build_pattern_regex(pattern: &str, opts: &Opts) -> Result { - Ok(if opts.glob && !pattern.is_empty() { - let glob = GlobBuilder::new(pattern).literal_separator(true).build()?; - glob.regex().to_owned() +fn determine_pattern_type(opts: &Opts) -> PatternType { + #[cfg(feature = "pcre")] + if opts.pcre { + return PatternType::Pcre; + } + if opts.glob { + PatternType::Glob } else if opts.fixed_strings { - // Treat pattern as literal string if '--fixed-strings' is used - regex::escape(pattern) + PatternType::Fixed } else { - String::from(pattern) - }) + PatternType::Regex + } } fn check_path_separator_length(path_separator: Option<&str>) -> Result<()> { @@ -190,15 +191,7 @@ fn check_path_separator_length(path_separator: Option<&str>) -> Result<()> { } } -fn construct_config(mut opts: Opts, pattern_regexps: &[String]) -> Result { - // The search will be case-sensitive if the command line flag is set or - // if any of the patterns has an uppercase character (smart case). 
- let case_sensitive = !opts.ignore_case - && (opts.case_sensitive - || pattern_regexps - .iter() - .any(|pat| pattern_has_uppercase_char(pat))); - +fn construct_config(mut opts: Opts) -> Result { let path_separator = opts .path_separator .take() @@ -244,7 +237,6 @@ fn construct_config(mut opts: Opts, pattern_regexps: &[String]) -> Result Result> { } fn ensure_use_hidden_option_for_leading_dot_pattern( + patterns: &[String], config: &Config, - pattern_regexps: &[String], + pattern_type: PatternType, ) -> Result<()> { if cfg!(unix) && config.ignore_hidden - && pattern_regexps - .iter() - .any(|pat| pattern_matches_strings_with_leading_dot(pat)) + && patterns_match_strings_with_leading_dots(patterns, pattern_type) { Err(anyhow!( "The pattern(s) seems to only match files with a leading dot, but hidden files are \ @@ -474,17 +465,20 @@ fn ensure_use_hidden_option_for_leading_dot_pattern( } } -fn build_regex(pattern_regex: String, config: &Config) -> Result { - RegexBuilder::new(&pattern_regex) - .case_insensitive(!config.case_sensitive) - .dot_matches_new_line(true) - .build() - .map_err(|e| { - anyhow!( - "{}\n\nNote: You can use the '--fixed-strings' option to search for a \ - literal string instead of a regular expression. Alternatively, you can \ - also use the '--glob' option to match on a glob pattern.", - e.to_string() - ) - }) +fn patterns_match_strings_with_leading_dots( + patterns: &[String], + pattern_type: PatternType, +) -> bool { + let mut iter = patterns.iter(); + match pattern_type { + PatternType::Regex => iter.any(|pat| pattern_matches_strings_with_leading_dot(pat)), + // For PCRE just do a basic check if the pattern starts with "\." for a literal + // . since we can't parse it to an AST. + #[cfg(feature = "pcre")] + PatternType::Pcre => iter.any(|pat| pat.starts_with("^\\.")), + // fixed strings aren't anchored so always false + PatternType::Fixed => false, + // globs just check if it starts with a . 
+ PatternType::Glob => patterns.iter().any(|pat| pat.starts_with(".")), + } } diff --git a/src/patterns.rs b/src/patterns.rs new file mode 100644 index 000000000..984294221 --- /dev/null +++ b/src/patterns.rs @@ -0,0 +1,175 @@ +use std::cell::RefCell; +use std::path::Path; + +use anyhow::{anyhow, Result}; +use globset::{Glob, GlobBuilder, GlobMatcher, GlobSet, GlobSetBuilder}; +use memchr::memmem; +use regex::bytes::{RegexSet, RegexSetBuilder}; + +pub trait Matcher { + fn matches_path(&self, path: &Path) -> bool; +} + +pub type Patterns = Box; + +#[derive(Eq, PartialEq, Copy, Clone)] +pub enum PatternType { + Regex, + Fixed, + Glob, + #[cfg(feature = "pcre")] + Pcre, +} + +impl Matcher for RegexSet { + fn matches_path(&self, path: &Path) -> bool { + let haystack = path.as_os_str().as_encoded_bytes(); + let matches = self.matches(haystack); + // Return true if the number of regexes that matched + // equals the total number of regexes. + matches.iter().count() == self.len() + } +} + +#[cfg(feature = "pcre")] +impl Matcher for Vec { + fn matches_path(&self, path: &Path) -> bool { + let path = path.as_os_str().as_encoded_bytes(); + self.iter().all(|pat| pat.is_match(path).unwrap()) + } +} + +thread_local! 
{ + /// Thread local cache for Vec to use for globset matches + static GLOB_MATCHES: RefCell> = const { RefCell::new(Vec::new()) }; +} +impl Matcher for GlobSet { + fn matches_path(&self, path: &Path) -> bool { + GLOB_MATCHES.with_borrow_mut(|matches| { + self.matches_into(path, matches); + matches.len() == self.len() + }) + } +} + +/// In the common case of a single glob, it is simpler and +/// faster to just use a single Glob instead of a GlobSet +impl Matcher for GlobMatcher { + fn matches_path(&self, path: &Path) -> bool { + self.is_match(path) + } +} + +/// Matcher that matches fixed strings +pub struct FixedStrings(pub Vec); + +impl Matcher for FixedStrings { + fn matches_path(&self, path: &Path) -> bool { + let path = path.as_os_str().as_encoded_bytes(); + self.0.iter().all(|f| bytes_contains(path, f.as_bytes())) + } +} + +/// Matcher that matches everything +pub struct MatchAll; +impl Matcher for MatchAll { + fn matches_path(&self, _path: &Path) -> bool { + true + } +} +pub fn build_patterns( + mut patterns: Vec, + pattern_type: PatternType, + ignore_case: bool, +) -> Result { + if patterns.is_empty() { + return Ok(Box::new(MatchAll)); + } + match pattern_type { + PatternType::Glob => build_glob_matcher(patterns, ignore_case), + #[cfg(feature = "pcre")] + PatternType::Pcre => Ok(Box::new(build_pcre_matcher(patterns, ignore_case)?)), + PatternType::Fixed if !ignore_case => Ok(Box::new(FixedStrings(patterns))), + typ => { + // TODO: is there a better way we could handle case insensitive fixed strings? 
+ if typ == PatternType::Fixed { + for pattern in patterns.iter_mut() { + *pattern = regex::escape(pattern); + } + } + Ok(Box::new(build_regex_matcher(patterns, ignore_case)?)) + } + } +} + +fn build_glob_matcher(patterns: Vec, ignore_case: bool) -> Result { + Ok(if patterns.len() == 1 { + Box::new(build_glob(&patterns[0], ignore_case)?.compile_matcher()) + } else { + let mut builder = GlobSetBuilder::new(); + for pat in patterns { + builder.add(build_glob(&pat, ignore_case)?); + } + Box::new(builder.build()?) + }) +} + +fn build_glob(pattern: &str, ignore_case: bool) -> Result { + Ok(GlobBuilder::new(pattern) + .literal_separator(true) + .case_insensitive(ignore_case) + .build()?) +} + +// Should we enable the unicode/utf8 features for regex and pcre? + +#[cfg(feature = "pcre")] +fn build_pcre_matcher( + patterns: Vec, + ignore_case: bool, +) -> Result> { + use pcre2::bytes::RegexBuilder; + patterns + .iter() + .map(|pat| { + RegexBuilder::new() + .dotall(true) + .caseless(ignore_case) + .build(pat) + .map_err(|e| { + anyhow!( + "{}\n\nNote: You can use the '--fixed-strings' option to search for a \ + literal string instead of a regular expression. Alternatively, you can \ + also use the '--glob' option to match on a glob pattern.", + e.to_string() + ) + }) + }) + .collect() +} + +#[cfg(feature = "pcre")] +const PCRE_ALT_MSG: &str = " Use --pcre to enable perl-compatible regex features."; +#[cfg(not(feature = "pcre"))] +const PCRE_ALT_MSG: &str = ""; + +fn build_regex_matcher(patterns: Vec, ignore_case: bool) -> Result { + RegexSetBuilder::new(patterns) + .case_insensitive(ignore_case) + .dot_matches_new_line(true) + .build() + .map_err(|e| { + anyhow!( + "{}\n\nNote: You can use the '--fixed-strings' option to search for a \ + literal string instead of a regular expression. 
Alternatively, you can \ + also use the '--glob' option to match on a glob pattern.{}", + e.to_string(), + PCRE_ALT_MSG + ) + }) +} + +/// Test if the needle is a substring of the haystack +fn bytes_contains(haystack: &[u8], needle: &[u8]) -> bool { + memmem::find(haystack, needle).is_some() +} diff --git a/src/walk.rs b/src/walk.rs index d203702f2..552a9ce22 100644 --- a/src/walk.rs +++ b/src/walk.rs @@ -1,8 +1,7 @@ use std::borrow::Cow; -use std::ffi::OsStr; use std::io::{self, Write}; use std::mem; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Mutex, MutexGuard}; use std::thread; @@ -13,7 +12,6 @@ use crossbeam_channel::{bounded, Receiver, RecvTimeoutError, SendError, Sender}; use etcetera::BaseStrategy; use ignore::overrides::{Override, OverrideBuilder}; use ignore::{WalkBuilder, WalkParallel, WalkState}; -use regex::bytes::Regex; use crate::config::Config; use crate::dir_entry::DirEntry; @@ -22,6 +20,7 @@ use crate::exec; use crate::exit_codes::{merge_exitcodes, ExitCode}; use crate::filesystem; use crate::output; +use crate::patterns::Patterns; /// The receiver thread can either be buffering results or directly streaming to the console. #[derive(PartialEq)] @@ -305,7 +304,7 @@ impl<'a, W: Write> ReceiverBuffer<'a, W> { /// State shared by the sender and receiver threads. struct WorkerState { /// The search patterns. - patterns: Vec, + patterns: Patterns, /// The command line configuration. 
config: Config, /// Flag for cleanly shutting down the parallel walk @@ -315,7 +314,7 @@ struct WorkerState { } impl WorkerState { - fn new(patterns: Vec, config: Config) -> Self { + fn new(patterns: Patterns, config: Config) -> Self { let quit_flag = Arc::new(AtomicBool::new(false)); let interrupt_flag = Arc::new(AtomicBool::new(false)); @@ -511,13 +510,13 @@ impl WorkerState { // Check the name first, since it doesn't require metadata let entry_path = entry.path(); - let search_str: Cow = if config.search_full_path { + let search_str: Cow = if config.search_full_path { let path_abs_buf = filesystem::path_absolute_form(entry_path) .expect("Retrieving absolute path succeeds"); - Cow::Owned(path_abs_buf.as_os_str().to_os_string()) + Cow::Owned(path_abs_buf) } else { match entry_path.file_name() { - Some(filename) => Cow::Borrowed(filename), + Some(filename) => Cow::Borrowed(filename.as_ref()), None => unreachable!( "Encountered file system entry without a file name. This should only \ happen for paths like 'foo/bar/..' or '/' which are not supposed to \ @@ -526,17 +525,14 @@ impl WorkerState { } }; - if !patterns - .iter() - .all(|pat| pat.is_match(&filesystem::osstr_to_bytes(search_str.as_ref()))) - { + if !patterns.matches_path(&search_str) { return WalkState::Continue; } // Filter out unwanted extensions. if let Some(ref exts_regex) = config.extensions { if let Some(path_str) = entry_path.file_name() { - if !exts_regex.is_match(&filesystem::osstr_to_bytes(path_str)) { + if !exts_regex.is_match(path_str.as_encoded_bytes()) { return WalkState::Continue; } } else { @@ -668,6 +664,6 @@ impl WorkerState { /// If the `--exec` argument was supplied, this will create a thread pool for executing /// jobs in parallel from a given command line and the discovered paths. Otherwise, each /// path will simply be written to standard output. 
-pub fn scan(paths: &[PathBuf], patterns: Vec, config: Config) -> Result { +pub fn scan(paths: &[PathBuf], patterns: Patterns, config: Config) -> Result { WorkerState::new(patterns, config).scan(paths) }