Skip to content

Commit

Permalink
Initial attempt at adding support for pcre
Browse files Browse the repository at this point in the history
This also significantly refactors the code for creating the regexes to
match with, and in some cases avoids using regex altogether.
  • Loading branch information
tmccombs committed Aug 13, 2024
1 parent 9cf415c commit 9ef7587
Show file tree
Hide file tree
Showing 8 changed files with 312 additions and 111 deletions.
48 changes: 46 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,9 @@ etcetera = "0.8"
normpath = "1.1.1"
crossbeam-channel = "0.5.13"
clap_complete = {version = "4.5.8", optional = true}
memchr = "2.7.4"
faccess = "0.2.4"
pcre2 = {version = "0.2.9", optional = true}

[dependencies.clap]
version = "4.5.13"
Expand Down Expand Up @@ -91,4 +93,5 @@ codegen-units = 1
use-jemalloc = ["jemallocator"]
completions = ["clap_complete"]
base = ["use-jemalloc"]
pcre = ["dep:pcre2"]
default = ["use-jemalloc", "completions"]
16 changes: 13 additions & 3 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,18 @@ pub struct Opts {
)]
pub regex: bool,

/// Use the PCRE regex engine
///
/// This allows you to use features like backreferences and lookarounds.
#[cfg(feature = "pcre")]
#[arg(
long,
overrides_with_all(["glob", "regex"]),
conflicts_with("fixed_strings"),
long_help
)]
pub pcre: bool,

/// Treat the pattern as a literal string instead of a regular expression. Note
/// that this also performs substring comparison. If you want to match on an
/// exact filename, consider using '--glob'.
Expand Down Expand Up @@ -605,13 +617,11 @@ pub struct Opts {
/// is considered a match. If your pattern starts with a dash (-), make sure to
/// pass '--' first, or it will be considered as a flag (fd -- '-foo').
#[arg(
default_value = "",
hide_default_value = true,
value_name = "pattern",
help = "the search pattern (a regular expression, unless '--glob' is used; optional)",
long_help
)]
pub pattern: String,
pub pattern: Option<String>,

/// Set the path separator to use when printing file paths. The default is
/// the OS-specific separator ('/' on Unix, '\' on Windows).
Expand Down
3 changes: 0 additions & 3 deletions src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,6 @@ use crate::fmt::FormatTemplate;

/// Configuration options for *fd*.
pub struct Config {
/// Whether the search is case-sensitive or case-insensitive.
pub case_sensitive: bool,

/// Whether to search within the full file path or just the base name (filename or directory
/// name).
pub search_full_path: bool,
Expand Down
18 changes: 0 additions & 18 deletions src/filesystem.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
use std::borrow::Cow;
use std::env;
use std::ffi::OsStr;
use std::fs;
use std::io;
#[cfg(any(unix, target_os = "redox"))]
Expand Down Expand Up @@ -99,22 +97,6 @@ pub fn is_pipe(_: fs::FileType) -> bool {
false
}

/// View an `OsStr` as raw bytes.
///
/// On Unix-like targets the underlying bytes are borrowed directly, so the
/// result is always `Cow::Borrowed` and nothing is copied.
#[cfg(any(unix, target_os = "redox"))]
pub fn osstr_to_bytes(input: &OsStr) -> Cow<[u8]> {
    let raw = std::os::unix::ffi::OsStrExt::as_bytes(input);
    Cow::Borrowed(raw)
}

/// View an `OsStr` as bytes.
///
/// On Windows the string goes through `to_string_lossy` first; the result is
/// borrowed when that conversion could borrow and owned when it had to
/// allocate a new `String`.
#[cfg(windows)]
pub fn osstr_to_bytes(input: &OsStr) -> Cow<[u8]> {
    match input.to_string_lossy() {
        Cow::Borrowed(text) => Cow::Borrowed(text.as_bytes()),
        Cow::Owned(text) => Cow::Owned(text.into_bytes()),
    }
}

/// Remove the `./` prefix from a path.
pub fn strip_current_dir(path: &Path) -> &Path {
path.strip_prefix(".").unwrap_or(path)
Expand Down
136 changes: 65 additions & 71 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ mod filter;
mod fmt;
mod hyperlink;
mod output;
mod patterns;
mod regex_helper;
mod walk;

Expand All @@ -21,9 +22,9 @@ use std::time;

use anyhow::{anyhow, bail, Context, Result};
use clap::{CommandFactory, Parser};
use globset::GlobBuilder;
use lscolors::LsColors;
use regex::bytes::{Regex, RegexBuilder, RegexSetBuilder};
use patterns::PatternType;
use regex::bytes::RegexSetBuilder;

use crate::cli::{ColorWhen, HyperlinkWhen, Opts};
use crate::config::Config;
Expand All @@ -33,6 +34,7 @@ use crate::filetypes::FileTypes;
#[cfg(unix)]
use crate::filter::OwnerFilter;
use crate::filter::TimeFilter;
use crate::patterns::build_patterns;
use crate::regex_helper::{pattern_has_uppercase_char, pattern_matches_strings_with_leading_dot};

// We use jemalloc for performance reasons, see https://github.com/sharkdp/fd/pull/481
Expand Down Expand Up @@ -70,7 +72,7 @@ fn main() {
}

fn run() -> Result<ExitCode> {
let opts = Opts::parse();
let mut opts = Opts::parse();

#[cfg(feature = "completions")]
if let Some(shell) = opts.gen_completions()? {
Expand All @@ -84,28 +86,24 @@ fn run() -> Result<ExitCode> {
}

ensure_search_pattern_is_not_a_path(&opts)?;
let pattern = &opts.pattern;
let exprs = &opts.exprs;
let empty = Vec::new();
let mut patterns = opts.exprs.take().unwrap_or(Vec::new());
if let Some(pattern) = opts.pattern.take() {
patterns.push(pattern);
}

let pattern_regexps = exprs
.as_ref()
.unwrap_or(&empty)
.iter()
.chain([pattern])
.map(|pat| build_pattern_regex(pat, &opts))
.collect::<Result<Vec<String>>>()?;
let pattern_type = determine_pattern_type(&opts);
// The search will be case-sensitive if the command line flag is set or
// if any of the patterns has an uppercase character (smart case).
let ignore_case = opts.ignore_case
|| !(opts.case_sensitive || patterns.iter().any(|pat| pattern_has_uppercase_char(pat)));

let config = construct_config(opts, &pattern_regexps)?;
let config = construct_config(opts)?;

ensure_use_hidden_option_for_leading_dot_pattern(&config, &pattern_regexps)?;
ensure_use_hidden_option_for_leading_dot_pattern(&patterns, &config, pattern_type)?;

let regexps = pattern_regexps
.into_iter()
.map(|pat| build_regex(pat, &config))
.collect::<Result<Vec<Regex>>>()?;
let matcher = build_patterns(patterns, pattern_type, ignore_case)?;

walk::scan(&search_paths, regexps, config)
walk::scan(&search_paths, matcher, config)
}

#[cfg(feature = "completions")]
Expand Down Expand Up @@ -145,35 +143,38 @@ fn set_working_dir(opts: &Opts) -> Result<()> {

/// Detect if the user accidentally supplied a path instead of a search pattern
fn ensure_search_pattern_is_not_a_path(opts: &Opts) -> Result<()> {
if !opts.full_path
&& opts.pattern.contains(std::path::MAIN_SEPARATOR)
&& Path::new(&opts.pattern).is_dir()
{
Err(anyhow!(
"The search pattern '{pattern}' contains a path-separation character ('{sep}') \
and will not lead to any search results.\n\n\
If you want to search for all files inside the '{pattern}' directory, use a match-all pattern:\n\n \
fd . '{pattern}'\n\n\
Instead, if you want your pattern to match the full file path, use:\n\n \
fd --full-path '{pattern}'",
pattern = &opts.pattern,
sep = std::path::MAIN_SEPARATOR,
))
} else {
Ok(())
if let Some(ref pattern) = opts.pattern {
if !opts.full_path
&& pattern.contains(std::path::MAIN_SEPARATOR)
&& Path::new(pattern).is_dir()
{
return Err(anyhow!(
"The search pattern '{pattern}' contains a path-separation character ('{sep}') \
and will not lead to any search results.\n\n\
If you want to search for all files inside the '{pattern}' directory, use a match-all pattern:\n\n \
fd . '{pattern}'\n\n\
Instead, if you want your pattern to match the full file path, use:\n\n \
fd --full-path '{pattern}'",
pattern = pattern,
sep = std::path::MAIN_SEPARATOR,
));
}
}
Ok(())
}

fn build_pattern_regex(pattern: &str, opts: &Opts) -> Result<String> {
Ok(if opts.glob && !pattern.is_empty() {
let glob = GlobBuilder::new(pattern).literal_separator(true).build()?;
glob.regex().to_owned()
fn determine_pattern_type(opts: &Opts) -> PatternType {
#[cfg(feature = "pcre")]
if opts.pcre {
return PatternType::Pcre;
}
if opts.glob {
PatternType::Glob
} else if opts.fixed_strings {
// Treat pattern as literal string if '--fixed-strings' is used
regex::escape(pattern)
PatternType::Fixed
} else {
String::from(pattern)
})
PatternType::Regex
}
}

fn check_path_separator_length(path_separator: Option<&str>) -> Result<()> {
Expand All @@ -190,15 +191,7 @@ fn check_path_separator_length(path_separator: Option<&str>) -> Result<()> {
}
}

fn construct_config(mut opts: Opts, pattern_regexps: &[String]) -> Result<Config> {
// The search will be case-sensitive if the command line flag is set or
// if any of the patterns has an uppercase character (smart case).
let case_sensitive = !opts.ignore_case
&& (opts.case_sensitive
|| pattern_regexps
.iter()
.any(|pat| pattern_has_uppercase_char(pat)));

fn construct_config(mut opts: Opts) -> Result<Config> {
let path_separator = opts
.path_separator
.take()
Expand Down Expand Up @@ -244,7 +237,6 @@ fn construct_config(mut opts: Opts, pattern_regexps: &[String]) -> Result<Config
let has_command = command.is_some();

Ok(Config {
case_sensitive,
search_full_path: opts.full_path,
ignore_hidden: !(opts.hidden || opts.rg_alias_ignore()),
read_fdignore: !(opts.no_ignore || opts.rg_alias_ignore()),
Expand Down Expand Up @@ -455,14 +447,13 @@ fn extract_time_constraints(opts: &Opts) -> Result<Vec<TimeFilter>> {
}

fn ensure_use_hidden_option_for_leading_dot_pattern(
patterns: &[String],
config: &Config,
pattern_regexps: &[String],
pattern_type: PatternType,
) -> Result<()> {
if cfg!(unix)
&& config.ignore_hidden
&& pattern_regexps
.iter()
.any(|pat| pattern_matches_strings_with_leading_dot(pat))
&& patterns_match_strings_with_leading_dots(patterns, pattern_type)
{
Err(anyhow!(
"The pattern(s) seems to only match files with a leading dot, but hidden files are \
Expand All @@ -474,17 +465,20 @@ fn ensure_use_hidden_option_for_leading_dot_pattern(
}
}

fn build_regex(pattern_regex: String, config: &Config) -> Result<regex::bytes::Regex> {
RegexBuilder::new(&pattern_regex)
.case_insensitive(!config.case_sensitive)
.dot_matches_new_line(true)
.build()
.map_err(|e| {
anyhow!(
"{}\n\nNote: You can use the '--fixed-strings' option to search for a \
literal string instead of a regular expression. Alternatively, you can \
also use the '--glob' option to match on a glob pattern.",
e.to_string()
)
})
/// Report whether any of `patterns` looks like it only matches names that
/// begin with a literal dot.
///
/// The inspection performed depends on `pattern_type`; each arm below
/// documents its own heuristic.
fn patterns_match_strings_with_leading_dots(
    patterns: &[String],
    pattern_type: PatternType,
) -> bool {
    match pattern_type {
        PatternType::Regex => patterns
            .iter()
            .any(|pat| pattern_matches_strings_with_leading_dot(pat)),
        // For PCRE just do a basic textual check: does the pattern start with
        // `^\.` (an anchor followed by an escaped, i.e. literal, dot)? We can't
        // parse it to an AST.
        #[cfg(feature = "pcre")]
        PatternType::Pcre => patterns.iter().any(|pat| pat.starts_with("^\\.")),
        // fixed strings aren't anchored so always false
        PatternType::Fixed => false,
        // globs just check if it starts with a .
        PatternType::Glob => patterns.iter().any(|pat| pat.starts_with('.')),
    }
}
Loading

0 comments on commit 9ef7587

Please sign in to comment.