From dde4727a1d80e5aff5eea3023c07318757292b7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20F=C3=A4rnstrand?= Date: Fri, 26 Jul 2024 13:58:41 +0200 Subject: [PATCH 1/5] Partially through messing with how to take decisions --- Cargo.lock | 72 +++++++++++++++++++++++++++++++ Cargo.toml | 3 ++ src/config.rs | 42 ++++++++++++------ src/main.rs | 116 +++++++++++++++++++++++++++++++++++++++++++++++++- src/rules.rs | 17 +------- 5 files changed, 221 insertions(+), 29 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 296940f..55e8492 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -75,6 +75,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "anyhow" +version = "1.0.86" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da" + [[package]] name = "automod" version = "1.0.14" @@ -365,6 +371,48 @@ version = "4.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "caff54706df99d2a78a5a4e3455ff45448d81ef1bb63c22cd14052ca0e993a3f" +[[package]] +name = "phf" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" +dependencies = [ + "phf_macros", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" +dependencies = [ + "phf_shared", + "rand", +] + +[[package]] +name = "phf_macros" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3444646e286606587e49f3bcf1679b8cef1dc2c5ecc29ddacaffc305180d464b" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "phf_shared" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" +dependencies = [ + "siphasher", +] + [[package]] name = "proc-macro2" version = "1.0.83" @@ -383,6 +431,21 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" + [[package]] name = "rayon" version = "1.10.0" @@ -510,6 +573,12 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa42c91313f1d05da9b26f267f931cf178d4aba455b4c4622dd7355eb80c6640" +[[package]] +name = "siphasher" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" + [[package]] name = "smawk" version = "0.3.2" @@ -796,7 +865,10 @@ checksum = "68f5e5f3158ecfd4b8ff6fe086db7c8467a2dfdac97fe420f2b7c4aa97af66d6" name = "unicop" version = "0.1.0" dependencies = [ + "anyhow", + "glob", "miette", + "phf", "serde", "toml", "tree-sitter", diff --git a/Cargo.toml b/Cargo.toml index 58ac136..e6f7217 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,6 +27,9 @@ unic-char-range = "0.9.0" toml = "0.8.14" serde = { version = "1.0.203", features = ["derive"] } walkdir = "2.5.0" +anyhow = "1.0.86" +glob = "0.3.1" +phf = { version = "0.11.2", features = ["macros"] } [dev-dependencies] trycmd = "0.15.5" diff --git a/src/config.rs b/src/config.rs index 945a8a8..976ae13 100644 --- a/src/config.rs +++ b/src/config.rs @@ -60,7 +60,7 @@ fn unicode_notation_to_char(unicode_notation: &str) -> Result = phf::phf_map! { + "comment" => CodeType::Comment, + "block_comment" => CodeType::Comment, +}; + +impl Language { + pub fn lookup_code_type(&self, tree_sitter_code_type: &str) -> Option { + match self { + Language::Rust => RUST_CODE_TYPES.get(tree_sitter_code_type).copied(), + _ => None, + } + } +} + #[derive(Debug, Eq, PartialEq, Default, serde::Deserialize)] -struct ConfigRules { +pub struct ConfigRules { #[serde(default)] - default: RuleSet, + pub default: RuleSet, #[serde(flatten)] - code_type_rules: HashMap, + pub code_type_rules: HashMap, } #[derive(Debug, Eq, PartialEq, serde::Deserialize)] -struct LanguageRules { +pub struct LanguageRules { + // None = Inherit default path globs + // Some([]) = No paths will ever match this language + // Some([...]) = Match every file against these glob patterns. + // Run this language parser if at least one matches. #[serde(default)] - paths: Vec, + pub paths: Option>, #[serde(flatten)] - rules: ConfigRules, + pub rules: ConfigRules, } #[derive(Debug, Eq, PartialEq, Default, serde::Deserialize)] -struct Config { +pub struct Config { #[serde(default)] - global: ConfigRules, + pub global: ConfigRules, #[serde(default)] - language: HashMap, + pub language: HashMap, } #[cfg(test)] @@ -170,7 +188,7 @@ deny = ["Tibetan"] language: HashMap::from([( Language::Rust, LanguageRules { - paths: vec![], + paths: None, rules: ConfigRules { default: RuleSet { allow: vec![ diff --git a/src/main.rs b/src/main.rs index 690d159..6d7ffc2 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,18 +1,91 @@ +use std::collections::HashMap; use std::env; use std::fs; +use std::io; use std::path::Path; +use anyhow::Context; +use config::CodeType; +use config::Config; +use config::Language; use miette::{miette, LabeledSpan, NamedSource, Severity}; +use rules::Decision; +use rules::RuleSet; use unic_ucd_name::Name; mod config; mod rules; -fn main() { +// Replaces the previous idea of "RuleChain"s. +struct RuleDispatcher { + user_config: Config, + default_config: Config, +} + +impl RuleDispatcher { + pub fn decision(&self, c: char, language: Language, code_type: Option) -> Decision { + if let Some(decision) = Self::decision_for_config(&self.user_config, c, language, code_type) + { + return decision; + } + if let Some(decision) = + Self::decision_for_config(&self.default_config, c, language, code_type) + { + return decision; + } + Decision::Deny + } + + // Rulechain: + // 1. Code type specific ruleset for specific language + // 2. Default ruleset for specific language + // 3. Code type specific ruleset in global section + // 4. Default rules in global section + fn decision_for_config( + config: &Config, + c: char, + language: Language, + code_type: Option, + ) -> Option { + if let Some(language_rules) = config.language.get(&language) { + // 1. + if let Some(language_code_type_rules) = + code_type.and_then(|ct| language_rules.rules.code_type_rules.get(&ct)) + { + if let Some(decision) = language_code_type_rules.decision(c) { + return Some(decision); + } + } + // 2. + if let Some(decision) = language_rules.rules.default.decision(c) { + return Some(decision); + } + } + // 3. + if let Some(global_code_type_rules) = + code_type.and_then(|ct| config.global.code_type_rules.get(&ct)) + { + if let Some(decision) = global_code_type_rules.decision(c) { + return Some(decision); + } + } + // 4. + if let Some(decision) = config.global.default.decision(c) { + return Some(decision); + } + // This config does not have any opinion on this character + None + } +} + +fn main() -> anyhow::Result<()> { let mut args: Vec = env::args().skip(1).collect(); if args.is_empty() { args = vec![String::from(".")] } + + let _config = get_config()?; + for arg in args { for entry in walkdir::WalkDir::new(arg) { match entry { @@ -22,6 +95,7 @@ fn main() { } } } + Ok(()) } fn check_file(path: &Path) { @@ -83,3 +157,43 @@ fn detect_language(path: &Path) -> Option { _ => None, } } + +fn get_config() -> anyhow::Result { + match std::fs::read_to_string("./unicop.toml") { + Ok(config_str) => toml::from_str(&config_str).context("Failed to parse config"), + Err(e) if e.kind() == io::ErrorKind::NotFound => Ok(get_default_config()), + Err(e) => Err(e).context("Failed to read config file"), + } +} + +/// Comments and string literals allow all unicode except Bidi characters, +/// all other kinds of code deny all unicode. +fn get_default_config() -> Config { + Config { + global: config::ConfigRules { + default: RuleSet { + allow: vec![], + deny: vec![], + }, + code_type_rules: [ + ( + config::CodeType::Comment, + RuleSet { + allow: vec![rules::CharacterType::Anything], + deny: vec![rules::CharacterType::Bidi], + }, + ), + ( + config::CodeType::StringLiteral, + RuleSet { + allow: vec![rules::CharacterType::Anything], + deny: vec![rules::CharacterType::Bidi], + }, + ), + ] + .into_iter() + .collect(), + }, + language: HashMap::new(), + } +} diff --git a/src/rules.rs b/src/rules.rs index 1f6eb43..10e4fb1 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -3,21 +3,6 @@ pub enum Decision { Deny, } -pub struct RuleChain { - pub rules: Vec, -} - -impl RuleChain { - pub fn decision(&self, c: char) -> Decision { - for ruleset in &self.rules { - if let Some(decision) = ruleset.decision(c) { - return decision; - } - } - Decision::Deny - } -} - #[derive(Debug, Eq, PartialEq, Default, serde::Deserialize)] pub struct RuleSet { #[serde(default)] @@ -27,7 +12,7 @@ pub struct RuleSet { } impl RuleSet { - fn decision(&self, c: char) -> Option { + pub fn decision(&self, c: char) -> Option { let allow_specificity = self .allow .iter() From 83335614d3acdb389f85869c2f6a2b6a9a151921 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gr=C3=A9goire=20D=C3=A9trez?= Date: Fri, 26 Jul 2024 14:28:42 +0200 Subject: [PATCH 2/5] Replace PoC logic with the rule dispatcher --- src/config.rs | 14 +++++++++++++- src/main.rs | 49 +++++++++++++++++++++++++++++++------------------ src/rules.rs | 6 +++++- 3 files changed, 49 insertions(+), 20 deletions(-) diff --git a/src/config.rs b/src/config.rs index 976ae13..263d3fb 100644 --- a/src/config.rs +++ b/src/config.rs @@ -79,11 +79,23 @@ static RUST_CODE_TYPES: phf::Map<&'static str, CodeType> = phf::phf_map! { "block_comment" => CodeType::Comment, }; +static JAVASCRIPT_CODE_TYPES: phf::Map<&'static str, CodeType> = phf::phf_map! { + "comment" => CodeType::Comment, + "block_comment" => CodeType::Comment, + "string_fragment" => CodeType::StringLiteral, +}; + +static PYTHON_CODE_TYPES: phf::Map<&'static str, CodeType> = phf::phf_map! { + "string_content" => CodeType::StringLiteral, + "comment" => CodeType::Comment, +}; + impl Language { pub fn lookup_code_type(&self, tree_sitter_code_type: &str) -> Option { match self { + Language::Javascript => JAVASCRIPT_CODE_TYPES.get(tree_sitter_code_type).copied(), Language::Rust => RUST_CODE_TYPES.get(tree_sitter_code_type).copied(), - _ => None, + Language::Python => PYTHON_CODE_TYPES.get(tree_sitter_code_type).copied(), } } } diff --git a/src/main.rs b/src/main.rs index 6d7ffc2..7760746 100644 --- a/src/main.rs +++ b/src/main.rs @@ -18,15 +18,16 @@ mod rules; // Replaces the previous idea of "RuleChain"s. struct RuleDispatcher { - user_config: Config, + user_config: Option, default_config: Config, } impl RuleDispatcher { pub fn decision(&self, c: char, language: Language, code_type: Option) -> Decision { - if let Some(decision) = Self::decision_for_config(&self.user_config, c, language, code_type) - { - return decision; + if let Some(user_config) = &self.user_config { + if let Some(decision) = Self::decision_for_config(user_config, c, language, code_type) { + return decision; + } } if let Some(decision) = Self::decision_for_config(&self.default_config, c, language, code_type) @@ -84,13 +85,18 @@ fn main() -> anyhow::Result<()> { args = vec![String::from(".")] } - let _config = get_config()?; + let default_config = get_default_config(); + let user_config = get_user_config()?; + let dispatcher = RuleDispatcher { + user_config, + default_config, + }; for arg in args { for entry in walkdir::WalkDir::new(arg) { match entry { Err(err) => eprintln!("{:}", err), - Ok(entry) if entry.file_type().is_file() => check_file(entry.path()), + Ok(entry) if entry.file_type().is_file() => check_file(&dispatcher, entry.path()), Ok(_) => {} } } @@ -98,15 +104,15 @@ fn main() -> anyhow::Result<()> { Ok(()) } -fn check_file(path: &Path) { - let Some(lang) = detect_language(path) else { +fn check_file(dispatcher: &RuleDispatcher, path: &Path) { + let Some((lang, tslang)) = detect_language(path) else { return; }; let filename = path.display().to_string(); let src = fs::read_to_string(path).unwrap(); let named_source = NamedSource::new(&filename, src.clone()); let mut parser = tree_sitter::Parser::new(); - parser.set_language(&lang).expect("Error loading grammar"); + parser.set_language(&tslang).expect("Error loading grammar"); let tree = parser.parse(&src, None).unwrap(); if tree.root_node().has_error() { println!( @@ -123,9 +129,11 @@ fn check_file(path: &Path) { .root_node() .named_descendant_for_byte_range(off, off + ch.len_utf8()) .unwrap(); - let kind = node.kind(); - if kind == "comment" || kind == "string_fragment" { - continue; + let tskind = node.kind(); + let code_type = lang.lookup_code_type(tskind); + match dispatcher.decision(ch, lang, code_type) { + Decision::Allow => continue, + Decision::Deny => {} } let chname = Name::of(ch).unwrap(); let report = miette!( @@ -148,20 +156,22 @@ fn check_file(path: &Path) { // the Rust crates. So for now we have a simplified language-detection with hard-coded // configurations. // See https://tree-sitter.github.io/tree-sitter/syntax-highlighting#language-detection -fn detect_language(path: &Path) -> Option { +fn detect_language(path: &Path) -> Option<(Language, tree_sitter::Language)> { match path.extension()?.to_str()? { // https://github.com/tree-sitter/tree-sitter-javascript/blob/master/package.json - "js" | "mjs" | "cjs" | "jsx" => Some(tree_sitter_javascript::language()), + "js" | "mjs" | "cjs" | "jsx" => { + Some((Language::Javascript, tree_sitter_javascript::language())) + } // https://github.com/tree-sitter/tree-sitter-python/blob/master/package.json - "py" => Some(tree_sitter_python::language()), + "py" => Some((Language::Python, tree_sitter_python::language())), _ => None, } } -fn get_config() -> anyhow::Result { +fn get_user_config() -> anyhow::Result> { match std::fs::read_to_string("./unicop.toml") { Ok(config_str) => toml::from_str(&config_str).context("Failed to parse config"), - Err(e) if e.kind() == io::ErrorKind::NotFound => Ok(get_default_config()), + Err(e) if e.kind() == io::ErrorKind::NotFound => Ok(None), Err(e) => Err(e).context("Failed to read config file"), } } @@ -169,10 +179,13 @@ fn get_config() -> anyhow::Result { /// Comments and string literals allow all unicode except Bidi characters, /// all other kinds of code deny all unicode. fn get_default_config() -> Config { + let ascii = unic_ucd_block::BlockIter::new() + .find(|b| b.name == "Basic Latin") + .unwrap(); Config { global: config::ConfigRules { default: RuleSet { - allow: vec![], + allow: vec![rules::CharacterType::Block(ascii)], deny: vec![], }, code_type_rules: [ diff --git a/src/rules.rs b/src/rules.rs index 10e4fb1..a5e2097 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -59,7 +59,11 @@ impl CharacterType { match self { Self::CodePoint(rule_char) => *rule_char == c, Self::Range(range) => range.contains(c), - Self::Bidi => todo!(), + Self::Bidi => [ + '\u{202A}', '\u{202b}', '\u{202c}', '\u{202d}', '\u{202e}', '\u{2066}', '\u{2067}', + '\u{2068}', '\u{2069}', + ] + .contains(&c), Self::Block(block) => block.range.contains(c), Self::Anything => true, } From 9bb4c30e25c93faeb7da21314a91e603db1bd099 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gr=C3=A9goire=20D=C3=A9trez?= Date: Fri, 26 Jul 2024 14:59:42 +0200 Subject: [PATCH 3/5] Working stuff!!! --- Cargo.lock | 11 ++++++++ Cargo.toml | 1 + src/config.rs | 41 +++++++++++++++++++++++---- src/main.rs | 78 +++++++++++++++++++++++++++++++++++++-------------- 4 files changed, 105 insertions(+), 26 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 55e8492..7092fce 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -765,6 +765,16 @@ dependencies = [ "tree-sitter", ] +[[package]] +name = "tree-sitter-rust" +version = "0.21.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "277690f420bf90741dea984f3da038ace46c4fe6047cba57a66822226cde1c93" +dependencies = [ + "cc", + "tree-sitter", +] + [[package]] name = "trycmd" version = "0.15.5" @@ -874,6 +884,7 @@ dependencies = [ "tree-sitter", "tree-sitter-javascript", "tree-sitter-python", + "tree-sitter-rust", "trycmd", "unic-char-range", "unic-ucd-block", diff --git a/Cargo.toml b/Cargo.toml index e6f7217..41d3660 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,6 +30,7 @@ walkdir = "2.5.0" anyhow = "1.0.86" glob = "0.3.1" phf = { version = "0.11.2", features = ["macros"] } +tree-sitter-rust = "0.21.2" [dev-dependencies] trycmd = "0.15.5" diff --git a/src/config.rs b/src/config.rs index 263d3fb..92f348f 100644 --- a/src/config.rs +++ b/src/config.rs @@ -75,8 +75,10 @@ pub enum Language { } static RUST_CODE_TYPES: phf::Map<&'static str, CodeType> = phf::phf_map! { - "comment" => CodeType::Comment, - "block_comment" => CodeType::Comment, + "doc_comment" => CodeType::Comment, + "line_comment" => CodeType::Comment, + "string_content" => CodeType::StringLiteral, + "char_literal" => CodeType::StringLiteral, }; static JAVASCRIPT_CODE_TYPES: phf::Map<&'static str, CodeType> = phf::phf_map! { @@ -98,6 +100,14 @@ impl Language { Language::Python => PYTHON_CODE_TYPES.get(tree_sitter_code_type).copied(), } } + + pub fn grammar(&self) -> tree_sitter::Language { + match self { + Language::Javascript => tree_sitter_javascript::language(), + Language::Python => tree_sitter_python::language(), + Language::Rust => tree_sitter_rust::language(), + } + } } #[derive(Debug, Eq, PartialEq, Default, serde::Deserialize)] @@ -114,12 +124,30 @@ pub struct LanguageRules { // Some([]) = No paths will ever match this language // Some([...]) = Match every file against these glob patterns. // Run this language parser if at least one matches. - #[serde(default)] - pub paths: Option>, + #[serde(default, deserialize_with = "deserialize_pattern")] + pub paths: Option>, #[serde(flatten)] pub rules: ConfigRules, } +fn deserialize_pattern<'de, D>(deserializer: D) -> Result>, D::Error> +where + D: serde::Deserializer<'de>, +{ + let s: Option> = serde::Deserialize::deserialize(deserializer)?; + match s { + None => Ok(None), + Some(v) => { + let res = v + .iter() + .map(|s| glob::Pattern::new(&s)) + .collect::, _>>() + .map_err(serde::de::Error::custom)?; + Ok(Some(res)) + } + } +} + #[derive(Debug, Eq, PartialEq, Default, serde::Deserialize)] pub struct Config { #[serde(default)] @@ -171,6 +199,9 @@ deny = ["*"] allow = ["*"] deny = ["bidi"] +[language.rust] +paths = ["**/*.rs"] + [language.rust.default] allow = ["Tibetan", "U+9000"] deny = ["U+5000..U+5004"] @@ -200,7 +231,7 @@ deny = ["Tibetan"] language: HashMap::from([( Language::Rust, LanguageRules { - paths: None, + paths: Some(vec![glob::Pattern::new("**/*.rs").unwrap()]), rules: ConfigRules { default: RuleSet { allow: vec![ diff --git a/src/main.rs b/src/main.rs index 7760746..bf04322 100644 --- a/src/main.rs +++ b/src/main.rs @@ -23,6 +23,31 @@ struct RuleDispatcher { } impl RuleDispatcher { + pub fn language(&self, filepath: &Path) -> Option { + if let Some(userconf) = &self.user_config { + if let Some(lang) = Self::language_for_config(&userconf, filepath) { + return Some(lang); + } + } + if let Some(lang) = Self::language_for_config(&self.default_config, filepath) { + return Some(lang); + } + None + } + + fn language_for_config(config: &Config, filepath: &Path) -> Option { + for (lang, langconf) in &config.language { + if let Some(paths) = &langconf.paths { + for glob in paths { + if glob.matches_path(filepath) { + return Some(lang.clone()); + } + } + } + } + None + } + pub fn decision(&self, c: char, language: Language, code_type: Option) -> Decision { if let Some(user_config) = &self.user_config { if let Some(decision) = Self::decision_for_config(user_config, c, language, code_type) { @@ -105,14 +130,16 @@ fn main() -> anyhow::Result<()> { } fn check_file(dispatcher: &RuleDispatcher, path: &Path) { - let Some((lang, tslang)) = detect_language(path) else { + let Some(lang) = dispatcher.language(path) else { return; }; let filename = path.display().to_string(); let src = fs::read_to_string(path).unwrap(); let named_source = NamedSource::new(&filename, src.clone()); let mut parser = tree_sitter::Parser::new(); - parser.set_language(&tslang).expect("Error loading grammar"); + parser + .set_language(&lang.grammar()) + .expect("Error loading grammar"); let tree = parser.parse(&src, None).unwrap(); if tree.root_node().has_error() { println!( @@ -150,24 +177,6 @@ fn check_file(dispatcher: &RuleDispatcher, path: &Path) { } } -// Tree-sitter grammars include some configurations to help decide whether the language applies to -// a given file. -// Unfortunately, neither the language-detection algorithm nor the configurations are included in -// the Rust crates. So for now we have a simplified language-detection with hard-coded -// configurations. -// See https://tree-sitter.github.io/tree-sitter/syntax-highlighting#language-detection -fn detect_language(path: &Path) -> Option<(Language, tree_sitter::Language)> { - match path.extension()?.to_str()? { - // https://github.com/tree-sitter/tree-sitter-javascript/blob/master/package.json - "js" | "mjs" | "cjs" | "jsx" => { - Some((Language::Javascript, tree_sitter_javascript::language())) - } - // https://github.com/tree-sitter/tree-sitter-python/blob/master/package.json - "py" => Some((Language::Python, tree_sitter_python::language())), - _ => None, - } -} - fn get_user_config() -> anyhow::Result> { match std::fs::read_to_string("./unicop.toml") { Ok(config_str) => toml::from_str(&config_str).context("Failed to parse config"), @@ -207,6 +216,33 @@ fn get_default_config() -> Config { .into_iter() .collect(), }, - language: HashMap::new(), + language: HashMap::from([ + ( + Language::Rust, + config::LanguageRules { + paths: Some(vec![glob::Pattern::new("**/*.rs").unwrap()]), + rules: Default::default(), + }, + ), + ( + Language::Python, + config::LanguageRules { + paths: Some(vec![glob::Pattern::new("**/*.py").unwrap()]), + rules: Default::default(), + }, + ), + ( + Language::Javascript, + config::LanguageRules { + paths: Some(vec![ + glob::Pattern::new("**/*.js").unwrap(), + glob::Pattern::new("**/*.mjs").unwrap(), + glob::Pattern::new("**/*.cjs").unwrap(), + glob::Pattern::new("**/*.jsx").unwrap(), + ]), + rules: Default::default(), + }, + ), + ]), } } From 38971010fbdb849c1d8f293773063691cf709ef2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gr=C3=A9goire=20D=C3=A9trez?= Date: Fri, 26 Jul 2024 15:16:42 +0200 Subject: [PATCH 4/5] Add comment about where we got the bidi characters --- src/rules.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/rules.rs b/src/rules.rs index a5e2097..6fb8314 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -60,6 +60,7 @@ impl CharacterType { Self::CodePoint(rule_char) => *rule_char == c, Self::Range(range) => range.contains(c), Self::Bidi => [ + // List of bidirectional formatting characters from https://en.wikipedia.org/wiki/Trojan_Source '\u{202A}', '\u{202b}', '\u{202c}', '\u{202d}', '\u{202e}', '\u{2066}', '\u{2067}', '\u{2068}', '\u{2069}', ] From c32ac66cab2d4e25e2cfade09c24ba120f3e6fb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gr=C3=A9goire=20D=C3=A9trez?= Date: Fri, 26 Jul 2024 16:56:42 +0200 Subject: [PATCH 5/5] Fix clippy warning --- src/config.rs | 2 +- src/main.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/config.rs b/src/config.rs index 92f348f..5a52418 100644 --- a/src/config.rs +++ b/src/config.rs @@ -140,7 +140,7 @@ where Some(v) => { let res = v .iter() - .map(|s| glob::Pattern::new(&s)) + .map(|s| glob::Pattern::new(s)) .collect::, _>>() .map_err(serde::de::Error::custom)?; Ok(Some(res)) diff --git a/src/main.rs b/src/main.rs index bf04322..2ac1b39 100644 --- a/src/main.rs +++ b/src/main.rs @@ -25,7 +25,7 @@ struct RuleDispatcher { impl RuleDispatcher { pub fn language(&self, filepath: &Path) -> Option { if let Some(userconf) = &self.user_config { - if let Some(lang) = Self::language_for_config(&userconf, filepath) { + if let Some(lang) = Self::language_for_config(userconf, filepath) { return Some(lang); } } @@ -40,7 +40,7 @@ impl RuleDispatcher { if let Some(paths) = &langconf.paths { for glob in paths { if glob.matches_path(filepath) { - return Some(lang.clone()); + return Some(*lang); } } }