From f2b7a7a9b7456515aad143a33bece9b5dcb6c8d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20F=C3=A4rnstrand?= Date: Fri, 27 Sep 2024 16:43:29 +0200 Subject: [PATCH] Add --print-characters flag. Helps discover the unicode ranges --- src/config.rs | 8 ++---- src/main.rs | 54 +++++++++++++++++++++++++++++++++-------- src/rules.rs | 2 +- src/unicode_notation.rs | 9 +++++++ 4 files changed, 56 insertions(+), 17 deletions(-) create mode 100644 src/unicode_notation.rs diff --git a/src/config.rs b/src/config.rs index 5f11986..00a43bc 100644 --- a/src/config.rs +++ b/src/config.rs @@ -48,12 +48,8 @@ impl FromStr for CharacterType { } fn unicode_notation_to_char(unicode_notation: &str) -> Result { - let parse = |unicode_notation: &str| -> Option<char> { - let hex_str_number = unicode_notation.strip_prefix("U+")?; - let int_number = u32::from_str_radix(hex_str_number, 16).ok()?; - char::from_u32(int_number) - }; - parse(unicode_notation).ok_or_else(|| InvalidCharacterType(unicode_notation.to_owned())) + crate::unicode_notation::unicode_notation_to_char(unicode_notation) + .ok_or_else(|| InvalidCharacterType(unicode_notation.to_owned())) } /// All types of code that can have special rules about what is allowed or denied. diff --git a/src/main.rs b/src/main.rs index 89bf231..9dc8c8e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,22 +1,21 @@ use std::collections::HashMap; use std::fs; use std::io; -use std::path::Path; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use anyhow::Context; use clap::Parser; -use config::CodeType; -use config::Config; -use config::Language; use miette::{miette, LabeledSpan, NamedSource, Severity}; -use rules::Decision; -use rules::RuleSet; use unic_ucd_name::Name; +use crate::config::{CodeType, Config, Language}; +use crate::rules::{CharacterType, Decision, RuleSet}; +use crate::unicode_notation::char_to_unicode_notation; + mod config; mod rules; mod unicode_blocks; +mod unicode_notation; // Replaces the previous idea of "RuleChain"s.
struct RuleDispatcher { @@ -113,10 +112,23 @@ struct Args { paths: Vec, /// Print the names of all the Unicode blocks that this tool recognizes, then exits. + /// /// Enable verbose output to also print the code point ranges for each block. #[arg(long)] print_unicode_blocks: bool, + /// Print the character(s) in the given character type, then exit. + /// + /// As argument you can specify anything you can add to the allow and deny lists in the + /// config file. For example: + /// + /// `--print-characters "Mathematical Operators"` will print all unicode code points + /// in that block. + /// + /// `--print-characters U+100..U+1ff` will print all characters between 100 and 1ff (hex) + #[arg(long)] + print_characters: Option<CharacterType>, + /// Enable more verbose output. #[arg(short, long)] verbose: bool, @@ -131,15 +143,28 @@ fn main() -> anyhow::Result<()> { for (&name, range) in &unicode_blocks::UNICODE_BLOCKS { print!("{name}"); if args.verbose { - let range_start = u32::from(*range.start()); - let range_end = u32::from(*range.end()); - print!(": U+{range_start}..U+{range_end}"); + let range_start = char_to_unicode_notation(*range.start()); + let range_end = char_to_unicode_notation(*range.end()); + print!(": {range_start}..{range_end}"); } println!(); } return Ok(()); } + if let Some(character_type) = args.print_characters { + match character_type { + CharacterType::CodePoint(c) => print_char_range(c..=c), + CharacterType::Range(range) => print_char_range(range), + CharacterType::Bidi => print_char_range(rules::BIDI_CHARACTERS.iter().copied()), + CharacterType::Block(block) => print_char_range(block.clone()), + // TODO: `char::MIN` and `char::MAX` are heading for stabilization. When they are + // stable we can replace these constants with those in std.
+ CharacterType::Anything => print_char_range('\0'..='\u{10ffff}'), + } + return Ok(()); + } + let default_config = get_default_config(); let mut dispatcher = RuleDispatcher { user_config: None, @@ -331,3 +356,12 @@ fn get_default_config() -> Config { ]), } } + +/// Prints to stdout, one line per character in the iterator. +/// The format is to first print the unicode notation followed +/// by the actual character +fn print_char_range(range: impl Iterator<Item = char>) { + for c in range { + println!("{}: {c}", char_to_unicode_notation(c)); + } +} \ No newline at end of file diff --git a/src/rules.rs b/src/rules.rs index a6ebee7..b399fb0 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -46,7 +46,7 @@ impl RuleSet { } } -#[derive(Debug)] +#[derive(Debug, Clone)] pub enum CharacterType { /// Single character (eg. "U+9000") CodePoint(char), diff --git a/src/unicode_notation.rs b/src/unicode_notation.rs new file mode 100644 index 0000000..5318ed0 --- /dev/null +++ b/src/unicode_notation.rs @@ -0,0 +1,9 @@ +pub fn unicode_notation_to_char(unicode_notation: &str) -> Option<char> { + let hex_str_number = unicode_notation.strip_prefix("U+")?; + let int_number = u32::from_str_radix(hex_str_number, 16).ok()?; + char::from_u32(int_number) +} + +pub fn char_to_unicode_notation(c: char) -> String { + format!("U+{:X}", u32::from(c)) +}