From f2b7a7a9b7456515aad143a33bece9b5dcb6c8d6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Linus=20F=C3=A4rnstrand?= <linus@mullvad.net>
Date: Fri, 27 Sep 2024 16:43:29 +0200
Subject: [PATCH] Add --print-characters flag. Helps discover the unicode
 ranges

---
 src/config.rs           |  8 ++----
 src/main.rs             | 54 +++++++++++++++++++++++++++++++++--------
 src/rules.rs            |  2 +-
 src/unicode_notation.rs |  9 +++++++
 4 files changed, 56 insertions(+), 17 deletions(-)
 create mode 100644 src/unicode_notation.rs
diff --git a/src/config.rs b/src/config.rs
index 5f11986..00a43bc 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -48,12 +48,8 @@ impl FromStr for CharacterType {
 }
 
 fn unicode_notation_to_char(unicode_notation: &str) -> Result<char, InvalidCharacterType> {
-    let parse = |unicode_notation: &str| -> Option<char> {
-        let hex_str_number = unicode_notation.strip_prefix("U+")?;
-        let int_number = u32::from_str_radix(hex_str_number, 16).ok()?;
-        char::from_u32(int_number)
-    };
-    parse(unicode_notation).ok_or_else(|| InvalidCharacterType(unicode_notation.to_owned()))
+    crate::unicode_notation::unicode_notation_to_char(unicode_notation)
+        .ok_or_else(|| InvalidCharacterType(unicode_notation.to_owned()))
 }
 
 /// All types of code that can have special rules about what is allowed or denied.
diff --git a/src/main.rs b/src/main.rs
index 89bf231..9dc8c8e 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,22 +1,21 @@
 use std::collections::HashMap;
 use std::fs;
 use std::io;
-use std::path::Path;
-use std::path::PathBuf;
+use std::path::{Path, PathBuf};
 
 use anyhow::Context;
 use clap::Parser;
-use config::CodeType;
-use config::Config;
-use config::Language;
 use miette::{miette, LabeledSpan, NamedSource, Severity};
-use rules::Decision;
-use rules::RuleSet;
 use unic_ucd_name::Name;
 
+use crate::config::{CodeType, Config, Language};
+use crate::rules::{CharacterType, Decision, RuleSet};
+use crate::unicode_notation::char_to_unicode_notation;
+
 mod config;
 mod rules;
 mod unicode_blocks;
+mod unicode_notation;
 
 // Replaces the previous idea of "RuleChain"s.
 struct RuleDispatcher {
@@ -113,10 +112,23 @@ struct Args {
     paths: Vec<PathBuf>,
 
     /// Print the names of all the Unicode blocks that this tool recognizes, then exits.
+    ///
     /// Enable verbose output to also print the code point ranges for each block.
     #[arg(long)]
     print_unicode_blocks: bool,
 
+    /// Print the character(s) in the given character type, then exit.
+    ///
+    /// As an argument you can specify anything you can add to the allow and deny lists in the
+    /// config file. For example:
+    ///
+    /// `--print-characters "Mathematical Operators"` will print all Unicode code points
+    /// in that block.
+    ///
+    /// `--print-characters U+100..U+1ff` will print all characters between 100 and 1ff (hex)
+    #[arg(long)]
+    print_characters: Option<CharacterType>,
+
     /// Enable more verbose output.
     #[arg(short, long)]
     verbose: bool,
@@ -131,15 +143,28 @@ fn main() -> anyhow::Result<()> {
         for (&name, range) in &unicode_blocks::UNICODE_BLOCKS {
             print!("{name}");
             if args.verbose {
-                let range_start = u32::from(*range.start());
-                let range_end = u32::from(*range.end());
-                print!(": U+{range_start}..U+{range_end}");
+                let range_start = char_to_unicode_notation(*range.start());
+                let range_end = char_to_unicode_notation(*range.end());
+                print!(": {range_start}..{range_end}");
             }
             println!();
         }
         return Ok(());
     }
 
+    if let Some(character_type) = args.print_characters {
+        match character_type {
+            CharacterType::CodePoint(c) => print_char_range(c..=c),
+            CharacterType::Range(range) => print_char_range(range),
+            CharacterType::Bidi => print_char_range(rules::BIDI_CHARACTERS.iter().copied()),
+            CharacterType::Block(block) => print_char_range(block.clone()),
+            // TODO: `char::MIN` and `char::MAX` are heading for stabilization. When they are
+            // stable we can replace these constants with those in std.
+            CharacterType::Anything => print_char_range('\0'..='\u{10ffff}'),
+        }
+        return Ok(());
+    }
+
     let default_config = get_default_config();
     let mut dispatcher = RuleDispatcher {
         user_config: None,
@@ -331,3 +356,12 @@ fn get_default_config() -> Config {
         ]),
     }
 }
+
+/// Prints to stdout, one line per character in the iterator.
+/// Each line is formatted as `U+<HEX>: <char>`: the Unicode notation
+/// of the character, followed by the character itself.
+fn print_char_range(range: impl Iterator<Item = char>) {
+    for c in range {
+        println!("{}: {c}", char_to_unicode_notation(c));
+    }
+}
\ No newline at end of file
diff --git a/src/rules.rs b/src/rules.rs
index a6ebee7..b399fb0 100644
--- a/src/rules.rs
+++ b/src/rules.rs
@@ -46,7 +46,7 @@ impl RuleSet {
     }
 }
 
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub enum CharacterType {
     /// Single character (eg. "U+9000")
     CodePoint(char),
diff --git a/src/unicode_notation.rs b/src/unicode_notation.rs
new file mode 100644
index 0000000..5318ed0
--- /dev/null
+++ b/src/unicode_notation.rs
@@ -0,0 +1,9 @@
+pub fn unicode_notation_to_char(unicode_notation: &str) -> Option<char> {
+    let hex_str_number = unicode_notation.strip_prefix("U+")?;
+    let int_number = u32::from_str_radix(hex_str_number, 16).ok()?;
+    char::from_u32(int_number)
+}
+
+pub fn char_to_unicode_notation(c: char) -> String {
+    format!("U+{:X}", u32::from(c))
+}