Skip to content

Commit

Permalink
Generate our own unicode block constants
Browse files Browse the repository at this point in the history
Add a script to generate a custom unicode_blocks module with constants
representing Unicode blocks. This replaces the blocks from
unic-ucd-block.

Also refactor CharacterType::Range to use a RangeInclusive so we can
also remove the dependency on unic-char-range.
  • Loading branch information
gregoire-mullvad committed Jul 26, 2024
1 parent 3897101 commit 1203f23
Show file tree
Hide file tree
Showing 7 changed files with 705 additions and 40 deletions.
13 changes: 0 additions & 13 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 0 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@ tree-sitter = "0.22.6"
tree-sitter-javascript = "0.21.2"
tree-sitter-python = "0.21.0"
unic-ucd-name = "0.9.0"
unic-ucd-block = "0.9.0"
unic-char-range = "0.9.0"
toml = "0.8.14"
serde = { version = "1.0.203", features = ["derive"] }
walkdir = "2.5.0"
Expand Down
38 changes: 38 additions & 0 deletions hack/genblocks
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/usr/bin/env python
import io
import urllib.request
import re

# Matches one data line of Blocks.txt of the form "<low>..<high>; <name>",
# where <low> and <high> are hexadecimal code points.
BLOCKDEF = re.compile(
    r"^(?P<low>[0-9A-Fa-f]+)\.\.(?P<high>[0-9A-Fa-f]+); (?P<name>.*)$"
)


# Collected (name, low, high) tuples; low/high stay as the hex strings
# captured from the file.
blocks = []
# The `with` block closes the HTTP response once parsing is done.
with urllib.request.urlopen(
    "https://www.unicode.org/Public/UNIDATA/Blocks.txt"
) as resp:
    # Decode explicitly instead of relying on the locale's default encoding.
    # NOTE(review): assumes Blocks.txt is served as UTF-8 -- confirm.
    for line in io.TextIOWrapper(resp, encoding="utf-8"):
        # Lines that do not match (comments, blanks) are ignored.
        if match := BLOCKDEF.match(line.strip()):
            name = match.group("name")
            # Surrogate blocks are skipped. NOTE(review): presumably because
            # surrogate code points are not valid Rust `char` values.
            if name in {"Low Surrogates", "High Surrogates", "High Private Use Surrogates"}:
                continue
            low = match.group("low")
            high = match.group("high")
            blocks.append((name, low, high))


def constname(blockname):
    """Return the constant identifier derived from a Unicode block name.

    Every space and every hyphen becomes an underscore, and the result is
    upper-cased, e.g. "Latin-1 Supplement" -> "LATIN_1_SUPPLEMENT".
    """
    separators_to_underscore = str.maketrans(" -", "__")
    return blockname.translate(separators_to_underscore).upper()


# Write the generated Rust source to stdout: a header comment, one
# `pub const` inclusive char range per block, then a phf map from each
# block's original name to its constant.
print("// Code generated by hack/genblocks. DO NOT EDIT.")
print()
for name, low, high in blocks:
    ident = constname(name)
    # Doubled braces are literal braces; the innermost pair interpolates.
    print(
        f"pub const {ident}: std::ops::RangeInclusive<char> = "
        f"'\\u{{{low}}}'..='\\u{{{high}}}';"
    )

print(
    "pub static UNICODE_BLOCKS: phf::Map<&'static str, std::ops::RangeInclusive<char>> = phf::phf_map! {"
)
for name, _low, _high in blocks:
    # Key is the block name as written in Blocks.txt; value is its constant.
    print(f' "{name}" => {constname(name)},')
print("};")
22 changes: 6 additions & 16 deletions src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,15 +35,13 @@ impl FromStr for CharacterType {
if s == "*" {
return Ok(Self::Anything);
}
for block in unic_ucd_block::BlockIter::new() {
if block.name == s {
return Ok(Self::Block(block));
}
if let Some(range) = crate::unicode_blocks::UNICODE_BLOCKS.get(s) {
return Ok(Self::Block(range));
}
if let Some((low, high)) = s.split_once("..") {
let low = unicode_notation_to_char(low)?;
let high = unicode_notation_to_char(high)?;
return Ok(Self::Range(unic_char_range::CharRange { low, high }));
return Ok(Self::Range(low..=high));
}
unicode_notation_to_char(s).map(Self::CodePoint)
}
Expand Down Expand Up @@ -158,9 +156,6 @@ pub struct Config {

#[cfg(test)]
mod tests {
use unic_char_range::CharRange;
use unic_ucd_block::BlockIter;

use super::*;
use crate::rules::*;

Expand Down Expand Up @@ -212,8 +207,6 @@ deny = ["Tibetan"]
)
.unwrap();

let tibetan_block = BlockIter::new().find(|b| b.name == "Tibetan").unwrap();

let expected_config = Config {
global: ConfigRules {
default: RuleSet {
Expand All @@ -235,19 +228,16 @@ deny = ["Tibetan"]
rules: ConfigRules {
default: RuleSet {
allow: vec![
CharacterType::Block(tibetan_block),
CharacterType::Block(&crate::unicode_blocks::TIBETAN),
CharacterType::CodePoint('\u{9000}'),
],
deny: vec![CharacterType::Range(CharRange {
low: '\u{5000}',
high: '\u{5004}',
})],
deny: vec![CharacterType::Range('\u{5000}'..='\u{5004}')],
},
code_type_rules: HashMap::from([(
CodeType::StringLiteral,
RuleSet {
allow: vec![],
deny: vec![CharacterType::Block(tibetan_block)],
deny: vec![CharacterType::Block(&crate::unicode_blocks::TIBETAN)],
},
)]),
},
Expand Down
6 changes: 2 additions & 4 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ use unic_ucd_name::Name;

mod config;
mod rules;
mod unicode_blocks;

// Replaces the previous idea of "RuleChain"s.
struct RuleDispatcher {
Expand Down Expand Up @@ -188,13 +189,10 @@ fn get_user_config() -> anyhow::Result<Option<Config>> {
/// Comments and string literals allow all unicode except Bidi characters,
/// all other kinds of code deny all unicode.
fn get_default_config() -> Config {
let ascii = unic_ucd_block::BlockIter::new()
.find(|b| b.name == "Basic Latin")
.unwrap();
Config {
global: config::ConfigRules {
default: RuleSet {
allow: vec![rules::CharacterType::Block(ascii)],
allow: vec![rules::CharacterType::Block(&unicode_blocks::BASIC_LATIN)],
deny: vec![],
},
code_type_rules: [
Expand Down
10 changes: 5 additions & 5 deletions src/rules.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,11 @@ pub enum CharacterType {
/// Single character (eg. "U+9000")
CodePoint(char),
/// An inclusive range of characters (eg. "U+1400..U+1409")
Range(unic_char_range::CharRange),
Range(std::ops::RangeInclusive<char>),
/// All bidirectional control characters (right to left etc)
Bidi,
/// Named ranges of characters (eg. "Tibetan", "Box Drawing")
Block(unic_ucd_block::Block),
Block(&'static std::ops::RangeInclusive<char>),
/// Any possible character.
Anything,
}
Expand All @@ -58,14 +58,14 @@ impl CharacterType {
fn matches(&self, c: char) -> bool {
match self {
Self::CodePoint(rule_char) => *rule_char == c,
Self::Range(range) => range.contains(c),
Self::Range(range) => range.contains(&c),
Self::Bidi => [
// List of bidirectional formatting characters from https://en.wikipedia.org/wiki/Trojan_Source
'\u{202A}', '\u{202b}', '\u{202c}', '\u{202d}', '\u{202e}', '\u{2066}', '\u{2067}',
'\u{2068}', '\u{2069}',
]
.contains(&c),
Self::Block(block) => block.range.contains(c),
Self::Block(range) => range.contains(&c),
Self::Anything => true,
}
}
Expand All @@ -88,7 +88,7 @@ impl PartialEq for CharacterType {
(CodePoint(self_c), CodePoint(other_c)) => self_c == other_c,
(Range(self_r), Range(other_r)) => self_r == other_r,
(Bidi, Bidi) => true,
(Block(self_block), Block(other_block)) => self_block.name == other_block.name,
(Block(self_range), Block(other_range)) => self_range == other_range,
(Anything, Anything) => true,
_ => false,
}
Expand Down
Loading

0 comments on commit 1203f23

Please sign in to comment.