diff --git a/Cargo.lock b/Cargo.lock index a7df07a..ef74713 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,5 +1,7 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. +version = 3 + [[package]] name = "ahash" version = "0.3.8" @@ -84,6 +86,9 @@ name = "fst" version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d79238883cf0307100b90aba4a755d8051a3182305dfe7f649a1e9dc0517006f" +dependencies = [ + "utf8-ranges", +] [[package]] name = "getrandom" @@ -221,6 +226,12 @@ dependencies = [ "parking_lot", ] +[[package]] +name = "utf8-ranges" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ae116fef2b7fea257ed6440d3cfcff7f190865f170cdad00bb6465bf18ecba" + [[package]] name = "wasi" version = "0.10.2+wasi-snapshot-preview1" diff --git a/Cargo.toml b/Cargo.toml index 8e8ff80..fc679ae 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,28 +1,29 @@ [package] authors = ["Cole Lawrence "] +description = "English Wiktionary parsed for part-of-speech info and placed into a precompiled FST" edition = "2018" +include = ["src/**/*", "dist/*.fst", "Cargo.toml"] +license = "MIT OR Apache-2.0" name = "wiktionary-part-of-speech-extract" -description = "English Wiktionary parsed for part-of-speech info and placed into a precompiled FST" version = "0.1.0" -license = "MIT OR Apache-2.0" -include = ["src/**/*", "dist/*.fst", "Cargo.toml"] # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -regex = "1.4.5" -ustr = "0.7.0" fst = "0.4.5" -unidecode = "0.3.0" once_cell = "1.7.2" +regex = "1.4.5" +unidecode = "0.3.0" +ustr = "0.7.0" [[bin]] +bench = false name = "regenerate" path = "src/bin/regenerate/regenerate.rs" required-features = ["raw-masking"] test = false -bench = false # ... 
[features]
raw-masking = []
+spell-check = ["fst/levenshtein"]
diff --git a/src/tags.rs b/src/tags.rs
index 68a6786..c060c8e 100644
--- a/src/tags.rs
+++ b/src/tags.rs
@@ -145,6 +145,100 @@ impl<D: AsRef<[u8]>> TagsLookup<D> {
    }
}

+/// Fuzzy lookup of keys within a small edit distance, gated behind the `spell-check` feature.
+///
+/// TODO: Spell check may not work until we start retaining nouns in wiktionary
+#[cfg(feature = "spell-check")]
+pub mod spell_check {
+    use super::*;
+    use fst::automaton::{Levenshtein, LevenshteinError};
+    use std::fmt;
+
+    /// Maximum Levenshtein edit distance searched by `spellcheck`.
+    const MAX_EDIT_DISTANCE: u32 = 2;
+
+    /// A key found within the edit distance of the queried word, with its stored value.
+    #[derive(Clone, Debug, PartialEq, Eq)]
+    pub struct SpellCheckAlternative {
+        /// The matching key.
+        pub word: String,
+        /// The raw value stored for `word`.
+        pub mask: u64,
+    }
+
+    impl SpellCheckAlternative {
+        /// Get the spell check alternative's word.
+        pub fn word(&self) -> &str {
+            &self.word
+        }
+
+        /// Get the spell check alternative's [TagSet].
+        pub fn tag_set(&self) -> TagSet {
+            // NOTE(review): this cast keeps only the low 32 bits of the stored value;
+            // presumably tag masks never use the upper bits -- confirm against the generator.
+            TagSet::from_mask(self.mask as u32)
+        }
+    }
+
+    /// Errors that can occur while running a spell check query.
+    #[derive(Debug)]
+    pub enum SpellCheckError {
+        /// Building the Levenshtein automaton for the key failed.
+        LevenshteinError(LevenshteinError),
+    }
+
+    impl fmt::Display for SpellCheckError {
+        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+            match self {
+                SpellCheckError::LevenshteinError(err) => {
+                    write!(f, "failed to build Levenshtein automaton: {}", err)
+                }
+            }
+        }
+    }
+
+    impl std::error::Error for SpellCheckError {
+        fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+            match self {
+                SpellCheckError::LevenshteinError(err) => Some(err),
+            }
+        }
+    }
+
+    impl From<LevenshteinError> for SpellCheckError {
+        fn from(err: LevenshteinError) -> Self {
+            SpellCheckError::LevenshteinError(err)
+        }
+    }
+
+    impl<D: AsRef<[u8]>> TagsLookup<D> {
+        /// Find every stored key within a Levenshtein edit distance of 2 of `key`.
+        ///
+        /// # Errors
+        ///
+        /// Returns [`SpellCheckError::LevenshteinError`] if the automaton for `key` cannot be built.
+        pub fn spellcheck(&self, key: &str) -> Result<Vec<SpellCheckAlternative>, SpellCheckError> {
+            use fst::{IntoStreamer, Streamer};
+
+            // memoization by allowing the parent to pass in some sort of memoization controller?
+            let query = Levenshtein::new(key, MAX_EDIT_DISTANCE)?;
+            let mut stream = self.0.search(&query).into_stream();
+
+            let mut alternatives = Vec::new();
+            while let Some((word, mask)) = stream.next() {
+                alternatives.push(SpellCheckAlternative {
+                    // Checked conversion instead of `from_utf8_unchecked`: this does not rely on
+                    // an unstated invariant that every matched key is valid UTF-8.
+                    word: String::from_utf8_lossy(word).into_owned(),
+                    mask,
+                });
+            }
+
+            Ok(alternatives)
+        }
+    }
+}
+
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Tag {
    /// adj