From 12107734ef52c13a4f97652d44d9e44493425217 Mon Sep 17 00:00:00 2001 From: j-mendez Date: Wed, 30 Oct 2024 23:29:39 -0400 Subject: [PATCH] chore(transformations): fix filter markdown/commonmark --- Cargo.lock | 68 ++++++++++-------- spider/Cargo.toml | 2 +- spider_chrome/Cargo.toml | 2 +- spider_cli/Cargo.toml | 2 +- spider_transformations/Cargo.toml | 2 +- .../src/transformation/content.rs | 72 ++++--------------- spider_utils/Cargo.toml | 2 +- spider_worker/Cargo.toml | 2 +- 8 files changed, 59 insertions(+), 93 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1baf06df0..110c875c9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -530,6 +530,12 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + [[package]] name = "chardetng" version = "0.1.17" @@ -1237,9 +1243,9 @@ dependencies = [ [[package]] name = "fast_html2md" -version = "0.0.18" +version = "0.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b2bb7e4e60024aa3c6a35f0badb53ad12b939b6efa7b82f18e8305a3035a783" +checksum = "7881fb5986e805fab3fad1ba6101f4df265ab48adaca8069bcc40d43c61c08b4" dependencies = [ "auto_encoder", "html5ever", @@ -1247,6 +1253,7 @@ dependencies = [ "markup5ever_rcdom", "percent-encoding", "regex", + "url", ] [[package]] @@ -1910,7 +1917,7 @@ dependencies = [ "http 1.1.0", "hyper 1.5.0", "hyper-util", - "rustls 0.23.15", + "rustls 0.23.16", "rustls-native-certs", "rustls-pki-types", "tokio", @@ -1937,9 +1944,9 @@ dependencies = [ [[package]] name = "hyper-util" -version = "0.1.9" +version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41296eb09f183ac68eec06e03cdbea2e759633d4067b2f6552fc2e009bcad08b" +checksum = "df2dcfbe0677734ab2f3ffa7fa7bfd4706bfdc1ef393f2ee30184aed67e631b4" dependencies = [ "bytes", "futures-channel", @@ -2157,9 +2164,9 @@ checksum = "8e9489c2807c139ffd9c1794f4af0ebe86a828db53ecdc7fea2111d0fed085d1" [[package]] name = "libm" -version = "0.2.8" +version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" +checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa" [[package]] name = "libredox" @@ -3099,7 +3106,7 @@ dependencies = [ "quinn-proto", "quinn-udp", "rustc-hash 2.0.0", - "rustls 0.23.15", + "rustls 0.23.16", "socket2", "thiserror", "tokio", @@ -3116,7 +3123,7 @@ dependencies = [ "rand 0.8.5", "ring", "rustc-hash 2.0.0", - "rustls 0.23.15", + "rustls 0.23.16", "slab", "thiserror", "tinyvec", @@ -3125,10 +3132,11 @@ dependencies = [ [[package]] name = "quinn-udp" -version = "0.5.5" +version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fe68c2e9e1a1234e218683dbdf9f9dfcb094113c5ac2b938dfcb9bab4c4140b" +checksum = "e346e016eacfff12233c243718197ca12f148c84e1e84268a896699b41c71780" dependencies = [ + "cfg_aliases", "libc", "once_cell", "socket2", @@ -3333,9 +3341,9 @@ checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "reqwest" -version = "0.12.8" +version = "0.12.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f713147fbe92361e52392c73b8c9e48c04c6625bce969ef54dc901e58e042a7b" +checksum = "a77c62af46e79de0a562e1a9849205ffcb7fc1238876e9bd743357570e04046f" dependencies = [ "async-compression", "base64 0.22.1", @@ -3364,7 +3372,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "quinn", - "rustls 0.23.15", + "rustls 0.23.16", "rustls-native-certs", "rustls-pemfile", "rustls-pki-types", @@ -3495,9 +3503,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.37" +version = "0.38.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8acb788b847c24f28525660c4d7758620a7210875711f79e7f663cc152726811" +checksum = "aa260229e6538e52293eeb577aabd09945a09d6d9cc0fc550ed7529056c2e32a" dependencies = [ "bitflags 2.6.0", "errno", @@ -3522,9 +3530,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.15" +version = "0.23.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fbb44d7acc4e873d613422379f69f237a1b141928c02f6bc6ccfddddc2d7993" +checksum = "eee87ff5d9b36712a58574e12e9f0ea80f915a5b0ac518d322b24a465617925e" dependencies = [ "log", "once_cell", @@ -3711,18 +3719,18 @@ checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" [[package]] name = "serde" -version = "1.0.213" +version = "1.0.214" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ea7893ff5e2466df8d720bb615088341b295f849602c6956047f8f80f0e9bc1" +checksum = "f55c3193aca71c12ad7890f1785d2b73e1b9f63a0bbc353c08ef26fe03fc56b5" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.213" +version = "1.0.214" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e85ad2009c50b58e87caa8cd6dac16bdf511bbfb7af6c33df902396aa480fa5" +checksum = "de523f781f095e28fa605cdce0f8307e451cc0fd14e2eb4cd2e98a355b147766" dependencies = [ "proc-macro2", "quote", @@ -3909,7 +3917,7 @@ dependencies = [ [[package]] name = "spider" -version = "2.10.27" +version = "2.10.28" dependencies = [ "ahash", "async-openai", @@ -3970,7 +3978,7 @@ dependencies = [ [[package]] name = "spider_chrome" -version = "2.10.27" +version = "2.10.28" dependencies = [ "adblock", "async-tungstenite", @@ -4005,7 +4013,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "2.10.27" +version = "2.10.28" dependencies = [ "clap", "env_logger", @@ -4029,7 +4037,7 @@ dependencies = [ [[package]] name = "spider_transformations" -version = "2.10.27" +version = "2.10.28" dependencies = [ "aho-corasick", "fast_html2md", @@ -4051,7 +4059,7 @@ dependencies = [ [[package]] name = "spider_utils" -version = "2.10.27" +version = "2.10.28" dependencies = [ "indexmap 1.9.3", "serde", @@ -4063,7 +4071,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "2.10.27" +version = "2.10.28" dependencies = [ "env_logger", "lazy_static", @@ -4493,7 +4501,7 @@ version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" dependencies = [ - "rustls 0.23.15", + "rustls 0.23.16", "rustls-pki-types", "tokio", ] @@ -4799,7 +4807,7 @@ dependencies = [ "flate2", "log", "once_cell", - "rustls 0.23.15", + "rustls 0.23.16", "rustls-pki-types", "serde", "serde_json", diff --git a/spider/Cargo.toml b/spider/Cargo.toml index ea4839558..82fb116f1 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "2.10.27" +version = "2.10.28" authors = [ "j-mendez " ] diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml index 181d61602..b3cbea95c 100644 --- a/spider_chrome/Cargo.toml +++ b/spider_chrome/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_chrome" -version = "2.10.27" +version = "2.10.28" rust-version = "1.70" authors = [ "j-mendez " diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index 2e970af0b..1e7799451 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "2.10.27" +version = "2.10.28" authors = [ "j-mendez " ] diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml index 6086bdf21..f0354f945 100644 --- a/spider_transformations/Cargo.toml +++ b/spider_transformations/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_transformations" -version = "2.10.27" +version = "2.10.28" authors = [ "j-mendez " ] diff --git a/spider_transformations/src/transformation/content.rs b/spider_transformations/src/transformation/content.rs index 75f374637..1d0715dbc 100644 --- a/spider_transformations/src/transformation/content.rs +++ b/spider_transformations/src/transformation/content.rs @@ -1,6 +1,7 @@ use crate::html2xml::convert_html_to_xml; use aho_corasick::AhoCorasick; use html2md; +use html2md::ignore::IgnoreTagFactory; use phf::phf_set; use regex::Regex; use serde::{Deserialize, Deserializer}; @@ -122,24 +123,6 @@ pub struct SelectorConfiguration { pub exclude_selector: Option, } -/// ignore tags for markdown transformation -#[derive(Clone)] -pub struct IgnoreTagFactory; - -impl html2md::TagHandlerFactory for IgnoreTagFactory { - fn instantiate(&self) -> Box { - Box::new(self.clone()) - } -} - -impl html2md::TagHandler for IgnoreTagFactory { - fn handle(&mut self, _tag: &html2md::Handle, _printer: &mut html2md::StructuredPrinter) {} - fn after_handle(&mut self, _printer: &mut html2md::StructuredPrinter) {} - fn skip_descendants(&self) -> bool { - true - } -} - /// is the content html and safe for formatting. static HTML_TAGS: phf::Set<&'static [u8]> = phf_set! { b"" ] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index 061f84e6f..2199b5f27 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "2.10.27" +version = "2.10.28" authors = [ "j-mendez " ]