From b3e1aeaea2856eeb5d14a9c6fc82847447c26806 Mon Sep 17 00:00:00 2001 From: j-mendez Date: Sat, 19 Oct 2024 18:05:22 -0400 Subject: [PATCH] chore(transformations): add filter svg markdown --- Cargo.lock | 12 ++++++------ spider/Cargo.toml | 2 +- spider_chrome/Cargo.toml | 2 +- spider_cli/Cargo.toml | 2 +- spider_transformations/Cargo.toml | 2 +- spider_transformations/src/transformation/content.rs | 7 +++++++ spider_utils/Cargo.toml | 2 +- spider_worker/Cargo.toml | 2 +- 8 files changed, 19 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b2e8fd4eb..823a61f00 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3884,7 +3884,7 @@ dependencies = [ [[package]] name = "spider" -version = "2.9.10" +version = "2.9.12" dependencies = [ "ahash", "async-openai", @@ -3945,7 +3945,7 @@ dependencies = [ [[package]] name = "spider_chrome" -version = "2.9.10" +version = "2.9.12" dependencies = [ "adblock", "async-tungstenite", @@ -3980,7 +3980,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "2.9.10" +version = "2.9.12" dependencies = [ "clap", "env_logger", @@ -4004,7 +4004,7 @@ dependencies = [ [[package]] name = "spider_transformations" -version = "2.9.10" +version = "2.9.12" dependencies = [ "aho-corasick", "fast_html2md", @@ -4023,7 +4023,7 @@ dependencies = [ [[package]] name = "spider_utils" -version = "2.9.10" +version = "2.9.12" dependencies = [ "indexmap 1.9.3", "serde", @@ -4035,7 +4035,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "2.9.10" +version = "2.9.12" dependencies = [ "env_logger", "lazy_static", diff --git a/spider/Cargo.toml b/spider/Cargo.toml index ceef4f40d..41fe421d3 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "2.9.10" +version = "2.9.12" authors = [ "j-mendez " ] diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml index 47e4cf54d..7caae2da4 100644 --- a/spider_chrome/Cargo.toml +++ b/spider_chrome/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_chrome" -version = "2.9.10" +version = "2.9.12" rust-version = "1.70" authors = [ "j-mendez " diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index a2ed45696..189fb77fa 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "2.9.10" +version = "2.9.12" authors = [ "j-mendez " ] diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml index e3c4bcba5..ba7adc597 100644 --- a/spider_transformations/Cargo.toml +++ b/spider_transformations/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_transformations" -version = "2.9.10" +version = "2.9.12" authors = [ "j-mendez " ] diff --git a/spider_transformations/src/transformation/content.rs b/spider_transformations/src/transformation/content.rs index a0b34242e..e0b504db0 100644 --- a/spider_transformations/src/transformation/content.rs +++ b/spider_transformations/src/transformation/content.rs @@ -102,6 +102,8 @@ pub struct TransformConfig { pub return_format: ReturnFormat, /// Filter Images. pub filter_images: bool, + /// Trim the content for LLMs. + pub clean_html: bool } /// ignore tags for markdown transformation @@ -332,8 +334,12 @@ pub fn transform_content( tag_factory.insert(String::from("noscript"), tag.clone()); if filter_images { + tag_factory.insert(String::from("svg"), tag.clone()); tag_factory.insert(String::from("img"), tag.clone()); tag_factory.insert(String::from("picture"), tag.clone()); + } + + if c.clean_html { html = clean_html(&html) } @@ -341,6 +347,7 @@ pub fn transform_content( let html = html2md::parse_html_custom(&html, &tag_factory, false); let html = aho_clean_markdown(&html); + html } ReturnFormat::Html2Text => match encoding { diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml index 530176b81..9ca932a03 100644 --- a/spider_utils/Cargo.toml +++ b/spider_utils/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_utils" -version = "2.9.10" +version = "2.9.12" authors = [ "j-mendez " ] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index 991880950..94002168e 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "2.9.10" +version = "2.9.12" authors = [ "j-mendez " ]