Skip to content

Commit

Permalink
chore(transformations): add filter svg markdown
Browse files: browse the repository at this point in the history
  • Loading branch information
j-mendez committed Oct 20, 2024
1 parent 7963e1e commit b3e1aea
Show file tree
Hide file tree
Showing 8 changed files with 19 additions and 12 deletions.
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.9.10"
version = "2.9.12"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.9.10"
version = "2.9.12"
rust-version = "1.70"
authors = [
"j-mendez <[email protected]>"
Expand Down
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.9.10"
version = "2.9.12"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.9.10"
version = "2.9.12"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
7 changes: 7 additions & 0 deletions spider_transformations/src/transformation/content.rs
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ pub struct TransformConfig {
pub return_format: ReturnFormat,
/// Filter Images.
pub filter_images: bool,
/// Trim the content for LLMs.
pub clean_html: bool
}

/// ignore tags for markdown transformation
Expand Down Expand Up @@ -332,15 +334,20 @@ pub fn transform_content(
tag_factory.insert(String::from("noscript"), tag.clone());

if filter_images {
tag_factory.insert(String::from("svg"), tag.clone());
tag_factory.insert(String::from("img"), tag.clone());
tag_factory.insert(String::from("picture"), tag.clone());
}

if c.clean_html {
html = clean_html(&html)
}

tag_factory.insert(String::from("iframe"), tag);

let html = html2md::parse_html_custom(&html, &tag_factory, false);
let html = aho_clean_markdown(&html);

html
}
ReturnFormat::Html2Text => match encoding {
Expand Down
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.9.10"
version = "2.9.12"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "2.9.10"
version = "2.9.12"
authors = [
"j-mendez <[email protected]>"
]
Expand Down

0 comments on commit b3e1aea

Please sign in to comment.