Skip to content

Commit

Permalink
chore(transform): add dynamic rewriting html
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Oct 25, 2024
1 parent 6cd558d commit fe995e8
Show file tree
Hide file tree
Showing 8 changed files with 87 additions and 20 deletions.
13 changes: 7 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.10.22"
version = "2.10.23"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.10.22"
version = "2.10.23"
rust-version = "1.70"
authors = [
"j-mendez <[email protected]>"
Expand Down
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.10.22"
version = "2.10.23"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
3 changes: 2 additions & 1 deletion spider_transformations/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.10.22"
version = "2.10.23"
authors = [
"j-mendez <[email protected]>"
]
Expand All @@ -27,6 +27,7 @@ serde = { version = "1", features = ["derive"] }
fast_html2md = "0"
phf = "0.11"
phf_codegen = "0.11"
lol_html = { version = "1" }

[dependencies.spider]
version = "2"
Expand Down
81 changes: 73 additions & 8 deletions spider_transformations/src/transformation/content.rs
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,52 @@ pub fn aho_clean_markdown(html: &str) -> String {
}
}

/// Remove every element matching one of `tags` from the markup.
///
/// Each entry in `tags` is parsed as a `lol_html` CSS selector, and every matching
/// element is removed. Entries that do not parse as a selector are skipped instead
/// of being handed to `element!`, which unwraps the parse result and would panic.
///
/// Returns the rewritten markup, or an unchanged copy of `html` if rewriting fails.
pub fn clean_html_elements(html: &str, tags: Vec<&str>) -> String {
    use lol_html::{element, rewrite_str, RewriteStrSettings, Selector};

    match rewrite_str(
        html,
        RewriteStrSettings {
            element_content_handlers: tags
                .iter()
                // `element!` calls `.parse::<Selector>().unwrap()` internally, so
                // validate first: the list may contain caller-supplied values and a
                // malformed selector must not abort the whole transformation.
                .filter(|tag| tag.parse::<Selector>().is_ok())
                .map(|tag| {
                    element!(tag, |el| {
                        el.remove();
                        Ok(())
                    })
                })
                .collect(),
            ..RewriteStrSettings::default()
        },
    ) {
        Ok(rewritten) => rewritten,
        // Best effort: fall back to the untouched input when rewriting fails.
        Err(_) => html.into(),
    }
}

/// Build the static ignore list of html elements.
///
/// Each enabled flag on `config` contributes a fixed group of tag names:
/// `filter_images` adds `img` and `picture`, `filter_svg` adds `svg`, and
/// `main_content` adds `nav`, `footer` and `aside`. Groups are appended in that
/// order; the result is empty when no flag is set.
pub(crate) fn build_static_vector(config: &TransformConfig) -> Vec<&'static str> {
    // One row per config flag, paired with the tags that flag filters out.
    let groups: [(bool, &[&'static str]); 3] = [
        (config.filter_images, &["img", "picture"][..]),
        (config.filter_svg, &["svg"][..]),
        (config.main_content, &["nav", "footer", "aside"][..]),
    ];

    groups
        .iter()
        .filter(|(enabled, _)| *enabled)
        .flat_map(|(_, names)| names.iter().copied())
        .collect()
}

/// transform the content to markdown shortcut
pub fn transform_markdown(html: &str, commonmark: bool) -> String {
let mut tag_factory: HashMap<String, Box<dyn html2md::TagHandlerFactory>> = HashMap::new();
Expand Down Expand Up @@ -329,9 +375,26 @@ pub fn transform_content(
return base_html;
}

let return_format = c.return_format;
let url_parsed = res.get_url_parsed().as_ref();

let base_html = if c.return_format.eq(&ReturnFormat::CommonMark)
|| c.return_format.eq(&ReturnFormat::Markdown)
{
base_html
} else {
let mut ignore_list = build_static_vector(c);

if let Some(ignore) = ignore_tags {
ignore_list.extend(ignore.iter().map(|s| s.as_str()));
}

if ignore_list.is_empty() {
base_html
} else {
clean_html_elements(&base_html, ignore_list)
}
};

// process readability
let base_html = if c.readability {
match llm_readability::extractor::extract(
Expand All @@ -348,7 +411,7 @@ pub fn transform_content(
base_html
};

match return_format {
match c.return_format {
ReturnFormat::Raw | ReturnFormat::Bytes => base_html,
ReturnFormat::CommonMark => {
let mut tag_factory: HashMap<String, Box<dyn html2md::TagHandlerFactory>> =
Expand Down Expand Up @@ -455,12 +518,14 @@ pub fn transform_content(
super::text_extract::extract_text(&d)
}
ReturnFormat::XML => {
let target_url = match url_parsed {
Some(u) => u.to_string(),
_ => EXAMPLE_URL.to_string(),
};

if let Ok(xml) = convert_html_to_xml(&base_html.trim(), &target_url, &encoding) {
if let Ok(xml) = convert_html_to_xml(
&base_html.trim(),
&match url_parsed {
Some(u) => u.to_string(),
_ => EXAMPLE_URL.to_string(),
},
&encoding,
) {
xml
} else {
Default::default()
Expand Down
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.10.22"
version = "2.10.23"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "2.10.22"
version = "2.10.23"
authors = [
"j-mendez <[email protected]>"
]
Expand Down

0 comments on commit fe995e8

Please sign in to comment.