Skip to content

Commit

Permalink
chore(transforms): add optional ignore tags
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Oct 25, 2024
1 parent dccef3d commit 6cd558d
Show file tree
Hide file tree
Showing 9 changed files with 166 additions and 85 deletions.
192 changes: 123 additions & 69 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.10.21"
version = "2.10.22"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.10.21"
version = "2.10.22"
rust-version = "1.70"
authors = [
"j-mendez <[email protected]>"
Expand Down
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.10.21"
version = "2.10.22"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.10.21"
version = "2.10.22"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
31 changes: 29 additions & 2 deletions spider_transformations/src/transformation/content.rs
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ pub struct TransformConfig {
pub clean_html: bool,
/// Filter svgs.
pub filter_svg: bool,
/// Keep only the main content of the page by ignoring the `nav`, `footer`, and `aside` elements.
pub main_content: bool,
}

/// Select elements to show or hide using a CSS selector.
Expand Down Expand Up @@ -318,6 +320,7 @@ pub fn transform_content(
c: &TransformConfig,
encoding: &Option<String>,
selector_config: &Option<SelectorConfiguration>,
ignore_tags: &Option<Vec<String>>,
) -> String {
let base_html = get_html_with_selector(res, encoding, selector_config);

Expand All @@ -337,7 +340,6 @@ pub fn transform_content(
Some(u) => u,
_ => &EXAMPLE_URL,
},
&None,
) {
Ok(product) => product.content,
_ => base_html,
Expand Down Expand Up @@ -366,6 +368,18 @@ pub fn transform_content(
tag_factory.insert(String::from("svg"), tag.clone());
}

if c.main_content {
tag_factory.insert(String::from("nav"), tag.clone());
tag_factory.insert(String::from("footer"), tag.clone());
tag_factory.insert(String::from("aside"), tag.clone());
}

if let Some(ignore) = ignore_tags {
for ignore_tag_name in ignore {
tag_factory.insert(ignore_tag_name.into(), tag.clone());
}
}

let base_html = if c.clean_html {
clean_html(&base_html)
} else {
Expand Down Expand Up @@ -399,6 +413,18 @@ pub fn transform_content(
tag_factory.insert(String::from("svg"), tag.clone());
}

if c.main_content {
tag_factory.insert(String::from("nav"), tag.clone());
tag_factory.insert(String::from("footer"), tag.clone());
tag_factory.insert(String::from("aside"), tag.clone());
}

if let Some(ignore) = ignore_tags {
for ignore_tag_name in ignore {
tag_factory.insert(ignore_tag_name.into(), tag.clone());
}
}

let base_html = if c.clean_html {
clean_html(&base_html)
} else {
Expand Down Expand Up @@ -449,6 +475,7 @@ pub fn transform_content_to_bytes(
c: &TransformConfig,
encoding: &Option<String>,
selector_config: &Option<SelectorConfiguration>,
ignore_tags: &Option<Vec<String>>,
) -> Bytes {
if is_binary_file(res.get_html_bytes_u8()) {
let b = res.get_bytes();
Expand All @@ -458,7 +485,7 @@ pub fn transform_content_to_bytes(
Default::default()
}
} else {
let content = transform_content(res, c, encoding, selector_config);
let content = transform_content(res, c, encoding, selector_config, ignore_tags);
let b = content.as_bytes();
Bytes::copy_from_slice(b)
}
Expand Down
16 changes: 8 additions & 8 deletions spider_transformations/src/transformation/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ mod tests {

conf.return_format = ReturnFormat::Markdown;

let content = content::transform_content(&page, &conf, &None, &None);
let content = content::transform_content(&page, &conf, &None, &None, &None);

assert!(
content
Expand All @@ -65,7 +65,7 @@ mod tests {

conf.return_format = ReturnFormat::Html2Text;

let content = content::transform_content(&page, &conf, &None, &None);
let content = content::transform_content(&page, &conf, &None, &None, &None);

assert!(
content
Expand All @@ -76,7 +76,7 @@ mod tests {
conf.return_format = ReturnFormat::Bytes;
conf.readability = true;

let content = content::transform_content(&page, &conf, &None, &None);
let content = content::transform_content(&page, &conf, &None, &None, &None);

assert!(
content
Expand All @@ -85,7 +85,7 @@ mod tests {
);

conf.return_format = ReturnFormat::XML;
let content = content::transform_content(&page, &conf, &Some("UTF-8".into()), &None);
let content = content::transform_content(&page, &conf, &Some("UTF-8".into()), &None, &None);
assert!(
content
== r#"<html xmlns="http://www.w3.org/1999/xhtml" class="paper"><head>
Expand All @@ -112,7 +112,7 @@ mod tests {
conf.return_format = ReturnFormat::XML;
page_response.content = Some(Bytes::from(markup));
let page = build(url, page_response);
let content = content::transform_content(&page, &conf, &None, &None);
let content = content::transform_content(&page, &conf, &None, &None, &None);
assert!(
content
== r#"<!DOCTYPE html><html xmlns="http://www.w3.org/1999/xhtml"><head><meta charset="utf-8" /><title>Transform Test</title></head><body><h1>Fun is fun</h1><a href="https://spider.cloud">Spider Cloud</a><pre>The content is ready</pre><script><![CDATA[document.querySelector(&amp;quot;pre&amp;quot;)]]></script></body></html>"#,
Expand All @@ -137,7 +137,7 @@ mod tests {

select_config.root_selector = Some("pre".into());

let content = content::transform_content(&page, &conf, &None, &Some(select_config));
let content = content::transform_content(&page, &conf, &None, &Some(select_config), &None);

assert!(
content.contains(&"The content is ready"),
Expand All @@ -162,7 +162,7 @@ mod tests {

select_config.exclude_selector = Some("pre".into());

let content = content::transform_content(&page, &conf, &None, &Some(select_config));
let content = content::transform_content(&page, &conf, &None, &Some(select_config), &None);

assert!(
content.contains(&"Transform Test# Fun is fun\n[Spider Cloud](https://spider.cloud)"),
Expand All @@ -187,7 +187,7 @@ mod tests {

let page = build("https://example.com/example.pdf", page_response);

let content = content::transform_content(&page, &conf, &None, &None);
let content = content::transform_content(&page, &conf, &None, &None, &None);

assert!(content.is_empty(), "The tranform to markdown is invalid");
}
Expand Down
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.10.21"
version = "2.10.22"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "2.10.21"
version = "2.10.22"
authors = [
"j-mendez <[email protected]>"
]
Expand Down

0 comments on commit 6cd558d

Please sign in to comment.