Skip to content

Commit

Permalink
chore(transformations): fix filter markdown/commonmark
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Oct 31, 2024
1 parent 692c7b0 commit 1210773
Show file tree
Hide file tree
Showing 8 changed files with 59 additions and 93 deletions.
68 changes: 38 additions & 30 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.10.27"
version = "2.10.28"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.10.27"
version = "2.10.28"
rust-version = "1.70"
authors = [
"j-mendez <[email protected]>"
Expand Down
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.10.27"
version = "2.10.28"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.10.27"
version = "2.10.28"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
72 changes: 15 additions & 57 deletions spider_transformations/src/transformation/content.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use crate::html2xml::convert_html_to_xml;
use aho_corasick::AhoCorasick;
use html2md;
use html2md::ignore::IgnoreTagFactory;
use phf::phf_set;
use regex::Regex;
use serde::{Deserialize, Deserializer};
Expand Down Expand Up @@ -122,24 +123,6 @@ pub struct SelectorConfiguration {
pub exclude_selector: Option<String>,
}

/// Ignores tags during the markdown transformation.
///
/// This stateless unit type is both the factory and the handler it hands out.
#[derive(Clone)]
pub struct IgnoreTagFactory;

impl html2md::TagHandlerFactory for IgnoreTagFactory {
    /// Returns a boxed handler of this same type.
    fn instantiate(&self) -> Box<dyn html2md::TagHandler> {
        // The type carries no state, so a fresh value equals a clone of `self`.
        Box::new(IgnoreTagFactory)
    }
}

impl html2md::TagHandler for IgnoreTagFactory {
    /// Intentionally empty: nothing is written to the printer for the tag.
    fn handle(&mut self, _tag: &html2md::Handle, _printer: &mut html2md::StructuredPrinter) {}

    /// Intentionally empty: nothing is written to the printer afterwards.
    fn after_handle(&mut self, _printer: &mut html2md::StructuredPrinter) {}

    /// Always returns `true`.
    fn skip_descendants(&self) -> bool {
        true
    }
}

/// is the content html and safe for formatting.
static HTML_TAGS: phf::Set<&'static [u8]> = phf_set! {
b"<!doctype html",
Expand Down Expand Up @@ -375,13 +358,9 @@ pub fn transform_content(
return base_html;
}

let url_parsed = res.get_url_parsed().as_ref();
let url_parsed = res.get_url_parsed();

let base_html = if c.return_format.eq(&ReturnFormat::CommonMark)
|| c.return_format.eq(&ReturnFormat::Markdown)
{
base_html
} else {
let base_html = {
let mut ignore_list = build_static_vector(c);

if let Some(ignore) = ignore_tags {
Expand Down Expand Up @@ -422,21 +401,6 @@ pub fn transform_content(
tag_factory.insert(String::from("style"), tag.clone());
tag_factory.insert(String::from("noscript"), tag.clone());

if c.filter_images {
tag_factory.insert(String::from("img"), tag.clone());
tag_factory.insert(String::from("picture"), tag.clone());
}

if c.filter_svg {
tag_factory.insert(String::from("svg"), tag.clone());
}

if c.main_content {
tag_factory.insert(String::from("nav"), tag.clone());
tag_factory.insert(String::from("footer"), tag.clone());
tag_factory.insert(String::from("aside"), tag.clone());
}

if let Some(ignore) = ignore_tags {
for ignore_tag_name in ignore {
tag_factory.insert(ignore_tag_name.into(), tag.clone());
Expand All @@ -451,7 +415,12 @@ pub fn transform_content(

tag_factory.insert(String::from("iframe"), tag);

let html = html2md::parse_html_custom(&base_html.trim(), &tag_factory, true);
let html = html2md::parse_html_custom_with_url(
&base_html.trim(),
&tag_factory,
true,
&url_parsed,
);
let html = aho_clean_markdown(&html);

html
Expand All @@ -465,22 +434,6 @@ pub fn transform_content(
tag_factory.insert(String::from("script"), tag.clone());
tag_factory.insert(String::from("style"), tag.clone());
tag_factory.insert(String::from("noscript"), tag.clone());
tag_factory.insert(String::from("br"), tag.clone());

if c.filter_images {
tag_factory.insert(String::from("img"), tag.clone());
tag_factory.insert(String::from("picture"), tag.clone());
}

if c.filter_svg {
tag_factory.insert(String::from("svg"), tag.clone());
}

if c.main_content {
tag_factory.insert(String::from("nav"), tag.clone());
tag_factory.insert(String::from("footer"), tag.clone());
tag_factory.insert(String::from("aside"), tag.clone());
}

if let Some(ignore) = ignore_tags {
for ignore_tag_name in ignore {
Expand All @@ -496,7 +449,12 @@ pub fn transform_content(

tag_factory.insert(String::from("iframe"), tag);

let html = html2md::parse_html_custom(&base_html.trim(), &tag_factory, false);
let html = html2md::parse_html_custom_with_url(
&base_html.trim(),
&tag_factory,
false,
url_parsed,
);
let html = aho_clean_markdown(&html);

html
Expand Down
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.10.27"
version = "2.10.28"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
Loading

0 comments on commit 1210773

Please sign in to comment.