From 2065600f2e81c76bfb4ef7cea594c45716baeed8 Mon Sep 17 00:00:00 2001 From: j-mendez Date: Tue, 22 Oct 2024 06:33:51 -0400 Subject: [PATCH] chore(encoding): add html lang auto parsing --- .gitignore | 3 +- Cargo.lock | 32 +- spider/Cargo.toml | 2 +- .../src/packages/scraper/element_ref/mod.rs | 7 +- spider/src/packages/scraper/html/mod.rs | 39 +- spider/src/page.rs | 477 +++++++++--------- spider_chrome/Cargo.toml | 2 +- spider_cli/Cargo.toml | 2 +- spider_transformations/Cargo.toml | 2 +- .../src/transformation/content.rs | 82 ++- .../src/transformation/mod.rs | 64 ++- spider_utils/Cargo.toml | 2 +- spider_worker/Cargo.toml | 2 +- 13 files changed, 438 insertions(+), 278 deletions(-) diff --git a/.gitignore b/.gitignore index 6ee28b21b..ef32a5ed2 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ target _temp_spider_downloads storage http-cacache -release.sh \ No newline at end of file +release.sh +spider_transformations/example.pdf \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 2e2820cee..9bfa5050e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -285,9 +285,9 @@ checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" [[package]] name = "auto_encoder" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8cfd944c56af79853532412fe19a63faa0cca5e0872ce3de7970dd28fd63dcc1" +checksum = "8e4e9a5bc6b139af3161c73fb788de27bf1cf1fbe3f5e27b43804a9cd85a000e" dependencies = [ "chardetng", "encoding_rs", @@ -454,9 +454,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.7.2" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "428d9aa8fbc0670b7b8d6030a7fadd0f86151cae55e4dbbece15f3780a3dfaf3" +checksum = "9ac0150caa2ae65ca5bd83f25c7de183dea78d4d366469f148435e2acfbad0da" dependencies = [ "serde", ] @@ -3695,18 +3695,18 @@ checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" [[package]] name = "serde" -version = "1.0.210" +version = "1.0.211" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a" +checksum = "1ac55e59090389fb9f0dd9e0f3c09615afed1d19094284d0b200441f13550793" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.210" +version = "1.0.211" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" +checksum = "54be4f245ce16bc58d57ef2716271d0d4519e0f6defa147f6e081005bcb278ff" dependencies = [ "proc-macro2", "quote", @@ -3884,7 +3884,7 @@ dependencies = [ [[package]] name = "spider" -version = "2.9.17" +version = "2.10.0" dependencies = [ "ahash", "async-openai", @@ -3945,7 +3945,7 @@ dependencies = [ [[package]] name = "spider_chrome" -version = "2.9.17" +version = "2.10.0" dependencies = [ "adblock", "async-tungstenite", @@ -3980,7 +3980,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "2.9.17" +version = "2.10.0" dependencies = [ "clap", "env_logger", @@ -4004,7 +4004,7 @@ dependencies = [ [[package]] name = "spider_transformations" -version = "2.9.17" +version = "2.10.0" dependencies = [ "aho-corasick", "fast_html2md", @@ -4025,7 +4025,7 @@ dependencies = [ [[package]] name = "spider_utils" -version = "2.9.17" +version = "2.10.0" dependencies = [ "indexmap 1.9.3", "serde", @@ -4037,7 +4037,7 @@ dependencies = [ [[package]] 
name = "spider_worker" -version = "2.9.17" +version = "2.10.0" dependencies = [ "env_logger", "lazy_static", @@ -4413,9 +4413,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.40.0" +version = "1.41.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2b070231665d27ad9ec9b8df639893f46727666c6767db40317fbe920a5d998" +checksum = "145f3413504347a2be84393cc8a7d2fb4d863b375909ea59f2158261aa258bbb" dependencies = [ "backtrace", "bytes", diff --git a/spider/Cargo.toml b/spider/Cargo.toml index cd15292f6..14d5c0ffe 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "2.9.17" +version = "2.10.0" authors = [ "j-mendez " ] diff --git a/spider/src/packages/scraper/element_ref/mod.rs b/spider/src/packages/scraper/element_ref/mod.rs index 255bea866..78faf3b4c 100644 --- a/spider/src/packages/scraper/element_ref/mod.rs +++ b/spider/src/packages/scraper/element_ref/mod.rs @@ -17,11 +17,13 @@ use crate::packages::scraper::selector::Selector; #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct ElementRef<'a> { node: NodeRef<'a, Node>, + /// The language of the element. Not used atm. + pub lang: &'a str, } impl<'a> ElementRef<'a> { fn new(node: NodeRef<'a, Node>) -> Self { - ElementRef { node } + ElementRef { node, lang: "" } } /// Wraps a `NodeRef` only if it references a `Node::Element`. @@ -60,7 +62,8 @@ impl<'a> ElementRef<'a> { match serialize(&mut buf, self, opts) { _ => (), }; - crate::page::encode_bytes_from_language(&buf, "") + // we need to get the initial encoding of the html lang if used. + crate::page::encode_bytes_from_language(&buf, self.lang) } /// Returns the HTML of this element. diff --git a/spider/src/packages/scraper/html/mod.rs b/spider/src/packages/scraper/html/mod.rs index 3bc878d8d..229be240c 100644 --- a/spider/src/packages/scraper/html/mod.rs +++ b/spider/src/packages/scraper/html/mod.rs @@ -1,7 +1,7 @@ //! HTML documents and fragments. use ego_tree::iter::Nodes; -use ego_tree::Tree; +use ego_tree::{NodeId, Tree}; use fast_html5ever::serialize::SerializeOpts; use fast_html5ever::tree_builder::QuirksMode; use fast_html5ever::QualName; @@ -12,6 +12,10 @@ use crate::packages::scraper::element_ref::ElementRef; use crate::packages::scraper::node::Node; use crate::packages::scraper::selector::Selector; +lazy_static! { + static ref HTML_SELECTOR: Selector = Selector::parse("html").unwrap(); +} + /// An HTML tree. /// /// Parsing does not fail hard. Instead, the `quirks_mode` is set and errors are added to the @@ -22,9 +26,10 @@ use crate::packages::scraper::selector::Selector; pub struct Html { /// The quirks mode. pub quirks_mode: QuirksMode, - /// The node tree. pub tree: Tree, + /// The html language of the document. + pub lang: String, } impl Html { @@ -33,6 +38,7 @@ impl Html { Html { quirks_mode: QuirksMode::NoQuirks, tree: Tree::new(Node::Document), + lang: Default::default(), } } @@ -41,6 +47,7 @@ impl Html { Html { quirks_mode: QuirksMode::NoQuirks, tree: Tree::new(Node::Fragment), + lang: Default::default(), } } @@ -96,6 +103,25 @@ impl Html { ElementRef::wrap(root_node).unwrap() } + /// Set the html language of the document by getting the lang attr + pub fn set_language(&mut self, lang: String) { + self.lang = lang; + } + + /// Get the language for the page. 
+ pub fn get_lang(&self) -> &str { + if self.lang.is_empty() { + if let Some(element) = self.select(&HTML_SELECTOR).next() { + if let Some(lang) = element.value().attr("lang") { + return lang; + } + } + &self.lang + } else { + &self.lang + } + } + /// Serialize entire document into HTML. pub fn html(&self) -> String { let opts = SerializeOpts { @@ -107,7 +133,14 @@ impl Html { match serialize(&mut buf, self, opts) { _ => (), }; - crate::page::encode_bytes_from_language(&buf, "") + crate::page::encode_bytes_from_language(&buf, self.get_lang()) + } + + /// Find and remove a node + pub fn remove_node(&mut self, node_id: NodeId) { + if let Some(mut node) = self.tree.get_mut(node_id) { + node.detach(); + } } } diff --git a/spider/src/page.rs b/spider/src/page.rs index c22bda2b6..87a7491eb 100644 --- a/spider/src/page.rs +++ b/spider/src/page.rs @@ -972,35 +972,37 @@ impl Page { ) -> HashSet { let mut map = HashSet::new(); - if html.starts_with(" { - self.push_link( - href, - &mut map, - &selectors.0, - parent_host, - parent_host_scheme, - base_input_domain, - ); - } - _ => (), - }; + if !html.is_empty() { + if html.starts_with(" { + self.push_link( + href, + &mut map, + &selectors.0, + parent_host, + parent_host_scheme, + base_input_domain, + ); + } + _ => (), + }; + } } } } @@ -1040,61 +1042,62 @@ impl Page { let mut map = HashSet::new(); let html = self.get_html(); - if html.starts_with(" { - if src.starts_with("/") { - if src.starts_with("/_next/static/chunks/pages/") - || src.starts_with("/webpack-runtime-") - || element.attr("id") == Some("gatsby-chunk-mapping") - { - static_app = true; - continue; - } + if !html.is_empty() { + if html.starts_with(" { + if src.starts_with("/") { + if src.starts_with("/_next/static/chunks/pages/") + || src.starts_with("/webpack-runtime-") + || element.attr("id") == Some("gatsby-chunk-mapping") + { + static_app = true; + continue; + } - match self.abs_path(src) { - Some(abs) => { - match abs - .path_segments() - .ok_or_else(|| "cannot be base") - { - Ok(mut paths) => { - while let Some(p) = paths.next() { - // todo: get the path last before None instead of checking for ends_with - if p.ends_with(".js") - && JS_FRAMEWORK_ASSETS.contains(&p) - { - rerender = true; - } else { - match node.as_text() { - Some(text) => { - lazy_static! { - static ref DOM_WATCH_METHODS: regex::RegexSet = { - let set = unsafe { - regex::RegexSet::new(&[ + match self.abs_path(src) { + Some(abs) => { + match abs + .path_segments() + .ok_or_else(|| "cannot be base") + { + Ok(mut paths) => { + while let Some(p) = paths.next() { + // todo: get the path last before None instead of checking for ends_with + if p.ends_with(".js") + && JS_FRAMEWORK_ASSETS.contains(&p) + { + rerender = true; + } else { + match node.as_text() { + Some(text) => { + lazy_static! { + static ref DOM_WATCH_METHODS: regex::RegexSet = { + let set = unsafe { + regex::RegexSet::new(&[ r"/.createElementNS/gm", r"/.removeChild/gm", r"/.insertBefore/gm", @@ -1108,34 +1111,35 @@ impl Page { r"/.write/gm", ]) .unwrap_unchecked() - }; + }; - set - }; + set + }; + } + rerender = + DOM_WATCH_METHODS + .is_match(text); } - rerender = DOM_WATCH_METHODS - .is_match(text); + _ => (), } - _ => (), } } } - } - _ => (), - }; - - if rerender { - // we should re-use the html content instead with events. 
- let uu = self.get_html(); - let browser = browser.to_owned(); - let configuration = configuration.clone(); - let target_url = self.url.clone(); - let context_id = context_id.clone(); - let parent_host = parent_host.clone(); - - tokio::task::spawn(async move { - // we need to use about:blank here since we set the HTML content directly - match crate::features::chrome::attempt_navigation("about:blank", &browser, &configuration.request_timeout, &context_id).await { + _ => (), + }; + + if rerender { + // we should re-use the html content instead with events. + let uu = self.get_html(); + let browser = browser.to_owned(); + let configuration = configuration.clone(); + let target_url = self.url.clone(); + let context_id = context_id.clone(); + let parent_host = parent_host.clone(); + + tokio::task::spawn(async move { + // we need to use about:blank here since we set the HTML content directly + match crate::features::chrome::attempt_navigation("about:blank", &browser, &configuration.request_timeout, &context_id).await { Ok(new_page) => { let intercept_handle = crate::features::chrome::setup_chrome_interception_base( @@ -1196,86 +1200,89 @@ impl Page { } _ => (), } - }); + }); - break; + break; + } } + _ => (), } - _ => (), } } + _ => (), } - _ => (), } - } - - if element_name == "a" { - // add fullresources? - match element.attr("href") { - Some(href) => match self.abs_path(href) { - Some(mut abs) => { - let host_name = abs.host_str(); - let mut can_process = parent_host_match( - host_name, - &base_domain, - parent_host, - base_input_domain, - ); - if can_process { - if abs.scheme() != parent_host_scheme.as_str() { - let _ = abs.set_scheme(parent_host_scheme.as_str()); - } - let hchars = abs.path(); - - if let Some(position) = hchars.rfind('.') { - let resource_ext = &hchars[position + 1..hchars.len()]; - - if !ONLY_RESOURCES.contains::( - &resource_ext.into(), - ) { - can_process = false; + if element_name == "a" { + // add fullresources? 
+ match element.attr("href") { + Some(href) => match self.abs_path(href) { + Some(mut abs) => { + let host_name = abs.host_str(); + let mut can_process = parent_host_match( + host_name, + &base_domain, + parent_host, + base_input_domain, + ); + + if can_process { + if abs.scheme() != parent_host_scheme.as_str() { + let _ = abs.set_scheme(parent_host_scheme.as_str()); + } + let hchars = abs.path(); + + if let Some(position) = hchars.rfind('.') { + let resource_ext = + &hchars[position + 1..hchars.len()]; + + if !ONLY_RESOURCES + .contains::( + &resource_ext.into(), + ) + { + can_process = false; + } } - } - if can_process - && (base_domain.is_empty() - || base_domain.as_str() == domain_name(&abs)) - { - map.insert(abs.as_str().to_string().into()); + if can_process + && (base_domain.is_empty() + || base_domain.as_str() == domain_name(&abs)) + { + map.insert(abs.as_str().to_string().into()); + } } } - } + _ => (), + }, _ => (), - }, - _ => (), - }; + }; + } } } - } - if rerender { - drop(stream); - match rx.await { - Ok(v) => { - let extended_map = self - .links_stream_base::( - selectors, - &match v.content { - Some(h) => String::from_utf8_lossy(&h).to_string(), - _ => Default::default(), - }, - ) - .await; - map.extend(extended_map) - } - Err(e) => { - crate::utils::log("receiver error", e.to_string()); - } - }; + if rerender { + drop(stream); + match rx.await { + Ok(v) => { + let extended_map = self + .links_stream_base::( + selectors, + &match v.content { + Some(h) => String::from_utf8_lossy(&h).to_string(), + _ => Default::default(), + }, + ) + .await; + map.extend(extended_map) + } + Err(e) => { + crate::utils::log("receiver error", e.to_string()); + } + }; + } } } - map } @@ -1288,80 +1295,82 @@ impl Page { let mut map = HashSet::new(); let html = self.get_html(); - if html.starts_with(" match self.abs_path(href) { - Some(mut abs) => { - let host_name = abs.host_str(); - let mut can_process = parent_host_match( - host_name, - base_domain, - parent_host, - base_input_domain, - ); - - let mut external_domain = false; - - if !can_process - && host_name.is_some() - && !self.external_domains_caseless.is_empty() - { - can_process = self - .external_domains_caseless - .contains::( - &host_name.unwrap_or_default().into(), - ) || self - .external_domains_caseless - .contains::( - &CASELESS_WILD_CARD, + match element.attr(ele_attribute) { + Some(href) => match self.abs_path(href) { + Some(mut abs) => { + let host_name = abs.host_str(); + let mut can_process = parent_host_match( + host_name, + base_domain, + parent_host, + base_input_domain, ); - external_domain = can_process; - } - if can_process { - if abs.scheme() != parent_host_scheme.as_str() { - let _ = abs.set_scheme(parent_host_scheme.as_str()); + let mut external_domain = false; + + if !can_process + && host_name.is_some() + && !self.external_domains_caseless.is_empty() + { + can_process = self + .external_domains_caseless + .contains::( + &host_name.unwrap_or_default().into(), + ) || self + .external_domains_caseless + .contains::( + &CASELESS_WILD_CARD, + ); + external_domain = can_process; } - let h = abs.as_str(); + if can_process { + if abs.scheme() != parent_host_scheme.as_str() { + let _ = abs.set_scheme(parent_host_scheme.as_str()); + } - if can_process - && (base_domain.is_empty() - || external_domain - || base_domain.as_str() == domain_name(&abs)) - { - map.insert(h.to_string().into()); + let h = abs.as_str(); + + if can_process + && (base_domain.is_empty() + || external_domain + || base_domain.as_str() == domain_name(&abs)) + 
{ + map.insert(h.to_string().into()); + } } } - } + _ => (), + }, _ => (), - }, - _ => (), - }; + }; + } } } } @@ -1475,12 +1484,10 @@ impl Page { } /// Get the content with proper encoding. Pass in a proper encoding label like SHIFT_JIS. -#[cfg(feature = "encoding")] -fn encode_bytes(html: &Bytes, label: &str) -> String { +pub fn encode_bytes(html: &Bytes, label: &str) -> String { auto_encoder::encode_bytes(html, label) } -#[cfg(feature = "encoding")] /// Get the content with proper encoding from a language. Pass in a proper language like "jp". This does nothing without the "encoding" flag. pub fn encode_bytes_from_language(html: &[u8], language: &str) -> String { auto_encoder::encode_bytes_from_language(html, language) diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml index d42231711..7bfdef240 100644 --- a/spider_chrome/Cargo.toml +++ b/spider_chrome/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_chrome" -version = "2.9.17" +version = "2.10.0" rust-version = "1.70" authors = [ "j-mendez " diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index f45ef217e..2056b62a8 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "2.9.17" +version = "2.10.0" authors = [ "j-mendez " ] diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml index d7a1055a1..ede4066e3 100644 --- a/spider_transformations/Cargo.toml +++ b/spider_transformations/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_transformations" -version = "2.9.17" +version = "2.10.0" authors = [ "j-mendez " ] diff --git a/spider_transformations/src/transformation/content.rs b/spider_transformations/src/transformation/content.rs index c2ee9ea00..1aba2e687 100644 --- a/spider_transformations/src/transformation/content.rs +++ b/spider_transformations/src/transformation/content.rs @@ -4,6 +4,8 @@ use html2md; use phf::phf_set; use regex::Regex; use serde::{Deserialize, Deserializer}; +use spider::auto_encoder::is_binary_file; +use spider::bytes::Bytes; use spider::lazy_static::lazy_static; use spider::packages::scraper::Html; use spider::packages::scraper::{ElementRef, Selector}; @@ -107,6 +109,15 @@ pub struct TransformConfig { pub clean_html: bool, } +/// Select elements to show or hide using a CSS selector. +#[derive(Debug, Default, Clone)] +pub struct SelectorConfiguration { + /// The root html selector. + pub root_selector: Option, + /// Exclude the matching css selector from the output. 
+ pub exclude_selector: Option, +} + /// ignore tags for markdown transformation #[derive(Clone)] pub struct IgnoreTagFactory; @@ -259,22 +270,42 @@ fn get_html(res: &Page, encoding: &Option) -> String { fn get_html_with_selector( res: &Page, encoding: &Option, - root_selector: Option<&String>, + selector_config: &Option, ) -> String { let html = get_html(&res, &encoding); - if let Some(selector) = root_selector { - if let Ok(parsed_selector) = Selector::parse(selector) { - let fragment = Html::parse_fragment(&html); - let root_element = fragment.select(&parsed_selector).next(); - if let Some(root_node) = root_element { - let content = root_node.html(); - if !content.is_empty() { - return content; + if let Some(selector_config) = selector_config.as_ref() { + let mut fragment = Html::parse_fragment(&html); + + if let Some(selector) = selector_config.root_selector.as_ref() { + if let Ok(parsed_selector) = Selector::parse(&selector) { + if let Some(root_node) = fragment.select(&parsed_selector).next() { + if selector_config.exclude_selector.is_some() { + fragment.clone_from(&Html::parse_fragment(&root_node.html())); + } else { + // return the direct html found + return root_node.html(); + } } } } - }; + + if let Some(exclude_selector) = selector_config.exclude_selector.as_ref() { + if let Ok(exclude_sel) = Selector::parse(&exclude_selector) { + let mut elements_to_remove = vec![]; + + for elem in fragment.root_element().select(&exclude_sel) { + elements_to_remove.push(elem.id()); + } + + for id in elements_to_remove { + fragment.remove_node(id); + } + } + } + + return fragment.root_element().html(); + } html } @@ -284,12 +315,18 @@ pub fn transform_content( res: &Page, c: &TransformConfig, encoding: &Option, - root_selector: &Option, + selector_config: &Option, ) -> String { + // prevent transforming binary files or re-encoding it + if is_binary_file(res.get_html_bytes_u8()) { + return Default::default(); + } + let return_format = c.return_format; let filter_images = c.filter_images; + let url_parsed = res.get_url_parsed().as_ref(); - let base_html = get_html_with_selector(res, encoding, root_selector.as_ref()); + let base_html = get_html_with_selector(res, encoding, selector_config); // process readability let base_html = if c.readability { @@ -397,3 +434,24 @@ pub fn transform_content( } } } + +/// transform the content to bytes to prevent loss of precision. 
+pub fn transform_content_to_bytes( + res: &Page, + c: &TransformConfig, + encoding: &Option, + selector_config: &Option, +) -> Bytes { + if is_binary_file(res.get_html_bytes_u8()) { + let b = res.get_bytes(); + if let Some(b) = b { + b.clone() + } else { + Default::default() + } + } else { + let content = transform_content(res, c, encoding, selector_config); + let b = content.as_bytes(); + Bytes::copy_from_slice(b) + } +} diff --git a/spider_transformations/src/transformation/mod.rs b/spider_transformations/src/transformation/mod.rs index 1df07a265..45c3d05f5 100644 --- a/spider_transformations/src/transformation/mod.rs +++ b/spider_transformations/src/transformation/mod.rs @@ -7,9 +7,16 @@ pub mod text_extract; #[cfg(test)] mod tests { - use crate::transformation::content::{self, ReturnFormat}; + use std::vec; + + use crate::transformation::content::{self, ReturnFormat, SelectorConfiguration}; use maud::PreEscaped; - use spider::{bytes::Bytes, page::build, utils::PageResponse}; + use spider::{ + bytes::Bytes, + page::build, + tokio::{self, fs::File}, + utils::PageResponse, + }; /// the template to re-use fn template() -> PreEscaped { @@ -126,11 +133,62 @@ mod tests { conf.return_format = ReturnFormat::Markdown; - let content = content::transform_content(&page, &conf, &None, &Some("pre".into())); + let mut select_config = SelectorConfiguration::default(); + + select_config.root_selector = Some("pre".into()); + + let content = content::transform_content(&page, &conf, &None, &Some(select_config)); assert!( content.contains(&"The content is ready"), "The tranform to markdown is invalid" ); } + + #[test] + fn test_transformations_exclude_selector() { + let markup = template().into_string(); + let url = "https://spider.cloud"; + + let mut conf = content::TransformConfig::default(); + let mut page_response = PageResponse::default(); + + page_response.content = Some(Bytes::from(markup)); + let page = build(url, page_response); + + conf.return_format = ReturnFormat::Markdown; + + let mut select_config = SelectorConfiguration::default(); + + select_config.exclude_selector = Some("pre".into()); + + let content = content::transform_content(&page, &conf, &None, &Some(select_config)); + + assert!( + content.contains(&"Transform Test# Fun is fun\n[Spider Cloud](https://spider.cloud)"), + "The transform to markdown is invalid" + ); + } + + #[ignore] + #[tokio::test] + async fn test_transformations_pdf_handling() { + use spider::tokio::io::AsyncReadExt; + let mut f = File::open("./example.pdf").await.unwrap(); + let mut data = vec![]; + f.read_to_end(&mut data).await.unwrap(); + + let mut conf = content::TransformConfig::default(); + conf.return_format = ReturnFormat::XML; + let mut page_response = PageResponse::default(); + let b = Bytes::from(data); + + page_response.content = Some(b); + + let page = build("https://example.com/example.pdf", page_response); + + let content = content::transform_content(&page, &conf, &None, &None); + + assert!(content.is_empty(), "The transform should be empty for binary content"); + } } diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml index 63732b5dd..9e1c72c58 100644 --- a/spider_utils/Cargo.toml +++ b/spider_utils/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_utils" -version = "2.9.17" +version = "2.10.0" authors = [ "j-mendez " ] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index f1940b49e..6e93f6e98 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "2.9.17" 
+version = "2.10.0" authors = [ "j-mendez " ]
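Usage sketch: the new SelectorConfiguration flow introduced in spider_transformations, mirroring the tests added above. This is a minimal sketch, not part of the patch itself; the markup, URL, and selector values are placeholders.

use spider::bytes::Bytes;
use spider::page::build;
use spider::utils::PageResponse;
use spider_transformations::transformation::content::{self, ReturnFormat, SelectorConfiguration};

fn main() {
    // Placeholder markup; the lang attribute feeds the new html lang auto parsing
    // so the output is encoded with the matching charset.
    let markup = r#"<html lang="ja"><body><pre>content</pre><div class="ads">skip</div></body></html>"#;

    let mut page_response = PageResponse::default();
    page_response.content = Some(Bytes::from(markup));
    let page = build("https://example.com", page_response);

    let mut conf = content::TransformConfig::default();
    conf.return_format = ReturnFormat::Markdown;

    // Scope the output to a root selector and drop anything matching the exclude selector.
    let mut select_config = SelectorConfiguration::default();
    select_config.root_selector = Some("body".into());
    select_config.exclude_selector = Some(".ads".into());

    let content = content::transform_content(&page, &conf, &None, &Some(select_config));
    println!("{content}");
}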