diff --git a/Cargo.lock b/Cargo.lock index 4c845c8ad..5707393f0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3917,7 +3917,7 @@ dependencies = [ [[package]] name = "spider" -version = "2.12.7" +version = "2.12.8" dependencies = [ "ahash", "aho-corasick", @@ -3979,7 +3979,7 @@ dependencies = [ [[package]] name = "spider_chrome" -version = "2.12.7" +version = "2.12.8" dependencies = [ "adblock", "async-tungstenite", @@ -4014,7 +4014,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "2.12.7" +version = "2.12.8" dependencies = [ "clap", "env_logger", @@ -4038,7 +4038,7 @@ dependencies = [ [[package]] name = "spider_transformations" -version = "2.12.7" +version = "2.12.8" dependencies = [ "aho-corasick", "fast_html2md", @@ -4060,7 +4060,7 @@ dependencies = [ [[package]] name = "spider_utils" -version = "2.12.7" +version = "2.12.8" dependencies = [ "indexmap 1.9.3", "serde", @@ -4072,7 +4072,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "2.12.7" +version = "2.12.8" dependencies = [ "env_logger", "lazy_static", diff --git a/spider/Cargo.toml b/spider/Cargo.toml index 2a95edbdd..5d3685b51 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "2.12.7" +version = "2.12.8" authors = [ "j-mendez " ] diff --git a/spider/src/page.rs b/spider/src/page.rs index f21dfbea8..9b2ec2c40 100644 --- a/spider/src/page.rs +++ b/spider/src/page.rs @@ -123,7 +123,7 @@ pub struct AIResults { #[cfg(not(feature = "decentralized"))] pub struct Page { /// The bytes of the resource. - html: Option, + html: Option>, /// Base absolute url for page. base: Option, /// The raw url for the page. Useful since Url::parse adds a trailing slash. @@ -170,7 +170,7 @@ pub struct Page { #[derive(Debug, Clone, Default)] pub struct Page { /// The bytes of the resource. - html: Option, + html: Option>, #[cfg(feature = "headers")] /// The headers of the page request response. pub headers: Option, @@ -318,10 +318,10 @@ pub fn get_page_selectors(url: &str, subdomains: bool, tld: bool) -> Option, is_success: bool) -> bool { +pub fn validate_empty(content: &Option>, is_success: bool) -> bool { match content { Some(ref content) => { - if content.is_empty() || content == "" || is_success && + if content.is_empty() || content.starts_with(b"") || is_success && content.starts_with(b"\r\n\r\n\r\n\r\n\r\n\r\n") { false @@ -743,7 +743,10 @@ impl Page { /// Set the html directly of the page pub fn set_html_bytes(&mut self, html: Option) { - self.html = html; + self.html = match html { + Some(html) => Some(Box::new(html)), + _ => None, + }; } /// Set the url directly of the page. Useful for transforming the content and rewriting the url. @@ -1585,7 +1588,7 @@ pub fn encode_bytes(html: &Bytes, label: &str) -> String { /// Get the content with proper encoding. Pass in a proper encoding label like SHIFT_JIS. #[cfg(feature = "encoding")] -pub fn get_html_encoded(html: &Option, label: &str) -> String { +pub fn get_html_encoded(html: &Option>, label: &str) -> String { match html.as_ref() { Some(html) => encode_bytes(html, label), _ => Default::default(), diff --git a/spider/src/utils/mod.rs b/spider/src/utils/mod.rs index ffef62d02..72c8f0c4b 100644 --- a/spider/src/utils/mod.rs +++ b/spider/src/utils/mod.rs @@ -180,7 +180,7 @@ async fn cf_handle( #[derive(Debug, Default)] pub struct PageResponse { /// The page response resource. - pub content: Option, + pub content: Option>, #[cfg(feature = "headers")] /// The headers of the response. (Always None if a webdriver protocol is used for fetching.). pub headers: Option, @@ -1402,7 +1402,7 @@ pub async fn handle_response_bytes( } } - let mut content: Option = None; + let mut content: Option> = None; if !block_streaming { let mut stream = res.bytes_stream(); @@ -1433,7 +1433,7 @@ pub async fn handle_response_bytes( } } - content.replace(data.into()); + content.replace(Box::new(data.into())); } PageResponse { diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml index d19564ad8..2195edbad 100644 --- a/spider_chrome/Cargo.toml +++ b/spider_chrome/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_chrome" -version = "2.12.7" +version = "2.12.8" rust-version = "1.70" authors = [ "j-mendez " diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index 4bd9c0904..c8d590e00 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "2.12.7" +version = "2.12.8" authors = [ "j-mendez " ] diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml index 776a18ae5..084827b78 100644 --- a/spider_transformations/Cargo.toml +++ b/spider_transformations/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_transformations" -version = "2.12.7" +version = "2.12.8" authors = [ "j-mendez " ] diff --git a/spider_transformations/src/html2xml.rs b/spider_transformations/src/html2xml.rs index f2215b814..d5b7fa3b0 100644 --- a/spider_transformations/src/html2xml.rs +++ b/spider_transformations/src/html2xml.rs @@ -4,6 +4,7 @@ use html5ever::{parse_document, QualName}; use markup5ever::namespace_url; use markup5ever::ns; use spider::auto_encoder::auto_encode_bytes; +use spider::bytes::Bytes; use spider::page::get_html_encoded; use std::default::Default; use std::error::Error; @@ -16,8 +17,10 @@ pub fn convert_html_to_xml( encoding: &Option, ) -> Result> { if encoding.is_some() { + let bytes: Box = Box::new(base_convert_xml(html, url, encoding)?.into()); + Ok(get_html_encoded( - &Some(base_convert_xml(html, url, encoding)?.into()), + &Some(bytes), &match encoding { Some(encoding) => encoding, _ => "UTF-8", diff --git a/spider_transformations/src/transformation/mod.rs b/spider_transformations/src/transformation/mod.rs index f3d94cd3f..4d2d90dbf 100644 --- a/spider_transformations/src/transformation/mod.rs +++ b/spider_transformations/src/transformation/mod.rs @@ -50,7 +50,7 @@ mod tests { let mut conf = content::TransformConfig::default(); let mut page_response = PageResponse::default(); - page_response.content = Some(Bytes::from(markup)); + page_response.content = Some(Bytes::from(markup).into()); let page = build(url, page_response); conf.return_format = ReturnFormat::Markdown; @@ -110,7 +110,7 @@ mod tests { let mut conf = content::TransformConfig::default(); let mut page_response = PageResponse::default(); conf.return_format = ReturnFormat::XML; - page_response.content = Some(Bytes::from(markup)); + page_response.content = Some(Bytes::from(markup).into()); let page = build(url, page_response); let content = content::transform_content(&page, &conf, &None, &None, &None); assert!( @@ -128,7 +128,7 @@ mod tests { let mut conf = content::TransformConfig::default(); let mut page_response = PageResponse::default(); - page_response.content = Some(Bytes::from(markup)); + page_response.content = Some(Bytes::from(markup).into()); let page = build(url, page_response); conf.return_format = ReturnFormat::Markdown; @@ -153,7 +153,7 @@ mod tests { let mut conf = content::TransformConfig::default(); let mut page_response = PageResponse::default(); - page_response.content = Some(Bytes::from(markup)); + page_response.content = Some(Bytes::from(markup).into()); let page = build(url, page_response); conf.return_format = ReturnFormat::Markdown; @@ -183,7 +183,7 @@ mod tests { let mut page_response = PageResponse::default(); let b = Bytes::from(data); - page_response.content = Some(b); + page_response.content = Some(b.into()); let page = build("https://example.com/example.pdf", page_response); diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml index b4e559912..1cacaed07 100644 --- a/spider_utils/Cargo.toml +++ b/spider_utils/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_utils" -version = "2.12.7" +version = "2.12.8" authors = [ "j-mendez " ] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index 2ecf82995..29ad46d49 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "2.12.7" +version = "2.12.8" authors = [ "j-mendez " ]