Skip to content

Commit

Permalink
chore(page): add box html bytes
Browse files: browse the repository at this point in the history
  • Loading branch information
j-mendez committed Nov 5, 2024
1 parent 9874686 commit eb68312
Show file tree
Hide file tree
Showing 11 changed files with 33 additions and 27 deletions.
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.12.7"
version = "2.12.8"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
15 changes: 9 additions & 6 deletions spider/src/page.rs
Original file line number | Diff line number | Diff line change
Expand Up @@ -123,7 +123,7 @@ pub struct AIResults {
#[cfg(not(feature = "decentralized"))]
pub struct Page {
/// The bytes of the resource.
html: Option<Bytes>,
html: Option<Box<Bytes>>,
/// Base absolute url for page.
base: Option<Url>,
/// The raw url for the page. Useful since Url::parse adds a trailing slash.
Expand Down Expand Up @@ -170,7 +170,7 @@ pub struct Page {
#[derive(Debug, Clone, Default)]
pub struct Page {
/// The bytes of the resource.
html: Option<Bytes>,
html: Option<Box<Bytes>>,
#[cfg(feature = "headers")]
/// The headers of the page request response.
pub headers: Option<reqwest::header::HeaderMap>,
Expand Down Expand Up @@ -318,10 +318,10 @@ pub fn get_page_selectors(url: &str, subdomains: bool, tld: bool) -> Option<Rela

#[cfg(not(feature = "decentralized"))]
/// Is the resource valid?
pub fn validate_empty(content: &Option<Bytes>, is_success: bool) -> bool {
pub fn validate_empty(content: &Option<Box<Bytes>>, is_success: bool) -> bool {
match content {
Some(ref content) => {
if content.is_empty() || content == "<html><head></head><body></body></html>" || is_success &&
if content.is_empty() || content.starts_with(b"<html><head></head><body></body></html>") || is_success &&
content.starts_with(b"<html>\r\n<head>\r\n<META NAME=\"robots\" CONTENT=\"noindex,nofollow\">\r\n<script src=\"/") &&
content.ends_with(b"\">\r\n</script>\r\n<body>\r\n</body></html>\r\n") {
false
Expand Down Expand Up @@ -743,7 +743,10 @@ impl Page {

/// Set the html directly of the page
pub fn set_html_bytes(&mut self, html: Option<Bytes>) {
self.html = html;
self.html = match html {
Some(html) => Some(Box::new(html)),
_ => None,
};
}

/// Set the url directly of the page. Useful for transforming the content and rewriting the url.
Expand Down Expand Up @@ -1585,7 +1588,7 @@ pub fn encode_bytes(html: &Bytes, label: &str) -> String {

/// Get the content with proper encoding. Pass in a proper encoding label like SHIFT_JIS.
#[cfg(feature = "encoding")]
pub fn get_html_encoded(html: &Option<Bytes>, label: &str) -> String {
pub fn get_html_encoded(html: &Option<Box<Bytes>>, label: &str) -> String {
match html.as_ref() {
Some(html) => encode_bytes(html, label),
_ => Default::default(),
Expand Down
6 changes: 3 additions & 3 deletions spider/src/utils/mod.rs
Original file line number | Diff line number | Diff line change
Expand Up @@ -180,7 +180,7 @@ async fn cf_handle(
#[derive(Debug, Default)]
pub struct PageResponse {
/// The page response resource.
pub content: Option<bytes::Bytes>,
pub content: Option<Box<bytes::Bytes>>,
#[cfg(feature = "headers")]
/// The headers of the response. (Always None if a webdriver protocol is used for fetching.).
pub headers: Option<reqwest::header::HeaderMap>,
Expand Down Expand Up @@ -1402,7 +1402,7 @@ pub async fn handle_response_bytes(
}
}

let mut content: Option<bytes::Bytes> = None;
let mut content: Option<Box<bytes::Bytes>> = None;

if !block_streaming {
let mut stream = res.bytes_stream();
Expand Down Expand Up @@ -1433,7 +1433,7 @@ pub async fn handle_response_bytes(
}
}

content.replace(data.into());
content.replace(Box::new(data.into()));
}

PageResponse {
Expand Down
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.12.7"
version = "2.12.8"
rust-version = "1.70"
authors = [
"j-mendez <[email protected]>"
Expand Down
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.12.7"
version = "2.12.8"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.12.7"
version = "2.12.8"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
5 changes: 4 additions & 1 deletion spider_transformations/src/html2xml.rs
Original file line number | Diff line number | Diff line change
Expand Up @@ -4,6 +4,7 @@ use html5ever::{parse_document, QualName};
use markup5ever::namespace_url;
use markup5ever::ns;
use spider::auto_encoder::auto_encode_bytes;
use spider::bytes::Bytes;
use spider::page::get_html_encoded;
use std::default::Default;
use std::error::Error;
Expand All @@ -16,8 +17,10 @@ pub fn convert_html_to_xml(
encoding: &Option<String>,
) -> Result<String, Box<dyn Error>> {
if encoding.is_some() {
let bytes: Box<Bytes> = Box::new(base_convert_xml(html, url, encoding)?.into());

Ok(get_html_encoded(
&Some(base_convert_xml(html, url, encoding)?.into()),
&Some(bytes),
&match encoding {
Some(encoding) => encoding,
_ => "UTF-8",
Expand Down
10 changes: 5 additions & 5 deletions spider_transformations/src/transformation/mod.rs
Original file line number | Diff line number | Diff line change
Expand Up @@ -50,7 +50,7 @@ mod tests {
let mut conf = content::TransformConfig::default();
let mut page_response = PageResponse::default();

page_response.content = Some(Bytes::from(markup));
page_response.content = Some(Bytes::from(markup).into());
let page = build(url, page_response);

conf.return_format = ReturnFormat::Markdown;
Expand Down Expand Up @@ -110,7 +110,7 @@ mod tests {
let mut conf = content::TransformConfig::default();
let mut page_response = PageResponse::default();
conf.return_format = ReturnFormat::XML;
page_response.content = Some(Bytes::from(markup));
page_response.content = Some(Bytes::from(markup).into());
let page = build(url, page_response);
let content = content::transform_content(&page, &conf, &None, &None, &None);
assert!(
Expand All @@ -128,7 +128,7 @@ mod tests {
let mut conf = content::TransformConfig::default();
let mut page_response = PageResponse::default();

page_response.content = Some(Bytes::from(markup));
page_response.content = Some(Bytes::from(markup).into());
let page = build(url, page_response);

conf.return_format = ReturnFormat::Markdown;
Expand All @@ -153,7 +153,7 @@ mod tests {
let mut conf = content::TransformConfig::default();
let mut page_response = PageResponse::default();

page_response.content = Some(Bytes::from(markup));
page_response.content = Some(Bytes::from(markup).into());
let page = build(url, page_response);

conf.return_format = ReturnFormat::Markdown;
Expand Down Expand Up @@ -183,7 +183,7 @@ mod tests {
let mut page_response = PageResponse::default();
let b = Bytes::from(data);

page_response.content = Some(b);
page_response.content = Some(b.into());

let page = build("https://example.com/example.pdf", page_response);

Expand Down
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.12.7"
version = "2.12.8"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "2.12.7"
version = "2.12.8"
authors = [
"j-mendez <[email protected]>"
]
Expand Down

0 comments on commit eb68312

Please sign in to comment.