diff --git a/Cargo.lock b/Cargo.lock
index a039d0189..0b8c9e0a2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2223,9 +2223,9 @@ checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
 
 [[package]]
 name = "lol_html"
-version = "1.2.1"
+version = "2.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a4629ff9c2deeb7aad9b2d0f379fc41937a02f3b739f007732c46af40339dee5"
+checksum = "964b47c14635e111f7efddcd8f1f8794195f66225fef19822fa942b217a859cf"
 dependencies = [
  "bitflags 2.6.0",
  "cfg-if",
@@ -3917,7 +3917,7 @@ dependencies = [
 
 [[package]]
 name = "spider"
-version = "2.11.20"
+version = "2.12.4"
 dependencies = [
  "ahash",
  "async-openai",
@@ -3978,7 +3978,7 @@ dependencies = [
 
 [[package]]
 name = "spider_chrome"
-version = "2.11.20"
+version = "2.12.4"
 dependencies = [
  "adblock",
  "async-tungstenite",
@@ -4013,7 +4013,7 @@ dependencies = [
 
 [[package]]
 name = "spider_cli"
-version = "2.11.20"
+version = "2.12.4"
 dependencies = [
  "clap",
  "env_logger",
@@ -4037,7 +4037,7 @@ dependencies = [
 
 [[package]]
 name = "spider_transformations"
-version = "2.11.20"
+version = "2.12.4"
 dependencies = [
  "aho-corasick",
  "fast_html2md",
@@ -4059,7 +4059,7 @@ dependencies = [
 
 [[package]]
 name = "spider_utils"
-version = "2.11.20"
+version = "2.12.4"
 dependencies = [
  "indexmap 1.9.3",
  "serde",
@@ -4071,7 +4071,7 @@ dependencies = [
 
 [[package]]
 name = "spider_worker"
-version = "2.11.20"
+version = "2.12.4"
 dependencies = [
  "env_logger",
  "lazy_static",
diff --git a/spider/Cargo.toml b/spider/Cargo.toml
index e85abde79..c31db8ff8 100644
--- a/spider/Cargo.toml
+++ b/spider/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider"
-version = "2.11.20"
+version = "2.12.4"
 authors = [
     "j-mendez "
 ]
@@ -50,7 +50,7 @@ http-cache-reqwest = { version = "0.14.0", optional = true }
 const_format = { version = "0.2", optional = true }
 async-openai = { version = "0.25", optional = true }
 tiktoken-rs = { version = "0.5", optional = true }
-lol_html = { version = "1", optional = true }
+lol_html = { version = "2" }
 serde_json = { version = "1", optional = true }
 quick-xml = { version = "0.36", features = ["serde", "serialize", "async-tokio"]}
 moka = { version = "0.12", features = ["future"], optional = true }
@@ -147,7 +147,7 @@ smart = ["chrome", "dep:rand", "chrome_intercept"]
 encoding = []
 headers = ["dep:httpdate"]
 real_browser = ["dep:statrs", "dep:rand"]
-openai = ["chrome", "serde", "chrome_intercept", "dep:async-openai", "dep:tiktoken-rs", "dep:lol_html", "dep:serde_json"]
+openai = ["chrome", "serde", "chrome_intercept", "dep:async-openai", "dep:tiktoken-rs", "dep:serde_json"]
 openai_slim_fit = []
 decentralized_headers = ["dep:const_format", "dep:itertools"]
 spoof = ["dep:fastrand"]
diff --git a/spider/src/packages/scraper/element_ref/mod.rs b/spider/src/packages/scraper/element_ref/mod.rs
index 2f9eac806..f5073ccd8 100644
--- a/spider/src/packages/scraper/element_ref/mod.rs
+++ b/spider/src/packages/scraper/element_ref/mod.rs
@@ -59,9 +59,7 @@ impl<'a> ElementRef<'a> {
             create_missing_parent: false,
         };
         let mut buf = Vec::new();
-        match serialize(&mut buf, self, opts) {
-            _ => (),
-        };
+        let _ = serialize(&mut buf, self, opts);
         // we need to get the initial encoding of the html lang if used.
         auto_encoder::auto_encode_bytes(&buf)
     }
diff --git a/spider/src/packages/scraper/html/mod.rs b/spider/src/packages/scraper/html/mod.rs
index c910f26b9..67d60e072 100644
--- a/spider/src/packages/scraper/html/mod.rs
+++ b/spider/src/packages/scraper/html/mod.rs
@@ -130,9 +130,7 @@ impl Html {
             create_missing_parent: false,
         };
         let mut buf = Vec::new();
-        match serialize(&mut buf, self, opts) {
-            _ => (),
-        };
+        let _ = serialize(&mut buf, self, opts);
 
         auto_encoder::auto_encode_bytes(&buf)
     }
diff --git a/spider/src/packages/scraper/html/tree_sink.rs b/spider/src/packages/scraper/html/tree_sink.rs
index b040f6ae5..b3d7abfdc 100644
--- a/spider/src/packages/scraper/html/tree_sink.rs
+++ b/spider/src/packages/scraper/html/tree_sink.rs
@@ -164,15 +164,16 @@ impl TreeSink for Html {
 
     // Detach the given node from its parent.
     fn remove_from_parent(&mut self, target: &Self::Handle) {
-        self.tree.get_mut(*target).unwrap().detach();
+        if let Some(mut p) = self.tree.get_mut(*target) {
+            p.detach();
+        }
     }
 
     // Remove all the children from node and append them to new_parent.
     fn reparent_children(&mut self, node: &Self::Handle, new_parent: &Self::Handle) {
-        self.tree
-            .get_mut(*new_parent)
-            .unwrap()
-            .reparent_from_id_append(*node);
+        if let Some(mut p) = self.tree.get_mut(*new_parent) {
+            p.reparent_from_id_append(*node);
+        }
     }
 
     // Add each attribute to the given element, if no attribute with that name already exists. The
diff --git a/spider/src/page.rs b/spider/src/page.rs
index d01198bd1..f8b151a6d 100644
--- a/spider/src/page.rs
+++ b/spider/src/page.rs
@@ -3,8 +3,6 @@ use crate::compact_str::CompactString;
 #[cfg(all(feature = "chrome", not(feature = "decentralized")))]
 use crate::configuration::{AutomationScripts, ExecutionScripts};
-#[cfg(not(feature = "decentralized"))]
-use crate::packages::scraper::Html;
 use crate::utils::log;
 use crate::utils::PageResponse;
 use crate::CaseInsensitiveString;
 
@@ -28,6 +26,7 @@ lazy_static! {
     /// Wildcard match all domains.
     static ref CASELESS_WILD_CARD: CaseInsensitiveString = CaseInsensitiveString::new("*");
     static ref SSG_CAPTURE: Regex = Regex::new(r#""(.*?)""#).unwrap();
+    static ref GATSBY: Option<String> = Some("gatsby-chunk-mapping".into());
 }
 
 #[cfg(any(feature = "smart", feature = "chrome_intercept"))]
@@ -42,6 +41,35 @@ lazy_static! {
     };
 }
 
+#[cfg(all(
+    not(feature = "decentralized"),
+    not(feature = "full_resources"),
+    feature = "smart"
+))]
+lazy_static! {
+    static ref DOM_WATCH_METHODS: regex::bytes::RegexSet = {
+        let set = unsafe {
+            regex::bytes::RegexSet::new(&[
+                r"\.createElementNS",
+                r"\.removeChild",
+                r"\.insertBefore",
+                r"\.createElement",
+                r"\.setAttribute",
+                r"\.createTextNode",
+                r"\.replaceChildren",
+                r"\.prepend",
+                r"\.append",
+                r"\.appendChild",
+                r"\.write",
+                r"\$\s*\(.*?\)",
+            ])
+            .unwrap_unchecked()
+        };
+
+        set
+    };
+}
+
 #[cfg(any(feature = "chrome_intercept"))]
 lazy_static! {
     /// allowed js frameworks and libs excluding some and adding additional URLs
@@ -967,21 +995,19 @@ impl Page {
             self.links_stream_xml_links_stream_base(selectors, html, &mut map)
                 .await;
         } else {
-            let html = Box::new(Html::parse_fragment(html));
-            let mut stream = tokio_stream::iter(html.tree);
-
             // the original url
             let parent_host = &selectors.1[0];
             // the host schemes
             let parent_host_scheme = &selectors.1[1];
             let base_input_domain = &selectors.2; // the domain after redirects
             let sub_matcher = &selectors.0;
 
-            while let Some(node) = stream.next().await {
-                if let Some(element) = node.as_element() {
-                    if element.name() == "a" {
-                        if let Some(href) = element.attr("href") {
+            let _ = rewrite_str_empty(
+                &html,
+                lol_html::RewriteStrSettings {
+                    element_content_handlers: vec![lol_html::element!("a", |el| {
+                        if let Some(href) = el.get_attribute("href") {
                             self.push_link(
-                                href,
+                                &href,
                                 &mut map,
                                 &selectors.0,
                                 parent_host,
@@ -990,11 +1016,14 @@ impl Page {
                                 parent_host_scheme,
                                 base_input_domain,
                                 sub_matcher,
                             );
                         }
-                    }
-                }
-            }
+                        Ok(())
+                    })],
+                    ..lol_html::RewriteStrSettings::default()
+                },
+            );
         }
+
         map
     }
@@ -1015,9 +1044,6 @@ impl Page {
             self.links_stream_xml_links_stream_base(selectors, html, &mut map)
                 .await;
         } else {
-            let html = Box::new(crate::packages::scraper::Html::parse_fragment(html));
-            let mut stream = tokio_stream::iter(html.tree);
-
             // the original url
             let parent_host = &selectors.1[0];
             // the host schemes
@@ -1027,13 +1053,14 @@ impl Page {
             let parent_host_scheme = &selectors.1[1];
             let base_input_domain = &selectors.2;
             let sub_matcher = &selectors.0;
 
             let mut build_ssg_path = None;
 
-            while let Some(node) = stream.next().await {
-                if let Some(element) = node.as_element() {
-                    match element.name() {
-                        "a" => {
-                            if let Some(href) = element.attr("href") {
+            let _ = rewrite_str_empty(
+                &html,
+                lol_html::RewriteStrSettings {
+                    element_content_handlers: vec![
+                        lol_html::element!("a", |el| {
+                            if let Some(href) = el.get_attribute("href") {
                                 self.push_link(
-                                    href,
+                                    &href,
                                     &mut map,
                                     &selectors.0,
                                     parent_host,
@@ -1042,20 +1069,24 @@ impl Page {
                                     parent_host_scheme,
                                     base_input_domain,
                                     sub_matcher,
                                 );
                             }
-                        }
-                        "script" if build_ssg_path.is_none() => {
-                            if let Some(source) = element.attr("src") {
-                                if source.starts_with("/_next/static/")
-                                    && source.ends_with("/_ssgManifest.js")
-                                {
-                                    build_ssg_path = Some(self.abs_path(source));
+                            Ok(())
+                        }),
+                        lol_html::element!("script", |el| {
+                            if build_ssg_path.is_none() {
+                                if let Some(source) = el.get_attribute("src") {
+                                    if source.starts_with("/_next/static/")
+                                        && source.ends_with("/_ssgManifest.js")
+                                    {
+                                        build_ssg_path = Some(self.abs_path(&source));
+                                    }
                                 }
                             }
-                        }
-                        _ => (),
-                    }
-                }
-            }
+                            Ok(())
+                        }),
+                    ],
+                    ..lol_html::RewriteStrSettings::default()
+                },
+            );
 
             if let Some(build_ssg_path) = build_ssg_path {
                 if let Some(s) = build_ssg_path {
@@ -1094,6 +1125,7 @@ impl Page {
                 }
             }
         }
+
         map
     }
@@ -1139,211 +1171,181 @@ impl Page {
         configuration: &crate::configuration::Configuration,
         context_id: &Option<chromiumoxide::cdp::browser_protocol::browser::BrowserContextId>,
     ) -> HashSet<A> {
+        use auto_encoder::auto_encode_bytes;
+
         let mut map = HashSet::new();
 
-        let html = self.get_html();
-
-        if !html.is_empty() {
-            if html.starts_with("<?xml") {
-                self.links_stream_xml_links_stream_base(selectors, &html, &mut map)
-                    .await;
-            } else {
-                let html = Box::new(crate::packages::scraper::Html::parse_document(&html));
-                let mut stream = tokio_stream::iter(&html.tree);
-
+        if !self.is_empty() {
+            let html_resource = Box::new(self.get_html());
+
+            if html_resource.starts_with("<?xml") {
+                self.links_stream_xml_links_stream_base(selectors, &html_resource, &mut map)
+                    .await;
+            } else {
                 let (tx, rx) = tokio::sync::oneshot::channel();
 
                 let mut static_app = false;
                 let mut rerender = false;
 
                 // the original url
                 let parent_host = &selectors.1[0];
                 // the host schemes
                 let parent_host_scheme = &selectors.1[1];
                 let base_input_domain = &selectors.2; // the domain after redirects
                 let sub_matcher = &selectors.0;
 
-                while let Some(node) = stream.next().await {
-                    if let Some(element) = node.as_element() {
-                        let element_name = element.name();
-
-                        if element_name == "script" {
-                            match element.attr("src") {
-                                Some(src) => {
-                                    if src.starts_with("/") {
-                                        if src.starts_with("/_next/static/chunks/pages/")
-                                            || src.starts_with("/webpack-runtime-")
-                                            || element.attr("id") == Some("gatsby-chunk-mapping")
-                                        {
-                                            static_app = true;
-                                            continue;
-                                        }
-
-                                        match self.abs_path(src) {
-                                            Some(abs) => {
-                                                match abs
-                                                    .path_segments()
-                                                    .ok_or_else(|| "cannot be base")
-                                                {
-                                                    Ok(mut paths) => {
-                                                        while let Some(p) = paths.next() {
-                                                            // todo: get the path last before None instead of checking for ends_with
-                                                            if p.ends_with(".js")
-                                                                && JS_FRAMEWORK_ASSETS.contains(&p)
-                                                            {
-                                                                rerender = true;
-                                                            } else {
-                                                                match node.as_text() {
-                                                                    Some(text) => {
-                                                                        lazy_static! {
-                                                                            static ref DOM_WATCH_METHODS: regex::RegexSet = {
-                                                                                let set = unsafe {
-                                                                                    regex::RegexSet::new(&[
-                                                                                        r"/.createElementNS/gm",
-                                                                                        r"/.removeChild/gm",
-                                                                                        r"/.insertBefore/gm",
-                                                                                        r"/.createElement/gm",
-                                                                                        r"/.setAttribute/gm",
-                                                                                        r"/.createTextNode/gm",
-                                                                                        r"/.replaceChildren/gm",
-                                                                                        r"/.prepend/gm",
-                                                                                        r"/.append/gm",
-                                                                                        r"/.appendChild/gm",
-                                                                                        r"/.write/gm",
-                                                                                        r"\$\s*\(.*?\)",
-                                                                                    ])
-                                                                                    .unwrap_unchecked()
-                                                                                };
-
-                                                                                set
-                                                                            };
-                                                                        }
-                                                                        rerender =
-                                                                            DOM_WATCH_METHODS
-                                                                                .is_match(text);
-                                                                    }
-                                                                    _ => (),
-                                                                }
-                                                            }
-                                                        }
-                                                    }
-                                                    _ => (),
-                                                };
-
-                                                if rerender {
-                                                    // we should re-use the html content instead with events.
-                                                    let uu = self.get_html();
-                                                    let browser = browser.to_owned();
-                                                    let configuration = configuration.clone();
-                                                    let target_url = self.url.clone();
-                                                    let context_id = context_id.clone();
-                                                    let parent_host = parent_host.clone();
-
-                                                    tokio::task::spawn(async move {
-                                                        // we need to use about:blank here since we set the HTML content directly
-                                                        match crate::features::chrome::attempt_navigation("about:blank", &browser, &configuration.request_timeout, &context_id).await {
-                                                            Ok(new_page) => {
-                                                                let intercept_handle = crate::features::chrome::setup_chrome_interception_base(
-                                                                    &new_page,
-                                                                    configuration.chrome_intercept.enabled,
-                                                                    &configuration.auth_challenge_response,
-                                                                    configuration.chrome_intercept.block_visuals,
-                                                                    &parent_host,
-                                                                )
-                                                                .await;
-
-                                                                crate::features::chrome::setup_chrome_events(&new_page, &configuration).await;
-
-                                                                let page_resource =
-                                                                    crate::utils::fetch_page_html_chrome_base(
-                                                                        &uu,
-                                                                        &new_page,
-                                                                        true,
-                                                                        true,
-                                                                        &Some(crate::configuration::WaitFor::new(
-                                                                            Some(
-                                                                                core::time::Duration::from_secs(
-                                                                                    120,
-                                                                                ), // default a duration for smart handling. (maybe expose later on.)
-                                                                            ),
-                                                                            None,
-                                                                            true,
-                                                                            true,
-                                                                            None,
-                                                                            Some(crate::configuration::WaitForSelector::new(
-                                                                                Some(core::time::Duration::from_millis(500)),
-                                                                                "body".into(),
-                                                                            )),
-                                                                        )),
-                                                                        &configuration.screenshot,
-                                                                        false,
-                                                                        &configuration.openai_config,
-                                                                        Some(&target_url),
-                                                                        &configuration
-                                                                            .execution_scripts,
-                                                                        &configuration
-                                                                            .automation_scripts,
-                                                                        &configuration.viewport,
-                                                                        &configuration.request_timeout
-                                                                    )
-                                                                    .await;
-
-                                                                match intercept_handle {
-                                                                    Some(h) => {
-                                                                        let _ = h.await;
-                                                                    }
-                                                                    _ => (),
-                                                                }
-
-                                                                if let Ok(resource) = page_resource {
-                                                                    if let Err(_) = tx.send(resource)
-                                                                    {
-                                                                        crate::utils::log(
-                                                                            "the receiver dropped",
-                                                                            "",
-                                                                        );
-                                                                    }
-                                                                }
-                                                            }
-                                                            _ => (),
-                                                        }
-                                                    });
-
-                                                    break;
-                                                }
-                                            }
-                                            _ => (),
-                                        }
-                                    }
-                                }
-                                _ => (),
-                            }
-                        }
-
-                        if element_name == "a" {
-                            // add fullresources?
-                            if let Some(href) = element.attr("href") {
-                                self.push_link(
-                                    href,
-                                    &mut map,
-                                    &selectors.0,
-                                    parent_host,
-                                    parent_host_scheme,
-                                    base_input_domain,
-                                    sub_matcher,
-                                );
-                            }
-                        }
-                    }
-                }
+                let rewrited_bytes = match rewrite_str_as_bytes(
+                    &html_resource,
+                    RewriteStrSettings {
+                        element_content_handlers: vec![
+                            element!("script", |element| {
+                                if !static_app {
+                                    if let Some(src) = element.get_attribute("src") {
+                                        if src.starts_with("/") {
+                                            if src.starts_with("/_next/static/chunks/pages/")
+                                                || src.starts_with("/webpack-runtime-")
+                                                || element.get_attribute("id").eq(&*GATSBY)
+                                            {
+                                                static_app = true;
+                                            }
+
+                                            if let Some(abs) = self.abs_path(&src) {
+                                                if let Ok(mut paths) = abs
+                                                    .path_segments()
+                                                    .ok_or_else(|| "cannot be base")
+                                                {
+                                                    while let Some(p) = paths.next() {
+                                                        // todo: get the path last before None instead of checking for ends_with
+                                                        if p.ends_with(".js")
+                                                            && JS_FRAMEWORK_ASSETS.contains(&p)
+                                                        {
+                                                            rerender = true;
+                                                        }
+                                                    }
+                                                }
+                                            }
+                                        }
+                                    }
+                                }
+                                Ok(())
+                            }),
+                            element!("a", |el| {
+                                if let Some(href) = el.get_attribute("href") {
+                                    self.push_link(
+                                        &href,
+                                        &mut map,
+                                        &selectors.0,
+                                        parent_host,
+                                        parent_host_scheme,
+                                        base_input_domain,
+                                        sub_matcher,
+                                    );
+                                }
+
+                                el.remove();
+
+                                Ok(())
+                            }),
+                            element!("*:not(script):not(a):not(body):not(head):not(html)", |el| {
+                                el.remove();
+                                Ok(())
+                            }),
+                        ],
+                        document_content_handlers: vec![doc_comments!(|c| {
+                            c.remove();
+                            Ok(())
+                        })],
+                        ..RewriteStrSettings::default()
+                    },
+                ) {
+                    Ok(s) => s,
+                    _ => html_resource.as_bytes().to_vec(),
+                };
+
+                if rerender || DOM_WATCH_METHODS.is_match(&rewrited_bytes) {
+                    // we should re-use the html content instead with events.
+                    let browser = browser.to_owned();
+                    let configuration = configuration.clone();
+                    let target_url = self.url.clone();
+                    let context_id = context_id.clone();
+                    let parent_host = parent_host.clone();
+
+                    tokio::task::spawn(async move {
+                        // we need to use about:blank here since we set the HTML content directly
+                        if let Ok(new_page) = crate::features::chrome::attempt_navigation(
+                            "about:blank",
+                            &browser,
+                            &configuration.request_timeout,
+                            &context_id,
+                        )
+                        .await
+                        {
+                            let intercept_handle =
+                                crate::features::chrome::setup_chrome_interception_base(
+                                    &new_page,
+                                    configuration.chrome_intercept.enabled,
+                                    &configuration.auth_challenge_response,
+                                    configuration.chrome_intercept.block_visuals,
+                                    &parent_host,
+                                )
+                                .await;
+
+                            crate::features::chrome::setup_chrome_events(&new_page, &configuration)
+                                .await;
+
+                            let page_resource = crate::utils::fetch_page_html_chrome_base(
+                                &html_resource,
+                                &new_page,
+                                true,
+                                true,
+                                &Some(crate::configuration::WaitFor::new(
+                                    Some(
+                                        core::time::Duration::from_secs(120), // default a duration for smart handling. (maybe expose later on.)
+                                    ),
+                                    None,
+                                    true,
+                                    true,
+                                    None,
+                                    Some(crate::configuration::WaitForSelector::new(
+                                        Some(core::time::Duration::from_millis(500)),
+                                        "body".into(),
+                                    )),
+                                )),
+                                &configuration.screenshot,
+                                false,
+                                &configuration.openai_config,
+                                Some(&target_url),
+                                &configuration.execution_scripts,
+                                &configuration.automation_scripts,
+                                &configuration.viewport,
+                                &configuration.request_timeout,
+                            )
+                            .await;
+
+                            if let Some(h) = intercept_handle {
+                                let _ = h.await;
+                            }
+
+                            if let Ok(resource) = page_resource {
+                                if let Err(_) = tx.send(resource) {
+                                    crate::utils::log("the receiver dropped", "");
+                                }
+                            }
+                        }
+                    });
 
-                if rerender {
-                    drop(stream);
                     match rx.await {
                         Ok(v) => {
                             let extended_map = self
                                 .links_stream_base::<A>(
                                     selectors,
                                     &match v.content {
-                                        Some(h) => String::from_utf8_lossy(&h).to_string(),
+                                        Some(h) => auto_encode_bytes(&h),
                                         _ => Default::default(),
                                     },
                                 )
                                 .await;
+
                             map.extend(extended_map)
                         }
                         Err(e) => {
@@ -1353,6 +1355,7 @@ impl Page {
                 }
             }
         }
+
         map
     }
@@ -1363,16 +1366,13 @@ impl Page {
         selectors: &RelativeSelectors,
     ) -> HashSet<A> {
         let mut map = HashSet::new();
-        let html = self.get_html();
-
-        if !html.is_empty() {
+        if !self.is_empty() {
+            let html = self.get_html();
+
             if html.starts_with("<?xml") {
                 self.links_stream_xml_links_stream_base(selectors, &html, &mut map)
                     .await;
             } else {
-                let html = Box::new(crate::packages::scraper::Html::parse_fragment(&html));
-                let mut stream = tokio_stream::iter(html.tree);
-
                 // the original url
                 let parent_host = &selectors.1[0];
                 // the host schemes
@@ … @@ pub fn …(…, _label: &str) -> String {
     }
 }
 
+/// Rewrite a string without encoding it.
+#[cfg(all(
+    not(feature = "decentralized"),
+    not(feature = "full_resources"),
+    feature = "smart"
+))]
+pub fn rewrite_str_as_bytes<'h, 's>(
+    html: &str,
+    settings: impl Into<lol_html::Settings<'h, 's>>,
+) -> Result<Vec<u8>, lol_html::errors::RewritingError> {
+    let mut output = vec![];
+
+    let mut rewriter = lol_html::HtmlRewriter::new(settings.into(), |c: &[u8]| {
+        output.extend_from_slice(c);
+    });
+
+    rewriter.write(html.as_bytes())?;
+    rewriter.end()?;
+
+    Ok(output)
+}
+
+/// Basic rewriter without rewriting.
+pub fn rewrite_str_empty<'h, 's>(
+    html: &str,
+    settings: impl Into<lol_html::Settings<'h, 's>>,
+) -> Result<(), lol_html::errors::RewritingError> {
+    // we should use this in our chunks to rewrite while streaming.
+    let mut rewriter = lol_html::HtmlRewriter::new(settings.into(), |_c: &[u8]| {});
+
+    rewriter.write(html.as_bytes())?;
+    rewriter.end()?;
+
+    Ok(())
+}
+
 #[cfg(test)]
 pub const TEST_AGENT_NAME: &str = concat!(env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION"));
diff --git a/spider/src/utils/mod.rs b/spider/src/utils/mod.rs
index 291f10fa3..ffef62d02 100644
--- a/spider/src/utils/mod.rs
+++ b/spider/src/utils/mod.rs
@@ -980,6 +980,15 @@ pub async fn fetch_page_html_chrome_base(
                 },
             )
             .await;
+
+            // perform extra navigate to trigger page actions.
+            if let Some(u) = url_target {
+                if u.starts_with("http") {
+                    let _ = page
+                        .evaluate(format!(r#"window.location = "{}";"#, u))
+                        .await;
+                }
+            }
         }
     } else {
         if let Err(e) = navigate(page, source, &mut chrome_http_req_res).await {
diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml
index 2feaa42cf..974bd1f4f 100644
--- a/spider_chrome/Cargo.toml
+++ b/spider_chrome/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_chrome"
-version = "2.11.20"
+version = "2.12.4"
 rust-version = "1.70"
 authors = [
     "j-mendez "
diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml
index 10fd2bcca..b4fa84785 100644
--- a/spider_cli/Cargo.toml
+++ b/spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_cli"
-version = "2.11.20"
+version = "2.12.4"
 authors = [
     "j-mendez "
 ]
diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml
index 6ccd40d8a..8ea8829d0 100644
--- a/spider_transformations/Cargo.toml
+++ b/spider_transformations/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_transformations"
-version = "2.11.20"
+version = "2.12.4"
 authors = [
     "j-mendez "
 ]
@@ -27,7 +27,7 @@ serde = { version = "1", features = ["derive"] }
 fast_html2md = "0"
 phf = "0.11"
 phf_codegen = "0.11"
-lol_html = { version = "1" }
+lol_html = { version = "2" }
 
 [dependencies.spider]
 version = "2"
diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml
index 6606bb9cd..d227128cf 100644
--- a/spider_utils/Cargo.toml
+++ b/spider_utils/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_utils"
-version = "2.11.20"
+version = "2.12.4"
 authors = [
     "j-mendez "
 ]
diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml
index 0dafc8f1a..74bae4367 100644
--- a/spider_worker/Cargo.toml
+++ b/spider_worker/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_worker"
-version = "2.11.20"
+version = "2.12.4"
 authors = [
     "j-mendez "
 ]
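
Note: the heart of this change is swapping the parsed-DOM walk (scraper plus tokio_stream) for lol_html's streaming element handlers. A minimal, self-contained sketch of the same href-collection pattern against the public lol_html 2.x API — collect_links, the selector, and the sample HTML are illustrative, not taken from this diff:

    use lol_html::{element, rewrite_str, RewriteStrSettings};

    // Collect every href the streaming rewriter sees; the rewritten output is discarded.
    fn collect_links(html: &str) -> Vec<String> {
        let mut links: Vec<String> = Vec::new();

        let _ = rewrite_str(
            html,
            RewriteStrSettings {
                element_content_handlers: vec![element!("a[href]", |el| {
                    if let Some(href) = el.get_attribute("href") {
                        links.push(href);
                    }
                    Ok(())
                })],
                ..RewriteStrSettings::default()
            },
        );

        links
    }

    fn main() {
        let links = collect_links(r#"<a href="/about">About</a><a href="/blog">Blog</a>"#);
        assert_eq!(links, ["/about", "/blog"]);
    }

Unlike the old Html::parse_fragment approach, this never materializes a DOM tree: handlers fire while the document is scanned, which is what lets the links_stream functions push links as the HTML streams through.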
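
The new rewrite_str_as_bytes helper is a thin wrapper over lol_html::HtmlRewriter with a Vec<u8> sink. A sketch of the same shape, assuming only the documented HtmlRewriter API — rewrite_to_bytes and the rel="nofollow" transform are hypothetical stand-ins:

    use lol_html::{element, HtmlRewriter, Settings};

    // Stream the input through the rewriter and collect the transformed bytes.
    fn rewrite_to_bytes(html: &str) -> Result<Vec<u8>, lol_html::errors::RewritingError> {
        let mut output = Vec::new();

        let mut rewriter = HtmlRewriter::new(
            Settings {
                element_content_handlers: vec![element!("a", |el| {
                    el.set_attribute("rel", "nofollow")?;
                    Ok(())
                })],
                ..Settings::default()
            },
            // The sink closure receives output chunks as they are produced.
            |chunk: &[u8]| output.extend_from_slice(chunk),
        );

        rewriter.write(html.as_bytes())?;
        rewriter.end()?; // consumes the rewriter and releases the borrow on output

        Ok(output)
    }

Returning bytes rather than a String is deliberate in smart_links: the result feeds straight into the byte-oriented DOM_WATCH_METHODS RegexSet, so no UTF-8 validation pass is needed.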
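
The rerender heuristic also changes shape: instead of building a regex::RegexSet per matched text node inside the traversal, the new DOM_WATCH_METHODS set scans the rewritten bytes once. A reduced sketch — looks_dynamic and the pattern subset are illustrative:

    use regex::bytes::RegexSet;

    // One pass over raw HTML bytes to sniff DOM-mutation calls.
    fn looks_dynamic(html: &[u8]) -> bool {
        let set = RegexSet::new([
            r"\.appendChild",
            r"\.createElement",
            r"\.setAttribute",
            r"\$\s*\(.*?\)",
        ])
        .expect("patterns are valid");

        set.is_match(html)
    }

    fn main() {
        assert!(looks_dynamic(b"<script>document.body.appendChild(el)</script>"));
        assert!(!looks_dynamic(b"<p>static content</p>"));
    }

Note the dropped /gm delimiters: Rust's regex crate treats JavaScript-style patterns like "/.appendChild/gm" as literal slashes and letters, so those old patterns effectively never matched real scripts, while the new anchors such as \.appendChild match as intended.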