diff --git a/Cargo.lock b/Cargo.lock
index 3593ca546..d811ce3f1 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -490,9 +490,9 @@ dependencies = [
 
 [[package]]
 name = "case_insensitive_string"
-version = "0.2.5"
+version = "0.2.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3e2c45dcbad72e408d8a18a66794b9c13ebffb950c3bfd7efbee66deab80bf4c"
+checksum = "4b877cdcbbc56bf1dccbc428f7e745c98bc3d9a8af507065365c2ad71073d8fe"
 dependencies = [
  "compact_str",
  "serde",
@@ -3909,7 +3909,7 @@ dependencies = [
 
 [[package]]
 name = "spider"
-version = "2.10.24"
+version = "2.10.25"
 dependencies = [
  "ahash",
  "async-openai",
@@ -3970,7 +3970,7 @@ dependencies = [
 
 [[package]]
 name = "spider_chrome"
-version = "2.10.24"
+version = "2.10.25"
 dependencies = [
  "adblock",
  "async-tungstenite",
@@ -4005,7 +4005,7 @@ dependencies = [
 
 [[package]]
 name = "spider_cli"
-version = "2.10.24"
+version = "2.10.25"
 dependencies = [
  "clap",
  "env_logger",
@@ -4029,7 +4029,7 @@ dependencies = [
 
 [[package]]
 name = "spider_transformations"
-version = "2.10.24"
+version = "2.10.25"
 dependencies = [
  "aho-corasick",
  "fast_html2md",
@@ -4051,7 +4051,7 @@ dependencies = [
 
 [[package]]
 name = "spider_utils"
-version = "2.10.24"
+version = "2.10.25"
 dependencies = [
  "indexmap 1.9.3",
  "serde",
@@ -4063,7 +4063,7 @@ dependencies = [
 
 [[package]]
 name = "spider_worker"
-version = "2.10.24"
+version = "2.10.25"
 dependencies = [
  "env_logger",
  "lazy_static",
diff --git a/spider/Cargo.toml b/spider/Cargo.toml
index 3e632cae2..083d6290d 100644
--- a/spider/Cargo.toml
+++ b/spider/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider"
-version = "2.10.24"
+version = "2.10.25"
 authors = [
     "j-mendez "
 ]
diff --git a/spider/src/features/chrome.rs b/spider/src/features/chrome.rs
index 248431985..abe1225d2 100644
--- a/spider/src/features/chrome.rs
+++ b/spider/src/features/chrome.rs
@@ -515,6 +515,47 @@ pub async fn setup_chrome_interception_base(
     None
 }
 
+/// establish all the page events.
+pub async fn setup_chrome_events(chrome_page: &chromiumoxide::Page, config: &Configuration) {
+    let stealth = async {
+        if cfg!(feature = "chrome_stealth") || config.stealth_mode {
+            match config.user_agent.as_ref() {
+                Some(agent) => {
+                    let _ = chrome_page.enable_stealth_mode_with_agent(agent).await;
+                }
+                _ => {
+                    let _ = chrome_page.enable_stealth_mode().await;
+                }
+            }
+        }
+    };
+    let eval_docs = async {
+        match config.evaluate_on_new_document {
+            Some(ref script) => {
+                if config.fingerprint {
+                    let _ = chrome_page
+                        .evaluate_on_new_document(string_concat!(
+                            crate::features::chrome::FP_JS,
+                            script.as_str()
+                        ))
+                        .await;
+                } else {
+                    let _ = chrome_page.evaluate_on_new_document(script.as_str()).await;
+                }
+            }
+            _ => {
+                if config.fingerprint {
+                    let _ = chrome_page
+                        .evaluate_on_new_document(crate::features::chrome::FP_JS)
+                        .await;
+                }
+            }
+        }
+    };
+
+    tokio::join!(stealth, eval_docs, configure_browser(&chrome_page, &config));
+}
+
 /// static chrome arguments to start
 #[cfg(all(feature = "chrome_cpu", feature = "real_browser"))]
 pub static CHROME_ARGS: [&'static str; 27] = [
diff --git a/spider/src/page.rs b/spider/src/page.rs
index 7dbe02634..3a2b19926 100644
--- a/spider/src/page.rs
+++ b/spider/src/page.rs
@@ -1164,7 +1164,7 @@ impl Page {
         )
         .await;
 
-        crate::website::Website::setup_chrome_events(&new_page, &configuration).await;
+        crate::features::chrome::setup_chrome_events(&new_page, &configuration).await;
 
         let page_resource =
             crate::utils::fetch_page_html_chrome_base(
diff --git a/spider/src/website.rs b/spider/src/website.rs
index f90f80d26..b73d0a923 100644
--- a/spider/src/website.rs
+++ b/spider/src/website.rs
@@ -27,9 +27,6 @@ use tokio::{
 use tokio_stream::StreamExt;
 use url::Url;
 
-#[cfg(feature = "chrome")]
-use crate::features::chrome::{configure_browser, launch_browser};
-
 #[cfg(feature = "cache")]
 use http_cache_reqwest::{CACacheManager, Cache, CacheMode, HttpCache, HttpCacheOptions};
 
@@ -249,7 +246,7 @@ pub struct Website {
     channel_queue: Option<(broadcast::Sender, Arc>)>,
     /// The status of the active crawl this is mapped to a general status and not the HTTP status code.
     status: CrawlStatus,
-    /// The initial status code of the first request
+    /// The initial status code of the first request.
     initial_status_code: StatusCode,
     /// Set the crawl ID to track. This allows explicit targeting for shutdown, pause, and etc.
     #[cfg(feature = "control")]
@@ -450,7 +447,7 @@ impl Website {
     }
 
     /// Validate if url exceeds crawl budget and should not be handled.
-    pub fn is_over_budget(&mut self, link: &CaseInsensitiveString) -> bool {
+    pub(crate) fn is_over_budget(&mut self, link: &CaseInsensitiveString) -> bool {
         if self.configuration.inner_budget.is_some() || self.configuration.depth_distance > 0 {
             match Url::parse(link.inner()) {
                 Ok(r) => {
@@ -1297,7 +1294,7 @@ impl Website {
                 retry_count -= 1;
             }
 
-            log("fetch", &url);
+            log::info!("fetch {}", &url);
 
             // allow initial page mutation
             match page.final_redirect_destination.as_deref() {
@@ -1367,48 +1364,6 @@ impl Website {
         }
     }
 
-    /// establish all the page events.
-    #[cfg(feature = "chrome")]
-    pub async fn setup_chrome_events(chrome_page: &chromiumoxide::Page, config: &Configuration) {
-        let stealth = async {
-            if cfg!(feature = "chrome_stealth") || config.stealth_mode {
-                match config.user_agent.as_ref() {
-                    Some(agent) => {
-                        let _ = chrome_page.enable_stealth_mode_with_agent(agent).await;
-                    }
-                    _ => {
-                        let _ = chrome_page.enable_stealth_mode().await;
-                    }
-                }
-            }
-        };
-        let eval_docs = async {
-            match config.evaluate_on_new_document {
-                Some(ref script) => {
-                    if config.fingerprint {
-                        let _ = chrome_page
-                            .evaluate_on_new_document(string_concat!(
-                                crate::features::chrome::FP_JS,
-                                script.as_str()
-                            ))
-                            .await;
-                    } else {
-                        let _ = chrome_page.evaluate_on_new_document(script.as_str()).await;
-                    }
-                }
-                _ => {
-                    if config.fingerprint {
-                        let _ = chrome_page
-                            .evaluate_on_new_document(crate::features::chrome::FP_JS)
-                            .await;
-                    }
-                }
-            }
-        };
-
-        tokio::join!(stealth, eval_docs, configure_browser(&chrome_page, &config));
-    }
-
     /// Expand links for crawl.
     #[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
     async fn crawl_establish(
@@ -1422,7 +1377,7 @@ impl Website {
             .is_allowed_default(&self.get_base_link())
             .eq(&ProcessLinkStatus::Allowed)
         {
-            Website::setup_chrome_events(chrome_page, &self.configuration).await;
+            crate::features::chrome::setup_chrome_events(chrome_page, &self.configuration).await;
 
             let intercept_handle = self.setup_chrome_interception(&chrome_page).await;
 
@@ -1440,43 +1395,34 @@ impl Website {
             )
             .await;
 
-            match intercept_handle {
-                Some(h) => {
-                    let _ = h.await;
-                }
-                _ => (),
+            if let Some(h) = intercept_handle {
+                let _ = h.await;
             }
 
-            match page.final_redirect_destination {
-                Some(ref domain) => {
-                    let domain: Box<CaseInsensitiveString> =
-                        CaseInsensitiveString::new(&domain).into();
-
-                    let prior_domain = self.domain_parsed.take();
-
-                    self.domain_parsed = match url::Url::parse(&domain.inner()) {
-                        Ok(u) => Some(Box::new(crate::page::convert_abs_path(&u, "/"))),
-                        _ => None,
-                    };
-                    self.url = domain;
-                    match self.setup_selectors() {
-                        Some(s) => {
-                            base.0 = s.0;
-                            base.1 = s.1;
-                            match prior_domain {
-                                Some(prior_domain) => match prior_domain.host_str() {
-                                    Some(dname) => {
-                                        base.2 = dname.into();
-                                    }
-                                    _ => (),
-                                },
-                                _ => (),
-                            }
-                        }
-                        _ => (),
-                    }
-                }
-                _ => (),
+            if let Some(ref domain) = page.final_redirect_destination {
+                let domain: Box<CaseInsensitiveString> = CaseInsensitiveString::new(&domain).into();
+                let prior_domain = self.domain_parsed.take();
+                self.domain_parsed = match url::Url::parse(&domain.inner()) {
+                    Ok(u) => Some(Box::new(crate::page::convert_abs_path(&u, "/"))),
+                    _ => None,
+                };
+                self.url = domain;
+                match self.setup_selectors() {
+                    Some(s) => {
+                        base.0 = s.0;
+                        base.1 = s.1;
+                        match prior_domain {
+                            Some(prior_domain) => match prior_domain.host_str() {
+                                Some(dname) => {
+                                    base.2 = dname.into();
+                                }
+                                _ => (),
+                            },
+                            _ => (),
+                        }
+                    }
+                    _ => (),
+                }
             }
 
             let links = if !page.is_empty() {
@@ -1543,7 +1489,7 @@ impl Website {
                 )
                 .await
                 {
-                    Website::setup_chrome_events(&chrome_page, &config).await;
+                    crate::features::chrome::setup_chrome_events(&chrome_page, &config).await;
                     let intercept_handle =
                         crate::features::chrome::setup_chrome_interception_base(
                             &chrome_page,
                             config.chrome_intercept.enabled,
@@ -2267,7 +2213,7 @@ impl Website {
                         continue;
                     }
 
-                    log("fetch", &link);
+                    log::info!("fetch {}", &link);
 
                     self.links_visited.insert(link.clone());
                     let shared = shared.clone();
@@ -2413,7 +2359,11 @@ impl Website {
                         .await
                         {
                             Ok(new_page) => {
-                                Website::setup_chrome_events(&new_page, &self.configuration).await;
+                                crate::features::chrome::setup_chrome_events(
+                                    &new_page,
+                                    &self.configuration,
+                                )
+                                .await;
 
                                 let semaphore = if self.configuration.shared_queue {
                                     SEM_SHARED.clone()
@@ -2505,7 +2455,7 @@ impl Website {
                     continue;
                 }
 
-                log("fetch", &link);
+                log::info!("fetch {}", &link);
 
                 self.links_visited.insert(link.clone());
                 let shared = shared.clone();
@@ -2522,7 +2472,7 @@ impl Website {
                    let next = match attempt_navigation("about:blank", &shared.4, &shared.5.request_timeout, &shared.6 ).await {
                        Ok(new_page) => {
-                            Website::setup_chrome_events(&new_page, &shared.5).await;
+                            crate::features::chrome::setup_chrome_events(&new_page, &shared.5).await;
 
                            let mut page = Page::new(
                                &target_url,
                                &shared.0,
@@ -2745,7 +2695,6 @@ impl Website {
                 } else {
                     Arc::new(Semaphore::const_new(*DEFAULT_PERMITS))
                 };
-                Website::setup_chrome_events(&new_page, &self.configuration).await;
                 let mut q = match &self.channel_queue {
                     Some(q) => Some(q.0.subscribe()),
                     _ => None,
                 };
@@ -2759,6 +2708,9 @@ impl Website {
                 self.crawl_establish(&client, &mut selectors, false, &new_page)
                     .await,
             );
+
+            self.configuration.configure_allowlist();
+
             let mut set: JoinSet<HashSet<CaseInsensitiveString>> = JoinSet::new();
             let chandle = Handle::current();
@@ -2776,7 +2728,6 @@ impl Website {
             ));
 
             let add_external = shared.3.len() > 0;
-            self.configuration.configure_allowlist();
             let on_link_find_callback = self.on_link_find_callback;
             let full_resources = self.configuration.full_resources;
             let return_page_links = self.configuration.return_page_links;
@@ -2815,7 +2766,7 @@ impl Website {
                     continue;
                 }
 
-                log("fetch", &link);
+                log::info!("fetch {}", &link);
 
                 self.links_visited.insert(link.clone());
                 let shared = shared.clone();
@@ -2825,7 +2776,7 @@ impl Website {
                    match attempt_navigation("about:blank", &shared.5, &shared.6.request_timeout, &shared.8 ).await {
                        Ok(new_page) => {
-                            Website::setup_chrome_events(&new_page, &shared.6).await;
+                            crate::features::chrome::setup_chrome_events(&new_page, &shared.6).await;
 
                            let intercept_handle =
                                crate::features::chrome::setup_chrome_interception_base(
                                    &new_page,
@@ -3073,7 +3024,7 @@ impl Website {
                     continue;
                 }
 
-                log("fetch", &link);
+                log::info!("fetch {}", &link);
 
                 self.links_visited.insert(link.clone());
 
@@ -3194,7 +3145,6 @@ impl Website {
             self.drain_extra_links().collect();
 
         let (mut interval, throttle) = self.setup_crawl();
-        self.configuration.configure_allowlist();
         let on_link_find_callback = self.on_link_find_callback;
         let return_page_links = self.configuration.return_page_links;
 
@@ -3213,6 +3163,7 @@ impl Website {
                 )
                 .await,
             );
+            self.configuration.configure_allowlist();
 
             let mut set: JoinSet<HashSet<CaseInsensitiveString>> = JoinSet::new();
             let chandle = Handle::current();
@@ -3260,7 +3211,7 @@ impl Website {
                     continue;
                 }
 
-                log("fetch", &link);
+                log::info!("fetch {}", &link);
 
                 self.links_visited.insert(link.clone());
                 let shared = shared.clone();
@@ -3833,7 +3784,7 @@ impl Website {
                            )
                            .await;
 
-                            Website::setup_chrome_events(&new_page, &shared.3).await;
+                            crate::features::chrome::setup_chrome_events(&new_page, &shared.3).await;
                            let page = Page::new(
                                &link.inner(),
                                &shared.0,
@@ -4017,7 +3968,9 @@ impl Website {
         tokio::task::JoinHandle<()>,
         Option,
    )> {
-        match launch_browser(&self.configuration, self.get_url_parsed()).await {
+        match crate::features::chrome::launch_browser(&self.configuration, self.get_url_parsed())
+            .await
+        {
             Some((browser, browser_handle, context_id)) => {
                 let browser: Arc<Browser> = Arc::new(browser);
diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml
index 2eb2ebeae..00314d01a 100644
--- a/spider_chrome/Cargo.toml
+++ b/spider_chrome/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_chrome"
-version = "2.10.24"
+version = "2.10.25"
"2.10.25" rust-version = "1.70" authors = [ "j-mendez " diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index 2e6f382c6..90b4f1689 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "2.10.24" +version = "2.10.25" authors = [ "j-mendez " ] diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml index 1b731cf6c..b5083926e 100644 --- a/spider_transformations/Cargo.toml +++ b/spider_transformations/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_transformations" -version = "2.10.24" +version = "2.10.25" authors = [ "j-mendez " ] diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml index 187c96cf6..0b8ef3a55 100644 --- a/spider_utils/Cargo.toml +++ b/spider_utils/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_utils" -version = "2.10.24" +version = "2.10.25" authors = [ "j-mendez " ] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index b9f31963b..28ca58b6e 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "2.10.24" +version = "2.10.25" authors = [ "j-mendez " ]