From 51814aa98be6d2a912cd76d8e9b4439f5ba54f7b Mon Sep 17 00:00:00 2001 From: j-mendez Date: Sat, 5 Oct 2024 22:22:45 -0400 Subject: [PATCH] chore(chrome): patch logs stealth mode --- Cargo.lock | 14 ++++----- examples/Cargo.toml | 2 +- spider/Cargo.toml | 2 +- spider/README.md | 16 +++++------ spider/src/utils/mod.rs | 3 ++ spider/src/website.rs | 47 +++++++++++++++++++++++++++++++ spider_chrome/Cargo.toml | 2 +- spider_chrome/src/page.rs | 4 +-- spider_cli/Cargo.toml | 2 +- spider_transformations/Cargo.toml | 2 +- spider_utils/Cargo.toml | 2 +- spider_worker/Cargo.toml | 2 +- 12 files changed, 74 insertions(+), 24 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f646152a6..3ee43f1a0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3747,7 +3747,7 @@ dependencies = [ [[package]] name = "spider" -version = "2.8.5" +version = "2.8.7" dependencies = [ "ahash", "async-openai", @@ -3805,7 +3805,7 @@ dependencies = [ [[package]] name = "spider_chrome" -version = "2.8.5" +version = "2.8.7" dependencies = [ "adblock", "async-tungstenite", @@ -3840,7 +3840,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "2.8.5" +version = "2.8.7" dependencies = [ "clap", "env_logger", @@ -3852,7 +3852,7 @@ dependencies = [ [[package]] name = "spider_examples" -version = "2.6.33" +version = "2.8.6" dependencies = [ "convert_case 0.6.0", "env_logger", @@ -3864,7 +3864,7 @@ dependencies = [ [[package]] name = "spider_transformations" -version = "2.8.5" +version = "2.8.7" dependencies = [ "aho-corasick", "fast_html2md", @@ -3883,7 +3883,7 @@ dependencies = [ [[package]] name = "spider_utils" -version = "2.8.5" +version = "2.8.7" dependencies = [ "indexmap 1.9.3", "spider", @@ -3892,7 +3892,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "2.8.5" +version = "2.8.7" dependencies = [ "env_logger", "lazy_static", diff --git a/examples/Cargo.toml b/examples/Cargo.toml index 655ea5091..1127e2531 100644 --- a/examples/Cargo.toml +++ b/examples/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_examples" -version = "2.6.33" +version = "2.8.6" authors = [ "j-mendez ", ] diff --git a/spider/Cargo.toml b/spider/Cargo.toml index 6a4b0ec49..455c7118d 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "2.8.5" +version = "2.8.7" authors = [ "j-mendez " ] diff --git a/spider/README.md b/spider/README.md index 26d776897..268cd30c3 100644 --- a/spider/README.md +++ b/spider/README.md @@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.tom ```toml [dependencies] -spider = "2.0.12" +spider = "2" ``` And then the code: @@ -93,7 +93,7 @@ We have the following optional feature flags. ```toml [dependencies] -spider = { version = "2.0.12", features = ["regex", "ua_generator"] } +spider = { version = "2", features = ["regex", "ua_generator"] } ``` 1. `ua_generator`: Enables auto generating a random real User-Agent. @@ -139,7 +139,7 @@ Move processing to a worker, drastically increases performance even if worker is ```toml [dependencies] -spider = { version = "2.0.12", features = ["decentralized"] } +spider = { version = "2", features = ["decentralized"] } ``` ```sh @@ -170,7 +170,7 @@ Use the subscribe method to get a broadcast channel. ```toml [dependencies] -spider = { version = "2.0.12", features = ["sync"] } +spider = { version = "2", features = ["sync"] } ``` ```rust,no_run @@ -201,7 +201,7 @@ Allow regex for blacklisting routes ```toml [dependencies] -spider = { version = "2.0.12", features = ["regex"] } +spider = { version = "2", features = ["regex"] } ``` ```rust,no_run @@ -228,7 +228,7 @@ If you are performing large workloads you may need to control the crawler by ena ```toml [dependencies] -spider = { version = "2.0.12", features = ["control"] } +spider = { version = "2", features = ["control"] } ``` ```rust @@ -298,7 +298,7 @@ Use cron jobs to run crawls continuously at anytime. ```toml [dependencies] -spider = { version = "2.0.12", features = ["sync", "cron"] } +spider = { version = "2", features = ["sync", "cron"] } ``` ```rust,no_run @@ -337,7 +337,7 @@ the feature flag [`chrome_intercept`] to possibly speed up request using Network ```toml [dependencies] -spider = { version = "2.0.12", features = ["chrome", "chrome_intercept"] } +spider = { version = "2", features = ["chrome", "chrome_intercept"] } ``` You can use `website.crawl_concurrent_raw` to perform a crawl without chromium when needed. Use the feature flag `chrome_headed` to enable headful browser usage if needed to debug. diff --git a/spider/src/utils/mod.rs b/spider/src/utils/mod.rs index 012ea7860..123f97521 100644 --- a/spider/src/utils/mod.rs +++ b/spider/src/utils/mod.rs @@ -852,6 +852,9 @@ async fn perform_smart_mouse_movement(page: &chromiumoxide::Page) { #[cfg(all(not(feature = "real_browser"), feature = "smart"))] async fn perform_smart_mouse_movement(_page: &chromiumoxide::Page) {} +#[cfg(all(not(feature = "real_browser"), not(feature = "smart")))] +async fn perform_smart_mouse_movement(_page: &chromiumoxide::Page) {} + #[cfg(feature = "chrome")] /// Perform a network request to a resource extracting all content as text streaming via chrome. pub async fn fetch_page_html_chrome_base( diff --git a/spider/src/website.rs b/spider/src/website.rs index e0cd0f606..ce81d5322 100644 --- a/spider/src/website.rs +++ b/spider/src/website.rs @@ -2463,6 +2463,29 @@ impl Website { ) .await; + let mut retry_count = shared.5.retry; + + while page.should_retry && retry_count > 0 { + if let Some(timeout) = page.get_timeout() { + tokio::time::sleep(timeout).await; + } + page.clone_from( + &Page::new( + &target_url, + &shared.0, + &new_page, + &shared.5.wait_for, + &shared.5.screenshot, + false, + &shared.5.openai_config, + &shared.5.execution_scripts, + &shared.5.automation_scripts, + ) + .await, + ); + retry_count -= 1; + } + if add_external { page.set_external( shared @@ -2730,6 +2753,30 @@ impl Website { ) .await; + let mut retry_count = shared.6.retry; + + while page.should_retry && retry_count > 0 { + if let Some(timeout) = page.get_timeout() { + tokio::time::sleep(timeout).await; + } + page.clone_from( + &Page::new( + &target_url, + &shared.0, + &new_page, + &shared.6.wait_for, + &shared.6.screenshot, + false, + &shared.6.openai_config, + &shared.6.execution_scripts, + &shared.6.automation_scripts, + ) + .await, + ); + retry_count -= 1; + } + + match intercept_handle { Some(h) => { let _ = h.await; diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml index 46cff02e0..20c8c5eb2 100644 --- a/spider_chrome/Cargo.toml +++ b/spider_chrome/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_chrome" -version = "2.8.5" +version = "2.8.7" rust-version = "1.70" authors = [ "j-mendez " diff --git a/spider_chrome/src/page.rs b/spider_chrome/src/page.rs index b0fa99249..9a04b0244 100644 --- a/spider_chrome/src/page.rs +++ b/spider_chrome/src/page.rs @@ -156,10 +156,10 @@ impl Page { Ok(()) } - /// Sets `window.chrome` on frame creation + /// Sets `window.chrome` on frame creation and console.log methods. async fn hide_chrome(&self) -> Result<(), CdpError> { self.execute(AddScriptToEvaluateOnNewDocumentParams { - source: "window.chrome = { runtime: {} };".to_string(), + source: "window.chrome = { runtime: {} };['log', 'warn', 'error', 'info', 'debug', 'table'].forEach((method) => { console[method] = () => {}; });".to_string(), world_name: None, include_command_line_api: None, run_immediately: None, diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index f6ca28560..082150c2c 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "2.8.5" +version = "2.8.7" authors = [ "j-mendez " ] diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml index 5a7de0cde..15e486474 100644 --- a/spider_transformations/Cargo.toml +++ b/spider_transformations/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_transformations" -version = "2.8.5" +version = "2.8.7" authors = [ "j-mendez " ] diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml index e1dcfea72..e8e2cd0f9 100644 --- a/spider_utils/Cargo.toml +++ b/spider_utils/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_utils" -version = "2.8.5" +version = "2.8.7" authors = [ "j-mendez " ] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index 92a4530de..fcc2ac7a2 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "2.8.5" +version = "2.8.7" authors = [ "j-mendez " ]