diff --git a/Cargo.lock b/Cargo.lock
index a837fdd6f..2535a3f66 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3677,7 +3677,7 @@ dependencies = [
 
 [[package]]
 name = "spider"
-version = "2.6.16"
+version = "2.6.18"
 dependencies = [
  "adblock",
  "ahash",
@@ -3734,7 +3734,7 @@ dependencies = [
 
 [[package]]
 name = "spider_cli"
-version = "2.6.16"
+version = "2.6.18"
 dependencies = [
  "clap",
  "env_logger",
@@ -3746,7 +3746,7 @@ dependencies = [
 
 [[package]]
 name = "spider_examples"
-version = "2.6.16"
+version = "2.6.18"
 dependencies = [
  "convert_case 0.6.0",
  "env_logger",
@@ -3758,7 +3758,7 @@ dependencies = [
 
 [[package]]
 name = "spider_transformations"
-version = "2.6.16"
+version = "2.6.18"
 dependencies = [
  "aho-corasick",
  "fast_html2md",
@@ -3777,7 +3777,7 @@ dependencies = [
 
 [[package]]
 name = "spider_utils"
-version = "2.6.16"
+version = "2.6.18"
 dependencies = [
  "indexmap 1.9.3",
  "spider",
@@ -3786,7 +3786,7 @@ dependencies = [
 
 [[package]]
 name = "spider_worker"
-version = "2.6.16"
+version = "2.6.18"
 dependencies = [
  "env_logger",
  "lazy_static",
diff --git a/examples/Cargo.toml b/examples/Cargo.toml
index a345759ce..b79098bae 100644
--- a/examples/Cargo.toml
+++ b/examples/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_examples"
-version = "2.6.16"
+version = "2.6.18"
 authors = [
     "j-mendez ",
 ]
diff --git a/spider/Cargo.toml b/spider/Cargo.toml
index 3d47b342a..5816aceda 100644
--- a/spider/Cargo.toml
+++ b/spider/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider"
-version = "2.6.16"
+version = "2.6.18"
 authors = [
     "j-mendez "
 ]
diff --git a/spider/src/configuration.rs b/spider/src/configuration.rs
index b64197cde..dab0572ee 100644
--- a/spider/src/configuration.rs
+++ b/spider/src/configuration.rs
@@ -154,6 +154,8 @@ pub struct Configuration {
     pub shared_queue: bool,
     /// Return the page links in the subscription channels. This does nothing without the flag `sync` enabled.
     pub return_page_links: bool,
+    /// Run the initial links first before concurrently crawling.
+    pub run_initial_links: bool,
     /// The blacklist urls.
     blacklist: AllowList,
     /// The whitelist urls.
@@ -363,6 +365,12 @@ impl Configuration {
         self
     }
 
+    /// Run the initial drained links before concurrently crawling. Make sure to set the crawl limit equal to the extra_links count in order to run first.
+    pub fn with_run_initial_drain(&mut self, run_initial_drain: bool) -> &mut Self {
+        self.run_initial_links = run_initial_drain;
+        self
+    }
+
     /// Preserve the HOST header.
     pub fn with_preserve_host_header(&mut self, preserve: bool) -> &mut Self {
         self.preserve_host_header = preserve;
diff --git a/spider/src/website.rs b/spider/src/website.rs
index ec3882410..e1d7d1f88 100644
--- a/spider/src/website.rs
+++ b/spider/src/website.rs
@@ -563,7 +563,9 @@ impl Website {
     }
 
     /// Drain the links visited.
-    pub fn drain_links(&mut self) -> hashbrown::hash_set::Drain<'_, string_interner::symbol::SymbolUsize> {
+    pub fn drain_links(
+        &mut self,
+    ) -> hashbrown::hash_set::Drain<'_, string_interner::symbol::SymbolUsize> {
         self.links_visited.drain()
     }
 
@@ -2036,9 +2038,14 @@
         self.start();
         match self.setup_selectors() {
             Some(mut selector) => {
+                let mut exact_budget: usize = 0;
+
                 if match self.configuration.inner_budget {
                     Some(ref b) => match b.get(&*WILD_CARD_PATH) {
-                        Some(b) => b.eq(&1),
+                        Some(b) => {
+                            exact_budget = *b as usize;
+                            b.eq(&1)
+                        }
                         _ => false,
                     },
                     _ => false,
@@ -2055,7 +2062,14 @@
                     Arc::new(Semaphore::const_new(*DEFAULT_PERMITS))
                 };
 
+                // we are running an exact crawl to match the ended pages.
+                let mut post_poned_initial =
+                    self.configuration.run_initial_links && exact_budget.eq(&(links.len() - 1));
+
+                if !post_poned_initial {
+                    links.extend(self._crawl_establish(client, &mut selector, false).await);
+                }
+
                 self.configuration.configure_allowlist();
                 let on_link_find_callback = self.on_link_find_callback;
                 let full_resources = self.configuration.full_resources;
@@ -2177,6 +2191,14 @@
                     };
                 }
 
+                if post_poned_initial {
+                    links.extend(
+                        self._crawl_establish(client, &mut shared.1.clone(), false)
+                            .await,
+                    );
+                    post_poned_initial = false;
+                }
+
                 if links.is_empty() {
                     break;
                 }
@@ -3725,6 +3747,12 @@
         self
     }
 
+    /// Run the initial drained links before concurrently crawling. Make sure to set the crawl limit equal to the extra_links count in order to run first.
+    pub fn with_run_initial_drain(&mut self, run_initial_drain: bool) -> &mut Self {
+        self.configuration.with_run_initial_drain(run_initial_drain);
+        self
+    }
+
     /// Preserve the HOST header.
     pub fn with_preserve_host_header(&mut self, preserve: bool) -> &mut Self {
         self.configuration.with_preserve_host_header(preserve);
diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml
index 771b19375..42a82b51c 100644
--- a/spider_cli/Cargo.toml
+++ b/spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_cli"
-version = "2.6.16"
+version = "2.6.18"
 authors = [
     "j-mendez "
 ]
diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml
index d168b4b85..55c569d1a 100644
--- a/spider_transformations/Cargo.toml
+++ b/spider_transformations/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_transformations"
-version = "2.6.16"
+version = "2.6.18"
 authors = [
     "j-mendez "
 ]
diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml
index 9622f6bbb..47fd6523f 100644
--- a/spider_utils/Cargo.toml
+++ b/spider_utils/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_utils"
-version = "2.6.16"
+version = "2.6.18"
 authors = [
     "j-mendez "
 ]
diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml
index 15e28cce4..b885df00e 100644
--- a/spider_worker/Cargo.toml
+++ b/spider_worker/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_worker"
-version = "2.6.16"
+version = "2.6.18"
 authors = [
     "j-mendez "
 ]
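
Usage note: a minimal sketch of how the new flag might be wired up through the builder API. It assumes the crate's existing `Website::new`, `with_budget`, `crawl`, and `drain_links` methods, the `spider::hashbrown` re-export, and tokio as the async runtime; the target URL and the budget value of 8 are illustrative only, and the seeding of extra links (whose count, per the doc comment, should match the crawl limit) is elided here.

```rust
use spider::hashbrown::HashMap;
use spider::website::Website;

#[tokio::main]
async fn main() {
    // Hypothetical target URL for illustration.
    let mut website = Website::new("https://example.com");

    website
        // The wildcard budget feeds the new `exact_budget` check in
        // `crawl_concurrent`: the initial fetch is postponed only when the
        // budget equals the number of seeded (extra) links.
        .with_budget(Some(HashMap::from([("*", 8)])))
        // New in 2.6.18: process the already-seeded links before
        // establishing the crawl on the start URL.
        .with_run_initial_drain(true);

    website.crawl().await;

    // `drain_links` (reformatted above) empties and returns the visited set.
    println!("visited {} links", website.drain_links().count());
}
```

Per the `crawl_concurrent` changes above, when the counts line up `_crawl_establish` runs for the start URL only after the first drained batch is processed, and `post_poned_initial` is then cleared so the postponed fetch happens exactly once.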