Skip to content

Commit

Permalink
chore(raw): add with_run_initial_drain config
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Sep 26, 2024
1 parent 9320a4b commit 7933166
Show file tree
Hide file tree
Showing 9 changed files with 51 additions and 15 deletions.
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion examples/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_examples"
version = "2.6.16"
version = "2.6.18"
authors = [
"j-mendez <[email protected]>",
]
Expand Down
2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.6.16"
version = "2.6.18"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
8 changes: 8 additions & 0 deletions spider/src/configuration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,8 @@ pub struct Configuration {
pub shared_queue: bool,
/// Return the page links in the subscription channels. This does nothing without the flag `sync` enabled.
pub return_page_links: bool,
/// Run the initial links first before concurrently crawling.
pub run_initial_links: bool,
/// The blacklist urls.
blacklist: AllowList,
/// The whitelist urls.
Expand Down Expand Up @@ -363,6 +365,12 @@ impl Configuration {
self
}

/// Run the initial drained links before concurrently crawling.
///
/// NOTE(review): callers should set the crawl limit equal to the
/// `extra_links` count so the drained batch actually runs first — TODO confirm.
pub fn with_run_initial_drain(&mut self, enabled: bool) -> &mut Self {
    // Stored on the `run_initial_links` field; the setter name follows the
    // builder convention while the field keeps its descriptive name.
    self.run_initial_links = enabled;
    self
}

/// Preserve the HOST header.
pub fn with_preserve_host_header(&mut self, preserve: bool) -> &mut Self {
self.preserve_host_header = preserve;
Expand Down
34 changes: 31 additions & 3 deletions spider/src/website.rs
Original file line number Diff line number Diff line change
Expand Up @@ -563,7 +563,9 @@ impl Website {
}

/// Drain the links visited.
pub fn drain_links(&mut self) -> hashbrown::hash_set::Drain<'_, string_interner::symbol::SymbolUsize> {
pub fn drain_links(
&mut self,
) -> hashbrown::hash_set::Drain<'_, string_interner::symbol::SymbolUsize> {
self.links_visited.drain()
}

Expand Down Expand Up @@ -2036,9 +2038,14 @@ impl Website {
self.start();
match self.setup_selectors() {
Some(mut selector) => {
let mut exact_budget: usize = 0;

if match self.configuration.inner_budget {
Some(ref b) => match b.get(&*WILD_CARD_PATH) {
Some(b) => b.eq(&1),
Some(b) => {
exact_budget = *b as usize;
b.eq(&1)
}
_ => false,
},
_ => false,
Expand All @@ -2055,7 +2062,14 @@ impl Website {
Arc::new(Semaphore::const_new(*DEFAULT_PERMITS))
};

links.extend(self._crawl_establish(client, &mut selector, false).await);
// we are running an exact crawl to match the ended pages.
let mut post_poned_initial =
self.configuration.run_initial_links && exact_budget.eq(&(links.len() - 1));

if !post_poned_initial {
links.extend(self._crawl_establish(client, &mut selector, false).await);
}

self.configuration.configure_allowlist();
let on_link_find_callback = self.on_link_find_callback;
let full_resources = self.configuration.full_resources;
Expand Down Expand Up @@ -2177,6 +2191,14 @@ impl Website {
};
}

if post_poned_initial {
links.extend(
self._crawl_establish(client, &mut shared.1.clone(), false)
.await,
);
post_poned_initial = false;
}

if links.is_empty() {
break;
}
Expand Down Expand Up @@ -3725,6 +3747,12 @@ impl Website {
self
}

/// Run the initial drained links before concurrently crawling.
///
/// Delegates to [`Configuration::with_run_initial_drain`].
/// NOTE(review): the crawl limit should match the `extra_links` count for the
/// initial batch to run first — TODO confirm against the crawl loop.
pub fn with_run_initial_drain(&mut self, enabled: bool) -> &mut Self {
    self.configuration.with_run_initial_drain(enabled);
    self
}

/// Preserve the HOST header.
pub fn with_preserve_host_header(&mut self, preserve: bool) -> &mut Self {
self.configuration.with_preserve_host_header(preserve);
Expand Down
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.6.16"
version = "2.6.18"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.6.16"
version = "2.6.18"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.6.16"
version = "2.6.18"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "2.6.16"
version = "2.6.18"
authors = [
"j-mendez <[email protected]>"
]
Expand Down

0 comments on commit 7933166

Please sign in to comment.