chore(sync): fix initial link sub capture
j-mendez committed Aug 27, 2023
1 parent ffe30f5 commit c6391aa
Showing 9 changed files with 104 additions and 29 deletions.
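The change threads each freshly fetched `Page` into the subscription broadcast channel at the crawl entry points where links are first extracted, so subscribers also receive the initial page rather than only the pages discovered later in the crawl. A simplified sketch of the pattern repeated throughout `spider/src/website.rs` (not the exact crate code; the send result is deliberately ignored):

```rust
// Simplified sketch of the pattern this commit adds after a page is fetched
// and its links are extracted (illustrative only, not the exact crate code).
if let Some(c) = &self.channel {
    // A failed send only means no subscriber is currently listening,
    // so the result is ignored.
    let _ = c.0.send(page.clone());
}
```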
8 changes: 4 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default.

4 changes: 2 additions & 2 deletions examples/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_examples"
version = "1.36.1"
version = "1.36.3"
authors = ["madeindjs <[email protected]>", "j-mendez <[email protected]>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
@@ -22,7 +22,7 @@ htr = "0.5.27"
flexbuffers = "2.0.0"

[dependencies.spider]
version = "1.36.1"
version = "1.36.3"
path = "../spider"
features = ["serde"]

2 changes: 0 additions & 2 deletions examples/subscribe.rs
@@ -16,6 +16,4 @@ async fn main() {
});

website.crawl().await;

let _ = join_handle.await.unwrap();
}
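With pages now broadcast during the crawl itself, the example simply spawns the receiver and awaits the crawl; it no longer waits on the receiver task. A minimal standalone sketch of such a subscriber (assuming the `sync` feature is enabled; the real example file may differ):

```rust
// Sketch of a subscriber, assuming the `sync` feature is enabled.
use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    let mut website: Website = Website::new("https://example.com");
    // `subscribe` returns a broadcast receiver when the `sync` feature is on.
    let mut rx = website.subscribe(16).unwrap();

    tokio::spawn(async move {
        while let Ok(page) = rx.recv().await {
            println!("received {}", page.get_url());
        }
    });

    // The receiver task ends on its own once the channel senders are dropped.
    website.crawl().await;
}
```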
2 changes: 1 addition & 1 deletion spider/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "1.36.1"
version = "1.36.3"
authors = ["madeindjs <[email protected]>", "j-mendez <[email protected]>"]
description = "The fastest web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
12 changes: 6 additions & 6 deletions spider/README.md
@@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.tom

```toml
[dependencies]
spider = "1.36.1"
spider = "1.36.3"
```

And then the code:
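The body of the README example is collapsed in this diff view; a minimal sketch of basic usage, based only on the APIs exercised elsewhere in this commit (`Website::new`, `crawl`, `get_links`):

```rust
// Minimal crawl sketch; the URL and output are illustrative only.
use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    let mut website: Website = Website::new("https://choosealicense.com");
    website.crawl().await;
    // `get_links` exposes the set of URLs visited during the crawl.
    println!("visited {} links", website.get_links().len());
}
```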
@@ -87,7 +87,7 @@ We have a couple optional feature flags. Regex blacklisting, jemaloc backend, gl

```toml
[dependencies]
spider = { version = "1.36.1", features = ["regex", "ua_generator"] }
spider = { version = "1.36.3", features = ["regex", "ua_generator"] }
```

1. `ua_generator`: Enables auto generating a random real User-Agent.
@@ -109,7 +109,7 @@ Move processing to a worker, drastically increases performance even if worker is

```toml
[dependencies]
spider = { version = "1.36.1", features = ["decentralized"] }
spider = { version = "1.36.3", features = ["decentralized"] }
```

```sh
@@ -130,7 +130,7 @@ Use the subscribe method to get a broadcast channel.

```toml
[dependencies]
spider = { version = "1.36.1", features = ["sync"] }
spider = { version = "1.36.3", features = ["sync"] }
```

```rust,no_run
@@ -160,7 +160,7 @@ Allow regex for blacklisting routes

```toml
[dependencies]
spider = { version = "1.36.1", features = ["regex"] }
spider = { version = "1.36.3", features = ["regex"] }
```

```rust,no_run
@@ -187,7 +187,7 @@ If you are performing large workloads you may need to control the crawler by ena

```toml
[dependencies]
spider = { version = "1.36.1", features = ["control"] }
spider = { version = "1.36.3", features = ["control"] }
```

```rust
95 changes: 86 additions & 9 deletions spider/src/website.rs
@@ -523,7 +523,18 @@ impl Website {
_ => *self.domain.clone(),
});

HashSet::from(page.links(&base).await)
let links = HashSet::from(page.links(&base).await);

match &self.channel {
Some(c) => {
match c.0.send(page) {
_ => (),
};
}
_ => (),
};

links
} else {
HashSet::new()
};
@@ -564,7 +575,18 @@ impl Website {
_ => *self.domain.to_owned(),
});

HashSet::from(page.links)
let page_links = HashSet::from(page.links);

match &self.channel {
Some(c) => {
match c.0.send(page.clone()) {
_ => (),
};
}
_ => (),
};

page_links
} else {
HashSet::new()
};
@@ -617,7 +639,18 @@ impl Website {

self.links_visited.insert(link_result.0);

links.extend(HashSet::from(page.links));
match &self.channel {
Some(c) => {
match c.0.send(page.clone()) {
_ => (),
};
}
_ => (),
};

let page_links = HashSet::from(page.links);

links.extend(page_links);
}
}

@@ -658,8 +691,18 @@ impl Website {
};

self.links_visited.insert(link_result);
let page_links = HashSet::from(page.links(&base).await);

links.extend(page_links);

links.extend(HashSet::from(page.links(&base).await));
match &self.channel {
Some(c) => {
match c.0.send(page) {
_ => (),
};
}
_ => (),
};
}
}

@@ -706,7 +749,7 @@ impl Website {
let shared = Arc::new((
client,
unsafe { selectors.unwrap_unchecked() },
self.channel.clone()
self.channel.clone(),
));

let mut links: HashSet<CaseInsensitiveString> =
@@ -960,7 +1003,6 @@ impl Website {
new_links.extend(page_links);
task::yield_now().await;


match &channel {
Some(c) => {
match c.0.send(page) {
@@ -969,7 +1011,6 @@ impl Website {
}
_ => (),
};

}

links.clone_from(&(&new_links - &self.links_visited));
@@ -1173,14 +1214,19 @@ impl Website {

/// Setup subscription for data.
#[cfg(not(feature = "sync"))]
pub fn subscribe(&mut self, capacity: usize) -> Option<Arc<(broadcast::Sender<Page>, broadcast::Receiver<Page>)>> {
pub fn subscribe(
&mut self,
capacity: usize,
) -> Option<Arc<(broadcast::Sender<Page>, broadcast::Receiver<Page>)>> {
None
}

/// Setup subscription for data.
#[cfg(feature = "sync")]
pub fn subscribe(&mut self, capacity: usize) -> Option<broadcast::Receiver<Page>> {
let channel = self.channel.get_or_insert(Arc::new(broadcast::channel(capacity.max(1))));
let channel = self
.channel
.get_or_insert(Arc::new(broadcast::channel(capacity.max(1))));
let channel = channel.clone();

let rx2 = channel.0.subscribe();
@@ -1417,6 +1463,37 @@ async fn test_crawl_tld() {
);
}

#[tokio::test]
#[cfg(feature = "sync")]
async fn test_crawl_subscription() {
let mut website: Website = Website::new("https://choosealicense.com");
let mut rx2 = website.subscribe(100).unwrap();
let count = Arc::new(tokio::sync::Mutex::new(0));
let count1 = count.clone();

tokio::spawn(async move {
while let Ok(res) = rx2.recv().await {
let mut lock = count1.lock().await;
*lock += 1;
assert!(
res.get_url().starts_with(&"https://choosealicense.com/"),
"{:?}",
true
);
}
});

website.crawl().await;
let website_links = website.get_links().len();

// Nothing is broadcast if the crawl did not get past the root request; the root page is always captured in links.
if website_links == 1 {
assert!(*count.lock().await == 0, "{:?}", true);
} else {
assert!(*count.lock().await == website_links, "{:?}", true);
}
}

#[cfg(all(feature = "socks", not(feature = "decentralized")))]
#[tokio::test]
async fn test_crawl_proxy() {
4 changes: 2 additions & 2 deletions spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "1.36.1"
version = "1.36.3"
authors = ["madeindjs <[email protected]>", "j-mendez <[email protected]>"]
description = "The fastest web crawler CLI written in Rust."
repository = "https://github.com/spider-rs/spider"
@@ -26,7 +26,7 @@ quote = "1.0.18"
failure_derive = "0.1.8"

[dependencies.spider]
version = "1.36.1"
version = "1.36.3"
path = "../spider"

[[bin]]
2 changes: 1 addition & 1 deletion spider_cli/README.md
@@ -40,7 +40,7 @@ spider --domain http://localhost:3000 download
```

```sh
spider_cli 1.36.1
spider_cli 1.36.3
madeindjs <[email protected]>, j-mendez <[email protected]>
The fastest web crawler CLI written in Rust.

4 changes: 2 additions & 2 deletions spider_worker/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "1.36.1"
version = "1.36.3"
authors = ["madeindjs <[email protected]>", "j-mendez <[email protected]>"]
description = "The fastest web crawler CLI written in Rust."
repository = "https://github.com/spider-rs/spider"
@@ -22,7 +22,7 @@ lazy_static = "1.4.0"
env_logger = "0.10.0"

[dependencies.spider]
version = "1.36.1"
version = "1.36.3"
path = "../spider"
features = ["serde", "flexbuffers"]

