Skip to content

Commit

Permalink
chore(status): add rate limit and server status tracking
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Oct 7, 2024
1 parent 0ad5800 commit 81fe9cd
Show file tree
Hide file tree
Showing 8 changed files with 33 additions and 17 deletions.
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.8.16"
version = "2.8.17"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
26 changes: 21 additions & 5 deletions spider/src/website.rs
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,10 @@ pub enum CrawlStatus {
Active,
/// The crawl was blocked by a network rate limit, firewall, etc.
Blocked,
/// The crawl failed due to a server error.
ServerError,
/// The crawl was rate limited.
RateLimited,
/// The initial request ran without returning html.
Empty,
/// The URL of the website is invalid. Crawl cannot commence.
Expand Down Expand Up @@ -1323,8 +1327,11 @@ impl Website {

if page.status_code == reqwest::StatusCode::FORBIDDEN && links.len() == 0 {
self.status = CrawlStatus::Blocked;
} else if page.status_code == reqwest::StatusCode::TOO_MANY_REQUESTS {
self.status = CrawlStatus::RateLimited;
} else if page.status_code.is_server_error() {
self.status = CrawlStatus::ServerError;
}

if self.configuration.return_page_links {
page.page_links = if links.is_empty() {
None
Expand Down Expand Up @@ -1473,8 +1480,11 @@ impl Website {

if page.status_code == reqwest::StatusCode::FORBIDDEN && links.len() == 0 {
self.status = CrawlStatus::Blocked;
} else if page.status_code == reqwest::StatusCode::TOO_MANY_REQUESTS {
self.status = CrawlStatus::RateLimited;
} else if page.status_code.is_server_error() {
self.status = CrawlStatus::ServerError;
}

if self.configuration.return_page_links {
page.page_links = if links.is_empty() {
None
Expand Down Expand Up @@ -1655,8 +1665,11 @@ impl Website {

if page.status_code == reqwest::StatusCode::FORBIDDEN && links.len() == 0 {
self.status = CrawlStatus::Blocked;
} else if page.status_code == reqwest::StatusCode::TOO_MANY_REQUESTS {
self.status = CrawlStatus::RateLimited;
} else if page.status_code.is_server_error() {
self.status = CrawlStatus::ServerError;
}

if self.configuration.return_page_links {
page.page_links = if links.is_empty() {
None
Expand Down Expand Up @@ -1709,10 +1722,13 @@ impl Website {
_ => *self.url.to_owned(),
});

if page.status_code == reqwest::StatusCode::FORBIDDEN && page.links.len() == 0 {
if page.status_code == reqwest::StatusCode::FORBIDDEN && links.len() == 0 {
self.status = CrawlStatus::Blocked;
} else if page.status_code == reqwest::StatusCode::TOO_MANY_REQUESTS {
self.status = CrawlStatus::RateLimited;
} else if page.status_code.is_server_error() {
self.status = CrawlStatus::ServerError;
}

let links = HashSet::from(page.links.clone());

if self.configuration.return_page_links {
Expand Down
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.8.16"
version = "2.8.17"
rust-version = "1.70"
authors = [
"j-mendez <[email protected]>"
Expand Down
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.8.16"
version = "2.8.17"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.8.16"
version = "2.8.17"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.8.16"
version = "2.8.17"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "2.8.16"
version = "2.8.17"
authors = [
"j-mendez <[email protected]>"
]
Expand Down

0 comments on commit 81fe9cd

Please sign in to comment.