
Commit

chore(crawl): remove crawl sync api
j-mendez committed Sep 11, 2023
1 parent 1e28e45 commit 99b70aa
Showing 11 changed files with 199 additions and 332 deletions.
8 changes: 4 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default.

4 changes: 2 additions & 2 deletions examples/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_examples"
version = "1.39.4"
version = "1.40.1"
authors = ["madeindjs <[email protected]>", "j-mendez <[email protected]>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
@@ -22,7 +22,7 @@ htr = "0.5.27"
flexbuffers = "2.0.0"

[dependencies.spider]
version = "1.39.4"
version = "1.40.1"
path = "../spider"
features = ["serde"]

2 changes: 1 addition & 1 deletion spider/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "1.39.4"
version = "1.40.1"
authors = ["madeindjs <[email protected]>", "j-mendez <[email protected]>"]
description = "The fastest web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
22 changes: 6 additions & 16 deletions spider/README.md
@@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.toml`

```toml
[dependencies]
spider = "1.39.4"
spider = "1.40.1"
```

And then the code:
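The example body itself is collapsed in this diff view. As a rough reference only, a minimal sketch of the basic async crawl described above, assuming the `Website::new`, `crawl`, and `get_links` APIs and the crate's `tokio` re-export (the URL is illustrative):

```rust
use spider::website::Website;
use spider::tokio;

#[tokio::main]
async fn main() {
    // Crawl the site concurrently and collect the discovered links.
    let mut website: Website = Website::new("https://choosealicense.com");
    website.crawl().await;

    for link in website.get_links() {
        println!("- {:?}", link.as_ref());
    }
}
```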
@@ -87,7 +87,7 @@ We have a couple optional feature flags. Regex blacklisting, jemaloc backend, gl

```toml
[dependencies]
spider = { version = "1.39.4", features = ["regex", "ua_generator"] }
spider = { version = "1.40.1", features = ["regex", "ua_generator"] }
```

1. `ua_generator`: Enables auto generating a random real User-Agent.
@@ -112,7 +112,7 @@ Move processing to a worker, drastically increases performance even if worker is

```toml
[dependencies]
spider = { version = "1.39.4", features = ["decentralized"] }
spider = { version = "1.40.1", features = ["decentralized"] }
```

@@ -133,7 +133,7 @@ Use the subscribe method to get a broadcast channel.

```toml
[dependencies]
spider = { version = "1.39.4", features = ["sync"] }
spider = { version = "1.40.1", features = ["sync"] }
```

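The accompanying example is collapsed here as well. A minimal sketch of consuming the broadcast channel, assuming `subscribe` returns a tokio broadcast receiver of crawled pages and that `Page::get_url` is available (the capacity and URL are illustrative):

```rust
use spider::website::Website;
use spider::tokio;

#[tokio::main]
async fn main() {
    let mut website: Website = Website::new("https://choosealicense.com");

    // Subscribe before crawling; the receiver yields each page as it is processed.
    let mut rx = website.subscribe(16).unwrap();

    tokio::spawn(async move {
        while let Ok(page) = rx.recv().await {
            println!("{}", page.get_url());
        }
    });

    website.crawl().await;
}
```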
@@ -163,7 +163,7 @@ Allow regex for blacklisting routes

```toml
[dependencies]
spider = { version = "1.39.4", features = ["regex"] }
spider = { version = "1.40.1", features = ["regex"] }
```

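The example body is collapsed in this view. A sketch of regex-based blacklisting, assuming patterns are pushed into `configuration.blacklist_url` as in other releases of this crate (the pattern is illustrative):

```rust
use spider::website::Website;
use spider::tokio;

#[tokio::main]
async fn main() {
    let mut website: Website = Website::new("https://choosealicense.com");

    // With the `regex` feature, blacklist entries are matched as regular expressions.
    website
        .configuration
        .blacklist_url
        .insert(Default::default())
        .push("/licenses/".into());

    website.crawl().await;
}
```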
@@ -190,7 +190,7 @@ If you are performing large workloads you may need to control the crawler by ena

```toml
[dependencies]
spider = { version = "1.39.4", features = ["control"] }
spider = { version = "1.40.1", features = ["control"] }
```

@@ -253,16 +253,6 @@ async fn main() {
}
```

### Sequential

Perform crawls sequential without any concurrency.

```rust
// ..
let mut website: Website = Website::new("https://choosealicense.com");

website.crawl_sync().await;

```
### Blocking
7 changes: 2 additions & 5 deletions spider/src/lib.rs
@@ -11,16 +11,13 @@
//!
//! There are a couple of ways to use Spider:
//!
//! - **Concurrent** is the fastest way to start crawling a web page and
//! typically the most efficient.
//! - **Crawl** starts crawling a web page and
//! perform most work in isolation.
//! - [`crawl`] is used to crawl concurrently.
//! - **Sequential** lets you crawl the web pages one after another respecting delay sequences.
//! - [`crawl_sync`] is used to crawl in sync.
//! - **Scrape** Scrape the page and hold onto the HTML raw string to parse.
//! - [`scrape`] is used to gather the HTML.
//!
//! [`crawl`]: website/struct.Website.html#method.crawl
//! [`crawl_sync`]: website/struct.Website.html#method.crawl_sync
//! [`scrape`]: website/struct.Website.html#method.scrape
//!
//! # Examples
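The crate-level examples are collapsed in this diff. A brief sketch contrasting the two remaining entry points, `crawl` and `scrape`, assuming the `get_links` and `get_pages` accessors (URLs illustrative):

```rust
use spider::website::Website;
use spider::tokio;

#[tokio::main]
async fn main() {
    // `crawl` visits pages concurrently and records the links it finds.
    let mut crawler: Website = Website::new("https://choosealicense.com");
    crawler.crawl().await;
    println!("links: {}", crawler.get_links().len());

    // `scrape` also retains the raw HTML of every visited page for parsing.
    let mut scraper: Website = Website::new("https://choosealicense.com");
    scraper.scrape().await;
    if let Some(pages) = scraper.get_pages() {
        println!("pages held: {}", pages.len());
    }
}
```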
14 changes: 10 additions & 4 deletions spider/src/page.rs
@@ -152,15 +152,21 @@ impl Page {
        )
    }

    #[cfg(not(feature = "decentralized"))]
    /// Instantiate a new page and gather the html repro of standard fetch_page_html.
    pub async fn new_page(url: &str, client: &Client) -> Self {
    #[cfg(feature = "chrome")]
    /// Instantiate a new page and gather the html.
    pub async fn new_page(url: &str, client: &Client, page: &chromiumoxide::Page) -> Self {
        build(
            url,
            crate::utils::fetch_page_html_raw(&url, &client).await,
            crate::utils::fetch_page_html_chrome(&url, &client, &page).await,
        )
    }

    #[cfg(not(feature = "decentralized"))]
    /// Instantiate a new page and gather the html repro of standard fetch_page_html.
    pub async fn new_page(url: &str, client: &Client) -> Self {
        build(url, crate::utils::fetch_page_html_raw(&url, &client).await)
    }

    /// Instantiate a new page and gather the html.
    #[cfg(all(not(feature = "decentralized"), not(feature = "chrome")))]
    pub async fn new(url: &str, client: &Client) -> Self {
54 changes: 52 additions & 2 deletions spider/src/utils.rs
@@ -56,7 +56,6 @@ pub async fn fetch_page_html(
}
}


/// Perform a network request to a resource extracting all content as text streaming.
pub async fn fetch_page_html_raw(target_url: &str, client: &Client) -> Option<bytes::Bytes> {
use crate::bytes::BufMut;
@@ -85,7 +84,6 @@ pub async fn fetch_page_html_raw(target_url: &str, client: &Client) -> Option<bytes::Bytes> {
}
}


#[cfg(all(not(feature = "fs"), not(feature = "chrome")))]
/// Perform a network request to a resource extracting all content as text streaming.
pub async fn fetch_page_html(target_url: &str, client: &Client) -> Option<bytes::Bytes> {
@@ -247,6 +245,58 @@ pub async fn fetch_page_html(target_url: &str, client: &Client) -> Option<bytes::Bytes> {
}
}

#[cfg(feature = "chrome")]
/// Perform a network request to a resource extracting all content as text streaming via chrome.
pub async fn fetch_page_html_chrome(
target_url: &str,
client: &Client,
page: &chromiumoxide::Page,
) -> Option<bytes::Bytes> {
match &page {
page => match page.goto(target_url).await {
Ok(page) => {
let res = page.content().await;
let content = res.unwrap_or_default().into();

// let _ = page.close().await;

Some(content)
}
_ => {
log(
"- error parsing html text defaulting to raw http request {}",
&target_url,
);

use crate::bytes::BufMut;
use bytes::BytesMut;
use tokio_stream::StreamExt;

match client.get(target_url).send().await {
Ok(res) if res.status().is_success() => {
let mut stream = res.bytes_stream();
let mut data: BytesMut = BytesMut::new();

while let Some(item) = stream.next().await {
match item {
Ok(text) => data.put(text),
_ => (),
}
}

Some(data.into())
}
Ok(_) => None,
Err(_) => {
log("- error parsing html text {}", &target_url);
None
}
}
}
},
}
}

/// log to console if configuration verbose.
pub fn log(message: &'static str, data: impl AsRef<str>) {
if log_enabled!(Level::Info) {