chore(crawl): remove crawl sync api

spider-rs · Sep 11, 2023 · b5c5edf · b5c5edf
1 parent 1e28e45
commit b5c5edf
Show file tree

Hide file tree

Showing 10 changed files with 82 additions and 201 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/examples/Cargo.toml b/examples/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_examples"
-version = "1.39.4"
+version = "1.40.0"
 authors = ["madeindjs <[email protected]>", "j-mendez <[email protected]>"]
 description = "Multithreaded web crawler written in Rust."
 repository = "https://github.com/spider-rs/spider"
@@ -22,7 +22,7 @@ htr = "0.5.27"
 flexbuffers = "2.0.0"
 
 [dependencies.spider]
-version = "1.39.4"
+version = "1.40.0"
 path = "../spider"
 features = ["serde"]
 

diff --git a/spider/Cargo.toml b/spider/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider"
-version = "1.39.4"
+version = "1.40.0"
 authors = ["madeindjs <[email protected]>", "j-mendez <[email protected]>"]
 description = "The fastest web crawler written in Rust."
 repository = "https://github.com/spider-rs/spider"

diff --git a/spider/README.md b/spider/README.md
@@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.tom
 
 ```toml
 [dependencies]
-spider = "1.39.4"
+spider = "1.40.0"
 ```
 
 And then the code:
@@ -87,7 +87,7 @@ We have a couple optional feature flags. Regex blacklisting, jemaloc backend, gl
 
 ```toml
 [dependencies]
-spider = { version = "1.39.4", features = ["regex", "ua_generator"] }
+spider = { version = "1.40.0", features = ["regex", "ua_generator"] }
 ```
 
 1. `ua_generator`: Enables auto generating a random real User-Agent.
@@ -112,7 +112,7 @@ Move processing to a worker, drastically increases performance even if worker is
 
 ```toml
 [dependencies]
-spider = { version = "1.39.4", features = ["decentralized"] }
+spider = { version = "1.40.0", features = ["decentralized"] }
 ```
 
 ```sh
@@ -133,7 +133,7 @@ Use the subscribe method to get a broadcast channel.
 
 ```toml
 [dependencies]
-spider = { version = "1.39.4", features = ["sync"] }
+spider = { version = "1.40.0", features = ["sync"] }
 ```
 
 ```rust,no_run
@@ -163,7 +163,7 @@ Allow regex for blacklisting routes
 
 ```toml
 [dependencies]
-spider = { version = "1.39.4", features = ["regex"] }
+spider = { version = "1.40.0", features = ["regex"] }
 ```
 
 ```rust,no_run
@@ -190,7 +190,7 @@ If you are performing large workloads you may need to control the crawler by ena
 
 ```toml
 [dependencies]
-spider = { version = "1.39.4", features = ["control"] }
+spider = { version = "1.40.0", features = ["control"] }
 ```
 
 ```rust

diff --git a/spider/src/page.rs b/spider/src/page.rs
@@ -152,6 +152,15 @@ impl Page {
         )
     }
 
+    #[cfg(feature = "chrome")]
+    /// Instantiate a new page and gather the html through chrome.
+    ///
+    /// The html is fetched with `fetch_page_html_chrome` using the provided `page`
+    /// and `client`; the result is passed to `build` along with `url`.
+    pub async fn new_page(url: &str, client: &Client, page: &chromiumoxide::Page) -> Self {
+        build(url, crate::utils::fetch_page_html_chrome(url, client, page).await)
+    }
+
     #[cfg(not(feature = "decentralized"))]
     /// Instantiate a new page and gather the html repro of standard fetch_page_html.
     pub async fn new_page(url: &str, client: &Client) -> Self {

diff --git a/spider/src/utils.rs b/spider/src/utils.rs
@@ -56,7 +56,6 @@ pub async fn fetch_page_html(
     }
 }
 
-
 /// Perform a network request to a resource extracting all content as text streaming.
 pub async fn fetch_page_html_raw(target_url: &str, client: &Client) -> Option<bytes::Bytes> {
     use crate::bytes::BufMut;
@@ -85,7 +84,6 @@ pub async fn fetch_page_html_raw(target_url: &str, client: &Client) -> Option<by
     }
 }
 
-
 #[cfg(all(not(feature = "fs"), not(feature = "chrome")))]
 /// Perform a network request to a resource extracting all content as text streaming.
 pub async fn fetch_page_html(target_url: &str, client: &Client) -> Option<bytes::Bytes> {
@@ -247,6 +245,60 @@ pub async fn fetch_page_html(target_url: &str, client: &Client) -> Option<bytes:
     }
 }
 
+
+
+#[cfg(feature = "chrome")]
+/// Fetch the content of a resource by navigating a chrome page to it.
+///
+/// Navigates `page` to `target_url` and returns the page content as bytes. If
+/// navigation succeeds but the content cannot be read, empty bytes are returned.
+/// If navigation fails, the resource is requested with `client` instead and the
+/// streamed body is returned; `None` is returned when that request errors or
+/// responds with a non-success status.
+pub async fn fetch_page_html_chrome(
+    target_url: &str,
+    client: &Client,
+    page: &chromiumoxide::Page,
+) -> Option<bytes::Bytes> {
+    match page.goto(target_url).await {
+        Ok(page) => {
+            // the page is not closed here; it is borrowed from the caller.
+            Some(page.content().await.unwrap_or_default().into())
+        }
+        _ => {
+            log(
+                "- error parsing html text defaulting to raw http request {}",
+                target_url,
+            );
+
+            use crate::bytes::BufMut;
+            use bytes::BytesMut;
+            use tokio_stream::StreamExt;
+
+            match client.get(target_url).send().await {
+                Ok(res) if res.status().is_success() => {
+                    let mut stream = res.bytes_stream();
+                    let mut data: BytesMut = BytesMut::new();
+
+                    // chunks that fail to download are skipped.
+                    while let Some(item) = stream.next().await {
+                        if let Ok(text) = item {
+                            data.put(text);
+                        }
+                    }
+
+                    Some(data.into())
+                }
+                Ok(_) => None,
+                Err(_) => {
+                    log("- error parsing html text {}", target_url);
+                    None
+                }
+            }
+        }
+    }
+}
+
 /// log to console if configuration verbose.
 pub fn log(message: &'static str, data: impl AsRef<str>) {
     if log_enabled!(Level::Info) {