From b5c5edfa0df1f7d1fd2b6ba4c0af20169a2bf383 Mon Sep 17 00:00:00 2001
From: j-mendez
Date: Sun, 10 Sep 2023 23:39:19 -0400
Subject: [PATCH] chore(crawl): remove crawl sync api

---
 Cargo.lock               |   8 +-
 examples/Cargo.toml      |   4 +-
 spider/Cargo.toml        |   2 +-
 spider/README.md         |  12 +--
 spider/src/page.rs       |   9 ++
 spider/src/utils.rs      |  56 +++++++++++-
 spider/src/website.rs    | 182 +--------------------------------------
 spider_cli/Cargo.toml    |   4 +-
 spider_cli/README.md     |   2 +-
 spider_worker/Cargo.toml |   4 +-
 10 files changed, 82 insertions(+), 201 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 8ca6d7233..7929fe525 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3475,7 +3475,7 @@ dependencies = [
 
 [[package]]
 name = "spider"
-version = "1.39.4"
+version = "1.40.0"
 dependencies = [
  "ahash",
  "bytes",
@@ -3510,7 +3510,7 @@ dependencies = [
 
 [[package]]
 name = "spider_cli"
-version = "1.39.4"
+version = "1.40.0"
 dependencies = [
  "clap 3.2.25",
  "env_logger 0.9.3",
@@ -3522,7 +3522,7 @@ dependencies = [
 
 [[package]]
 name = "spider_examples"
-version = "1.39.4"
+version = "1.40.0"
 dependencies = [
  "convert_case",
  "env_logger 0.9.3",
@@ -3543,7 +3543,7 @@ dependencies = [
 
 [[package]]
 name = "spider_worker"
-version = "1.39.4"
+version = "1.40.0"
 dependencies = [
  "env_logger 0.10.0",
  "lazy_static",
diff --git a/examples/Cargo.toml b/examples/Cargo.toml
index 0d84bbb10..87d29cd00 100644
--- a/examples/Cargo.toml
+++ b/examples/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_examples"
-version = "1.39.4"
+version = "1.40.0"
 authors = ["madeindjs ", "j-mendez "]
 description = "Multithreaded web crawler written in Rust."
 repository = "https://github.com/spider-rs/spider"
@@ -22,7 +22,7 @@ htr = "0.5.27"
 flexbuffers = "2.0.0"
 
 [dependencies.spider]
-version = "1.39.4"
+version = "1.40.0"
 path = "../spider"
 features = ["serde"]
 
diff --git a/spider/Cargo.toml b/spider/Cargo.toml
index 3a8e02a11..d4dc862a9 100644
--- a/spider/Cargo.toml
+++ b/spider/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider"
-version = "1.39.4"
+version = "1.40.0"
 authors = ["madeindjs ", "j-mendez "]
 description = "The fastest web crawler written in Rust."
 repository = "https://github.com/spider-rs/spider"
diff --git a/spider/README.md b/spider/README.md
index 637061331..b0c191ff4 100644
--- a/spider/README.md
+++ b/spider/README.md
@@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.tom
 
 ```toml
 [dependencies]
-spider = "1.39.4"
+spider = "1.40.0"
 ```
 
 And then the code:
@@ -87,7 +87,7 @@ We have a couple optional feature flags. Regex blacklisting, jemaloc backend, gl
 
 ```toml
 [dependencies]
-spider = { version = "1.39.4", features = ["regex", "ua_generator"] }
+spider = { version = "1.40.0", features = ["regex", "ua_generator"] }
 ```
 
 1. `ua_generator`: Enables auto generating a random real User-Agent.
@@ -112,7 +112,7 @@ Move processing to a worker, drastically increases performance even if worker is
 
 ```toml
 [dependencies]
-spider = { version = "1.39.4", features = ["decentralized"] }
+spider = { version = "1.40.0", features = ["decentralized"] }
 ```
 
 ```sh
@@ -133,7 +133,7 @@ Use the subscribe method to get a broadcast channel.
 
 ```toml
 [dependencies]
-spider = { version = "1.39.4", features = ["sync"] }
+spider = { version = "1.40.0", features = ["sync"] }
 ```
 
 ```rust,no_run
@@ -163,7 +163,7 @@ Allow regex for blacklisting routes
 
 ```toml
 [dependencies]
-spider = { version = "1.39.4", features = ["regex"] }
+spider = { version = "1.40.0", features = ["regex"] }
 ```
 
 ```rust,no_run
@@ -190,7 +190,7 @@ If you are performing large workloads you may need to control the crawler by ena
 
 ```toml
 [dependencies]
-spider = { version = "1.39.4", features = ["control"] }
+spider = { version = "1.40.0", features = ["control"] }
 ```
 
 ```rust
diff --git a/spider/src/page.rs b/spider/src/page.rs
index 1286f2319..fb45a5310 100644
--- a/spider/src/page.rs
+++ b/spider/src/page.rs
@@ -152,6 +152,15 @@ impl Page {
         )
     }
 
+    #[cfg(feature = "chrome")]
+    /// Instantiate a new page and gather the html.
+    pub async fn new_page(url: &str, client: &Client, page: &chromiumoxide::Page) -> Self {
+        build(
+            url,
+            crate::utils::fetch_page_html_chrome(&url, &client, &page).await,
+        )
+    }
+
     #[cfg(not(feature = "decentralized"))]
     /// Instantiate a new page and gather the html repro of standard fetch_page_html.
     pub async fn new_page(url: &str, client: &Client) -> Self {
diff --git a/spider/src/utils.rs b/spider/src/utils.rs
index b8237538b..142bffc1a 100644
--- a/spider/src/utils.rs
+++ b/spider/src/utils.rs
@@ -56,7 +56,6 @@ pub async fn fetch_page_html(
     }
 }
 
-
 /// Perform a network request to a resource extracting all content as text streaming.
 pub async fn fetch_page_html_raw(target_url: &str, client: &Client) -> Option<bytes::Bytes> {
     use crate::bytes::BufMut;
@@ -85,7 +84,6 @@ pub async fn fetch_page_html_raw(target_url: &str, client: &Client) -> Option
 pub async fn fetch_page_html(target_url: &str, client: &Client) -> Option<bytes::Bytes> {
@@ -247,6 +245,60 @@ pub async fn fetch_page_html(target_url: &str, client: &Client) -> Option
+pub async fn fetch_page_html_chrome(target_url: &str, client: &Client, page: &chromiumoxide::Page) -> Option<bytes::Bytes> {
+    match &page {
+        page => match page.goto(target_url).await {
+            Ok(page) => {
+                let res = page.content().await;
+                let content = res.unwrap_or_default().into();
+
+                // let _ = page.close().await;
+
+                Some(content)
+            }
+            _ => {
+                log(
+                    "- error parsing html text defaulting to raw http request {}",
+                    &target_url,
+                );
+
+                use crate::bytes::BufMut;
+                use bytes::BytesMut;
+                use tokio_stream::StreamExt;
+
+                match client.get(target_url).send().await {
+                    Ok(res) if res.status().is_success() => {
+                        let mut stream = res.bytes_stream();
+                        let mut data: BytesMut = BytesMut::new();
+
+                        while let Some(item) = stream.next().await {
+                            match item {
+                                Ok(text) => data.put(text),
+                                _ => (),
+                            }
+                        }
+
+                        Some(data.into())
+                    }
+                    Ok(_) => None,
+                    Err(_) => {
+                        log("- error parsing html text {}", &target_url);
+                        None
+                    }
+                }
+            }
+        },
+    }
+}
+
 /// log to console if configuration verbose.
 pub fn log(message: &'static str, data: impl AsRef<str>) {
     if log_enabled!(Level::Info) {
diff --git a/spider/src/website.rs b/spider/src/website.rs
index cf64e2404..3436300f0 100644
--- a/spider/src/website.rs
+++ b/spider/src/website.rs
@@ -856,13 +856,6 @@ impl Website {
         self.crawl_concurrent(client, handle).await;
     }
 
-    /// Start to crawl website in sync
-    pub async fn crawl_sync(&mut self) {
-        let (client, handle) = self.setup().await;
-
-        self.crawl_sequential(&client, handle).await;
-    }
-
     /// Start to scrape/download website with async conccurency
     pub async fn scrape(&mut self) {
         let (client, handle) = self.setup().await;
@@ -1207,179 +1200,6 @@ impl Website {
         }
     }
 
-    #[cfg(feature = "chrome")]
-    /// Start to crawl website sequential
-    async fn crawl_sequential(&mut self, client: &Client, handle: Option>) {
-        let selectors = get_page_selectors(
-            &self.domain.inner(),
-            self.configuration.subdomains,
-            self.configuration.tld,
-        );
-
-        if selectors.is_some() {
-            self.status = CrawlStatus::Active;
-            let blacklist_url = self.configuration.get_blacklist();
-            let selectors = unsafe { selectors.unwrap_unchecked() };
-            let delay = Box::from(self.configuration.delay);
-            let delay_enabled = self.configuration.delay > 0;
-            let on_link_find_callback = self.on_link_find_callback;
-            let mut interval = tokio::time::interval(Duration::from_millis(10));
-
-            let mut new_links: HashSet = HashSet::new();
-
-            let (mut browser, _) = launch_browser().await;
-            let page = Arc::new(browser.new_page("about:blank").await.unwrap());
-
-            let mut links: HashSet =
-                self.crawl_establish(&client, &selectors, false, &page).await;
-
-            let channel = self.channel.clone();
-
-            // crawl while links exists
-            loop {
-                for link in links.iter() {
-                    match handle.as_ref() {
-                        Some(handle) => {
-                            while handle.load(Ordering::Relaxed) == 1 {
-                                interval.tick().await;
-                            }
-                            if handle.load(Ordering::Relaxed) == 2 {
-                                links.clear();
-                                break;
-                            }
-                        }
-                        None => (),
-                    }
-                    if !self.is_allowed(&link, &blacklist_url) {
-                        continue;
-                    }
-                    self.links_visited.insert(link.clone());
-                    log("fetch", link);
-                    if delay_enabled {
-                        tokio::time::sleep(Duration::from_millis(*delay)).await;
-                    }
-                    let link = link.clone();
-                    let link_result = match on_link_find_callback {
-                        Some(cb) => cb(link, None),
-                        _ => (link, None),
-                    };
-                    let page = page.clone();
-                    let page = Page::new(&link_result.0.as_ref(), &client, &page).await;
-                    let page_links = page.links(&selectors).await;
-                    task::yield_now().await;
-                    new_links.extend(page_links);
-                    task::yield_now().await;
-
-                    match &channel {
-                        Some(c) => {
-                            match c.0.send(page) {
-                                _ => (),
-                            };
-                        }
-                        _ => (),
-                    };
-                }
-
-                links.clone_from(&(&new_links - &self.links_visited));
-                new_links.clear();
-                if new_links.capacity() >= 1500 {
-                    new_links.shrink_to_fit();
-                }
-                task::yield_now().await;
-                if links.is_empty() {
-                    break;
-                }
-            }
-
-            self.status = CrawlStatus::Idle;
-            let _ = browser.close().await;
-        }
-    }
-
-
-    #[cfg(not(feature = "chrome"))]
-    /// Start to crawl website sequential
-    async fn crawl_sequential(&mut self, client: &Client, handle: Option>) {
-        let selectors = get_page_selectors(
-            &self.domain.inner(),
-            self.configuration.subdomains,
-            self.configuration.tld,
-        );
-
-        if selectors.is_some() {
-            self.status = CrawlStatus::Active;
-            let blacklist_url = self.configuration.get_blacklist();
-            let selectors = unsafe { selectors.unwrap_unchecked() };
-            let delay = Box::from(self.configuration.delay);
-            let delay_enabled = self.configuration.delay > 0;
-            let on_link_find_callback = self.on_link_find_callback;
-            let mut interval = tokio::time::interval(Duration::from_millis(10));
-
-            let mut new_links: HashSet = HashSet::new();
-            let mut links: HashSet =
-                self.crawl_establish(&client, &selectors, false).await;
-
-            let channel = self.channel.clone();
-
-            // crawl while links exists
-            loop {
-                for link in links.iter() {
-                    match handle.as_ref() {
-                        Some(handle) => {
-                            while handle.load(Ordering::Relaxed) == 1 {
-                                interval.tick().await;
-                            }
-                            if handle.load(Ordering::Relaxed) == 2 {
-                                links.clear();
-                                break;
-                            }
-                        }
-                        None => (),
-                    }
-                    if !self.is_allowed(&link, &blacklist_url) {
-                        continue;
-                    }
-                    self.links_visited.insert(link.clone());
-                    log("fetch", link);
-                    if delay_enabled {
-                        tokio::time::sleep(Duration::from_millis(*delay)).await;
-                    }
-                    let link = link.clone();
-                    let link_result = match on_link_find_callback {
-                        Some(cb) => cb(link, None),
-                        _ => (link, None),
-                    };
-                    let page = Page::new(&link_result.0.as_ref(), &client).await;
-                    let page_links = page.links(&selectors).await;
-                    task::yield_now().await;
-                    new_links.extend(page_links);
-                    task::yield_now().await;
-
-                    match &channel {
-                        Some(c) => {
-                            match c.0.send(page) {
-                                _ => (),
-                            };
-                        }
-                        _ => (),
-                    };
-                }
-
-                links.clone_from(&(&new_links - &self.links_visited));
-                new_links.clear();
-                if new_links.capacity() >= 1500 {
-                    new_links.shrink_to_fit();
-                }
-                task::yield_now().await;
-                if links.is_empty() {
-                    break;
-                }
-            }
-
-            self.status = CrawlStatus::Idle;
-        }
-    }
-
     #[cfg(not(feature = "chrome"))]
     /// Start to scape website concurrently and store html
     async fn scrape_concurrent(&mut self, client: &Client, handle: Option>) {
@@ -1552,7 +1372,7 @@ impl Website {
                     set.spawn(async move {
                         drop(permit);
                         let page =
-                            crate::utils::fetch_page_html(&link.as_ref(), &client, &page).await;
+                            crate::utils::fetch_page_html_chrome(&link.as_ref(), &client, &page).await;
                         let page = build(&link.as_ref(), page);
 
                         let (link, _) = match on_link_find_callback {
diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml
index 495b98ec1..55043353b 100644
--- a/spider_cli/Cargo.toml
+++ b/spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_cli"
-version = "1.39.4"
+version = "1.40.0"
 authors = ["madeindjs ", "j-mendez "]
 description = "The fastest web crawler CLI written in Rust."
 repository = "https://github.com/spider-rs/spider"
@@ -26,7 +26,7 @@ quote = "1.0.18"
 failure_derive = "0.1.8"
 
 [dependencies.spider]
-version = "1.39.4"
+version = "1.40.0"
 path = "../spider"
 
 [[bin]]
diff --git a/spider_cli/README.md b/spider_cli/README.md
index 4eb39293f..1c0b0e81a 100644
--- a/spider_cli/README.md
+++ b/spider_cli/README.md
@@ -40,7 +40,7 @@ spider --domain http://localhost:3000 download -t _temp_spider_downloads
 ```
 
 ```sh
-spider_cli 1.39.4
+spider_cli 1.40.0
 madeindjs , j-mendez
 The fastest web crawler CLI written in Rust.
diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml
index cf51e00bd..7cf3382a5 100644
--- a/spider_worker/Cargo.toml
+++ b/spider_worker/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_worker"
-version = "1.39.4"
+version = "1.40.0"
 authors = ["madeindjs ", "j-mendez "]
 description = "The fastest web crawler CLI written in Rust."
 repository = "https://github.com/spider-rs/spider"
@@ -22,7 +22,7 @@ lazy_static = "1.4.0"
 env_logger = "0.10.0"
 
 [dependencies.spider]
-version = "1.39.4"
+version = "1.40.0"
 path = "../spider"
 features = ["serde", "flexbuffers"]
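Migration note: with `crawl_sync` and both `crawl_sequential` implementations removed, the remaining path is the async `crawl` paired with the `sync` feature's `subscribe` broadcast channel touched in the README hunk above. The sketch below is illustrative and not part of this patch; it assumes the `subscribe(capacity)` and `Page::get_url` API from the README's subscribe section and spider's `tokio` re-export, and the URL and channel capacity are placeholders.

```rust
use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    let mut website: Website = Website::new("https://choosealicense.com");
    // Broadcast receiver of crawled pages; requires the `sync` feature.
    let mut rx = website.subscribe(16).unwrap();

    tokio::spawn(async move {
        while let Ok(page) = rx.recv().await {
            // Handle each page as it arrives, replacing the old sequential loop.
            println!("{}", page.get_url());
        }
    });

    website.crawl().await;
}
```

Handling pages on a separate task keeps the crawl loop unblocked, which is the direction this patch takes by dropping the sequential API.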
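For reviewers, the new `fetch_page_html_chrome` helper in `spider/src/utils.rs` amounts to "navigate the Chrome tab first, fall back to a plain HTTP GET on failure". Below is a standalone sketch of that strategy using `chromiumoxide` and `reqwest` directly; the function name and the simplified non-streaming fallback are illustrative, not spider's internal API.

```rust
use bytes::Bytes;

/// Illustrative sketch of the fallback strategy used by the new helper.
async fn fetch_with_fallback(
    url: &str,
    client: &reqwest::Client,
    page: &chromiumoxide::Page,
) -> Option<Bytes> {
    match page.goto(url).await {
        // Navigation succeeded: take the rendered HTML from the browser tab.
        Ok(tab) => Some(Bytes::from(tab.content().await.unwrap_or_default())),
        // Navigation failed: fall back to a raw HTTP request.
        Err(_) => match client.get(url).send().await {
            Ok(res) if res.status().is_success() => res.bytes().await.ok(),
            _ => None,
        },
    }
}
```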