Commit 88b1b39

chore(chrome): fix proxy handling websockets
j-mendez committed Oct 1, 2024
1 parent 018178d commit 88b1b39
Showing 10 changed files with 90 additions and 36 deletions.
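Before the per-file diffs, a hedged sketch of how this change surfaces to a crate user, assuming spider's builder API (with_proxies, build, crawl, get_links) and a tokio runtime; the proxy URL is a placeholder. With this commit, a configured proxy is also applied to the Chrome browser context, so headless page loads honor it instead of bypassing it:

use spider::website::Website;

#[tokio::main]
async fn main() {
    // The proxy set here is now forwarded to the Chrome browser context,
    // so CDP-driven page loads go through it as well.
    let mut website: Website = Website::new("https://example.com")
        .with_proxies(Some(vec!["http://127.0.0.1:8080".into()]))
        .build()
        .unwrap();

    website.crawl().await;
    println!("{} links found", website.get_links().len());
}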
12 changes: 6 additions & 6 deletions Cargo.lock

Generated lockfile; diff not rendered.

2 changes: 1 addition & 1 deletion examples/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_examples"
-version = "2.6.20"
+version = "2.6.22"
 authors = [
     "j-mendez <[email protected]>",
 ]
2 changes: 1 addition & 1 deletion spider/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider"
-version = "2.6.20"
+version = "2.6.22"
 authors = [
     "j-mendez <[email protected]>"
 ]
21 changes: 17 additions & 4 deletions spider/src/features/chrome.rs
@@ -1,10 +1,10 @@
 use crate::utils::log;
 use crate::{configuration::Configuration, tokio_stream::StreamExt};
+use chromiumoxide::cdp::browser_protocol::browser::BrowserContextId;
+use chromiumoxide::cdp::browser_protocol::target::CreateTargetParams;
 use chromiumoxide::error::CdpError;
 use chromiumoxide::Page;
 use chromiumoxide::{handler::HandlerConfig, Browser, BrowserConfig};
 use std::sync::Arc;
 use tokio::task::JoinHandle;
 
 /// get chrome configuration
@@ -281,13 +281,18 @@ pub async fn attempt_navigation(
     url: &str,
     browser: &Browser,
     request_timeout: &Option<Box<core::time::Duration>>,
+    browser_context_id: &Option<BrowserContextId>,
 ) -> Result<Page, CdpError> {
+    let mut cdp_params = CreateTargetParams::new(url);
+    cdp_params.browser_context_id.clone_from(browser_context_id);
+    cdp_params.background = Some(true);
+    cdp_params.url = url.into();
     let page_result = tokio::time::timeout(
         match request_timeout {
             Some(timeout) => **timeout,
             _ => tokio::time::Duration::from_secs(60),
         },
-        browser.new_page(url),
+        browser.new_page(cdp_params),
     )
     .await;
     match page_result {
@@ -299,9 +304,17 @@
 /// close the browser and open handles
 pub async fn close_browser(
     browser_handle: JoinHandle<()>,
-    _browser: &Browser,
-    _context_id: &mut Option<BrowserContextId>,
+    browser: &Browser,
+    context_id: &mut Option<BrowserContextId>,
 ) {
+    match context_id.take() {
+        Some(id) => {
+            if let Err(er) = browser.dispose_browser_context(id).await {
+                log("CDP Error: ", er.to_string())
+            }
+        }
+        _ => (),
+    }
     if !browser_handle.is_finished() {
         browser_handle.abort();
     }
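To make the new navigation flow concrete, here is a standalone sketch (not part of the commit) that drives the same chromiumoxide calls: create an isolated browser context, open a target bound to it, and dispose of the context on shutdown. It assumes the chromiumoxide, tokio, and futures crates; the URL is a placeholder:

use chromiumoxide::browser::{Browser, BrowserConfig};
use chromiumoxide::cdp::browser_protocol::target::{
    CreateBrowserContextParams, CreateTargetParams,
};
use futures::StreamExt;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Launch headless Chrome; the handler stream must be polled for CDP I/O.
    let (browser, mut handler) = Browser::launch(BrowserConfig::builder().build()?).await?;
    let handler_task = tokio::spawn(async move { while handler.next().await.is_some() {} });

    // Create an isolated context, as configure_browser now does.
    let mut ctx = CreateBrowserContextParams::default();
    ctx.dispose_on_detach = Some(true);
    let context_id = browser.create_browser_context(ctx).await?;

    // Bind the new target to that context, mirroring attempt_navigation,
    // including its 60-second default timeout.
    let mut params = CreateTargetParams::new("https://example.com"); // placeholder URL
    params.browser_context_id = Some(context_id.clone());
    params.background = Some(true);
    let page = tokio::time::timeout(
        core::time::Duration::from_secs(60),
        browser.new_page(params),
    )
    .await??;
    page.wait_for_navigation().await?;

    // Tear down in the same order as close_browser: dispose the context,
    // then stop the handler task.
    browser.dispose_browser_context(context_id).await?;
    handler_task.abort();
    Ok(())
}

Note that dispose_on_detach means contexts created this way also clean themselves up if the browser detaches before the explicit disposal runs.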
14 changes: 11 additions & 3 deletions spider/src/page.rs
@@ -934,6 +934,7 @@ impl Page {
         selectors: &RelativeSelectors,
         browser: &std::sync::Arc<chromiumoxide::Browser>,
         configuration: &crate::configuration::Configuration,
+        context_id: &Option<chromiumoxide::cdp::browser_protocol::browser::BrowserContextId>,
     ) -> HashSet<A> {
         let mut map = HashSet::new();
         let html = self.get_html();
@@ -1028,10 +1029,11 @@ impl Page {
             let browser = browser.to_owned();
             let configuration = configuration.clone();
             let target_url = self.url.clone();
+            let context_id = context_id.clone();
 
             tokio::task::spawn(async move {
                 // we need to use about:blank here since we set the HTML content directly
-                match crate::features::chrome::attempt_navigation("about:blank", &browser, &configuration.request_timeout).await {
+                match crate::features::chrome::attempt_navigation("about:blank", &browser, &configuration.request_timeout, &context_id).await {
                     Ok(new_page) => {
                         match configuration
                             .evaluate_on_new_document
@@ -1371,15 +1373,21 @@ impl Page {
         selectors: &RelativeSelectors,
         page: &std::sync::Arc<chromiumoxide::Browser>,
         configuration: &crate::configuration::Configuration,
+        context_id: &Option<chromiumoxide::cdp::browser_protocol::browser::BrowserContextId>,
     ) -> HashSet<CaseInsensitiveString> {
         match self.html.is_some() {
             false => Default::default(),
             true => {
                 if auto_encoder::is_binary_file(self.get_html_bytes_u8()) {
                     return Default::default();
                 }
-                self.links_stream_smart::<CaseInsensitiveString>(&selectors, page, configuration)
-                    .await
+                self.links_stream_smart::<CaseInsensitiveString>(
+                    &selectors,
+                    page,
+                    configuration,
+                    context_id,
+                )
+                .await
             }
         }
     }
67 changes: 50 additions & 17 deletions spider/src/website.rs
@@ -1480,16 +1480,17 @@ impl Website {
         base: &mut RelativeSelectors,
         _: bool,
         browser: &Arc<chromiumoxide::Browser>,
-        scrape: bool,
+        context_id: &Option<chromiumoxide::cdp::browser_protocol::browser::BrowserContextId>,
     ) -> HashSet<CaseInsensitiveString> {
         let links: HashSet<CaseInsensitiveString> = if self
             .is_allowed_default(&self.get_base_link())
             .eq(&ProcessLinkStatus::Allowed)
         {
             let mut page = Page::new_page(&self.url.inner(), &client).await;
 
-            let page_links: HashSet<CaseInsensitiveString> =
-                page.smart_links(&base, &browser, &self.configuration).await;
+            let page_links: HashSet<CaseInsensitiveString> = page
+                .smart_links(&base, &browser, &self.configuration, &context_id)
+                .await;
 
             match page.final_redirect_destination {
                 Some(ref domain) => {
@@ -1542,13 +1543,6 @@ impl Website {
             self.status = CrawlStatus::Blocked;
         }
 
-        if scrape {
-            match self.pages.as_mut() {
-                Some(p) => p.push(page.clone()),
-                _ => (),
-            };
-        }
-
         if self.configuration.return_page_links {
             page.page_links = if links.is_empty() {
                 None
@@ -2210,6 +2204,7 @@ impl Website {
                     "about:blank",
                     &browser,
                     &self.configuration.request_timeout,
+                    &context_id,
                 )
                 .await
                 {
@@ -2263,6 +2258,7 @@ impl Website {
             self.channel_guard.clone(),
             browser,
             self.configuration.clone(), // we may just want to share explicit config instead.
+            context_id.clone(),
         ));
 
         let add_external =
@@ -2318,7 +2314,8 @@ impl Website {
                             _ => (link, None),
                         };
                         let target_url = link_result.0.as_ref();
-                        let next = match attempt_navigation("about:blank", &shared.4, &shared.5.request_timeout).await {
+                        let next = match attempt_navigation("about:blank", &shared.4, &shared.5.request_timeout, &shared.6
+                        ).await {
                             Ok(new_page) => {
                                 match shared.5.evaluate_on_new_document
                                 {
@@ -2489,6 +2486,7 @@ impl Website {
                     "about:blank",
                     &browser,
                     &self.configuration.request_timeout,
+                    &context_id,
                 )
                 .await
                 {
@@ -2542,6 +2540,7 @@ impl Website {
             browser,
             self.configuration.clone(),
             self.url.inner().to_string(),
+            context_id.clone(),
         ));
 
         let add_external = shared.3.len() > 0;
@@ -2589,7 +2588,8 @@ impl Website {
 
                 set.spawn_on(
                     run_task(semaphore.clone(), move || async move {
-                        match attempt_navigation("about:blank", &shared.5, &shared.6.request_timeout).await {
+                        match attempt_navigation("about:blank", &shared.5, &shared.6.request_timeout, &shared.8
+                        ).await {
                             Ok(new_page) => {
                                 let intercept_handle = crate::features::chrome::setup_chrome_interception_base(
                                     &new_page,
@@ -2899,8 +2899,14 @@ impl Website {
                 _ => false,
             } {
                 self.status = CrawlStatus::Active;
-                self.crawl_establish_smart(&client, &mut selectors, false, &browser, false)
-                    .await;
+                self.crawl_establish_smart(
+                    &client,
+                    &mut selectors,
+                    false,
+                    &browser,
+                    &context_id,
+                )
+                .await;
                 self.subscription_guard();
                 crate::features::chrome::close_browser(
                     browser_handle,
@@ -2932,7 +2938,7 @@ impl Website {
                         &mut selectors,
                         false,
                         &browser,
-                        false,
+                        &context_id,
                     )
                     .await,
                 );
@@ -2947,6 +2953,7 @@ impl Website {
             self.channel_guard.clone(),
             browser,
             self.configuration.clone(),
+            context_id.clone(),
         ));
 
         let add_external = self.configuration.external_domains_caseless.len() > 0;
@@ -3011,6 +3018,7 @@ impl Website {
                             let links = page
                                 .smart_links(
                                     &shared.1, &shared.4, &shared.5,
+                                    &shared.6,
                                 )
                                 .await;
 
@@ -3371,6 +3379,7 @@ impl Website {
             browser,
             self.configuration.clone(),
             self.url.inner().to_string(),
+            context_id.clone(),
         ));
 
         let mut sitemaps = match self.configuration.sitemap_url {
@@ -3468,6 +3477,7 @@ impl Website {
                                             &shared
                                                 .3
                                                 .request_timeout,
+                                            &shared.5,
                                         )
                                         .await
                                         {
@@ -3668,11 +3678,34 @@ impl Website {
         tokio::task::JoinHandle<()>,
         Option<chromiumoxide::cdp::browser_protocol::browser::BrowserContextId>,
     )> {
+        use chromiumoxide::cdp::browser_protocol::target::CreateBrowserContextParams;
         match launch_browser(&self.configuration).await {
             Some((browser, browser_handle, context_id)) => {
-                let browser = Arc::new(browser);
+                let b = if context_id.is_some() {
+                    context_id
+                } else {
+                    let mut create_content = CreateBrowserContextParams::default();
+                    create_content.dispose_on_detach = Some(true);
+
+                    match self.configuration.proxies {
+                        Some(ref p) => match p.get(0) {
+                            Some(p) => {
+                                create_content.proxy_server = Some(p.into());
+                            }
+                            _ => (),
+                        },
+                        _ => (),
+                    };
+
+                    match browser.create_browser_context(create_content).await {
+                        Ok(c) => Some(c),
+                        _ => None,
+                    }
+                };
+
+                let browser: Arc<chromiumoxide::Browser> = Arc::new(browser);
+
-                Some((browser, browser_handle, context_id))
+                Some((browser, browser_handle, b))
             }
             _ => None,
         }
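As a focused illustration of the fallback just above: only the first configured proxy is applied, and the context disposes itself on detach. Below is a hedged, hypothetical helper; the function name and the Option<Vec<String>> shape are illustrative, not the crate's exact types:

use chromiumoxide::browser::Browser;
use chromiumoxide::cdp::browser_protocol::browser::BrowserContextId;
use chromiumoxide::cdp::browser_protocol::target::CreateBrowserContextParams;

// Hypothetical helper mirroring configure_browser's fallback: reuse an
// existing context id, otherwise create a context carrying the first proxy.
async fn context_with_proxy(
    browser: &Browser,
    existing: Option<BrowserContextId>,
    proxies: &Option<Vec<String>>,
) -> Option<BrowserContextId> {
    if existing.is_some() {
        return existing;
    }
    let mut params = CreateBrowserContextParams::default();
    params.dispose_on_detach = Some(true); // context cleans up on detach
    if let Some(proxy) = proxies.as_ref().and_then(|list| list.first()) {
        params.proxy_server = Some(proxy.into()); // only the first proxy is used
    }
    browser.create_browser_context(params).await.ok()
}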
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_cli"
-version = "2.6.20"
+version = "2.6.22"
 authors = [
     "j-mendez <[email protected]>"
 ]
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_transformations"
-version = "2.6.20"
+version = "2.6.22"
 authors = [
     "j-mendez <[email protected]>"
 ]
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_utils"
-version = "2.6.20"
+version = "2.6.22"
 authors = [
     "j-mendez <[email protected]>"
 ]
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_worker"
-version = "2.6.20"
+version = "2.6.22"
 authors = [
     "j-mendez <[email protected]>"
 ]
