
Commit

chore(chrome): fix proxy handling websockets
j-mendez committed Oct 1, 2024
1 parent 018178d commit 4708b7f
Showing 11 changed files with 124 additions and 60 deletions.
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion examples/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_examples"
version = "2.6.20"
version = "2.6.25"
authors = [
"j-mendez <[email protected]>",
]
4 changes: 2 additions & 2 deletions spider/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.6.20"
version = "2.6.25"
authors = [
"j-mendez <[email protected]>"
]
@@ -103,7 +103,7 @@ reqwest = { version = "0.12", features = [
] }

[features]
default = ["sync", "reqwest_native_tls_native_roots", "cookies", "ua_generator", "encoding"]
default = ["sync", "reqwest_native_tls_native_roots", "cookies", "ua_generator", "encoding", "chrome"]
regex = ["dep:regex"]
glob = ["dep:regex", "dep:itertools"]
ua_generator = ["dep:ua_generator"]
20 changes: 16 additions & 4 deletions spider/src/features/chrome.rs
@@ -1,10 +1,10 @@
use crate::utils::log;
use crate::{configuration::Configuration, tokio_stream::StreamExt};
use chromiumoxide::cdp::browser_protocol::browser::BrowserContextId;
use chromiumoxide::cdp::browser_protocol::target::CreateTargetParams;
use chromiumoxide::error::CdpError;
use chromiumoxide::Page;
use chromiumoxide::{handler::HandlerConfig, Browser, BrowserConfig};
use std::sync::Arc;
use tokio::task::JoinHandle;

/// get chrome configuration
@@ -281,13 +281,17 @@ pub async fn attempt_navigation(
url: &str,
browser: &Browser,
request_timeout: &Option<Box<core::time::Duration>>,
browser_context_id: &Option<BrowserContextId>,
) -> Result<Page, CdpError> {
let mut cdp_params = CreateTargetParams::new(url);
cdp_params.browser_context_id.clone_from(browser_context_id);
cdp_params.url = url.into();
let page_result = tokio::time::timeout(
match request_timeout {
Some(timeout) => **timeout,
_ => tokio::time::Duration::from_secs(60),
},
browser.new_page(url),
browser.new_page(cdp_params),
)
.await;
match page_result {
@@ -299,9 +303,17 @@
/// close the browser and open handles
pub async fn close_browser(
browser_handle: JoinHandle<()>,
_browser: &Browser,
_context_id: &mut Option<BrowserContextId>,
browser: &Browser,
context_id: &mut Option<BrowserContextId>,
) {
match context_id.take() {
Some(id) => {
if let Err(er) = browser.dispose_browser_context(id).await {
log("CDP Error: ", er.to_string())
}
}
_ => (),
}
if !browser_handle.is_finished() {
browser_handle.abort();
}
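
A minimal caller-side sketch of the new flow (not part of this commit; it assumes the "chrome" feature is enabled and that spider::features::chrome re-exports these helpers publicly). attempt_navigation now receives an optional BrowserContextId so the target is created inside that context, and close_browser disposes the context before aborting the CDP handler task. The launch/handler boilerplate is standard chromiumoxide usage.

    use chromiumoxide::browser::{Browser, BrowserConfig};
    use chromiumoxide::cdp::browser_protocol::browser::BrowserContextId;
    use futures::StreamExt;
    use spider::features::chrome::{attempt_navigation, close_browser};

    async fn fetch_once(url: &str) -> Result<(), Box<dyn std::error::Error>> {
        let (browser, mut handler) = Browser::launch(BrowserConfig::builder().build()?).await?;

        // Drive CDP messages in the background; close_browser aborts this task.
        let handle = tokio::spawn(async move { while handler.next().await.is_some() {} });

        // A context id created elsewhere (e.g. one per crawl so proxy settings stay
        // isolated); None falls back to the default browser context.
        let mut context_id: Option<BrowserContextId> = None;

        let page = attempt_navigation(
            url,
            &browser,
            &Some(Box::new(core::time::Duration::from_secs(30))),
            &context_id,
        )
        .await?;
        let _html = page.content().await?;

        // Disposes the context (if any) and then aborts the handler task.
        close_browser(handle, &browser, &mut context_id).await;
        Ok(())
    }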
14 changes: 11 additions & 3 deletions spider/src/page.rs
@@ -934,6 +934,7 @@ impl Page {
selectors: &RelativeSelectors,
browser: &std::sync::Arc<chromiumoxide::Browser>,
configuration: &crate::configuration::Configuration,
context_id: &Option<chromiumoxide::cdp::browser_protocol::browser::BrowserContextId>,
) -> HashSet<A> {
let mut map = HashSet::new();
let html = self.get_html();
@@ -1028,10 +1029,11 @@
let browser = browser.to_owned();
let configuration = configuration.clone();
let target_url = self.url.clone();
let context_id = context_id.clone();

tokio::task::spawn(async move {
// we need to use about:blank here since we set the HTML content directly
match crate::features::chrome::attempt_navigation("about:blank", &browser, &configuration.request_timeout).await {
match crate::features::chrome::attempt_navigation("about:blank", &browser, &configuration.request_timeout, &context_id).await {
Ok(new_page) => {
match configuration
.evaluate_on_new_document
@@ -1371,15 +1373,21 @@ impl Page {
selectors: &RelativeSelectors,
page: &std::sync::Arc<chromiumoxide::Browser>,
configuration: &crate::configuration::Configuration,
context_id: &Option<chromiumoxide::cdp::browser_protocol::browser::BrowserContextId>,
) -> HashSet<CaseInsensitiveString> {
match self.html.is_some() {
false => Default::default(),
true => {
if auto_encoder::is_binary_file(self.get_html_bytes_u8()) {
return Default::default();
}
self.links_stream_smart::<CaseInsensitiveString>(&selectors, page, configuration)
.await
self.links_stream_smart::<CaseInsensitiveString>(
&selectors,
page,
configuration,
context_id,
)
.await
}
}
}
57 changes: 34 additions & 23 deletions spider/src/utils/mod.rs
@@ -823,7 +823,7 @@ pub async fn put_hybrid_cache(
}

/// Get the initial page headers of the page with navigation.
#[cfg(all(feature = "chrome"))]
#[cfg(feature = "chrome")]
async fn navigate(
page: &chromiumoxide::Page,
url: &str,
@@ -833,6 +833,8 @@ async fn navigate(
EventRequestWillBeSent, EventResponseReceived,
};
use tokio::sync::oneshot;
use tokio::time;

let (req_tx, req_rx) = oneshot::channel();
let (resp_tx, resp_rx) = oneshot::channel();

@@ -943,32 +945,41 @@
}
});

The sequential navigation and header collection removed by this commit:

    // perform the navigation here.
    match page.goto(url).await {
        Ok(_p) => {}
        Err(e) => {
            log("HTTP Error: ", e.to_string());
        }
    };

    let rq_out = tokio::join!(req_rx, resp_rx);

    match rq_out.0.ok() {
        Some(r) => {
            chrome_http_req_res.request_headers = r;
        }
        _ => (),
    }

    match rq_out.1.ok() {
        Some(r) => {
            chrome_http_req_res.response_headers = r.0;
            chrome_http_req_res.status_code = StatusCode::from_u16(r.1 as u16).unwrap_or_default();
            chrome_http_req_res.protocol = r.2.unwrap_or_default();
            chrome_http_req_res.waf_check = r.3;
        }
        _ => (),
    }

is replaced by handling inside the goto match, with both receivers awaited behind a 25-second timeout:

    // perform the navigation here.
    match page.goto(url).await {
        Ok(_p) => {
            let timeout_duration = tokio::time::Duration::from_secs(25);

            let rq_out = tokio::join!(
                time::timeout(timeout_duration, req_rx),
                time::timeout(timeout_duration, resp_rx)
            );

            match rq_out.0 {
                Ok(Ok(r)) => {
                    chrome_http_req_res.request_headers = r;
                }
                Ok(Err(_)) | Err(_) => {
                    log("", "Timeout or error waiting for request headers");
                }
            }

            match rq_out.1 {
                Ok(Ok(r)) => {
                    chrome_http_req_res.response_headers = r.0;
                    chrome_http_req_res.status_code =
                        StatusCode::from_u16(r.1 as u16).unwrap_or_default();
                    chrome_http_req_res.protocol = r.2.unwrap_or_default();
                    chrome_http_req_res.waf_check = r.3;
                }
                Ok(Err(_)) | Err(_) => {
                    log("", "Timeout or error waiting for response headers");
                }
            }
        }
        Err(e) => {
            log("HTTP Error: ", e.to_string());
        }
    };

Ok(())
}
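
The core of this change is wrapping each oneshot receiver in a 25-second timeout before joining, so a CDP event that never fires can no longer hang navigation. A small self-contained sketch of that pattern (names are illustrative, not from the crate):

    use tokio::sync::oneshot;
    use tokio::time::{timeout, Duration};

    #[tokio::main]
    async fn main() {
        let (req_tx, req_rx) = oneshot::channel::<String>();
        let (resp_tx, resp_rx) = oneshot::channel::<u16>();

        // In navigate() these are sent from the EventRequestWillBeSent /
        // EventResponseReceived listeners; here one fires and one never does.
        let _ = req_tx.send("request headers".into());
        drop(resp_tx);

        let limit = Duration::from_secs(25);
        let out = tokio::join!(timeout(limit, req_rx), timeout(limit, resp_rx));

        match out.0 {
            Ok(Ok(headers)) => println!("got: {headers}"),
            // Outer Err = timeout elapsed; inner Err = sender dropped.
            Ok(Err(_)) | Err(_) => println!("timeout or error waiting for request headers"),
        }
        match out.1 {
            Ok(Ok(status)) => println!("status: {status}"),
            Ok(Err(_)) | Err(_) => println!("timeout or error waiting for response headers"),
        }
    }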
@@ -991,7 +1002,7 @@ pub async fn fetch_page_html_chrome_base(
let mut chrome_http_req_res = ChromeHTTPReqRes::default();
let mut valid = false;

let _ = tokio::time::timeout(tokio::time::Duration::from_secs(60), async {
let _ = tokio::time::timeout(tokio::time::Duration::from_secs(30), async {
// the active page was already set prior. No need to re-navigate or set the content.
if !page_set {
// used for smart mode re-rendering direct assigning html
(Diffs for the remaining changed files were not loaded.)

0 comments on commit 4708b7f
