Skip to content

Commit

Permalink
perf(chrome): add skip other resources
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Nov 3, 2024
1 parent 13ac661 commit 2fd036c
Show file tree
Hide file tree
Showing 11 changed files with 85 additions and 35 deletions.
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.11.16"
version = "2.11.17"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider/src/utils/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ use reqwest::{
};

/// Ignore the content types.
static IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf_set! {
pub static IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf_set! {
"application/pdf",
"application/zip",
"application/x-rar-compressed",
Expand Down
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.11.16"
version = "2.11.17"
rust-version = "1.70"
authors = [
"j-mendez <[email protected]>"
Expand Down
1 change: 1 addition & 0 deletions spider_chrome/src/browser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ impl Browser {
ignore_ads: config.ignore_ads,
extra_headers: config.extra_headers.clone(),
only_html: config.only_html,
created_first_target: false,
};

let fut = Handler::new(conn, rx, handler_config);
Expand Down
9 changes: 8 additions & 1 deletion spider_chrome/src/handler/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,9 @@ impl Handler {
}
PendingRequest::Navigate(id) => {
self.on_navigation_response(id, resp);
if self.config.only_html && !self.config.created_first_target {
self.config.created_first_target = true;
}
}
PendingRequest::ExternalCommand(tx) => {
let _ = tx.send(Ok(resp)).ok();
Expand Down Expand Up @@ -460,10 +463,11 @@ impl Handler {
cache_enabled: self.config.cache_enabled,
ignore_visuals: self.config.ignore_visuals,
extra_headers: self.config.extra_headers.clone(),
only_html: self.config.only_html,
only_html: self.config.only_html && self.config.created_first_target,
},
browser_ctx,
);

self.target_ids.push(target.target_id().clone());
self.targets.insert(target.target_id().clone(), target);
}
Expand Down Expand Up @@ -701,6 +705,8 @@ pub struct HandlerConfig {
pub extra_headers: Option<HashMap<String, String>>,
/// Only Html.
pub only_html: bool,
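/// Whether the first navigation response has been handled. Targets created
/// before this is set do not get `only_html` applied.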
pub created_first_target: bool,
}

impl Default for HandlerConfig {
Expand All @@ -718,6 +724,7 @@ impl Default for HandlerConfig {
ignore_javascript: false,
only_html: false,
extra_headers: Default::default(),
created_first_target: false,
}
}
}
Expand Down
84 changes: 63 additions & 21 deletions spider_chrome/src/handler/network.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,47 @@ lazy_static::lazy_static! {
"https://js.stripe.com/v3/"
}
};

/// MIME content types to ignore.
///
/// The set covers archives, images, video, audio, office documents, and other
/// binary formats, each listed by its exact MIME type string.
// NOTE(review): no lookup against this set is visible in this section — confirm which caller consults it.
pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
"application/pdf",
"application/zip",
"application/x-rar-compressed",
"application/x-tar",
"image/png",
"image/jpeg",
"image/gif",
"image/bmp",
"image/svg+xml",
"video/mp4",
"video/x-msvideo",
"video/x-matroska",
"video/webm",
"audio/mpeg",
"audio/ogg",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.ms-excel",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"application/vnd.ms-powerpoint",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
"application/x-7z-compressed",
"application/x-rpm",
"application/x-shockwave-flash",
};

/// Resource types treated as visual content.
///
/// When `ignore_visuals` is enabled, a request whose resource type (compared by
/// its string name) is in this set is marked to skip networking.
/// Despite the `_MAP` suffix this is a `phf::Set`, not a map.
pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
"Image",
"Media",
"Font",
"Other",
};

/// Networking resource types that are always skipped.
///
/// A request whose resource type (compared by its string name) is in this set
/// is marked to skip networking regardless of the `ignore_visuals`,
/// `block_stylesheets`, or `block_javascript` settings.
/// Despite the `_MAP` suffix this is a `phf::Set`, not a map.
pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
"Prefetch",
"Ping"
};
}

#[derive(Debug)]
Expand All @@ -60,8 +101,6 @@ pub struct NetworkManager {
block_stylesheets: bool,
/// Block javascript.
block_javascript: bool,
/// Made first request. Used to track crawling
made_request: bool,
/// Only html from loading.
pub only_html: bool,
}
Expand All @@ -85,7 +124,6 @@ impl NetworkManager {
ignore_visuals: true,
block_javascript: false,
block_stylesheets: false,
made_request: false,
only_html: false,
}
}
Expand Down Expand Up @@ -187,18 +225,20 @@ impl NetworkManager {
{
self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
} else {
if self.ignore_visuals
&& (ResourceType::Image == event.resource_type
|| ResourceType::Media == event.resource_type
|| self.block_stylesheets
&& ResourceType::Stylesheet == event.resource_type)
|| ResourceType::Prefetch == event.resource_type
|| ResourceType::Ping == event.resource_type
let skip_networking = IGNORE_NETWORKING_RESOURCE_MAP
.contains(&event.resource_type.as_ref())
|| self.ignore_visuals
&& (IGNORE_VISUAL_RESOURCE_MAP.contains(&event.resource_type.as_ref())
|| self.block_stylesheets
&& ResourceType::Stylesheet == event.resource_type)
|| self.block_javascript
&& ResourceType::Script == event.resource_type
&& !JS_FRAMEWORK_ALLOW.contains(&event.request.url.as_str())
// add one off stripe framework check for now...
{
&& !JS_FRAMEWORK_ALLOW.contains(&event.request.url.as_str());

// perform the http request here and insert the body.
// if self.only_html && !skip_networking {}

if skip_networking {
let fullfill_params =
crate::handler::network::fetch::FulfillRequestParams::new(
event.request_id.clone(),
Expand All @@ -224,19 +264,17 @@ impl NetworkManager {
{
self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
} else {
if self.detect_ad(event)
let skip_networking = IGNORE_NETWORKING_RESOURCE_MAP
.contains(&event.resource_type.as_ref())
|| self.ignore_visuals
&& (ResourceType::Image == event.resource_type
|| ResourceType::Media == event.resource_type
&& (IGNORE_VISUAL_RESOURCE_MAP.contains(&event.resource_type.as_ref())
|| self.block_stylesheets
&& ResourceType::Stylesheet == event.resource_type)
|| ResourceType::Prefetch == event.resource_type
|| ResourceType::Ping == event.resource_type
|| self.block_javascript
&& ResourceType::Script == event.resource_type
&& !JS_FRAMEWORK_ALLOW.contains(&event.request.url.as_str())
// add one off stripe framework check for now...
{
&& !JS_FRAMEWORK_ALLOW.contains(&event.request.url.as_str());

if self.detect_ad(event) || skip_networking {
let fullfill_params =
crate::handler::network::fetch::FulfillRequestParams::new(
event.request_id.clone(),
Expand All @@ -249,6 +287,10 @@ impl NetworkManager {
}
}
}

if self.only_html {
self.made_request = true;
}
}

/// Perform a page intercept for chrome
Expand Down
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.11.16"
version = "2.11.17"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.11.16"
version = "2.11.17"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.11.16"
version = "2.11.17"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "2.11.16"
version = "2.11.17"
authors = [
"j-mendez <[email protected]>"
]
Expand Down

0 comments on commit 2fd036c

Please sign in to comment.