Skip to content

Commit

Permalink
chore(config): add serializable crawl configuration
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Oct 8, 2024
1 parent 54a9fab commit 1a9be9b
Show file tree
Hide file tree
Showing 10 changed files with 155 additions and 86 deletions.
137 changes: 64 additions & 73 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.8.20"
version = "2.8.21"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
83 changes: 80 additions & 3 deletions spider/src/configuration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ pub use crate::features::chrome_common::{
};
pub use crate::features::openai_common::GPTConfigs;
use crate::website::CronType;
use reqwest::header::{AsHeaderName, HeaderMap, HeaderName, HeaderValue, IntoHeaderName};
use std::time::Duration;

/// Redirect policy configuration for request
Expand Down Expand Up @@ -53,7 +54,7 @@ type AllowList = Box<regex::RegexSet>;
),
derive(PartialEq)
)]

#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Configuration {
    /// Respect the robots.txt file and do not scrape disallowed files. This may slow down crawls if the robots.txt file includes a crawl delay.
pub respect_robots_txt: bool,
Expand All @@ -78,7 +79,7 @@ pub struct Configuration {
/// Use proxy list for performing network request.
pub proxies: Option<Box<Vec<String>>>,
/// Headers to include with request.
pub headers: Option<Box<reqwest::header::HeaderMap>>,
pub headers: Option<Box<SerializableHeaderMap>>,
#[cfg(feature = "sitemap")]
/// Include a sitemap in response of the crawl.
pub sitemap_url: Option<Box<CompactString>>,
Expand Down Expand Up @@ -180,6 +181,82 @@ pub struct Configuration {
Option<hashbrown::HashMap<case_insensitive_string::CaseInsensitiveString, u32>>,
}

/// Serializable HTTP headers. Newtype wrapper around
/// [`reqwest::header::HeaderMap`], which does not implement
/// serde's traits itself; the inner map is public for direct access.
#[derive(Default, Debug, Clone, PartialEq, Eq)]
pub struct SerializableHeaderMap(pub HeaderMap);

impl SerializableHeaderMap {
    /// Borrow the inner [`HeaderMap`].
    pub fn inner(&self) -> &HeaderMap {
        &self.0
    }
    /// Returns true if the map contains a value for the specified key.
    pub fn contains_key<K>(&self, key: K) -> bool
    where
        K: AsHeaderName,
    {
        self.0.contains_key(key)
    }
    /// Inserts a key-value pair into the map, returning the value
    /// previously stored under that key, if any.
    pub fn insert<K>(
        &mut self,
        key: K,
        val: reqwest::header::HeaderValue,
    ) -> Option<reqwest::header::HeaderValue>
    where
        K: IntoHeaderName,
    {
        self.0.insert(key, val)
    }
    /// Extend the map with the contents of another header iterator
    /// (a `None` key repeats the most recent header name).
    pub fn extend<I>(&mut self, iter: I)
    where
        I: IntoIterator<Item = (Option<HeaderName>, HeaderValue)>,
    {
        self.0.extend(iter);
    }
    /// Number of header values stored. Counts every value, so a header
    /// name with multiple values contributes more than one.
    pub fn len(&self) -> usize {
        self.0.len()
    }
    /// Returns true if the map stores no headers.
    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }
}

#[cfg(feature = "serde")]
impl serde::Serialize for SerializableHeaderMap {
    /// Serialize the headers as a name-sorted string map.
    ///
    /// Repeated header names are combined into one comma-separated value
    /// (standard HTTP field-combining) instead of silently keeping only
    /// the last occurrence. Values that are not valid UTF-8 serialize as
    /// an empty string.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        let mut map: std::collections::BTreeMap<String, String> =
            std::collections::BTreeMap::new();
        for (k, v) in self.0.iter() {
            // Opaque (non-UTF-8) header values degrade to "".
            let val = v.to_str().unwrap_or("");
            match map.get_mut(k.as_str()) {
                // Repeated header name: append rather than overwrite.
                Some(existing) => {
                    existing.push_str(", ");
                    existing.push_str(val);
                }
                None => {
                    map.insert(k.as_str().to_string(), val.to_string());
                }
            }
        }
        map.serialize(serializer)
    }
}

impl From<HeaderMap> for SerializableHeaderMap {
fn from(header_map: HeaderMap) -> Self {
SerializableHeaderMap(header_map)
}
}

#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for SerializableHeaderMap {
    /// Deserialize headers from a string-to-string map.
    ///
    /// Fails with a custom deserialization error when a key is not a
    /// valid header name or a value is not a valid header value.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        use std::collections::BTreeMap;
        let map: BTreeMap<String, String> = BTreeMap::deserialize(deserializer)?;
        // Preallocate one slot per entry; duplicate keys were already
        // collapsed by the intermediate BTreeMap.
        let mut headers = HeaderMap::with_capacity(map.len());
        for (k, v) in map {
            let key = HeaderName::from_bytes(k.as_bytes()).map_err(serde::de::Error::custom)?;
            let value = HeaderValue::from_str(&v).map_err(serde::de::Error::custom)?;
            headers.insert(key, value);
        }
        Ok(SerializableHeaderMap(headers))
    }
}

/// Get the user agent from the top agent list randomly.
#[cfg(any(feature = "ua_generator"))]
pub fn get_ua(chrome: bool) -> &'static str {
Expand Down Expand Up @@ -478,7 +555,7 @@ impl Configuration {
/// Set HTTP headers for request using [reqwest::header::HeaderMap](https://docs.rs/reqwest/latest/reqwest/header/struct.HeaderMap.html).
pub fn with_headers(&mut self, headers: Option<reqwest::header::HeaderMap>) -> &mut Self {
match headers {
Some(m) => self.headers = Some(m.into()),
Some(m) => self.headers = Some(SerializableHeaderMap::from(m).into()),
_ => self.headers = None,
};
self
Expand Down
5 changes: 3 additions & 2 deletions spider/src/features/chrome.rs
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ fn create_handler_config(config: &Configuration) -> HandlerConfig {
ignore_stylesheets: config.chrome_intercept.block_stylesheets,
extra_headers: match config.headers {
Some(ref headers) => {
let hm = crate::utils::header_utils::header_map_to_hash_map(headers);
let hm = crate::utils::header_utils::header_map_to_hash_map(headers.inner());
if hm.is_empty() {
None
} else {
Expand Down Expand Up @@ -262,7 +262,8 @@ pub async fn setup_browser_configuration(
browser_config.ignore_stylesheets = config.chrome_intercept.block_stylesheets;
browser_config.extra_headers = match config.headers {
Some(ref headers) => {
let hm = crate::utils::header_utils::header_map_to_hash_map(headers);
let hm =
crate::utils::header_utils::header_map_to_hash_map(headers.inner());
if hm.is_empty() {
None
} else {
Expand Down
4 changes: 2 additions & 2 deletions spider/src/utils/header_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ pub fn setup_default_headers(
) -> ClientBuilder {
let mut headers = match configuration.headers {
Some(ref h) => *h.clone(),
None => HeaderMap::new(),
None => crate::configuration::SerializableHeaderMap::default(),
};

if !headers.contains_key(REFERER) {
Expand Down Expand Up @@ -44,7 +44,7 @@ pub fn setup_default_headers(

headers.extend(header_map);

client_builder.default_headers(headers)
client_builder.default_headers(headers.0)
}

/// Build the headers to use to act like a browser
Expand Down
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.8.20"
version = "2.8.21"
rust-version = "1.70"
authors = [
"j-mendez <[email protected]>"
Expand Down
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.8.20"
version = "2.8.21"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.8.20"
version = "2.8.21"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.8.20"
version = "2.8.21"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "2.8.20"
version = "2.8.21"
authors = [
"j-mendez <[email protected]>"
]
Expand Down

0 comments on commit 1a9be9b

Please sign in to comment.