diff --git a/Cargo.lock b/Cargo.lock index 38032729f..d2066d9d5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -36,9 +36,9 @@ dependencies = [ [[package]] name = "addr2line" -version = "0.24.1" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f5fb1d8e4442bd405fdfd1dacb42792696b0cf9cb15882e5d097b742a676d375" +checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" dependencies = [ "gimli", ] @@ -500,9 +500,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.24" +version = "1.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "812acba72f0a070b003d3697490d2b55b837230ae7c6c6497f05cc2ddbb8d938" +checksum = "2e80e3b6a3ab07840e1cae9b0666a63970dc28e8ed5ffbcdacbfc760c281bfc1" dependencies = [ "jobserver", "libc", @@ -632,9 +632,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.19" +version = "4.5.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7be5744db7978a28d9df86a214130d106a89ce49644cbc4e3f0c22c3fba30615" +checksum = "b97f376d85a664d5837dbae44bf546e6477a679ff6610010f17276f686d867e8" dependencies = [ "clap_builder", "clap_derive", @@ -642,9 +642,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.19" +version = "4.5.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5fbc17d3ef8278f55b282b2a2e75ae6f6c7d4bb70ed3d0382375104bfafdb4b" +checksum = "19bc80abd44e4bed93ca373a0704ccbd1b710dc5749406201bb018272808dc54" dependencies = [ "anstream", "anstyle", @@ -994,18 +994,18 @@ dependencies = [ [[package]] name = "derive_builder" -version = "0.20.1" +version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd33f37ee6a119146a1781d3356a7c26028f83d779b2e04ecd45fdc75c76877b" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" dependencies = [ "derive_builder_macro", ] [[package]] name = "derive_builder_core" 
-version = "0.20.1" +version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7431fa049613920234f22c47fdc33e6cf3ee83067091ea4277a3f8c4587aae38" +checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" dependencies = [ "darling", "proc-macro2", @@ -1015,9 +1015,9 @@ dependencies = [ [[package]] name = "derive_builder_macro" -version = "0.20.1" +version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4abae7035bf79b9877b779505d8cf3749285b80c43941eda66604841889451dc" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" dependencies = [ "derive_builder_core", "syn 2.0.79", @@ -1330,9 +1330,9 @@ dependencies = [ [[package]] name = "futures" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" +checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" dependencies = [ "futures-channel", "futures-core", @@ -1345,9 +1345,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" dependencies = [ "futures-core", "futures-sink", @@ -1355,15 +1355,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" [[package]] name = "futures-executor" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" dependencies = [ "futures-core", "futures-task", @@ -1372,15 +1372,15 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" [[package]] name = "futures-macro" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", @@ -1389,15 +1389,15 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" [[package]] name = "futures-task" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" [[package]] name = "futures-timer" @@ -1407,9 +1407,9 @@ checksum = "f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24" [[package]] name = "futures-util" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" dependencies = [ "futures-channel", "futures-core", @@ -1468,9 
+1468,9 @@ dependencies = [ [[package]] name = "gimli" -version = "0.31.0" +version = "0.31.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32085ea23f3234fc7846555e85283ba4de91e21016dc0455a16286d87a292d64" +checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" [[package]] name = "h2" @@ -1879,7 +1879,7 @@ dependencies = [ "http 1.1.0", "hyper 1.4.1", "hyper-util", - "rustls 0.23.13", + "rustls 0.23.14", "rustls-native-certs", "rustls-pki-types", "tokio", @@ -2037,9 +2037,9 @@ dependencies = [ [[package]] name = "ipnet" -version = "2.10.0" +version = "2.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "187674a687eed5fe42285b40c6291f9a01517d415fad1c3cbc6a9f778af7fcd4" +checksum = "ddc24109865250148c2e0f3d25d4f0f479571723792d3802153c60922a4fb708" [[package]] name = "is-terminal" @@ -2505,21 +2505,18 @@ dependencies = [ [[package]] name = "object" -version = "0.36.4" +version = "0.36.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "084f1a5821ac4c651660a94a7153d27ac9d8a53736203f58b31945ded098070a" +checksum = "aedf0a2d09c573ed1d8d85b30c119153926a2b36dce0ab28322c09a117a4683e" dependencies = [ "memchr", ] [[package]] name = "once_cell" -version = "1.20.1" +version = "1.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82881c4be219ab5faaf2ad5e5e5ecdff8c66bd7402ca3160975c93b24961afd1" -dependencies = [ - "portable-atomic", -] +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" [[package]] name = "oorandom" @@ -2790,18 +2787,18 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.1.5" +version = "1.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6bf43b791c5b9e34c3d182969b4abb522f9343702850a2e57f460d00d09b4b3" +checksum = "baf123a161dde1e524adf36f90bc5d8d3462824a9c43553ad07a8183161189ec" dependencies = [ "pin-project-internal", ] [[package]] name 
= "pin-project-internal" -version = "1.1.5" +version = "1.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" +checksum = "a4502d8515ca9f32f1fb543d987f63d95a14934883db45bdb48060b6b69257f8" dependencies = [ "proc-macro2", "quote", @@ -2854,12 +2851,6 @@ dependencies = [ "plotters-backend", ] -[[package]] -name = "portable-atomic" -version = "1.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc9c68a3f6da06753e9335d63e27f6b9754dd1920d941135b7ea8224f141adb2" - [[package]] name = "powerfmt" version = "0.2.0" @@ -2922,9 +2913,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.86" +version = "1.0.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +checksum = "b3e4daa0dcf6feba26f985457cdf104d4b4256fc5a09547140f3631bb076b19a" dependencies = [ "unicode-ident", ] @@ -2997,7 +2988,7 @@ dependencies = [ "quinn-proto", "quinn-udp", "rustc-hash 2.0.0", - "rustls 0.23.13", + "rustls 0.23.14", "socket2", "thiserror", "tokio", @@ -3014,7 +3005,7 @@ dependencies = [ "rand 0.8.5", "ring", "rustc-hash 2.0.0", - "rustls 0.23.13", + "rustls 0.23.14", "slab", "thiserror", "tinyvec", @@ -3246,7 +3237,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "quinn", - "rustls 0.23.13", + "rustls 0.23.14", "rustls-native-certs", "rustls-pemfile", "rustls-pki-types", @@ -3404,9 +3395,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.13" +version = "0.23.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2dabaac7466917e566adb06783a81ca48944c6898a1b08b9374106dd671f4c8" +checksum = "415d9944693cb90382053259f89fbb077ea730ad7273047ec63b19bc9b160ba8" dependencies = [ "log", "once_cell", @@ -3479,9 +3470,9 @@ dependencies = [ [[package]] 
name = "schannel" -version = "0.1.24" +version = "0.1.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9aaafd5a2b6e3d657ff009d82fbd630b6bd54dd4eb06f21693925cdf80f9b8b" +checksum = "01227be5826fa0690321a2ba6c5cd57a19cf3f6a09e76973b58e61de6ab9d1c1" dependencies = [ "windows-sys 0.59.0", ] @@ -3747,7 +3738,7 @@ dependencies = [ [[package]] name = "spider" -version = "2.8.20" +version = "2.8.21" dependencies = [ "ahash", "async-openai", @@ -3806,7 +3797,7 @@ dependencies = [ [[package]] name = "spider_chrome" -version = "2.8.20" +version = "2.8.21" dependencies = [ "adblock", "async-tungstenite", @@ -3841,7 +3832,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "2.8.20" +version = "2.8.21" dependencies = [ "clap", "env_logger", @@ -3865,7 +3856,7 @@ dependencies = [ [[package]] name = "spider_transformations" -version = "2.8.20" +version = "2.8.21" dependencies = [ "aho-corasick", "fast_html2md", @@ -3884,7 +3875,7 @@ dependencies = [ [[package]] name = "spider_utils" -version = "2.8.20" +version = "2.8.21" dependencies = [ "indexmap 1.9.3", "spider", @@ -3893,7 +3884,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "2.8.20" +version = "2.8.21" dependencies = [ "env_logger", "lazy_static", @@ -4290,7 +4281,7 @@ version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" dependencies = [ - "rustls 0.23.13", + "rustls 0.23.14", "rustls-pki-types", "tokio", ] @@ -4481,9 +4472,9 @@ checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" [[package]] name = "ua_generator" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20093f826c4866dc7d4808ecd9ed351e3021b94194f25f490f0ea9ea030e1f87" +checksum = "b0d4f7fcefca2ec9c0b34043d4cd8c233fb975607a6c51143e765d9f22c652d8" dependencies = [ "fastrand", "serde", @@ -4557,7 +4548,7 
@@ dependencies = [ "flate2", "log", "once_cell", - "rustls 0.23.13", + "rustls 0.23.14", "rustls-pki-types", "serde", "serde_json", diff --git a/spider/Cargo.toml b/spider/Cargo.toml index b8b17e4c1..3aaf89b56 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "2.8.20" +version = "2.8.21" authors = [ "j-mendez " ] diff --git a/spider/src/configuration.rs b/spider/src/configuration.rs index c0942ab3b..9b479bbfa 100644 --- a/spider/src/configuration.rs +++ b/spider/src/configuration.rs @@ -8,6 +8,7 @@ pub use crate::features::chrome_common::{ }; pub use crate::features::openai_common::GPTConfigs; use crate::website::CronType; +use reqwest::header::{AsHeaderName, HeaderMap, HeaderName, HeaderValue, IntoHeaderName}; use std::time::Duration; /// Redirect policy configuration for request @@ -53,7 +54,7 @@ type AllowList = Box; ), derive(PartialEq) )] - +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct Configuration { /// Respect robots.txt file and not scrape not allowed files. This may slow down crawls if robots.txt file has a delay included. pub respect_robots_txt: bool, @@ -78,7 +79,7 @@ pub struct Configuration { /// Use proxy list for performing network request. pub proxies: Option>>, /// Headers to include with request. - pub headers: Option>, + pub headers: Option>, #[cfg(feature = "sitemap")] /// Include a sitemap in response of the crawl. pub sitemap_url: Option>, @@ -180,6 +181,82 @@ pub struct Configuration { Option>, } +#[derive(Default, Debug, Clone, PartialEq, Eq)] +/// Serializable HTTP headers. +pub struct SerializableHeaderMap(pub HeaderMap); + +impl SerializableHeaderMap { + /// Inner HeaderMap. + pub fn inner(&self) -> &HeaderMap { + &self.0 + } + /// Returns true if the map contains a value for the specified key.
+ pub fn contains_key(&self, key: K) -> bool + where + K: AsHeaderName, + { + self.0.contains_key(key) + } + /// Inserts a key-value pair into the map. + pub fn insert( + &mut self, + key: K, + val: reqwest::header::HeaderValue, + ) -> Option + where + K: IntoHeaderName, + { + self.0.insert(key, val) + } + /// Extend a `HeaderMap` with the contents of another `HeaderMap`. + pub fn extend(&mut self, iter: I) + where + I: IntoIterator, HeaderValue)>, + { + self.0.extend(iter); + } +} + +#[cfg(feature = "serde")] +impl serde::Serialize for SerializableHeaderMap { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let map: std::collections::BTreeMap = self + .0 + .iter() + .map(|(k, v)| (k.to_string(), v.to_str().unwrap_or("").to_string())) + .collect(); + map.serialize(serializer) + } +} + +impl From for SerializableHeaderMap { + fn from(header_map: HeaderMap) -> Self { + SerializableHeaderMap(header_map) + } +} + +#[cfg(feature = "serde")] +impl<'de> serde::Deserialize<'de> for SerializableHeaderMap { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + use reqwest::header::{HeaderName, HeaderValue}; + use std::collections::BTreeMap; + let map: BTreeMap = BTreeMap::deserialize(deserializer)?; + let mut headers = HeaderMap::new(); + for (k, v) in map { + let key = HeaderName::from_bytes(k.as_bytes()).map_err(serde::de::Error::custom)?; + let value = HeaderValue::from_str(&v).map_err(serde::de::Error::custom)?; + headers.insert(key, value); + } + Ok(SerializableHeaderMap(headers)) + } +} + /// Get the user agent from the top agent list randomly. #[cfg(any(feature = "ua_generator"))] pub fn get_ua(chrome: bool) -> &'static str { @@ -478,7 +555,7 @@ impl Configuration { /// Set HTTP headers for request using [reqwest::header::HeaderMap](https://docs.rs/reqwest/latest/reqwest/header/struct.HeaderMap.html). 
pub fn with_headers(&mut self, headers: Option) -> &mut Self { match headers { - Some(m) => self.headers = Some(m.into()), + Some(m) => self.headers = Some(SerializableHeaderMap::from(m).into()), _ => self.headers = None, }; self diff --git a/spider/src/features/chrome.rs b/spider/src/features/chrome.rs index 8f07a74ff..248431985 100644 --- a/spider/src/features/chrome.rs +++ b/spider/src/features/chrome.rs @@ -206,7 +206,7 @@ fn create_handler_config(config: &Configuration) -> HandlerConfig { ignore_stylesheets: config.chrome_intercept.block_stylesheets, extra_headers: match config.headers { Some(ref headers) => { - let hm = crate::utils::header_utils::header_map_to_hash_map(headers); + let hm = crate::utils::header_utils::header_map_to_hash_map(headers.inner()); if hm.is_empty() { None } else { @@ -262,7 +262,8 @@ pub async fn setup_browser_configuration( browser_config.ignore_stylesheets = config.chrome_intercept.block_stylesheets; browser_config.extra_headers = match config.headers { Some(ref headers) => { - let hm = crate::utils::header_utils::header_map_to_hash_map(headers); + let hm = + crate::utils::header_utils::header_map_to_hash_map(headers.inner()); if hm.is_empty() { None } else { diff --git a/spider/src/utils/header_utils.rs b/spider/src/utils/header_utils.rs index 7dc0d3775..840b1dfc5 100644 --- a/spider/src/utils/header_utils.rs +++ b/spider/src/utils/header_utils.rs @@ -14,7 +14,7 @@ pub fn setup_default_headers( ) -> ClientBuilder { let mut headers = match configuration.headers { Some(ref h) => *h.clone(), - None => HeaderMap::new(), + None => crate::configuration::SerializableHeaderMap::default(), }; if !headers.contains_key(REFERER) { @@ -44,7 +44,7 @@ pub fn setup_default_headers( headers.extend(header_map); - client_builder.default_headers(headers) + client_builder.default_headers(headers.0) } /// Build the headers to use to act like a browser diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml index 8e6a3c19b..1d5f0f27f 100644 
--- a/spider_chrome/Cargo.toml +++ b/spider_chrome/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_chrome" -version = "2.8.20" +version = "2.8.21" rust-version = "1.70" authors = [ "j-mendez " diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index b3433ac03..0afc7804f 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "2.8.20" +version = "2.8.21" authors = [ "j-mendez " ] diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml index 0d152b276..0bcf7663b 100644 --- a/spider_transformations/Cargo.toml +++ b/spider_transformations/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_transformations" -version = "2.8.20" +version = "2.8.21" authors = [ "j-mendez " ] diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml index 25773f1f2..99209d323 100644 --- a/spider_utils/Cargo.toml +++ b/spider_utils/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_utils" -version = "2.8.20" +version = "2.8.21" authors = [ "j-mendez " ] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index cac5e61b7..a7fd11827 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "2.8.20" +version = "2.8.21" authors = [ "j-mendez " ]