Skip to content

Commit

Permalink
chore(smart): fix establishing whitelist/blacklist
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Oct 26, 2024
1 parent 921d889 commit 9fa9d4d
Show file tree
Hide file tree
Showing 12 changed files with 151 additions and 114 deletions.
16 changes: 8 additions & 8 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.10.24"
version = "2.10.26"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
41 changes: 41 additions & 0 deletions spider/src/features/chrome.rs
Original file line number Diff line number Diff line change
Expand Up @@ -515,6 +515,47 @@ pub async fn setup_chrome_interception_base(
None
}

/// establish all the page events.
pub async fn setup_chrome_events(chrome_page: &chromiumoxide::Page, config: &Configuration) {
let stealth = async {
if cfg!(feature = "chrome_stealth") || config.stealth_mode {
match config.user_agent.as_ref() {
Some(agent) => {
let _ = chrome_page.enable_stealth_mode_with_agent(agent).await;
}
_ => {
let _ = chrome_page.enable_stealth_mode().await;
}
}
}
};
let eval_docs = async {
match config.evaluate_on_new_document {
Some(ref script) => {
if config.fingerprint {
let _ = chrome_page
.evaluate_on_new_document(string_concat!(
crate::features::chrome::FP_JS,
script.as_str()
))
.await;
} else {
let _ = chrome_page.evaluate_on_new_document(script.as_str()).await;
}
}
_ => {
if config.fingerprint {
let _ = chrome_page
.evaluate_on_new_document(crate::features::chrome::FP_JS)
.await;
}
}
}
};

tokio::join!(stealth, eval_docs, configure_browser(&chrome_page, &config));
}

/// static chrome arguments to start
#[cfg(all(feature = "chrome_cpu", feature = "real_browser"))]
pub static CHROME_ARGS: [&'static str; 27] = [
Expand Down
28 changes: 28 additions & 0 deletions spider/src/features/openai_common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,31 @@ impl Default for Prompt {
}
}

#[derive(Debug, Default, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
// `PartialEq` is only derived when none of the `regex`, `openai` or `cache_openai` features are enabled.
#[cfg_attr(
all(
not(feature = "regex"),
not(feature = "openai"),
not(feature = "cache_openai")
),
derive(PartialEq)
)]
/// Structured data response format.
///
/// Describes a named JSON Schema that the model response should follow. With the
/// `serde` feature enabled, the optional fields are omitted from the serialized
/// output when they are `None`.
pub struct ResponseFormatJsonSchema {
/// A description of what the response format is for, used by the model to determine how to respond in the format.
#[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
pub description: Option<String>,
/// The name of the response format. Must be a-z, A-Z, 0-9, or contain underscores and dashes, with a maximum length of 64.
pub name: String,
/// The schema for the response format, described as a JSON Schema object.
/// Stored here as a JSON-encoded string.
#[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
pub schema: Option<String>,
/// Whether to enable strict schema adherence when generating the output. If set to true, the model will always follow the exact schema defined in the `schema` field. Only a subset of JSON Schema is supported when `strict` is `true`. To learn more, read the [Structured Outputs guide](https://platform.openai.com/docs/guides/structured-outputs).
#[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
pub strict: Option<bool>,
}

/// The GPT configs to use for dynamic Javascript execution and other functionality.
#[derive(Debug, Default, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
Expand Down Expand Up @@ -115,6 +140,9 @@ pub struct GPTConfigs {
)]
/// Use caching to cache the prompt. This does nothing without the 'cache_openai' flag enabled.
pub cache: Option<AICache>,
#[cfg_attr(feature = "serde", serde(default))]
/// Use structured JSON mode.
pub json_schema: Option<ResponseFormatJsonSchema>,
}

#[derive(Debug, Default, Clone)]
Expand Down
2 changes: 1 addition & 1 deletion spider/src/page.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1164,7 +1164,7 @@ impl Page {
)
.await;

crate::website::Website::setup_chrome_events(&new_page, &configuration).await;
crate::features::chrome::setup_chrome_events(&new_page, &configuration).await;

let page_resource =
crate::utils::fetch_page_html_chrome_base(
Expand Down
25 changes: 20 additions & 5 deletions spider/src/utils/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1974,6 +1974,25 @@ pub async fn openai_request_base(
let mut tokens_used = crate::features::openai_common::OpenAIUsage::default();
let json_mode = gpt_configs.extra_ai_data;

// Pick the response format for the request: an explicit JSON schema wins when one is
// configured; otherwise use JSON-object mode when `json_mode` is set, else plain text.
let response_format = match gpt_configs.json_schema {
    Some(ref structure) => async_openai::types::ResponseFormat::JsonSchema {
        json_schema: async_openai::types::ResponseFormatJsonSchema {
            description: structure.description.clone(),
            name: structure.name.clone(),
            // Borrow the schema text instead of cloning the whole string just to parse
            // it. A missing or unparsable schema falls back to the field's default.
            schema: serde_json::from_str(structure.schema.as_deref().unwrap_or_default())
                .unwrap_or_default(),
            strict: structure.strict,
        },
    },
    None if json_mode => async_openai::types::ResponseFormat::JsonObject,
    None => async_openai::types::ResponseFormat::Text,
};

match async_openai::types::ChatCompletionRequestAssistantMessageArgs::default()
.content(string_concat!("URL: ", url, "\n", "HTML: ", resource))
.build()
Expand Down Expand Up @@ -2006,11 +2025,7 @@ pub async fn openai_request_base(
let v = match gpt_base
.max_tokens(max_tokens as u32)
.messages(messages)
.response_format(if json_mode {
async_openai::types::ResponseFormat::JsonObject
} else {
async_openai::types::ResponseFormat::Text
})
.response_format(response_format)
.build()
{
Ok(request) => {
Expand Down
Loading

0 comments on commit 9fa9d4d

Please sign in to comment.