diff --git a/Cargo.lock b/Cargo.lock index 2b99fd644..3ac109cea 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3884,7 +3884,7 @@ dependencies = [ [[package]] name = "spider" -version = "2.9.3" +version = "2.9.5" dependencies = [ "ahash", "async-openai", @@ -3945,7 +3945,7 @@ dependencies = [ [[package]] name = "spider_chrome" -version = "2.9.3" +version = "2.9.5" dependencies = [ "adblock", "async-tungstenite", @@ -3980,7 +3980,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "2.9.3" +version = "2.9.5" dependencies = [ "clap", "env_logger", @@ -4004,7 +4004,7 @@ dependencies = [ [[package]] name = "spider_transformations" -version = "2.9.3" +version = "2.9.5" dependencies = [ "aho-corasick", "fast_html2md", @@ -4023,7 +4023,7 @@ dependencies = [ [[package]] name = "spider_utils" -version = "2.9.3" +version = "2.9.5" dependencies = [ "indexmap 1.9.3", "serde", @@ -4035,7 +4035,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "2.9.3" +version = "2.9.5" dependencies = [ "env_logger", "lazy_static", diff --git a/spider/Cargo.toml b/spider/Cargo.toml index 10de7ab91..deed58631 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "2.9.3" +version = "2.9.5" authors = [ "j-mendez " ] diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml index 2ef94d5b3..7783d35fe 100644 --- a/spider_chrome/Cargo.toml +++ b/spider_chrome/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_chrome" -version = "2.9.3" +version = "2.9.5" rust-version = "1.70" authors = [ "j-mendez " diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index 3976130e9..243d5a21e 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "2.9.3" +version = "2.9.5" authors = [ "j-mendez " ] diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml index 24c5ebe7f..3d44a20d0 100644 --- a/spider_transformations/Cargo.toml +++ 
b/spider_transformations/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_transformations" -version = "2.9.3" +version = "2.9.5" authors = [ "j-mendez " ] diff --git a/spider_transformations/src/html2text/mod.rs b/spider_transformations/src/html2text/mod.rs index 9e62d2683..c91b9fbd0 100644 --- a/spider_transformations/src/html2text/mod.rs +++ b/spider_transformations/src/html2text/mod.rs @@ -1,4 +1,10 @@ pub mod render; + +use crate::markup5ever_rcdom::{ + Handle, + NodeData::{Comment, Document, Element}, +}; +pub use crate::markup5ever_rcdom::{NodeData, RcDom}; use html5ever::driver::ParseOpts; use html5ever::parse_document; use html5ever::tendril::TendrilSink; @@ -9,12 +15,6 @@ use render::text_renderer::{ TaggedLine, TextDecorator, TextRenderer, }; use render::Renderer; -mod markup5ever_rcdom; -pub use markup5ever_rcdom::RcDom; -use markup5ever_rcdom::{ - Handle, - NodeData::{Comment, Document, Element}, -}; use std::cell::Cell; use std::cmp::{max, min}; use unicode_width::UnicodeWidthStr; @@ -1255,7 +1255,7 @@ fn process_dom_node<'a, 'b, 'c, T: Write>( result } - markup5ever_rcdom::NodeData::Text { contents: ref tstr } => { + NodeData::Text { contents: ref tstr } => { Finished(RenderNode::new(Text((&*tstr.borrow()).into()))) } _ => { diff --git a/spider_transformations/src/html2xml.rs b/spider_transformations/src/html2xml.rs new file mode 100644 index 000000000..d503ab854 --- /dev/null +++ b/spider_transformations/src/html2xml.rs @@ -0,0 +1,106 @@ +use super::markup5ever_rcdom::{Handle, NodeData, RcDom}; +use html5ever::tendril::TendrilSink; +use html5ever::{parse_document, QualName}; +use markup5ever::namespace_url; +use markup5ever::ns; +use spider::page::get_html_encoded; +use std::default::Default; +use std::error::Error; +use std::io::{self, Write}; + +/// Convert HTML to well-formed XML. 
+pub fn convert_html_to_xml( + html: &str, + url: &str, + encoding: &Option<String>, +) -> Result<String, Box<dyn Error>> { + let parser = parse_document(RcDom::default(), Default::default()); + let dom = parser.one(html); + let mut xml_output = Vec::new(); + let encoding = if let Some(ref encoding) = encoding { + encoding + } else { + "UTF-8" + }; + let root = format!(r#"<?xml version="1.0" encoding="{encoding}"?><root xmlns:custom="{url}">"#); + + write!(xml_output, "{root}")?; + serialize_xml(&dom.document, &mut xml_output)?; + write!(xml_output, "</root>")?; + + Ok(get_html_encoded(&Some(xml_output.into()), &encoding)) +} + +/// Serialize a DOM node into XML. +fn serialize_xml<W: Write>(handle: &Handle, writer: &mut W) -> io::Result<()> { + match handle.data { + NodeData::Document => { + for child in handle.children.borrow().iter() { + serialize_xml(child, writer)?; + } + } + NodeData::Element { + ref name, + ref attrs, + .. + } => { + write!(writer, "<{}", qual_name_to_string(name))?; + + for attr in attrs.borrow().iter() { + let attr_name = qual_name_to_string(&attr.name); + let processed_name = if attr_name.contains(":") { + format!("custom:{}", attr_name.replace(":", "")) + } else { + attr_name + }; + + write!( + writer, + " {}=\"{}\"", + processed_name, + escape_xml(&attr.value) + )?; + } + + let children = handle.children.borrow(); + if children.is_empty() { + write!(writer, " />")?; + } else { + write!(writer, ">")?; + for child in children.iter() { + serialize_xml(child, writer)?; + } + write!(writer, "</{}>", qual_name_to_string(name))?; + } + } + NodeData::Text { ref contents } => { + write!(writer, "{}", escape_xml(&contents.borrow()))?; + } + NodeData::Comment { ref contents } => { + write!(writer, "<!--{}-->", escape_xml(&contents.to_string()))?; + } + NodeData::Doctype { ref name, .. } => { + write!(writer, "<!DOCTYPE {}>", name)?; + } + _ => (), + } + Ok(()) +} + +/// Helper function to convert qualified names into a string representation. 
+fn qual_name_to_string(name: &QualName) -> String { + if name.ns == ns!(html) { + name.local.to_string() + } else { + format!("{}:{}", name.ns.to_string(), name.local) + } +} + +/// Escape special characters for XML documents. +fn escape_xml(text: &str) -> String { + text.replace("&", "&amp;") + .replace("<", "&lt;") + .replace(">", "&gt;") + .replace("\"", "&quot;") + .replace("'", "&apos;") +} diff --git a/spider_transformations/src/lib.rs b/spider_transformations/src/lib.rs index 8559e83d5..192ae493b 100644 --- a/spider_transformations/src/lib.rs +++ b/spider_transformations/src/lib.rs @@ -1,3 +1,7 @@ pub mod html2text; +/// Html to xml. +pub mod html2xml; /// Base transformations. pub mod transformation; + +mod markup5ever_rcdom; diff --git a/spider_transformations/src/html2text/markup5ever_rcdom.rs b/spider_transformations/src/markup5ever_rcdom.rs similarity index 100% rename from spider_transformations/src/html2text/markup5ever_rcdom.rs rename to spider_transformations/src/markup5ever_rcdom.rs diff --git a/spider_transformations/src/transformation/content.rs b/spider_transformations/src/transformation/content.rs index 8a06da1ed..6b7bce232 100644 --- a/spider_transformations/src/transformation/content.rs +++ b/spider_transformations/src/transformation/content.rs @@ -1,3 +1,4 @@ +use crate::html2xml::convert_html_to_xml; use aho_corasick::AhoCorasick; use html2md; use regex::Regex; @@ -48,6 +49,8 @@ pub enum ReturnFormat { Markdown, /// Commonmark CommonMark, + /// XML + XML, } impl ReturnFormat { @@ -62,6 +65,7 @@ "raw" | "RAW" | "Raw" => ReturnFormat::Raw, "bytes" | "Bytes" | "BYTES" => ReturnFormat::Bytes, "commonmark" | "CommonMark" | "COMMONMARK" => ReturnFormat::CommonMark, + "xml" | "XML" | "XmL" | "Xml" => ReturnFormat::XML, _ => ReturnFormat::Raw, } } @@ -83,6 +87,7 @@ impl<'de> Deserialize<'de> for ReturnFormat { "raw" | "RAW" | "Raw" => Ok(ReturnFormat::Raw), "bytes" | "Bytes" | "BYTES" => Ok(ReturnFormat::Bytes), "commonmark" | "CommonMark" | 
"COMMONMARK" => Ok(ReturnFormat::CommonMark), + "xml" | "XML" | "XmL" | "Xml" => Ok(ReturnFormat::XML), _ => Ok(ReturnFormat::Raw), } } @@ -464,5 +469,49 @@ pub fn transform_content( super::text_extract::extract_text(&d) } + ReturnFormat::XML => { + let target_url = match url_parsed { + Some(u) => u.to_string(), + _ => EXAMPLE_URL.to_string(), + }; + + if c.readability { + match llm_readability::extractor::extract( + &mut res.get_html_bytes_u8(), + match url_parsed { + Some(u) => u, + _ => &EXAMPLE_URL, + }, + &None, + ) { + Ok(product) => { + if let Ok(xml) = + convert_html_to_xml(&product.content, &target_url, &encoding) + { + xml + } else { + Default::default() + } + } + _ => { + if let Ok(xml) = + convert_html_to_xml(&get_html(res, &encoding), &target_url, &encoding) + { + xml + } else { + Default::default() + } + } + } + } else { + if let Ok(xml) = + convert_html_to_xml(&get_html(res, &encoding), &target_url, &encoding) + { + xml + } else { + Default::default() + } + } + } } } diff --git a/spider_transformations/src/transformation/mod.rs b/spider_transformations/src/transformation/mod.rs index 7fcf8e2c5..42679865f 100644 --- a/spider_transformations/src/transformation/mod.rs +++ b/spider_transformations/src/transformation/mod.rs @@ -11,7 +11,7 @@ mod tests { use spider::{bytes::Bytes, page::build, utils::PageResponse}; #[test] - fn text_html_to_markdown() { + fn test_transformations() { use maud::{html, DOCTYPE}; let page_title = "Transform Test"; @@ -67,5 +67,14 @@ mod tests { .contains(&"\n\n\n\n\nTransform Test\n\n\n

Fun is fun

Spider Cloud
The content is ready
"), "The tranform to bytes is invalid" ); + + conf.return_format = ReturnFormat::XML; + let content = content::transform_content(&page, &conf, &None, &None); + + assert!( + content + .contains(& "?xml version=\"1.0\" encoding=\"UTF-8\"?>\n\n\n\n\nTransform Test\n\n\n

Fun is fun

Spider Cloud
The content is ready
"), + "The tranform to xml is invalid" + ); } } diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml index 53ffd5b0c..3711d02d1 100644 --- a/spider_utils/Cargo.toml +++ b/spider_utils/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_utils" -version = "2.9.3" +version = "2.9.5" authors = [ "j-mendez " ] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index 641e8f10c..2447ed1f0 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "2.9.3" +version = "2.9.5" authors = [ "j-mendez " ]