Skip to content

Commit

Permalink
chore(transformations): add html_to_xml format
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Oct 10, 2024
1 parent 4c424a9 commit 7a65934
Show file tree
Hide file tree
Showing 13 changed files with 188 additions and 20 deletions.
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.9.3"
version = "2.9.5"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.9.3"
version = "2.9.5"
rust-version = "1.70"
authors = [
"j-mendez <[email protected]>"
Expand Down
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.9.3"
version = "2.9.5"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.9.3"
version = "2.9.5"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
14 changes: 7 additions & 7 deletions spider_transformations/src/html2text/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
pub mod render;

use crate::markup5ever_rcdom::{
Handle,
NodeData::{Comment, Document, Element},
};
pub use crate::markup5ever_rcdom::{NodeData, RcDom};
use html5ever::driver::ParseOpts;
use html5ever::parse_document;
use html5ever::tendril::TendrilSink;
Expand All @@ -9,12 +15,6 @@ use render::text_renderer::{
TaggedLine, TextDecorator, TextRenderer,
};
use render::Renderer;
mod markup5ever_rcdom;
pub use markup5ever_rcdom::RcDom;
use markup5ever_rcdom::{
Handle,
NodeData::{Comment, Document, Element},
};
use std::cell::Cell;
use std::cmp::{max, min};
use unicode_width::UnicodeWidthStr;
Expand Down Expand Up @@ -1255,7 +1255,7 @@ fn process_dom_node<'a, 'b, 'c, T: Write>(

result
}
markup5ever_rcdom::NodeData::Text { contents: ref tstr } => {
NodeData::Text { contents: ref tstr } => {
Finished(RenderNode::new(Text((&*tstr.borrow()).into())))
}
_ => {
Expand Down
106 changes: 106 additions & 0 deletions spider_transformations/src/html2xml.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
use super::markup5ever_rcdom::{Handle, NodeData, RcDom};
use html5ever::tendril::TendrilSink;
use html5ever::{parse_document, QualName};
use markup5ever::namespace_url;
use markup5ever::ns;
use spider::page::get_html_encoded;
use std::default::Default;
use std::error::Error;
use std::io::{self, Write};

/// Convert HTML to well-formed XML.
pub fn convert_html_to_xml(
html: &str,
url: &str,
encoding: &Option<String>,
) -> Result<String, Box<dyn Error>> {
let parser = parse_document(RcDom::default(), Default::default());
let dom = parser.one(html);
let mut xml_output = Vec::new();
let encoding = if let Some(ref encoding) = encoding {
encoding
} else {
"UTF-8"
};
let root = format!(r#"<?xml version="1.0" encoding="{encoding}"?><root xmlns:custom="{url}">"#);

write!(xml_output, "{root}")?;
serialize_xml(&dom.document, &mut xml_output)?;
write!(xml_output, "</root>")?;

Ok(get_html_encoded(&Some(xml_output.into()), &encoding))
}

/// Recursively write `handle` and its descendants to `writer` as XML.
///
/// Node handling:
/// * Document: only the children are written.
/// * Element: written with its attributes; an element without children is
///   self-closed (` />`). Attribute names that contain `:` have the colons
///   removed and are prefixed with `custom:`. Attribute values are escaped.
/// * Text: written escaped.
/// * Comment: written escaped, with hyphen runs broken up so the result is a
///   legal XML comment.
/// * Doctype: skipped. The only caller wraps the output in a `<root>`
///   element, and a doctype declaration is not allowed inside an element.
/// * Anything else: ignored.
///
/// # Errors
/// Returns any I/O error reported by `writer`.
fn serialize_xml<W: Write>(handle: &Handle, writer: &mut W) -> io::Result<()> {
    match handle.data {
        NodeData::Document => {
            for child in handle.children.borrow().iter() {
                serialize_xml(child, writer)?;
            }
        }
        NodeData::Element {
            ref name,
            ref attrs,
            ..
        } => {
            // Computed once; reused for the closing tag.
            let tag = qual_name_to_string(name);
            write!(writer, "<{tag}")?;

            for attr in attrs.borrow().iter() {
                let attr_name = qual_name_to_string(&attr.name);
                let processed_name = if attr_name.contains(':') {
                    format!("custom:{}", attr_name.replace(':', ""))
                } else {
                    attr_name
                };

                write!(
                    writer,
                    " {}=\"{}\"",
                    processed_name,
                    escape_xml(&attr.value)
                )?;
            }

            let children = handle.children.borrow();
            if children.is_empty() {
                write!(writer, " />")?;
            } else {
                write!(writer, ">")?;
                for child in children.iter() {
                    serialize_xml(child, writer)?;
                }
                write!(writer, "</{tag}>")?;
            }
        }
        NodeData::Text { ref contents } => {
            write!(writer, "{}", escape_xml(&contents.borrow()))?;
        }
        NodeData::Comment { ref contents } => {
            let escaped = escape_xml(&contents.to_string());
            write!(writer, "<!--{}-->", sanitize_xml_comment(&escaped))?;
        }
        // A doctype declaration may only appear in the prolog, before the
        // root element; emitting it here would make the document malformed.
        NodeData::Doctype { .. } => (),
        _ => (),
    }
    Ok(())
}

/// Make `text` legal as the body of an XML comment.
///
/// XML forbids `--` inside a comment and a `-` directly before the closing
/// `-->`, so a space is inserted between consecutive hyphens and after a
/// trailing hyphen.
fn sanitize_xml_comment(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    let mut prev_hyphen = false;

    for ch in text.chars() {
        if ch == '-' && prev_hyphen {
            out.push(' ');
        }
        out.push(ch);
        prev_hyphen = ch == '-';
    }

    if prev_hyphen {
        out.push(' ');
    }

    out
}

/// Render a qualified name as a string.
///
/// Names in the HTML namespace are rendered as the bare local name. Any other
/// name is rendered as `{namespace}:{local}`, using the namespace value
/// itself as the part before the colon.
fn qual_name_to_string(name: &QualName) -> String {
    let local = &name.local;

    if name.ns != ns!(html) {
        return format!("{}:{}", name.ns, local);
    }

    local.to_string()
}

/// Replace the five XML-reserved characters (`&`, `<`, `>`, `"`, `'`) in
/// `text` with their predefined entity references.
///
/// Every `&` is escaped, including one that already starts an entity
/// reference in the input.
fn escape_xml(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());

    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&apos;"),
            other => escaped.push(other),
        }
    }

    escaped
}
4 changes: 4 additions & 0 deletions spider_transformations/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
pub mod html2text;
/// Html to xml.
pub mod html2xml;
/// Base transformations.
pub mod transformation;

mod markup5ever_rcdom;
49 changes: 49 additions & 0 deletions spider_transformations/src/transformation/content.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use crate::html2xml::convert_html_to_xml;
use aho_corasick::AhoCorasick;
use html2md;
use regex::Regex;
Expand Down Expand Up @@ -48,6 +49,8 @@ pub enum ReturnFormat {
Markdown,
/// Commonmark
CommonMark,
/// XML
XML,
}

impl ReturnFormat {
Expand All @@ -62,6 +65,7 @@ impl ReturnFormat {
"raw" | "RAW" | "Raw" => ReturnFormat::Raw,
"bytes" | "Bytes" | "BYTES" => ReturnFormat::Bytes,
"commonmark" | "CommonMark" | "COMMONMARK" => ReturnFormat::CommonMark,
"xml" | "XML" | "XmL" | "Xml" => ReturnFormat::XML,
_ => ReturnFormat::Raw,
}
}
Expand All @@ -83,6 +87,7 @@ impl<'de> Deserialize<'de> for ReturnFormat {
"raw" | "RAW" | "Raw" => Ok(ReturnFormat::Raw),
"bytes" | "Bytes" | "BYTES" => Ok(ReturnFormat::Bytes),
"commonmark" | "CommonMark" | "COMMONMARK" => Ok(ReturnFormat::CommonMark),
"xml" | "XML" | "XmL" | "Xml" => Ok(ReturnFormat::XML),
_ => Ok(ReturnFormat::Raw),
}
}
Expand Down Expand Up @@ -464,5 +469,49 @@ pub fn transform_content(

super::text_extract::extract_text(&d)
}
ReturnFormat::XML => {
let target_url = match url_parsed {
Some(u) => u.to_string(),
_ => EXAMPLE_URL.to_string(),
};

if c.readability {
match llm_readability::extractor::extract(
&mut res.get_html_bytes_u8(),
match url_parsed {
Some(u) => u,
_ => &EXAMPLE_URL,
},
&None,
) {
Ok(product) => {
if let Ok(xml) =
convert_html_to_xml(&product.content, &target_url, &encoding)
{
xml
} else {
Default::default()
}
}
_ => {
if let Ok(xml) =
convert_html_to_xml(&get_html(res, &encoding), &target_url, &encoding)
{
xml
} else {
Default::default()
}
}
}
} else {
if let Ok(xml) =
convert_html_to_xml(&get_html(res, &encoding), &target_url, &encoding)
{
xml
} else {
Default::default()
}
}
}
}
}
11 changes: 10 additions & 1 deletion spider_transformations/src/transformation/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ mod tests {
use spider::{bytes::Bytes, page::build, utils::PageResponse};

#[test]
fn text_html_to_markdown() {
fn test_transformations() {
use maud::{html, DOCTYPE};

let page_title = "Transform Test";
Expand Down Expand Up @@ -67,5 +67,14 @@ mod tests {
.contains(&"<html class=\"paper\"><head>\n<meta name=\"disabled-adaptations\" content=\"watch\">\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n<meta name=\"viewport\" content=\"initial-scale=1\">\n<base href=\"https://spider.cloud/\">\n<title>Transform Test</title>\n<script>window.isReaderPage = true;</script>\n</head><body>\n<h1>Fun is fun</h1><a href=\"https://spider.cloud\">Spider Cloud</a><pre>The content is ready</pre></body></html>"),
"The tranform to bytes is invalid"
);

conf.return_format = ReturnFormat::XML;
let content = content::transform_content(&page, &conf, &None, &None);

assert!(
content
.contains(& "?xml version=\"1.0\" encoding=\"UTF-8\"?><root xmlns:custom=\"https://spider.cloud/\"><html custom:class=\"paper\"><head>\n<meta custom:name=\"disabled-adaptations\" custom:content=\"watch\" />\n<meta custom:http-equiv=\"Content-Type\" custom:content=\"text/html; charset=utf-8\" />\n<meta custom:name=\"viewport\" custom:content=\"initial-scale=1\" />\n<base custom:href=\"https://spider.cloud/\" />\n<title>Transform Test</title>\n<script>window.isReaderPage = true;</script>\n</head><body>\n<h1>Fun is fun</h1><a custom:href=\"https://spider.cloud\">Spider Cloud</a><pre>The content is ready</pre></body></html></root>"),
"The tranform to xml is invalid"
);
}
}
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.9.3"
version = "2.9.5"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "2.9.3"
version = "2.9.5"
authors = [
"j-mendez <[email protected]>"
]
Expand Down

0 comments on commit 7a65934

Please sign in to comment.