Skip to content

Commit

Permalink
chore(transformations): fix script encoding
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Oct 10, 2024
1 parent eb00eb1 commit 6485110
Show file tree
Hide file tree
Showing 10 changed files with 112 additions and 35 deletions.
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.9.6"
version = "2.9.8"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.9.6"
version = "2.9.8"
rust-version = "1.70"
authors = [
"j-mendez <[email protected]>"
Expand Down
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.9.6"
version = "2.9.8"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.9.6"
version = "2.9.8"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
72 changes: 57 additions & 15 deletions spider_transformations/src/html2xml.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use html5ever::tendril::TendrilSink;
use html5ever::{parse_document, QualName};
use markup5ever::namespace_url;
use markup5ever::ns;
use spider::page::get_html_encoded;
use spider::page::{encode_bytes_from_language, get_html_encoded};
use std::default::Default;
use std::error::Error;
use std::io::{self, Write};
Expand All @@ -14,21 +14,46 @@ pub fn convert_html_to_xml(
url: &str,
encoding: &Option<String>,
) -> Result<String, Box<dyn Error>> {
if encoding.is_some() {
Ok(get_html_encoded(
&Some(base_convert_xml(html, url, encoding)?.into()),
&match encoding {
Some(encoding) => encoding,
_ => "UTF-8",
},
))
} else {
convert_html_to_xml_with_language(html, url, &None)
}
}

/// Convert HTML to well-formed XML, encoding the result by language.
///
/// The document is first serialized to raw bytes by `base_convert_xml`; those
/// bytes are then passed to `encode_bytes_from_language` together with the
/// `language` value. When `language` is `None`, an empty string is passed.
///
/// # Errors
///
/// Propagates any error returned by `base_convert_xml`.
pub fn convert_html_to_xml_with_language(
    html: &str,
    url: &str,
    language: &Option<String>,
) -> Result<String, Box<dyn Error>> {
    // Serialize first so a failure short-circuits before any encoding work.
    let xml_bytes = base_convert_xml(html, url, language)?;
    // Fall back to the empty string when no language was supplied.
    let language_hint = language.as_deref().unwrap_or("");

    Ok(encode_bytes_from_language(
        xml_bytes.as_slice(),
        language_hint,
    ))
}

/// Convert HTML to well-formed XML.
pub fn base_convert_xml(
html: &str,
_url: &str,
_encoding: &Option<String>,
) -> Result<Vec<u8>, Box<dyn Error>> {
let parser = parse_document(RcDom::default(), Default::default());
let dom = parser.one(html);
let mut xml_output = Vec::new();
let encoding = if let Some(ref encoding) = encoding {
encoding
} else {
"UTF-8"
};
let root = format!(r#"<?xml version="1.0" encoding="{encoding}"?><root xmlns:custom="{url}">"#);

write!(xml_output, "{root}")?;
serialize_xml(&dom.document, &mut xml_output)?;
write!(xml_output, "</root>")?;

Ok(get_html_encoded(&Some(xml_output.into()), &encoding))
Ok(xml_output)
}

/// Serialize a DOM node into XML.
Expand All @@ -44,16 +69,21 @@ fn serialize_xml<W: Write>(handle: &Handle, writer: &mut W) -> io::Result<()> {
ref attrs,
..
} => {
write!(writer, "<{}", qual_name_to_string(name))?;
let sname = qual_name_to_string(name);

if sname == "html" {
write!(writer, r#"<{} xmlns="http://www.w3.org/1999/xhtml""#, sname)?;
} else {
write!(writer, "<{}", sname)?;
}

for attr in attrs.borrow().iter() {
let attr_name = qual_name_to_string(&attr.name);
let processed_name = if attr_name.contains(":") {
format!("custom:{}", attr_name.replace(":", ""))
attr_name.replace(":", "")
} else {
attr_name
};

write!(
writer,
" {}=\"{}\"",
Expand All @@ -63,14 +93,26 @@ fn serialize_xml<W: Write>(handle: &Handle, writer: &mut W) -> io::Result<()> {
}

let children = handle.children.borrow();

if children.is_empty() {
write!(writer, " />")?;
} else {
write!(writer, ">")?;
let insert_cdata = sname == "script" && !children.is_empty();

if insert_cdata {
write!(writer, "<![CDATA[")?;
}

for child in children.iter() {
serialize_xml(child, writer)?;
}
write!(writer, "</{}>", qual_name_to_string(name))?;

if insert_cdata {
write!(writer, "]]></{}>", sname)?;
} else {
write!(writer, "</{}>", sname)?;
}
}
}
NodeData::Text { ref contents } => {
Expand Down
4 changes: 2 additions & 2 deletions spider_transformations/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
pub mod html2text;
/// Html to xml.
pub mod html2xml;
/// rcdom
mod markup5ever_rcdom;
/// Base transformations.
pub mod transformation;

mod markup5ever_rcdom;
47 changes: 41 additions & 6 deletions spider_transformations/src/transformation/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@ pub mod text_extract;
#[cfg(test)]
mod tests {
use crate::transformation::content::{self, ReturnFormat};
use maud::PreEscaped;
use spider::{bytes::Bytes, page::build, utils::PageResponse};

#[test]
fn test_transformations() {
/// the template to re-use
fn template() -> PreEscaped<String> {
use maud::{html, DOCTYPE};

let page_title = "Transform Test";
Expand All @@ -26,9 +27,17 @@ mod tests {
pre {
r#"The content is ready"#
}
}
.into_string();
script {
r#"document.querySelector("pre")"#
}
};

markup
}

#[test]
fn test_transformations() {
let markup = template().into_string();
let url = "https://spider.cloud";

let mut conf = content::TransformConfig::default();
Expand Down Expand Up @@ -69,11 +78,37 @@ mod tests {
);

conf.return_format = ReturnFormat::XML;
let content = content::transform_content(&page, &conf, &None, &None);
let content = content::transform_content(&page, &conf, &Some("UTF-8".into()), &None);
assert!(
content
== r#"<html xmlns="http://www.w3.org/1999/xhtml" class="paper"><head>
<meta name="disabled-adaptations" content="watch" />
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="viewport" content="initial-scale=1" />
<base href="https://spider.cloud/" />
<title>Transform Test</title>
<script><![CDATA[window.isReaderPage = true;]]></script>
</head><body>
<h1>Fun is fun</h1><a href="https://spider.cloud">Spider Cloud</a><pre>The content is ready</pre></body></html>"#,
"The tranform to xml is invalid"
);
}

#[test]
fn test_xml_transformations() {
let markup = template().into_string();

let url = "https://spider.cloud";

let mut conf = content::TransformConfig::default();
let mut page_response = PageResponse::default();
conf.return_format = ReturnFormat::XML;
page_response.content = Some(Bytes::from(markup));
let page = build(url, page_response);
let content = content::transform_content(&page, &conf, &None, &None);
assert!(
content
.contains(& "<?xml version=\"1.0\" encoding=\"UTF-8\"?><root xmlns:custom=\"https://spider.cloud/\"><html custom:class=\"paper\"><head>\n<meta custom:name=\"disabled-adaptations\" custom:content=\"watch\" />\n<meta custom:http-equiv=\"Content-Type\" custom:content=\"text/html; charset=utf-8\" />\n<meta custom:name=\"viewport\" custom:content=\"initial-scale=1\" />\n<base custom:href=\"https://spider.cloud/\" />\n<title>Transform Test</title>\n<script>window.isReaderPage = true;</script>\n</head><body>\n<h1>Fun is fun</h1><a custom:href=\"https://spider.cloud\">Spider Cloud</a><pre>The content is ready</pre></body></html></root>"),
== r#"<!DOCTYPE html><html xmlns="http://www.w3.org/1999/xhtml"><head><meta charset="utf-8" /><title>Transform Test</title></head><body><h1>Fun is fun</h1><a href="https://spider.cloud">Spider Cloud</a><pre>The content is ready</pre><script><![CDATA[document.querySelector(&amp;quot;pre&amp;quot;)]]></script></body></html>"#,
"The tranform to xml is invalid"
);
}
Expand Down
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.9.6"
version = "2.9.8"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "2.9.6"
version = "2.9.8"
authors = [
"j-mendez <[email protected]>"
]
Expand Down

0 comments on commit 6485110

Please sign in to comment.