Skip to content

Commit

Permalink
chore(encoding): add html lang auto parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Oct 22, 2024
1 parent ab40676 commit 2065600
Show file tree
Hide file tree
Showing 13 changed files with 438 additions and 278 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ target
_temp_spider_downloads
storage
http-cacache
release.sh
release.sh
spider_transformations/example.pdf
32 changes: 16 additions & 16 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.9.17"
version = "2.10.0"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
7 changes: 5 additions & 2 deletions spider/src/packages/scraper/element_ref/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,13 @@ use crate::packages::scraper::selector::Selector;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ElementRef<'a> {
/// The wrapped node reference.
node: NodeRef<'a, Node>,
/// The language of the element. `new` initializes it to the empty string,
/// and it is handed to `crate::page::encode_bytes_from_language` when the
/// element is serialized.
pub lang: &'a str,
}

impl<'a> ElementRef<'a> {
fn new(node: NodeRef<'a, Node>) -> Self {
ElementRef { node }
ElementRef { node, lang: "" }
}

/// Wraps a `NodeRef` only if it references a `Node::Element`.
Expand Down Expand Up @@ -60,7 +62,8 @@ impl<'a> ElementRef<'a> {
match serialize(&mut buf, self, opts) {
_ => (),
};
crate::page::encode_bytes_from_language(&buf, "")
// we need to get the initial encoding of the html lang if used.
crate::page::encode_bytes_from_language(&buf, self.lang)
}

/// Returns the HTML of this element.
Expand Down
39 changes: 36 additions & 3 deletions spider/src/packages/scraper/html/mod.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
//! HTML documents and fragments.

use ego_tree::iter::Nodes;
use ego_tree::Tree;
use ego_tree::{NodeId, Tree};
use fast_html5ever::serialize::SerializeOpts;
use fast_html5ever::tree_builder::QuirksMode;
use fast_html5ever::QualName;
Expand All @@ -12,6 +12,10 @@ use crate::packages::scraper::element_ref::ElementRef;
use crate::packages::scraper::node::Node;
use crate::packages::scraper::selector::Selector;

lazy_static! {
// Selector for the `html` element; `Html::get_lang` uses it to look up the
// `lang` attribute. The `unwrap` applies to a fixed, literal selector string.
static ref HTML_SELECTOR: Selector = Selector::parse("html").unwrap();
}

/// An HTML tree.
///
/// Parsing does not fail hard. Instead, the `quirks_mode` is set and errors are added to the
Expand All @@ -22,9 +26,10 @@ use crate::packages::scraper::selector::Selector;
pub struct Html {
/// The quirks mode. The constructors in this file start it at
/// `QuirksMode::NoQuirks`.
pub quirks_mode: QuirksMode,

/// The node tree, rooted at `Node::Document` or `Node::Fragment` depending
/// on which constructor built it.
pub tree: Tree<Node>,
/// The html language of the document. Empty by default; while it is empty,
/// `get_lang` falls back to the `lang` attribute of the `html` element.
pub lang: String,
}

impl Html {
Expand All @@ -33,6 +38,7 @@ impl Html {
Html {
quirks_mode: QuirksMode::NoQuirks,
tree: Tree::new(Node::Document),
lang: Default::default(),
}
}

Expand All @@ -41,6 +47,7 @@ impl Html {
Html {
quirks_mode: QuirksMode::NoQuirks,
tree: Tree::new(Node::Fragment),
lang: Default::default(),
}
}

Expand Down Expand Up @@ -96,6 +103,25 @@ impl Html {
ElementRef::wrap(root_node).unwrap()
}

/// Store `lang` as the document language, replacing any previous value.
///
/// The string is kept exactly as given; the `lang` attribute is not read here.
pub fn set_language(&mut self, lang: String) {
    let slot = &mut self.lang;
    *slot = lang;
}

/// Get the language for the page.
///
/// A non-empty stored `lang` takes precedence. Otherwise the `lang` attribute
/// of the first element matched by `HTML_SELECTOR` is returned, and when that
/// is missing too the (empty) stored value is returned.
pub fn get_lang(&self) -> &str {
    // An explicitly stored language always wins.
    if !self.lang.is_empty() {
        return &self.lang;
    }

    self.select(&HTML_SELECTOR)
        .next()
        .and_then(|root| root.value().attr("lang"))
        .unwrap_or(self.lang.as_str())
}

/// Serialize entire document into HTML.
pub fn html(&self) -> String {
let opts = SerializeOpts {
Expand All @@ -107,7 +133,14 @@ impl Html {
match serialize(&mut buf, self, opts) {
_ => (),
};
crate::page::encode_bytes_from_language(&buf, "")
crate::page::encode_bytes_from_language(&buf, self.get_lang())
}

/// Look up `node_id` in the tree and call `detach` on the node found.
///
/// Ids the tree cannot resolve are ignored.
pub fn remove_node(&mut self, node_id: NodeId) {
    let Some(mut target) = self.tree.get_mut(node_id) else {
        return;
    };
    target.detach();
}
}

Expand Down
Loading

0 comments on commit 2065600

Please sign in to comment.