Commit

Merge pull request #13 from polywrap/update-parsing
fix: properly order text elements & return array of links | /workflows/cd
dOrgJelli authored Sep 27, 2023
2 parents c74ed34 + b155cdc commit 2a131cd
Showing 4 changed files with 44 additions and 42 deletions.
2 changes: 1 addition & 1 deletion polywrap.deploy.yaml
@@ -12,7 +12,7 @@ jobs:
package: http
uri: $$ipfs_deploy
config:
postUrl: https://wraps.wrapscan.io/r/polywrap/[email protected].1
postUrl: https://wraps.wrapscan.io/r/polywrap/[email protected].2
headers:
- name: Authorization
value: $POLYWRAP_WRAPSCAN_AUTH_HEADER_PROD
2 changes: 1 addition & 1 deletion polywrap.graphql
@@ -2,5 +2,5 @@

type Module {
get_text(url: String!): String!
get_links(url: String!): String!
get_links(url: String!): [String!]!
}
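
In the Wrap schema above, get_links now returns [String!]! instead of a single String!, so consumers get a real array of links rather than having to split a newline-joined string. In the generated Rust bindings this maps to Vec<String>, which is why the ModuleTrait signature in src/lib.rs changes further down. A minimal sketch of that mapping, assuming argument structs that mirror the schema's single url field (the real generated code lives under src/wrap and may differ in detail):

// A sketch, not the actual wrap codegen output; it only illustrates how the
// schema types map to Rust: String! -> String, [String!]! -> Vec<String>.
pub struct ArgsGetText { pub url: String }
pub struct ArgsGetLinks { pub url: String }

pub trait ModuleTrait {
    fn get_text(args: ArgsGetText) -> Result<String, String>;
    // get_links now hands back an array instead of a newline-joined string
    fn get_links(args: ArgsGetLinks) -> Result<Vec<String>, String>;
}
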
48 changes: 30 additions & 18 deletions src/__tests__/e2e/integration.spec.ts
@@ -17,33 +17,45 @@ describe("WebScraper", () => {
});
expect(result.ok).toBeTruthy();
if (!result.ok) return;
expect(result.value).toContain(`/
#page-top
/
/cloud-scraper
/pricing
#section3
/documentation
/tutorials
/how-to-videos
/test-sites
https://forum.webscraper.io/
https://chrome.google.com/webstore/detail/web-scraper/jnhgnonknehpejjnehehllkliplmbmhn?hl=en
https://cloud.webscraper.io/
/test-sites/e-commerce/allinone
/test-sites/e-commerce/allinone/phones
/test-sites/e-commerce/allinone/computers`
);
expect(result.value).toEqual(expect.arrayContaining([
"/",
"#page-top",
"/",
"/cloud-scraper",
"/pricing",
"#section3",
"/documentation",
"/tutorials",
"/how-to-videos",
"/test-sites",
"https://forum.webscraper.io/",
"https://chrome.google.com/webstore/detail/web-scraper/jnhgnonknehpejjnehehllkliplmbmhn?hl=en",
"https://cloud.webscraper.io/",
"/test-sites/e-commerce/allinone",
"/test-sites/e-commerce/allinone/phones",
"/test-sites/e-commerce/allinone/computers",
]));
});

it("get_text", async () => {
const result = await webScraper.get_text({
url: "\nhttps://webscraper.io/test-sites/e-commerce/allinone\n"
url: "https://webscraper.io/test-sites/e-commerce/allinone"
});
expect(result.ok).toBeTruthy();
if (!result.ok) return;
expect(result.value).toContain(
`Web Scraper\nCloud Scraper\n`
);
});

it("get_text 2", async () => {
const result = await webScraper.get_text({
url: "https://silennaihin.com/random/plain.html"
});
expect(result.ok).toBeTruthy();
if (!result.ok) return;
expect(result.value).toContain(
`This is a Heading\nThis is a paragraph.`
);
});
});
34 changes: 12 additions & 22 deletions src/lib.rs
@@ -1,19 +1,13 @@
use wrap::{*, imported::ArgsGet};
use scraper::{Html, Selector, ElementRef};
use scraper::{Html, Selector};
use imported::http_module::HttpModule;
use wrap::imported::{HttpResponseType, HttpRequest};

pub mod wrap;
pub use wrap::prelude::*;

fn extract_text(element: &ElementRef) -> String {
let text: String = element.text().collect::<Vec<_>>().join(" ");
let text = text.replace("\n", " ").trim().to_string();
text
}

impl ModuleTrait for Module {
fn get_links(args: ArgsGetLinks) -> Result<String, String> {
fn get_links(args: ArgsGetLinks) -> Result<Vec<String>, String> {
let result = HttpModule::get(&ArgsGet {
url: args.url.clone(),
request: Some(HttpRequest{
@@ -26,7 +20,7 @@ impl ModuleTrait for Module {
})
})?;

let document = Html::parse_document(&result.unwrap().body.unwrap());
let document = Html::parse_document(&result.unwrap().body.unwrap());
let selector = Selector::parse("a[href]").unwrap();

let mut links = Vec::new();
@@ -37,7 +31,7 @@ impl ModuleTrait for Module {
}
}

Ok(links.join("\n"))
Ok(links)
}

fn get_text(args: ArgsGetText) -> Result<String, String> {
@@ -52,24 +46,20 @@ impl ModuleTrait for Module {
form_data: None,
})
})?;

let document: Html = Html::parse_document(&result.unwrap().body.unwrap());

let selectors = vec!["p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "span"];

let mut text_vec: Vec<String> = Vec::new();
let document: Html = Html::parse_document(&result.unwrap().body.unwrap());
let selector = Selector::parse("p,h1,h2,h3,h4,h5,h6,div,span").unwrap();

for selector in selectors {
let selector = Selector::parse(selector).unwrap();
for element in document.select(&selector) {
let text = extract_text(&element);
let mut text: Vec<String> = vec!();

if !text.starts_with(".css") && !text.starts_with("html") {
text_vec.push(text);
for element in document.select(&selector) {
for el in element.text().collect::<Vec<_>>() {
if !el.starts_with(".css") && !el.starts_with("html") {
text.push(el.trim().to_string());
}
}
}

Ok(text_vec.join("\n"))
Ok(text.join("\n"))
}
}
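
The ordering fix in get_text comes from replacing the per-tag loops with one combined selector: the old code collected every "p" match before moving on to "h1", "h2", and so on, so text came back grouped by tag rather than in page order, while scraper's select walks the parsed tree once and yields matches in document order. A standalone sketch of the difference, using the scraper crate as src/lib.rs does (the HTML snippet and printed strings are only illustrative):

// Illustrative only: shows why a single combined selector preserves
// document order while looping selector-by-selector does not.
use scraper::{Html, Selector};

fn main() {
    let html = r#"<h1>Title</h1><p>First</p><h2>Sub</h2><p>Second</p>"#;
    let document = Html::parse_document(html);

    // Old approach: iterate one tag at a time, which emits all <p> text
    // before any <h1>/<h2> text -> "First Second Title Sub"
    let mut grouped = Vec::new();
    for sel in ["p", "h1", "h2"] {
        let selector = Selector::parse(sel).unwrap();
        for element in document.select(&selector) {
            grouped.push(element.text().collect::<String>());
        }
    }
    println!("{}", grouped.join(" "));

    // New approach: one combined selector walks the tree once, so matches
    // come back in document order -> "Title First Sub Second"
    let selector = Selector::parse("p,h1,h2").unwrap();
    let ordered: Vec<String> = document
        .select(&selector)
        .map(|element| element.text().collect::<String>())
        .collect();
    println!("{}", ordered.join(" "));
}
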
