From b155cdce18eb8baa3ceaad8609a49fe97737c29f Mon Sep 17 00:00:00 2001 From: dOrgJelli Date: Wed, 27 Sep 2023 17:03:02 +0200 Subject: [PATCH] fix: properly order text elements & return array of links --- polywrap.deploy.yaml | 2 +- polywrap.graphql | 2 +- src/__tests__/e2e/integration.spec.ts | 48 +++++++++++++++++---------- src/lib.rs | 34 +++++++------------ 4 files changed, 44 insertions(+), 42 deletions(-) diff --git a/polywrap.deploy.yaml b/polywrap.deploy.yaml index dd6a128..fb17364 100644 --- a/polywrap.deploy.yaml +++ b/polywrap.deploy.yaml @@ -12,7 +12,7 @@ jobs: package: http uri: $$ipfs_deploy config: - postUrl: https://wraps.wrapscan.io/r/polywrap/web-scraper@1.0.1 + postUrl: https://wraps.wrapscan.io/r/polywrap/web-scraper@1.0.2 headers: - name: Authorization value: $POLYWRAP_WRAPSCAN_AUTH_HEADER_PROD diff --git a/polywrap.graphql b/polywrap.graphql index ce3dbc0..103ec71 100644 --- a/polywrap.graphql +++ b/polywrap.graphql @@ -2,5 +2,5 @@ type Module { get_text(url: String!): String! - get_links(url: String!): String! + get_links(url: String!): [String!]! } diff --git a/src/__tests__/e2e/integration.spec.ts b/src/__tests__/e2e/integration.spec.ts index 766ecc0..d6ae668 100644 --- a/src/__tests__/e2e/integration.spec.ts +++ b/src/__tests__/e2e/integration.spec.ts @@ -17,28 +17,29 @@ describe("WebScraper", () => { }); expect(result.ok).toBeTruthy(); if (!result.ok) return; - expect(result.value).toContain(`/ -#page-top -/ -/cloud-scraper -/pricing -#section3 -/documentation -/tutorials -/how-to-videos -/test-sites -https://forum.webscraper.io/ -https://chrome.google.com/webstore/detail/web-scraper/jnhgnonknehpejjnehehllkliplmbmhn?hl=en -https://cloud.webscraper.io/ -/test-sites/e-commerce/allinone -/test-sites/e-commerce/allinone/phones -/test-sites/e-commerce/allinone/computers` - ); + expect(result.value).toEqual(expect.arrayContaining([ + "/", + "#page-top", + "/", + "/cloud-scraper", + "/pricing", + "#section3", + "/documentation", + "/tutorials", + "/how-to-videos", + "/test-sites", + "https://forum.webscraper.io/", + "https://chrome.google.com/webstore/detail/web-scraper/jnhgnonknehpejjnehehllkliplmbmhn?hl=en", + "https://cloud.webscraper.io/", + "/test-sites/e-commerce/allinone", + "/test-sites/e-commerce/allinone/phones", + "/test-sites/e-commerce/allinone/computers", + ])); }); it("get_text", async () => { const result = await webScraper.get_text({ - url: "\nhttps://webscraper.io/test-sites/e-commerce/allinone\n" + url: "https://webscraper.io/test-sites/e-commerce/allinone" }); expect(result.ok).toBeTruthy(); if (!result.ok) return; @@ -46,4 +47,15 @@ https://cloud.webscraper.io/ `Web Scraper\nCloud Scraper\n` ); }); + + it("get_text 2", async () => { + const result = await webScraper.get_text({ + url: "https://silennaihin.com/random/plain.html" + }); + expect(result.ok).toBeTruthy(); + if (!result.ok) return; + expect(result.value).toContain( + `This is a Heading\nThis is a paragraph.` + ); + }); }); diff --git a/src/lib.rs b/src/lib.rs index 6fa162d..3ba965b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,19 +1,13 @@ use wrap::{*, imported::ArgsGet}; -use scraper::{Html, Selector, ElementRef}; +use scraper::{Html, Selector}; use imported::http_module::HttpModule; use wrap::imported::{HttpResponseType, HttpRequest}; pub mod wrap; pub use wrap::prelude::*; -fn extract_text(element: &ElementRef) -> String { - let text: String = element.text().collect::>().join(" "); - let text = text.replace("\n", " ").trim().to_string(); - text -} - impl ModuleTrait for Module { - fn get_links(args: ArgsGetLinks) -> Result { + fn get_links(args: ArgsGetLinks) -> Result, String> { let result = HttpModule::get(&ArgsGet { url: args.url.clone(), request: Some(HttpRequest{ @@ -26,7 +20,7 @@ impl ModuleTrait for Module { }) })?; - let document = Html::parse_document(&result.unwrap().body.unwrap()); + let document = Html::parse_document(&result.unwrap().body.unwrap()); let selector = Selector::parse("a[href]").unwrap(); let mut links = Vec::new(); @@ -37,7 +31,7 @@ impl ModuleTrait for Module { } } - Ok(links.join("\n")) + Ok(links) } fn get_text(args: ArgsGetText) -> Result { @@ -52,24 +46,20 @@ impl ModuleTrait for Module { form_data: None, }) })?; - - let document: Html = Html::parse_document(&result.unwrap().body.unwrap()); - let selectors = vec!["p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "span"]; - - let mut text_vec: Vec = Vec::new(); + let document: Html = Html::parse_document(&result.unwrap().body.unwrap()); + let selector = Selector::parse("p,h1,h2,h3,h4,h5,h6,div,span").unwrap(); - for selector in selectors { - let selector = Selector::parse(selector).unwrap(); - for element in document.select(&selector) { - let text = extract_text(&element); + let mut text: Vec = vec!(); - if !text.starts_with(".css") && !text.starts_with("html") { - text_vec.push(text); + for element in document.select(&selector) { + for el in element.text().collect::>() { + if !el.starts_with(".css") && !el.starts_with("html") { + text.push(el.trim().to_string()); } } } - Ok(text_vec.join("\n")) + Ok(text.join("\n")) } }