Commit

Merge pull request #13 from polywrap/update-parsing
fix: properly order text elements & return array of links | /workflows/cd
dOrgJelli authored Sep 27, 2023
2 parents c74ed34 + b155cdc commit 2a131cd
Showing 4 changed files with 44 additions and 42 deletions.
2 changes: 1 addition & 1 deletion polywrap.deploy.yaml
@@ -12,7 +12,7 @@ jobs:
package: http
uri: $$ipfs_deploy
config:
postUrl: https://wraps.wrapscan.io/r/polywrap/[email protected].1
postUrl: https://wraps.wrapscan.io/r/polywrap/[email protected].2
headers:
- name: Authorization
value: $POLYWRAP_WRAPSCAN_AUTH_HEADER_PROD
2 changes: 1 addition & 1 deletion polywrap.graphql
@@ -2,5 +2,5 @@

type Module {
get_text(url: String!): String!
get_links(url: String!): String!
get_links(url: String!): [String!]!
}
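
In the Wrap schema above, get_links now returns [String!]! instead of a single String!, so consumers get a real array of links rather than having to split a newline-joined string. In the generated Rust bindings this maps to Vec<String>, which is why the ModuleTrait signature in src/lib.rs changes further down. A minimal sketch of that mapping, assuming argument structs that mirror the schema's single url field (the real generated code lives under src/wrap and may differ in detail):

// A sketch, not the actual wrap codegen output; it only illustrates how the
// schema types map to Rust: String! -> String, [String!]! -> Vec<String>.
pub struct ArgsGetText { pub url: String }
pub struct ArgsGetLinks { pub url: String }

pub trait ModuleTrait {
    fn get_text(args: ArgsGetText) -> Result<String, String>;
    // get_links now hands back an array instead of a newline-joined string
    fn get_links(args: ArgsGetLinks) -> Result<Vec<String>, String>;
}
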
48 changes: 30 additions & 18 deletions src/__tests__/e2e/integration.spec.ts
@@ -17,33 +17,45 @@ describe("WebScraper", () => {
});
expect(result.ok).toBeTruthy();
if (!result.ok) return;
expect(result.value).toContain(`/
#page-top
/
/cloud-scraper
/pricing
#section3
/documentation
/tutorials
/how-to-videos
/test-sites
https://forum.webscraper.io/
https://chrome.google.com/webstore/detail/web-scraper/jnhgnonknehpejjnehehllkliplmbmhn?hl=en
https://cloud.webscraper.io/
/test-sites/e-commerce/allinone
/test-sites/e-commerce/allinone/phones
/test-sites/e-commerce/allinone/computers`
);
expect(result.value).toEqual(expect.arrayContaining([
"/",
"#page-top",
"/",
"/cloud-scraper",
"/pricing",
"#section3",
"/documentation",
"/tutorials",
"/how-to-videos",
"/test-sites",
"https://forum.webscraper.io/",
"https://chrome.google.com/webstore/detail/web-scraper/jnhgnonknehpejjnehehllkliplmbmhn?hl=en",
"https://cloud.webscraper.io/",
"/test-sites/e-commerce/allinone",
"/test-sites/e-commerce/allinone/phones",
"/test-sites/e-commerce/allinone/computers",
]));
});

it("get_text", async () => {
const result = await webScraper.get_text({
url: "\nhttps://webscraper.io/test-sites/e-commerce/allinone\n"
url: "https://webscraper.io/test-sites/e-commerce/allinone"
});
expect(result.ok).toBeTruthy();
if (!result.ok) return;
expect(result.value).toContain(
`Web Scraper\nCloud Scraper\n`
);
});

it("get_text 2", async () => {
const result = await webScraper.get_text({
url: "https://silennaihin.com/random/plain.html"
});
expect(result.ok).toBeTruthy();
if (!result.ok) return;
expect(result.value).toContain(
`This is a Heading\nThis is a paragraph.`
);
});
});
34 changes: 12 additions & 22 deletions src/lib.rs
@@ -1,19 +1,13 @@
use wrap::{*, imported::ArgsGet};
use scraper::{Html, Selector, ElementRef};
use scraper::{Html, Selector};
use imported::http_module::HttpModule;
use wrap::imported::{HttpResponseType, HttpRequest};

pub mod wrap;
pub use wrap::prelude::*;

fn extract_text(element: &ElementRef) -> String {
let text: String = element.text().collect::<Vec<_>>().join(" ");
let text = text.replace("\n", " ").trim().to_string();
text
}

impl ModuleTrait for Module {
fn get_links(args: ArgsGetLinks) -> Result<String, String> {
fn get_links(args: ArgsGetLinks) -> Result<Vec<String>, String> {
let result = HttpModule::get(&ArgsGet {
url: args.url.clone(),
request: Some(HttpRequest{
@@ -26,7 +20,7 @@ impl ModuleTrait for Module {
})
})?;

let document = Html::parse_document(&result.unwrap().body.unwrap());
let document = Html::parse_document(&result.unwrap().body.unwrap());
let selector = Selector::parse("a[href]").unwrap();

let mut links = Vec::new();
@@ -37,7 +31,7 @@ impl ModuleTrait for Module {
}
}

Ok(links.join("\n"))
Ok(links)
}

fn get_text(args: ArgsGetText) -> Result<String, String> {
@@ -52,24 +46,20 @@ impl ModuleTrait for Module {
form_data: None,
})
})?;

let document: Html = Html::parse_document(&result.unwrap().body.unwrap());

let selectors = vec!["p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "span"];

let mut text_vec: Vec<String> = Vec::new();
let document: Html = Html::parse_document(&result.unwrap().body.unwrap());
let selector = Selector::parse("p,h1,h2,h3,h4,h5,h6,div,span").unwrap();

for selector in selectors {
let selector = Selector::parse(selector).unwrap();
for element in document.select(&selector) {
let text = extract_text(&element);
let mut text: Vec<String> = vec!();

if !text.starts_with(".css") && !text.starts_with("html") {
text_vec.push(text);
for element in document.select(&selector) {
for el in element.text().collect::<Vec<_>>() {
if !el.starts_with(".css") && !el.starts_with("html") {
text.push(el.trim().to_string());
}
}
}

Ok(text_vec.join("\n"))
Ok(text.join("\n"))
}
}
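
The ordering fix in get_text comes from replacing the per-tag loops with one combined selector: the old code collected every "p" match before moving on to "h1", "h2", and so on, so text came back grouped by tag rather than in page order, while scraper's select walks the parsed tree once and yields matches in document order. A standalone sketch of the difference, using the scraper crate as src/lib.rs does (the HTML snippet and printed strings are only illustrative):

// Illustrative only: shows why a single combined selector preserves
// document order while looping selector-by-selector does not.
use scraper::{Html, Selector};

fn main() {
    let html = r#"<h1>Title</h1><p>First</p><h2>Sub</h2><p>Second</p>"#;
    let document = Html::parse_document(html);

    // Old approach: iterate one tag at a time, which emits all <p> text
    // before any <h1>/<h2> text -> "First Second Title Sub"
    let mut grouped = Vec::new();
    for sel in ["p", "h1", "h2"] {
        let selector = Selector::parse(sel).unwrap();
        for element in document.select(&selector) {
            grouped.push(element.text().collect::<String>());
        }
    }
    println!("{}", grouped.join(" "));

    // New approach: one combined selector walks the tree once, so matches
    // come back in document order -> "Title First Sub Second"
    let selector = Selector::parse("p,h1,h2").unwrap();
    let ordered: Vec<String> = document
        .select(&selector)
        .map(|element| element.text().collect::<String>())
        .collect();
    println!("{}", ordered.join(" "));
}
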
