Merge pull request #7 from philschmid/support-input-directories

add directory support
philschmid · Jan 9, 2024 · 3102606 · 3102606
2 parents fe4f219 + ceec19f
commit 3102606
Show file tree

Hide file tree

Showing 4 changed files with 45 additions and 18 deletions.
diff --git a/README.md b/README.md
@@ -23,7 +23,7 @@ _note: for crawling you need `playwright` and the browser dependencies._
 
 **Options:**
 
-- `-i, --input <file>` - Input file (html) to clip content from
+- `-i, --input <file> | <directory>` - Input file (html) or directory to clip content from. If a directory is provided, every `.html` file directly inside that directory will be clipped.
 - `-u, --url <url>` - URL to clip content from
 - `-f, --format <format>` - Output format (markdown, json) (default: markdown)
 - `-o, --output <file>` - Output file for clipped content (default: output.md)
@@ -43,6 +43,12 @@ clipper clip -u <url>
 clipper clip -i <file>
 ```
 
+3. Clip content from a directory, e.g. convert a directory of HTML files to a JSONL file:
+
+```
+clipper clip -i <directory> -f json -o dataset.jsonl
+```
+
 ### Crawl 
 
 > [!WARNING]  
@@ -81,6 +87,7 @@ clipper clip -i test.html
 - Clone the repo
 - Run `npm install`
 - Run `npm run test -- clip -u https://huggingface.co/docs/transformers/index` to test the CLI
+- Run `npm run test -- clip -i examples/` to test the CLI with directory input
 - Run `npm run test -- crawl -u https://awsdocs-neuron.readthedocs-hosted.com/en/v2.14.1/index.html -g https://awsdocs-neuron.readthedocs-hosted.com/en/v2.14.1/\*\*/\*` to crawl the AWS Neuron docs
 - Run `npm run build` to build for production
 - Run `npm install -g .` to symlink the CLI for local testing

diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@philschmid/clipper",
-  "version": "0.1.4",
+  "version": "0.2.0",
   "description": "A CLI to clip articles from the web and save them as markdown files.",
   "author": "Philipp Schmid",
   "main": "dist/index.js",

diff --git a/src/index.ts b/src/index.ts
@@ -4,31 +4,43 @@ import { program } from 'commander';
 import { extract_from_html, extract_from_url } from './clipper';
 import { readHtmlFileFromPath, writeMarkdownToFile, writeMarkdownToJsonlines } from './utils';
 import { crawl } from './crawler';
+import * as fs from 'fs';
+import * as path from 'path';
 
 program
   .version("1.0.0")
   .description("An example CLI for managing a directory")
 
 program.command("clip").description("Converts HTML to markdown")
-  .option("-i, --input <value>", "Path to HTML file to clip")
+  .option("-i, --input <value>", "Path to HTML or directory file to clip, e.g. test/ or test/index.html")
   .option("-o, --output <value>", "Path to output file", "output.md")
   .option("-u, --url <value>", "URL to clip")
   .option("-f, --format <value>", "format how you want to store the outpout, can be `md` or `json`", "md")
   .action(async (args, options) => {
-    // console.log(options)
-    let res: string
+    let res: string | Record<string, string>[];
     if (args.url) {
-      res = await extract_from_url(args.url)
+      res = await extract_from_url(args.url);
     } else if (args.input) {
-      res = await extract_from_html(readHtmlFileFromPath(args.input))
+      const inputPath = args.input;
+      const isDirectory = fs.lstatSync(inputPath).isDirectory();
+      if (isDirectory) {
+        const htmlFiles = fs.readdirSync(inputPath).filter(file => file.endsWith('.html')).map(file => path.join(inputPath, file));
+        res = []
+        for await (const file of htmlFiles) {
+          const md = await extract_from_html(readHtmlFileFromPath(file));
+          res.push({ markdown: md, file });
+        }
+      } else {
+        res = await extract_from_html(readHtmlFileFromPath(inputPath));
+      }
     } else {
-      throw new Error("Please specify either a URL or a file path")
+      throw new Error("Please specify either a URL or a file path");
     }
     // writes to file
     if (args.format === "json") {
-      writeMarkdownToJsonlines(res, args.output)
+      writeMarkdownToJsonlines(res, args.output);
     } else {
-      writeMarkdownToFile(res, args.output)
+      writeMarkdownToFile(res, args.output);
     }
   });
 

diff --git a/src/utils.ts b/src/utils.ts
@@ -6,16 +6,24 @@ export function readHtmlFileFromPath(path: string): string {
   return html;
 }
 
-export function writeMarkdownToFile(markdown: string, output: string): string {
-  // check if outout is a markdown file else remove extension and add .md
-  const outputFileName = output.endsWith('.md') ? output : output.replace(/\.[^/.]+$/, "") + '.md'
-  // write markdown to file
-  writeFileSync(outputFileName, markdown)
-  // return filename
-  return outputFileName
+export function writeMarkdownToFile(markdown: string | Record<"markdown", string>[], output: string) {
+  // if markdown is an array, loop over it and save one file per element
+  if (Array.isArray(markdown)) {
+    markdown.forEach((md, i) => {
+      // strip any extension from output and append `_<index>.md`
+      const outputFileName = output.replace(/\.[^/.]+$/, "") + `_${i}.md`
+      // write markdown to file
+      writeFileSync(outputFileName, md.markdown)
+    })
+  } else {
+    // strip any extension from output and append .md
+    const outputFileName = output.replace(/\.[^/.]+$/, "") + '.md'
+    // write markdown to file
+    writeFileSync(outputFileName, markdown)
+  }
 }
 
-export function writeMarkdownToJsonlines(markdown: string | string[], output: string): string {
+export function writeMarkdownToJsonlines(markdown: string | Record<string, string>[], output: string): string {
   // check if output is a jsonl file, else remove the extension and add .jsonl
   const outputFileName = output.endsWith('.jsonl') ? output : output.replace(/\.[^/.]+$/, "") + '.jsonl'
   // check if markdown is string, if convert to array