Skip to content

Commit

Permalink
Merge pull request #7 from philschmid/support-input-directories
Browse files Browse the repository at this point in the history
add directory support
  • Loading branch information
philschmid authored Jan 9, 2024
2 parents fe4f219 + ceec19f commit 3102606
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 18 deletions.
9 changes: 8 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ _note: for crawling you need `playwright` and the browser dependencies._

**Options:**

- `-i, --input <file>` - Input file (html) to clip content from
- `-i, --input <file> | <directory>` - Input file (html) or directory to clip content from. If a directory is provided, every `.html` file directly inside it is clipped (subdirectories are not traversed).
- `-u, --url <url>` - URL to clip content from
- `-f, --format <format>` - Output format (`md`, `json`) (default: `md`)
- `-o, --output <file>` - Output file for clipped content (default: output.md)
Expand All @@ -43,6 +43,12 @@ clipper clip -u <url>
clipper clip -i <file>
```

3. Clip content from a directory, converting every HTML file in it into a single JSON Lines (`.jsonl`) file:

```
clipper clip -i <directory> -f json -o dataset.jsonl
```

### Crawl

> [!WARNING]
Expand Down Expand Up @@ -81,6 +87,7 @@ clipper clip -i test.html
- Clone the repo
- Run `npm install`
- Run `npm run test -- clip -u https://huggingface.co/docs/transformers/index` to test the CLI
- Run `npm run test -- clip -i examples/` to test the CLI with directory input
- Run `npm run test -- crawl -u https://awsdocs-neuron.readthedocs-hosted.com/en/v2.14.1/index.html -g https://awsdocs-neuron.readthedocs-hosted.com/en/v2.14.1/\*\*/\*` to crawl the AWS Neuron docs
- Run `npm run build` to build for production
- Run `npm install -g .` to symlink the CLI for local testing
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@philschmid/clipper",
"version": "0.1.4",
"version": "0.2.0",
"description": "A CLI to clip articles from the web and save them as markdown files.",
"author": "Philipp Schmid",
"main": "dist/index.js",
Expand Down
28 changes: 20 additions & 8 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,31 +4,43 @@ import { program } from 'commander';
import { extract_from_html, extract_from_url } from './clipper';
import { readHtmlFileFromPath, writeMarkdownToFile, writeMarkdownToJsonlines } from './utils';
import { crawl } from './crawler';
import * as fs from 'fs';
import * as path from 'path';

program
.version("1.0.0")
.description("An example CLI for managing a directory")

program.command("clip").description("Converts HTML to markdown")
.option("-i, --input <value>", "Path to HTML file to clip")
.option("-i, --input <value>", "Path to HTML or directory file to clip, e.g. test/ or test/index.html")
.option("-o, --output <value>", "Path to output file", "output.md")
.option("-u, --url <value>", "URL to clip")
.option("-f, --format <value>", "format how you want to store the outpout, can be `md` or `json`", "md")
.action(async (args, options) => {
// console.log(options)
let res: string
let res: string | Record<string, string>[];
if (args.url) {
res = await extract_from_url(args.url)
res = await extract_from_url(args.url);
} else if (args.input) {
res = await extract_from_html(readHtmlFileFromPath(args.input))
const inputPath = args.input;
const isDirectory = fs.lstatSync(inputPath).isDirectory();
if (isDirectory) {
const htmlFiles = fs.readdirSync(inputPath).filter(file => file.endsWith('.html')).map(file => path.join(inputPath, file));
res = []
for await (const file of htmlFiles) {
const md = await extract_from_html(readHtmlFileFromPath(file));
res.push({ markdown: md, file });
}
} else {
res = await extract_from_html(readHtmlFileFromPath(inputPath));
}
} else {
throw new Error("Please specify either a URL or a file path")
throw new Error("Please specify either a URL or a file path");
}
// writes to file
if (args.format === "json") {
writeMarkdownToJsonlines(res, args.output)
writeMarkdownToJsonlines(res, args.output);
} else {
writeMarkdownToFile(res, args.output)
writeMarkdownToFile(res, args.output);
}
});

Expand Down
24 changes: 16 additions & 8 deletions src/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,24 @@ export function readHtmlFileFromPath(path: string): string {
return html;
}

export function writeMarkdownToFile(markdown: string, output: string): string {
// check if outout is a markdown file else remove extension and add .md
const outputFileName = output.endsWith('.md') ? output : output.replace(/\.[^/.]+$/, "") + '.md'
// write markdown to file
writeFileSync(outputFileName, markdown)
// return filename
return outputFileName
/**
 * Writes clipped markdown to disk as one or more `.md` files.
 *
 * Any trailing extension on `output` is stripped first, so the written files
 * always end in `.md` regardless of the extension that was passed in.
 *
 * @param markdown - Either a single markdown string, or an array of records
 *   whose `markdown` property holds the text to write.
 * @param output - Target path. A string input is written to `<stem>.md`; an
 *   array input is written to `<stem>_<index>.md`, one file per element.
 */
export function writeMarkdownToFile(markdown: string | Record<"markdown", string>[], output: string) {
  // Drop a trailing ".ext" (if any) so the suffixes below can be appended safely.
  const stem = output.replace(/\.[^/.]+$/, "");

  // Single document: one file named after the stem.
  if (!Array.isArray(markdown)) {
    writeFileSync(stem + '.md', markdown);
    return;
  }

  // Several documents: one file each, with the element's index in the name.
  markdown.forEach((entry, index) => {
    writeFileSync(`${stem}_${index}.md`, entry.markdown);
  });
}

export function writeMarkdownToJsonlines(markdown: string | string[], output: string): string {
export function writeMarkdownToJsonlines(markdown: string | Record<string, string>[], output: string): string {
// check if output is a jsonl file, else remove extension and add .jsonl
const outputFileName = output.endsWith('.jsonl') ? output : output.replace(/\.[^/.]+$/, "") + '.jsonl'
// check if markdown is a string; if so, convert it to an array
Expand Down

0 comments on commit 3102606

Please sign in to comment.