@app.command()
def push(repo_name: str, dataset_file: Path):
    """Upload a JSON Lines exercise file to the Hugging Face Hub as a dataset.

    Args:
        repo_name: Hub repository id to push to, e.g.
            ``"jinaai/code_exercises_40k"``.
        dataset_file: Path to a JSON Lines file; every non-blank line is
            parsed as one JSON record.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform locale,
    # so non-ASCII exercise text loads the same way on every machine.
    with open(dataset_file, "r", encoding="utf-8") as file:
        # Blank lines (for instance a trailing newline at end of file) carry
        # no record and would make json.loads raise, so they are skipped.
        exercises = [json.loads(line) for line in file if line.strip()]

    # The records are kept in the generator's closure (rather than re-read
    # lazily from the path) so the generator carries the actual file content.
    def gen():
        yield from exercises

    dataset = Dataset.from_generator(gen)
    dataset.push_to_hub(repo_name)