diff --git a/README.md b/README.md
index 5b9c918..8421547 100644
--- a/README.md
+++ b/README.md
@@ -88,3 +88,8 @@ Once the file are generated you can postprocess the files and save it into a jso
 python dataset_gen_cli.py filter ./exercises dataset.jsonl
 ```
 
+Push the dataset to the Hugging Face Hub:
+
+```shell
+python dataset_gen_cli.py push "jinaai/code_exercises_40k" dataset.jsonl
+```
diff --git a/textbook/dataset_gen/dataset_gen_cli.py b/textbook/dataset_gen/dataset_gen_cli.py
index 679b9dc..baccb09 100644
--- a/textbook/dataset_gen/dataset_gen_cli.py
+++ b/textbook/dataset_gen/dataset_gen_cli.py
@@ -17,6 +17,7 @@
 
 from textbook.dataset_gen.create_prompts import Topic, Query
 from textbook.dataset_gen.filtering import load_and_filter_exos
+from datasets import Dataset
 
 app = Typer()
 
@@ -127,5 +128,25 @@ def filter(exo_path: Path, dataset_file: str):
     write_results_to_jsonl(dataset_file, exos)
 
 
+@app.command()
+def push(repo_name: str, dataset_file: Path) -> None:
+    """Upload a JSONL exercise file to the Hugging Face Hub as a dataset.
+
+    Args:
+        repo_name: Target dataset repository, e.g. "jinaai/code_exercises_40k".
+        dataset_file: JSONL file holding one JSON object per line.
+    """
+    # Read as UTF-8 explicitly instead of relying on the platform default.
+    with open(dataset_file, "r", encoding="utf-8") as file:
+        # Skip blank lines (e.g. stray empty lines at the end of the file),
+        # which json.loads would reject.
+        exercises = [json.loads(line) for line in file if line.strip()]
+
+    def gen():
+        yield from exercises
+
+    dataset = Dataset.from_generator(gen)
+    dataset.push_to_hub(repo_name)
+
+
 if __name__ == "__main__":
     app()