feat: filtering (#18)
* feat: add filtering

* feat: update readme

* fix: apply black
samsja authored Aug 10, 2023
1 parent 8e66273 commit 5d1eed9
Showing 4 changed files with 91 additions and 3 deletions.
21 changes: 20 additions & 1 deletion README.md
@@ -66,6 +66,25 @@ Another way to inject diversity is prompt engineering. By having random aspects

 ## Generating Dataset
 
+
+Follow these steps to reproduce the dataset generation.
+
+
+First, export your OpenAI key:
+```shell
+export OPENAI_API_KEY=sk-XXX
+```
+Then start the parallel calls to OpenAI:
+```shell
+python dataset_gen_cli.py generate ./tree/professions.json ./tree/subsubtopics.json ./exercises --n-prompts 50_000 --pool-size 20
+```
+
+This should take around 6 hours. The process might be killed before the end, but the data will still be saved progressively.
+
+
+Once the files are generated, you can postprocess them and save them into a JSONL file:
+
 ```shell
-python textbook/dataset_gen/dataset_gen_cli.py --pool-size 10 "tests/data/prompts_debug.jsonl"
+python dataset_gen_cli.py filter ./exercises dataset.jsonl
 ```
+
9 changes: 7 additions & 2 deletions tests/dataset_gen/test_integration.py
@@ -1,4 +1,5 @@
-from textbook.dataset_gen.dataset_gen_cli import generate
+from textbook.dataset_gen.dataset_gen_cli import generate, filter
+import os
 
 
 def test_cli_dataset_gen(tmp_path):
@@ -9,5 +10,9 @@ def test_cli_dataset_gen(tmp_path):
         debug_speed=-1,
         retries=10,
         pool_size=10,
-        output_path=tmp_path / "results.jsonl",
+        output_path=tmp_path,
     )
+
+    filter(exo_path=tmp_path, dataset_file=os.path.join(tmp_path, "dataset.jsonl"))
+
+    assert os.path.exists(os.path.join(tmp_path, "dataset.jsonl"))
11 changes: 11 additions & 0 deletions textbook/dataset_gen/dataset_gen_cli.py
@@ -9,11 +9,14 @@
     mass_generation,
     OpenAIGenerator,
     MonkeyGenerator,
+    write_results_to_jsonl,
 )
 import openai
 import os
+from pathlib import Path
 
 from textbook.dataset_gen.create_prompts import Topic, Query
+from textbook.dataset_gen.filtering import load_and_filter_exos
 
 app = Typer()

@@ -116,5 +119,13 @@ def get_generator():
     )
 
 
+@app.command()
+def filter(exo_path: Path, dataset_file: str):
+    print(exo_path)
+    exos = load_and_filter_exos(exo_path)
+    print(len(exos))
+    write_results_to_jsonl(dataset_file, exos)
+
+
 if __name__ == "__main__":
     app()
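
The new `filter` subcommand matches the README usage above (`python dataset_gen_cli.py filter ./exercises dataset.jsonl`). As a minimal sketch, it can also be exercised in-process with Typer's test runner; the argument paths below are illustrative, not part of the commit:

```python
# Sketch: invoke the new `filter` subcommand in-process via Typer's test
# runner. The paths are illustrative, mirroring the README example above.
from typer.testing import CliRunner

from textbook.dataset_gen.dataset_gen_cli import app

runner = CliRunner()
result = runner.invoke(app, ["filter", "./exercises", "dataset.jsonl"])
assert result.exit_code == 0
```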
53 changes: 53 additions & 0 deletions textbook/dataset_gen/filtering.py
@@ -0,0 +1,53 @@
from textbook.dataset_gen.dataset_gen import Exercise
from typing import List, Union
import os
from pathlib import Path


def load_one_file(path: Union[Path, str]) -> List[Exercise]:
    with open(path, "r") as f:
        lines = f.readlines()
    return [Exercise.parse_raw(line) for line in lines]


def load_all_exo(path: Union[Path, str]) -> List[Exercise]:
    if isinstance(path, str):
        path = Path(path)
    exos: List[Exercise] = []
    for sub_dir in os.listdir(path):
        for fn in os.listdir(path / sub_dir):
            exos += load_one_file(path / sub_dir / fn)
    return exos


def filter_bad_exos(
    exos: List[Exercise], carac_to_remove=["??", "___"]
) -> List[Exercise]:
    clean_exos: List[Exercise] = []
    for exo in exos:
        keep = True
        for carac in carac_to_remove:
            if carac in exo.solution:
                keep = False
                break

        if keep:
            clean_exos.append(exo)

    return clean_exos


def remove_extra(exos: List[Exercise], carac_to_split=["# Test", "```"]):
    for exo in exos:
        for carac in carac_to_split:
            exo.solution = exo.solution.split(carac)[0]


def load_and_filter_exos(path: Union[Path, str]) -> List[Exercise]:
    exos = load_all_exo(path)
    print(len(exos))
    clean_exos = filter_bad_exos(exos)
    print(len(clean_exos))

    remove_extra(clean_exos)
    return clean_exos
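
To make the filtering behaviour concrete, here is a minimal sketch run on toy data. It assumes only that `Exercise` is a pydantic model with a `solution: str` field (which `Exercise.parse_raw` above suggests); the sample solutions are invented for illustration:

```python
# Minimal sketch of filter_bad_exos + remove_extra on invented data,
# assuming Exercise is a pydantic model with a `solution: str` field.
from pydantic import BaseModel

from textbook.dataset_gen.filtering import filter_bad_exos, remove_extra


class Exercise(BaseModel):  # stand-in for textbook.dataset_gen.dataset_gen.Exercise
    solution: str


exos = [
    Exercise(solution="def add(a, b):\n    return a + b\n# Test\nassert add(1, 2) == 3"),
    Exercise(solution="def sub(a, b):\n    return a ?? b"),  # leftover placeholder
]

clean = filter_bad_exos(exos)  # drops the exercise containing "??"
remove_extra(clean)            # truncates the survivor at "# Test", in place
assert clean[0].solution == "def add(a, b):\n    return a + b\n"
```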
