feat: filtering (#18)
* feat: add filtering

* feat: update readme

* fix: apply black
samsja authored Aug 10, 2023
1 parent 8e66273 commit 5d1eed9
Showing 4 changed files with 91 additions and 3 deletions.
21 changes: 20 additions & 1 deletion README.md
@@ -66,6 +66,25 @@ Another way to inject diversity is prompt engineering. By having random aspects

 ## Generating Dataset
 
+
+Follow these steps to reproduce the dataset generation.
+
+
+First, export your OpenAI key:
+```shell
+export OPENAI_API_KEY=sk-XXX
+```
+Then start the parallel calls to OpenAI:
+```shell
+python dataset_gen_cli.py generate ./tree/professions.json ./tree/subsubtopics.json ./exercises --n-prompts 50_000 --pool-size 20
+```
+
+This should take around 6 hours. The process might be killed before the end, but the data will still be saved progressively.
+
+
+Once the files are generated, you can postprocess them and save them into a JSONL file:
+
 ```shell
-python textbook/dataset_gen/dataset_gen_cli.py --pool-size 10 "tests/data/prompts_debug.jsonl"
+python dataset_gen_cli.py filter ./exercises dataset.jsonl
 ```
+
9 changes: 7 additions & 2 deletions tests/dataset_gen/test_integration.py
@@ -1,4 +1,5 @@
-from textbook.dataset_gen.dataset_gen_cli import generate
+from textbook.dataset_gen.dataset_gen_cli import generate, filter
+import os
 
 
 def test_cli_dataset_gen(tmp_path):
@@ -9,5 +10,9 @@ def test_cli_dataset_gen(tmp_path):
         debug_speed=-1,
         retries=10,
         pool_size=10,
-        output_path=tmp_path / "results.jsonl",
+        output_path=tmp_path,
     )
+
+    filter(exo_path=tmp_path, dataset_file=os.path.join(tmp_path, "dataset.jsonl"))
+
+    assert os.path.exists(os.path.join(tmp_path, "dataset.jsonl"))
11 changes: 11 additions & 0 deletions textbook/dataset_gen/dataset_gen_cli.py
@@ -9,11 +9,14 @@
     mass_generation,
     OpenAIGenerator,
     MonkeyGenerator,
+    write_results_to_jsonl,
 )
 import openai
 import os
+from pathlib import Path
 
 from textbook.dataset_gen.create_prompts import Topic, Query
+from textbook.dataset_gen.filtering import load_and_filter_exos
 
 app = Typer()

@@ -116,5 +119,13 @@ def get_generator():
     )
 
 
+@app.command()
+def filter(exo_path: Path, dataset_file: str):
+    print(exo_path)
+    exos = load_and_filter_exos(exo_path)
+    print(len(exos))
+    write_results_to_jsonl(dataset_file, exos)
+
+
 if __name__ == "__main__":
     app()
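
The new `filter` subcommand matches the README usage above (`python dataset_gen_cli.py filter ./exercises dataset.jsonl`). As a minimal sketch, it can also be exercised in-process with Typer's test runner; the argument paths below are illustrative, not part of the commit:

```python
# Sketch: invoke the new `filter` subcommand in-process via Typer's test
# runner. The paths are illustrative, mirroring the README example above.
from typer.testing import CliRunner

from textbook.dataset_gen.dataset_gen_cli import app

runner = CliRunner()
result = runner.invoke(app, ["filter", "./exercises", "dataset.jsonl"])
assert result.exit_code == 0
```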
53 changes: 53 additions & 0 deletions textbook/dataset_gen/filtering.py
@@ -0,0 +1,53 @@
from textbook.dataset_gen.dataset_gen import Exercise
from typing import List, Union
import os
from pathlib import Path


def load_one_file(path: Union[Path, str]) -> List[Exercise]:
    with open(path, "r") as f:
        lines = f.readlines()
    return [Exercise.parse_raw(line) for line in lines]


def load_all_exo(path: Union[Path, str]) -> List[Exercise]:
    if isinstance(path, str):
        path = Path(path)
    exos: List[Exercise] = []
    for sub_dir in os.listdir(path):
        for fn in os.listdir(path / sub_dir):
            exos += load_one_file(path / sub_dir / fn)
    return exos


def filter_bad_exos(
    exos: List[Exercise], carac_to_remove=["??", "___"]
) -> List[Exercise]:
    clean_exos: List[Exercise] = []
    for exo in exos:
        keep = True
        for carac in carac_to_remove:
            if carac in exo.solution:
                keep = False
                break

        if keep:
            clean_exos.append(exo)

    return clean_exos


def remove_extra(exos: List[Exercise], carac_to_split=["# Test", "```"]):
    for exo in exos:
        for carac in carac_to_split:
            exo.solution = exo.solution.split(carac)[0]


def load_and_filter_exos(path: Union[Path, str]) -> List[Exercise]:
    exos = load_all_exo(path)
    print(len(exos))
    clean_exos = filter_bad_exos(exos)
    print(len(clean_exos))

    remove_extra(clean_exos)
    return clean_exos
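
To make the filtering behaviour concrete, here is a minimal sketch run on toy data. It assumes only that `Exercise` is a pydantic model with a `solution: str` field (which `Exercise.parse_raw` above suggests); the sample solutions are invented for illustration:

```python
# Minimal sketch of filter_bad_exos + remove_extra on invented data,
# assuming Exercise is a pydantic model with a `solution: str` field.
from pydantic import BaseModel

from textbook.dataset_gen.filtering import filter_bad_exos, remove_extra


class Exercise(BaseModel):  # stand-in for textbook.dataset_gen.dataset_gen.Exercise
    solution: str


exos = [
    Exercise(solution="def add(a, b):\n    return a + b\n# Test\nassert add(1, 2) == 3"),
    Exercise(solution="def sub(a, b):\n    return a ?? b"),  # leftover placeholder
]

clean = filter_bad_exos(exos)  # drops the exercise containing "??"
remove_extra(clean)            # truncates the survivor at "# Test", in place
assert clean[0].solution == "def add(a, b):\n    return a + b\n"
```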
