Skip to content

Commit

Permalink
Add User filter on wiki page (#13)
Browse files Browse the repository at this point in the history
* add filter

* rename filter

* remove unwanted import
  • Loading branch information
SaulLu authored Mar 3, 2022
1 parent b2d7d51 commit 0d5b6b9
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 3 deletions.
5 changes: 3 additions & 2 deletions clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from datasets import Dataset, load_dataset, load_from_disk

from datasets.utils.logging import set_verbosity_info
from clean_helpers import filter_wiki_non_text_type
from clean_helpers import filter_wiki_user_titles, filter_wiki_non_text_type

# Switch the `datasets` library logging (datasets.utils.logging) to INFO verbosity.
set_verbosity_info()
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
Expand All @@ -13,7 +13,8 @@
# Map functions, keyed by name (none are registered here).
MAPS = {}
# Filter functions, keyed by name.
# NOTE(review): presumably these keys are the names accepted by the
# `--maps-and-filters` command-line option -- confirm against the argument parser.
FILTERS = {
"filter_wiki_non_text_type": filter_wiki_non_text_type
"filter_wiki_user_titles": filter_wiki_user_titles,
"filter_wiki_non_text_type": filter_wiki_non_text_type,
}

# A name must never be registered both as a map and as a filter.
assert set(MAPS.keys()).isdisjoint(set(FILTERS.keys()))
Expand Down
2 changes: 1 addition & 1 deletion clean_helpers/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from .filter_wiki_meta import filter_wiki_non_text_type
from .filter_wiki_meta import filter_wiki_user_titles, filter_wiki_non_text_type
4 changes: 4 additions & 0 deletions clean_helpers/filter_wiki_meta.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,6 @@
def filter_wiki_user_titles(examples):
    """Batched filter: keep examples whose title does not start with ``"User "``.

    Args:
        examples: Batch mapping with a ``"meta"`` column. Each entry is a
            string that evaluates to a mapping with a ``"title"`` key
            (presumably the ``repr`` of a dict -- confirm against the
            dataset schema).

    Returns:
        A list of booleans, one per entry of ``examples["meta"]``; ``True``
        means the example is kept.

    Raises:
        KeyError: If ``examples`` has no ``"meta"`` column or an evaluated
            entry has no ``"title"`` key.
    """
    # NOTE(security): eval() executes arbitrary code found in the dataset's
    # `meta` strings. If those strings are plain dict literals,
    # ast.literal_eval() (or json.loads() for JSON) would be a safe
    # replacement -- confirm the stored format before switching.
    return [
        not eval(meta)["title"].startswith("User ")
        for meta in examples["meta"]
    ]

def filter_wiki_non_text_type(examples):
    """Batched filter: keep only examples whose meta ``"type"`` equals ``"text"``.

    Args:
        examples: Batch mapping with a ``"meta"`` column. Each entry is a
            string that evaluates to a mapping with a ``"type"`` key
            (presumably the ``repr`` of a dict -- confirm against the
            dataset schema).

    Returns:
        A list of booleans, one per entry of ``examples["meta"]``; ``True``
        means the example is kept.

    Raises:
        KeyError: If ``examples`` has no ``"meta"`` column or an evaluated
            entry has no ``"type"`` key.
    """
    # NOTE(security): eval() executes arbitrary code found in the dataset's
    # `meta` strings. If those strings are plain dict literals,
    # ast.literal_eval() (or json.loads() for JSON) would be a safe
    # replacement -- confirm the stored format before switching.
    return [eval(meta)["type"] == "text" for meta in examples["meta"]]

12 changes: 12 additions & 0 deletions slurm/cleaning_filtering/00_test_filter_user_titles.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Test run of the `filter_wiki_user_titles` filter through clean.py on the
# lm_en_wikinews_filtered dataset.
# NOTE(review): `conda activate` normally requires conda's shell hook to be
# loaded in this shell -- confirm how this script is launched.
conda activate datacatalog

CATALOGUE_DATA_REPO="/home/lucile/code/catalogue_data"

# Quote the path and stop if the directory is missing, so clean.py is never
# run from an unexpected working directory.
cd "$CATALOGUE_DATA_REPO" || exit 1

python clean.py \
    --dataset-path bigscience-catalogue-lm-data/lm_en_wikinews_filtered \
    --maps-and-filters filter_wiki_user_titles \
    --save-path /home/lucile/data/result_filtering_cleaning/lm_en_wikinews_filtered.jsonl \
    --num-proc 4 \
    --batch-size 100

0 comments on commit 0d5b6b9

Please sign in to comment.