From 0d5b6b9e33916b662248475b346f498b7370e2d1 Mon Sep 17 00:00:00 2001
From: SaulLu <55560583+SaulLu@users.noreply.github.com>
Date: Thu, 3 Mar 2022 14:47:38 +0100
Subject: [PATCH] Add User filter on wiki page (#13)

* add filter

* rename filter

* remove unwanted import
---
 clean.py                                             |  5 +++--
 clean_helpers/__init__.py                            |  2 +-
 clean_helpers/filter_wiki_meta.py                    |  4 ++++
 .../cleaning_filtering/00_test_filter_user_titles.sh | 12 ++++++++++++
 4 files changed, 20 insertions(+), 3 deletions(-)
 create mode 100644 slurm/cleaning_filtering/00_test_filter_user_titles.sh

diff --git a/clean.py b/clean.py
index b005b4e..7598f2a 100644
--- a/clean.py
+++ b/clean.py
@@ -3,7 +3,7 @@ from datasets import Dataset, load_dataset, load_from_disk
 from datasets.utils.logging import set_verbosity_info
 
 
-from clean_helpers import filter_wiki_non_text_type
+from clean_helpers import filter_wiki_user_titles, filter_wiki_non_text_type
 
 set_verbosity_info()
 logger = logging.getLogger(__name__)
@@ -13,7 +13,8 @@ MAPS = {}
 
 # Filter functions
 FILTERS = {
-    "filter_wiki_non_text_type": filter_wiki_non_text_type
+    "filter_wiki_user_titles": filter_wiki_user_titles,
+    "filter_wiki_non_text_type": filter_wiki_non_text_type,
 }
 
 assert set(MAPS.keys()).isdisjoint(set(FILTERS.keys()))
diff --git a/clean_helpers/__init__.py b/clean_helpers/__init__.py
index 460f248..4994c46 100644
--- a/clean_helpers/__init__.py
+++ b/clean_helpers/__init__.py
@@ -1 +1 @@
-from .filter_wiki_meta import filter_wiki_non_text_type
\ No newline at end of file
+from .filter_wiki_meta import filter_wiki_user_titles, filter_wiki_non_text_type
\ No newline at end of file
diff --git a/clean_helpers/filter_wiki_meta.py b/clean_helpers/filter_wiki_meta.py
index ccbb6cf..62f1a97 100644
--- a/clean_helpers/filter_wiki_meta.py
+++ b/clean_helpers/filter_wiki_meta.py
@@ -1,2 +1,6 @@
+def filter_wiki_user_titles(examples):
+    return [not eval(meta)["title"].startswith("User ") for meta in examples["meta"]]
+
 def filter_wiki_non_text_type(examples):
     return [eval(meta)["type"] == "text" for meta in examples["meta"]]
+
diff --git a/slurm/cleaning_filtering/00_test_filter_user_titles.sh b/slurm/cleaning_filtering/00_test_filter_user_titles.sh
new file mode 100644
index 0000000..477c7a4
--- /dev/null
+++ b/slurm/cleaning_filtering/00_test_filter_user_titles.sh
@@ -0,0 +1,12 @@
+conda activate datacatalog
+
+CATALOGUE_DATA_REPO="/home/lucile/code/catalogue_data"
+
+cd $CATALOGUE_DATA_REPO
+
+python clean.py \
+    --dataset-path bigscience-catalogue-lm-data/lm_en_wikinews_filtered \
+    --maps-and-filters filter_wiki_user_titles \
+    --save-path /home/lucile/data/result_filtering_cleaning/lm_en_wikinews_filtered.jsonl \
+    --num-proc 4 \
+    --batch-size 100
\ No newline at end of file