From 98e1efcebed4437e1a989328e210ad9fb5c09890 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 25 Dec 2023 22:02:00 +0200 Subject: [PATCH 1/3] added poetry + fixed failing tests --- .gitignore | 3 +- .pipelines/ci.yml | 45 ----------------- README.md | 6 ++- azure-pipelines.yml | 37 ++++++++++++++ .../models/presidio_recognizer_wrapper.py | 5 +- pyproject.toml | 48 +++++++++++++++++++ requirements.txt | 21 -------- requirements_all.txt | 20 -------- 8 files changed, 95 insertions(+), 90 deletions(-) delete mode 100644 .pipelines/ci.yml create mode 100644 azure-pipelines.yml create mode 100644 pyproject.toml delete mode 100644 requirements.txt delete mode 100644 requirements_all.txt diff --git a/.gitignore b/.gitignore index 7f8b8e2..bca667d 100644 --- a/.gitignore +++ b/.gitignore @@ -189,4 +189,5 @@ datasets/ /data *.spacy -*.pickle \ No newline at end of file +*.pickle +/poetry.lock diff --git a/.pipelines/ci.yml b/.pipelines/ci.yml deleted file mode 100644 index 9abdd8f..0000000 --- a/.pipelines/ci.yml +++ /dev/null @@ -1,45 +0,0 @@ -# Python package -# Create and test a Python package on multiple Python versions. -# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more: -# https://docs.microsoft.com/azure/devops/pipelines/languages/python - -trigger: -- master - -pr: - branches: - include: - - master - - feature/* - -pool: - vmImage: 'ubuntu-latest' -strategy: - matrix: - Python37: - python.version: '3.7' - Python38: - python.version: '3.8' - Python39: - python.version: '3.9' - Python310: - python.version: '3.10' -steps: -- task: UsePythonVersion@0 - inputs: - versionSpec: '$(python.version)' - displayName: 'Use Python $(python.version)' - -- script: | - python -m pip install --upgrade pip - pip install wheel - pip install -r requirements.txt - python -m spacy download en_core_web_lg - python -m spacy download en_core_web_sm - - displayName: 'Install base dependencies' - -- script: | - pip install pytest pytest-azurepipelines - pytest - displayName: 'pytest' diff --git a/README.md b/README.md index 448f8d2..8fb762a 100644 --- a/README.md +++ b/README.md @@ -36,8 +36,10 @@ conda create --name presidio python=3.9 conda activate presidio # Install package+dependencies -pip install -r requirements.txt -python setup.py install +pip install poetry +poetry install +# To install with all additional NER dependencies (e.g. Flair, Stanza, CRF), run: +# poetry install -with ner # Download a spaCy model used by presidio-analyzer python -m spacy download en_core_web_lg diff --git a/azure-pipelines.yml b/azure-pipelines.yml new file mode 100644 index 0000000..b86a426 --- /dev/null +++ b/azure-pipelines.yml @@ -0,0 +1,37 @@ +pr: + branches: + include: + - master + - feature/* + +pool: + vmImage: 'ubuntu-latest' +strategy: + matrix: + Python37: + python.version: '3.7' + Python38: + python.version: '3.8' + Python39: + python.version: '3.9' + Python310: + python.version: '3.10' + Python311: + python.version: '3.11' +steps: +- task: UsePythonVersion@0 + inputs: + versionSpec: '$(python.version)' + displayName: 'Use Python $(python.version)' + +- script: | + python -m pip install --upgrade pip + pip install poetry + poetry install --with dev,ner + + displayName: 'Install dependencies' + +- script: | + poetry add pytest-azurepipelines + poetry run pytest --runslow + displayName: 'pytest' diff --git a/presidio_evaluator/models/presidio_recognizer_wrapper.py b/presidio_evaluator/models/presidio_recognizer_wrapper.py index fc3b745..a6be3a5 100644 --- a/presidio_evaluator/models/presidio_recognizer_wrapper.py +++ b/presidio_evaluator/models/presidio_recognizer_wrapper.py @@ -41,12 +41,15 @@ def __init__( self.recognizer = recognizer self.nlp_engine = nlp_engine + if not self.nlp_engine.is_loaded(): + self.nlp_engine.load() + # def __make_nlp_artifacts(self, text: str): return self.nlp_engine.process_text(text, "en") # - def predict(self, sample: InputSample) -> List[str]: + def predict(self, sample: InputSample, **kwargs) -> List[str]: nlp_artifacts = None if self.with_nlp_artifacts: nlp_artifacts = self.__make_nlp_artifacts(sample.full_text) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..94beda4 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,48 @@ +[tool.poetry] +name = "presidio_evaluator" +version = "0.1.0" +description = "" +authors = ["Omri Mendels "] +readme = "README.md" + +[tool.poetry.dependencies] +python = "^3.9" +spacy = ">=3.2.0, <4.0.0" +numpy = ">=1.20.2,<2.0.0" +jupyter = ">=1" +pandas = ">=1.2.4,<2.0.0" +tqdm = ">=4.60.0,<5.0.0" +haikunator = ">=2.1.0,<3.0.0" +schwifty = ">=2023.11.2,<2024.0.0" +faker = ">=9.6.0,<10.0.0" +scikit-learn = ">1.3.2,<2.0.0" +pytest = ">=6.2.3" +presidio-analyzer = "^2.2.351" +presidio-anonymizer = "^2.2.351" +requests = ">=2.25.1" +xmltodict = ">=0.12.0" +python-dotenv = "^1.0.0" +plotly = "^5.18.0" +azure-ai-textanalytics = ">=5.3.0" +en_core_web_sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz"} +en_core_web_lg = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1.tar.gz"} + +# optional dependencies for the different NLP approaches +[tool.poetry.group.ner] +optional=true + +[tool.poetry.group.ner.dependencies] +flair = "^0.13.0" +spacy_stanza = "^1.0.0" +sklearn_crfsuite = "^0.3.6" +spacy_huggingface_pipelines = "^0.0.4" + + +[tool.poetry.group.dev.dependencies] +pytest = ">=6.*" +flake8 = ">=3.*" +pytest-azurepipelines = "^1.0.5" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 1f60c1a..0000000 --- a/requirements.txt +++ /dev/null @@ -1,21 +0,0 @@ -spacy>=3.2.0 -numpy>=1.20.2 -jupyter>=1 -pandas>=1.2.4 -tqdm>=4.60.0 -haikunator>=2.1.0 -schwifty -faker>=9.6.0 -scikit_learn -#flair -#stanza -#spacy_stanza -#sklearn_crfsuite -pytest>=6.2.3 -presidio_analyzer -presidio_anonymizer -requests>=2.25.1 -xmltodict>=0.12.0 -python-dotenv -plotly -azure-ai-textanalytics==5.2.0 diff --git a/requirements_all.txt b/requirements_all.txt deleted file mode 100644 index 64a5104..0000000 --- a/requirements_all.txt +++ /dev/null @@ -1,20 +0,0 @@ -spacy>=3.2.0 -numpy>=1.12.4 -jupyter>=1 -pandas>=1.3.4 -tqdm>=4.60.0 -haikunator>=2.1.0 -schwifty -faker>=9.6.0 -scikit_learn<0.24 -pytest>=6.2.3 -presidio_analyzer -presidio_anonymizer -requests>=2.25.1 -xmltodict>=0.12.0 -torch>=1.10.1 -python-dotenv -https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz#egg=en_core_web_sm -flair>=0.10 -stanza>=1.3.0 -spacy-stanza>=1.0.1 \ No newline at end of file From 2378e6803af77fecc6f3abb52bff31df5e583e6d Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 25 Dec 2023 22:06:41 +0200 Subject: [PATCH 2/3] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8fb762a..295447f 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ conda activate presidio pip install poetry poetry install # To install with all additional NER dependencies (e.g. Flair, Stanza, CRF), run: -# poetry install -with ner +# poetry install --with ner # Download a spaCy model used by presidio-analyzer python -m spacy download en_core_web_lg From 181918123330c1919d121be55314ee96a353ce59 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 25 Dec 2023 22:09:56 +0200 Subject: [PATCH 3/3] Update azure-pipelines.yml --- azure-pipelines.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index b86a426..c2ba4f4 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -8,8 +8,6 @@ pool: vmImage: 'ubuntu-latest' strategy: matrix: - Python37: - python.version: '3.7' Python38: python.version: '3.8' Python39: