From ebab0a7055ec64cddcf5b774febad3d873947c17 Mon Sep 17 00:00:00 2001 From: Pierre Narcisi Date: Wed, 27 Sep 2023 15:02:56 +0200 Subject: [PATCH 01/16] fix(permissions) send import permission object --- .../modal_dataset/import-modal-dataset.component.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/app/components/modal_dataset/import-modal-dataset.component.html b/frontend/app/components/modal_dataset/import-modal-dataset.component.html index 7219235f..fbb70a8c 100644 --- a/frontend/app/components/modal_dataset/import-modal-dataset.component.html +++ b/frontend/app/components/modal_dataset/import-modal-dataset.component.html @@ -19,7 +19,7 @@ label="{{ 'MetaData.Datasets' | translate }}" [parentFormControl]="selectDatasetForm" moduleCode="IMPORT" - creatableInModule="IMPORT" + creatableInModule="IMPORT.IMPORT" >
Date: Wed, 27 Sep 2023 19:30:55 +0200 Subject: [PATCH 02/16] check referential without cast Use already converted values from types checking. --- .../gn_module_import/checks/sql/__init__.py | 60 +++++++++---------- .../migrations/ea67bf7b6888_remove_cd_fk.py | 50 ++++++++++++++++ backend/gn_module_import/tasks.py | 12 ++-- .../gn_module_import/tests/files/cd_file.csv | 8 ++- .../gn_module_import/tests/test_imports.py | 4 +- 5 files changed, 92 insertions(+), 42 deletions(-) create mode 100644 backend/gn_module_import/migrations/ea67bf7b6888_remove_cd_fk.py diff --git a/backend/gn_module_import/checks/sql/__init__.py b/backend/gn_module_import/checks/sql/__init__.py index 406722f1..980cdc20 100644 --- a/backend/gn_module_import/checks/sql/__init__.py +++ b/backend/gn_module_import/checks/sql/__init__.py @@ -2,7 +2,7 @@ from flask import current_app from sqlalchemy import func -from sqlalchemy.sql.expression import select, update, insert, literal +from sqlalchemy.sql.expression import select, update, insert, literal, join from sqlalchemy.sql import column import sqlalchemy as sa from sqlalchemy.dialects.postgresql import array_agg, aggregate_order_by @@ -174,39 +174,36 @@ def check_nomenclatures(imprt, fields): ) -def set_column_from_referential(imprt, field, reference, error_type, whereclause=None): - source_field = getattr(ImportSyntheseData, field.source_field) +def check_referential(imprt, field, reference_field, error_type, reference_table=None): synthese_field = getattr(ImportSyntheseData, field.synthese_field) - stmt = ( - update(ImportSyntheseData) - .values( - { - synthese_field: reference, - } - ) - .where( - sa.and_( - source_field == sa.cast(reference, sa.Unicode), - ImportSyntheseData.id_import == imprt.id_import, + if reference_table is None: + reference_table = reference_field.class_ + # We outerjoin the referential, and select rows where there is a value in synthese field + # but no value in referential, which means no value in the referential matched 
synthese field. + cte = ( + select([ImportSyntheseData.line_no]) + .select_from( + join( + ImportSyntheseData, + reference_table, + synthese_field == reference_field, + isouter=True, ) ) + .where(ImportSyntheseData.imprt == imprt) + .where(synthese_field != None) + .where(reference_field == None) + .cte("invalid_ref") ) - if whereclause is not None: - stmt = stmt.where(whereclause) - db.session.execute(stmt) report_erroneous_rows( imprt, error_type=error_type, error_column=field.name_field, - whereclause=sa.and_( - source_field != None, - source_field != "", - synthese_field == None, - ), + whereclause=ImportSyntheseData.line_no == cte.c.line_no, ) -def set_cd_nom(imprt, fields): +def check_cd_nom(imprt, fields): if "cd_nom" not in fields: return field = fields["cd_nom"] @@ -214,21 +211,22 @@ def set_cd_nom(imprt, fields): # Filter out on a taxhub list if provided list_id = current_app.config["IMPORT"].get("ID_LIST_TAXA_RESTRICTION", None) if list_id is not None: - whereclause = sa.and_( - CorNomListe.id_liste == list_id, - BibNoms.id_nom == CorNomListe.id_nom, - Taxref.cd_nom == BibNoms.cd_nom, + reference_table = join(Taxref, BibNoms).join( + CorNomListe, + sa.and_(BibNoms.id_nom == CorNomListe.id_nom, CorNomListe.id_liste == list_id), ) - set_column_from_referential( - imprt, field, Taxref.cd_nom, "CD_NOM_NOT_FOUND", whereclause=whereclause + else: + reference_table = Taxref + check_referential( + imprt, field, Taxref.cd_nom, "CD_NOM_NOT_FOUND", reference_table=reference_table ) -def set_cd_hab(imprt, fields): +def check_cd_hab(imprt, fields): if "cd_hab" not in fields: return field = fields["cd_hab"] - set_column_from_referential(imprt, field, Habref.cd_hab, "CD_HAB_NOT_FOUND") + check_referential(imprt, field, Habref.cd_hab, "CD_HAB_NOT_FOUND") def set_altitudes(imprt, fields): diff --git a/backend/gn_module_import/migrations/ea67bf7b6888_remove_cd_fk.py b/backend/gn_module_import/migrations/ea67bf7b6888_remove_cd_fk.py new file mode 100644 index 
00000000..9a95c6f9 --- /dev/null +++ b/backend/gn_module_import/migrations/ea67bf7b6888_remove_cd_fk.py @@ -0,0 +1,50 @@ +"""remove cd fk + +Revision ID: ea67bf7b6888 +Revises: d6bf8eaf088c +Create Date: 2023-09-27 15:37:19.286693 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = "ea67bf7b6888" +down_revision = "d6bf8eaf088c" +branch_labels = None +depends_on = None + + +def upgrade(): + op.drop_constraint( + schema="gn_imports", + table_name="t_imports_synthese", + constraint_name="t_imports_synthese_cd_nom_fkey", + ) + op.drop_constraint( + schema="gn_imports", + table_name="t_imports_synthese", + constraint_name="t_imports_synthese_cd_hab_fkey", + ) + + +def downgrade(): + op.create_foreign_key( + constraint_name="t_imports_synthese_cd_nom_fkey", + source_schema="gn_imports", + source_table="t_imports_synthese", + local_cols=["cd_nom"], + referent_schema="taxonomie", + referent_table="taxref", + remote_cols=["cd_nom"], + ) + op.create_foreign_key( + constraint_name="t_imports_synthese_cd_hab_fkey", + source_schema="gn_imports", + source_table="t_imports_synthese", + local_cols=["cd_hab"], + referent_schema="ref_habitats", + referent_table="habref", + remote_cols=["cd_hab"], + ) diff --git a/backend/gn_module_import/tasks.py b/backend/gn_module_import/tasks.py index 577e3f6d..3171ce6b 100644 --- a/backend/gn_module_import/tasks.py +++ b/backend/gn_module_import/tasks.py @@ -23,8 +23,8 @@ do_nomenclatures_mapping, check_nomenclatures, complete_others_geom_columns, - set_cd_nom, - set_cd_hab, + check_cd_nom, + check_cd_hab, set_altitudes, set_uuid, check_mandatory_fields, @@ -63,9 +63,7 @@ def do_import_checks(self, import_id): field.name_field: field for field in selected_fields if ( # handled in SQL, exclude from dataframe - field.source_field is not None - and field.mnemonique is None - and field.name_field not in ["cd_nom", "cd_hab"] + field.source_field is not None and field.mnemonique is None ) } 
@@ -94,8 +92,8 @@ def do_import_checks(self, import_id): complete_others_geom_columns, do_nomenclatures_mapping, check_nomenclatures, - set_cd_nom, - set_cd_hab, + check_cd_nom, + check_cd_hab, check_duplicates_source_pk, set_altitudes, check_altitudes, diff --git a/backend/gn_module_import/tests/files/cd_file.csv b/backend/gn_module_import/tests/files/cd_file.csv index eb555f35..cec11406 100644 --- a/backend/gn_module_import/tests/files/cd_file.csv +++ b/backend/gn_module_import/tests/files/cd_file.csv @@ -2,10 +2,12 @@ date_min;nom_cite;observateurs;WKT;cd_nom;cd_hab;Erreur(s) attendue(s) 2017-01-01;Ablette;Toto;POINT(6.5 44.85);;;MISSING_VALUE (cd_nom) 2017-01-01;Ablette;Toto;POINT(6.5 44.85);123456789;;CD_NOM_NOT_FOUND 2017-01-01;Ablette;Toto;POINT(6.5 44.85);67111;;Valide -2017-01-01;Ablette;Toto;POINT(6.5 44.85);;123456789;MISSING_VALUE (cd_nom), CD_HAB_NOT_FOUND +2017-01-01;Ablette;Toto;POINT(6.5 44.85);;123456789;MISSING_VALUE (cd_nom) (CD_HAB_NOT_FOUND non levé car les lignes avec champs manquant ne sont pas traitées) 2017-01-01;Ablette;Toto;POINT(6.5 44.85);;629;MISSING_VALUE (cd_nom) -2017-01-01;Ablette;Toto;POINT(6.5 44.85);123456789;123456789;CD_NOM_NOT_FOUND, CD_HAB_NOT_FOUND +2017-01-01;Ablette;Toto;POINT(6.5 44.85);123456789;123456789;CD_NOM_NOT_FOUND & CD_HAB_NOT_FOUND 2017-01-01;Ablette;Toto;POINT(6.5 44.85);67111;123456789;CD_HAB_NOT_FOUND 2017-01-01;Ablette;Toto;POINT(6.5 44.85);123456789;629;CD_NOM_NOT_FOUND 2017-01-01;Ablette;Toto;POINT(6.5 44.85);67111;629;Valide -2017-01-01;Acanthodrilus;Toto;POINT(6.5 44.85);886847;629;CD_NOM_NOT_FOUND +2017-01-01;Acanthodrilus;Toto;POINT(6.5 44.85);886847;629;CD_NOM_NOT_FOUND (existe mais pas dans la liste) +2017-01-01;Ablette;Toto;POINT(6.5 44.85);aaa;629;INVALID_INTEGER (cd_nom) +2017-01-01;Ablette;Toto;POINT(6.5 44.85);67111;aaa;INVALID_INTEGER (cd_hab) diff --git a/backend/gn_module_import/tests/test_imports.py b/backend/gn_module_import/tests/test_imports.py index 7e658f6d..10421bcd 100644 --- 
a/backend/gn_module_import/tests/test_imports.py +++ b/backend/gn_module_import/tests/test_imports.py @@ -970,7 +970,9 @@ def test_import_cd_file(self, change_id_list_conf, prepared_import): { ("MISSING_VALUE", "cd_nom", frozenset([1, 4, 5])), ("CD_NOM_NOT_FOUND", "cd_nom", frozenset([2, 6, 8, 10])), - ("CD_HAB_NOT_FOUND", "cd_hab", frozenset([4, 6, 7])), + ("CD_HAB_NOT_FOUND", "cd_hab", frozenset([6, 7])), + ("INVALID_INTEGER", "cd_nom", frozenset([11])), + ("INVALID_INTEGER", "cd_hab", frozenset([12])), }, ) From c37900beddfee5ca7cd9ca4c3b1fd53c927e16e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89lie=20Bouttier?= Date: Wed, 27 Sep 2023 20:09:39 +0200 Subject: [PATCH 03/16] input url for dataset: clarifies object code --- backend/gn_module_import/models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/gn_module_import/models.py b/backend/gn_module_import/models.py index 4076d2b2..39505a25 100644 --- a/backend/gn_module_import/models.py +++ b/backend/gn_module_import/models.py @@ -47,6 +47,7 @@ def generate_input_url_for_dataset(self, dataset): return f"/import/process/upload?datasetId={dataset.id_dataset}" generate_input_url_for_dataset.label = "Importer des données" + generate_input_url_for_dataset.object_code = "IMPORT" def generate_module_url_for_source(self, source): id_import = re.search(r"^Import\(id=(?P\d+)\)$", source.name_source).group("id") From 01aa338cbb1a8670545f01c75c598cdd15e8c9b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89lie=20Bouttier?= Date: Thu, 28 Sep 2023 10:11:47 +0200 Subject: [PATCH 04/16] allow only one error per import/type/column Extends list of erroneous rows if an error already exists. 
--- .../checks/dataframe/__init__.py | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/backend/gn_module_import/checks/dataframe/__init__.py b/backend/gn_module_import/checks/dataframe/__init__.py index f75b1414..a3fe84db 100644 --- a/backend/gn_module_import/checks/dataframe/__init__.py +++ b/backend/gn_module_import/checks/dataframe/__init__.py @@ -2,7 +2,9 @@ from uuid import uuid4 from itertools import chain +from sqlalchemy import func from sqlalchemy.orm.exc import NoResultFound +from sqlalchemy.dialects.postgresql import insert as pg_insert from flask import current_app from geonature.utils.env import db @@ -118,11 +120,21 @@ def run_all_checks(imprt, fields: Dict[str, BibFields], df): ordered_invalid_rows = sorted(invalid_rows["line_no"]) column = generated_fields.get(error["column"], error["column"]) column = imprt.fieldmapping.get(column, column) - error = ImportUserError( - imprt=imprt, - type=error_type, - column=column, - rows=ordered_invalid_rows, - comment=error.get("comment"), + # If an error for same import, same column and of the same type already exists, + # we concat existing erroneous rows with current rows. 
+ stmt = pg_insert(ImportUserError).values( + { + "id_import": imprt.id_import, + "id_error": error_type.pk, + "column_error": column, + "id_rows": ordered_invalid_rows, + "comment": error.get("comment"), + } ) - db.session.add(error) + stmt = stmt.on_conflict_do_update( + constraint="t_user_errors_un", # unique (import, error_type, column) + set_={ + "id_rows": func.array_cat(ImportUserError.rows, stmt.excluded["id_rows"]), + }, + ) + db.session.execute(stmt) From efcf24655b6f66621c32d671b329f34e30b35a43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89lie=20Bouttier?= Date: Thu, 28 Sep 2023 10:11:57 +0200 Subject: [PATCH 05/16] do dataframe checks in batches --- backend/gn_module_import/conf_schema_toml.py | 1 + .../2896cf965dd6_unique_import_error.py | 31 +++++++++++ backend/gn_module_import/tasks.py | 52 +++++++++++++------ .../gn_module_import/tests/test_imports.py | 7 ++- backend/gn_module_import/utils.py | 10 +++- 5 files changed, 82 insertions(+), 19 deletions(-) create mode 100644 backend/gn_module_import/migrations/2896cf965dd6_unique_import_error.py diff --git a/backend/gn_module_import/conf_schema_toml.py b/backend/gn_module_import/conf_schema_toml.py index 3c0070da..37e3988c 100644 --- a/backend/gn_module_import/conf_schema_toml.py +++ b/backend/gn_module_import/conf_schema_toml.py @@ -155,3 +155,4 @@ class GnModuleSchemaConf(Schema): # are in the list. 
Otherwise throws an error ID_LIST_TAXA_RESTRICTION = fields.Integer(load_default=None) MODULE_URL = fields.String(load_default="/import") + DATAFRAME_BATCH_SIZE = fields.Integer(load_default=10000) diff --git a/backend/gn_module_import/migrations/2896cf965dd6_unique_import_error.py b/backend/gn_module_import/migrations/2896cf965dd6_unique_import_error.py new file mode 100644 index 00000000..b38d3104 --- /dev/null +++ b/backend/gn_module_import/migrations/2896cf965dd6_unique_import_error.py @@ -0,0 +1,31 @@ +"""unique import error + +Revision ID: 2896cf965dd6 +Revises: d6bf8eaf088c +Create Date: 2023-09-28 10:19:10.133530 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = "2896cf965dd6" +down_revision = "d6bf8eaf088c" +branch_labels = None +depends_on = None + + +def upgrade(): + op.create_unique_constraint( + schema="gn_imports", + table_name="t_user_errors", + columns=["id_import", "id_error", "column_error"], + constraint_name="t_user_errors_un", + ) + + +def downgrade(): + op.drop_constraint( + schema="gn_imports", table_name="t_user_errors", constraint_name="t_user_errors_un" + ) diff --git a/backend/gn_module_import/tasks.py b/backend/gn_module_import/tasks.py index 577e3f6d..b6918f99 100644 --- a/backend/gn_module_import/tasks.py +++ b/backend/gn_module_import/tasks.py @@ -1,4 +1,5 @@ from datetime import datetime +from math import ceil from flask import current_app from sqlalchemy import func, distinct @@ -16,6 +17,7 @@ from gn_module_import.checks.dataframe.geography import set_the_geom_column from gn_module_import.utils import ( load_import_data_in_dataframe, + mark_all_rows_as_invalid, update_import_data_from_dataframe, import_data_to_synthese, ) @@ -69,24 +71,42 @@ def do_import_checks(self, import_id): ) } - # Checks on dataframe - logger.info("Loading import data in dataframe…") - with start_sentry_child(op="check.df", description="load dataframe"): - df = 
load_import_data_in_dataframe(imprt, fields) + with start_sentry_child(op="check.df", description="mark_all"): + mark_all_rows_as_invalid(imprt) self.update_state(state="PROGRESS", meta={"progress": 0.1}) - logger.info("Running dataframe checks…") - with start_sentry_child(op="check.df", description="run all checks"): - run_all_checks(imprt, fields, df) - self.update_state(state="PROGRESS", meta={"progress": 0.2}) - logger.info("Completing geometric columns…") - with start_sentry_child(op="check.df", description="set geom column"): - set_the_geom_column(imprt, fields, df) - self.update_state(state="PROGRESS", meta={"progress": 0.3}) - logger.info("Updating import data from dataframe…") - with start_sentry_child(op="check.df", description="save dataframe"): - update_import_data_from_dataframe(imprt, fields, df) - self.update_state(state="PROGRESS", meta={"progress": 0.4}) + batch_size = current_app.config["IMPORT"]["DATAFRAME_BATCH_SIZE"] + batch_count = ceil(imprt.source_count / batch_size) + + def update_batch_progress(batch, step): + start = 0.1 + end = 0.4 + step_count = 4 + progress = start + ((batch + 1) / batch_count) * (step / step_count) * (end - start) + self.update_state(state="PROGRESS", meta={"progress": progress}) + + for batch in range(batch_count): + offset = batch * batch_size + batch_fields = fields.copy() + # Checks on dataframe + logger.info(f"[{batch+1}/{batch_count}] Loading import data in dataframe…") + with start_sentry_child(op="check.df", description="load dataframe"): + df = load_import_data_in_dataframe(imprt, batch_fields, offset, batch_size) + update_batch_progress(batch, 1) + logger.info(f"[{batch+1}/{batch_count}] Running dataframe checks…") + with start_sentry_child(op="check.df", description="run all checks"): + run_all_checks(imprt, batch_fields, df) + update_batch_progress(batch, 2) + logger.info(f"[{batch+1}/{batch_count}] Completing geometric columns…") + with start_sentry_child(op="check.df", description="set geom column"): + 
set_the_geom_column(imprt, batch_fields, df) + update_batch_progress(batch, 3) + logger.info(f"[{batch+1}/{batch_count}] Updating import data from dataframe…") + with start_sentry_child(op="check.df", description="save dataframe"): + update_import_data_from_dataframe(imprt, batch_fields, df) + update_batch_progress(batch, 4) + + fields = batch_fields # retrive fields added during dataframe checks fields.update({field.name_field: field for field in selected_fields}) # Checks in SQL diff --git a/backend/gn_module_import/tests/test_imports.py b/backend/gn_module_import/tests/test_imports.py index 7e658f6d..8034b705 100644 --- a/backend/gn_module_import/tests/test_imports.py +++ b/backend/gn_module_import/tests/test_imports.py @@ -124,6 +124,11 @@ def create_import(authors=[]): } +@pytest.fixture() +def small_batch(monkeypatch): + monkeypatch.setitem(current_app.config["IMPORT"], "DATAFRAME_BATCH_SIZE", 3) + + @pytest.fixture() def no_default_nomenclatures(monkeypatch): monkeypatch.setitem( @@ -247,7 +252,7 @@ def content_mapped_import(client, import_file_name, loaded_import): @pytest.fixture() -def prepared_import(client, content_mapped_import): +def prepared_import(client, content_mapped_import, small_batch): set_logged_user_cookie(client, content_mapped_import.authors[0]) r = client.post(url_for("import.prepare_import", import_id=content_mapped_import.id_import)) assert r.status_code == 200, r.data diff --git a/backend/gn_module_import/utils.py b/backend/gn_module_import/utils.py index ea6b0126..f727f7eb 100644 --- a/backend/gn_module_import/utils.py +++ b/backend/gn_module_import/utils.py @@ -219,7 +219,7 @@ def build_fieldmapping(imprt, columns): return fieldmapping, used_columns -def load_import_data_in_dataframe(imprt, fields): +def load_import_data_in_dataframe(imprt, fields, offset, limit): source_cols = [ "id_import", "line_no", @@ -230,6 +230,9 @@ def load_import_data_in_dataframe(imprt, fields): .filter( ImportSyntheseData.imprt == imprt, ) + 
.order_by(ImportSyntheseData.line_no) + .offset(offset) + .limit(limit) .all() ) df = pd.DataFrame.from_records( @@ -239,10 +242,13 @@ def load_import_data_in_dataframe(imprt, fields): return df -def update_import_data_from_dataframe(imprt, fields, df): +def mark_all_rows_as_invalid(imprt): db.session.query(ImportSyntheseData).filter_by(id_import=imprt.id_import).update( {"valid": False} ) + + +def update_import_data_from_dataframe(imprt, fields, df): if not len(df[df["valid"] == True]): return updated_cols = [ From a530fed6fc546e59d39c49588b878ba0e322baba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89lie=20Bouttier?= Date: Thu, 28 Sep 2023 10:29:47 +0200 Subject: [PATCH 06/16] add Debian 12 python version to test matrix --- .github/workflows/pytest.yml | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 9eaf3ec0..54e7c9cc 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -22,17 +22,22 @@ jobs: strategy: fail-fast: false matrix: + debian-version: [ '10', '11', '12' ] include: - - name: "Debian 10" + - debian-version: '10' python-version: "3.7" - postgres-version: 11 - postgis-version: 2.5 - - name: "Debian 11" - python-version: "3.9" - postgres-version: 13 - postgis-version: 3.2 + postgres-version: '11' + postgis-version: '2.5' + - debian-version: '11' + python-version: '3.9' + postgres-version: '13' + postgis-version: '3.2' + - debian-version: '12' + python-version: '3.11' + postgres-version: '15' + postgis-version: '3.3' - name: ${{ matrix.name }} + name: Debian ${{ matrix.debian-version }} services: postgres: @@ -118,7 +123,7 @@ jobs: GEONATURE_CONFIG_FILE: dependencies/GeoNature/config/test_config.toml GEONATURE_SETTINGS: gn_module_import.test_config - name: Upload coverage to Codecov - if: ${{ matrix.name == 'Debian 11' }} + if: ${{ matrix.debian-version == '12' }} uses: codecov/codecov-action@v2 with: flags: pytest From 
373e6c38119f3aea4713006d67720ac5166fb487 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89lie=20Bouttier?= Date: Thu, 28 Sep 2023 11:39:00 +0200 Subject: [PATCH 07/16] avoid decoding full file when looking columns --- backend/gn_module_import/routes/imports.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/backend/gn_module_import/routes/imports.py b/backend/gn_module_import/routes/imports.py index 3bb3e163..ec45c132 100644 --- a/backend/gn_module_import/routes/imports.py +++ b/backend/gn_module_import/routes/imports.py @@ -1,6 +1,5 @@ -from io import BytesIO import codecs -from io import StringIO +from io import BytesIO, StringIO, TextIOWrapper import csv import unicodedata @@ -215,7 +214,7 @@ def upload_file(scope, import_id): @blueprint.route("/imports//decode", methods=["POST"]) @permissions.check_cruved_scope("C", get_scope=True, module_code="IMPORT", object_code="IMPORT") def decode_file(scope, import_id): - imprt = TImports.query.options(undefer("source_file")).get_or_404(import_id) + imprt = TImports.query.get_or_404(import_id) if not imprt.has_instance_permission(scope): raise Forbidden if not imprt.dataset.active: @@ -257,15 +256,19 @@ def decode_file(scope, import_id): except ValueError: raise BadRequest(description="decode parameter must but an int") if decode: + csvfile = TextIOWrapper(BytesIO(imprt.source_file), encoding=imprt.encoding) + csvreader = csv.reader(csvfile, delimiter=imprt.separator) try: - csvfile = StringIO(imprt.source_file.decode(imprt.encoding)) + columns = next(csvreader) + while True: # read full file to ensure that no encoding errors occur + next(csvreader) except UnicodeError as e: raise BadRequest( description="Erreur d’encodage lors de la lecture du fichier source. " "Avez-vous sélectionné le bon encodage de votre fichier ?" 
) - csvreader = csv.reader(csvfile, delimiter=imprt.separator) - columns = next(csvreader) + except StopIteration: + pass duplicates = set([col for col in columns if columns.count(col) > 1]) if duplicates: raise BadRequest(f"Duplicates column names: {duplicates}") From 943863e1f3b54f371ac46e1c2895c19340cbb939 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89lie=20Bouttier?= Date: Thu, 28 Sep 2023 11:43:01 +0200 Subject: [PATCH 08/16] avoid loading full file when generating csv errors --- backend/gn_module_import/routes/imports.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/gn_module_import/routes/imports.py b/backend/gn_module_import/routes/imports.py index ec45c132..9749dacc 100644 --- a/backend/gn_module_import/routes/imports.py +++ b/backend/gn_module_import/routes/imports.py @@ -522,7 +522,7 @@ def get_import_invalid_rows_as_csv(scope, import_id): Export invalid data in CSV. """ - imprt = TImports.query.options(undefer("source_file")).get_or_404(import_id) + imprt = TImports.query.get_or_404(import_id) if not imprt.has_instance_permission(scope): raise Forbidden if not imprt.processed: @@ -533,7 +533,7 @@ def get_import_invalid_rows_as_csv(scope, import_id): @stream_with_context def generate_invalid_rows_csv(): - sourcefile = StringIO(imprt.source_file.decode(imprt.encoding)) + sourcefile = TextIOWrapper(BytesIO(imprt.source_file), encoding=imprt.encoding) destfile = StringIO() csvreader = csv.reader(sourcefile, delimiter=imprt.separator) csvwriter = csv.writer(destfile, dialect=csvreader.dialect, lineterminator="\n") From 7cd2eac10e84902c34ffb9e10a572e7681e78c9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89lie=20Bouttier?= Date: Thu, 28 Sep 2023 11:45:32 +0200 Subject: [PATCH 09/16] avoid loading full file when importing in table --- backend/gn_module_import/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/gn_module_import/utils.py b/backend/gn_module_import/utils.py index 
ea6b0126..183e454e 100644 --- a/backend/gn_module_import/utils.py +++ b/backend/gn_module_import/utils.py @@ -1,5 +1,5 @@ import os -from io import StringIO +from io import BytesIO, TextIOWrapper import csv import ast import json @@ -131,7 +131,7 @@ def insert_import_data_in_database(imprt): extra_columns = set(columns) - set(used_columns) - csvfile = StringIO(imprt.source_file.decode(imprt.encoding)) + csvfile = TextIOWrapper(BytesIO(imprt.source_file), encoding=imprt.encoding) csvreader = csv.DictReader(csvfile, fieldnames=columns, delimiter=imprt.separator) header = next(csvreader, None) # skip header for key, value in header.items(): # FIXME From ac76e2345e6bd0bacafbc632ae6d90817bc33f35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89lie=20Bouttier?= Date: Thu, 28 Sep 2023 14:40:09 +0200 Subject: [PATCH 10/16] linearize alembic history --- .../migrations/2896cf965dd6_unique_import_error.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/gn_module_import/migrations/2896cf965dd6_unique_import_error.py b/backend/gn_module_import/migrations/2896cf965dd6_unique_import_error.py index b38d3104..cecc2ed8 100644 --- a/backend/gn_module_import/migrations/2896cf965dd6_unique_import_error.py +++ b/backend/gn_module_import/migrations/2896cf965dd6_unique_import_error.py @@ -11,7 +11,7 @@ # revision identifiers, used by Alembic. revision = "2896cf965dd6" -down_revision = "d6bf8eaf088c" +down_revision = "ea67bf7b6888" branch_labels = None depends_on = None From 69557f804b9b89a680a1c88c295a6cfa1407d1a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89lie=20Bouttier?= Date: Thu, 28 Sep 2023 15:14:09 +0200 Subject: [PATCH 11/16] fix race condition Imports are created at same time in fixture, which leads to unreliable sorting order. 
--- backend/gn_module_import/tests/test_imports.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/backend/gn_module_import/tests/test_imports.py b/backend/gn_module_import/tests/test_imports.py index aad5ec9d..d32f193c 100644 --- a/backend/gn_module_import/tests/test_imports.py +++ b/backend/gn_module_import/tests/test_imports.py @@ -400,11 +400,9 @@ def test_search_import(self, users, imports, uploaded_import): def test_order_import(self, users, imports, uploaded_import): set_logged_user_cookie(self.client, users["user"]) - r_des = self.client.get(url_for("import.get_import_list")) + r_des = self.client.get(url_for("import.get_import_list") + "?sort=id_import") assert r_des.status_code == 200, r_des.data - r_asc = self.client.get( - url_for("import.get_import_list") + "?sort=date_create_import&sort_dir=asc" - ) + r_asc = self.client.get(url_for("import.get_import_list") + "?sort=id_import&sort_dir=asc") assert r_asc.status_code == 200, r_asc.data import_ids_des = [imprt["id_import"] for imprt in r_des.get_json()["imports"]] import_ids_asc = [imprt["id_import"] for imprt in r_asc.get_json()["imports"]] From 4d0b930c51ec44c27f12c9a1429b007d9b41bcf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89lie=20Bouttier?= Date: Thu, 28 Sep 2023 15:33:47 +0200 Subject: [PATCH 12/16] disable sql check of mandatory fields Mandatory fields are already checked during dataframe checks. 
--- backend/gn_module_import/checks/sql/__init__.py | 1 + backend/gn_module_import/tasks.py | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/backend/gn_module_import/checks/sql/__init__.py b/backend/gn_module_import/checks/sql/__init__.py index 980cdc20..ba5c0acc 100644 --- a/backend/gn_module_import/checks/sql/__init__.py +++ b/backend/gn_module_import/checks/sql/__init__.py @@ -378,6 +378,7 @@ def set_uuid(imprt, fields): db.session.execute(stmt) +# Currently not used as done during dataframe checks def check_mandatory_fields(imprt, fields): for field in fields.values(): if not field.mandatory or not field.synthese_field: diff --git a/backend/gn_module_import/tasks.py b/backend/gn_module_import/tasks.py index b4295c29..b4ef6d40 100644 --- a/backend/gn_module_import/tasks.py +++ b/backend/gn_module_import/tasks.py @@ -29,7 +29,6 @@ check_cd_hab, set_altitudes, set_uuid, - check_mandatory_fields, check_duplicates_source_pk, check_dates, check_altitudes, @@ -121,7 +120,6 @@ def update_batch_progress(batch, step): check_dates, check_depths, check_digital_proof_urls, - check_mandatory_fields, check_is_valid_geography, check_geography_outside, ] From cce6d471b48d809093eb2d13b1bc97e43a57b6f9 Mon Sep 17 00:00:00 2001 From: Camille Monchicourt Date: Thu, 28 Sep 2023 16:14:37 +0200 Subject: [PATCH 13/16] Changelog 2.2.3 --- docs/CHANGELOG.rst | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/docs/CHANGELOG.rst b/docs/CHANGELOG.rst index b77d9dc2..c9971b71 100644 --- a/docs/CHANGELOG.rst +++ b/docs/CHANGELOG.rst @@ -2,6 +2,24 @@ CHANGELOG ========= +2.2.3 (2023-09-28) +------------------ + +Nécessite la version 2.13.2 (ou plus) de GeoNature. 
+ +**🚀 Nouveautés** + +* Amélioration des performances de la vérification des cd_nom des observations importées (#424) +* Amélioration des performances du chargement des données pour leur contrôle (#484) +* Amélioration des performances de l'analyse des colonnes du fichier source (#486) +* Amélioration des tests automatisés + +**🐛 Corrections** + +* Correction des permissions de la liste des JDD auxquels un utilisateur peut associer un import (#481) +* Correction du bouton d'import dans un JDD depuis sa fiche dans le module Métadonnées (#483) + + 2.2.2 (2023-09-19) ------------------ From 94e5efd12e7d79e45cd2cca60047bea936c73500 Mon Sep 17 00:00:00 2001 From: Camille Monchicourt Date: Thu, 28 Sep 2023 16:15:11 +0200 Subject: [PATCH 14/16] Bump VERSION - 2.2.3 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index b1b25a5f..58594069 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.2.2 +2.2.3 From e8b8a515d1c1db534ebe68f0e92bea24a4c45535 Mon Sep 17 00:00:00 2001 From: Camille Monchicourt Date: Thu, 28 Sep 2023 17:07:57 +0200 Subject: [PATCH 15/16] Bump GN minimal version to 2.13.2 --- requirements.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.in b/requirements.in index 68fb702b..c68566f0 100644 --- a/requirements.in +++ b/requirements.in @@ -5,4 +5,4 @@ pyproj;python_version>="3.10" chardet jsonschema utils-flask-sqlalchemy>=0.3.0,<1.0.0 -geonature>=2.13.0 +geonature>=2.13.2 From 3277e11d7ea8bd6c8ea5728886bc46dfd64f4229 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89lie=20Bouttier?= Date: Thu, 28 Sep 2023 17:22:32 +0200 Subject: [PATCH 16/16] =?UTF-8?q?bump=20GeoNature=20submodule=202.13.1=20?= =?UTF-8?q?=E2=86=92=202.13.2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dependencies/GeoNature | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dependencies/GeoNature b/dependencies/GeoNature index a978d6aa..0a3764b2 160000 
--- a/dependencies/GeoNature +++ b/dependencies/GeoNature @@ -1 +1 @@ -Subproject commit a978d6aa6e2d85b51aa644f0e2aee74e67b1cb6d +Subproject commit 0a3764b29d83fb400b5bebb201bb94e353a915be