From ebab0a7055ec64cddcf5b774febad3d873947c17 Mon Sep 17 00:00:00 2001 From: Pierre Narcisi Date: Wed, 27 Sep 2023 15:02:56 +0200 Subject: [PATCH 01/16] fix(permissions) send import permission object --- .../modal_dataset/import-modal-dataset.component.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/app/components/modal_dataset/import-modal-dataset.component.html b/frontend/app/components/modal_dataset/import-modal-dataset.component.html index 7219235f..fbb70a8c 100644 --- a/frontend/app/components/modal_dataset/import-modal-dataset.component.html +++ b/frontend/app/components/modal_dataset/import-modal-dataset.component.html @@ -19,7 +19,7 @@ label="{{ 'MetaData.Datasets' | translate }}" [parentFormControl]="selectDatasetForm" moduleCode="IMPORT" - creatableInModule="IMPORT" + creatableInModule="IMPORT.IMPORT" >
Date: Wed, 27 Sep 2023 19:30:55 +0200 Subject: [PATCH 02/16] check referential without cast Use already converted values from types checking. --- .../gn_module_import/checks/sql/__init__.py | 60 +++++++++---------- .../migrations/ea67bf7b6888_remove_cd_fk.py | 50 ++++++++++++++++ backend/gn_module_import/tasks.py | 12 ++-- .../gn_module_import/tests/files/cd_file.csv | 8 ++- .../gn_module_import/tests/test_imports.py | 4 +- 5 files changed, 92 insertions(+), 42 deletions(-) create mode 100644 backend/gn_module_import/migrations/ea67bf7b6888_remove_cd_fk.py diff --git a/backend/gn_module_import/checks/sql/__init__.py b/backend/gn_module_import/checks/sql/__init__.py index 406722f1..980cdc20 100644 --- a/backend/gn_module_import/checks/sql/__init__.py +++ b/backend/gn_module_import/checks/sql/__init__.py @@ -2,7 +2,7 @@ from flask import current_app from sqlalchemy import func -from sqlalchemy.sql.expression import select, update, insert, literal +from sqlalchemy.sql.expression import select, update, insert, literal, join from sqlalchemy.sql import column import sqlalchemy as sa from sqlalchemy.dialects.postgresql import array_agg, aggregate_order_by @@ -174,39 +174,36 @@ def check_nomenclatures(imprt, fields): ) -def set_column_from_referential(imprt, field, reference, error_type, whereclause=None): - source_field = getattr(ImportSyntheseData, field.source_field) +def check_referential(imprt, field, reference_field, error_type, reference_table=None): synthese_field = getattr(ImportSyntheseData, field.synthese_field) - stmt = ( - update(ImportSyntheseData) - .values( - { - synthese_field: reference, - } - ) - .where( - sa.and_( - source_field == sa.cast(reference, sa.Unicode), - ImportSyntheseData.id_import == imprt.id_import, + if reference_table is None: + reference_table = reference_field.class_ + # We outerjoin the referential, and select rows where there is a value in synthese field + # but no value in referential, which means no value in the referential matched 
synthese field. + cte = ( + select([ImportSyntheseData.line_no]) + .select_from( + join( + ImportSyntheseData, + reference_table, + synthese_field == reference_field, + isouter=True, ) ) + .where(ImportSyntheseData.imprt == imprt) + .where(synthese_field != None) + .where(reference_field == None) + .cte("invalid_ref") ) - if whereclause is not None: - stmt = stmt.where(whereclause) - db.session.execute(stmt) report_erroneous_rows( imprt, error_type=error_type, error_column=field.name_field, - whereclause=sa.and_( - source_field != None, - source_field != "", - synthese_field == None, - ), + whereclause=ImportSyntheseData.line_no == cte.c.line_no, ) -def set_cd_nom(imprt, fields): +def check_cd_nom(imprt, fields): if "cd_nom" not in fields: return field = fields["cd_nom"] @@ -214,21 +211,22 @@ def set_cd_nom(imprt, fields): # Filter out on a taxhub list if provided list_id = current_app.config["IMPORT"].get("ID_LIST_TAXA_RESTRICTION", None) if list_id is not None: - whereclause = sa.and_( - CorNomListe.id_liste == list_id, - BibNoms.id_nom == CorNomListe.id_nom, - Taxref.cd_nom == BibNoms.cd_nom, + reference_table = join(Taxref, BibNoms).join( + CorNomListe, + sa.and_(BibNoms.id_nom == CorNomListe.id_nom, CorNomListe.id_liste == list_id), ) - set_column_from_referential( - imprt, field, Taxref.cd_nom, "CD_NOM_NOT_FOUND", whereclause=whereclause + else: + reference_table = Taxref + check_referential( + imprt, field, Taxref.cd_nom, "CD_NOM_NOT_FOUND", reference_table=reference_table ) -def set_cd_hab(imprt, fields): +def check_cd_hab(imprt, fields): if "cd_hab" not in fields: return field = fields["cd_hab"] - set_column_from_referential(imprt, field, Habref.cd_hab, "CD_HAB_NOT_FOUND") + check_referential(imprt, field, Habref.cd_hab, "CD_HAB_NOT_FOUND") def set_altitudes(imprt, fields): diff --git a/backend/gn_module_import/migrations/ea67bf7b6888_remove_cd_fk.py b/backend/gn_module_import/migrations/ea67bf7b6888_remove_cd_fk.py new file mode 100644 index 
00000000..9a95c6f9 --- /dev/null +++ b/backend/gn_module_import/migrations/ea67bf7b6888_remove_cd_fk.py @@ -0,0 +1,50 @@ +"""remove cd fk + +Revision ID: ea67bf7b6888 +Revises: d6bf8eaf088c +Create Date: 2023-09-27 15:37:19.286693 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = "ea67bf7b6888" +down_revision = "d6bf8eaf088c" +branch_labels = None +depends_on = None + + +def upgrade(): + op.drop_constraint( + schema="gn_imports", + table_name="t_imports_synthese", + constraint_name="t_imports_synthese_cd_nom_fkey", + ) + op.drop_constraint( + schema="gn_imports", + table_name="t_imports_synthese", + constraint_name="t_imports_synthese_cd_hab_fkey", + ) + + +def downgrade(): + op.create_foreign_key( + constraint_name="t_imports_synthese_cd_nom_fkey", + source_schema="gn_imports", + source_table="t_imports_synthese", + local_cols=["cd_nom"], + referent_schema="taxonomie", + referent_table="taxref", + remote_cols=["cd_nom"], + ) + op.create_foreign_key( + constraint_name="t_imports_synthese_cd_hab_fkey", + source_schema="gn_imports", + source_table="t_imports_synthese", + local_cols=["cd_hab"], + referent_schema="ref_habitats", + referent_table="habref", + remote_cols=["cd_hab"], + ) diff --git a/backend/gn_module_import/tasks.py b/backend/gn_module_import/tasks.py index 577e3f6d..3171ce6b 100644 --- a/backend/gn_module_import/tasks.py +++ b/backend/gn_module_import/tasks.py @@ -23,8 +23,8 @@ do_nomenclatures_mapping, check_nomenclatures, complete_others_geom_columns, - set_cd_nom, - set_cd_hab, + check_cd_nom, + check_cd_hab, set_altitudes, set_uuid, check_mandatory_fields, @@ -63,9 +63,7 @@ def do_import_checks(self, import_id): field.name_field: field for field in selected_fields if ( # handled in SQL, exclude from dataframe - field.source_field is not None - and field.mnemonique is None - and field.name_field not in ["cd_nom", "cd_hab"] + field.source_field is not None and field.mnemonique is None ) } 
@@ -94,8 +92,8 @@ def do_import_checks(self, import_id): complete_others_geom_columns, do_nomenclatures_mapping, check_nomenclatures, - set_cd_nom, - set_cd_hab, + check_cd_nom, + check_cd_hab, check_duplicates_source_pk, set_altitudes, check_altitudes, diff --git a/backend/gn_module_import/tests/files/cd_file.csv b/backend/gn_module_import/tests/files/cd_file.csv index eb555f35..cec11406 100644 --- a/backend/gn_module_import/tests/files/cd_file.csv +++ b/backend/gn_module_import/tests/files/cd_file.csv @@ -2,10 +2,12 @@ date_min;nom_cite;observateurs;WKT;cd_nom;cd_hab;Erreur(s) attendue(s) 2017-01-01;Ablette;Toto;POINT(6.5 44.85);;;MISSING_VALUE (cd_nom) 2017-01-01;Ablette;Toto;POINT(6.5 44.85);123456789;;CD_NOM_NOT_FOUND 2017-01-01;Ablette;Toto;POINT(6.5 44.85);67111;;Valide -2017-01-01;Ablette;Toto;POINT(6.5 44.85);;123456789;MISSING_VALUE (cd_nom), CD_HAB_NOT_FOUND +2017-01-01;Ablette;Toto;POINT(6.5 44.85);;123456789;MISSING_VALUE (cd_nom) (CD_HAB_NOT_FOUND non levé car les lignes avec champs manquant ne sont pas traitées) 2017-01-01;Ablette;Toto;POINT(6.5 44.85);;629;MISSING_VALUE (cd_nom) -2017-01-01;Ablette;Toto;POINT(6.5 44.85);123456789;123456789;CD_NOM_NOT_FOUND, CD_HAB_NOT_FOUND +2017-01-01;Ablette;Toto;POINT(6.5 44.85);123456789;123456789;CD_NOM_NOT_FOUND & CD_HAB_NOT_FOUND 2017-01-01;Ablette;Toto;POINT(6.5 44.85);67111;123456789;CD_HAB_NOT_FOUND 2017-01-01;Ablette;Toto;POINT(6.5 44.85);123456789;629;CD_NOM_NOT_FOUND 2017-01-01;Ablette;Toto;POINT(6.5 44.85);67111;629;Valide -2017-01-01;Acanthodrilus;Toto;POINT(6.5 44.85);886847;629;CD_NOM_NOT_FOUND +2017-01-01;Acanthodrilus;Toto;POINT(6.5 44.85);886847;629;CD_NOM_NOT_FOUND (existe mais pas dans la liste) +2017-01-01;Ablette;Toto;POINT(6.5 44.85);aaa;629;INVALID_INTEGER (cd_nom) +2017-01-01;Ablette;Toto;POINT(6.5 44.85);67111;aaa;INVALID_INTEGER (cd_hab) diff --git a/backend/gn_module_import/tests/test_imports.py b/backend/gn_module_import/tests/test_imports.py index 7e658f6d..10421bcd 100644 --- 
a/backend/gn_module_import/tests/test_imports.py +++ b/backend/gn_module_import/tests/test_imports.py @@ -970,7 +970,9 @@ def test_import_cd_file(self, change_id_list_conf, prepared_import): { ("MISSING_VALUE", "cd_nom", frozenset([1, 4, 5])), ("CD_NOM_NOT_FOUND", "cd_nom", frozenset([2, 6, 8, 10])), - ("CD_HAB_NOT_FOUND", "cd_hab", frozenset([4, 6, 7])), + ("CD_HAB_NOT_FOUND", "cd_hab", frozenset([6, 7])), + ("INVALID_INTEGER", "cd_nom", frozenset([11])), + ("INVALID_INTEGER", "cd_hab", frozenset([12])), }, ) From c37900beddfee5ca7cd9ca4c3b1fd53c927e16e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89lie=20Bouttier?= Date: Wed, 27 Sep 2023 20:09:39 +0200 Subject: [PATCH 03/16] input url for dataset: clarifies object code --- backend/gn_module_import/models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/gn_module_import/models.py b/backend/gn_module_import/models.py index 4076d2b2..39505a25 100644 --- a/backend/gn_module_import/models.py +++ b/backend/gn_module_import/models.py @@ -47,6 +47,7 @@ def generate_input_url_for_dataset(self, dataset): return f"/import/process/upload?datasetId={dataset.id_dataset}" generate_input_url_for_dataset.label = "Importer des données" + generate_input_url_for_dataset.object_code = "IMPORT" def generate_module_url_for_source(self, source): id_import = re.search(r"^Import\(id=(?P\d+)\)$", source.name_source).group("id") From 01aa338cbb1a8670545f01c75c598cdd15e8c9b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89lie=20Bouttier?= Date: Thu, 28 Sep 2023 10:11:47 +0200 Subject: [PATCH 04/16] allow only one error per import/type/column Extends list of erroneous rows if an error already exists. 
--- .../checks/dataframe/__init__.py | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/backend/gn_module_import/checks/dataframe/__init__.py b/backend/gn_module_import/checks/dataframe/__init__.py index f75b1414..a3fe84db 100644 --- a/backend/gn_module_import/checks/dataframe/__init__.py +++ b/backend/gn_module_import/checks/dataframe/__init__.py @@ -2,7 +2,9 @@ from uuid import uuid4 from itertools import chain +from sqlalchemy import func from sqlalchemy.orm.exc import NoResultFound +from sqlalchemy.dialects.postgresql import insert as pg_insert from flask import current_app from geonature.utils.env import db @@ -118,11 +120,21 @@ def run_all_checks(imprt, fields: Dict[str, BibFields], df): ordered_invalid_rows = sorted(invalid_rows["line_no"]) column = generated_fields.get(error["column"], error["column"]) column = imprt.fieldmapping.get(column, column) - error = ImportUserError( - imprt=imprt, - type=error_type, - column=column, - rows=ordered_invalid_rows, - comment=error.get("comment"), + # If an error for same import, same column and of the same type already exists, + # we concat existing erroneous rows with current rows. 
+ stmt = pg_insert(ImportUserError).values( + { + "id_import": imprt.id_import, + "id_error": error_type.pk, + "column_error": column, + "id_rows": ordered_invalid_rows, + "comment": error.get("comment"), + } ) - db.session.add(error) + stmt = stmt.on_conflict_do_update( + constraint="t_user_errors_un", # unique (import, error_type, column) + set_={ + "id_rows": func.array_cat(ImportUserError.rows, stmt.excluded["id_rows"]), + }, + ) + db.session.execute(stmt) From efcf24655b6f66621c32d671b329f34e30b35a43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89lie=20Bouttier?= Date: Thu, 28 Sep 2023 10:11:57 +0200 Subject: [PATCH 05/16] do dataframe checks in batches --- backend/gn_module_import/conf_schema_toml.py | 1 + .../2896cf965dd6_unique_import_error.py | 31 +++++++++++ backend/gn_module_import/tasks.py | 52 +++++++++++++------ .../gn_module_import/tests/test_imports.py | 7 ++- backend/gn_module_import/utils.py | 10 +++- 5 files changed, 82 insertions(+), 19 deletions(-) create mode 100644 backend/gn_module_import/migrations/2896cf965dd6_unique_import_error.py diff --git a/backend/gn_module_import/conf_schema_toml.py b/backend/gn_module_import/conf_schema_toml.py index 3c0070da..37e3988c 100644 --- a/backend/gn_module_import/conf_schema_toml.py +++ b/backend/gn_module_import/conf_schema_toml.py @@ -155,3 +155,4 @@ class GnModuleSchemaConf(Schema): # are in the list. 
Otherwise throws an error ID_LIST_TAXA_RESTRICTION = fields.Integer(load_default=None) MODULE_URL = fields.String(load_default="/import") + DATAFRAME_BATCH_SIZE = fields.Integer(load_default=10000) diff --git a/backend/gn_module_import/migrations/2896cf965dd6_unique_import_error.py b/backend/gn_module_import/migrations/2896cf965dd6_unique_import_error.py new file mode 100644 index 00000000..b38d3104 --- /dev/null +++ b/backend/gn_module_import/migrations/2896cf965dd6_unique_import_error.py @@ -0,0 +1,31 @@ +"""unique import error + +Revision ID: 2896cf965dd6 +Revises: d6bf8eaf088c +Create Date: 2023-09-28 10:19:10.133530 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = "2896cf965dd6" +down_revision = "d6bf8eaf088c" +branch_labels = None +depends_on = None + + +def upgrade(): + op.create_unique_constraint( + schema="gn_imports", + table_name="t_user_errors", + columns=["id_import", "id_error", "column_error"], + constraint_name="t_user_errors_un", + ) + + +def downgrade(): + op.drop_constraint( + schema="gn_imports", table_name="t_user_errors", constraint_name="t_user_errors_un" + ) diff --git a/backend/gn_module_import/tasks.py b/backend/gn_module_import/tasks.py index 577e3f6d..b6918f99 100644 --- a/backend/gn_module_import/tasks.py +++ b/backend/gn_module_import/tasks.py @@ -1,4 +1,5 @@ from datetime import datetime +from math import ceil from flask import current_app from sqlalchemy import func, distinct @@ -16,6 +17,7 @@ from gn_module_import.checks.dataframe.geography import set_the_geom_column from gn_module_import.utils import ( load_import_data_in_dataframe, + mark_all_rows_as_invalid, update_import_data_from_dataframe, import_data_to_synthese, ) @@ -69,24 +71,42 @@ def do_import_checks(self, import_id): ) } - # Checks on dataframe - logger.info("Loading import data in dataframe…") - with start_sentry_child(op="check.df", description="load dataframe"): - df = 
load_import_data_in_dataframe(imprt, fields) + with start_sentry_child(op="check.df", description="mark_all"): + mark_all_rows_as_invalid(imprt) self.update_state(state="PROGRESS", meta={"progress": 0.1}) - logger.info("Running dataframe checks…") - with start_sentry_child(op="check.df", description="run all checks"): - run_all_checks(imprt, fields, df) - self.update_state(state="PROGRESS", meta={"progress": 0.2}) - logger.info("Completing geometric columns…") - with start_sentry_child(op="check.df", description="set geom column"): - set_the_geom_column(imprt, fields, df) - self.update_state(state="PROGRESS", meta={"progress": 0.3}) - logger.info("Updating import data from dataframe…") - with start_sentry_child(op="check.df", description="save dataframe"): - update_import_data_from_dataframe(imprt, fields, df) - self.update_state(state="PROGRESS", meta={"progress": 0.4}) + batch_size = current_app.config["IMPORT"]["DATAFRAME_BATCH_SIZE"] + batch_count = ceil(imprt.source_count / batch_size) + + def update_batch_progress(batch, step): + start = 0.1 + end = 0.4 + step_count = 4 + progress = start + ((batch + 1) / batch_count) * (step / step_count) * (end - start) + self.update_state(state="PROGRESS", meta={"progress": progress}) + + for batch in range(batch_count): + offset = batch * batch_size + batch_fields = fields.copy() + # Checks on dataframe + logger.info(f"[{batch+1}/{batch_count}] Loading import data in dataframe…") + with start_sentry_child(op="check.df", description="load dataframe"): + df = load_import_data_in_dataframe(imprt, batch_fields, offset, batch_size) + update_batch_progress(batch, 1) + logger.info(f"[{batch+1}/{batch_count}] Running dataframe checks…") + with start_sentry_child(op="check.df", description="run all checks"): + run_all_checks(imprt, batch_fields, df) + update_batch_progress(batch, 2) + logger.info(f"[{batch+1}/{batch_count}] Completing geometric columns…") + with start_sentry_child(op="check.df", description="set geom column"): + 
set_the_geom_column(imprt, batch_fields, df) + update_batch_progress(batch, 3) + logger.info(f"[{batch+1}/{batch_count}] Updating import data from dataframe…") + with start_sentry_child(op="check.df", description="save dataframe"): + update_import_data_from_dataframe(imprt, batch_fields, df) + update_batch_progress(batch, 4) + + fields = batch_fields # retrive fields added during dataframe checks fields.update({field.name_field: field for field in selected_fields}) # Checks in SQL diff --git a/backend/gn_module_import/tests/test_imports.py b/backend/gn_module_import/tests/test_imports.py index 7e658f6d..8034b705 100644 --- a/backend/gn_module_import/tests/test_imports.py +++ b/backend/gn_module_import/tests/test_imports.py @@ -124,6 +124,11 @@ def create_import(authors=[]): } +@pytest.fixture() +def small_batch(monkeypatch): + monkeypatch.setitem(current_app.config["IMPORT"], "DATAFRAME_BATCH_SIZE", 3) + + @pytest.fixture() def no_default_nomenclatures(monkeypatch): monkeypatch.setitem( @@ -247,7 +252,7 @@ def content_mapped_import(client, import_file_name, loaded_import): @pytest.fixture() -def prepared_import(client, content_mapped_import): +def prepared_import(client, content_mapped_import, small_batch): set_logged_user_cookie(client, content_mapped_import.authors[0]) r = client.post(url_for("import.prepare_import", import_id=content_mapped_import.id_import)) assert r.status_code == 200, r.data diff --git a/backend/gn_module_import/utils.py b/backend/gn_module_import/utils.py index ea6b0126..f727f7eb 100644 --- a/backend/gn_module_import/utils.py +++ b/backend/gn_module_import/utils.py @@ -219,7 +219,7 @@ def build_fieldmapping(imprt, columns): return fieldmapping, used_columns -def load_import_data_in_dataframe(imprt, fields): +def load_import_data_in_dataframe(imprt, fields, offset, limit): source_cols = [ "id_import", "line_no", @@ -230,6 +230,9 @@ def load_import_data_in_dataframe(imprt, fields): .filter( ImportSyntheseData.imprt == imprt, ) + 
.order_by(ImportSyntheseData.line_no) + .offset(offset) + .limit(limit) .all() ) df = pd.DataFrame.from_records( @@ -239,10 +242,13 @@ def load_import_data_in_dataframe(imprt, fields): return df -def update_import_data_from_dataframe(imprt, fields, df): +def mark_all_rows_as_invalid(imprt): db.session.query(ImportSyntheseData).filter_by(id_import=imprt.id_import).update( {"valid": False} ) + + +def update_import_data_from_dataframe(imprt, fields, df): if not len(df[df["valid"] == True]): return updated_cols = [ From a530fed6fc546e59d39c49588b878ba0e322baba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89lie=20Bouttier?= Date: Thu, 28 Sep 2023 10:29:47 +0200 Subject: [PATCH 06/16] add Debian 12 python version to test matrix --- .github/workflows/pytest.yml | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 9eaf3ec0..54e7c9cc 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -22,17 +22,22 @@ jobs: strategy: fail-fast: false matrix: + debian-version: [ '10', '11', '12' ] include: - - name: "Debian 10" + - debian-version: '10' python-version: "3.7" - postgres-version: 11 - postgis-version: 2.5 - - name: "Debian 11" - python-version: "3.9" - postgres-version: 13 - postgis-version: 3.2 + postgres-version: '11' + postgis-version: '2.5' + - debian-version: '11' + python-version: '3.9' + postgres-version: '13' + postgis-version: '3.2' + - debian-version: '12' + python-version: '3.11' + postgres-version: '15' + postgis-version: '3.3' - name: ${{ matrix.name }} + name: Debian ${{ matrix.debian-version }} services: postgres: @@ -118,7 +123,7 @@ jobs: GEONATURE_CONFIG_FILE: dependencies/GeoNature/config/test_config.toml GEONATURE_SETTINGS: gn_module_import.test_config - name: Upload coverage to Codecov - if: ${{ matrix.name == 'Debian 11' }} + if: ${{ matrix.debian-version == '12' }} uses: codecov/codecov-action@v2 with: flags: pytest From 
373e6c38119f3aea4713006d67720ac5166fb487 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89lie=20Bouttier?= Date: Thu, 28 Sep 2023 11:39:00 +0200 Subject: [PATCH 07/16] avoid decoding full file when looking columns --- backend/gn_module_import/routes/imports.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/backend/gn_module_import/routes/imports.py b/backend/gn_module_import/routes/imports.py index 3bb3e163..ec45c132 100644 --- a/backend/gn_module_import/routes/imports.py +++ b/backend/gn_module_import/routes/imports.py @@ -1,6 +1,5 @@ -from io import BytesIO import codecs -from io import StringIO +from io import BytesIO, StringIO, TextIOWrapper import csv import unicodedata @@ -215,7 +214,7 @@ def upload_file(scope, import_id): @blueprint.route("/imports//decode", methods=["POST"]) @permissions.check_cruved_scope("C", get_scope=True, module_code="IMPORT", object_code="IMPORT") def decode_file(scope, import_id): - imprt = TImports.query.options(undefer("source_file")).get_or_404(import_id) + imprt = TImports.query.get_or_404(import_id) if not imprt.has_instance_permission(scope): raise Forbidden if not imprt.dataset.active: @@ -257,15 +256,19 @@ def decode_file(scope, import_id): except ValueError: raise BadRequest(description="decode parameter must but an int") if decode: + csvfile = TextIOWrapper(BytesIO(imprt.source_file), encoding=imprt.encoding) + csvreader = csv.reader(csvfile, delimiter=imprt.separator) try: - csvfile = StringIO(imprt.source_file.decode(imprt.encoding)) + columns = next(csvreader) + while True: # read full file to ensure that no encoding errors occur + next(csvreader) except UnicodeError as e: raise BadRequest( description="Erreur d’encodage lors de la lecture du fichier source. " "Avez-vous sélectionné le bon encodage de votre fichier ?" 
) - csvreader = csv.reader(csvfile, delimiter=imprt.separator) - columns = next(csvreader) + except StopIteration: + pass duplicates = set([col for col in columns if columns.count(col) > 1]) if duplicates: raise BadRequest(f"Duplicates column names: {duplicates}") From 943863e1f3b54f371ac46e1c2895c19340cbb939 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89lie=20Bouttier?= Date: Thu, 28 Sep 2023 11:43:01 +0200 Subject: [PATCH 08/16] avoid loading full file when generating csv errors --- backend/gn_module_import/routes/imports.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/gn_module_import/routes/imports.py b/backend/gn_module_import/routes/imports.py index ec45c132..9749dacc 100644 --- a/backend/gn_module_import/routes/imports.py +++ b/backend/gn_module_import/routes/imports.py @@ -522,7 +522,7 @@ def get_import_invalid_rows_as_csv(scope, import_id): Export invalid data in CSV. """ - imprt = TImports.query.options(undefer("source_file")).get_or_404(import_id) + imprt = TImports.query.get_or_404(import_id) if not imprt.has_instance_permission(scope): raise Forbidden if not imprt.processed: @@ -533,7 +533,7 @@ def get_import_invalid_rows_as_csv(scope, import_id): @stream_with_context def generate_invalid_rows_csv(): - sourcefile = StringIO(imprt.source_file.decode(imprt.encoding)) + sourcefile = TextIOWrapper(BytesIO(imprt.source_file), encoding=imprt.encoding) destfile = StringIO() csvreader = csv.reader(sourcefile, delimiter=imprt.separator) csvwriter = csv.writer(destfile, dialect=csvreader.dialect, lineterminator="\n") From 7cd2eac10e84902c34ffb9e10a572e7681e78c9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89lie=20Bouttier?= Date: Thu, 28 Sep 2023 11:45:32 +0200 Subject: [PATCH 09/16] avoid loading full file when importing in table --- backend/gn_module_import/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/gn_module_import/utils.py b/backend/gn_module_import/utils.py index 
ea6b0126..183e454e 100644 --- a/backend/gn_module_import/utils.py +++ b/backend/gn_module_import/utils.py @@ -1,5 +1,5 @@ import os -from io import StringIO +from io import BytesIO, TextIOWrapper import csv import ast import json @@ -131,7 +131,7 @@ def insert_import_data_in_database(imprt): extra_columns = set(columns) - set(used_columns) - csvfile = StringIO(imprt.source_file.decode(imprt.encoding)) + csvfile = TextIOWrapper(BytesIO(imprt.source_file), encoding=imprt.encoding) csvreader = csv.DictReader(csvfile, fieldnames=columns, delimiter=imprt.separator) header = next(csvreader, None) # skip header for key, value in header.items(): # FIXME From ac76e2345e6bd0bacafbc632ae6d90817bc33f35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89lie=20Bouttier?= Date: Thu, 28 Sep 2023 14:40:09 +0200 Subject: [PATCH 10/16] linearize alembic history --- .../migrations/2896cf965dd6_unique_import_error.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/gn_module_import/migrations/2896cf965dd6_unique_import_error.py b/backend/gn_module_import/migrations/2896cf965dd6_unique_import_error.py index b38d3104..cecc2ed8 100644 --- a/backend/gn_module_import/migrations/2896cf965dd6_unique_import_error.py +++ b/backend/gn_module_import/migrations/2896cf965dd6_unique_import_error.py @@ -11,7 +11,7 @@ # revision identifiers, used by Alembic. revision = "2896cf965dd6" -down_revision = "d6bf8eaf088c" +down_revision = "ea67bf7b6888" branch_labels = None depends_on = None From 69557f804b9b89a680a1c88c295a6cfa1407d1a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89lie=20Bouttier?= Date: Thu, 28 Sep 2023 15:14:09 +0200 Subject: [PATCH 11/16] fix race condition Imports are created at same time in fixture, which leads to unreliable sorting order. 
--- backend/gn_module_import/tests/test_imports.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/backend/gn_module_import/tests/test_imports.py b/backend/gn_module_import/tests/test_imports.py index aad5ec9d..d32f193c 100644 --- a/backend/gn_module_import/tests/test_imports.py +++ b/backend/gn_module_import/tests/test_imports.py @@ -400,11 +400,9 @@ def test_search_import(self, users, imports, uploaded_import): def test_order_import(self, users, imports, uploaded_import): set_logged_user_cookie(self.client, users["user"]) - r_des = self.client.get(url_for("import.get_import_list")) + r_des = self.client.get(url_for("import.get_import_list") + "?sort=id_import") assert r_des.status_code == 200, r_des.data - r_asc = self.client.get( - url_for("import.get_import_list") + "?sort=date_create_import&sort_dir=asc" - ) + r_asc = self.client.get(url_for("import.get_import_list") + "?sort=id_import&sort_dir=asc") assert r_asc.status_code == 200, r_asc.data import_ids_des = [imprt["id_import"] for imprt in r_des.get_json()["imports"]] import_ids_asc = [imprt["id_import"] for imprt in r_asc.get_json()["imports"]] From 4d0b930c51ec44c27f12c9a1429b007d9b41bcf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89lie=20Bouttier?= Date: Thu, 28 Sep 2023 15:33:47 +0200 Subject: [PATCH 12/16] disable sql check of mandatory fields Mandatory fields are already checked during dataframe checks. 
--- backend/gn_module_import/checks/sql/__init__.py | 1 + backend/gn_module_import/tasks.py | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/backend/gn_module_import/checks/sql/__init__.py b/backend/gn_module_import/checks/sql/__init__.py index 980cdc20..ba5c0acc 100644 --- a/backend/gn_module_import/checks/sql/__init__.py +++ b/backend/gn_module_import/checks/sql/__init__.py @@ -378,6 +378,7 @@ def set_uuid(imprt, fields): db.session.execute(stmt) +# Currently not used as done during dataframe checks def check_mandatory_fields(imprt, fields): for field in fields.values(): if not field.mandatory or not field.synthese_field: diff --git a/backend/gn_module_import/tasks.py b/backend/gn_module_import/tasks.py index b4295c29..b4ef6d40 100644 --- a/backend/gn_module_import/tasks.py +++ b/backend/gn_module_import/tasks.py @@ -29,7 +29,6 @@ check_cd_hab, set_altitudes, set_uuid, - check_mandatory_fields, check_duplicates_source_pk, check_dates, check_altitudes, @@ -121,7 +120,6 @@ def update_batch_progress(batch, step): check_dates, check_depths, check_digital_proof_urls, - check_mandatory_fields, check_is_valid_geography, check_geography_outside, ] From cce6d471b48d809093eb2d13b1bc97e43a57b6f9 Mon Sep 17 00:00:00 2001 From: Camille Monchicourt Date: Thu, 28 Sep 2023 16:14:37 +0200 Subject: [PATCH 13/16] Changelog 2.2.3 --- docs/CHANGELOG.rst | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/docs/CHANGELOG.rst b/docs/CHANGELOG.rst index b77d9dc2..c9971b71 100644 --- a/docs/CHANGELOG.rst +++ b/docs/CHANGELOG.rst @@ -2,6 +2,24 @@ CHANGELOG ========= +2.2.3 (2023-09-28) +------------------ + +Nécessite la version 2.13.2 (ou plus) de GeoNature. 
+ +**🚀 Nouveautés** + +* Amélioration des performances de la vérification des cd_nom des observations importées (#424) +* Amélioration des performances du chargement des données pour leur contrôle (#484) +* Amélioration des performances de l'analyse des colonnes du fichier source (#486) +* Amélioration des tests automatisés + +**🐛 Corrections** + +* Correction des permissions de la liste des JDD auxquels un utilisateur peut associer un import (#481) +* Correction du bouton d'import dans un JDD depuis sa fiche dans le module Métadonnées (#483) + + 2.2.2 (2023-09-19) ------------------ From 94e5efd12e7d79e45cd2cca60047bea936c73500 Mon Sep 17 00:00:00 2001 From: Camille Monchicourt Date: Thu, 28 Sep 2023 16:15:11 +0200 Subject: [PATCH 14/16] Bump VERSION - 2.2.3 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index b1b25a5f..58594069 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.2.2 +2.2.3 From e8b8a515d1c1db534ebe68f0e92bea24a4c45535 Mon Sep 17 00:00:00 2001 From: Camille Monchicourt Date: Thu, 28 Sep 2023 17:07:57 +0200 Subject: [PATCH 15/16] Bump GN minimal version to 2.13.2 --- requirements.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.in b/requirements.in index 68fb702b..c68566f0 100644 --- a/requirements.in +++ b/requirements.in @@ -5,4 +5,4 @@ pyproj;python_version>="3.10" chardet jsonschema utils-flask-sqlalchemy>=0.3.0,<1.0.0 -geonature>=2.13.0 +geonature>=2.13.2 From 3277e11d7ea8bd6c8ea5728886bc46dfd64f4229 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89lie=20Bouttier?= Date: Thu, 28 Sep 2023 17:22:32 +0200 Subject: [PATCH 16/16] =?UTF-8?q?bump=20GeoNature=20submodule=202.13.1=20?= =?UTF-8?q?=E2=86=92=202.13.2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dependencies/GeoNature | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dependencies/GeoNature b/dependencies/GeoNature index a978d6aa..0a3764b2 160000 
--- a/dependencies/GeoNature +++ b/dependencies/GeoNature @@ -1 +1 @@ -Subproject commit a978d6aa6e2d85b51aa644f0e2aee74e67b1cb6d +Subproject commit 0a3764b29d83fb400b5bebb201bb94e353a915be