From 7da983752e6b7301a269c64dde62968287d1bd68 Mon Sep 17 00:00:00 2001 From: agmorev Date: Wed, 29 Nov 2023 02:30:23 +0200 Subject: [PATCH 1/6] SXDEDPCXZIC-241_DATAVIC-622 / DELWP harvest restricted records --- ckanext/datavic_harvester/harvesters/delwp.py | 91 +++++++++++++++---- 1 file changed, 75 insertions(+), 16 deletions(-) diff --git a/ckanext/datavic_harvester/harvesters/delwp.py b/ckanext/datavic_harvester/harvesters/delwp.py index 26cf548..869624f 100644 --- a/ckanext/datavic_harvester/harvesters/delwp.py +++ b/ckanext/datavic_harvester/harvesters/delwp.py @@ -275,6 +275,12 @@ def import_stage(self, harvest_object: HarvestObject) -> bool | str: pkg_dict = self._get_pkg_dict(harvest_object) + if not pkg_dict["notes"] or not pkg_dict["owner_org"]: + log.info( + f"Description or organization field is missing for object {harvest_object.id}, skipping..." + ) + return False + if status not in ["new", "change"]: return True @@ -350,6 +356,14 @@ def _get_pkg_dict(self, harvest_object): else "" ) + access_notes = """ + Aerial imagery and elevation datasets\n + You can access high-resolution aerial imagery and elevation (LiDAR point cloud) datasets by contacting a business that holds a commercial license.\n + We have two types of commercial licensing:\n + Data Service Providers (DSPs) provide access to the source imagery or elevation data.\n + Value Added Retailers (VARs ) use the imagery and elevation data to create new products and services. This includes advisory services and new knowledge products. + """ + pkg_dict = {} pkg_dict["personal_information"] = "no" @@ -357,11 +371,6 @@ def _get_pkg_dict(self, harvest_object): pkg_dict["access"] = "yes" pkg_dict["organization_visibility"] = "all" pkg_dict["workflow_status"] = "published" - pkg_dict["license_id"] = self.config.get("license_id", "cc-by") - pkg_dict["private"] = self._is_pkg_private( - metashare_dict - ) - pkg_dict["title"] = metashare_dict.get("title") pkg_dict["notes"] = metashare_dict.get("abstract", "") pkg_dict["tags"] = helpers.get_tags(metashare_dict.get("topiccat")) @@ -369,16 +378,13 @@ def _get_pkg_dict(self, harvest_object): pkg_dict["extract"] = f"{pkg_dict['notes'].split('.')[0]}..." pkg_dict["owner_org"] = self._get_organisation( self.config.get("organisation_mapping"), - metashare_dict.get("resowner").split(";")[0], + metashare_dict.get("resowner", "").split(";")[0], harvest_object, ) if not pkg_dict.get("name"): pkg_dict["name"] = self._get_package_name(harvest_object, pkg_dict["title"]) - if full_metadata_url: - pkg_dict["full_metadata_url"] = full_metadata_url - if uuid: pkg_dict["primary_purpose_of_collection"] = uuid @@ -412,6 +418,22 @@ def _get_pkg_dict(self, harvest_object): pkg_dict["resources"] = self._fetch_resources(metashare_dict) + pkg_dict["private"] = self._is_pkg_private( + metashare_dict, + pkg_dict["resources"] + ) + + pkg_dict["license_id"] = self.config.get("license_id", "cc-by") + + if pkg_dict["private"]: + pkg_dict["license_id"] = "other-closed" + + if self._is_delwp_raster_data(pkg_dict["resources"]): + pkg_dict["full_metadata_url"] = f"https://metashare.maps.vic.gov.au/geonetwork/srv/api/records/{uuid}/formatters/cip-pdf?root=export&output=pdf" + pkg_dict["access_description"] = access_notes + elif full_metadata_url: + pkg_dict["full_metadata_url"] = full_metadata_url + for key, value in [ ("harvest_source_id", harvest_object.source.id), ("harvest_source_title", harvest_object.source.title), @@ -431,13 +453,50 @@ def _create_custom_package_create_schema(self) -> dict[str, Any]: return package_schema - def _is_pkg_private(self, remote_dict: dict[str, Any]) -> bool: - """Check if the dataset should be private by `resclassification` field - value""" - return remote_dict.get("resclassification") in ( - "limitedDistribution", - "restricted", - ) + def _is_delwp_vector_data(self, resources: list[dict[str, Any]]) -> bool: + for res in resources: + if res["format"].lower() in [ + "dwg", + "dxf", + "gdb", + "shp", + "mif", + "tab", + "extended tab", + "mapinfo", + ]: + return True + + return False + + def _is_delwp_raster_data(self, resources: list[dict[str, Any]]) -> bool: + for res in resources: + if res["format"].lower() in [ + "ecw", + "geotiff", + "jpeg", + "jp2", + "jpeg 2000", + "tiff", + "lass", + "xyz", + ]: + return True + + return False + + def _is_pkg_private( + self, + remote_dict: dict[str, Any], + resources: list[dict[str, Any]] + ) -> bool: + """Check if the dataset should be private""" + if (self._is_delwp_vector_data(resources) and + remote_dict.get("mdclassification") == "unclassified" and + remote_dict.get("resclassification") == "unclassified"): + return False + + return True def _get_organisation( self, From fa98db27c4472cf11f8287dbd232c81e92488460 Mon Sep 17 00:00:00 2001 From: Yan Rudenko Date: Thu, 30 May 2024 13:32:37 +0200 Subject: [PATCH 2/6] SXDEDPCXZIC-308_DATAVIC-622 --- ckanext/datavic_harvester/harvesters/delwp.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ckanext/datavic_harvester/harvesters/delwp.py b/ckanext/datavic_harvester/harvesters/delwp.py index 869624f..e4d0247 100644 --- a/ckanext/datavic_harvester/harvesters/delwp.py +++ b/ckanext/datavic_harvester/harvesters/delwp.py @@ -281,6 +281,13 @@ def import_stage(self, harvest_object: HarvestObject) -> bool | str: ) return False + # Remove restricted Datasets + if pkg_dict["private"]: + log.info( + f"Dataset is Restricted for object {harvest_object.id}, skipping..." + ) + return False + if status not in ["new", "change"]: return True From 4029b53bcb4fc665e192dbbfb6370ffbed496dfc Mon Sep 17 00:00:00 2001 From: agmorev Date: Thu, 20 Jun 2024 23:46:45 +0300 Subject: [PATCH 3/6] SXDEDPCXZIC-321_DATAVIC-699 / set dcat harvester default visibility --- ckanext/datavic_harvester/harvesters/dcat_json.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ckanext/datavic_harvester/harvesters/dcat_json.py b/ckanext/datavic_harvester/harvesters/dcat_json.py index 9ee6e33..6ffebe6 100644 --- a/ckanext/datavic_harvester/harvesters/dcat_json.py +++ b/ckanext/datavic_harvester/harvesters/dcat_json.py @@ -184,7 +184,12 @@ def _set_required_fields_defaults( if not self._get_extra(pkg_dict, "protective_marking"): pkg_dict["protective_marking"] = "official" - if not self._get_extra(pkg_dict, "organization_visibility"): + if not self._get_extra(pkg_dict, "organization_visibility") \ + and "default_visibility" in self.config: + pkg_dict["organization_visibility"] = self.config["default_visibility"][ + "organization_visibility" + ] + else: pkg_dict["organization_visibility"] = "current" pkg_dict["workflow_status"] = "published" From d9256e69b5ab5dcfd88e63d8d57afa6f81cac0af Mon Sep 17 00:00:00 2001 From: agmorev Date: Fri, 21 Jun 2024 00:38:15 +0300 Subject: [PATCH 4/6] SXDEDPCXZIC-321_DATAVIC-699 / fix the logic of default value --- ckanext/datavic_harvester/harvesters/dcat_json.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ckanext/datavic_harvester/harvesters/dcat_json.py b/ckanext/datavic_harvester/harvesters/dcat_json.py index 6ffebe6..54627a6 100644 --- a/ckanext/datavic_harvester/harvesters/dcat_json.py +++ b/ckanext/datavic_harvester/harvesters/dcat_json.py @@ -190,7 +190,9 @@ def _set_required_fields_defaults( "organization_visibility" ] else: - pkg_dict["organization_visibility"] = "current" + pkg_dict["organization_visibility"] = self._get_extra( + pkg_dict, "organization_visibility" + ) or "current" pkg_dict["workflow_status"] = "published" From 9397321ce7a4c7486ee11b2fd3697f0b18ba6644 Mon Sep 17 00:00:00 2001 From: alexmorev Date: Mon, 24 Jun 2024 22:58:49 +0300 Subject: [PATCH 5/6] SXDEDPCXZIC-322_DATAVIC-703 / prevent records being updated unnecessarily --- .../datavic_harvester/harvesters/dcat_json.py | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/ckanext/datavic_harvester/harvesters/dcat_json.py b/ckanext/datavic_harvester/harvesters/dcat_json.py index 9ee6e33..d873d1f 100644 --- a/ckanext/datavic_harvester/harvesters/dcat_json.py +++ b/ckanext/datavic_harvester/harvesters/dcat_json.py @@ -34,6 +34,25 @@ def gather_stage(self, harvest_job): def import_stage(self, harvest_object): self._set_config(harvest_object.source.config) + + package_dict, dcat_dict = self._get_package_dict(harvest_object) + dcat_modified = dcat_dict.get("modified") + existing_dataset = self._get_existing_dataset(harvest_object.guid) + + if dcat_modified and existing_dataset: + dcat_modified = helpers.convert_date_to_isoformat( + dcat_modified, "modified", dcat_dict["title"] + ).lower().split("t")[0] + + pkg_modified = existing_dataset['date_modified_data_asset'] + + if pkg_modified and pkg_modified == dcat_modified: + log.info( + f"Dataset with id {existing_dataset['id']} wasn't modified " + "from the last harvest. Skipping this dataset..." + ) + return False + return super().import_stage(harvest_object) def _get_package_dict( @@ -43,7 +62,7 @@ def _get_package_dict( conversions of the data""" dcat_dict: dict[str, Any] = json.loads(harvest_object.content) - pkg_dict = converters.dcat_to_ckan(dcat_dict) + pkg_dict = converters.dcat_to_ckan(dcat_dict) soup: BeautifulSoup = BeautifulSoup(pkg_dict["notes"], "html.parser") From ac1a8e0fec2bd14e7b27b1ab396541c7bf07936f Mon Sep 17 00:00:00 2001 From: alexmorev Date: Thu, 11 Jul 2024 20:49:51 +0300 Subject: [PATCH 6/6] SXDEDPCXZIC-340 / fix harvester error --- ckanext/datavic_harvester/harvesters/base.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ckanext/datavic_harvester/harvesters/base.py b/ckanext/datavic_harvester/harvesters/base.py index 9b15cc4..5b7cfce 100644 --- a/ckanext/datavic_harvester/harvesters/base.py +++ b/ckanext/datavic_harvester/harvesters/base.py @@ -128,8 +128,11 @@ def fetch_stage(self, harvest_object: HarvestObject) -> bool: return True def _delete_package(self, package_id: str, guid: str): - tk.get_action("package_delete")(self._make_context(), {"id": package_id}) - log.info(f"Deleted package {package_id} with guid {guid}") + try: + tk.get_action("package_delete")(self._make_context(), {"id": package_id}) + log.info(f"Deleted package {package_id} with guid {guid}") + except tk.ObjectNotFound: + log.error(f"Package {package_id} not found") def _make_context(self) -> dict[str, Any]: return {