Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Release 2.9.0 [DEV] #79

Open
wants to merge 20 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
7da9837
SXDEDPCXZIC-241_DATAVIC-622 / DELWP harvest restricted records
agmorev Nov 29, 2023
c898e82
Merge pull request #56 from dpc-sdp/SXDEDPCXZIC-228
joelwigley Feb 2, 2024
c34dd51
Merge pull request #60 from dpc-sdp/SXDEDPCXZIC-280_DATAVIC-665
joelwigley Apr 15, 2024
4d19bf4
Merge remote-tracking branch 'origin/release-2.6.0' into uat
mutantsan Apr 18, 2024
394f428
Merge pull request #62 from dpc-sdp/SXDEDPCXZIC-302
joelwigley May 17, 2024
fce3e1a
Merge pull request #63 from dpc-sdp/SXDEDPCXZIC-302
joelwigley May 30, 2024
cea5599
Merge pull request #53 from dpc-sdp/SXDEDPCXZIC-241
alexmorev May 30, 2024
fa98db2
SXDEDPCXZIC-308_DATAVIC-622
May 30, 2024
1c0fdcd
Merge pull request #65 from dpc-sdp/SXDEDPCXZIC-308
Engerrs May 30, 2024
ff34999
Merge pull request #66 from dpc-sdp/release-2.6.0
iaroslav13 May 31, 2024
96c93d3
Merge pull request #67 from dpc-sdp/uat
joelwigley Jun 12, 2024
4029b53
SXDEDPCXZIC-321_DATAVIC-699 / set dcat harvester default visibility
agmorev Jun 20, 2024
d9256e6
SXDEDPCXZIC-321_DATAVIC-699 / fix the logic of default value
agmorev Jun 20, 2024
16ed3fd
Merge pull request #71 from dpc-sdp/SXDEDPCXZIC-321
joelwigley Jun 24, 2024
9397321
SXDEDPCXZIC-322_DATAVIC-703 / prevent records being updated unnecessa…
alexmorev Jun 24, 2024
ac1a8e0
SXDEDPCXZIC-340 / fix harvester error
alexmorev Jul 11, 2024
a770a14
Merge pull request #75 from dpc-sdp/SXDEDPCXZIC-340
joelwigley Jul 17, 2024
de977f4
Merge pull request #76 from dpc-sdp/uat
iaroslav13 Jul 31, 2024
e9a0587
Merge pull request #73 from dpc-sdp/SXDEDPCXZIC-322
joelwigley Oct 16, 2024
74d5d38
Merge pull request #77 from dpc-sdp/master
joelwigley Oct 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions ckanext/datavic_harvester/harvesters/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,11 @@ def fetch_stage(self, harvest_object: HarvestObject) -> bool:
return True

def _delete_package(self, package_id: str, guid: str):
tk.get_action("package_delete")(self._make_context(), {"id": package_id})
log.info(f"Deleted package {package_id} with guid {guid}")
try:
tk.get_action("package_delete")(self._make_context(), {"id": package_id})
log.info(f"Deleted package {package_id} with guid {guid}")
except tk.ObjectNotFound:
log.error(f"Package {package_id} not found")

def _make_context(self) -> dict[str, Any]:
return {
Expand Down
37 changes: 32 additions & 5 deletions ckanext/datavic_harvester/harvesters/dcat_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,25 @@ def gather_stage(self, harvest_job):

def import_stage(self, harvest_object):
    """Import one harvested DCAT record, skipping datasets that are unchanged.

    The remote ``modified`` date (normalised to a bare ISO date) is compared
    with the ``date_modified_data_asset`` field of the dataset previously
    harvested under the same guid. If both are present and equal, the record
    is considered up to date and ``False`` is returned so the harvester skips
    it; otherwise the parent class performs the actual import.
    """
    self._set_config(harvest_object.source.config)

    _, remote_dict = self._get_package_dict(harvest_object)
    remote_modified = remote_dict.get("modified")
    local_pkg = self._get_existing_dataset(harvest_object.guid)

    if remote_modified and local_pkg:
        # Normalise e.g. "2024-01-02T03:04:05" -> "2024-01-02" so only the
        # date portion takes part in the comparison.
        iso_value = helpers.convert_date_to_isoformat(
            remote_modified, "modified", remote_dict["title"]
        )
        remote_date = iso_value.lower().split("t")[0]
        local_date = local_pkg["date_modified_data_asset"]

        if local_date and local_date == remote_date:
            log.info(
                f"Dataset with id {local_pkg['id']} wasn't modified "
                "from the last harvest. Skipping this dataset..."
            )
            return False

    return super().import_stage(harvest_object)

def _get_package_dict(
Expand All @@ -43,7 +62,7 @@ def _get_package_dict(
conversions of the data"""

dcat_dict: dict[str, Any] = json.loads(harvest_object.content)
pkg_dict = converters.dcat_to_ckan(dcat_dict)
pkg_dict = converters.dcat_to_ckan(dcat_dict)

soup: BeautifulSoup = BeautifulSoup(pkg_dict["notes"], "html.parser")

Expand Down Expand Up @@ -184,11 +203,17 @@ def _set_required_fields_defaults(
if not self._get_extra(pkg_dict, "protective_marking"):
pkg_dict["protective_marking"] = "official"

if not self._get_extra(pkg_dict, "organization_visibility"):
pkg_dict["organization_visibility"] = "current"
if not self._get_extra(pkg_dict, "organization_visibility") \
and "default_visibility" in self.config:
pkg_dict["organization_visibility"] = self.config["default_visibility"][
"organization_visibility"
]
else:
pkg_dict["organization_visibility"] = self._get_extra(
pkg_dict, "organization_visibility"
) or "current"

if not self._get_extra(pkg_dict, "workflow_status"):
pkg_dict["workflow_status"] = "draft"
pkg_dict["workflow_status"] = "published"

issued: Optional[str] = dcat_dict.get("issued")
if issued and not self._get_extra(pkg_dict, "date_created_data_asset"):
Expand All @@ -212,6 +237,8 @@ def _set_required_fields_defaults(

pkg_dict["tag_string"] = dcat_dict.get("keyword", [])

pkg_dict.setdefault("update_frequency", "unknown")

def _get_existing_dataset(self, guid: str) -> Optional[dict[str, Any]]:
"""Return a package with specific guid extra if exists"""

Expand Down
105 changes: 92 additions & 13 deletions ckanext/datavic_harvester/harvesters/delwp.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,8 @@ def gather_stage(self, harvest_job):
def _get_guids_to_package_ids(self, source_id: str) -> dict[str, str]:
query = (
model.Session.query(HarvestObject.guid, HarvestObject.package_id)
.filter(HarvestObject.current == True)
# .filter(HarvestObject.current == True) # I've commented it, because
# otherwise we were getting duplicates.
.filter(HarvestObject.harvest_source_id == source_id)
)

Expand Down Expand Up @@ -274,6 +275,19 @@ def import_stage(self, harvest_object: HarvestObject) -> bool | str:

pkg_dict = self._get_pkg_dict(harvest_object)

if not pkg_dict["notes"] or not pkg_dict["owner_org"]:
log.info(
f"Description or organization field is missing for object {harvest_object.id}, skipping..."
)
return False

# Remove restricted Datasets
if pkg_dict["private"]:
log.info(
f"Dataset is Restricted for object {harvest_object.id}, skipping..."
)
return False

if status not in ["new", "change"]:
return True

Expand Down Expand Up @@ -350,15 +364,21 @@ def _get_pkg_dict(self, harvest_object):
else ""
)

access_notes = """
Aerial imagery and elevation datasets\n
You can access high-resolution aerial imagery and elevation (LiDAR point cloud) datasets by contacting a business that holds a commercial license.\n
We have two types of commercial licensing:\n
Data Service Providers (DSPs) provide access to the source imagery or elevation data.\n
Value Added Retailers (VARs ) use the imagery and elevation data to create new products and services. This includes advisory services and new knowledge products.
"""

pkg_dict = {}

pkg_dict["personal_information"] = "no"
pkg_dict["protective_marking"] = "official"
pkg_dict["access"] = "yes"
pkg_dict["organization_visibility"] = "all"
pkg_dict["workflow_status"] = "published"
pkg_dict["license_id"] = self.config.get("license_id", "cc-by")
pkg_dict["private"] = self._is_pkg_private(metashare_dict)
pkg_dict["title"] = metashare_dict.get("title")
pkg_dict["notes"] = metashare_dict.get("abstract", "")
pkg_dict["tags"] = helpers.get_tags(remote_topiccat) if remote_topiccat else []
Expand All @@ -373,9 +393,6 @@ def _get_pkg_dict(self, harvest_object):
if not pkg_dict.get("name"):
pkg_dict["name"] = self._get_package_name(harvest_object, pkg_dict["title"])

if full_metadata_url:
pkg_dict["full_metadata_url"] = full_metadata_url

if uuid:
pkg_dict["primary_purpose_of_collection"] = uuid

Expand Down Expand Up @@ -409,6 +426,31 @@ def _get_pkg_dict(self, harvest_object):

pkg_dict["resources"] = self._fetch_resources(metashare_dict)

pkg_dict["private"] = self._is_pkg_private(
metashare_dict,
pkg_dict["resources"]
)

pkg_dict["license_id"] = self.config.get("license_id", "cc-by")

if pkg_dict["private"]:
pkg_dict["license_id"] = "other-closed"

if self._is_delwp_raster_data(pkg_dict["resources"]):
pkg_dict["full_metadata_url"] = f"https://metashare.maps.vic.gov.au/geonetwork/srv/api/records/{uuid}/formatters/cip-pdf?root=export&output=pdf"
pkg_dict["access_description"] = access_notes
elif full_metadata_url:
pkg_dict["full_metadata_url"] = full_metadata_url

for key, value in [
("harvest_source_id", harvest_object.source.id),
("harvest_source_title", harvest_object.source.title),
("harvest_source_type", harvest_object.source.type),
("delwp_restricted", pkg_dict["private"])
]:
pkg_dict.setdefault("extras", [])
pkg_dict["extras"].append({"key": key, "value": value})

return pkg_dict

def _create_custom_package_create_schema(self) -> dict[str, Any]:
Expand All @@ -419,13 +461,50 @@ def _create_custom_package_create_schema(self) -> dict[str, Any]:

return package_schema

def _is_pkg_private(self, remote_dict: dict[str, Any]) -> bool:
"""Check if the dataset should be private by `resclassification` field
value"""
return remote_dict.get("resclassification") in (
"limitedDistribution",
"restricted",
)
def _is_delwp_vector_data(self, resources: list[dict[str, Any]]) -> bool:
for res in resources:
if res["format"].lower() in [
"dwg",
"dxf",
"gdb",
"shp",
"mif",
"tab",
"extended tab",
"mapinfo",
]:
return True

return False

def _is_delwp_raster_data(self, resources: list[dict[str, Any]]) -> bool:
for res in resources:
if res["format"].lower() in [
"ecw",
"geotiff",
"jpeg",
"jp2",
"jpeg 2000",
"tiff",
"lass",
"xyz",
]:
return True

return False

def _is_pkg_private(
    self,
    remote_dict: dict[str, Any],
    resources: list[dict[str, Any]],
) -> bool:
    """Check if the dataset should be private.

    A dataset is public only when it is DELWP vector data AND both the
    metadata and resource classifications are "unclassified"; everything
    else (raster data, missing or restricted classifications) is private.
    """
    is_public = (
        self._is_delwp_vector_data(resources)
        and remote_dict.get("mdclassification") == "unclassified"
        and remote_dict.get("resclassification") == "unclassified"
    )
    return not is_public

def _get_organisation(
self,
Expand Down