# am-do-2-atom-do.py
#
# Update pre release 2.7 AtoM digital objects with information from
# Archivematica AIP METS files.
import os
import shutil
import sys
import pymysql.cursors
import requests
import metsrw
from datetime import datetime
# Connection parameters. Fill these in before running the script.
STORAGE_SERVICE_URL = ''
STORAGE_SERVICE_USER = ''
STORAGE_SERVICE_API_KEY = ''
ATOM_MYSQL_USER = ''
ATOM_MYSQL_PASSWORD = ''
ATOM_MYSQL_DATABASE = ''

# Open the AtoM MySQL connection up front; the script cannot do anything
# useful without it, so any failure here ends the run.
try:
    mysqlConnection = pymysql.connect(
        host="localhost",
        user=ATOM_MYSQL_USER,
        password=ATOM_MYSQL_PASSWORD,
        db=ATOM_MYSQL_DATABASE,
        charset="utf8mb4",
        # Rows come back as dicts keyed by column name.
        cursorclass=pymysql.cursors.DictCursor,
    )
    mysqlCursor = mysqlConnection.cursor()
    print("Connected to AtoM MySQL database.")
except Exception as e:
    print(e)
    sys.exit("Unable to connect to the AtoM MySQL database. Please check your connection parameters.")

# Whether to remove the MySQL working tables and the METS download directory
# when the script finishes. Off by default: both are handy for auditing after
# the run and are easy to remove by hand.
DELETE_TEMP_FILES = False

# Directory that downloaded METS files are written to; created on first run.
METS_DIR = "DIP_METS/"
if not os.path.exists(METS_DIR):
    os.makedirs(METS_DIR)

# Crude global tally of recoverable errors. Runs that end early through
# sys.exit() are not reflected in it.
ERROR_COUNT = 0

# Probe the Storage Service with the configured credentials before doing any
# work; anything other than an OK response ends the run.
try:
    request_url = "".join((
        STORAGE_SERVICE_URL,
        "?username=",
        STORAGE_SERVICE_USER,
        "&api_key=",
        STORAGE_SERVICE_API_KEY,
    ))
    response = requests.get(request_url)
    if response.status_code == requests.codes.ok:
        print("Connected to Archivematica Storage Service.")
    else:
        sys.exit("Unable to connect to Archivematica Storage Service. Please check your connection parameters.")
except Exception as e:
    print(e)
    sys.exit("Unable to connect to Archivematica Storage Service. Please check your connection parameters.")

# Start from empty working tables: drop whatever a previous run left behind,
# then recreate `dip_files` (one row per legacy digital object) and
# `premis_events`.
try:
    sql = "DROP TABLE IF EXISTS dip_files, premis_events;"
    mysqlCursor.execute(sql)
    mysqlConnection.commit()
    for sql in (
        "CREATE TABLE IF NOT EXISTS dip_files(object_id INTEGER PRIMARY KEY, object_uuid TEXT, aip_uuid TEXT, originalFileIngestedAt TEXT, relativePathWithinAip TEXT, aipName TEXT, originalFileName TEXT, originalFileSize TEXT, formatName TEXT, formatVersion TEXT, formatRegistryName TEXT, formatRegistryKey TEXT, preservationCopyNormalizedAt TEXT, preservationCopyFileName TEXT, preservationCopyFileSize TEXT, parsed BOOLEAN);",
        "CREATE TABLE IF NOT EXISTS premis_events(id INTEGER PRIMARY KEY, object_id INTEGER, value TEXT);",
    ):
        mysqlCursor.execute(sql)
    mysqlConnection.commit()
except Exception as e:
    print(e)
    sys.exit("Unable to create working table. Check permissions for MySQL user.")
def main():
    '''
    Drive the whole migration of pre release 2.7 AtoM digital objects.

    Steps, in order: report how many digital objects exist and how many are
    'legacy', copy the legacy identifiers into the `dip_files` working table,
    parse METS values one AIP at a time until no unparsed row is left, write
    the new properties back to AtoM, and finally either clean up or keep the
    temporary files depending on DELETE_TEMP_FILES. Start/end times, duration
    and the error count are printed along the way.
    '''
    # Total number of digital objects in AtoM.
    mysqlCursor.execute("SELECT COUNT(*) FROM digital_object WHERE object_id IS NOT NULL;")
    total_row = mysqlCursor.fetchone()

    # 'Legacy' digital objects are those whose objectUUID property has no scope.
    mysqlCursor.execute("SELECT * FROM property WHERE name='objectUUID' AND scope is NULL;")
    legacy_dip_files = mysqlCursor.fetchall()
    legacy_total = len(legacy_dip_files)

    print("Total number of digital objects in AtoM: " + str(total_row["COUNT(*)"]))
    print("Total number of 'legacy` digital objects to be updated: " + str(legacy_total))

    started_at = datetime.now().replace(microsecond=0)
    print("Script started at: " + started_at.strftime("%Y-%m-%d %H:%M:%S"))

    print("Identifying legacy digital object records in AtoM...")
    flush_legacy_digital_file_properties(legacy_dip_files)

    print("Parsing digital object properties from Archivematica METS files...")
    unparsed_sql = "SELECT * FROM dip_files WHERE parsed = %s;"
    try:
        # Pick the first working-table row that has not been parsed yet.
        mysqlCursor.execute(unparsed_sql, False)
        pending_row = mysqlCursor.fetchone()
    except Exception as e:
        print(e)
        sys.exit("Unable to query the working table.")

    # Each pass handles every row belonging to the selected row's AIP, then
    # looks for the next row that is still unparsed.
    while pending_row:
        parse_mets_values(pending_row["aip_uuid"])
        mysqlCursor.execute(unparsed_sql, False)
        pending_row = mysqlCursor.fetchone()

    print("Updating digital object properties in AtoM MySQL...")
    update_digital_file_properties()

    if not DELETE_TEMP_FILES:
        print("Keeping temporary files. See `dip_files` table in the MySQL database and the " + METS_DIR + " directory for downloaded METS files.")
    else:
        print("Cleaning up temporary files...")
        delete_temporary_files()

    finished_at = datetime.now().replace(microsecond=0)
    print("Script finished at: " + finished_at.strftime("%Y-%m-%d %H:%M:%S"))
    print("Script duration: " + str(finished_at - started_at))
    print("Number of errors encountered: " + str(ERROR_COUNT))
def flush_legacy_digital_file_properties(legacy_dip_files):
    """Copy the identifiers of each legacy digital object into `dip_files`.

    For every legacy `objectUUID` property row this looks up the object UUID
    and the AIP UUID in `property_i18n` and inserts one unparsed row into the
    working table. A record whose identifiers cannot be read or stored is
    reported, counted in ERROR_COUNT and skipped.

    Args:
        legacy_dip_files: iterable of `property` rows (dicts) holding at
            least the `id` (property row id) and `object_id` columns.
    """
    global ERROR_COUNT
    value_sql = "SELECT value FROM property_i18n WHERE id = %s;"

    for file in legacy_dip_files:
        # Messages identify the record by its digital object id; file['id']
        # is only the id of the objectUUID property row.
        object_label = str(file['object_id'])

        try:
            # The Archivematica object UUID is what locates this file's
            # metadata inside the METS file.
            mysqlCursor.execute(value_sql, file['id'])
            object_uuid = mysqlCursor.fetchone()
            if object_uuid is None:
                raise LookupError("No property_i18n row for property id " + str(file['id']))
        except Exception as e:
            print("Unable to select Object UUID for object# " + object_label + ". Skipping...")
            print(e)
            ERROR_COUNT += 1
            continue

        try:
            # The Archivematica AIP UUID is what fetches the METS file from
            # the Storage Service.
            sql = "SELECT id from property WHERE name = 'aipUUID' AND object_id = %s;"
            mysqlCursor.execute(sql, file['object_id'])
            property_id = mysqlCursor.fetchone()
            if property_id is None:
                raise LookupError("No aipUUID property for object id " + object_label)
            mysqlCursor.execute(value_sql, property_id['id'])
            aip_uuid = mysqlCursor.fetchone()
            if aip_uuid is None:
                raise LookupError("No property_i18n row for property id " + str(property_id['id']))
        except Exception as e:
            print("Unable to select AIP UUID for object# " + object_label + ". Skipping...")
            print(e)
            ERROR_COUNT += 1
            continue

        try:
            # Store the identifiers; `parsed` starts out False so main()
            # picks the row up for METS parsing.
            sql = "INSERT INTO dip_files (object_id, object_uuid, aip_uuid, parsed) VALUES (%s, %s, %s, %s);"
            mysqlCursor.execute(sql, (file['object_id'], object_uuid['value'], aip_uuid['value'], False))
            mysqlConnection.commit()
        except Exception as e:
            print("Unable to insert working data for object# " + object_label + ". Skipping...")
            print(e)
            ERROR_COUNT += 1
            continue
def get_mets_path(aip_uuid):
    """Ask the Storage Service where the METS file of an AIP lives.

    Args:
        aip_uuid: UUID of the AIP package to look up.

    Returns:
        A `(relative_path_to_mets, transfer_name)` tuple, both derived from
        the package's `current_path` and `uuid` fields.

    The script exits if the request itself cannot be made.
    """
    request_url = "".join((
        STORAGE_SERVICE_URL,
        "/file/",
        aip_uuid,
        "?username=",
        STORAGE_SERVICE_USER,
        "&api_key=",
        STORAGE_SERVICE_API_KEY,
    ))
    try:
        response = requests.get(request_url)
    except Exception as e:
        print(e)
        sys.exit("Unable to connect to Storage Service. Check your connection parameters.")

    package = response.json()
    current_path = package["current_path"]

    # Drop the first 40 characters of the stored path and, for compressed
    # packages, the ".7z" extension.
    # NOTE(review): the 40-character prefix is presumably the Storage
    # Service's UUID-derived directory tree - confirm.
    if current_path.endswith(".7z"):
        relativePath = current_path[40:-3]
    else:
        relativePath = current_path[40:]

    relativePathToMETS = relativePath + "/data/METS." + package["uuid"] + ".xml"

    # The transfer name is the package name minus its trailing UUID suffix
    # (the last 37 characters).
    transfer_name = relativePath[:-37]
    return relativePathToMETS, transfer_name
def get_mets_file(aip_uuid, relative_path):
    """Download one file from an AIP and save it as the AIP's local METS copy.

    Args:
        aip_uuid: UUID of the AIP package to extract from.
        relative_path: path of the wanted file inside the package.

    Returns:
        A `(status_code, request_url)` tuple. The file is only written to
        METS_DIR, as `METS.<aip_uuid>.xml`, when the status code is 200.
    """
    request_url = "".join((
        STORAGE_SERVICE_URL,
        "/file/",
        aip_uuid,
        "/extract_file/?relative_path_to_file=",
        relative_path,
        "&username=",
        STORAGE_SERVICE_USER,
        "&api_key=",
        STORAGE_SERVICE_API_KEY,
    ))
    response = requests.get(request_url)
    status = response.status_code

    if status == 200:
        target = os.path.join(METS_DIR, "METS.{}.xml".format(aip_uuid))
        with open(target, "wb") as handle:
            handle.write(response.content)

    return (status, request_url)
def _discard_working_rows(dip_files):
    """Delete the given rows from `dip_files` so they are not updated in AtoM."""
    sql = "DELETE FROM dip_files WHERE object_id = %s;"
    for dip_file in dip_files:
        mysqlCursor.execute(sql, dip_file['object_id'])
    mysqlConnection.commit()


def _mark_working_rows_parsed(dip_files):
    """Flag the given `dip_files` rows as parsed so main() stops selecting them."""
    sql = "UPDATE dip_files SET parsed = %s WHERE object_id = %s;"
    for dip_file in dip_files:
        # Both parameters must travel in ONE tuple: Cursor.execute() accepts
        # only (query, args).
        mysqlCursor.execute(sql, (True, dip_file['object_id']))
    mysqlConnection.commit()


def _parse_premis_datetime(value):
    """Turn the first 19 characters of a PREMIS date-time into a datetime."""
    return datetime.strptime(value[0:19], "%Y-%m-%dT%H:%M:%S")


def parse_mets_values(aip_uuid):
    """Fill the working table with METS metadata for every file of one AIP.

    All `dip_files` rows that belong to `aip_uuid` are handled in one call.
    The AIP's METS file is downloaded into METS_DIR unless a local copy is
    already there, parsed with metsrw, and the values found for each digital
    object are written to its working-table row together with
    `parsed = True`.

    Every failure path after the initial row lookup either removes the
    affected rows from the working table or marks them parsed. main() keeps
    calling this function while unparsed rows exist, so a row left untouched
    would be selected again forever.

    Args:
        aip_uuid: UUID of the Archivematica AIP to process.
    """
    global ERROR_COUNT

    # Identify all AtoM digital objects in this Archivematica AIP.
    try:
        sql = "SELECT * FROM dip_files WHERE aip_uuid = %s;"
        mysqlCursor.execute(sql, aip_uuid)
        legacy_dip_files = mysqlCursor.fetchall()
    except Exception as e:
        print("Unable to fetch the digital object records associated with AIP " + aip_uuid)
        print(e)
        return

    # The package lookup is needed even when the METS file is already on
    # disk, because it also yields the transfer name stored as `aipName`.
    try:
        path, transfer_name = get_mets_path(aip_uuid)
    except Exception as e:
        print("Unable to derive relative path of METS file in package " + aip_uuid)
        print(e)
        ERROR_COUNT += 1
        # Give up trying to update files from this AIP.
        _discard_working_rows(legacy_dip_files)
        return

    # Download the METS file if a local copy is not present. The name checked
    # here is the one get_mets_file() writes: METS.<aip_uuid>.xml.
    mets_path = os.path.join(METS_DIR, "METS." + aip_uuid + ".xml")
    if not os.path.exists(mets_path):
        try:
            mets_file_status, _ = get_mets_file(aip_uuid, path)
        except Exception as e:
            print("Unable to fetch METS file for package " + aip_uuid)
            print(e)
            ERROR_COUNT += 1
            _discard_working_rows(legacy_dip_files)
            return
        if mets_file_status != 200:
            print("Unable to fetch METS file for package " + aip_uuid)
            ERROR_COUNT += 1
            _discard_working_rows(legacy_dip_files)
            return

    # Read the METS file.
    try:
        mets = metsrw.METSDocument.fromfile(mets_path)
    except Exception as e:
        print("METSRW is unable to parse the METS XML for package " + aip_uuid + ". Check your markup and see archivematica/issues#1129.")
        print(e)
        ERROR_COUNT += 1
        # Give up on this AIP, but keep its rows (flagged as parsed).
        _mark_working_rows_parsed(legacy_dip_files)
        return

    for dip_file in legacy_dip_files:
        object_uuid = dip_file['object_uuid']

        # Retrieve the entry for the current AtoM digital object from the METS.
        try:
            fsentry = mets.get_file(file_uuid=object_uuid)
            # NOTE(review): the preservation-copy lookup below guards against
            # a None result, so a missing original is treated the same way -
            # confirm against the metsrw documentation.
            if fsentry is None:
                raise LookupError("No METS entry with file UUID " + str(object_uuid))
        except Exception as e:
            print("Unable to find metadata for file " + object_uuid + " in METS." + aip_uuid + ".xml")
            print(e)
            ERROR_COUNT += 1
            # Drop only this row, leaving its AtoM record untouched, and carry
            # on with the other files of the AIP.
            _discard_working_rows([dip_file])
            continue

        # Initialize all properties to None to avoid missing value errors.
        originalFileIngestedAt = None
        originalFileSize = None
        formatName = None
        formatVersion = None
        formatRegistryKey = None
        preservationCopyNormalizedAt = None
        preservationCopyFileName = None
        preservationCopyFileSize = None

        relativePathWithinAip = fsentry.path
        aipName = transfer_name
        originalFileName = fsentry.label

        for premis_event in fsentry.get_premis_events():
            if premis_event.event_type == "ingestion":
                originalFileIngestedAt = _parse_premis_datetime(premis_event.event_date_time)

        # TODO: Add all PREMIS Events to AtoM MySQL database as a string array
        # stored in a property_i18n text field. This is currently being done
        # for AtoM 2.7 DIP uploads, even though these values do not appear
        # anywhere in the AtoM GUI.

        for premis_object in fsentry.get_premis_objects():
            try:
                originalFileSize = premis_object.size
                formatName = premis_object.format_name
                # These two string forms are skipped so that a missing
                # registry key / version stays None.
                if str(premis_object.format_registry_key) not in ("(('format_registry_key',),)", "()"):
                    formatRegistryKey = premis_object.format_registry_key
                if str(premis_object.format_version) not in ("(('format_version',),)", "()"):
                    formatVersion = premis_object.format_version
            except Exception as e:
                # A workaround hack for some METSRW failures that were only
                # occurring on ISO formats in the sample data.
                formatName = "ISO Disk Image File"
                formatRegistryKey = "fmt/468"
                print(e)
                print("Unable to match file format to a registry key for digital object " + object_uuid + ". Using `fmt/468 - ISO Disk Image` as best guess.")
                ERROR_COUNT += 1
                continue

            # If this digital object has a preservation copy, retrieve its
            # information.
            try:
                if premis_object.relationship__relationship_sub_type == "is source of":
                    # The related-object identifier is read from one of two
                    # attribute paths; the second is the fallback.
                    try:
                        preservation_copy_uuid = premis_object.relationship__related_object_identification__related_object_identifier_value
                    except AttributeError:
                        preservation_copy_uuid = premis_object.relationship__related_object_identifier__related_object_identifier_value
                    preservation_file = mets.get_file(file_uuid=preservation_copy_uuid)
                    if preservation_file is not None:
                        preservationCopyFileName = preservation_file.label
                        for entry in preservation_file.get_premis_objects():
                            preservationCopyFileSize = entry.size
                        for event in preservation_file.get_premis_events():
                            if event.event_type == "creation":
                                preservationCopyNormalizedAt = _parse_premis_datetime(event.event_date_time)
            except Exception as e:
                print("Unable to add preservation copy information for file " + object_uuid + ".")
                print(e)
                ERROR_COUNT += 1
                preservationCopyNormalizedAt = None
                preservationCopyFileName = None
                preservationCopyFileSize = None
                continue

        # Write the METS values to the MySQL working table. This runs once per
        # file, after all of its PREMIS objects, so the row is always flagged
        # as parsed even when the file has no PREMIS objects.
        sql = "UPDATE dip_files SET originalFileIngestedAt = %s, relativePathWithinAip = %s, aipName = %s, originalFileName = %s, originalFileSize = %s, formatName = %s, formatVersion = %s, formatRegistryName = %s, formatRegistryKey = %s, preservationCopyNormalizedAt = %s, preservationCopyFileName = %s, preservationCopyFileSize = %s, parsed = %s WHERE object_uuid = %s;"
        mysqlCursor.execute(sql, (originalFileIngestedAt, relativePathWithinAip, aipName, originalFileName, originalFileSize, formatName, formatVersion, "PRONOM", formatRegistryKey, preservationCopyNormalizedAt, preservationCopyFileName, preservationCopyFileSize, True, object_uuid))
        mysqlConnection.commit()
def write_property(object_id, scope, name, value, object_uuid):
    """Insert one property and its English i18n value for a digital object.

    A row is added to `property` and a matching row, keyed by the new
    property id, to `property_i18n`; both are committed together. If either
    insert fails the pending work is rolled back, the failure is reported
    and ERROR_COUNT is incremented. Nothing is raised to the caller.

    Args:
        object_id: value for `property.object_id`.
        scope: value for `property.scope`.
        name: value for `property.name`.
        value: value for `property_i18n.value`.
        object_uuid: used only to identify the digital object in the error
            message.
    """
    global ERROR_COUNT
    try:
        sql = "INSERT INTO `property` (`object_id`, `scope`, `name`, `source_culture`) VALUES (%s, %s, %s, %s)"
        mysqlCursor.execute(sql, (object_id, scope, name, "en"))
        property_id = mysqlCursor.lastrowid
        sql = "INSERT INTO `property_i18n` (`value`, `id`, `culture`) VALUES (%s, %s, %s)"
        mysqlCursor.execute(sql, (value, property_id, "en"))
        mysqlConnection.commit()
    except Exception as e:
        print("Unable to add property `" + name + " for digital object " + object_uuid)
        print(e)
        ERROR_COUNT += 1
        # Without a rollback, a `property` row inserted before the failure
        # would be committed by the next successful commit, leaving a
        # property with no value row.
        try:
            mysqlConnection.rollback()
        except Exception as rollback_error:
            print(rollback_error)
def update_digital_file_properties():
    """Replace each legacy object's AtoM properties with the METS values.

    For every row in the `dip_files` working table, all existing `property`
    rows of that object are deleted and a fresh set is written with
    write_property(). An object whose existing properties cannot be deleted
    is reported, counted in ERROR_COUNT and skipped.
    """
    # Needed because the error handler below assigns to the module-level
    # counter.
    global ERROR_COUNT

    aip_scope = "Archivematica AIP"
    premis_scope = "premisData"
    # (scope, property name, working-table column), in the order written.
    property_map = (
        (aip_scope, "objectUUID", "object_uuid"),
        (aip_scope, "aipUUID", "aip_uuid"),
        (aip_scope, "relativePathWithinAip", "relativePathWithinAip"),
        (aip_scope, "aipName", "aipName"),
        (aip_scope, "originalFileName", "originalFileName"),
        (aip_scope, "originalFileSize", "originalFileSize"),
        (aip_scope, "originalFileIngestedAt", "originalFileIngestedAt"),
        (aip_scope, "preservationCopyFileName", "preservationCopyFileName"),
        (aip_scope, "preservationCopyFileSize", "preservationCopyFileSize"),
        (aip_scope, "preservationCopyNormalizedAt", "preservationCopyNormalizedAt"),
        (premis_scope, "formatName", "formatName"),
        (premis_scope, "formatVersion", "formatVersion"),
        (premis_scope, "formatRegistryName", "formatRegistryName"),
        (premis_scope, "formatRegistryKey", "formatRegistryKey"),
    )

    # Select all the legacy DIP file records from the working table.
    sql = "SELECT * FROM dip_files;"
    mysqlCursor.execute(sql)
    legacy_dip_files = mysqlCursor.fetchall()

    for file in legacy_dip_files:
        # Delete the existing properties.
        # NOTE(review): this removes every `property` row of the object, not
        # only the Archivematica ones - confirm that is intended.
        try:
            sql = "DELETE FROM property WHERE object_id = %s;"
            mysqlCursor.execute(sql, file['object_id'])
            mysqlConnection.commit()
        except Exception as e:
            # `dip_files` rows have no `id` column; object_id identifies them.
            print("Unable to flush existing property values for object# " + str(file['object_id']) + ". Skipping...")
            print(e)
            ERROR_COUNT += 1
            continue

        # Write the new property values taken from the METS file.
        for scope, name, column in property_map:
            write_property(file["object_id"], scope, name, file[column], file["object_uuid"])
def delete_temporary_files():
    """Remove the METS download directory and the MySQL working tables.

    The two clean-up steps are independent: a failure in either one is
    printed and does not stop the other.
    """
    try:
        mets_dir_present = os.path.exists(METS_DIR)
        if mets_dir_present:
            shutil.rmtree(METS_DIR)
    except Exception as e:
        print("Unable to delete the temporary METS file download directory.")
        print(e)

    drop_statement = "DROP TABLE IF EXISTS dip_files, premis_events;"
    try:
        mysqlCursor.execute(drop_statement)
        mysqlConnection.commit()
    except Exception as e:
        print("Unable to delete the working tables from the AtoM MySQL database.")
        print(e)
# Call main() only when this file is executed directly.
if __name__ == "__main__":
    main()