forked from terrywbrady/File-Analyzer
-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #78 from Georgetown-University-Libraries/sd1320
Refine EAD to DC mapping
- Loading branch information
Showing
2 changed files
with
192 additions
and
159 deletions.
There are no files selected for viewing
306 changes: 154 additions & 152 deletions
306
dspace/src/main/edu/georgetown/library/fileAnalyzer/importer/EAD2DC.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,152 +1,154 @@ | ||
package edu.georgetown.library.fileAnalyzer.importer; | ||
|
||
import java.io.File; | ||
import java.io.IOException; | ||
import java.text.ParseException; | ||
import java.text.SimpleDateFormat; | ||
import java.util.Date; | ||
import java.util.HashMap; | ||
import java.util.TreeMap; | ||
import java.util.Vector; | ||
import java.util.regex.Pattern; | ||
|
||
import javax.xml.transform.TransformerException; | ||
|
||
import org.w3c.dom.Document; | ||
import org.xml.sax.SAXException; | ||
|
||
|
||
import gov.nara.nwts.ftapp.ActionResult; | ||
import gov.nara.nwts.ftapp.FTDriver; | ||
import gov.nara.nwts.ftapp.Timer; | ||
import gov.nara.nwts.ftapp.ftprop.FTPropString; | ||
import gov.nara.nwts.ftapp.importer.DefaultImporter; | ||
import gov.nara.nwts.ftapp.importer.DelimitedFileReader; | ||
import gov.nara.nwts.ftapp.stats.Stats; | ||
import gov.nara.nwts.ftapp.stats.StatsGenerator; | ||
import gov.nara.nwts.ftapp.stats.StatsItem; | ||
import gov.nara.nwts.ftapp.stats.StatsItemConfig; | ||
import gov.nara.nwts.ftapp.stats.StatsItemEnum; | ||
import edu.georgetown.library.fileAnalyzer.util.XMLUtil; | ||
|
||
/** | ||
* Importer for tab delimited files | ||
* | ||
* @author TBrady | ||
* | ||
*/ | ||
public class EAD2DC extends DefaultImporter { | ||
|
||
public static enum EAD2DCStatsItems implements StatsItemEnum { | ||
Record(StatsItem.makeStringStatsItem("Record", 100).setExport(false)); | ||
|
||
StatsItem si; | ||
|
||
EAD2DCStatsItems(StatsItem si) { | ||
this.si = si; | ||
} | ||
|
||
public StatsItem si() { | ||
return si; | ||
} | ||
} | ||
|
||
public static enum Generator implements StatsGenerator { | ||
INSTANCE; | ||
public Stats create(String key) { | ||
return new Stats(details, key); | ||
} | ||
} | ||
|
||
public static StatsItemConfig details = StatsItemConfig | ||
.create(EAD2DCStatsItems.class); | ||
public static String P_COLL = "Collection"; | ||
public static String P_RIGHTS = "RIGHTS"; | ||
public static String P_REFCOL = "refid-column-name"; | ||
|
||
|
||
public EAD2DC(FTDriver dt) { | ||
super(dt); | ||
this.ftprops.add(new FTPropString(dt, this.getClass().getSimpleName(), | ||
P_COLL, P_COLL, | ||
"DSpace Collection Handle","")); | ||
this.ftprops.add(new FTPropString(dt, this.getClass().getSimpleName(), | ||
P_RIGHTS, P_RIGHTS, | ||
"dc.rights statement","")); | ||
this.ftprops.add(new FTPropString(dt, this.getClass().getSimpleName(), | ||
P_REFCOL, P_REFCOL, | ||
"Metadata registry field name to store the archival object refid","gu.archivesspace.id")); | ||
} | ||
|
||
public String toString() { | ||
return "EAD to DSpace Dublin Core"; | ||
} | ||
|
||
public String getDescription() { | ||
return "This rule will take an exported EAD file and convert archival objects to dublin core metadata."; | ||
} | ||
|
||
public String getShortName() { | ||
return "EAD2DC"; | ||
} | ||
|
||
public ActionResult importFile(File selectedFile) throws IOException { | ||
details = StatsItemConfig.create(EAD2DCStatsItems.class); | ||
HashMap<String, Object> params = new HashMap<>(); | ||
params.put("collection", this.getProperty(P_COLL)); | ||
params.put("rights", this.getProperty(P_RIGHTS)); | ||
params.put("refcol", this.getProperty(P_REFCOL)); | ||
Timer timer = new Timer(); | ||
TreeMap<String, Stats> types = new TreeMap<String, Stats>(); | ||
|
||
try { | ||
Document d = XMLUtil.db_ns.parse(selectedFile); | ||
File csv = new File(selectedFile.getParent(), selectedFile.getName()+".csv"); | ||
XMLUtil.doTransform(d, csv, "edu/georgetown/library/fileAnalyzer/ead.xsl", params); | ||
DelimitedFileReader dfr = new DelimitedFileReader(csv, ","); | ||
Vector<String> header = dfr.getRow(); | ||
for(String col: header) { | ||
details.addStatsItem(col, StatsItem.makeStringStatsItem(col)); | ||
} | ||
int rownum = 1_000_000; | ||
for(Vector<String>row=dfr.getRow(); row!=null; row=dfr.getRow()) { | ||
String key = ""+rownum++; | ||
Stats stats = Generator.INSTANCE.create(key); | ||
types.put(key, stats); | ||
for(int i=0; i<header.size(); i++) { | ||
String s = row.size() > i ? row.get(i) : ""; | ||
String col = header.get(i); | ||
if (col.equals("dc.date.created[en]")) { | ||
s = normalizeDate(s); | ||
} | ||
stats.appendKeyVal(details.getByKey(col), s); | ||
} | ||
} | ||
} catch (SAXException e) { | ||
e.printStackTrace(); | ||
} catch (TransformerException e) { | ||
e.printStackTrace(); | ||
} | ||
return new ActionResult(selectedFile, "EAD2DC", | ||
this.toString(), details, types, true, timer.getDuration()); | ||
} | ||
|
||
public String normalizeDate(String s) { | ||
if (Pattern.matches("^\\d\\d\\d\\d(-\\d\\d(-\\d\\d)?)?", s)) { | ||
return s; | ||
} | ||
try { | ||
Date d = new SimpleDateFormat("DD MMM yyyy").parse(s); | ||
return new SimpleDateFormat("yyyy-MM-DD").format(d); | ||
} catch (ParseException e1) { | ||
// no action | ||
} | ||
try { | ||
Date d = new SimpleDateFormat("MMM yyyy").parse(s); | ||
return new SimpleDateFormat("yyyy-MM").format(d); | ||
} catch (ParseException e1) { | ||
// no action | ||
} | ||
return s; | ||
} | ||
} | ||
package edu.georgetown.library.fileAnalyzer.importer; | ||
|
||
import java.io.File; | ||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.text.ParseException; | ||
import java.text.SimpleDateFormat; | ||
import java.util.Date; | ||
import java.util.HashMap; | ||
import java.util.TreeMap; | ||
import java.util.Vector; | ||
import java.util.regex.Pattern; | ||
|
||
import javax.xml.transform.TransformerException; | ||
|
||
import org.w3c.dom.Document; | ||
import org.xml.sax.SAXException; | ||
|
||
|
||
import gov.nara.nwts.ftapp.ActionResult; | ||
import gov.nara.nwts.ftapp.FTDriver; | ||
import gov.nara.nwts.ftapp.Timer; | ||
import gov.nara.nwts.ftapp.ftprop.FTPropString; | ||
import gov.nara.nwts.ftapp.importer.DefaultImporter; | ||
import gov.nara.nwts.ftapp.importer.DelimitedFileReader; | ||
import gov.nara.nwts.ftapp.stats.Stats; | ||
import gov.nara.nwts.ftapp.stats.StatsGenerator; | ||
import gov.nara.nwts.ftapp.stats.StatsItem; | ||
import gov.nara.nwts.ftapp.stats.StatsItemConfig; | ||
import gov.nara.nwts.ftapp.stats.StatsItemEnum; | ||
import edu.georgetown.library.fileAnalyzer.util.XMLUtil; | ||
|
||
/** | ||
* Importer for tab delimited files | ||
* | ||
* @author TBrady | ||
* | ||
*/ | ||
public class EAD2DC extends DefaultImporter { | ||
|
||
public static enum EAD2DCStatsItems implements StatsItemEnum { | ||
Record(StatsItem.makeStringStatsItem("Record", 100).setExport(false)); | ||
|
||
StatsItem si; | ||
|
||
EAD2DCStatsItems(StatsItem si) { | ||
this.si = si; | ||
} | ||
|
||
public StatsItem si() { | ||
return si; | ||
} | ||
} | ||
|
||
public static enum Generator implements StatsGenerator { | ||
INSTANCE; | ||
public Stats create(String key) { | ||
return new Stats(details, key); | ||
} | ||
} | ||
|
||
public static StatsItemConfig details = StatsItemConfig | ||
.create(EAD2DCStatsItems.class); | ||
public static String P_COLL = "Collection"; | ||
public static String P_RIGHTS = "RIGHTS"; | ||
public static String P_REFCOL = "refid-column-name"; | ||
|
||
|
||
public EAD2DC(FTDriver dt) { | ||
super(dt); | ||
this.ftprops.add(new FTPropString(dt, this.getClass().getSimpleName(), | ||
P_COLL, P_COLL, | ||
"DSpace Collection Handle","")); | ||
this.ftprops.add(new FTPropString(dt, this.getClass().getSimpleName(), | ||
P_RIGHTS, P_RIGHTS, | ||
"dc.rights statement","")); | ||
this.ftprops.add(new FTPropString(dt, this.getClass().getSimpleName(), | ||
P_REFCOL, P_REFCOL, | ||
"Metadata registry field name to store the archival object refid","gu.archivesspace.id")); | ||
} | ||
|
||
public String toString() { | ||
return "EAD to DSpace Dublin Core"; | ||
} | ||
|
||
public String getDescription() { | ||
return "This rule will take an exported EAD file and convert archival objects to dublin core metadata."; | ||
} | ||
|
||
public String getShortName() { | ||
return "EAD2DC"; | ||
} | ||
|
||
public ActionResult importFile(File selectedFile) throws IOException { | ||
details = StatsItemConfig.create(EAD2DCStatsItems.class); | ||
HashMap<String, Object> params = new HashMap<>(); | ||
params.put("collection", this.getProperty(P_COLL)); | ||
params.put("rights", this.getProperty(P_RIGHTS)); | ||
params.put("refcol", this.getProperty(P_REFCOL)); | ||
Timer timer = new Timer(); | ||
TreeMap<String, Stats> types = new TreeMap<String, Stats>(); | ||
|
||
try { | ||
Document d = XMLUtil.db_ns.parse(selectedFile); | ||
File csv = new File(selectedFile.getParent(), selectedFile.getName()+".csv"); | ||
InputStream in = XMLUtil.getResourceStream(this, "edu/georgetown/library/fileAnalyzer/ead.xsl"); | ||
XMLUtil.doTransform(d, csv, in, params); | ||
DelimitedFileReader dfr = new DelimitedFileReader(csv, ","); | ||
Vector<String> header = dfr.getRow(); | ||
for(String col: header) { | ||
details.addStatsItem(col, StatsItem.makeStringStatsItem(col)); | ||
} | ||
int rownum = 1_000_000; | ||
for(Vector<String>row=dfr.getRow(); row!=null; row=dfr.getRow()) { | ||
String key = ""+rownum++; | ||
Stats stats = Generator.INSTANCE.create(key); | ||
types.put(key, stats); | ||
for(int i=0; i<header.size(); i++) { | ||
String s = row.size() > i ? row.get(i) : ""; | ||
String col = header.get(i); | ||
if (col.equals("dc.date.created[en]")) { | ||
s = normalizeDate(s); | ||
} | ||
stats.appendKeyVal(details.getByKey(col), s); | ||
} | ||
} | ||
} catch (SAXException e) { | ||
e.printStackTrace(); | ||
} catch (TransformerException e) { | ||
e.printStackTrace(); | ||
} | ||
return new ActionResult(selectedFile, "EAD2DC", | ||
this.toString(), details, types, true, timer.getDuration()); | ||
} | ||
|
||
public String normalizeDate(String s) { | ||
if (Pattern.matches("^\\d\\d\\d\\d(-\\d\\d(-\\d\\d)?)?", s)) { | ||
return s; | ||
} | ||
try { | ||
Date d = new SimpleDateFormat("DD MMM yyyy").parse(s); | ||
return new SimpleDateFormat("yyyy-MM-DD").format(d); | ||
} catch (ParseException e1) { | ||
// no action | ||
} | ||
try { | ||
Date d = new SimpleDateFormat("MMM yyyy").parse(s); | ||
return new SimpleDateFormat("yyyy-MM").format(d); | ||
} catch (ParseException e1) { | ||
// no action | ||
} | ||
return s; | ||
} | ||
} |
Oops, something went wrong.