Skip to content

Commit

Permalink
Merge pull request #10 from data61/develop
Browse files Browse the repository at this point in the history
update matcher and utils
  • Loading branch information
wangzhen263 authored Mar 13, 2018
2 parents 4e7fa5c + cf3656c commit 7881904
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 33 deletions.
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@
<dependency>
<groupId>sh.serene</groupId>
<artifactId>stellar-utils</artifactId>
- <version>0.2.0</version>
+ <version>0.2.1</version>
</dependency>

<dependency>
Expand Down
49 changes: 17 additions & 32 deletions src/main/java/data/CoraMatcherMerger.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,23 +23,22 @@ public class CoraMatcherMerger extends BasicMatcherMerger implements
private Map<String, String> gotAttrkeys;
private Type type;

private Map<String, JaroTFIDFMatcher> matchers;

public CoraMatcherMerger(JSONConfig props)
throws FileNotFoundException, UnsupportedEncodingException {

_factory = new SimpleRecordFactory();
gson = new Gson();

float fnf = props.attributes.get("full_name").floatValue();
float addf = props.attributes.get("address").floatValue();

System.out.println("Similarity Threshold - Name: " + fnf + " Address: " + addf);
matchers = new HashMap<>();
props.attributes.forEach( (k, v) -> {
matchers.put(k, new JaroTFIDFMatcher(v.floatValue()));
System.out.println("Similarity Threshold - " + k + ":" + v);
});

keyAttributes = new ArrayList<>(props.attributes.keySet());
format = DataFileFormat.fromString(props.dataFormat);

nameMatcher = new JaroTFIDFMatcher(fnf);
addrMatcher = new JaroTFIDFMatcher(addf);

type = new TypeToken<Map<String, String>>(){}.getType();
}

Expand All @@ -50,36 +49,22 @@ protected double calculateConfidence(double c1, double c2)

protected boolean matchInternal(Record r1, Record r2) {
Set<String> attrkeys = r1.getAttributes().keySet();
Map<String, String> gotAttrkeys = new HashMap<>();
List<String> gotAttrkeys = new ArrayList<>();

keyAttributes.forEach(ka->{
Optional<String> ret = attrkeys.stream().parallel().filter(attrkey->attrkey.toLowerCase().contains(ka.toLowerCase())).findAny();
Optional<String> ret = attrkeys.stream().parallel().filter(attrkey->attrkey.contains(ka)).findAny();
if (ret.isPresent()) {
gotAttrkeys.put(ka.toLowerCase(), ret.get());
gotAttrkeys.add(ka);
}
});

for (Map.Entry<String, String> entry : gotAttrkeys.entrySet()) {
String attrKey = entry.getValue();
String propKey = entry.getKey();
Attribute a1 = r1.getAttribute(attrKey);
Attribute a2 = r2.getAttribute(attrKey);

switch (propKey) {
case "full_name":
if (a1 != null && a2 != null) {
if (!ExistentialBooleanComparator.attributesMatch(a1, a2, nameMatcher))
return false;
}
break;
case "address":
if (a1 != null && a2 != null) {
if (!ExistentialBooleanComparator.attributesMatch(a1, a2, addrMatcher))
return false;
}
break;
default:
break;
for (String key : gotAttrkeys) {
Attribute a1 = r1.getAttribute(key);
Attribute a2 = r2.getAttribute(key);

if (a1 != null && a2 != null) {
if (!ExistentialBooleanComparator.attributesMatch(a1, a2, matchers.get(key)))
return false;
}
}

Expand Down

0 comments on commit 7881904

Please sign in to comment.