Skip to content

Commit

Permalink
update for new dedupe data model pattern
Browse files (browse the repository at this point in the history)
  • Loading branch information
fgregg committed Jun 27, 2024
1 parent fcd36af commit 6b35f74
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 27 deletions.
21 changes: 10 additions & 11 deletions datetimetype/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import numpy as np
from datetime_distance import DateTimeComparator
from dedupe import predicates
from dedupe.variables.base import DerivedType, FieldType
from dedupe.variables.string import affineGap

import datetimetype.datetime_predicates as dtp
from dedupe import predicates


class DateTimeType(FieldType):
class DateTime(FieldType):

type = "DateTime"
_predicate_functions = [
Expand All @@ -26,7 +26,7 @@ def __len__(self):

return self.expanded_size

def __init__(self, definition):
def __init__(self, field, fuzzy=True, dayfirst=False, yearfirst=False, **kwargs):
"""
Initialize a field for comparing datetime types, including timestamps,
dates, months, and years.
Expand All @@ -41,30 +41,29 @@ def __init__(self, definition):
for more information about python-dateutil's parser settings.
"""

super(DateTimeType, self).__init__(definition)
super().__init__(field, **kwargs)

# Parser settings
self.fuzzy = definition.get("fuzzy", True)
self.dayfirst = definition.get("dayfirst", False)
self.yearfirst = definition.get("yearfirst", False)
self.fuzzy = fuzzy
self.dayfirst = dayfirst
self.yearfirst = yearfirst

# Define the expected fields in the output vector
self.variables = ("seconds", "days", "months", "years", "full string")
fields = self._get_fields(definition["field"])
fields = self._get_fields(field)

# Format for output vector: Not Missing + Dummies + Fields
self.expanded_size = 1 + (len(self.variables) - 1) + len(self.variables)

self.higher_vars = [
DerivedType({"name": variable, "type": field_type})
for variable, field_type in fields
DerivedType(variable, field_type) for variable, field_type in fields
]

def _get_fields(self, field):
"""
Returns the format for the output vector.
"""
fields = [("{}: Not Missing".format(field), "Dummy")]
fields = [(f"{field}: Not Missing", "Dummy")]

fields += [(var, "Dummy") for var in self.variables[:-1]]

Expand Down
3 changes: 0 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,6 @@ dependencies = ["dedupe>=3.0", "datetime-distance"]
[project.urls]
Homepage = "https://github.com/datamade/dedupe-variable-datetime"

[project.entry-points]
dedupevariables = {datetimetype = "datetimetype:DateTimeType"}

[tool.setuptools]
packages = ["datetimetype"]
include-package-data = false
26 changes: 13 additions & 13 deletions tests/test_datetime_comparator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,67 +4,67 @@
import numpy as np

import datetimetype.datetime_predicates as dtp
from datetimetype import DateTimeType
from datetimetype import DateTime


def test_datetime_to_datetime_comparison():
dt = DateTimeType({"field": "foo"})
dt = DateTime("foo")
np.testing.assert_almost_equal(
dt.comparator("2017-05-25", "2017-01-01"),
np.array([1, 0, 1, 0, 0, 0, math.sqrt(144), 0, 0, 0]),
)


def test_datetime_to_timestamp_comparison():
dt = DateTimeType({"field": "foo"})
dt = DateTime("foo")
np.testing.assert_almost_equal(
dt.comparator("2017-05-25", "2017-01-01 12:30:05"),
np.array([1, 0, 1, 0, 0, 0, math.sqrt(143), 0, 0, 0]),
)


def test_timestamp_to_timestamp_comparison():
dt = DateTimeType({"field": "foo"})
dt = DateTime("foo")
np.testing.assert_almost_equal(
dt.comparator("2017-05-25 21:08:09", "2017-01-01 12:30:05"),
np.array([1, 1, 0, 0, 0, math.sqrt(12472684), 0, 0, 0, 0]),
)


def test_years():
dt = DateTimeType({"field": "foo"})
dt = DateTime("foo")
np.testing.assert_almost_equal(
dt.comparator("2012", "2010"),
np.array([1, 0, 0, 0, 1, 0, 0, 0, math.sqrt(2), 0]),
)


def test_months():
dt = DateTimeType({"field": "foo"})
dt = DateTime("foo")
np.testing.assert_almost_equal(
dt.comparator("May 2012", "June 2013"),
np.array([1, 0, 0, 1, 0, 0, 0, math.sqrt(13), 0, 0]),
)


def test_days():
dt = DateTimeType({"field": "foo"})
dt = DateTime("foo")
np.testing.assert_almost_equal(
dt.comparator("5 May 2013", "9 June 2013"),
np.array([1, 0, 1, 0, 0, 0, math.sqrt(35), 0, 0, 0]),
)


def test_month_and_day():
dt = DateTimeType({"field": "foo"})
dt = DateTime("foo")
np.testing.assert_almost_equal(
dt.comparator("7/7", "July 9th"),
np.array([1, 0, 1, 0, 0, 0, math.sqrt(2), 0, 0, 0]),
)


def test_alternate_formats():
dt = DateTimeType({"field": "foo"})
dt = DateTime("foo")
comp = dt.comparator("May 5th, 2013", "2013-06-09")
np.testing.assert_almost_equal(
comp, np.array([1, 0, 1, 0, 0, 0, math.sqrt(35), 0, 0, 0])
Expand All @@ -82,7 +82,7 @@ def test_alternate_formats():


def test_bad_parse():
dt = DateTimeType({"field": "foo"})
dt = DateTime("foo")
np.testing.assert_almost_equal(
dt.comparator("foo", "bar"), np.array([1, 0, 0, 0, 0, 0, 0, 0, 0, 5.5])
)
Expand All @@ -91,7 +91,7 @@ def test_bad_parse():


def test_fuzzy_parse():
dt = DateTimeType({"field": "foo"})
dt = DateTime("foo")
time1 = "June 6th 2013"
time2 = "It happened on June 7th, 2013"
np.testing.assert_almost_equal(
Expand All @@ -100,13 +100,13 @@ def test_fuzzy_parse():


def test_missing():
dt = DateTimeType({"field": "foo"})
dt = DateTime("foo")
np.testing.assert_almost_equal(dt.comparator("", "non-empty"), np.zeros(len(dt)))
np.testing.assert_almost_equal(dt.comparator(None, "non-empty"), np.zeros(len(dt)))


def test_datetime_object():
dt = DateTimeType({"field": "foo"})
dt = DateTime("foo")
a = datetime.datetime(2016, 5, 6, 0, 0)
b = datetime.datetime(2016, 5, 7, 0, 0)
np.testing.assert_almost_equal(
Expand Down

0 comments on commit 6b35f74

Please sign in to comment.