diff --git a/changelog.md b/changelog.md index 336f21e6e..e320ea98c 100644 --- a/changelog.md +++ b/changelog.md @@ -13,6 +13,12 @@ * `eds.SOFA` → `sofa` * `eds.adicap` → `adicap` * `eds.adicap` → `adicap` +- `eds.dates` now separate dates from durations. Each entity has its own label: + * `spans["dates"]` → entities labelled as `date` with a `span._.date` parsed object + * `spans["durations"]` → entities labelled as `duration` with a `span._.duration` parsed object +- the "relative" / "absolute" / "duration" mode of the time entity is now stored in + the `mode` attribute of the `span._.date/duration` +- the "from" / "until" period bound, if any, is now stored in the `span._.date.bound` attribute ## v0.8.1 (2023-05-31) diff --git a/edsnlp/pipelines/misc/dates/dates.py b/edsnlp/pipelines/misc/dates/dates.py index e8e5d708a..1c1ebaf2e 100644 --- a/edsnlp/pipelines/misc/dates/dates.py +++ b/edsnlp/pipelines/misc/dates/dates.py @@ -12,7 +12,7 @@ from edsnlp.utils.filter import filter_spans from . import patterns -from .models import AbsoluteDate, Duration, Mode, Period, RelativeDate +from .models import AbsoluteDate, Bound, Duration, Mode, Period, RelativeDate PERIOD_PROXIMITY_THRESHOLD = 3 @@ -41,7 +41,7 @@ class Dates(BaseComponent): false_positive : Union[List[str], str] List of regular expressions for false positive (eg phone numbers, etc). 
on_ents_only : Union[bool, str, Iterable[str]] - Wether to look on dates in the whole document or in specific sentences: + Whether to look on dates in the whole document or in specific sentences: - If `True`: Only look in the sentences of each entity in doc.ents - If False: Look in the whole document @@ -69,7 +69,6 @@ def __init__( as_ents: bool, attr: str, ): - self.nlp = nlp if absolute is None: @@ -122,10 +121,13 @@ def set_extensions(cls) -> None: if not Span.has_extension("date"): Span.set_extension("date", default=None) + if not Span.has_extension("duration"): + Span.set_extension("duration", default=None) + if not Span.has_extension("period"): Span.set_extension("period", default=None) - def process(self, doc: Doc) -> List[Span]: + def process(self, doc: Doc) -> List[Tuple[Span, Dict[str, str]]]: """ Find dates in doc. @@ -164,33 +166,47 @@ def process(self, doc: Doc) -> List[Span]: return dates - def parse(self, dates: List[Tuple[Span, Dict[str, str]]]) -> List[Span]: + def parse( + self, matches: List[Tuple[Span, Dict[str, str]]] + ) -> Tuple[List[Span], List[Span]]: """ Parse dates using the groupdict returned by the matcher. Parameters ---------- - dates : List[Tuple[Span, Dict[str, str]]] + matches : List[Tuple[Span, Dict[str, str]]] List of tuples containing the spans and groupdict returned by the matcher. Returns ------- - List[Span] + Tuple[List[Span], List[Span]] List of processed spans, with the date parsed. 
""" - for span, groupdict in dates: + dates = [] + durations = [] + for span, groupdict in matches: if span.label_ == "relative": parsed = RelativeDate.parse_obj(groupdict) + span.label_ = "date" + span._.date = parsed + dates.append(span) + print("SPAN", span, parsed.dict()) elif span.label_ == "absolute": parsed = AbsoluteDate.parse_obj(groupdict) + span.label_ = "date" + span._.date = parsed + dates.append(span) + print("SPAN", span, parsed.dict()) else: parsed = Duration.parse_obj(groupdict) + span.label_ = "duration" + span._.duration = parsed + durations.append(span) + print("SPAN", span, parsed.dict()) - span._.date = parsed - - return [span for span, _ in dates] + return dates, durations def process_periods(self, dates: List[Span]) -> List[Span]: """ @@ -216,28 +232,32 @@ def process_periods(self, dates: List[Span]) -> List[Span]: dates = list(sorted(dates, key=lambda d: d.start)) for d1, d2 in zip(dates[:-1], dates[1:]): - - if d1._.date.mode == Mode.DURATION or d2._.date.mode == Mode.DURATION: + v1 = d1._.date if d1.label_ == "date" else d1._.duration + v2 = d2._.date if d2.label_ == "date" else d2._.duration + if v1.mode == Mode.DURATION or v2.mode == Mode.DURATION: pass - elif d1 in seen or d1._.date.mode is None or d2._.date.mode is None: + elif d1 in seen or v1.bound is None or v2.bound is None: continue - if ( - d1.end - d2.start < PERIOD_PROXIMITY_THRESHOLD - and d1._.date.mode != d2._.date.mode - ): - + if d1.end - d2.start < PERIOD_PROXIMITY_THRESHOLD and v1.bound != v2.bound: period = Span(d1.doc, d1.start, d2.end, label="period") # If one date is a duration, - # the other may not have a registered mode. - m1 = d1._.date.mode or Mode.FROM - m2 = d2._.date.mode or Mode.FROM + # the other may not have a registered bound attribute. 
+ if v1.mode == Mode.DURATION: + m1 = Bound.FROM if v2.bound == Bound.UNTIL else Bound.UNTIL + m2 = v2.bound or Bound.FROM + elif v2.mode == Mode.DURATION: + m1 = v1.bound or Bound.FROM + m2 = Bound.FROM if v1.bound == Bound.UNTIL else Bound.UNTIL + else: + m1 = v1.bound or Bound.FROM + m2 = v2.bound or Bound.FROM period._.period = Period.parse_obj( { - m1.value: d1, - m2.value: d2, + m1: d1, + m2: d2, } ) @@ -262,17 +282,18 @@ def __call__(self, doc: Doc) -> Doc: doc : Doc spaCy Doc object, annotated for dates """ - dates = self.process(doc) - dates = self.parse(dates) + matches = self.process(doc) + dates, durations = self.parse(matches) doc.spans["dates"] = dates + doc.spans["durations"] = durations if self.detect_periods: - doc.spans["periods"] = self.process_periods(dates) + doc.spans["periods"] = self.process_periods(dates + durations) if self.as_ents: ents, discarded = filter_spans( - list(doc.ents) + dates, return_discarded=True + list(doc.ents) + dates + durations, return_discarded=True ) doc.ents = ents diff --git a/edsnlp/pipelines/misc/dates/models.py b/edsnlp/pipelines/misc/dates/models.py index f676c8060..46037dd83 100644 --- a/edsnlp/pipelines/misc/dates/models.py +++ b/edsnlp/pipelines/misc/dates/models.py @@ -10,18 +10,21 @@ from edsnlp.pipelines.misc.dates.patterns.relative import specific_dict -class Direction(Enum): +class Direction(str, Enum): + FUTURE = "future" + PAST = "past" + CURRENT = "current" - FUTURE = "FUTURE" - PAST = "PAST" - CURRENT = "CURRENT" +class Bound(str, Enum): + UNTIL = "until" + FROM = "from" -class Mode(Enum): - FROM = "FROM" - UNTIL = "UNTIL" - DURATION = "DURATION" +class Mode(str, Enum): + ABSOLUTE = "absolute" + RELATIVE = "relative" + DURATION = "duration" class Period(BaseModel): @@ -35,7 +38,8 @@ class Config: class BaseDate(BaseModel): - mode: Optional[Mode] = None + mode: Mode = None + bound: Optional[Bound] = None @validator("*", pre=True) def remove_space(cls, v): @@ -58,6 +62,7 @@ def validate_strings(cls, d: 
Dict[str, str]) -> Dict[str, str]: class AbsoluteDate(BaseDate): + mode: Mode = Mode.ABSOLUTE year: Optional[int] = None month: Optional[int] = None day: Optional[int] = None @@ -77,6 +82,7 @@ def to_datetime( d = self.dict(exclude_none=True) d.pop("mode", None) + d.pop("bound", None) if self.year and self.month and self.day: try: return pendulum.datetime(**d, tz=tz) @@ -151,6 +157,7 @@ def validate_year(cls, v): class Relative(BaseDate): + mode: Mode = Mode.RELATIVE year: Optional[int] = None month: Optional[int] = None week: Optional[int] = None @@ -184,29 +191,26 @@ def parse_unit(cls, d: Dict[str, str]) -> Dict[str, str]: return d - def to_datetime(self, **kwargs) -> pendulum.Duration: + +class RelativeDate(Relative): + direction: Direction = Direction.CURRENT + + def to_datetime( + self, + note_datetime: Optional[datetime] = None, + **kwargs, + ) -> pendulum.Duration: d = self.dict(exclude_none=True) direction = d.pop("direction", None) dir = -1 if direction == Direction.PAST else 1 d.pop("mode", None) + d.pop("bound", None) d = {f"{k}s": v for k, v in d.items()} td = dir * pendulum.duration(**d) - return td - - -class RelativeDate(Relative): - direction: Direction = Direction.CURRENT - - def to_datetime( - self, - note_datetime: Optional[datetime] = None, - **kwargs, - ) -> pendulum.Duration: - td = super(RelativeDate, self).to_datetime() if note_datetime is not None and not isinstance(note_datetime, NaTType): return note_datetime + td @@ -264,3 +268,10 @@ def norm(self) -> str: td = self.to_datetime() return f"during {td}" + + def to_datetime(self, **kwargs) -> pendulum.Duration: + d = self.dict(exclude_none=True) + + d = {f"{k}s": v for k, v in d.items() if k not in ("mode", "bound")} + + return pendulum.duration(**d) diff --git a/edsnlp/pipelines/misc/dates/patterns/atomic/directions.py b/edsnlp/pipelines/misc/dates/patterns/atomic/directions.py index 12c57ad1d..b467ce83d 100644 --- a/edsnlp/pipelines/misc/dates/patterns/atomic/directions.py +++ 
b/edsnlp/pipelines/misc/dates/patterns/atomic/directions.py @@ -1,13 +1,13 @@ from edsnlp.utils.regex import make_pattern preceding_directions = [ - r"(?Pdepuis|depuis\s+le|il\s+y\s+a|à)", - r"(?Pdans)", + r"(?Pdepuis|depuis\s+le|il\s+y\s+a|à)", + r"(?Pdans)", ] following_directions = [ - r"(?Pprochaine?s?|suivante?s?|plus\s+tard)", - r"(?Pderni[eè]re?s?|passée?s?|pr[ée]c[ée]dente?s?|plus\s+t[ôo]t)", + r"(?Pprochaine?s?|suivante?s?|plus\s+tard)", + r"(?Pderni[eè]re?s?|passée?s?|pr[ée]c[ée]dente?s?|plus\s+t[ôo]t)", ] preceding_direction_pattern = make_pattern(preceding_directions, with_breaks=True) diff --git a/edsnlp/pipelines/misc/dates/patterns/atomic/modes.py b/edsnlp/pipelines/misc/dates/patterns/atomic/modes.py index 001c65d07..d888f5015 100644 --- a/edsnlp/pipelines/misc/dates/patterns/atomic/modes.py +++ b/edsnlp/pipelines/misc/dates/patterns/atomic/modes.py @@ -1,8 +1,8 @@ from edsnlp.utils.regex import make_pattern modes = [ - r"(?Pdepuis|depuis\s+le|[àa]\s+partir\s+d[eu]|du)", - r"(?Pjusqu'[àa]u?|au)", + r"(?Pdepuis|depuis\s+le|[àa]\s+partir\s+d[eu]|du)", + r"(?Pjusqu'[àa]u?|au)", ] mode_pattern = make_pattern(modes, with_breaks=True) diff --git a/edsnlp/pipelines/misc/dates/patterns/relative.py b/edsnlp/pipelines/misc/dates/patterns/relative.py index b37168e6f..be8279faf 100644 --- a/edsnlp/pipelines/misc/dates/patterns/relative.py +++ b/edsnlp/pipelines/misc/dates/patterns/relative.py @@ -32,10 +32,10 @@ def make_specific_pattern(mode: str = "forward"): specific = { - "minus1": (r"hier", dict(direction="PAST", day=1)), - "minus2": (r"avant[-\s]hier", dict(direction="PAST", day=2)), - "plus1": (r"demain", dict(direction="FUTURE", day=1)), - "plus2": (r"après[-\s]demain", dict(direction="FUTURE", day=2)), + "minus1": (r"hier", dict(direction="past", day=1)), + "minus2": (r"avant[-\s]hier", dict(direction="past", day=2)), + "plus1": (r"demain", dict(direction="future", day=1)), + "plus2": (r"après[-\s]demain", dict(direction="future", day=2)), } 
specific_pattern = make_pattern( diff --git a/edsnlp/pipelines/qualifiers/history/history.py b/edsnlp/pipelines/qualifiers/history/history.py index 689b77beb..d1dda0ccb 100644 --- a/edsnlp/pipelines/qualifiers/history/history.py +++ b/edsnlp/pipelines/qualifiers/history/history.py @@ -278,28 +278,22 @@ def process(self, doc: Doc) -> Doc: recent_dates = [] if self.dates: for date in doc.spans["dates"]: - if date.label_ == "relative": - if date._.date.direction.value == "CURRENT": + value = date._.date + if value.mode == "relative": + if value.direction.value == "current": if ( - ( - date._.date.year == 0 - and self.history_limit >= timedelta(365) - ) - or ( - date._.date.month == 0 - and self.history_limit >= timedelta(30) - ) + (value.year == 0 and self.history_limit >= timedelta(365)) or ( - date._.date.week == 0 - and self.history_limit >= timedelta(7) + value.month == 0 and self.history_limit >= timedelta(30) ) - or (date._.date.day == 0) + or (value.week == 0 and self.history_limit >= timedelta(7)) + or (value.day == 0) ): recent_dates.append( Span(doc, date.start, date.end, label="relative_date") ) - elif date._.date.direction.value == "PAST": - if -date._.date.to_datetime() >= self.history_limit: + elif value.direction.value == "past": + if -value.to_datetime() >= self.history_limit: history_dates.append( Span(doc, date.start, date.end, label="relative_date") ) @@ -307,9 +301,9 @@ def process(self, doc: Doc) -> Doc: recent_dates.append( Span(doc, date.start, date.end, label="relative_date") ) - elif date.label_ == "absolute" and doc._.note_datetime: + elif value.mode == "absolute" and doc._.note_datetime: try: - absolute_date = date._.date.to_datetime( + absolute_date = value.to_datetime( note_datetime=note_datetime, infer_from_context=True, tz="Europe/Paris", @@ -321,7 +315,7 @@ def process(self, doc: Doc) -> Doc: "In doc {}, the following date {} raises this error: {}. 
" "Skipping this date.", doc._.note_id, - date._.date, + value, e, ) if absolute_date: diff --git a/tests/pipelines/misc/test_dates.py b/tests/pipelines/misc/test_dates.py index 54f0057a8..5485404e1 100644 --- a/tests/pipelines/misc/test_dates.py +++ b/tests/pipelines/misc/test_dates.py @@ -6,7 +6,7 @@ from pytest import fixture from spacy.language import Language -from edsnlp.pipelines.misc.dates.models import AbsoluteDate, Direction, Mode +from edsnlp.pipelines.misc.dates.models import AbsoluteDate from edsnlp.utils.examples import parse_example TZ = pytz.timezone("Europe/Paris") @@ -16,10 +16,10 @@ "Le patient est venu en 2019 pour une " "consultation" ), - "Le patient est venu hier", + "Le patient est venu hier", "le 04/09/2021", ( - "Il est cas contact " + "Il est cas contact " "depuis la semaine dernière" ), "le 09/08", @@ -34,30 +34,30 @@ "pour..." ), ( - "Il est venu il y a " + "Il est venu il y a " "trois mois pour..." ), ( "Il lui était arrivé la même chose il y a un an." + "direction=past year=1>il y a un an." ), ( "Il est venu le 20/09/2001 pour..." 
), ( - "Consultation du 03 07 19" ), "En 11/2017 stabilité sur...", - "depuis 3 mois", + "depuis 3 mois", "- Décembre 2004 :", "- Juin 2005: ", # "-sept 2017 :", ( - "il y a 1 an " - "pdt 1 mois" + "il y a 1 an " + "pdt 1 mois" ), ( "Prélevé le : 22/04/2016 " @@ -65,7 +65,7 @@ ), "Le 07/01.", "Il est venu en août.", - "Il est venu ce jour.", + "Il est venu ce jour.", "CS le 11-01-2017 1/3", "Vu le 11 janvier\n2017 .", ] @@ -83,71 +83,33 @@ def test_dates_component(blank_nlp: Language): text, entities = parse_example(example) doc = blank_nlp(text) + spans = sorted(doc.spans["dates"] + doc.spans["durations"]) - assert len(doc.spans["dates"]) == len(entities) + assert len(spans) == len(entities) assert len(doc.ents) == len(entities) for span, entity in zip(doc.spans["dates"], entities): assert span.text == text[entity.start_char : entity.end_char] - date = span._.date + date = span._.date if span.label_ == "date" else span._.duration d = {modifier.key: modifier.value for modifier in entity.modifiers} norm = d.pop("norm") if "direction" in d: - d["direction"] = Direction[d["direction"]] - if "mode" in d: - d["mode"] = Mode[d["mode"]] + d["mode"] = "relative" + if "mode" not in d: + d["mode"] = "absolute" assert date.dict(exclude_none=True) == d assert date.norm() == norm set_d = set(d) + d.pop("mode", None) + d.pop("direction", None) + d.pop("bound", None) if isinstance(date, AbsoluteDate) and {"year", "month", "day"}.issubset( set_d ): - d.pop("direction", None) - d.pop("mode", None) - assert date.to_datetime() == TZ.localize(datetime(**d)) - - elif isinstance(date, AbsoluteDate): - assert date.to_datetime() is None - - # no year - if {"month", "day"}.issubset(set_d) and {"year"}.isdisjoint(set_d): - d["year"] = note_datetime.year - assert date.to_datetime( - note_datetime=note_datetime, infer_from_context=True - ) == TZ.localize(datetime(**d)) - - # no day - if {"month", "year"}.issubset(set_d) and {"day"}.isdisjoint(set_d): - d["day"] = 1 - assert date.to_datetime( 
- note_datetime=note_datetime, infer_from_context=True - ) == TZ.localize(datetime(**d)) - - # year only - if {"year"}.issubset(set_d) and {"day", "month"}.isdisjoint(set_d): - d["day"] = 1 - d["month"] = 1 - assert date.to_datetime( - note_datetime=note_datetime, infer_from_context=True - ) == TZ.localize(datetime(**d)) - - # month only - if {"month"}.issubset(set_d) and {"day", "year"}.isdisjoint(set_d): - d["day"] = 1 - d["year"] = note_datetime.year - assert date.to_datetime( - note_datetime=note_datetime, infer_from_context=True - ) == TZ.localize(datetime(**d)) - - if isinstance(date, AbsoluteDate) and {"year", "month", "day"}.issubset( - set_d - ): - d.pop("direction", None) - d.pop("mode", None) assert date.to_datetime() == TZ.localize(datetime(**d)) elif isinstance(date, AbsoluteDate):