Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make EDTF parser available as undate formatter; handle 5+ digit years #89

Merged
merged 7 commits into from
Nov 8, 2024
1 change: 1 addition & 0 deletions src/undate/date.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ class DatePrecision(IntEnum):
DECADE = 2
#: year
YEAR = 3
# TBD: season ?
#: month
MONTH = 4
#: day
Expand Down
1 change: 1 addition & 0 deletions src/undate/dateformat/edtf/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from undate.dateformat.edtf.formatter import EDTFDateFormat as EDTFDateFormat
69 changes: 69 additions & 0 deletions src/undate/dateformat/edtf/formatter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
from typing import Optional, Union

from lark.exceptions import UnexpectedCharacters

from undate.date import DatePrecision
from undate.dateformat.base import BaseDateFormat
from undate.dateformat.edtf.parser import edtf_parser
from undate.dateformat.edtf.transformer import EDTFTransformer
from undate.undate import Undate, UndateInterval

#: character written in place of an unknown digit when serializing to EDTF
EDTF_UNSPECIFIED_DIGIT: str = "X"


class EDTFDateFormat(BaseDateFormat):
    """Date format for parsing and serializing EDTF strings.

    Parsing is delegated to ``edtf_parser`` and :class:`EDTFTransformer`;
    serialization is implemented directly in :meth:`to_string`.
    """

    name: str = "EDTF"

    def __init__(self):
        # a single transformer instance is reused for every parse call
        self.transformer = EDTFTransformer()

    def parse(self, value: str) -> Union[Undate, UndateInterval]:
        """Parse an EDTF string and transform it into an undate object.

        :param value: EDTF string to parse
        :returns: the result of transforming the parse tree
        :raises ValueError: if parsing fails with ``UnexpectedCharacters``;
            the original exception is chained as the cause
        """
        # parse the input string, then transform to undate object
        try:
            parsetree = edtf_parser.parse(value)
            return self.transformer.transform(parsetree)
        except UnexpectedCharacters as err:
            # chain the parser error so the position details are not lost
            raise ValueError(
                f"Parsing failed due to UnexpectedCharacters: {err}"
            ) from err

    def _convert_missing_digits(
        self, value: Optional[str], old_missing_digit: str
    ) -> Optional[str]:
        """Swap one missing-digit character for the EDTF one.

        :param value: date part as a string, or ``None``
        :param old_missing_digit: character currently marking unknown digits
        :returns: ``value`` with every ``old_missing_digit`` replaced by
            ``EDTF_UNSPECIFIED_DIGIT``, or ``None`` if ``value`` is empty
            or ``None``
        """
        if value:
            return value.replace(old_missing_digit, EDTF_UNSPECIFIED_DIGIT)
        return None

    def to_string(self, undate: Undate) -> str:
        """Serialize an :class:`Undate` as an EDTF string.

        Year, month, and day are included according to the precision of
        the date and joined with hyphens; a part with no value is written
        as unspecified digits.

        :param undate: the date to serialize
        :returns: EDTF string; empty if no date part was included
        """
        # in theory it's possible to use the parser and reconstruct using a tree,
        # but that seems much more complicated and would be harder to read
        parts = []

        if undate.precision >= DatePrecision.YEAR:
            year = self._convert_missing_digits(undate.year, undate.MISSING_DIGIT)
            # years with more than 4 digits should be prefixed with Y
            if year and len(year) > 4:
                year = f"Y{year}"
            # TODO: handle uncertain / approximate
            parts.append(year or EDTF_UNSPECIFIED_DIGIT * 4)

        # beware when we add more date precisions,
        # week-level won't necessarily mean we know the month

        if undate.precision >= DatePrecision.MONTH:
            # TODO: handle uncertain / approximate
            parts.append(
                self._convert_missing_digits(undate.month, undate.MISSING_DIGIT)
                or EDTF_UNSPECIFIED_DIGIT * 2
            )

        if undate.precision >= DatePrecision.DAY:
            # TODO: handle uncertain / approximate
            parts.append(
                self._convert_missing_digits(undate.day, undate.MISSING_DIGIT)
                or EDTF_UNSPECIFIED_DIGIT * 2
            )

        # joining an empty list yields an empty string; we probably
        # shouldn't get here with no parts at all
        return "-".join(parts)
rlskoeser marked this conversation as resolved.
Show resolved Hide resolved
8 changes: 4 additions & 4 deletions src/undate/dateformat/edtf/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@ def day_unspecified(self, items):
def date_level1(self, items):
    """Handle a level 1 date by delegating to :meth:`date`."""
    level1_date = self.date(items)
    return level1_date

def year_fivedigitsplus(self, token):
def year_fivedigitsplus(self, items):
# strip off the leading Y and convert to integer
# TODO: undate is currently limited to 4-digit years
# (datetime max year of 9999)
return token.update(int(token[:1]))
token = items[0]
year = int(token.value.lstrip("Y"))
return Tree(data="year", children=[year])
40 changes: 40 additions & 0 deletions src/undate/undate.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def __init__(
label: Optional[str] = None,
):
# keep track of initial values and which values are known
# TODO: add validation: if str, must be expected length
self.initial_values: Dict[str, Optional[Union[int, str]]] = {
"year": year,
"month": month,
Expand Down Expand Up @@ -293,6 +294,45 @@ def is_known(self, part: str) -> bool:
def is_partially_known(self, part: str) -> bool:
    """Report whether the initial value for ``part`` was given as a string.

    :param part: key into ``initial_values``, e.g. ``"year"``
    :raises KeyError: if ``part`` is not a key of ``initial_values``
    """
    initial = self.initial_values[part]
    return isinstance(initial, str)

@property
def year(self) -> Optional[str]:
    """Year as a string, zero-padded on the left to at least 4 characters.

    When no year value is stored, returns four missing-digit characters
    if the date precision is year or greater, and ``None`` otherwise.
    """
    value = self._get_date_part("year")
    if not value:
        # value is unset but date precision is year or greater:
        # the year exists, it is just unknown
        if self.precision >= DatePrecision.YEAR:
            return self.MISSING_DIGIT * 4
        return None
    return value.rjust(4, "0")

@property
def month(self) -> Optional[str]:
    """Month as a string, zero-padded on the left to at least 2 characters.

    When no month value is stored, returns two missing-digit characters
    if the date precision is month or greater, and ``None`` otherwise.
    """
    # TODO: do we allow None for unknown month with day-level granularity?
    # TODO: need to distinguish between unknown (XX) and unset/not part of the date due to granularity
    value = self._get_date_part("month")
    if not value:
        # value is unset but date precision is month or greater:
        # report an unknown month rather than no month
        if self.precision >= DatePrecision.MONTH:
            return self.MISSING_DIGIT * 2
        return None
    return value.rjust(2, "0")

@property
def day(self) -> Optional[str]:
    """Day as a string, zero-padded on the left to at least 2 characters.

    When no day value is stored, returns two missing-digit characters
    if the date precision is exactly day, and ``None`` otherwise.
    """
    value = self._get_date_part("day")
    if not value:
        # value is unset but date precision is day: report an unknown day
        if self.precision == DatePrecision.DAY:
            return self.MISSING_DIGIT * 2
        return None
    return value.rjust(2, "0")

def _get_date_part(self, part: str) -> Optional[str]:
    """Look up the initial value for ``part`` and return it as a string.

    :param part: key into ``initial_values``, e.g. ``"year"``
    :returns: ``str()`` of the stored value, or ``None`` when the key is
        absent or the stored value is falsy
    """
    stored = self.initial_values.get(part)
    if not stored:
        # any falsy value (None, 0, empty string) is treated as unset
        return None
    return str(stored)

def duration(self) -> Timedelta:
"""What is the duration of this date?
Calculate based on earliest and latest date within range,
Expand Down
8 changes: 3 additions & 5 deletions tests/test_dateformat/edtf/test_edtf_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,10 @@
("1001-03-30", Undate(1001, 3, 30)),
("1000/2000", UndateInterval(Undate(1000), Undate(2000))),
("1000-01/2000-05-01", UndateInterval(Undate(1000, 1), Undate(2000, 5, 1))),
# # level 1
# NOTE: undate currently doesn't support most of the level 1 functionality
# NOTE: undate currently doesn't support years beyond 9999 (datetime.MAXYEAR)
# ("Y17000002", Undate(17000002)),
# level 1
("Y17000002", Undate(17000002)),
# "2001-21", # spring 2001
# # qualifiers
# qualifiers TODO - not yet supported by undate
# "1984?",
# "2004-06~",
# "2004-06-11%",
Expand Down
48 changes: 48 additions & 0 deletions tests/test_dateformat/test_edtf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import pytest

from undate.dateformat.edtf import EDTFDateFormat
from undate.undate import Undate, UndateInterval


class TestEDTFDateFormat:
    """Tests for parsing and serializing EDTF with :class:`EDTFDateFormat`."""

    def test_parse_singledate(self):
        """Single EDTF dates parse to the matching Undate."""
        exact_matches = [
            ("2002", Undate(2002)),
            ("1991-05", Undate(1991, 5)),
            ("1991-05-03", Undate(1991, 5, 3)),
        ]
        for edtf_string, expected in exact_matches:
            assert EDTFDateFormat().parse(edtf_string) == expected

        # unknown dates are not strictly equal, but string comparison should match
        string_matches = [
            ("201X", Undate("201X")),
            ("2004-XX", Undate(2004, "XX")),
        ]
        for edtf_string, expected in string_matches:
            assert str(EDTFDateFormat().parse(edtf_string)) == str(expected)
        # missing year but month/day known
        # assert EDTFDateFormat().parse("--05-03") == Undate(month=5, day=3)

    def test_parse_singledate_unequal(self):
        """Parsed dates do not compare equal to a different Undate."""
        mismatches = [
            ("2002", Undate(2003)),
            ("1991-05", Undate(1991, 6)),
            ("1991-05-03", Undate(1991, 5, 4)),
        ]
        for edtf_string, other in mismatches:
            assert EDTFDateFormat().parse(edtf_string) != other
        # missing year but month/day known
        # - does EDTF not support this or is parsing logic incorrect?
        # assert EDTFDateFormat().parse("XXXX-05-03") != Undate(month=5, day=4)

    def test_parse_invalid(self):
        """A single-digit month is expected to raise ValueError."""
        with pytest.raises(ValueError):
            assert EDTFDateFormat().parse("1991-5") == Undate(1991, 5)

    def test_parse_range(self):
        """An EDTF interval parses to an UndateInterval."""
        expected = UndateInterval(Undate(1800), Undate(1900))
        assert EDTFDateFormat().parse("1800/1900") == expected

    def test_to_string(self):
        """Undate objects serialize to the expected EDTF strings."""
        expected_output = [
            # year only: short, partially known, and 5+ digit years
            (Undate(900), "0900"),
            (Undate("80"), "0080"),
            (Undate(33), "0033"),
            (Undate("20XX"), "20XX"),
            (Undate(17000002), "Y17000002"),
            # year-month and year-month-day
            (Undate(1991, 6), "1991-06"),
            (Undate(1991, 5, 3), "1991-05-03"),
            # partially known and unset month
            (Undate(1991, "0X"), "1991-0X"),
            (Undate(1991, None, 3), "1991-XX-03"),
        ]
        for undate, edtf_string in expected_output:
            assert EDTFDateFormat().to_string(undate) == edtf_string

        # TODO: override missing digit and confirm replacement
41 changes: 41 additions & 0 deletions tests/test_undate.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,47 @@ def test_from_datetime_date(self):
assert isinstance(undate_from_date, Undate)
assert undate_from_date == Undate(2001, 3, 5)

# test properties for accessing parts of date
def test_year_property(self):
    """Year is exposed as a string of at least four characters."""
    # two, three, four, and five digit years; numeric and string
    known_years = [
        (33, "0033"),
        (567, "0567"),
        ("1984", "1984"),
        (23055, "23055"),
    ]
    for value, expected in known_years:
        assert Undate(value).year == expected
    # partially known year
    assert Undate("19XX").year == "19XX"
    # unset year
    assert Undate(month=12, day=31).year == "XXXX"

def test_month_property(self):
    """Month is exposed as a two-character string, or None when unset."""
    month_strings = [
        # one, two digit month
        (Undate(2023, 1), "01"),
        (Undate(2023, 12), "12"),
        # partially unknown month (first/second digit unknown)
        (Undate(2023, "1X"), "1X"),
        (Undate(2023, "X2"), "X2"),
        # fully unknown month
        (Undate(2023, "XX"), "XX"),
        # unset month, day precision (= some unknown month, not no month)
        (Undate(day=15), "XX"),
    ]
    for undate, expected in month_strings:
        assert undate.month == expected
    # unset month, year precision
    assert Undate(2023).month is None

def test_day_property(self):
    """Day is exposed as a two-character string, or None when unset."""
    day_strings = [
        # one, two digit day
        (Undate(2023, 1, 9), "09"),
        (Undate(2023, 1, 31), "31"),
        # partially unknown day
        (Undate(2023, 1, "1X"), "1X"),
        (Undate(2023, 1, "X5"), "X5"),
        # fully unknown day
        (Undate(2023, 1, "XX"), "XX"),
        # Day without year or month
        (Undate(day=15), "15"),
    ]
    for undate, expected in day_strings:
        assert undate.day == expected
    # unset day
    for undate in (Undate(2023), Undate(2023, 1)):
        assert undate.day is None

def test_eq(self):
assert Undate(2022) == Undate(2022)
assert Undate(2022, 10) == Undate(2022, 10)
Expand Down
Loading