class EDTFDateFormat(BaseDateFormat):
    """Date format that parses and serializes EDTF strings.

    Parsing runs the input through ``edtf_parser`` and converts the
    resulting parse tree with an :class:`EDTFTransformer`; serialization
    assembles the string directly from the parts of an :class:`Undate`.
    """

    #: name of this date format
    name: str = "EDTF"

    def __init__(self):
        # a single transformer instance is reused by every parse() call
        self.transformer = EDTFTransformer()

    def parse(self, value: str) -> Union[Undate, UndateInterval]:
        """Parse an EDTF string into an undate object.

        :param value: the string to parse
        :returns: the object produced by transforming the parse tree
        :raises ValueError: if the parser reports unexpected characters;
            the original parser exception is attached as the cause
        """
        # parse the input string, then transform to undate object
        try:
            parsetree = edtf_parser.parse(value)
            return self.transformer.transform(parsetree)
        except UnexpectedCharacters as err:
            # chain explicitly so the parser error stays available
            # to callers as __cause__
            raise ValueError(
                "Parsing failed due to UnexpectedCharacters: %s" % err
            ) from err

    def _convert_missing_digits(
        self, value: Optional[str], old_missing_digit: str
    ) -> Optional[str]:
        """Swap one missing-digit marker for the EDTF one.

        :param value: date part as a string, or None
        :param old_missing_digit: marker character used in ``value``
        :returns: ``value`` with every ``old_missing_digit`` replaced by
            ``EDTF_UNSPECIFIED_DIGIT``, or None when ``value`` is None
            or empty
        """
        if value:
            return value.replace(old_missing_digit, EDTF_UNSPECIFIED_DIGIT)
        return None

    def to_string(self, undate: Undate) -> str:
        """Serialize an :class:`Undate` as an EDTF string.

        Year, month and day are included according to the precision of
        the date and joined with ``-``. A part that is within the
        precision but has no value is written with unspecified digits.

        :param undate: the date to serialize
        :returns: the EDTF string; empty if no part is within precision
        """
        # in theory it's possible to use the parser and reconstruct using
        # a tree, but that seems much more complicated and would be
        # harder to read
        parts = []

        if undate.precision >= DatePrecision.YEAR:
            year = self._convert_missing_digits(undate.year, undate.MISSING_DIGIT)
            # years with more than 4 digits should be prefixed with Y
            if year and len(year) > 4:
                year = f"Y{year}"
            # TODO: handle uncertain / approximate
            parts.append(year or EDTF_UNSPECIFIED_DIGIT * 4)

        # beware when we add more date precisions,
        # week-level won't necessarily mean we know the month

        if undate.precision >= DatePrecision.MONTH:
            # TODO: handle uncertain / approximate
            month = self._convert_missing_digits(undate.month, undate.MISSING_DIGIT)
            parts.append(month or EDTF_UNSPECIFIED_DIGIT * 2)

        if undate.precision >= DatePrecision.DAY:
            # TODO: handle uncertain / approximate
            day = self._convert_missing_digits(undate.day, undate.MISSING_DIGIT)
            parts.append(day or EDTF_UNSPECIFIED_DIGIT * 2)

        # joining an empty list already yields "", so no separate
        # fallback is needed for the (unexpected) no-parts case
        return "-".join(parts)
@property
def year(self) -> Optional[str]:
    """Year as a string of at least four characters.

    Returns the stored year right-aligned and zero-padded to four
    characters. When no year is stored but the date precision is year
    or finer, returns four ``MISSING_DIGIT`` characters; otherwise None.
    """
    year = self._get_date_part("year")
    if year:
        return f"{year:>04}"
    # if value is unset but date precision is year or greater,
    # return unknown year
    if self.precision >= DatePrecision.YEAR:
        return self.MISSING_DIGIT * 4
    return None


@property
def month(self) -> Optional[str]:
    """Month as a two-character string.

    Returns the stored month zero-padded to two characters. When no
    month is stored but the date precision is month or finer, returns
    two ``MISSING_DIGIT`` characters; otherwise None.
    """
    # TODO: do we allow None for unknown month with day-level granularity?
    # TODO: need to distinguish between unknown (XX) and unset/not part
    # of the date due to granularity
    month = self._get_date_part("month")
    if month:
        return f"{month:>02}"
    # if value is unset but date precision is month or greater,
    # return unknown month
    if self.precision >= DatePrecision.MONTH:
        return self.MISSING_DIGIT * 2
    return None


@property
def day(self) -> Optional[str]:
    """Day as a two-character string.

    Returns the stored day zero-padded to two characters. When no day
    is stored but the date precision is day, returns two
    ``MISSING_DIGIT`` characters; otherwise None.
    """
    day = self._get_date_part("day")
    if day:
        return f"{day:>02}"
    # if value is unset but date precision is day, return unknown day
    # NOTE(review): compares with == where year/month use >=; presumably
    # equivalent while DAY is the finest precision - confirm if finer
    # precisions are ever added
    if self.precision == DatePrecision.DAY:
        return self.MISSING_DIGIT * 2
    return None


def _get_date_part(self, part: str) -> Optional[str]:
    """Return the initial value for a date part as a string.

    :param part: key to look up in ``initial_values``
    :returns: ``str()`` of the stored value, or None when the part is
        absent, None, or an empty string
    """
    value = self.initial_values.get(part)
    # test for "no value" explicitly rather than by truthiness, so an
    # integer 0 is reported as "0" instead of being treated as unset
    if value is None or value == "":
        return None
    return str(value)
class TestEDTFDateFormat:
    """Tests for parsing and serializing EDTF strings with EDTFDateFormat."""

    def test_parse_singledate(self):
        """Single dates parse to the matching Undate."""
        assert EDTFDateFormat().parse("2002") == Undate(2002)
        assert EDTFDateFormat().parse("1991-05") == Undate(1991, 5)
        assert EDTFDateFormat().parse("1991-05-03") == Undate(1991, 5, 3)
        # unknown dates are not strictly equal, but string comparison should match
        assert str(EDTFDateFormat().parse("201X")) == str(Undate("201X"))
        assert str(EDTFDateFormat().parse("2004-XX")) == str(Undate(2004, "XX"))
        # missing year but month/day known
        # assert EDTFDateFormat().parse("--05-03") == Undate(month=5, day=3)

    def test_parse_singledate_unequal(self):
        """Parsed dates do not compare equal to a different Undate."""
        assert EDTFDateFormat().parse("2002") != Undate(2003)
        assert EDTFDateFormat().parse("1991-05") != Undate(1991, 6)
        assert EDTFDateFormat().parse("1991-05-03") != Undate(1991, 5, 4)
        # missing year but month/day known
        # - does EDTF not support this or is parsing logic incorrect?
        # assert EDTFDateFormat().parse("XXXX-05-03") != Undate(month=5, day=4)

    def test_parse_invalid(self):
        """A single-digit month is rejected with ValueError."""
        # only the call belongs inside the context manager: parse() is
        # expected to raise, so a comparison of its result could never run
        with pytest.raises(ValueError):
            EDTFDateFormat().parse("1991-5")

    def test_parse_range(self):
        """A slash-separated pair parses to an UndateInterval."""
        assert EDTFDateFormat().parse("1800/1900") == UndateInterval(
            Undate(1800), Undate(1900)
        )

    def test_to_string(self):
        """Undate objects serialize to the expected EDTF strings."""
        # years: padded to four digits, unknown digits kept,
        # more than four digits prefixed with Y
        assert EDTFDateFormat().to_string(Undate(900)) == "0900"
        assert EDTFDateFormat().to_string(Undate("80")) == "0080"
        assert EDTFDateFormat().to_string(Undate(33)) == "0033"
        assert EDTFDateFormat().to_string(Undate("20XX")) == "20XX"
        assert EDTFDateFormat().to_string(Undate(17000002)) == "Y17000002"

        # month and day precision
        assert EDTFDateFormat().to_string(Undate(1991, 6)) == "1991-06"
        assert EDTFDateFormat().to_string(Undate(1991, 5, 3)) == "1991-05-03"

        # partially known month; unset month with a known day
        assert EDTFDateFormat().to_string(Undate(1991, "0X")) == "1991-0X"
        assert EDTFDateFormat().to_string(Undate(1991, None, 3)) == "1991-XX-03"

        # TODO: override missing digit and confirm replacement
def test_day_property(self):
    """The day property returns a two-character string, or None if unset."""
    # day passed with year 2023, month 1 -> expected property value:
    # one and two digit days, partially unknown days, fully unknown day
    with_full_date = [
        (9, "09"),
        (31, "31"),
        ("1X", "1X"),
        ("X5", "X5"),
        ("XX", "XX"),
    ]
    for day_value, expected in with_full_date:
        assert Undate(2023, 1, day_value).day == expected

    # unset day
    for without_day in (Undate(2023), Undate(2023, 1)):
        assert without_day.day is None

    # Day without year or month
    assert Undate(day=15).day == "15"