diff --git a/README.md b/README.md index c4f172e..449912c 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,6 @@ The object returned by `parse_edtf()` is an instance of an `edtf.parser.parser_c PartialUnspecified OneOfASet MultipleDates - MaskedPrecision Level2Interval Level2Season ExponentialYear @@ -139,9 +138,8 @@ Test coverage includes every example given in the spec table of features. * Partial uncertain/approximate: - >>> parse_edtf('(2011)-06-04~') # year certain, month/day approximate. - # Note that the result text is normalized - PartialUncertainOrApproximate: '2011-(06-04)~' + >>> parse_edtf('2004-06~-11') # year and month approximate, day certain. + PartialUncertainOrApproximate: '2004-06~-11' * Partial unspecified: @@ -158,20 +156,44 @@ Test coverage includes every example given in the spec table of features. >>> parse_edtf('{1667,1668, 1670..1672}') MultipleDates: '{1667, 1668, 1670..1672}' -* Masked precision: - - >>> parse_edtf('197x') # A date in the 1970s. - MaskedPrecision: '197x' - * Level 2 Extended intervals: - >>> parse_edtf('2004-06-(01)~/2004-06-(20)~') - Level2Interval: '2004-06-(01)~/2004-06-(20)~' + >>> parse_edtf('2004-06-~01/2004-06-~20') + Level2Interval: '2004-06-~01/2004-06-~20' * Year requiring more than 4 digits - exponential form: - >>> parse_edtf('Y-17e7') - ExponentialYear: 'Y-17e7' + >>> e = parse_edtf('Y-17E7') + ExponentialYear: 'Y-17E7' + >>> e.estimated() + -170000000 + +* Significant digits: + # '1950S2': some year between 1900 and 1999, estimated to be 1950 + >>> d = parse_edtf('1950S2') + Date: '1950S2' + >>> d.lower_fuzzy()[:3] + (1900, 1, 1) + >>> d.upper_fuzzy()[:3] + (1999, 12, 31) # 'Y171010000S3': some year between 171000000 and 171999999, estimated to be 171010000, with 3 significant digits. 
+ >>> l = parse_edtf('Y171010000S3') + LongYear: 'Y171010000S3' + >>> l.estimated() + 171010000 + >>> l.lower_fuzzy()[:3] + (171000000, 1, 1) + >>> l.upper_fuzzy()[:3] + (171999999, 12, 31) + # 'Y3388E2S3': some year in exponential notation between 338000 and 338999, estimated to be 338800 + >>> e = parse_edtf('Y3388E2S3') + ExponentialYear: 'Y3388E2S3S3' + >>> e.estimated() + 338800 + >>> e.lower_fuzzy()[:3] + (338000, 1, 1) + >>> e.upper_fuzzy()[:3] + (338999, 12, 31) ### Natural language representation diff --git a/edtf/parser/grammar.py b/edtf/parser/grammar.py index 730f47d..e6232c4 100644 --- a/edtf/parser/grammar.py +++ b/edtf/parser/grammar.py @@ -48,8 +48,9 @@ oneThru59 = oneOf(["%.2d" % i for i in range(1, 60)]) zeroThru59 = oneOf(["%.2d" % i for i in range(0, 60)]) -positiveDigit = Word(nums, exact=1, excludeChars="0") digit = Word(nums, exact=1) +positiveDigit = Word(nums, exact=1, excludeChars="0") +positiveInteger = Combine(positiveDigit + ZeroOrMore(digit)) second = zeroThru59 minute = zeroThru59 @@ -63,13 +64,18 @@ ^ (L("02")("month") + "-" + oneThru29("day")) ) +# Significant digits suffix +significantDigits = "S" + Word(nums)("significant_digits") + # 4 digits, 0 to 9 positiveYear = Word(nums, exact=4) # Negative version of positive year, but "-0000" is illegal negativeYear = NotAny(L("-0000")) + ("-" + positiveYear) -year = Combine(positiveYear ^ negativeYear)("year") +year = Combine(positiveYear ^ negativeYear)("year") + Optional(significantDigits) +# simple version for Consecutives +year_basic = Combine(positiveYear ^ negativeYear)("year") yearMonth = year + "-" + month yearMonthDay = year + "-" + monthDay # o hai iso date @@ -112,9 +118,13 @@ # (* *** Long Year - Simple Form *** *) -longYearSimple = "Y" + Combine( - Optional("-") + positiveDigit + digit + digit + digit + OneOrMore(digit) -)("year") +longYearSimple = ( + "Y" + + Combine(Optional("-") + positiveDigit + digit + digit + digit + OneOrMore(digit))( + "year" + ) + + 
Optional(significantDigits) +) LongYear.set_parser(longYearSimple) # (* *** L1Interval *** *) @@ -238,13 +248,12 @@ def f(toks): seasonQualified = season + "^" + seasonQualifier # (* ** Long Year - Scientific Form ** *) -positiveInteger = Combine(positiveDigit + ZeroOrMore(digit)) longYearScientific = ( "Y" + Combine(Optional("-") + positiveInteger)("base") + "E" + positiveInteger("exponent") - + Optional("S" + positiveInteger("precision")) + + Optional(significantDigits) ) ExponentialYear.set_parser(longYearScientific) @@ -260,15 +269,13 @@ def f(toks): ) Level2Interval.set_parser(level2Interval) -# (* ** Masked precision ** *) eliminated in latest specs -# maskedPrecision = Combine(digit + digit + ((digit + "x") ^ "xx"))("year") -# MaskedPrecision.set_parser(maskedPrecision) - # (* ** Inclusive list and choice list** *) consecutives = ( (yearMonthDay("lower") + ".." + yearMonthDay("upper")) ^ (yearMonth("lower") + ".." + yearMonth("upper")) - ^ (year("lower") + ".." + year("upper")) + ^ ( + year_basic("lower") + ".." + year_basic("upper") + ) # using year_basic because some tests were throwing `'list' object has no attribute 'expandtabs'` - somewhere, pyparsing.parse_string() was being passed a list ) Consecutives.set_parser(consecutives) diff --git a/edtf/parser/parser_classes.py b/edtf/parser/parser_classes.py index bb9a213..e12ecbd 100644 --- a/edtf/parser/parser_classes.py +++ b/edtf/parser/parser_classes.py @@ -261,7 +261,9 @@ def get_month(self): month = property(get_month, set_month) - def __init__(self, year=None, month=None, day=None, **kwargs): + def __init__( + self, year=None, month=None, day=None, significant_digits=None, **kwargs + ): for param in ("date", "lower", "upper"): if param in kwargs: self.__init__(**kwargs[param]) @@ -270,6 +272,9 @@ def __init__(self, year=None, month=None, day=None, **kwargs): self.year = year # Year is required, but sometimes passed in as a 'date' dict. 
self.month = month self.day = day + self.significant_digits = ( + int(significant_digits) if significant_digits else None + ) def __str__(self): r = self.year @@ -277,6 +282,8 @@ def __str__(self): r += f"-{self.month}" if self.day: r += f"-{self.day}" + if self.significant_digits: + r += f"S{self.significant_digits}" return r def isoformat(self, default=date.max): @@ -286,6 +293,36 @@ def isoformat(self, default=date.max): int(self.day or default.day), ) + def lower_fuzzy(self): + if not hasattr(self, "significant_digits") or not self.significant_digits: + return apply_delta( + sub, self.lower_strict(), self._get_fuzzy_padding(EARLIEST) + ) + else: + total_digits = len(self.year) + insignificant_digits = total_digits - self.significant_digits + lower_year = ( + int(self.year) + // (10**insignificant_digits) + * (10**insignificant_digits) + ) + return struct_time([lower_year, 1, 1] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) + + def upper_fuzzy(self): + if not hasattr(self, "significant_digits") or not self.significant_digits: + return apply_delta( + add, self.upper_strict(), self._get_fuzzy_padding(LATEST) + ) + else: + total_digits = len(self.year) + insignificant_digits = total_digits - self.significant_digits + upper_year = (int(self.year) // (10**insignificant_digits) + 1) * ( + 10**insignificant_digits + ) - 1 + return struct_time( + [upper_year, 12, 31] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS + ) + def _precise_year(self, lean): # Replace any ambiguous characters in the year string with 0s or 9s if lean == EARLIEST: @@ -337,6 +374,9 @@ def precision(self): return PRECISION_MONTH return PRECISION_YEAR + def estimated(self): + return self._precise_year(EARLIEST) + class DateAndTime(EDTFObject): def __init__(self, date, time): @@ -537,11 +577,17 @@ def _get_fuzzy_padding(self, lean): class LongYear(EDTFObject): - def __init__(self, year): + def __init__(self, year, significant_digits=None): self.year = year + self.significant_digits = ( + int(significant_digits) if 
significant_digits else None + ) def __str__(self): - return f"Y{self.year}" + if self.significant_digits: + return f"Y{self.year}S{self.significant_digits}" + else: + return f"Y{self.year}" def _precise_year(self): return int(self.year) @@ -553,6 +599,45 @@ def _strict_date(self, lean): else: return struct_time([py, 12, 31] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) + def estimated(self): + return self._precise_year() + + def lower_fuzzy(self): + full_year = self._precise_year() + strict_val = self.lower_strict() + if not self.significant_digits: + return apply_delta(sub, strict_val, self._get_fuzzy_padding(EARLIEST)) + else: + insignificant_digits = len(str(full_year)) - int(self.significant_digits) + if insignificant_digits <= 0: + return apply_delta(sub, strict_val, self._get_fuzzy_padding(EARLIEST)) + padding_value = 10**insignificant_digits + sig_digits = full_year // padding_value + lower_year = sig_digits * padding_value + return apply_delta( + sub, + struct_time([lower_year, 1, 1] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS), + self._get_fuzzy_padding(EARLIEST), + ) + + def upper_fuzzy(self): + full_year = self._precise_year() + strict_val = self.upper_strict() + if not self.significant_digits: + return apply_delta(add, strict_val, self._get_fuzzy_padding(LATEST)) + else: + insignificant_digits = len(str(full_year)) - self.significant_digits + if insignificant_digits <= 0: + return apply_delta(add, strict_val, self._get_fuzzy_padding(LATEST)) + padding_value = 10**insignificant_digits + sig_digits = full_year // padding_value + upper_year = (sig_digits + 1) * padding_value - 1 + return apply_delta( + add, + struct_time([upper_year, 12, 31] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS), + self._get_fuzzy_padding(LATEST), + ) + class Season(Date): def __init__(self, year, season, **kwargs): @@ -806,10 +891,6 @@ def _strict_date(self, lean): return min([x._strict_date(lean) for x in self.objects]) -class MaskedPrecision(Date): - pass - - class Level2Interval(Level1Interval): 
def __init__(self, lower, upper): # Check whether incoming lower/upper values are single-item lists, and @@ -831,18 +912,23 @@ class Level2Season(Season): class ExponentialYear(LongYear): - def __init__(self, base, exponent, precision=None): + def __init__(self, base, exponent, significant_digits=None): self.base = base self.exponent = exponent - self.precision = precision + self.significant_digits = ( + int(significant_digits) if significant_digits else None + ) def _precise_year(self): return int(self.base) * 10 ** int(self.exponent) def get_year(self): - if self.precision: - return f"{self.base}E{self.exponent}S{self.precision}" + if self.significant_digits: + return f"{self.base}E{self.exponent}S{self.significant_digits}" else: return f"{self.base}E{self.exponent}" year = property(get_year) + + def estimated(self): + return self._precise_year() diff --git a/edtf/parser/tests.py b/edtf/parser/tests.py index 8d9a770..1ec7452 100644 --- a/edtf/parser/tests.py +++ b/edtf/parser/tests.py @@ -14,8 +14,8 @@ # where the first value is a tuple, the second item is a tuple of the normalised parse result. # # The values in the second tuple indicate the iso versions of the derived Python `date`s. -# - If there's one other value, all the derived dates should be the same. -# - If there're two other values, then all the lower values should be the same +# - If there is one other value, all the derived dates should be the same. +# - If there are two other values, then all the lower values should be the same # and all the upper values should be the same. 
# - If there are three other values, then the upper and lower ``_strict`` values # should be the first value, and the upper and lower ``_fuzzy`` values should be @@ -193,8 +193,22 @@ # the year -170000000 ("Y-17E7", ("-170000000-01-01", "-170000000-12-31")), # L2 significant digits + # Some year between 1900 and 1999, estimated to be 1950 + ("1950S2", ("1950-01-01", "1950-12-31", "1900-01-01", "1999-12-31")), + ("1953S2", ("1953-01-01", "1953-12-31", "1900-01-01", "1999-12-31")), + ("1953S3", ("1953-01-01", "1953-12-31", "1950-01-01", "1959-12-31")), # Some year between 171010000 and 171999999, estimated to be 171010000 ('S3' indicates a precision of 3 significant digits.) - # ('Y17101E4S3', ('171010000-01-01', '171999999-12-31')), + ( + "Y17101E4S3", + ("171010000-01-01", "171010000-12-31", "171000000-01-01", "171999999-12-31"), + ), + # Some year between 338000 and 338999, estimated to be 338800 + ("Y3388E2S3", ("338800-01-01", "338800-12-31", "338000-01-01", "338999-12-31")), + # some year between 171000000 and 171999999 estimated to be 171010000 + ( + "Y171010000S3", + ("171010000-01-01", "171010000-12-31", "171000000-01-01", "171999999-12-31"), + ), # L2 Seasons # Spring southern hemisphere, 2001 ("2001-29", ("2001-09-01", "2001-11-30")),