Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Significant Digits #55

Merged
merged 5 commits into from
May 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 35 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ The object returned by `parse_edtf()` is an instance of an `edtf.parser.parser_c
PartialUnspecified
OneOfASet
MultipleDates
MaskedPrecision
Level2Interval
Level2Season
ExponentialYear
Expand Down Expand Up @@ -139,9 +138,8 @@ Test coverage includes every example given in the spec table of features.

* Partial uncertain/approximate:

>>> parse_edtf('(2011)-06-04~') # year certain, month/day approximate.
# Note that the result text is normalized
PartialUncertainOrApproximate: '2011-(06-04)~'
>>> parse_edtf('2004-06~-11') # year and month approximate, day certain.
PartialUncertainOrApproximate: '2004-06~-11'

* Partial unspecified:

Expand All @@ -158,20 +156,44 @@ Test coverage includes every example given in the spec table of features.
>>> parse_edtf('{1667,1668, 1670..1672}')
MultipleDates: '{1667, 1668, 1670..1672}'

* Masked precision:

>>> parse_edtf('197x') # A date in the 1970s.
MaskedPrecision: '197x'

* Level 2 Extended intervals:

>>> parse_edtf('2004-06-(01)~/2004-06-(20)~')
Level2Interval: '2004-06-(01)~/2004-06-(20)~'
>>> parse_edtf('2004-06-~01/2004-06-~20')
Level2Interval: '2004-06-~01/2004-06-~20'

* Year requiring more than 4 digits - exponential form:

>>> parse_edtf('Y-17e7')
ExponentialYear: 'Y-17e7'
>>> e = parse_edtf('Y-17E7')
ExponentialYear: 'Y-17E7'
>>> e.estimated()
-170000000

* Significant digits:
# '1950S2': some year between 1900 and 1999, estimated to be 1950
>>> d = parse_edtf('1950S2')
Date: '1950S2'
>>> d.lower_fuzzy()[:3]
(1900, 1, 1)
>>> d.upper_fuzzy()[:3]
(1999, 12, 31)
# 'Y171010000S3': some year between 171000000 and 171999999, estimated to be 171010000, with 3 significant digits.
>>> l = parse_edtf('Y171010000S3')
LongYear: 'Y171010000S3'
>>> l.estimated()
171010000
>>> l.lower_fuzzy()[:3]
(171000000, 1, 1)
>>> l.upper_fuzzy()[:3]
(171999999, 12, 31)
# 'Y3388E2S3': some year in exponential notation between 338000 and 338999, estimated to be 338800
>>> e = parse_edtf('Y3388E2S3')
ExponentialYear: 'Y3388E2S3'
>>> e.estimated()
338800
>>> e.lower_fuzzy()[:3]
(338000, 1, 1)
>>> e.upper_fuzzy()[:3]
(338999, 12, 31)

### Natural language representation

Expand Down
31 changes: 19 additions & 12 deletions edtf/parser/grammar.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,9 @@
oneThru59 = oneOf(["%.2d" % i for i in range(1, 60)])
zeroThru59 = oneOf(["%.2d" % i for i in range(0, 60)])

positiveDigit = Word(nums, exact=1, excludeChars="0")
digit = Word(nums, exact=1)
positiveDigit = Word(nums, exact=1, excludeChars="0")
positiveInteger = Combine(positiveDigit + ZeroOrMore(digit))

second = zeroThru59
minute = zeroThru59
Expand All @@ -63,13 +64,18 @@
^ (L("02")("month") + "-" + oneThru29("day"))
)

# Significant digits suffix
significantDigits = "S" + Word(nums)("significant_digits")

# 4 digits, 0 to 9
positiveYear = Word(nums, exact=4)

# Negative version of positive year, but "-0000" is illegal
negativeYear = NotAny(L("-0000")) + ("-" + positiveYear)

year = Combine(positiveYear ^ negativeYear)("year")
year = Combine(positiveYear ^ negativeYear)("year") + Optional(significantDigits)
# simple version for Consecutives
year_basic = Combine(positiveYear ^ negativeYear)("year")

yearMonth = year + "-" + month
yearMonthDay = year + "-" + monthDay # o hai iso date
Expand Down Expand Up @@ -112,9 +118,13 @@

# (* *** Long Year - Simple Form *** *)

longYearSimple = "Y" + Combine(
Optional("-") + positiveDigit + digit + digit + digit + OneOrMore(digit)
)("year")
longYearSimple = (
"Y"
+ Combine(Optional("-") + positiveDigit + digit + digit + digit + OneOrMore(digit))(
"year"
)
+ Optional(significantDigits)
)
LongYear.set_parser(longYearSimple)

# (* *** L1Interval *** *)
Expand Down Expand Up @@ -238,13 +248,12 @@ def f(toks):
seasonQualified = season + "^" + seasonQualifier

# (* ** Long Year - Scientific Form ** *)
positiveInteger = Combine(positiveDigit + ZeroOrMore(digit))
longYearScientific = (
"Y"
+ Combine(Optional("-") + positiveInteger)("base")
+ "E"
+ positiveInteger("exponent")
+ Optional("S" + positiveInteger("precision"))
+ Optional(significantDigits)
)
ExponentialYear.set_parser(longYearScientific)

Expand All @@ -260,15 +269,13 @@ def f(toks):
)
Level2Interval.set_parser(level2Interval)

# (* ** Masked precision ** *) eliminated in latest specs
# maskedPrecision = Combine(digit + digit + ((digit + "x") ^ "xx"))("year")
# MaskedPrecision.set_parser(maskedPrecision)

# (* ** Inclusive list and choice list** *)
consecutives = (
(yearMonthDay("lower") + ".." + yearMonthDay("upper"))
^ (yearMonth("lower") + ".." + yearMonth("upper"))
^ (year("lower") + ".." + year("upper"))
^ (
year_basic("lower") + ".." + year_basic("upper")
) # using year_basic because some tests were throwing `'list' object has no attribute 'expandtabs'` - somewhere, pyparsing.parse_string() was being passed a list
)
Consecutives.set_parser(consecutives)

Expand Down
108 changes: 97 additions & 11 deletions edtf/parser/parser_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,9 @@ def get_month(self):

month = property(get_month, set_month)

def __init__(self, year=None, month=None, day=None, **kwargs):
def __init__(
self, year=None, month=None, day=None, significant_digits=None, **kwargs
):
for param in ("date", "lower", "upper"):
if param in kwargs:
self.__init__(**kwargs[param])
Expand All @@ -270,13 +272,18 @@ def __init__(self, year=None, month=None, day=None, **kwargs):
self.year = year # Year is required, but sometimes passed in as a 'date' dict.
self.month = month
self.day = day
self.significant_digits = (
int(significant_digits) if significant_digits else None
)

def __str__(self):
    """Return the date as text: year, optional ``S<n>``, month, then day.

    The significant-digits suffix is written directly after the year
    (``1950S2-06-01``) because that is the only position in which the
    grammar's ``year`` rule accepts it; writing it after the month or day
    would produce text that cannot be parsed back.
    """
    text = self.year
    # Instances built without going through the full ``__init__`` may lack
    # the attribute (the fuzzy-bound methods guard for this as well).
    significant = getattr(self, "significant_digits", None)
    if significant:
        text += f"S{significant}"
    if self.month:
        text += f"-{self.month}"
    if self.day:
        text += f"-{self.day}"
    return text

def isoformat(self, default=date.max):
Expand All @@ -286,6 +293,36 @@ def isoformat(self, default=date.max):
int(self.day or default.day),
)

def lower_fuzzy(self):
    """Return the earliest moment this date may denote, as a ``struct_time``.

    Without significant digits this is ``lower_strict()`` moved earlier by
    the fuzzy padding for ``EARLIEST``. With ``significant_digits`` set, only
    that many leading digits of the year are trusted and the result is
    1 January of the earliest year sharing them (``1950S2`` -> 1900-01-01).
    """
    significant = getattr(self, "significant_digits", None)
    if not significant:
        return apply_delta(
            sub, self.lower_strict(), self._get_fuzzy_padding(EARLIEST)
        )

    year = int(self.year)
    # Count the digits of the magnitude only: a leading "-" is not a digit.
    # Clamp at zero so that asking for more significant digits than the year
    # has cannot give a negative exponent (which would yield a float year).
    insignificant = max(len(self.year.lstrip("-")) - significant, 0)
    step = 10**insignificant
    floor = abs(year) // step * step
    # For a negative year the largest magnitude is the earliest year.
    lower_year = floor if year >= 0 else -(floor + step - 1)
    return struct_time([lower_year, 1, 1] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS)

def upper_fuzzy(self):
    """Return the latest moment this date may denote, as a ``struct_time``.

    Without significant digits this is ``upper_strict()`` moved later by the
    fuzzy padding for ``LATEST``. With ``significant_digits`` set, only that
    many leading digits of the year are trusted and the result is
    31 December of the latest year sharing them (``1950S2`` -> 1999-12-31).
    """
    significant = getattr(self, "significant_digits", None)
    if not significant:
        return apply_delta(
            add, self.upper_strict(), self._get_fuzzy_padding(LATEST)
        )

    year = int(self.year)
    # Count the digits of the magnitude only: a leading "-" is not a digit.
    # Clamp at zero so that asking for more significant digits than the year
    # has cannot give a negative exponent (which would yield a float year).
    insignificant = max(len(self.year.lstrip("-")) - significant, 0)
    step = 10**insignificant
    floor = abs(year) // step * step
    # For a negative year the smallest magnitude is the latest year.
    upper_year = floor + step - 1 if year >= 0 else -floor
    return struct_time(
        [upper_year, 12, 31] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS
    )

def _precise_year(self, lean):
# Replace any ambiguous characters in the year string with 0s or 9s
if lean == EARLIEST:
Expand Down Expand Up @@ -337,6 +374,9 @@ def precision(self):
return PRECISION_MONTH
return PRECISION_YEAR

def estimated(self):
    """Return the year produced by ``_precise_year(EARLIEST)``."""
    # NOTE(review): presumably an int with any ambiguous year characters
    # resolved to their earliest value -- confirm against _precise_year.
    return self._precise_year(EARLIEST)


class DateAndTime(EDTFObject):
def __init__(self, date, time):
Expand Down Expand Up @@ -537,11 +577,17 @@ def _get_fuzzy_padding(self, lean):


class LongYear(EDTFObject):
def __init__(self, year):
def __init__(self, year, significant_digits=None):
self.year = year
self.significant_digits = (
int(significant_digits) if significant_digits else None
)

def __str__(self):
return f"Y{self.year}"
if self.significant_digits:
return f"Y{self.year}S{self.significant_digits}"
else:
return f"Y{self.year}"

def _precise_year(self):
return int(self.year)
Expand All @@ -553,6 +599,45 @@ def _strict_date(self, lean):
else:
return struct_time([py, 12, 31] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS)

def estimated(self):
    """Return the year value from ``_precise_year()``.

    For ``LongYear`` itself that is ``int(self.year)``; subclasses that
    override ``_precise_year`` supply their own value.
    """
    return self._precise_year()

def lower_fuzzy(self):
    """Return the earliest moment this year may denote, as a ``struct_time``.

    Without significant digits, or when every digit of the year is
    significant, this is ``lower_strict()`` moved earlier by the fuzzy
    padding for ``EARLIEST``. Otherwise only the leading
    ``significant_digits`` digits are trusted, and the result is 1 January
    of the earliest year sharing them (again less the padding), e.g.
    ``Y171010000S3`` -> year 171000000.
    """
    full_year = self._precise_year()
    insignificant = 0
    if self.significant_digits:
        # Count the digits of the magnitude; str() of a negative year would
        # otherwise count the "-" sign as a digit.
        insignificant = len(str(abs(full_year))) - int(self.significant_digits)
    if insignificant <= 0:
        return apply_delta(
            sub, self.lower_strict(), self._get_fuzzy_padding(EARLIEST)
        )

    step = 10**insignificant
    floor = abs(full_year) // step * step
    # For a negative year the largest magnitude is the earliest year.
    lower_year = floor if full_year >= 0 else -(floor + step - 1)
    return apply_delta(
        sub,
        struct_time([lower_year, 1, 1] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS),
        self._get_fuzzy_padding(EARLIEST),
    )

def upper_fuzzy(self):
    """Return the latest moment this year may denote, as a ``struct_time``.

    Without significant digits, or when every digit of the year is
    significant, this is ``upper_strict()`` moved later by the fuzzy padding
    for ``LATEST``. Otherwise only the leading ``significant_digits`` digits
    are trusted, and the result is 31 December of the latest year sharing
    them (again plus the padding), e.g. ``Y171010000S3`` -> year 171999999.
    """
    full_year = self._precise_year()
    insignificant = 0
    if self.significant_digits:
        # Count the digits of the magnitude; str() of a negative year would
        # otherwise count the "-" sign as a digit.
        insignificant = len(str(abs(full_year))) - int(self.significant_digits)
    if insignificant <= 0:
        return apply_delta(
            add, self.upper_strict(), self._get_fuzzy_padding(LATEST)
        )

    step = 10**insignificant
    floor = abs(full_year) // step * step
    # For a negative year the smallest magnitude is the latest year.
    upper_year = floor + step - 1 if full_year >= 0 else -floor
    return apply_delta(
        add,
        struct_time([upper_year, 12, 31] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS),
        self._get_fuzzy_padding(LATEST),
    )


class Season(Date):
def __init__(self, year, season, **kwargs):
Expand Down Expand Up @@ -806,10 +891,6 @@ def _strict_date(self, lean):
return min([x._strict_date(lean) for x in self.objects])


class MaskedPrecision(Date):
pass


class Level2Interval(Level1Interval):
def __init__(self, lower, upper):
# Check whether incoming lower/upper values are single-item lists, and
Expand All @@ -831,18 +912,23 @@ class Level2Season(Season):


class ExponentialYear(LongYear):
    """A long year written in exponential form, e.g. ``Y-17E7``.

    The year value is ``base * 10**exponent``. An optional ``S<n>`` suffix
    (``Y3388E2S3``) records how many leading digits of that value are
    significant.
    """

    def __init__(self, base, exponent, significant_digits=None):
        # ``year`` is a read-only property on this class, so the fields are
        # stored directly instead of delegating to ``LongYear.__init__``.
        self.base = base
        self.exponent = exponent
        self.significant_digits = (
            int(significant_digits) if significant_digits else None
        )

    def __str__(self):
        # ``year`` already ends with the ``S<n>`` suffix. The inherited
        # ``LongYear.__str__`` would append it a second time and render
        # ``Y3388E2S3S3``, so only the ``Y`` prefix is added here.
        return f"Y{self.year}"

    def _precise_year(self):
        """Return the year as an integer: ``base * 10**exponent``."""
        return int(self.base) * 10 ** int(self.exponent)

    def get_year(self):
        """Return the exponential text without the leading ``Y``."""
        if self.significant_digits:
            return f"{self.base}E{self.exponent}S{self.significant_digits}"
        else:
            return f"{self.base}E{self.exponent}"

    year = property(get_year)

    def estimated(self):
        """Return the integer year value (same as ``_precise_year()``)."""
        return self._precise_year()
20 changes: 17 additions & 3 deletions edtf/parser/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
# where the first value is a tuple, the second item is a tuple of the normalised parse result.
#
# The values in the second tuple indicate the iso versions of the derived Python `date`s.
# - If there's one other value, all the derived dates should be the same.
# - If there're two other values, then all the lower values should be the same
# - If there is one other value, all the derived dates should be the same.
# - If there are two other values, then all the lower values should be the same
# and all the upper values should be the same.
# - If there are three other values, then the upper and lower ``_strict`` values
# should be the first value, and the upper and lower ``_fuzzy`` values should be
Expand Down Expand Up @@ -193,8 +193,22 @@
# the year -170000000
("Y-17E7", ("-170000000-01-01", "-170000000-12-31")),
# L2 significant digits
# Some year between 1900 and 1999, estimated to be 1950
("1950S2", ("1950-01-01", "1950-12-31", "1900-01-01", "1999-12-31")),
("1953S2", ("1953-01-01", "1953-12-31", "1900-01-01", "1999-12-31")),
("1953S3", ("1953-01-01", "1953-12-31", "1950-01-01", "1959-12-31")),
# Some year between 171000000 and 171999999, estimated to be 171010000 ('S3' indicates a precision of 3 significant digits.)
# ('Y17101E4S3', ('171010000-01-01', '171999999-12-31')),
(
"Y17101E4S3",
("171010000-01-01", "171010000-12-31", "171000000-01-01", "171999999-12-31"),
),
# Some year between 338000 and 338999, estimated to be 338800
("Y3388E2S3", ("338800-01-01", "338800-12-31", "338000-01-01", "338999-12-31")),
# some year between 171000000 and 171999999 estimated to be 171010000
(
"Y171010000S3",
("171010000-01-01", "171010000-12-31", "171000000-01-01", "171999999-12-31"),
),
# L2 Seasons
# Spring southern hemisphere, 2001
("2001-29", ("2001-09-01", "2001-11-30")),
Expand Down
Loading