Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reformat tests to use Pytest style; fix infinite comparison for OneOfASet #45

Merged
merged 3 commits into from
May 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
- uses: actions/checkout@v4

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
cache: 'pip'
Expand Down
115 changes: 51 additions & 64 deletions edtf/natlang/tests.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,30 @@
import unittest
import pytest
from edtf.natlang.en import text_to_edtf

# TODO update the tests and code to test and output the new spec

# where examples are tuples, the second item is the normalised output
EXAMPLES = (
('active late 17th-19th centuries', '16xx/18xx'), # ignoring 'late' for now
('active 17-19th Centuries', '16xx/18xx'), # ignoring 'late' for now
@pytest.mark.parametrize("input_text,expected_output", [
# Ignoring 'late' for simplicity in these examples
('active late 17th-19th centuries', '16xx/18xx'),
('active 17-19th Centuries', '16xx/18xx'),

# Unrecognised values
('', None),
('this isn\'t a date', None),

# Explicity rejected values that would otherwise be badly converted
# Explicitly rejected values that would otherwise be badly converted
('23rd Dynasty', None),

('90', '1990'), # implied century
# Implied century and specific years
('90', '1990'), # Implied century
('1860', '1860'),
('the year 1800', '1800'),
('the year 1897', '1897'),
('January 2008', '2008-01'),
('January 12, 1940', '1940-01-12'),

# uncertain/approximate
# Uncertain or approximate dates
('1860?', '1860?'),
('1862 (uncertain)', '1862?'),
('maybe 1862', '1862?'),
Expand All @@ -31,11 +35,11 @@
('~ Feb 1812', '1812-02~'),
('circa Feb 1812', '1812-02~'),
('Feb 1812 approx', '1812-02~'),
('c1860', '1860~'), # different abbreviations
('c.1860', '1860~'), # with or without .
('c1860', '1860~'), # Different abbreviations
('c.1860', '1860~'), # With or without .
('ca1860', '1860~'),
('ca.1860', '1860~'),
('c 1860', '1860~'), # with or without space
('c 1860', '1860~'), # With or without space
('c. 1860', '1860~'),
('ca. 1860', '1860~'),
('approx 1860', '1860~'),
Expand All @@ -44,75 +48,72 @@
('approximately 1860', '1860~'),
('about 1860', '1860~'),
('about Spring 1849', '1849-21~'),
('notcirca 1860', '1860'), # avoid words containing circa
('attica 1802', '1802'),
# avoid false positive circa at the end of preceding word
('attic. 1802', '1802'), # avoid false positive circa
('notcirca 1860', '1860'), # Avoid words containing 'circa'
('attica 1802', '1802'), # Avoid false positive 'circa' at the end of preceding word
('attic. 1802', '1802'), # Avoid false positive 'circa'

# masked precision
('1860s', '186x'), # 186x has decade precision, 186u has year precision.
# Masked precision
('1860s', '186x'), # 186x has decade precision, 186u has year precision.

# masked precision + uncertainty
# Masked precision + uncertainty
('ca. 1860s', '186x~'),
('c. 1860s', '186x~'),
('Circa 1840s', '184x~'),
('circa 1840s', '184x~'),
('ca. 1860s?', '186x?~'),
('uncertain: approx 1862', '1862?~'),

# masked precision with first decade (ambiguous)
('1800s', '18xx'), # without additional uncertainty, use the century
('2000s', '20xx'), # without additional uncertainty, use the century
('c1900s', '190x~'), # if there's additional uncertainty, use the decade
('c1800s?', '180x?~'), # if there's additional uncertainty, use the decade
# Ambiguous masked precision for centuries and decades
('1800s', '18xx'), # Without additional uncertainty, use the century
('2000s', '20xx'), # Without additional uncertainty, use the century
('c1900s', '190x~'), # If there's additional uncertainty, use the decade
('c1800s?', '180x?~'), # If there's additional uncertainty, use the decade

# unspecified
# Unspecified dates
('January 12', 'uuuu-01-12'),
('January', 'uuuu-01'),
('10/7/2008', '2008-10-07'),
('7/2008', '2008-07'),

# seasons
# Seasons mapped to specific codes
('Spring 1872', '1872-21'),
('Summer 1872', '1872-22'),
('Autumn 1872', '1872-23'),
('Fall 1872', '1872-23'),
('Winter 1872', '1872-24'),

# before/after
# Dates relative to known events (before/after)
('earlier than 1928', 'unknown/1928'),
('before 1928', 'unknown/1928'),
('after 1928', '1928/unknown'),
('later than 1928', '1928/unknown'),
('before January 1928', 'unknown/1928-01'),
('before 18 January 1928', 'unknown/1928-01-18'),

# before/after approx
# Approximations combined with before/after
('before approx January 18 1928', 'unknown/1928-01-18~'),
('before approx January 1928', 'unknown/1928-01~'),
('after approx January 1928', '1928-01~/unknown'),
('after approx Summer 1928', '1928-22~/unknown'),

# before/after and uncertain/unspecificed
# Before and after with uncertain / unspecified components
('after about the 1920s', '192x~/unknown'),
('before about the 1900s', 'unknown/190x~'),
('before the 1900s', 'unknown/19xx'),

# unspecified
# Specifying unspecified components within a date
# ('decade in 1800s', '18ux'), #too esoteric
# ('decade somewhere during the 1800s', '18ux'), #lengthier. Keywords are 'in' or 'during'
('year in the 1860s', '186u'),
# 186x has decade precision, 186u has year precision.
('year in the 1800s', '18xu'),
('year in the 1860s', '186u'), # 186x has decade precision, 186u has year precision
('year in the 1800s', '18xu'),
('year in about the 1800s', '180u~'),
('month in 1872', '1872-uu'),
('day in Spring 1849', '1849-21-uu'),
('day in January 1872', '1872-01-uu'),
('day in 1872', '1872-uu-uu'),
('birthday in 1872', '1872'),
# avoid false positive at end of preceding word

# centuries
# Handling centuries with approximation and uncertainty
('1st century', '00xx'),
('10c', '09xx'),
('19th century', '18xx'),
Expand All @@ -126,7 +127,7 @@
('19c?', '18xx?'),
('c.19c?', '18xx?~'),

# BC/AD
# BC/AD dating
('1 AD', '0001'),
('17 CE', '0017'),
('127 CE', '0127'),
Expand All @@ -136,18 +137,17 @@
('c127 CE', '0127~'),
('c1270 CE', '1270~'),
('c64 BCE', '-0064~'),
('2nd century bc', '-01xx'), # -200 to -101
('2nd century bc', '-01xx'), # -200 to -101
('2nd century bce', '-01xx'),
('2nd century ad', '01xx'),
('2nd century ce', '01xx'),

# c-c-c-combo
# just showing off now...
# Combining uncertainties and approximations in creative ways
('a day in about Spring 1849?', '1849-21-uu?~'),

# simple ranges. Not all of these results are correct EDTF, but
# this is as good as the EDTF implementation and simple natural
# language parser we have.
# Simple date ranges, showcasing both the limitations and capabilities of the parser
# Not all of these results are correct EDTF, but this is as good as the EDTF implementation
# and simple natural language parser we have.
('1851-1852', '1851/1852'),
('1851-1852; printed 1853-1854', '1851/1852'),
('1851-52', '1851/1852'),
Expand All @@ -156,7 +156,6 @@
('1857-mid 1860s', '1857/186x'),
('1858/1860', '[1858, 1860]'),
('1860s-1870s', '186x/187x'),
('1861, printed 1869', '1861'),
('1910-30', '1910/1930'),
('active 1910-30', '1910/1930'),
('1861-67', '1861/1867'),
Expand All @@ -174,16 +173,13 @@
('1900; 1973', '1900'),
('1900; printed 1912', '1900'),
('1915 late - autumn 1916', '1915/1916-23'),

('1915, from Camerawork, October 1916', '1915'), # should be {1915, 1916-10}
('1915, from Camerawork, October 1916', '1915'), # should be {1915, 1916-10}
('1920s -early 1930s', '192x/193x'),
('1930s, printed early 1960s', '193x'), # should be something like {193x, 196x},
# though those forms aren't explicitly supported in the spec.
('1932, printed 1976 by Gunther Sander', '1932'), # should be {1932, 1976}
('1938, printed 1940s-1950s', '1938'), # should be something like {1938, 194x-195x}


('1938, printed 1940s-1950s', '1938') # should be something like {1938, 194x-195x}

# Uncertain and approximate on different parts of the date
# for these to work we need to recast is_uncertain and is_approximate
# such that they work on different parts. Probably worth rolling our own
# dateparser at this point.
Expand All @@ -194,22 +190,13 @@
# ('a day in about Spring in about 1849', '1849~-21~-uu'),
# ('maybe January in some year in about the 1830s', '183u~-01?'),
# ('about July? in about 1849', '1849~-07?~'),
)


class TestLevel0(unittest.TestCase):
def test_natlang(self):
"""
For each of the examples, establish that:
- the unicode of the parsed object is acceptably equal to the EDTF string
- the parsed object is a subclass of EDTFObject
:return:
"""
for i, o in EXAMPLES:
e = text_to_edtf(i)
print("%s => %s" % (i, e))
self.assertEqual(e, o)
])

def test_natlang(input_text, expected_output):
    """
    Check one natural-language example against its expected EDTF output.

    ``input_text`` is handed to ``text_to_edtf`` and the value it returns
    must equal ``expected_output``; on mismatch the assertion message names
    the offending input.
    """
    assert text_to_edtf(input_text) == expected_output, f"Failed for input: {input_text}"

if __name__ == '__main__':
unittest.main()
12 changes: 10 additions & 2 deletions edtf/parser/parser_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -713,10 +713,18 @@ def __str__(self):
return "[%s]" % (", ".join([str(o) for o in self.objects]))

def _strict_date(self, lean):
    """
    Return the boundary value of this set for the requested ``lean``.

    Every member's ``_strict_date(lean)`` is collected first. A member may
    report a float (``inf`` / ``-inf``) instead of a concrete date; floats
    are kept out of the ``max``/``min`` over concrete dates (presumably
    because the two kinds of value cannot be ordered against each other --
    confirm against the member classes).

    :param lean: ``LATEST`` selects the maximum of the member values; any
        other value selects the minimum.
    :return: the infinity pointing in the direction of ``lean`` when any
        member reports it, or when there are no members at all; otherwise
        the max/min of the concrete member values; otherwise (only floats
        were reported and none of them is that infinity) the max/min of
        those floats.
    """
    strict_dates = [x._strict_date(lean) for x in self.objects]

    # Pick the infinity that dominates in the direction we lean, and the
    # matching reducer, once -- both directions share the same logic.
    if lean == LATEST:
        unbounded, choose = float("inf"), max
    else:
        unbounded, choose = float("-inf"), min

    # One unbounded member in the lean direction decides the result.
    if any(isinstance(d, float) and d == unbounded for d in strict_dates):
        return unbounded

    # Compare only the non-float values with each other.
    concrete = [d for d in strict_dates if not isinstance(d, float)]
    if concrete:
        return choose(concrete)

    # Only floats are left and none equals `unbounded`: reduce over them
    # rather than reporting `unbounded`, which would have the wrong sign
    # (e.g. LATEST over members that all report -inf).
    if strict_dates:
        return choose(strict_dates)

    # No members at all: keep the previous fallback value.
    return unbounded


class MultipleDates(EDTFObject):
Expand Down
Loading