-
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathnatural_to_date.py
143 lines (118 loc) · 4.81 KB
/
natural_to_date.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import csv
from datetime import datetime
from datasets import load_dataset
import random
import time
# Seed the module-level RNG from the current wall-clock time, so the sampled
# dates, chosen formats and shuffle order differ from run to run.
random.seed(time.time())
def get_dates(date_str):
    """
    Parse a date-range string into its start and end datetimes.

    The input holds two dates separated by a single hyphen, e.g.
    "MM/DD/YYYY-MM/DD/YYYY" or "MM/DD/YY-MM/DD/YY". Whitespace around
    either date is ignored. Each side is parsed with the format that
    matches its own year width, so a range may mix 2- and 4-digit years.

    Returns:
        A (start, end) tuple of naive datetime objects.

    Raises:
        ValueError: if the string does not split into exactly two parts
            on "-", if a year part is not 2 or 4 characters long, or if
            strptime rejects one of the dates.
    """
    date_range = date_str.split("-")
    if len(date_range) != 2:
        raise ValueError("Input date string is not in the correct format 'MM/DD/YYYY-MM/DD/YYYY' or 'MM/DD/YY-MM/DD/YY'")

    parsed = []
    for part in date_range:
        part = part.strip()
        # The width of the text after the last "/" selects %y vs %Y.
        year_length = len(part.split("/")[-1])
        if year_length == 2:
            date_format = '%m/%d/%y'
        elif year_length == 4:
            date_format = '%m/%d/%Y'
        else:
            raise ValueError("Year part of the date is not in a recognized format")
        parsed.append(datetime.strptime(part, date_format))

    return parsed[0], parsed[1]
from functools import lru_cache
@lru_cache(maxsize=None)
def scorer(date):
    """
    Collapse a date into one sortable integer of the form YYYYMMDD.

    `date` must be hashable (results are memoised) and expose `year`,
    `month` and `day` attributes, as a datetime does. A year below 100
    is treated as a two-digit year and shifted into the 2000s.
    """
    # Two-digit years are assumed to belong to the 2000s.
    full_year = date.year + 2000 if date.year < 100 else date.year
    return full_year * 10000 + date.month * 100 + date.day
def is_similar(start_date1, end_date1, start_date2, end_date2):
    """
    Report whether two date ranges are close enough to count as similar.

    Ranges whose start years differ by more than two are never similar.
    Otherwise the ranges are similar when they overlap, with both end
    points treated as inclusive. Each range is assumed to satisfy
    start <= end.

    Returns:
        True if the ranges are similar, False otherwise.
    """
    if abs(start_date1.year - start_date2.year) > 2:
        return False
    # Two inclusive ranges overlap exactly when each one starts no later
    # than the other ends. Checking only whether range 1's end points fall
    # inside range 2 would miss the case where range 1 fully contains range 2.
    return start_date1 <= end_date2 and start_date2 <= end_date1
# Load data from CSV files into dictionaries
def load_data(file_path, data_dict):
    """
    Read key/value pairs from a CSV file into `data_dict`.

    Column 0 of each row becomes the key and column 1 the value; a later
    row with the same key overwrites an earlier one. A row is skipped when
    it has fewer than two columns (blank lines included), when any of its
    cells equals "Computed" (presumably the header row), or when its
    value contains the substring "xx".

    Args:
        file_path: path of the CSV file to read.
        data_dict: dictionary that is updated in place.
    """
    # newline='' is what the csv module asks for, so that it can handle
    # line endings (including ones inside quoted fields) itself.
    with open(file_path, 'r', newline='') as file:
        for row in csv.reader(file):
            # Blank or one-column rows would raise IndexError on row[1].
            if len(row) < 2:
                continue
            if "Computed" in row or "xx" in row[1]:
                continue
            data_dict[row[0]] = row[1]
def generate_hash_map(data_dict):
    """
    Map every key of `data_dict` to the scores of its date range.

    Each value must be a date-range string accepted by get_dates. The
    result holds one (start_score, end_score) tuple per key, where each
    score is produced by scorer.
    """
    return {
        key: tuple(scorer(bound) for bound in get_dates(date_range))
        for key, date_range in data_dict.items()
    }
# Locations of the input CSV files, one per phrase category
seasons_file = 'csv/seasons.csv'
dates_file = 'csv/dates_updated.csv'
lastx_file = 'csv/lastx_updated.csv'
relatives_dates_file = 'csv/relatives_dates_updated.csv'
months_file = 'csv/months.csv'

# One dictionary per category, filled from column 0 -> column 1 of its file
season_dict, dates_dict, lastx_dict = {}, {}, {}
relativedate_dict, monthsdict = {}, {}

# Populate each dictionary from its file; load_data skips unwanted rows
for _path, _target in (
    (seasons_file, season_dict),
    (dates_file, dates_dict),
    (lastx_file, lastx_dict),
    (relatives_dates_file, relativedate_dict),
    (months_file, monthsdict),
):
    load_data(_path, _target)

# strftime layouts a generated date may be rendered in
date_formats = ['%m/%d/%Y', '%d/%m/%Y', '%Y/%m/%d', '%Y-%m-%d', '%m-%d-%Y']
# Output goes to csv/natural_to_date.csv as pipe-delimited CSV, starting with
# a 'Value'|'Computed' header row.
file = open('csv/natural_to_date.csv', 'w', newline='')
writer = csv.writer(file, delimiter='|')
writer.writerow(['Value', 'Computed'])
# Accumulates [key, formatted date] pairs from every input dictionary.
rows = []
# NOTE(review): indentation is missing in this copy of the file, so it is not
# visible whether the range(5) repetition covers only the seasons loop or also
# loops that follow it — confirm against the original before re-indenting.
for i in range(5):
for season_key, season_date in season_dict.items():
season_start, season_end = get_dates(season_date)
# Pick a random moment between the start and end of the range
# (random.random() scales the timedelta by a factor in [0, 1)).
random_date = season_start + (season_end - season_start) * random.random()
# Render the date in one of the layouts from date_formats, chosen at random.
date_format = random.choice(date_formats)
rows.append([season_key, random_date.strftime(date_format)])
# Same sampling for the entries loaded from dates_file.
for date_key, date_str in dates_dict.items():
start_date, end_date = get_dates(date_str)
random_date = start_date + (end_date - start_date) * random.random()
date_format = random.choice(date_formats)
rows.append([date_key, random_date.strftime(date_format)])
# Disabled repetition loop, kept from the original:
#for i in range(3):
# Report how many lastx entries were loaded.
print(len(lastx_dict), "lastx size")
# Same sampling for the entries loaded from lastx_file.
for lastx_key, lastx_date in lastx_dict.items():
lastx_start, lastx_end = get_dates(lastx_date)
random_date = lastx_start + (lastx_end - lastx_start) * random.random()
date_format = random.choice(date_formats)
rows.append([lastx_key, random_date.strftime(date_format)])
# Report how many relative-date entries were loaded.
print(len(relativedate_dict), "relativedate size")
# Same sampling for the entries loaded from relatives_dates_file.
for relative_key, relative_date in relativedate_dict.items():
relative_start, relative_end = get_dates(relative_date)
random_date = relative_start + (relative_end - relative_start) * random.random()
date_format = random.choice(date_formats)
rows.append([relative_key, random_date.strftime(date_format)])
# Disabled repetition loop, kept from the original:
#for i in range(3):
# Same sampling for the entries loaded from months_file.
for month_key, month_date in monthsdict.items():
month_start, month_end = get_dates(month_date)
random_date = month_start + (month_end - month_start) * random.random()
date_format = random.choice(date_formats)
rows.append([month_key, random_date.strftime(date_format)])
# Shuffle so rows from the different input files are interleaved in the output.
random.shuffle(rows)
writer.writerows(rows)
# The output file was opened without a context manager, so close it
# explicitly to make sure every buffered row is flushed to disk.
file.close()
print(f"wrote {len(rows)} rows to natural_to_date.csv")