forked from joaonmatos/feup-machine-learning
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_cleaner.py
82 lines (69 loc) · 3.33 KB
/
data_cleaner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import csv
def no_questions(cell):
    """Return *cell* unchanged, except that a lone "?" becomes an empty string.

    Only a value equal to the single character "?" is replaced; anything
    else (including strings that merely contain a question mark) is
    returned as-is.
    """
    if cell == "?":
        return ""
    return cell
# Re-emit the semicolon-delimited account table in the csv module's "excel"
# dialect, copying every row unchanged.
# Both files are managed by the same `with` so the output is flushed and
# closed deterministically instead of being left to the garbage collector.
with open("data/account.csv", newline="") as account, \
        open("clean-data/account.csv", mode="w", newline="") as clean:
    reader = csv.reader(account, delimiter=";", quotechar="\"")
    writer = csv.writer(clean, dialect="excel")
    writer.writerows(reader)
# Clean the district table: every cell is stripped of surrounding whitespace
# and case-folded, and cells that are exactly "?" are blanked via
# no_questions(). The header row goes through the same normalisation.
# Both files are managed by the same `with` so the output is flushed and
# closed deterministically instead of being left to the garbage collector.
with open("data/district.csv", newline="") as district, \
        open("clean-data/district.csv", mode="w", newline="") as clean:
    reader = csv.reader(district, delimiter=";",
                        quotechar="\"", skipinitialspace=True)
    writer = csv.writer(clean, dialect="excel")
    for row in reader:
        writer.writerow([no_questions(elem.strip().casefold())
                         for elem in row])
# Clean the client table and derive a "gender" column from the value in
# column 1 (the birth number).
# NOTE(review): the encoding looks like YYMMDD with 50 added to the month for
# women -- confirm against the dataset description.
with open("data/client.csv", newline="") as client:
    reader = csv.reader(client, delimiter=";",
                        quotechar="\"", skipinitialspace=True)
    # The whole table is read up front; an empty input file raises IndexError
    # on the header lookup below, before the output file is touched.
    rows = list(reader)
    head = rows[0]
    head.append("gender")
    # The output file is opened in its own `with` so it is flushed and closed
    # deterministically instead of being left to the garbage collector.
    with open("clean-data/client.csv", mode="w", newline="") as clean:
        writer = csv.writer(clean, dialect="excel")
        writer.writerow([elem.strip().casefold() for elem in head])
        for row in rows[1:]:
            clean_strings = [elem.strip().casefold() for elem in row]
            birth_number = int(clean_strings[1])
            # The third- and fourth-from-last digits hold the month field.
            if (birth_number // 100) % 100 > 12:
                # A month field above 12 marks the row as a woman; subtracting
                # 5000 takes 50 off that field to restore the plain value.
                clean_strings[1] = str(birth_number - 5000)
                clean_strings.append("woman")
            else:
                clean_strings.append("man")
            writer.writerow(clean_strings)
# Re-emit the semicolon-delimited disposition table (data/disp.csv) in the
# csv module's "excel" dialect, copying every row unchanged.
# Both files are managed by the same `with` so the output is flushed and
# closed deterministically instead of being left to the garbage collector.
with open("data/disp.csv", newline="") as disposition, \
        open("clean-data/disposition.csv", mode="w", newline="") as clean:
    reader = csv.reader(disposition, delimiter=";", quotechar="\"")
    writer = csv.writer(clean, dialect="excel")
    writer.writerows(reader)
# Re-emit the semicolon-delimited card training table in the csv module's
# "excel" dialect, copying every row unchanged.
# Both files are managed by the same `with` so the output is flushed and
# closed deterministically instead of being left to the garbage collector.
with open("data/card_train.csv", newline="") as card, \
        open("clean-data/card_train.csv", mode="w", newline="") as clean:
    reader = csv.reader(card, delimiter=";", quotechar="\"")
    writer = csv.writer(clean, dialect="excel")
    writer.writerows(reader)
# Re-emit the semicolon-delimited card test table in the csv module's
# "excel" dialect, copying every row unchanged.
# Both files are managed by the same `with` so the output is flushed and
# closed deterministically instead of being left to the garbage collector.
with open("data/card_test.csv", newline="") as card, \
        open("clean-data/card_test.csv", mode="w", newline="") as clean:
    reader = csv.reader(card, delimiter=";", quotechar="\"")
    writer = csv.writer(clean, dialect="excel")
    writer.writerows(reader)
# Re-emit the semicolon-delimited transaction test table
# (data/trans_test.csv) in the csv module's "excel" dialect, copying every
# row unchanged.
# Both files are managed by the same `with` so the output is flushed and
# closed deterministically instead of being left to the garbage collector.
with open("data/trans_test.csv", newline="") as transaction, \
        open("clean-data/transaction_test.csv", mode="w", newline="") as clean:
    reader = csv.reader(transaction, delimiter=";", quotechar="\"")
    writer = csv.writer(clean, dialect="excel")
    writer.writerows(reader)
# Re-emit the semicolon-delimited transaction training table
# (data/trans_train.csv) in the csv module's "excel" dialect, copying every
# row unchanged.
# Both files are managed by the same `with` so the output is flushed and
# closed deterministically instead of being left to the garbage collector.
with open("data/trans_train.csv", newline="") as transaction, \
        open("clean-data/transaction_train.csv", mode="w", newline="") as clean:
    reader = csv.reader(transaction, delimiter=";", quotechar="\"")
    writer = csv.writer(clean, dialect="excel")
    writer.writerows(reader)
# Re-emit the semicolon-delimited loan training table in the csv module's
# "excel" dialect, copying every row unchanged.
# Both files are managed by the same `with` so the output is flushed and
# closed deterministically instead of being left to the garbage collector.
with open("data/loan_train.csv", newline="") as loan, \
        open("clean-data/loan_train.csv", mode="w", newline="") as clean:
    reader = csv.reader(loan, delimiter=";", quotechar="\"")
    writer = csv.writer(clean, dialect="excel")
    writer.writerows(reader)
# Re-emit the semicolon-delimited loan test table in the csv module's
# "excel" dialect, copying every row unchanged.
# Both files are managed by the same `with` so the output is flushed and
# closed deterministically instead of being left to the garbage collector.
with open("data/loan_test.csv", newline="") as loan, \
        open("clean-data/loan_test.csv", mode="w", newline="") as clean:
    reader = csv.reader(loan, delimiter=";", quotechar="\"")
    writer = csv.writer(clean, dialect="excel")
    writer.writerows(reader)