-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcreate_data.py
126 lines (95 loc) · 5.32 KB
/
create_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import pandas as pd
import numpy as np
import os
from functions.sub_data_prep import prep_data, scale_timeseries, transform_to_timeseries
from global_vars import T_MAX, GAMMA
from global_vars import path_data, path_data_backtesting
def _save_npy(directory, filename, array):
    '''Write `array` to `directory/filename` in NumPy's binary .npy format.'''
    with open(os.path.join(directory, filename), 'wb') as f:
        np.save(f, array)


def _discount_targets(x_ts, y_ts, gamma):
    '''
    Return a list with one discounted target array per entry of `y_ts`.

    For every pair (x_val, y_val) the target is multiplied by
    gamma ** (x_val[:, :, 3:4] * step), where step = 0, 1, ..., steps-1 runs
    along axis 1 of x_val. Feature 3 of the contract data therefore acts as a
    per-step exponent weight, so that the discount respects the frequency of
    observations (e.g. monthly, semi-annually, annually).
    '''
    discounted = []
    for x_val, y_val in zip(x_ts, y_ts):
        # shape (1, steps, 1) so that it broadcasts against x_val[:, :, 3:4]
        step_index = np.arange(x_val.shape[1]).reshape(1, -1, 1)
        discounted.append(y_val * gamma**(x_val[:, :, 3:4] * step_index))
    return discounted


def _zero_pad(x_ts, x_ts_raw, y_ts_discounted, n_features_x, n_features_y):
    '''
    Stack sequences of different lengths into dense, zero-padded float32 arrays.

    Each element of `x_ts` is indexed as (contracts, steps, features). All
    contracts are stacked along axis 0 and every sequence is right-padded with
    zeros up to the longest number of steps found in `x_ts`.

    Returns the tuple (x, x_raw, y); all three share the first two dimensions.
    '''
    n_contracts = sum(len(el) for el in x_ts)
    max_len = max((el.shape[1] for el in x_ts), default=0)

    x = np.zeros((n_contracts, max_len, n_features_x), dtype=np.float32)
    x_raw = np.zeros((n_contracts, max_len, n_features_x), dtype=np.float32)
    y = np.zeros((n_contracts, max_len, n_features_y), dtype=np.float32)

    pointer = 0  # first free row in the padded arrays
    for el_x, el_x_raw, el_yd in zip(x_ts, x_ts_raw, y_ts_discounted):
        batch_sz = len(el_x)
        steps = el_x.shape[1]
        rows = slice(pointer, pointer + batch_sz)
        x[rows, 0:steps] = el_x
        x_raw[rows, 0:steps] = el_x_raw
        y[rows, 0:steps] = el_yd
        pointer += batch_sz
    return x, x_raw, y


def run_main(mode = 'train', kfolds=2):
    '''
    Preprocess the contract portfolio and store it as .npy files in path_data.

    Steps:
    1) Fetch csv-data, i.e. a portfolio of contracts
    2) scale data
    3) obtain target data, i.e. discounted cash flows, which depend on
       - the discounting factor GAMMA (HParam) and
       - the implicit assumption on the cost structure in the
         transform_to_timeseries function
    4) apply zero-padding to target and contract data to speed up neural
       training later on
    5) save the processed data (complete set and one file per
       cross-validation fold)

    Parameters:
    mode   -- 'train' or 'test'; also part of the output file names.
    kfolds -- number of cross-validation folds; only 2 is accepted.

    Files written to path_data:
    Data.npy, x_{mode}.npy, x_{mode}_raw.npy, y_{mode}.npy and, for every fold
    k, x_{mode}_cv_{k}.npy, x_{mode}_raw_cv_{k}.npy, y_{mode}_cv_{k}.npy.

    Raises:
    AssertionError if mode or kfolds is not supported.

    Note:
    For training (mode = train) we assume the premiums to be known.
    For economic testing (mode = test) of the final neural network
    architecture we set the premium equal to zero and infer it from the
    trained transition probabilities. More detail for the testing-mode is
    given in the respective python-script. However, we use this
    preprocessing-function for creating the scaled contract data and the
    cash-flow data without premium related costs.
    '''
    assert mode in ['train', 'test'] # sanity-check for user input
    assert kfolds==2, 'only 2fold crossvalidation implemented atm'

    #### load original data for processing
    data = pd.read_csv(os.path.join(path_data,r'Tarifierung_RI_2017.csv'), delimiter=';' )

    # Note: scaler fit, but not used to scale/ transform data yet (!!)
    data, scaler = prep_data(data, scale_age = (0, T_MAX))
    # Data.npy is written before the premium reset below, i.e. it holds the
    # premium values in both modes.
    _save_npy(path_data, 'Data.npy', data)

    if mode == 'test':
        # set premium values to 0
        # Note from prep_data(): data = x[['x', 'n', 't', 'ZahlweiseNum','Beginnjahr', 'Beginnmonat', 'GeschlechtNum', 'RauchertypNum', 'Leistung', 'tba']].values
        data[:,-1] = 0

    # transform to ts; data-format: list of sequences of different lengths
    x_ts_raw, y_ts = transform_to_timeseries(data)

    # scale ts (x-data only)
    # Important: the scikit-learn scaler is not compatible with a manually set
    # feature-range, i.e. scaler.transform(x) should not be used
    # -> custom function scale_timeseries()
    x_ts = scale_timeseries(x_ts_raw, scaler)

    # create new target data y_ts_discounted, which includes discounting straight away
    # NOTE(review): the exponent reads feature 3 from the *scaled* series x_ts;
    # presumably scale_timeseries leaves that column unchanged - confirm.
    y_ts_discounted = _discount_targets(x_ts, y_ts, GAMMA)

    # The sequences stay plain Python lists here. Wrapping them in
    # np.array(..., dtype='object') is only needed when the ragged data itself
    # is dumped for debugging (e.g. into path_data_backtesting), and that
    # conversion can fail for arrays that share the first dimension but differ
    # in sequence length.

    # create zero-padded data
    N_features_x = x_ts[0].shape[-1]
    N_features_y = y_ts[0].shape[-1]
    x, x_raw, y = _zero_pad(x_ts, x_ts_raw, y_ts_discounted, N_features_x, N_features_y)

    _save_npy(path_data, r'x_{}.npy'.format(mode), x)
    _save_npy(path_data, r'x_{}_raw.npy'.format(mode), x_raw)
    _save_npy(path_data, r'y_{}.npy'.format(mode), y)

    # data for cross validation study: fold k holds every kfolds-th contract,
    # starting at row k
    for k in range(kfolds):
        _save_npy(path_data, r'x_{}_cv_{}.npy'.format(mode, k), x[k::kfolds])
        _save_npy(path_data, r'x_{}_raw_cv_{}.npy'.format(mode, k), x_raw[k::kfolds])
        _save_npy(path_data, r'y_{}_cv_{}.npy'.format(mode, k), y[k::kfolds])
if __name__ == '__main__':
    # First pass ('train'): data and cash-flows with premium-values.
    # Second pass ('test'): data and cash-flows without premium-values
    # (resp. premium set to 0).
    for run_mode in ('train', 'test'):
        run_main(mode=run_mode)