-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscript_test_urns_cluster_NE.py
416 lines (358 loc) · 17.1 KB
/
script_test_urns_cluster_NE.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
# -*- coding: utf-8 -*-
"""
Created on Sat Nov 12 18:08:36 2022
@author: José Euclides de Melo Ferraz
"""
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from sklearn.cluster import DBSCAN
from scipy.stats import t
#------------------------------------------------------------------------------
# Function to calculate haversine distance
#------------------------------------------------------------------------------
def haversine_distance(r, lat1, long1, lat2, long2, degree = True):
"""
Calculates the distance between two points along a maximum circle of
a spherical surface.
Parameters
----------
r : float
Radius of the spherical surface.
lat1 : float
Latitude of the first first point.
long1 : float
Longitude of the first second point.
lat2 : float
Latitude of the second point.
long2 : TYPE
Longitude of the second point.
degree : Bool
If true, assume coordinates in degrees.
If false, assume coordinates in radians.
Returns
-------
d : float
"""
#Convert coordinates to radians, if necessary
if degree:
m = np.pi / 180
lat1 = m * lat1
long1 = m * long1
lat2 = m * lat2
long2 = m * long2
x1 = math.sin((lat2-lat1)/2)**2
x2 = math.cos(lat1) * math.cos(lat2) * math.sin((long2-long1)/2)**2
d = 2 * r * math.asin(np.sqrt(x1 + x2))
return d
#------------------------------------------------------------------------------
# Function to perform test
#-=----------------------------------------------------------------------------
def test_difference(df):
"""
Tests the difference in proportion of votes to candidate 13 between older
models and 2020 model (UE2020), using the cluster of municipalities as the
experimental units.
Parameters
----------
df : dataframe
Dataframe with the results from the 2020 Brazilian Election, as in
file `VOTOS_T1E2.xlsx`, but with the added column 'cluster'.
Returns
-------
df_summary : dataframe
DataFrame with the following columns:
- `Statistic`: ["Sample size", "Mean", "Standard deviation", "t-statistic", "p-value"]
- `First_ound` : statistics for the test of favoring to candidate 13
in older urns x model 2020, in the first round.
- `Second_ound` : statistics for the test of favoring to candidate 13
in older urns x model 2020, in the second round.
df_out : dataframe
Dataframe with the following columns:
- `REGION` : region_list
- `CLUSTER` : cluster_filter_list
- `VOTE_13_NON_UE2020_T1` : vote_13_urn_other_T1_list
- `VOTE_22_NON_UE2020_T1` : vote_22_urn_other_T1_list
- `VOTE_TOTAL_NON_UE2020_T1` : vote_total_urn_other_T1_list
- `VOTE_13_UE2020_T1` : vote_13_urn_2020_T1_list
- `VOTE_22_UE2020_T1` : vote_22_urn_2020_T1_list
- `VOTE_TOTAL_2020_T1` : vote_total_urn_2020_T1_list
- `PROP_13_NON_UE2020_T1` : proportion_13_urn_other_T1_list
- `PROP_13_UE2020_T1` : proportion_13_urn_2020_T1_list
- `DIF_13_T1`: dif_13_T1_list
- `PROP_22_NON_UE2020_T1` : proportion_22_urn_other_T1_list
- `PROP_22_UE2020_T1` : proportion_22_urn_2020_T1_list
- `DIF_22_T1` : dif_22_T1_list
- `VOTE_13_NON_UE2020_T2` : vote_13_urn_other_T2_list
- `VOTE_22_NON_UE2020_T2`: vote_22_urn_other_T2_list
- `VOTE_TOTAL_NON_UE2020_T2`: vote_total_urn_other_T2_list
- `VOTE_13_UE2020_T2` : vote_13_urn_2020_T2_list
- `VOTE_22_UE2020_T2` : vote_22_urn_2020_T2_list
- `VOTE_TOTAL_2020_T2` : vote_total_urn_2020_T2_list
- `PROP_13_NON_UE2020_T2` : proportion_13_urn_other_T2_list
- `PROP_13_UE2020_T2` : proportion_13_urn_2020_T2_list
- `DIF_13_T2` : dif_13_T2_list
- `PROP_22_NON_UE2020_T2` : proportion_22_urn_other_T2_list
- `PROP_22_UE2020_T2` : proportion_22_urn_2020_T2_list
- `DIF_22_T2` : dif_22_T2_list
"""
#Extract urn models and clusters
cluster_list_aux = df["CLUSTER"].to_list()
urn_model_list_aux = df["LOG_MODELO"].to_list()
cluster_list = list(set(cluster_list_aux))
urn_model_list = list(set(urn_model_list_aux))
cluster_list.sort()
urn_model_list.sort()
#Eliminate non-eletronic urn from the urn list
urn_model_list_aux = urn_model_list
urn_model_list = [urn for urn in urn_model_list_aux if urn != "-"]
#Sum votes for 13 on each cluster, distriminating between UE2020 and other urn models
cluster_filter_list = []
region_list = []
vote_13_urn_other_T1_list = []
vote_13_urn_2020_T1_list = []
vote_22_urn_other_T1_list = []
vote_22_urn_2020_T1_list = []
vote_total_urn_other_T1_list = []
vote_total_urn_2020_T1_list = []
proportion_13_urn_other_T1_list = []
proportion_13_urn_2020_T1_list = []
proportion_22_urn_other_T1_list = []
proportion_22_urn_2020_T1_list = []
dif_13_T1_list = []
dif_22_T1_list = []
vote_13_urn_other_T2_list = []
vote_13_urn_2020_T2_list = []
vote_22_urn_other_T2_list = []
vote_22_urn_2020_T2_list = []
vote_total_urn_other_T2_list = []
vote_total_urn_2020_T2_list = []
proportion_13_urn_other_T2_list = []
proportion_13_urn_2020_T2_list = []
proportion_22_urn_other_T2_list = []
proportion_22_urn_2020_T2_list = []
dif_13_T2_list = []
dif_22_T2_list = []
for cluster in cluster_list:
#Identify cluster
df_cluster = df.loc[df["CLUSTER"] == cluster]
#Identify urns with model not-2020 in cluster
df_other = df_cluster.loc[(df_cluster["LOG_MODELO"] != 'UE2020') & (df_cluster["LOG_MODELO"] != '-')]
#Identify urns with model 2020 in cluster
df_2020 = df_cluster.loc[df_cluster["LOG_MODELO"] == 'UE2020']
#Select cluster with both types of urn model
if len(df_other) > 0 and len(df_2020) > 0:
region = df_2020.iloc[0]["REGIAO"]
#First round
sum_13_urn_other_T1 = df_other["T1QT13"].sum()
sum_13_urn_2020_T1 = df_2020["T1QT13"].sum()
sum_22_urn_other_T1 = df_other["T1QT22"].sum()
sum_22_urn_2020_T1 = df_2020["T1QT22"].sum()
sum_total_urn_other_T1 = df_other["T1QTVAL"].sum()
sum_total_urn_2020_T1 = df_2020["T1QTVAL"].sum()
proportion_13_urn_other_T1 = sum_13_urn_other_T1 / sum_total_urn_other_T1
proportion_13_urn_2020_T1 = sum_13_urn_2020_T1 / sum_total_urn_2020_T1
proportion_22_urn_other_T1 = sum_22_urn_other_T1 / sum_total_urn_other_T1
proportion_22_urn_2020_T1 = sum_22_urn_2020_T1 / sum_total_urn_2020_T1
dif_13_T1 = proportion_13_urn_other_T1 - proportion_13_urn_2020_T1
dif_22_T1 = proportion_22_urn_other_T1 - proportion_22_urn_2020_T1
#Second round
sum_13_urn_other_T2 = df_other["T2QT13"].sum()
sum_13_urn_2020_T2 = df_2020["T2QT13"].sum()
sum_22_urn_other_T2 = df_other["T2QT22"].sum()
sum_22_urn_2020_T2 = df_2020["T2QT22"].sum()
sum_total_urn_other_T2 = df_other["T2QTVAL"].sum()
sum_total_urn_2020_T2 = df_2020["T2QTVAL"].sum()
proportion_13_urn_other_T2 = sum_13_urn_other_T2 / sum_total_urn_other_T2
proportion_13_urn_2020_T2 = sum_13_urn_2020_T2 / sum_total_urn_2020_T2
proportion_22_urn_other_T2 = sum_22_urn_other_T2 / sum_total_urn_other_T2
proportion_22_urn_2020_T2 = sum_22_urn_2020_T2 / sum_total_urn_2020_T2
dif_13_T2 = proportion_13_urn_other_T2 - proportion_13_urn_2020_T2
dif_22_T2 = proportion_22_urn_other_T2 - proportion_22_urn_2020_T2
#Append variables
cluster_filter_list.append(cluster)
region_list.append(region)
vote_13_urn_other_T1_list.append(sum_13_urn_other_T1)
vote_22_urn_other_T1_list.append(sum_22_urn_other_T1)
vote_total_urn_other_T1_list.append(sum_total_urn_other_T1)
vote_13_urn_2020_T1_list.append(sum_13_urn_2020_T1)
vote_22_urn_2020_T1_list.append(sum_22_urn_2020_T1)
vote_total_urn_2020_T1_list.append(sum_total_urn_2020_T1)
proportion_13_urn_other_T1_list.append(proportion_13_urn_other_T1)
proportion_13_urn_2020_T1_list.append(proportion_13_urn_2020_T1)
proportion_22_urn_other_T1_list.append(proportion_22_urn_other_T1)
proportion_22_urn_2020_T1_list.append(proportion_22_urn_2020_T1)
dif_13_T1_list.append(dif_13_T1)
dif_22_T1_list.append(dif_22_T1)
vote_13_urn_other_T2_list.append(sum_13_urn_other_T2)
vote_22_urn_other_T2_list.append(sum_22_urn_other_T2)
vote_total_urn_other_T2_list.append(sum_total_urn_other_T2)
vote_13_urn_2020_T2_list.append(sum_13_urn_2020_T2)
vote_22_urn_2020_T2_list.append(sum_22_urn_2020_T2)
vote_total_urn_2020_T2_list.append(sum_total_urn_2020_T2)
proportion_13_urn_other_T2_list.append(proportion_13_urn_other_T2)
proportion_13_urn_2020_T2_list.append(proportion_13_urn_2020_T2)
proportion_22_urn_other_T2_list.append(proportion_22_urn_other_T2)
proportion_22_urn_2020_T2_list.append(proportion_22_urn_2020_T2)
dif_13_T2_list.append(dif_13_T2)
dif_22_T2_list.append(dif_22_T2)
#Prepare output dataframe
df_out = pd.DataFrame()
df_out["region"] = region_list
df_out["cluster"] = cluster_filter_list
df_out["vote_13_non_UE2020_t1"] = vote_13_urn_other_T1_list
df_out["vote_22_non_UE2020_t1"] = vote_22_urn_other_T1_list
df_out["vote_total_non_UE2020_t1"] = vote_total_urn_other_T1_list
df_out["vote_13_UE2020_t1"] = vote_13_urn_2020_T1_list
df_out["vote_22_UE2020_t1"] = vote_22_urn_2020_T1_list
df_out["vote_total_2020_t1"] = vote_total_urn_2020_T1_list
df_out["prop_13_non_UE2020_t1"] = proportion_13_urn_other_T1_list
df_out["prop_13_UE2020_t1"] = proportion_13_urn_2020_T1_list
df_out["diff_13_t1"] = dif_13_T1_list
df_out["prop_22_non_UE2020_t1"] = proportion_22_urn_other_T1_list
df_out["prop_22_UE2020_t1"] = proportion_22_urn_2020_T1_list
df_out["diff_22_t1"] = dif_22_T1_list
df_out["vote_13_non_UE2020_t2"] = vote_13_urn_other_T2_list
df_out["vote_22_non_UE2020_t2"] = vote_22_urn_other_T2_list
df_out["vote_total_non_UE2020_t2"] = vote_total_urn_other_T2_list
df_out["vote_13_UE2020_t2"] = vote_13_urn_2020_T2_list
df_out["vote_22_UE2020_t2"] = vote_22_urn_2020_T2_list
df_out["vote_total_2020_t2"] = vote_total_urn_2020_T2_list
df_out["prop_13_non_UE2020_t2"] = proportion_13_urn_other_T2_list
df_out["prop_13_UE2020_t2"] = proportion_13_urn_2020_T2_list
df_out["diff_13_t2"] = dif_13_T2_list
df_out["prop_22_non_UE2020_t2"] = proportion_22_urn_other_T2_list
df_out["prop_22_UE2020_t2"] = proportion_22_urn_2020_T2_list
df_out["diff_22_t2"] = dif_22_T2_list
#Sort df_out
df_out.sort_values(by = ["region", "cluster"], ascending = [True, True], inplace = True)
#Calculate test statistic for first round - candidate 13
# H0 : d1 = 0
# H1: d1 > 0
d_13_T1_H0 = 0.0
d_13_T1 = df_out["diff_13_t1"].to_list()
d_13_T1_mean = np.mean(d_13_T1)
d_13_T1_std = np.std(d_13_T1)
n = len(df_out)
d_13_T1_t = np.sqrt(n)*(d_13_T1_mean - d_13_T1_H0)/d_13_T1_std
d_13_T1_pvalue = 1 - t.cdf(d_13_T1_t, n-1)
#Calculate test statistic for second round - candidate 13
# H0 : d2 = 0
# H1: d2 > 0
d_13_T2_H0 = 0.0
d_13_T2 = df_out["diff_13_t2"].to_list()
d_13_T2_mean = np.mean(d_13_T2)
d_13_T2_std = np.std(d_13_T2)
n = len(df_out)
d_13_T2_t = np.sqrt(n)*(d_13_T2_mean - d_13_T2_H0)/d_13_T2_std
d_13_T2_pvalue = 1 - t.cdf(d_13_T2_t, n-1)
#Create dataframe with summary statistics
df_summary = pd.DataFrame()
stat_list = ["Sample size", "Mean", "Standard deviation", "t-statistic", "p-value"]
first_round_list = [n, d_13_T1_mean, d_13_T1_std, d_13_T1_t, d_13_T1_pvalue]
second_round_list = [n, d_13_T2_mean, d_13_T2_std, d_13_T2_t, d_13_T2_pvalue]
df_summary["statistic"] = stat_list
df_summary["first_round"] = first_round_list
df_summary["second_round"] = second_round_list
return df_summary, df_out
#------------------------------------------------------------------------------
#Define oprational folder
folder = "D:\\Dados\\Users\\Euclides\\Diversos\\Brazil_was_stolen"
#Read data file
df = pd.read_excel(os.path.join(folder, "VOTOS_T1E2.xlsx"))
urn_city_name_list = df["NM_MUNICIPIO"].to_list()
urn_city_name_list_lower = [x.lower() for x in urn_city_name_list]
#Read data file with municipalities and respective geographical coordinates
df_cities = pd.read_csv(os.path.join(folder, "municipios.csv"))
city_name_list = df_cities["nome"].to_list()
city_name_list_lower = [x.lower() for x in city_name_list]
#Read data file with states
df_states = pd.read_csv(os.path.join(folder, "estados.csv"))
#Extract coordinates from municipalities
coords = np.matrix(df_cities[["latitude", "longitude"]])
#Define average radius of Earth
r = 6371.088 #km
#Define list of cuttoff distances, in km
cutoff_list = [400.0, 300.0, 200.0, 100.0, 50.0, 20.0, 10.0, 5.0, 0.1]
#Fill state name and region in df_cities
codigo_uf_list = df_states["codigo_uf"].to_list()
uf_in_df_cities_list = []
region_symbol_in_df_cities_list = []
for i, row in df_cities.iterrows():
uf_ind = codigo_uf_list.index(row["codigo_uf"])
uf = df_states.iloc[uf_ind]["uf"]
uf_in_df_cities_list.append(uf)
df_cities["uf"] = uf_in_df_cities_list
#Find urn of the city in the city_name_list
city_index_list = []
missing = -1
for city in urn_city_name_list_lower:
if city in city_name_list_lower:
city_index_list.append(city_name_list_lower.index(city))
else:
city_index_list.append(missing)
#Add city index in dataframe
df["city_index"] = city_index_list
#Filter out records of unlocated cities
df = df.loc[df["city_index"] != missing]
#Perform analysis
columns = ["cutoff", "n_t1", "diff_mean_t1", "diff_std_t2", "diff_t_t1", "diff_pvalue_t1",\
"n_t2", "diff_mean_t2", "diff_std_t2", "diff_t_t2", "diff_pvalue_t2"]
df_cutoff = pd.DataFrame(columns = columns)
for cutoff in cutoff_list:
#Get clusters based on the haversine distance
epsilon = cutoff / r
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.degrees(coords))
cluster_label_list = db.labels_
clusters = list(set(cluster_label_list))
#Append cluster in the city dataframe
df_cities_aux = df_cities.copy()
df_cities_aux["cluster"] = cluster_label_list
#Assign cluster to the urn dataset
city_index_list = df["city_index"].to_list()
urn_city_cluster_list = [cluster_label_list[x] for x in city_index_list]
df["CLUSTER"] = urn_city_cluster_list
#Separate cluster in NE from clusters in other regions
df_NE = df.loc[df["REGIAO"] == "NE"]
#Perform analysis
df_summary_cluster_NE, df_out_cluster_NE = test_difference(df_NE)
#For each cluster, find the components (cities)
comp_list = []
for i, row_i in df_out_cluster_NE.iterrows():
df_components = df_cities_aux.loc[df_cities_aux["cluster"] == row_i["cluster"]]
comp_i_str = ""
for j, row_j in df_components.iterrows():
comp_i_str += f'{row_j["nome"]} / {row_j["uf"]};'
comp_list.append(comp_i_str[0:-1])
df_out_cluster_NE["cluster_components"] = comp_list
#Append data in output dataframe
line = [cutoff] + df_summary_cluster_NE["first_round"].to_list() + df_summary_cluster_NE["second_round"].to_list()
df_cutoff.loc[len(df_cutoff)] = line
#Save last dataframes (at city level)
df_summary_cluster_NE.to_csv(os.path.join(folder, "df_summary_cluster_NE.csv"), sep=";", decimal=",", encoding = 'latin1')
df_out_cluster_NE.to_csv(os.path.join(folder, "df_out_cluster_NE.csv"), sep=";", decimal=",", encoding = 'latin1')
df_cutoff.to_csv(os.path.join(folder, "df_cutoff.csv"), sep=";", decimal=",")
#Plot histogram d_13_T1 discrimating NE x other regions - candidate 13
d_13_t1 = df_out_cluster_NE["diff_13_t1"]
plt.hist(d_13_t1)
plt.title("Hist. diff. urn non-2020 x urn 2020, for candidate 13, 1st round, NE cities")
plt.savefig(os.path.join(folder, "histogram_cluster_NE_13_T1.jpg"))
plt.show()
plt.close()
d_13_t2 = df_out_cluster_NE["diff_13_t2"]
plt.hist(d_13_t2)
plt.title("Hist. diff. urn non-2020 x urn 2020, for candidate 13, 2nd round, NE cities")
plt.savefig(os.path.join(folder, "histogram_cluster_NE_13_T2.jpg"))
plt.show()
plt.close()
#Plot p-valyues x cutoff
plt.plot(cutoff_list, df_cutoff["diff_pvalue_t1"].to_list(), cutoff_list, df_cutoff["diff_pvalue_t2"].to_list())
plt.legend(["1st round", "2nd round"])
plt.title("p-values from test H0: dmean_13 = 0 x H1: dmean_13 > 0")
plt.xlabel("cutoff (km)")
plt.ylabel("p-value")
plt.savefig(os.path.join(folder, "pvalue_cluster_NE.jpg"))
plt.show()
plt.close()