Skip to content

Commit

Permalink
First steps in adding a second model based on the BRD dataset (not GED); this second model is not functional yet
Browse files Browse the repository at this point in the history
  • Loading branch information
Bvlampe committed Oct 26, 2022
1 parent eddc639 commit fc6f92c
Show file tree
Hide file tree
Showing 8 changed files with 1,596 additions and 11 deletions.
2 changes: 1 addition & 1 deletion .idea/CCIP dataset prep.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1,404 changes: 1,404 additions & 0 deletions BRD.csv

Large diffs are not rendered by default.

30 changes: 30 additions & 0 deletions Countries_BRD.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
BRD,homicides
Laos,Lao PDR
UnitedKingdom,United Kingdom
Serbia(Yugoslavia),Serbia
FYROM,North Macedonia
SierraLeone,Sierra Leone
Bosnia-Herzegovina,Bosnia and Herzegovina
Syria,Syrian Arab Republic
Egypt,"Egypt, Arab Rep."
Kyrgyzstan,Kyrgyz Republic
Myanmar(Burma),Myanmar
TrinidadandTobago,Trinidad and Tobago
SolomonIslands,Solomon Islands
Turkey,Turkiye
SouthSudan,South Sudan
IvoryCoast,Cote d'Ivoire
PapuaNewGuinea,Papua New Guinea
SriLanka,Sri Lanka
Russia(SovietUnion),Russian Federation
Venezuela,"Venezuela, RB"
CentralAfricanRepublic,Central African Republic
BurkinaFaso,Burkina Faso
Congo,"Congo, Rep."
DRCongo(Zaire),"Congo, Dem. Rep."
UnitedStatesofAmerica,United States
Iran,"Iran, Islamic Rep."
Yemen(NorthYemen),"Yemen, Rep."
SaudiArabia,Saudi Arabia
ElSalvador,El Salvador
Cambodia(Kampuchea),Cambodia
File renamed without changes.
169 changes: 160 additions & 9 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
loc_homicides = "homicides.csv"
loc_ged = "GED_cleaned.csv"
loc_conflict_all = "ucdp-prio-acd-221.csv"
loc_concordance = "Country_names.csv"
loc_population = "CV_population.csv"

loc_population = "CV_population.csv"
loc_brd = "BRD.csv"

def avg_years(values):
try:
Expand All @@ -23,7 +23,8 @@ def avg_years(values):
print([type(x) for x in values])
sys.exit("Error in averaging the homicide rates for the following series:")

def prep():
def prepGED():
loc_concordance = "Countries_GED.csv"
# Read datasets
homicides = pd.read_csv(loc_homicides)
homicides.drop(columns=homicides.columns[0], axis=1, inplace=True)
Expand Down Expand Up @@ -125,7 +126,7 @@ def prep():
cc_ivdv.loc[i, "CV_pop"] = df_population.loc[df_population["Country Name"] == country, str(year)].values[0]

# Output "dirty" dataset
cc_ivdv.to_csv("output_dirty.csv")
cc_ivdv.to_csv("output_GED_dirty.csv")

# Determine SD for important variables, set outliers to none (beyond 2SD)
for variable in ["HR_rel_change", "CV_global_homicides"]:
Expand All @@ -140,14 +141,164 @@ def prep():
inplace=True)

# Output of csv for analysis
cc_ivdv.to_csv("output.csv")
cc_ivdv.to_csv("output_GED.csv")

return 0


def prepBRD():
    """Build the country-conflict dataset from the UCDP battle-related-deaths
    (BRD) data, mirroring prepGED() but deriving countries from the
    comma-separated ``battle_location`` field instead of event coordinates.

    Reads BRD.csv, homicides.csv, ucdp-prio-acd-221.csv, CV_population.csv and
    Countries_BRD.csv; writes output_BRD_dirty.csv and output_BRD.csv.
    Returns 0 on success.

    NOTE(review): the original was committed "not functional yet".  This
    version fixes the crashes (set mutated while iterated, set indexing,
    groupby(by=None), dead code after a stray early return, wrong concordance
    columns/separator) and melts the per-country dummies into a ``country``
    column so the country-conflict aggregation has a key to group on.
    """
    loc_concordance = "Countries_BRD.csv"

    # Read datasets
    df_brd = pd.read_csv(loc_brd)[["conflict_id", "year", "battle_location", "bd_best"]]
    homicides = pd.read_csv(loc_homicides)
    homicides.drop(columns=homicides.columns[0], axis=1, inplace=True)
    conflict_all = pd.read_csv(loc_conflict_all)
    conflict_new = conflict_all[["conflict_id", "start_date2", "ep_end", "ep_end_date", "year"]]
    df_population = pd.read_csv(loc_population, sep=';', header=1)

    # Concordance dict: BRD country spelling -> homicides dataset spelling.
    # (Countries_BRD.csv is comma-separated with columns "BRD","homicides".)
    country_df = pd.read_csv(loc_concordance)
    country_dict = dict(zip(list(country_df["BRD"]), list(country_df["homicides"])))

    def harmonize(name):
        # Map a BRD spelling onto the homicides spelling; names without a
        # concordance entry pass through unchanged.
        return country_dict.get(name, name)

    # Build the set of harmonized battle-location countries.  The original
    # removed elements from the set while iterating over it (RuntimeError)
    # and indexed into the set (TypeError); harmonizing on insertion avoids
    # both.
    BRD_countries = set()
    for location in df_brd["battle_location"]:
        for c in location.replace(' ', '').split(','):
            BRD_countries.add(harmonize(c))

    # Create dummy variables per country
    for c in BRD_countries:
        df_brd[c] = 0

    # Assign values to the dummy variables
    for i in df_brd.index:
        for c in df_brd.loc[i, "battle_location"].replace(' ', '').split(','):
            df_brd.loc[i, harmonize(c)] = 1
    df_brd.drop(columns=["battle_location"], inplace=True)

    # Mark the rows of conflicts that are over
    df = conflict_new.loc[:, ["conflict_id", "start_date2", "ep_end"]].groupby(["conflict_id", "start_date2"]).sum()
    df.reset_index(inplace=True)
    df.rename(columns={"ep_end": "has_ended"}, inplace=True)
    conflict_new = conflict_new.merge(df, left_on=["conflict_id", "start_date2"],
                                      right_on=["conflict_id", "start_date2"])

    # Remove presently ongoing conflicts
    conflict_new = conflict_new[conflict_new["has_ended"] == 1]

    # Fill end date for all conflict-years, add start and end year as well as duration
    conflict_new["ep_end_date"].fillna(method="bfill", inplace=True)
    conflict_new["start_year"] = conflict_new["start_date2"].str[:4].astype(int)
    conflict_new["end_year"] = conflict_new["ep_end_date"].str[:4].astype(int)
    conflict_new["duration"] = conflict_new["end_year"] - conflict_new["start_year"] + 1

    # Add up deaths per conflict-country-year triad.  set.remove() returns
    # None, so the original groupby(by=set(cols).remove("bd_best")) grouped
    # by nothing; build the key column list explicitly instead.
    group_cols = [c for c in df_brd.columns if c != "bd_best"]
    ccy_merge = df_brd.groupby(by=group_cols).sum().reset_index()

    ccy_complete = ccy_merge.merge(conflict_new, left_on=["conflict_id", "year"],
                                   right_on=["conflict_id", "year"])

    # Melt the per-country dummy columns into a long "country" column so the
    # country-conflict aggregation below has a country key to group on; keep
    # only rows where the country actually hosted fighting.
    ccy_long = ccy_complete.melt(
        id_vars=[c for c in ccy_complete.columns if c not in BRD_countries],
        value_vars=sorted(BRD_countries), var_name="country", value_name="involved")
    ccy_long = ccy_long[ccy_long["involved"] == 1]

    # Compact CCY into CC, removing the one line per year attribute.  The
    # deaths column here is "bd_best" (renamed to "best" to match prepGED's
    # downstream column names).
    cc_iv = ccy_long.loc[:, ["country", "conflict_id", "start_year", "end_year",
                             "duration", "bd_best"]].groupby(
        by=["country", "conflict_id", "start_year", "end_year", "duration"]).sum().reset_index()
    cc_iv.rename(columns={"bd_best": "best"}, inplace=True)
    cc_iv["avg_deaths"] = cc_iv["best"] / cc_iv["duration"]

    # Remove conflicts that started before 1965 or only just ended
    cc_iv.drop(cc_iv[cc_iv.start_year < 1965].index, inplace=True)
    cc_iv.drop(cc_iv[cc_iv.end_year > 2020].index, inplace=True)

    # Country names were already harmonized when the dummies were built, so
    # the original's second concordance pass (which read a non-existent
    # "cc_iv" column with the wrong ';' separator) is not needed here.

    # Create DV columns
    cc_iv["HR_before"] = None
    cc_iv["HR_after"] = None

    # Insert values for DV columns
    for i in cc_iv.index:
        country = cc_iv.loc[i, "country"]
        start_year = int(cc_iv.loc[i, "start_year"])
        end_year = int(cc_iv.loc[i, "end_year"])

        homicides_row = homicides.loc[homicides["Country Name"] == country, :]
        rates_before = [homicides_row.loc[:, str(y)].values[0] for y in range(start_year - 5, start_year)]
        rates_after = [homicides_row.loc[:, str(y)].values[0] for y in range(end_year + 1, min(end_year + 6, 2022))]

        cc_iv.loc[i, "HR_before"] = avg_years(rates_before)
        cc_iv.loc[i, "HR_after"] = avg_years(rates_after)

    # Ensure numeric format
    cc_iv["HR_before"] = cc_iv["HR_before"].astype(float)
    cc_iv["HR_after"] = cc_iv["HR_after"].astype(float)

    # Drop country-episodes with 0 deaths
    cc_iv.drop(cc_iv[cc_iv.best == 0].index, inplace=True)

    # Add DV as ratio of HR_after and HR_before
    cc_ivdv = cc_iv
    for i in cc_ivdv.index:
        if cc_ivdv.loc[i, "HR_after"] and cc_ivdv.loc[i, "HR_before"]:
            cc_ivdv.loc[i, "HR_rel_change"] = cc_ivdv.loc[i, "HR_after"] / cc_ivdv.loc[i, "HR_before"]
        else:
            cc_ivdv.loc[i, "HR_rel_change"] = None

    # Add CV: global homicide rate (starting 2000)
    cc_ivdv["CV_global_homicides"] = None
    homicides_world = homicides.loc[homicides["Country Name"] == "World", :]
    for i in cc_ivdv.index:
        year = cc_ivdv.loc[i, "end_year"]
        # Use a fresh name: the original rebound `homicides` here, clobbering
        # the homicides DataFrame read above.
        world_rate = homicides_world.loc[:, str(year)].values[0]
        if isinstance(world_rate, str):
            world_rate = float(world_rate.replace(',', '.'))
        cc_ivdv.loc[i, "CV_global_homicides"] = world_rate

    # Add CV: country population at conflict end
    cc_ivdv["CV_pop"] = None
    for i in cc_ivdv.index:
        year = cc_ivdv.loc[i, "end_year"]
        country = cc_ivdv.loc[i, "country"]
        cc_ivdv.loc[i, "CV_pop"] = df_population.loc[df_population["Country Name"] == country, str(year)].values[0]

    # Output "dirty" dataset under a BRD-specific name — the original wrote
    # output_GED_dirty.csv and silently clobbered the GED model's output.
    cc_ivdv.to_csv("output_BRD_dirty.csv")

    # Determine SD for important variables, set outliers to none (beyond 2SD)
    for variable in ["HR_rel_change", "CV_global_homicides"]:
        mean = cc_ivdv[variable].mean()
        sd = cc_ivdv[variable].std()
        for i in cc_ivdv.index:
            if abs(cc_ivdv.loc[i, variable] - mean) > 2 * sd:
                cc_ivdv.loc[i, variable] = None

    # Drop rows that have missing values
    cc_ivdv.dropna(subset=["avg_deaths", "HR_before", "HR_after", "HR_rel_change", "CV_pop", "CV_global_homicides"],
                   inplace=True)

    # Output of csv for analysis (BRD-specific name, see above)
    cc_ivdv.to_csv("output_BRD.csv")

    return 0

def analyse():
dirty_data = pd.read_csv("output_dirty.csv")
dataset = pd.read_csv("output.csv")
dirty_data = pd.read_csv("output_GED_dirty.csv")
dataset = pd.read_csv("output_GED.csv")

dirty_data.describe().to_csv("dirty_descriptive_stats.csv")
dataset.describe().to_csv("descriptive_stats.csv")
Expand All @@ -156,8 +307,8 @@ def analyse():


def main():
    """Entry point: build the BRD dataset.

    analyse() is temporarily disabled while the BRD model is under
    construction (it still reads the GED output files).
    """
    # NOTE(review): the rendered diff interleaved the pre-commit calls
    # (prep()/analyse()) with the post-commit ones; this is the post-commit
    # version of the function.
    prepBRD()
    # analyse()
    return 0


Expand Down
File renamed without changes.
File renamed without changes.

0 comments on commit fc6f92c

Please sign in to comment.