Skip to content

Commit

Permalink
update / clean up query scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
vankesteren committed Nov 4, 2024
1 parent 38d26ae commit 4e2b7b1
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 27 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ requires-python = ">=3.12"
dependencies = [
"beautifulsoup4>=4.12.3",
"cmake>=3.30.4",
"fastexcel>=0.12.0",
"ipython>=8.28.0",
"lxml>=5.3.0",
"matplotlib>=3.9.2",
Expand Down
37 changes: 24 additions & 13 deletions src/query/query_space.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,44 @@
import polars as pl
from tqdm import tqdm
from pathlib import Path
import plotnine as p9
from src.query.utils import collect_year, compute_binomial_interval
from src.query.utils import query_disease_location_year, compute_binomial_interval

BASE_PATH = Path(".")
# Optional: if the data lives on an external disk, point BASE_PATH there instead, e.g. BASE_PATH = Path("E:/", "disease_database")
COMBINED_DATA_FOLDER = Path("processed_data", "combined")
LOCATION_EXCEL_FILE = Path("raw_data", "manual_input", "municipalities_1869.xlsx")


def query_map(
disease_query: str,
year: int,
):
def query_space(disease_query: str, year: int):
# iterate over each municipality
muni_df = pl.read_excel(LOCATION_EXCEL_FILE)
muni_df = pl.read_excel(BASE_PATH / LOCATION_EXCEL_FILE).head()
text_df = pl.scan_parquet(
BASE_PATH / COMBINED_DATA_FOLDER / f"combined_{year}_{year + 1}.parquet"
)
df_list = []
for row in tqdm(muni_df.iter_rows(named=True), total=len(muni_df)):
try:
df_list.append(
collect_year(disease=disease_query, location=row["Regex"], year=year).with_columns(pl.lit(row["cbscode"]).alias("cbscode"))
query_disease_location_year(
df_lazy=text_df,
disease="(?i)" + disease_query,
location="(?i)" + row["Regex"],
year=year,
).with_columns(
pl.lit(row["cbscode"]).alias("cbscode"),
pl.lit(row["amsterdamcode"]).alias("amsterdamcode"),
pl.lit(row["Municipality"]).alias("Municipality"),
)
)
except Exception as e:
print(e)
df = pl.concat(df_list)
df = df.with_columns(
compute_binomial_interval(df["n_both"], df["n_location"])
)
df = df.with_columns(compute_binomial_interval(df["n_both"], df["n_location"]))
return df

res = query_map(r"choler.*|krim.?koorts", 1866)

res.write_
res = query_space(r"choler.*|krim.?koorts", 1866)

res.sort(["Municipality", "yr", "mo"])

res.write_parquet(Path("processed_data", "cholera_1866.parquet"))
26 changes: 19 additions & 7 deletions src/query/query_time.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,13 @@
from tqdm import tqdm
from pathlib import Path
import plotnine as p9
from src.query.utils import collect_year, compute_binomial_interval
from src.query.utils import query_disease_location_year, compute_binomial_interval

BASE_PATH = Path(".")
# Optional: if the data lives on an external disk, point BASE_PATH there instead, e.g. BASE_PATH = Path("E:/", "disease_database")
COMBINED_DATA_FOLDER = Path("processed_data", "combined")


def query_time(
disease_query: str,
location_query: str,
Expand All @@ -14,20 +17,29 @@ def query_time(
):
df_list = []
for yr in tqdm(range(start_year, end_year)):
text_df = pl.scan_parquet(
BASE_PATH / COMBINED_DATA_FOLDER / f"combined_{yr}_{yr + 1}.parquet"
)
try:
df_list.append(
collect_year(disease=disease_query, location=location_query, year=yr)
query_disease_location_year(
df_lazy=text_df,
disease="(?i)" + disease_query,
location="(?i)" + location_query,
year=yr,
)
)
except Exception as e:
print(e)

df = pl.concat(df_list)
df = df.with_columns(
compute_binomial_interval(df["n_both"], df["n_location"])
)
df = df.with_columns(compute_binomial_interval(df["n_both"], df["n_location"]))
return df

df = query_time(r"choler.*|krim.?koorts", r"graven.?hage|haag.*|s.?hage|grave\.")

df = query_time(
r"choler.*|krim.?koorts", r"graven.?hage|haag.*|s.?hage|grave\.", end_year=1870
)

plt = (
p9.ggplot(
Expand All @@ -43,7 +55,7 @@ def query_time(
+ p9.theme_linedraw()
+ p9.theme(legend_position="none", axis_text_x=p9.element_text(rotation="vertical"))
+ p9.labs(
title="Cholera in Amsterdam",
title="Cholera in The Hague",
y="Monthly normalized mentions",
)
)
Expand Down
15 changes: 8 additions & 7 deletions src/query/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,18 @@
from scipy.stats import beta
import numpy as np

def collect_year(disease: str, location: str, year: int = 1830):
DIS = "(?i)" + disease
LOC = "(?i)" + location

def query_disease_location_year(
df_lazy: pl.LazyFrame, disease: str, location: str, year: int
):
return (
pl.scan_parquet(COMBINED_DATA_FOLDER / f"combined_{year}_{year + 1}.parquet")
.filter(
df_lazy.filter(
pl.col("newspaper_date").dt.year() >= year,
pl.col("newspaper_date").dt.year() <= year,
pl.col("article_text").str.contains(LOC),
pl.col("article_text").str.contains(location),
)
.with_columns(
pl.col("article_text").str.contains(DIS).alias("disease"),
pl.col("article_text").str.contains(disease).alias("disease"),
)
.sort(pl.col("newspaper_date"))
.with_columns(
Expand All @@ -28,6 +28,7 @@ def collect_year(disease: str, location: str, year: int = 1830):
.collect()
)


def compute_binomial_interval(
successes: pl.Series, tries: pl.Series, alpha: float = 0.95
):
Expand Down

0 comments on commit 4e2b7b1

Please sign in to comment.