From 4e2b7b1a493dc72bdb3664e827b9c566e554ead3 Mon Sep 17 00:00:00 2001 From: Erik-Jan van Kesteren Date: Mon, 4 Nov 2024 16:53:31 +0100 Subject: [PATCH] update / clean up query scripts --- pyproject.toml | 1 + src/query/query_space.py | 37 ++++++++++++++++++++++++------------- src/query/query_time.py | 26 +++++++++++++++++++------- src/query/utils.py | 15 ++++++++------- 4 files changed, 52 insertions(+), 27 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index db820da..7491b2f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,6 +7,7 @@ requires-python = ">=3.12" dependencies = [ "beautifulsoup4>=4.12.3", "cmake>=3.30.4", + "fastexcel>=0.12.0", "ipython>=8.28.0", "lxml>=5.3.0", "matplotlib>=3.9.2", diff --git a/src/query/query_space.py b/src/query/query_space.py index 3350c2d..7380edc 100644 --- a/src/query/query_space.py +++ b/src/query/query_space.py @@ -1,33 +1,44 @@ import polars as pl from tqdm import tqdm from pathlib import Path -import plotnine as p9 -from src.query.utils import collect_year, compute_binomial_interval +from src.query.utils import query_disease_location_year, compute_binomial_interval +BASE_PATH = Path(".") +# optional if using external disk: BASE_PATH = Path("E:/", "disease_database") COMBINED_DATA_FOLDER = Path("processed_data", "combined") LOCATION_EXCEL_FILE = Path("raw_data", "manual_input", "municipalities_1869.xlsx") -def query_map( - disease_query: str, - year: int, -): +def query_space(disease_query: str, year: int): # iterate over each municipality - muni_df = pl.read_excel(LOCATION_EXCEL_FILE) + muni_df = pl.read_excel(BASE_PATH / LOCATION_EXCEL_FILE).head() + text_df = pl.scan_parquet( + BASE_PATH / COMBINED_DATA_FOLDER / f"combined_{year}_{year + 1}.parquet" + ) df_list = [] for row in tqdm(muni_df.iter_rows(named=True), total=len(muni_df)): try: df_list.append( - collect_year(disease=disease_query, location=row["Regex"], year=year).with_columns(pl.lit(row["cbscode"]).alias("cbscode")) + query_disease_location_year( + df_lazy=text_df, + disease="(?i)" + disease_query, + location="(?i)" + row["Regex"], + year=year, + ).with_columns( + pl.lit(row["cbscode"]).alias("cbscode"), + pl.lit(row["amsterdamcode"]).alias("amsterdamcode"), + pl.lit(row["Municipality"]).alias("Municipality"), + ) ) except Exception as e: print(e) df = pl.concat(df_list) - df = df.with_columns( - compute_binomial_interval(df["n_both"], df["n_location"]) - ) + df = df.with_columns(compute_binomial_interval(df["n_both"], df["n_location"])) return df -res = query_map(r"choler.*|krim.?koorts", 1866) -res.write_ \ No newline at end of file +res = query_space(r"choler.*|krim.?koorts", 1866) + +res.sort(["Municipality", "yr", "mo"]) + +res.write_parquet(Path("processed_data", "cholera_1866.parquet")) diff --git a/src/query/query_time.py b/src/query/query_time.py index 658a6af..db76c80 100644 --- a/src/query/query_time.py +++ b/src/query/query_time.py @@ -2,10 +2,13 @@ from tqdm import tqdm from pathlib import Path import plotnine as p9 -from src.query.utils import collect_year, compute_binomial_interval +from src.query.utils import query_disease_location_year, compute_binomial_interval +BASE_PATH = Path(".") +# optional if using external disk: BASE_PATH = Path("E:/", "disease_database") COMBINED_DATA_FOLDER = Path("processed_data", "combined") + def query_time( disease_query: str, location_query: str, @@ -14,20 +17,29 @@ def query_time( ): df_list = [] for yr in tqdm(range(start_year, end_year)): + text_df = pl.scan_parquet( + BASE_PATH / COMBINED_DATA_FOLDER / f"combined_{yr}_{yr + 1}.parquet" + ) try: df_list.append( - collect_year(disease=disease_query, location=location_query, year=yr) + query_disease_location_year( + df_lazy=text_df, + disease="(?i)" + disease_query, + location="(?i)" + location_query, + year=yr, + ) ) except Exception as e: print(e) df = pl.concat(df_list) - df = df.with_columns( - compute_binomial_interval(df["n_both"], df["n_location"]) - ) + df = df.with_columns(compute_binomial_interval(df["n_both"], df["n_location"])) return df -df = query_time(r"choler.*|krim.?koorts", r"graven.?hage|haag.*|s.?hage|grave\.") + +df = query_time( + r"choler.*|krim.?koorts", r"graven.?hage|haag.*|s.?hage|grave\.", end_year=1870 +) plt = ( p9.ggplot( @@ -43,7 +55,7 @@ def query_time( + p9.theme_linedraw() + p9.theme(legend_position="none", axis_text_x=p9.element_text(rotation="vertical")) + p9.labs( - title="Cholera in Amsterdam", + title="Cholera in The Hague", y="Monthly normalized mentions", ) ) diff --git a/src/query/utils.py b/src/query/utils.py index 87dbb96..52d376e 100644 --- a/src/query/utils.py +++ b/src/query/utils.py @@ -2,18 +2,18 @@ from scipy.stats import beta import numpy as np -def collect_year(disease: str, location: str, year: int = 1830): - DIS = "(?i)" + disease - LOC = "(?i)" + location + +def query_disease_location_year( + df_lazy: pl.LazyFrame, disease: str, location: str, year: int +): return ( - pl.scan_parquet(COMBINED_DATA_FOLDER / f"combined_{year}_{year + 1}.parquet") - .filter( + df_lazy.filter( pl.col("newspaper_date").dt.year() >= year, pl.col("newspaper_date").dt.year() <= year, - pl.col("article_text").str.contains(LOC), + pl.col("article_text").str.contains(location), ) .with_columns( - pl.col("article_text").str.contains(DIS).alias("disease"), + pl.col("article_text").str.contains(disease).alias("disease"), ) .sort(pl.col("newspaper_date")) .with_columns( @@ -28,6 +28,7 @@ def collect_year(disease: str, location: str, year: int = 1830): .collect() ) + def compute_binomial_interval( successes: pl.Series, tries: pl.Series, alpha: float = 0.95 ):