Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding options to keep evaluated games only #6

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions ingest_lichess.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
from ingester import ingest_lichess_data


def main(start, end, pq_dir, months=None, include_moves=False, restart_counter_games=True, dir_ndjson=None, ndjson_size=1e6):
def main(start, end, pq_dir, months=None, include_moves=False, number_moves=3, keep_only_evaluated=False,
fs=None, restart_counter_games=True, dir_ndjson=None, ndjson_size=1e6):
"""Download data with a check for existing parquet files."""
pq_dir = Path(pq_dir)
pq_dir.mkdir(parents=True, exist_ok=True)
Expand All @@ -18,7 +19,7 @@ def main(start, end, pq_dir, months=None, include_moves=False, restart_counter_g
years = range(start, end)
if months is None:
months = range(1, 13)
arguments = [(y, m, pq_dir, include_moves, dir_ndjson, ndjson_size) for y in years for m in months]
arguments = [(y, m, pq_dir, include_moves, number_moves, keep_only_evaluated, fs, dir_ndjson, ndjson_size) for y in years for m in months]

for arg in arguments:
if (Path(pq_dir) / f"{arg[0]}_{arg[1]:02}.parquet").exists():
Expand All @@ -31,7 +32,9 @@ def main(start, end, pq_dir, months=None, include_moves=False, restart_counter_g
parser.add_argument('--start', type=int, default=2013)
parser.add_argument('--end', type=int, default=datetime.date.today().year)
parser.add_argument('--months', nargs='+', type=int)
parser.add_argument('--include-moves', action='store_true', default=False)
parser.add_argument('--number-moves', default=3, type=int)
parser.add_argument('--keep-only-evaluated', action='store_true', default=False)
parser.add_argument('--fs-path', type=str, default=None)
parser.add_argument('--debug', action='store_true', default=False)
parser.add_argument('--parquet-dir', type=Path, default="./lichess_parquet")
parser.add_argument('--dir-ndjson', type=str, default=None)
Expand All @@ -46,6 +49,9 @@ def main(start, end, pq_dir, months=None, include_moves=False, restart_counter_g
months=args.months,
include_moves=args.include_moves,
pq_dir=args.parquet_dir,
number_moves=args.number_moves,
keep_only_evaluated=args.keep_only_evaluated,
fs=args.fs_path,
dir_ndjson=args.dir_ndjson,
ndjson_size=args.ndjson_size
)
35 changes: 25 additions & 10 deletions ingester.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ def ingest_lichess_data(year: int,
month: int,
dir_parquet: str = "./lichess_parquet",
include_moves: bool = False,
n_moves: Optional[int] = 3,
keep_only_evaluated: bool = False,
fs: Optional[s3fs.core.S3FileSystem] = None,
dir_ndjson: Optional[str] = None,
ndjson_size: int = 1e6):
Expand All @@ -40,6 +42,8 @@ def ingest_lichess_data(year: int,
to "../lichess_parquet".
include_moves (bool, optional; default False): Whether to include games' moves in the
saved data. Including moves greatly increases the size of the Parquet files.
n_moves (int, optional; default 3): Number of opening moves to keep per game. Pass None to keep all moves.
keep_only_evaluated (bool, optional; default False): Whether to keep only evaluated games, or all games.
fs (s3fs.core.S3FileSystem, optional): If provided, the function will use this filesystem
to read and write files. Defaults to None.
dir_ndjson (str, optional): Directory where NDJSON files will be saved. Defaults to None.
Expand Down Expand Up @@ -152,9 +156,12 @@ def ingest_lichess_data(year: int,

elif line.startswith("1."):
if include_moves:
# Keep only 3 moves
# Keep only n_moves moves
moves = line.replace("\n", " ").strip()
moves = moves.split("4.")[0]
if n_moves is not None:
moves = moves.split(f"{n_moves+1}.")[0]


else:
moves = ""
elif line.startswith("["): # Game continues, keep appending
Expand All @@ -171,6 +178,7 @@ def ingest_lichess_data(year: int,

for player in ["White", "Black"]:
id_player = game_df[player]
other_player = "White" if player == "Black" else "Black"

game_type = game_df["Event"]
if game_type not in d_cum_games:
Expand Down Expand Up @@ -209,11 +217,11 @@ def ingest_lichess_data(year: int,

# Max ELO faced
max_elo = d_cum_games[game_type][f"{id_player}Elo_max_faced"]
if game_df[f"{player}Elo"] == "?":
if game_df[f"{other_player}Elo"] == "?":
game_df[f"{player}Elo_max_faced"] = max_elo
elif int(game_df[f"{player}Elo"]) > max_elo:
d_cum_games[game_type][f"{id_player}Elo_max_faced"] = int(game_df[f"{player}Elo"])
game_df[f"{player}Elo_max_faced"] = int(game_df[f"{player}Elo"])
elif int(game_df[f"{other_player}Elo"]) > max_elo:
d_cum_games[game_type][f"{id_player}Elo_max_faced"] = int(game_df[f"{other_player}Elo"])
game_df[f"{player}Elo_max_faced"] = int(game_df[f"{other_player}Elo"])
else:
game_df[f"{player}Elo_max_faced"] = max_elo

Expand All @@ -226,13 +234,20 @@ def ingest_lichess_data(year: int,
# Add concatenated DateTime field to replace separate Date & Time
game_df.update({'DateTime': f"{game_df['UTCDate']} {game_df['UTCTime']}"})

# Write complete game to temp file
temp_file.write(json.dumps(game_df) + "\n")
if keep_only_evaluated:
if game_df["Evaluation_flag"]:
# Write complete game to temp file
temp_file.write(json.dumps(game_df) + "\n")
games += 1
else:
# Write complete game to temp file
temp_file.write(json.dumps(game_df) + "\n")
games += 1

looking_for_game = True
game = []
moves = None
games += 1


if games >= ndjson_size:
temp_file.close()
Expand Down Expand Up @@ -278,7 +293,7 @@ def ingest_lichess_data(year: int,
fout.write(compressed_bytes)

return None

def _ndjson_to_parquet(ndjson_path: str, parquet_path: str, include_moves: bool, fs: Optional[s3fs.core.S3FileSystem] = None):
"""Creates a cleaned dataframe from an ndjson of Lichess game info."""
game_cols = ["ID", "ID_random", "Event", "Tournament", "ECO", "Opening", "TimeControl", "Termination", "DateTime"]
Expand Down
98 changes: 98 additions & 0 deletions notebooks/example_filtering.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# import dependencies\n",
"import polars as pl\n",
"\n",
"# strings as categoricals for join\n",
"pl.enable_string_cache()\n",
"\n",
"\n",
"path_data_files = \"./data\"\n",
"\n",
"# connect to data (lazy evaluatioin!)\n",
"chess_df = pl.scan_parquet(f\"{path_data_files}/*.parquet\")\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div><style>\n",
".dataframe > thead > tr,\n",
".dataframe > tbody > tr {\n",
" text-align: right;\n",
" white-space: pre-wrap;\n",
"}\n",
"</style>\n",
"<small>shape: (5, 34)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>ID</th><th>ID_random</th><th>Event</th><th>Tournament</th><th>ECO</th><th>Opening</th><th>TimeControl</th><th>Termination</th><th>DateTime</th><th>Moves</th><th>Evaluation_flag</th><th>Result</th><th>Role_player</th><th>Player</th><th>Opponent</th><th>PlayerElo</th><th>OpponentElo</th><th>PlayerElo_max</th><th>OpponentElo_max</th><th>PlayerElo_max_faced</th><th>OpponentElo_max_faced</th><th>PlayerTitle</th><th>OpponentTitle</th><th>PlayerTitle_flag</th><th>OpponentTitle_flag</th><th>PlayerRatingDiff</th><th>OpponentRatingDiff</th><th>Player_random</th><th>Opponent_random</th><th>Player_cum_games_total</th><th>Opponent_cum_games_total</th><th>Player_cum_games_type</th><th>Opponent_cum_games_type</th><th>PlayerElo_bin</th></tr><tr><td>str</td><td>f64</td><td>str</td><td>bool</td><td>str</td><td>str</td><td>str</td><td>cat</td><td>datetime[μs]</td><td>str</td><td>bool</td><td>cat</td><td>cat</td><td>str</td><td>str</td><td>i16</td><td>i16</td><td>i32</td><td>i32</td><td>i32</td><td>i32</td><td>str</td><td>str</td><td>bool</td><td>bool</td><td>i16</td><td>i16</td><td>f64</td><td>f64</td><td>i32</td><td>i32</td><td>i32</td><td>i32</td><td>cat</td></tr></thead><tbody><tr><td>&quot;hca0mb9v&quot;</td><td>0.183835</td><td>&quot;Rated Bullet game&quot;</td><td>false</td><td>&quot;C00&quot;</td><td>&quot;French Defense #2&quot;</td><td>&quot;60+0&quot;</td><td>&quot;Normal&quot;</td><td>2013-01-01 00:15:38</td><td>&quot;1. e4 { [%eval 0.2] } 1... 
e6 …</td><td>true</td><td>&quot;0-1&quot;</td><td>&quot;White&quot;</td><td>&quot;LEGENDARY_ERFAN&quot;</td><td>&quot;Mariss&quot;</td><td>1182</td><td>1457</td><td>1231</td><td>1457</td><td>1896</td><td>1182</td><td>null</td><td>null</td><td>false</td><td>false</td><td>-30</td><td>5</td><td>0.363595</td><td>0.777809</td><td>11</td><td>6</td><td>11</td><td>1</td><td>&quot;(1000, 1200]&quot;</td></tr><tr><td>&quot;hca0mb9v&quot;</td><td>0.183835</td><td>&quot;Rated Bullet game&quot;</td><td>false</td><td>&quot;C00&quot;</td><td>&quot;French Defense #2&quot;</td><td>&quot;60+0&quot;</td><td>&quot;Normal&quot;</td><td>2013-01-01 00:15:38</td><td>&quot;1. e4 { [%eval 0.2] } 1... e6 …</td><td>true</td><td>&quot;1-0&quot;</td><td>&quot;Black&quot;</td><td>&quot;Mariss&quot;</td><td>&quot;LEGENDARY_ERFAN&quot;</td><td>1457</td><td>1182</td><td>1457</td><td>1231</td><td>1182</td><td>1896</td><td>null</td><td>null</td><td>false</td><td>false</td><td>5</td><td>-30</td><td>0.777809</td><td>0.363595</td><td>6</td><td>11</td><td>1</td><td>11</td><td>&quot;(1400, 1600]&quot;</td></tr><tr><td>&quot;odq8vllt&quot;</td><td>0.427186</td><td>&quot;Rated Blitz game&quot;</td><td>false</td><td>&quot;C00&quot;</td><td>&quot;French Defense: Knight Variati…</td><td>&quot;240+0&quot;</td><td>&quot;Normal&quot;</td><td>2013-01-01 06:11:57</td><td>&quot;1. e4 { [%eval 0.31] } 1... 
e6…</td><td>true</td><td>&quot;1-0&quot;</td><td>&quot;White&quot;</td><td>&quot;psonio&quot;</td><td>&quot;Chenstix&quot;</td><td>1925</td><td>1685</td><td>1999</td><td>1685</td><td>1905</td><td>1925</td><td>null</td><td>null</td><td>false</td><td>false</td><td>19</td><td>-5</td><td>0.478087</td><td>0.516449</td><td>9</td><td>1</td><td>9</td><td>1</td><td>&quot;(1800, 2000]&quot;</td></tr><tr><td>&quot;odq8vllt&quot;</td><td>0.427186</td><td>&quot;Rated Blitz game&quot;</td><td>false</td><td>&quot;C00&quot;</td><td>&quot;French Defense: Knight Variati…</td><td>&quot;240+0&quot;</td><td>&quot;Normal&quot;</td><td>2013-01-01 06:11:57</td><td>&quot;1. e4 { [%eval 0.31] } 1... e6…</td><td>true</td><td>&quot;0-1&quot;</td><td>&quot;Black&quot;</td><td>&quot;Chenstix&quot;</td><td>&quot;psonio&quot;</td><td>1685</td><td>1925</td><td>1685</td><td>1999</td><td>1925</td><td>1905</td><td>null</td><td>null</td><td>false</td><td>false</td><td>-5</td><td>19</td><td>0.516449</td><td>0.478087</td><td>1</td><td>9</td><td>1</td><td>9</td><td>&quot;(1600, 1800]&quot;</td></tr><tr><td>&quot;2irq4pg0&quot;</td><td>0.73595</td><td>&quot;Rated Blitz game&quot;</td><td>false</td><td>&quot;C41&quot;</td><td>&quot;Philidor Defense: Exchange Var…</td><td>&quot;300+0&quot;</td><td>&quot;Normal&quot;</td><td>2013-01-01 06:15:59</td><td>&quot;1. e4 { [%eval 0.12] } 1... e5…</td><td>true</td><td>&quot;1-0&quot;</td><td>&quot;White&quot;</td><td>&quot;dobro55&quot;</td><td>&quot;psonio&quot;</td><td>1785</td><td>1944</td><td>1785</td><td>1999</td><td>1944</td><td>1905</td><td>null</td><td>null</td><td>false</td><td>false</td><td>15</td><td>-61</td><td>0.207169</td><td>0.478087</td><td>1</td><td>11</td><td>1</td><td>11</td><td>&quot;(1600, 1800]&quot;</td></tr></tbody></table></div>"
],
"text/plain": [
"shape: (5, 34)\n",
"┌──────────┬───────────┬────────┬────────────┬───┬────────────┬────────────┬───────────┬───────────┐\n",
"│ ID ┆ ID_random ┆ Event ┆ Tournament ┆ … ┆ Opponent_c ┆ Player_cum ┆ Opponent_ ┆ PlayerElo │\n",
"│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ um_games_t ┆ _games_typ ┆ cum_games ┆ _bin │\n",
"│ str ┆ f64 ┆ str ┆ bool ┆ ┆ otal ┆ e ┆ _type ┆ --- │\n",
"│ ┆ ┆ ┆ ┆ ┆ --- ┆ --- ┆ --- ┆ cat │\n",
"│ ┆ ┆ ┆ ┆ ┆ i32 ┆ i32 ┆ i32 ┆ │\n",
"╞══════════╪═══════════╪════════╪════════════╪═══╪════════════╪════════════╪═══════════╪═══════════╡\n",
"│ hca0mb9v ┆ 0.183835 ┆ Rated ┆ false ┆ … ┆ 6 ┆ 11 ┆ 1 ┆ (1000, │\n",
"│ ┆ ┆ Bullet ┆ ┆ ┆ ┆ ┆ ┆ 1200] │\n",
"│ ┆ ┆ game ┆ ┆ ┆ ┆ ┆ ┆ │\n",
"│ hca0mb9v ┆ 0.183835 ┆ Rated ┆ false ┆ … ┆ 11 ┆ 1 ┆ 11 ┆ (1400, │\n",
"│ ┆ ┆ Bullet ┆ ┆ ┆ ┆ ┆ ┆ 1600] │\n",
"│ ┆ ┆ game ┆ ┆ ┆ ┆ ┆ ┆ │\n",
"│ odq8vllt ┆ 0.427186 ┆ Rated ┆ false ┆ … ┆ 1 ┆ 9 ┆ 1 ┆ (1800, │\n",
"│ ┆ ┆ Blitz ┆ ┆ ┆ ┆ ┆ ┆ 2000] │\n",
"│ ┆ ┆ game ┆ ┆ ┆ ┆ ┆ ┆ │\n",
"│ odq8vllt ┆ 0.427186 ┆ Rated ┆ false ┆ … ┆ 9 ┆ 1 ┆ 9 ┆ (1600, │\n",
"│ ┆ ┆ Blitz ┆ ┆ ┆ ┆ ┆ ┆ 1800] │\n",
"│ ┆ ┆ game ┆ ┆ ┆ ┆ ┆ ┆ │\n",
"│ 2irq4pg0 ┆ 0.73595 ┆ Rated ┆ false ┆ … ┆ 11 ┆ 1 ┆ 11 ┆ (1600, │\n",
"│ ┆ ┆ Blitz ┆ ┆ ┆ ┆ ┆ ┆ 1800] │\n",
"│ ┆ ┆ game ┆ ┆ ┆ ┆ ┆ ┆ │\n",
"└──────────┴───────────┴────────┴────────────┴───┴────────────┴────────────┴───────────┴───────────┘"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Check header\n",
"chess_df.head().collect(streaming=True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "st",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
65 changes: 65 additions & 0 deletions notebooks/example_notebook.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.insert(0, \"../\")\n",
"from ingest_lichess import *"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2017_01: 19.7GiB [25:05, 13.1MiB/s] \n",
"2017_02: 18.6GiB [23:27, 13.2MiB/s] \n",
"2017_03: 23.2GiB [29:08, 13.3MiB/s]\n",
"2017_04: 42.3GiB [42:48, 16.5MiB/s]\n",
"2017_05: 43.6GiB [43:41, 16.6MiB/s]\n",
"2017_06: 42.9GiB [44:10, 16.2MiB/s]\n",
"2017_07: 45.0GiB [45:24, 16.5MiB/s]\n",
"2017_08: 46.5GiB [46:50, 16.5MiB/s]\n",
"2017_09: 46.7GiB [46:44, 16.7MiB/s]\n",
"2017_10: 51.0GiB [52:15, 16.3MiB/s]\n",
"2017_11: 53.1GiB [53:30, 16.5MiB/s]\n",
"2017_12: 60.3GiB [1:00:27, 16.6MiB/s]\n"
]
}
],
"source": [
"main(start=2013, end=2018, pq_dir=\"./data\", months=range(1, 13), include_moves=True, keep_only_evaluated=True, \n",
" fs=None, restart_counter_games=True, dir_ndjson=None, ndjson_size=2e6)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "st",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}