Jon Solow
Update github_data to get stats and player rosters
203b087
raw
history blame
2.58 kB
import duckdb
import pandas as pd
import os
from typing import Callable
duckdb.default_connection.execute("SET GLOBAL pandas_analyze_sample=100000")
BASE_URL = "https://github.com/nflverse/nflverse-data/releases/download/"
SEASON = "2023"
FANTASY_POSITIONS = [
"QB",
"RB",
"WR",
"TE",
"FB",
"K",
]
def get_weekly_rosters(season_int: int = SEASON) -> pd.DataFrame:
df = duckdb.sql(
f"""
with grouped_stats as (
select player_id, sum(fantasy_points) as fantasy_points
from player_stats_player_stats_{season_int}
group by player_id
)
SELECT
team
, wr.position
, wr.jersey_number
, wr.status
, wr.full_name
, wr.headshot_url
, wr.week
, wr.gsis_id
, gs.fantasy_points
from weekly_rosters_roster_weekly_{season_int} wr
join grouped_stats gs
on wr.gsis_id = gs.player_id
"""
).df()
return df
def get_player_stats(season_int: int = SEASON) -> pd.DataFrame:
df = duckdb.sql(f"SELECT * from player_stats_player_stats_{season_int}").df()
return df
NFLVERSE_ASSETS = [
("weekly_rosters", f"roster_weekly_{SEASON}.parquet"),
("player_stats", f"player_stats_{SEASON}.parquet"),
]
class NflVerseDataAsset:
def __init__(
self,
release_tag: str,
asset_name: str,
dataframe_mutation_fxn: Callable[[pd.DataFrame], pd.DataFrame] = lambda x: x,
):
self.release_tag = release_tag
self.asset_name = asset_name
self.dataframe_mutation_fxn = dataframe_mutation_fxn
self.table_name = f"{release_tag}_{asset_name.rsplit('.', 1)[0]}"
def load_parquet_asset_to_df(self) -> pd.DataFrame:
location = os.path.join(BASE_URL, self.release_tag, self.asset_name)
df = pd.read_parquet(location)
return df
def register_asset_to_duckdb(self) -> None:
df = self.load_parquet_asset_to_df()
df = self.dataframe_mutation_fxn(df)
duckdb.register(self.table_name, df)
def load_assets():
for tag, asset in NFLVERSE_ASSETS:
asset = NflVerseDataAsset(tag, asset)
asset.register_asset_to_duckdb()
def get_current_tables() -> list[str]:
current_tables_df = duckdb.sql("SHOW TABLES").df()
return current_tables_df["name"].tolist()