import polars as pl

# Batted-ball type codes produced by ``aux_bresult.batType`` that the stats
# below combine into GB% / FB% / LD%.
_BAT_TYPE_COLS = ('G', 'F', 'B', 'P', 'L')

# Rate stats for which a ``<stat>_pctl`` column is produced.
_PCTL_STATS = (
    'Zone%', 'Swing%', 'Z-Swing%', 'Chase%', 'Contact%', 'Z-Contact%',
    'O-Contact%', 'SwStr%', 'Whiff%', 'CSW%', 'GB%', 'FB%', 'LD%',
)


def filter_data_by_date_and_game_kind(data, start_date=None, end_date=None, game_kind=None):
    """Restrict ``data`` to a date window and/or one coarse game kind.

    Each filter is applied only when its argument is not ``None``.

    Args:
        data: Frame to filter. The ``date`` column is read when a date bound
            is given, ``coarse_game_kind`` when ``game_kind`` is given.
        start_date: Inclusive lower bound on ``date``.
        end_date: Inclusive upper bound on ``date``.
        game_kind: Value ``coarse_game_kind`` must equal.

    Returns:
        The filtered frame (``data`` unchanged when every argument is ``None``).
    """
    if start_date is not None:
        data = data.filter(pl.col('date') >= start_date)
    if end_date is not None:
        data = data.filter(pl.col('date') <= end_date)
    if game_kind is not None:
        data = data.filter(pl.col('coarse_game_kind') == game_kind)
    return data


def _games_per_team(data, team_col):
    """Count distinct ``gameId`` values per value of ``team_col`` as (team, games)."""
    return (
        data
        .group_by(team_col)
        .agg(pl.col('gameId').n_unique().alias('games'))
        .rename({team_col: 'team'})
    )


def compute_team_games(data):
    """Attach each team's total number of games to every row of ``data``.

    A team's total is the number of distinct ``gameId`` values in which it is
    the home team plus the number in which it is the visiting team.

    Args:
        data: DataFrame with ``gameId``, ``HomeTeamNameES`` and
            ``VisitorTeamNameES`` columns. Existing ``home_games`` /
            ``visitor_games`` columns are replaced.

    Returns:
        ``data`` with two columns appended: ``home_games`` (total games of the
        row's home team) and ``visitor_games`` (total games of the row's
        visiting team).
    """
    # Sum the per-role counts over the union of teams. Joining the home and
    # visitor tables directly would lose any team that appears in only one
    # role, and the joins below would then drop all of that team's rows.
    team_games = (
        pl.concat([
            _games_per_team(data, 'HomeTeamNameES'),
            _games_per_team(data, 'VisitorTeamNameES'),
        ])
        .group_by('team')
        .agg(pl.col('games').sum())
    )

    # Remove stale result columns so the joins below do not clash with them.
    stale = [c for c in ('home_games', 'visitor_games') if c in data.columns]
    if stale:
        data = data.drop(stale)

    return (
        data
        .join(
            team_games.rename({'games': 'home_games'}),
            left_on='HomeTeamNameES',
            right_on='team',
        )
        .join(
            team_games.rename({'games': 'visitor_games'}),
            left_on='VisitorTeamNameES',
            right_on='team',
        )
    )


def compute_pitch_stats(data, player_type, pitch_class_type, min_pitches=1):
    """Aggregate plate-discipline and batted-ball rates per player and pitch kind.

    Rows are grouped by the player id (``pitId`` or ``batId``) and the pitch
    kind code. For every group the function computes the pitch count, swing /
    contact / zone / location rates, batted-ball shares (GB%, FB%, LD%), the
    group's share of the player's pitches (``usage``) and a ``qualified`` flag
    (``count >= min_pitches``). For each stat in ``_PCTL_STATS`` a
    ``<stat>_pctl`` column holds rank / number-of-qualified-rows, computed
    over qualified rows only (null elsewhere).

    Args:
        data: Pitch-level DataFrame. Columns read: the id and pitch-kind
            columns, ``pitcher_name``, ``general_ballKind``, ``aux_bresult``
            (struct with a ``batType`` field), ``swing``, ``whiff``, ``zone``,
            ``pitch``, ``csw``, ``pitLR``, ``x`` and ``y``.
        player_type: ``'pitcher'`` (group by ``pitId``) or ``'batter'``
            (group by ``batId``).
        pitch_class_type: ``'specific'`` (``ballKind_code`` / ``ballKind``) or
            ``'general'`` (``general_ballKind_code`` / ``general_ballKind``,
            renamed to ``ballKind_code`` / ``ballKind`` in the output).
        min_pitches: Minimum group size for a row to be ``qualified``.

    Returns:
        One row per (player, pitch kind), sorted by player id and then by
        descending ``count``.

    Raises:
        AssertionError: If ``player_type`` or ``pitch_class_type`` is not one
            of the accepted values.
    """
    assert player_type in ('pitcher', 'batter'), f'unknown player_type: {player_type!r}'
    assert pitch_class_type in ('general', 'specific'), (
        f'unknown pitch_class_type: {pitch_class_type!r}'
    )

    specific = pitch_class_type == 'specific'
    id_col = 'pitId' if player_type == 'pitcher' else 'batId'
    pitch_col = 'ballKind_code' if specific else 'general_ballKind_code'
    pitch_name_col = 'ballKind' if specific else 'general_ballKind'

    # Shared building blocks for the per-group rates.
    swing = pl.col('swing')
    whiff = pl.col('whiff')
    zone = pl.col('zone')
    contact = swing & ~whiff
    n_pitches = pl.col('pitch').sum()
    n_swings = swing.sum()
    right_handed = pl.col('pitLR') == 'r'

    stats = (
        data
        .group_by(id_col, pitch_col)
        .agg(
            # NOTE(review): pitcher_name is taken even when player_type is
            # 'batter'; confirm whether a batter name column should be used.
            pl.first('pitcher_name'),
            *([pl.first('general_ballKind')] if specific else []),
            pl.first(pitch_name_col),
            pl.len().alias('count'),
            # List of {batType, proportion} structs, expanded into columns below.
            pl.col('aux_bresult').struct.field('batType').drop_nulls().value_counts(normalize=True),
            (n_swings / n_pitches).alias('Swing%'),
            # NOTE(review): Z-Swing% and Chase% are divided by all pitches, not
            # by in-zone / out-of-zone pitches; confirm this is intended.
            ((swing & zone).sum() / n_pitches).alias('Z-Swing%'),
            ((swing & ~zone).sum() / n_pitches).alias('Chase%'),
            (contact.sum() / n_swings).alias('Contact%'),
            ((zone & contact).sum() / (zone & swing).sum()).alias('Z-Contact%'),
            ((~zone & contact).sum() / (~zone & swing).sum()).alias('O-Contact%'),
            (whiff.sum() / n_swings).alias('Whiff%'),
            (whiff.sum() / n_pitches).alias('SwStr%'),
            (pl.col('csw').sum() / n_pitches).alias('CSW%'),
            (zone.sum() / n_pitches).alias('Zone%'),
            # The sign of x that counts as glove/arm side flips with pitLR.
            pl.when(right_handed).then(pl.col('x') < 0).otherwise(pl.col('x') > 0)
            .mean().alias('Glove%'),
            pl.when(right_handed).then(pl.col('x') >= 0).otherwise(pl.col('x') <= 0)
            .mean().alias('Arm%'),
            # NOTE(review): 125 is presumably the vertical midpoint of the
            # y coordinate system; confirm.
            (pl.col('y') > 125).mean().alias('High%'),
            (pl.col('y') <= 125).mean().alias('Low%'),
        )
        .with_columns(
            # Window over the grouping id so the batter path works too
            # (the aggregated frame has no 'pitId' when grouping by 'batId').
            (pl.col('count') / pl.sum('count').over(id_col)).alias('usage'),
            (pl.col('count') >= min_pitches).alias('qualified'),
        )
        .explode('batType')
        .unnest('batType')
        .pivot(on='batType', values='proportion')
        .fill_null(0)
    )

    # The pivot only creates columns for batted-ball types present in the
    # data; add the missing ones as 0 so the sums below never fail.
    absent = [c for c in _BAT_TYPE_COLS if c not in stats.columns]
    if absent:
        stats = stats.with_columns([pl.lit(0.0).alias(c) for c in absent])
    # 'null' appears only when some group had no batted balls at all.
    helper_cols = [c for c in (*_BAT_TYPE_COLS, 'null') if c in stats.columns]

    def percentile(stat):
        """Rank of ``stat`` among qualified rows, divided by their count."""
        qualified_value = pl.when(pl.col('qualified')).then(pl.col(stat))
        # FB%, LD% and the contact rates are ranked descending, so lower
        # values receive the higher percentile.
        descending = stat in ('FB%', 'LD%') or 'Contact%' in stat
        return (
            qualified_value.rank(descending=descending) / qualified_value.count()
        ).alias(f'{stat}_pctl')

    return (
        stats
        .with_columns(
            (pl.col('G') + pl.col('B')).alias('GB%'),
            (pl.col('F') + pl.col('P')).alias('FB%'),
            # NOTE(review): LD% is the only rate that is rounded; confirm.
            pl.col('L').round(2).alias('LD%'),
        )
        .drop(helper_cols)
        # NOTE(review): percentiles are ranked across all pitch kinds together.
        .with_columns([percentile(stat) for stat in _PCTL_STATS])
        .rename({} if specific else {pitch_col: 'ballKind_code', pitch_name_col: 'ballKind'})
        .sort(id_col, 'count', descending=[False, True])
    )