File size: 4,684 Bytes
d1369a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65fefb5
 
 
 
 
 
d1369a2
65fefb5
 
f89cae0
 
 
 
 
d1369a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65fefb5
 
d1369a2
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import polars as pl

def filter_data_by_date_and_game_kind(data, start_date=None, end_date=None, game_kind=None):
  """Restrict `data` to an optional date window and an optional game kind.

  Every argument left as None is skipped. Date bounds are inclusive and are
  compared against the 'date' column; `game_kind` must equal the
  'coarse_game_kind' column. When no filter is requested the input is
  returned unchanged.
  """
  conditions = []
  if start_date is not None:
    conditions.append(pl.col('date') >= start_date)
  if end_date is not None:
    conditions.append(pl.col('date') <= end_date)
  if game_kind is not None:
    conditions.append(pl.col('coarse_game_kind') == game_kind)

  # Apply the predicates one after another, in the order they were requested.
  for condition in conditions:
    data = data.filter(condition)
  return data

def compute_team_games(data):
  """Attach each team's total number of distinct games to every row.

  A team's total is the number of distinct 'gameId' values in which it appears
  as 'HomeTeamNameES' plus the number in which it appears as
  'VisitorTeamNameES'.

  Args:
    data: polars DataFrame with 'gameId', 'HomeTeamNameES' and
      'VisitorTeamNameES' columns. Existing 'home_games' / 'visitor_games'
      columns, if any, are replaced.

  Returns:
    `data` with two columns appended:
      'home_games'    - total games (home + visitor) of the row's home team.
      'visitor_games' - total games (home + visitor) of the row's visitor team.
  """
  # Drop stale count columns so repeated calls do not collide in the joins below.
  base = data.drop('home_games', 'visitor_games', strict=False)

  def games_per_team(team_col, count_col):
    # Distinct gameIds per team in one role, keyed by a common 'team' column.
    return (
        base
        .group_by(team_col)
        .agg(pl.col('gameId').n_unique().alias(count_col))
        .rename({team_col: 'team'})
    )

  # Full join: a team that only ever appears in one role (e.g. after a narrow
  # date filter) must still get a total, otherwise the inner joins below would
  # silently discard every row that involves it.
  team_totals = (
      games_per_team('HomeTeamNameES', 'home_games')
      .join(
          games_per_team('VisitorTeamNameES', 'visitor_games'),
          on='team',
          how='full',
          coalesce=True,
      )
      .select(
          'team',
          (
              pl.col('home_games').fill_null(0)
              + pl.col('visitor_games').fill_null(0)
          ).alias('games'),
      )
  )

  return (
      base
      .join(
          team_totals.rename({'games': 'home_games'}),
          left_on='HomeTeamNameES',
          right_on='team'
      )
      .join(
          team_totals.rename({'games': 'visitor_games'}),
          left_on='VisitorTeamNameES',
          right_on='team'
      )
  )


def compute_pitch_stats(data, player_type, pitch_class_type, min_pitches=1):
  """Aggregate swing, contact, location and batted-ball rates per player and pitch type.

  Args:
    data: polars DataFrame. Columns read here: 'pitId' or 'batId',
      'ballKind_code' / 'ballKind' and/or 'general_ballKind_code' /
      'general_ballKind', 'pitcher_name', 'aux_bresult' (struct with a
      'batType' field), 'swing', 'zone', 'whiff', 'pitch', 'csw', 'pitLR',
      'x', 'y'. Presumably one row per pitch with boolean flag columns -
      TODO confirm against the caller.
    player_type: 'pitcher' groups by 'pitId', 'batter' groups by 'batId'.
    pitch_class_type: 'specific' groups by 'ballKind_code', 'general' groups
      by 'general_ballKind_code'.
    min_pitches: minimum 'count' for a row to be marked 'qualified'. Only
      qualified rows receive percentile values.

  Returns:
    DataFrame with one row per (player id, pitch code), sorted by player id
    and then by descending 'count'. The pitch code/name columns are always
    called 'ballKind_code' / 'ballKind'. Besides the rate columns it carries
    'count', 'usage' (share of the player's rows), 'qualified', 'GB%', 'FB%',
    'LD%' and a '<stat>_pctl' column for each ranked stat.

  Raises:
    AssertionError: if `player_type` or `pitch_class_type` is not one of the
      accepted values.
  """
  assert player_type in ('pitcher', 'batter'), f'unknown player_type: {player_type!r}'
  assert pitch_class_type in ('general', 'specific'), f'unknown pitch_class_type: {pitch_class_type!r}'

  specific = pitch_class_type == 'specific'
  id_col = 'pitId' if player_type == 'pitcher' else 'batId'
  pitch_col = 'ballKind_code' if specific else 'general_ballKind_code'
  pitch_name_col = 'ballKind' if specific else 'general_ballKind'

  bat_type_cols = ['G', 'F', 'B', 'P', 'L']
  ranked_stats = [
      'Zone%', 'Swing%', 'Z-Swing%', 'Chase%', 'Contact%', 'Z-Contact%',
      'O-Contact%', 'SwStr%', 'Whiff%', 'CSW%', 'GB%', 'FB%', 'LD%',
  ]

  swing = pl.col('swing')
  zone = pl.col('zone')
  whiff = pl.col('whiff')
  n_pitches = pl.col('pitch').sum()
  contact = swing & ~whiff
  is_right = pl.col('pitLR') == 'r'

  aggregated = (
      data
      .group_by(id_col, pitch_col)
      .agg(
          # NOTE(review): with player_type='batter' this is simply the first
          # pitcher_name value in the group, not a batter name - confirm intended.
          pl.first('pitcher_name'),
          *([pl.first('general_ballKind')] if specific else []),
          pl.first(pitch_name_col),
          pl.len().alias('count'),
          # List of {batType, proportion} structs, pivoted into columns below.
          pl.col('aux_bresult').struct.field('batType').drop_nulls().value_counts(normalize=True),
          (swing.sum() / n_pitches).alias('Swing%'),
          ((swing & zone).sum() / n_pitches).alias('Z-Swing%'),
          ((swing & ~zone).sum() / n_pitches).alias('Chase%'),
          (contact.sum() / swing.sum()).alias('Contact%'),
          ((zone & contact).sum() / (zone & swing).sum()).alias('Z-Contact%'),
          ((~zone & contact).sum() / (~zone & swing).sum()).alias('O-Contact%'),
          (whiff.sum() / swing.sum()).alias('Whiff%'),
          (whiff.sum() / n_pitches).alias('SwStr%'),
          (pl.col('csw').sum() / n_pitches).alias('CSW%'),
          (zone.sum() / n_pitches).alias('Zone%'),
          # The sign test on x is mirrored when pitLR is not 'r'.
          pl.when(is_right).then(pl.col('x') < 0).otherwise(pl.col('x') > 0).mean().alias('Glove%'),
          pl.when(is_right).then(pl.col('x') >= 0).otherwise(pl.col('x') <= 0).mean().alias('Arm%'),
          (pl.col('y') > 125).mean().alias('High%'),
          (pl.col('y') <= 125).mean().alias('Low%')
      )
      .with_columns(
          # Partition by the column actually grouped on; 'pitId' does not exist
          # after aggregation when grouping by batter.
          (pl.col('count') / pl.sum('count').over(id_col)).alias('usage'),
          (pl.col('count') >= min_pitches).alias('qualified')
      )
  )

  batted = (
      aggregated
      .explode('batType')
      .unnest('batType')
      .pivot(on='batType', values='proportion')
      .fill_null(0)
  )
  # Small or filtered inputs may lack some batted-ball categories entirely;
  # treat an absent category as a proportion of zero instead of failing.
  absent = [name for name in bat_type_cols if name not in batted.columns]
  if absent:
    batted = batted.with_columns([pl.lit(0.0).alias(name) for name in absent])

  def percentile(stat):
    # Rank among qualified rows only; unqualified rows become null.
    qualified_values = pl.when(pl.col('qualified')).then(pl.col(stat))
    # For these stats the ranking is reversed, so lower values score higher.
    rank_descending = stat in ['FB%', 'LD%'] or 'Contact%' in stat
    return (
        qualified_values.rank(descending=rank_descending) / qualified_values.count()
    ).alias(f'{stat}_pctl')

  pitch_stats = (
      batted
      .with_columns(
          (pl.col('G') + pl.col('B')).alias('GB%'),
          (pl.col('F') + pl.col('P')).alias('FB%'),
          pl.col('L').alias('LD%').round(2),
      )
      # 'null' only exists when some group had no batted balls at all.
      .drop(*bat_type_cols, 'null', strict=False)
      .with_columns([percentile(stat) for stat in ranked_stats])
      .rename({} if specific else {pitch_col: 'ballKind_code', pitch_name_col: 'ballKind'})
      .sort(id_col, 'count', descending=[False, True])
  )
  return pitch_stats