patrickramos committed on
Commit
d1369a2
·
1 Parent(s): 2266731

Add general pitch classification

Browse files
Files changed (5) hide show
  1. convert.py +70 -15
  2. data.py +10 -2
  3. pitcher_overview.py +1 -1
  4. plotting.py +16 -94
  5. stats.py +95 -0
convert.py CHANGED
@@ -43,7 +43,7 @@ ball_kind = {
43
 
44
  ball_kind_code = {
45
  -1: '-',
46
- 31: 'FF',
47
  32: 'SL',
48
  33: 'VS',
49
  34: 'SV',
@@ -59,14 +59,66 @@ ball_kind_code = {
59
  44: 'PB',
60
  45: 'KN',
61
  46: 'SH',
62
- 47: 'FT',
63
- 48: 'FW',
64
  49: 'FC',
65
  50: 'EP', # technically "super" eephus but I haven't encountered a normal one yet
66
  51: 'HS',
67
  52: 'HL'
68
  }
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  lr = {1: 'l', 2: 'r'}
71
 
72
  presult = {
@@ -260,29 +312,32 @@ game_kind = {
260
 
261
  ball_kind_code_to_color = {
262
  '-': '',
263
- 'FF': 'crimson',
264
  'SL': 'gold',
265
- 'VS': '',
266
- 'SV': '',
267
  'CU': 'paleturquoise',
268
  'SC': 'royalblue',
269
- 'PC': '',
270
  'KC': 'rebeccapurple',
271
  'FO': 'darkturquoise',
272
  'FS': 'cadetblue',
273
  'CH': 'mediumseagreen',
274
- 'SI': '',
275
- 'SB': '',
276
- 'PB': '',
277
  'SH': 'tomato',
278
- 'FT': '',
279
- 'FW': '',
280
  'FC': 'sienna',
281
- 'EP': '', # technically "super" eephus but I haven't encountered a normal one yet
282
- 'HS': '',
283
- 'HL': ''
284
  }
 
 
285
  ball_kind_code_to_color = {k: v if v else 'C0' for k, v in ball_kind_code_to_color.items()}
 
286
  def get_text_color_from_color(color):
287
  if color in ['gold', 'paleturquoise']:
288
  return 'black'
 
43
 
44
  ball_kind_code = {
45
  -1: '-',
46
+ 31: '4S',
47
  32: 'SL',
48
  33: 'VS',
49
  34: 'SV',
 
59
  44: 'PB',
60
  45: 'KN',
61
  46: 'SH',
62
+ 47: '2S',
63
+ 48: '1S',
64
  49: 'FC',
65
  50: 'EP', # technically "super" eephus but I haven't encountered a normal one yet
66
  51: 'HS',
67
  52: 'HL'
68
  }
69
 
70
+ general_ball_kind = {
71
+ -1: '-',
72
+ 31: 'Fastball (4-seam)',
73
+ 32: 'Slider',
74
+ 33: 'Vertical Slider',
75
+ 34: 'Slurve',
76
+ 35: 'Curve',
77
+ 36: 'Curve',
78
+ 37: 'Curve',
79
+ 38: 'Curve',
80
+ 39: 'Splitter',
81
+ 40: 'Splitter',
82
+ 41: 'Changeup',
83
+ 42: 'Sinker',
84
+ 43: 'Screwball',
85
+ 44: 'Palmball',
86
+ 45: 'Knuckleball',
87
+ 46: 'Sinker',
88
+ 47: 'Sinker',
89
+ 48: 'Sinker',
90
+ 49: 'Cutter',
91
+ 50: 'Eephus', # technically "super" eephus but I haven't encountered a normal one yet
92
+ 51: 'Sinker',
93
+ 52: 'Vertical Slider',
94
+ }
95
+
96
+ general_ball_kind_code = {
97
+ -1: '-',
98
+ 31: '4S',
99
+ 32: 'SL',
100
+ 33: 'VS',
101
+ 34: 'SV',
102
+ 35: 'CU',
103
+ 36: 'CU',
104
+ 37: 'CU',
105
+ 38: 'CU',
106
+ 39: 'FS',
107
+ 40: 'FS',
108
+ 41: 'CH',
109
+ 42: 'SI',
110
+ 43: 'SB',
111
+ 44: 'PB',
112
+ 45: 'KN',
113
+ 46: 'SI',
114
+ 47: 'SI',
115
+ 48: 'SI',
116
+ 49: 'FC',
117
+ 50: 'EP', # technically "super" eephus but I haven't encountered a normal one yet
118
+ 51: 'SI',
119
+ 52: 'VS'
120
+ }
121
+
122
  lr = {1: 'l', 2: 'r'}
123
 
124
  presult = {
 
312
 
313
  ball_kind_code_to_color = {
314
  '-': '',
315
+ '4S': 'crimson',
316
  'SL': 'gold',
317
+ 'VS': 'khaki',
318
+ 'SV': 'lightsteelblue',
319
  'CU': 'paleturquoise',
320
  'SC': 'royalblue',
321
+ 'PC': 'turquoise',
322
  'KC': 'rebeccapurple',
323
  'FO': 'darkturquoise',
324
  'FS': 'cadetblue',
325
  'CH': 'mediumseagreen',
326
+ 'SI': 'orange',
327
+ 'SB': 'lightgreen',
328
+ 'PB': 'yellowgreen',
329
  'SH': 'tomato',
330
+ '2S': 'orangered',
331
+ '1S': 'lightsalmon',
332
  'FC': 'sienna',
333
+ 'EP': 'darkgray', # technically "super" eephus but I haven't encountered a normal one yet
334
+ 'HS': 'darkorange',
335
+ 'HL': 'darkkhaki'
336
  }
337
+
338
+
339
  ball_kind_code_to_color = {k: v if v else 'C0' for k, v in ball_kind_code_to_color.items()}
340
+
341
  def get_text_color_from_color(color):
342
  if color in ['gold', 'paleturquoise']:
343
  return 'black'
data.py CHANGED
@@ -4,15 +4,21 @@ from tqdm.auto import tqdm
4
  import pykakasi
5
  from huggingface_hub import snapshot_download
6
 
7
- from convert import aux_global_id_to_code, presult, ball_kind, ball_kind_code, lr, game_kind
 
 
 
 
8
 
9
 
10
  DATA_PATH = snapshot_download(
11
  repo_id='Ramos-Ramos/npb_data_app',
12
  repo_type='dataset',
13
  local_dir='./files',
14
- cache_dir='./.cache'
 
15
  )
 
16
  SEASONS = [2021, 2022, 2023, 2024, 2025]
17
 
18
  data_df = pl.DataFrame()
@@ -212,6 +218,8 @@ data_df = (
212
  pl.col('presult').alias('presult_id'),
213
  pl.col('ballKind').replace_strict(ball_kind),
214
  pl.col('ballKind').replace_strict(ball_kind_code).alias('ballKind_code'),
 
 
215
  pl.col('batLR').replace_strict(lr),
216
  pl.col('date').str.to_date('%Y%m%d'),
217
 
 
4
  import pykakasi
5
  from huggingface_hub import snapshot_download
6
 
7
+ from convert import (
8
+ aux_global_id_to_code, presult,
9
+ ball_kind, ball_kind_code, general_ball_kind, general_ball_kind_code, lr,
10
+ game_kind
11
+ )
12
 
13
 
14
  DATA_PATH = snapshot_download(
15
  repo_id='Ramos-Ramos/npb_data_app',
16
  repo_type='dataset',
17
  local_dir='./files',
18
+ cache_dir='./.cache',
19
+ allow_patterns=['*/pbp_data.parquet', '*/pbp_text.parquet', '*/pbp_aux.parquet', '*/schedule.parquet', '*/aux_schedule.parquet', 'players.parquet', 'players_kana.parquet']
20
  )
21
+
22
  SEASONS = [2021, 2022, 2023, 2024, 2025]
23
 
24
  data_df = pl.DataFrame()
 
218
  pl.col('presult').alias('presult_id'),
219
  pl.col('ballKind').replace_strict(ball_kind),
220
  pl.col('ballKind').replace_strict(ball_kind_code).alias('ballKind_code'),
221
+ pl.col('ballKind').replace_strict(general_ball_kind).alias('general_ballKind'),
222
+ pl.col('ballKind').replace_strict(general_ball_kind_code).alias('general_ballKind_code'),
223
  pl.col('batLR').replace_strict(lr),
224
  pl.col('date').str.to_date('%Y%m%d'),
225
 
pitcher_overview.py CHANGED
@@ -31,7 +31,7 @@ def gr_create_pitcher_overview_card(name, season):
31
 
32
  def create_pitcher_overview(data_df):
33
  with gr.Blocks() as app:
34
- gr.Markdown('Pitcher overview')
35
 
36
  with gr.Row():
37
  with gr.Column():
 
31
 
32
  def create_pitcher_overview(data_df):
33
  with gr.Blocks() as app:
34
+ gr.Markdown('# Pitcher overview')
35
 
36
  with gr.Row():
37
  with gr.Column():
plotting.py CHANGED
@@ -12,62 +12,23 @@ from datetime import date
12
 
13
  from data import data_df
14
  from convert import ball_kind_code_to_color, get_text_color_from_color
 
15
 
16
- mpl.use('Agg')
17
-
18
- def compute_team_games(data):
19
- data = (
20
- data
21
- .with_columns(
22
- pl.col('gameId').unique().len().over('HomeTeamNameES').alias('home_games'),
23
- pl.col('gameId').unique().len().over('VisitorTeamNameES').alias('visitor_games')
24
- )
25
- )
26
- game_data = (
27
- data
28
- .group_by('HomeTeamNameES')
29
- .first()
30
- [['HomeTeamNameES', 'home_games']]
31
- .rename({'HomeTeamNameES': 'team'})
32
- .join(
33
- (
34
- data
35
- .group_by('VisitorTeamNameES')
36
- .first()
37
- [['VisitorTeamNameES', 'visitor_games']]
38
- .rename({'VisitorTeamNameES': 'team'})
39
- ),
40
- on='team',
41
- )
42
- .with_columns((pl.col('home_games')+pl.col('visitor_games')).alias('games'))
43
- )
44
 
45
- return (
46
- data
47
- .drop('home_games', 'visitor_games')
48
- .join(
49
- game_data[['team', 'games']].rename({'games': 'home_games'}),
50
- left_on='HomeTeamNameES',
51
- right_on='team'
52
- )
53
- .join(
54
- game_data[['team', 'games']].rename({'games': 'visitor_games'}),
55
- left_on='VisitorTeamNameES',
56
- right_on='team'
57
- )
58
- )
59
 
60
 
61
- def get_pitcher_stats(id, lr=None, game_kind=None, start_date=None, end_date=None, min_ip=1, min_pitches=1):
62
  source_data = data_df.filter(pl.col('ballKind_code') != '-')
63
 
64
- if start_date is not None:
65
- source_data = source_data.filter(pl.col('date') >= start_date)
66
- if end_date is not None:
67
- source_data = source_data.filter(pl.col('date') <= end_date)
68
-
69
- if game_kind is not None:
70
- source_data = source_data.filter(pl.col('coarse_game_kind') == game_kind)
 
71
 
72
  source_data = (
73
  compute_team_games(source_data)
@@ -82,49 +43,10 @@ def get_pitcher_stats(id, lr=None, game_kind=None, start_date=None, end_date=Non
82
  else:
83
  source_data = source_data.with_columns((pl.col('IP') >= min_ip).alias('qualified'))
84
 
85
-
86
  if lr is not None:
87
  source_data = source_data.filter(pl.col('batLR') == lr)
88
 
89
- pitch_stats = (
90
- source_data
91
- # .with_columns(
92
- # pl.col('presult').is_in(['None', 'Balk', 'Batter interference', 'Catcher interference', 'Pitcher delay', 'Intentional walk', 'Unknown']).not_().alias('pitch'),
93
- # pl.col('presult').is_in(['Swinging strike', 'Swinging strikeout']).alias('whiff'),
94
- # )
95
- # .with_columns(
96
- # (pl.col('pitch') & pl.col('presult').is_in(['Hit by pitch', 'Sacrifice bunt', 'Sacrifice fly', 'Looking strike', 'Ball', 'Walk', 'Looking strikeout', 'Sacrifice hit error', 'Sacrifice fly error', "Sacrifice fielder's choice", 'Bunt strikeout']).not_()).alias('swing'),
97
- # (pl.col('whiff') | pl.col('presult').is_in(['Looking strike', 'Uncaught third strike', 'Looking strikeout'])).alias('csw')
98
- # )
99
- .group_by('pitId', 'ballKind_code')
100
- .agg(
101
- pl.len().alias('count'),
102
- pl.col('aux_bresult').struct.field('batType').drop_nulls().value_counts(normalize=True),
103
- (pl.col('whiff').sum() / pl.col('pitch').sum()).alias('SwStr%'),
104
- (pl.col('whiff').sum() / pl.col('swing').sum()).alias('Whiff%'),
105
- (pl.col('csw').sum() / pl.col('pitch').sum()).alias('CSW%')
106
- )
107
- .with_columns(
108
- (pl.col('count')/pl.sum('count').over('pitId')).alias('usage'),
109
- (pl.col('count') >= min_pitches).alias('qualified')
110
- )
111
- .explode('batType')
112
- .unnest('batType')
113
- .pivot(on='batType', values='proportion')
114
- .fill_null(0)
115
- .with_columns(
116
- (pl.col('G') + pl.col('B')).alias('GB%'),
117
- (pl.col('F') + pl.col('P')).alias('FB%'),
118
- pl.col('L').alias('LD%').round(2),
119
- )
120
- .drop('G', 'F', 'B', 'P', 'L')
121
- .with_columns(
122
- (pl.when(pl.col('qualified')).then(pl.col(stat)).rank()/pl.when(pl.col('qualified')).then(pl.col(stat)).count()).alias(f'{stat}_pctl')
123
- for stat in ['SwStr%', 'Whiff%', 'CSW%', 'GB%']
124
- )
125
- .sort('pitId', 'count', descending=[False, True])
126
- .filter(pl.col('pitId') == id)
127
- )
128
 
129
  pitch_shapes = (
130
  source_data
@@ -134,7 +56,7 @@ def get_pitcher_stats(id, lr=None, game_kind=None, start_date=None, end_date=Non
134
  pl.col('y').is_not_null() &
135
  (pl.col('ballSpeed') > 0)
136
  )
137
- [['pitId', 'ballKind_code', 'ballSpeed', 'x', 'y']]
138
  )
139
 
140
  pitcher_stats = (
@@ -224,7 +146,7 @@ def plot_loc(ax, locs):
224
  ax.add_patch(plt.Rectangle((-40, 75), width=80, height=100, facecolor='ivory', edgecolor='darkgray'))
225
  ax.add_patch(plt.Polygon([(0, -10), (45, -30), (51, -50), (-51, -50), (-45, -30), (0, -10)], facecolor='snow', edgecolor='darkgray'))
226
 
227
- for (pitch,), _locs in locs.sort(pl.len().over('ballKind_code'), descending=True).group_by('ballKind_code', maintain_order=True):
228
  if len(_locs) <= 2:
229
  continue
230
 
@@ -243,7 +165,7 @@ def plot_loc(ax, locs):
243
 
244
  def plot_velo(ax, velos):
245
  trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)
246
- for (pitch,), _velos in velos.group_by('ballKind_code'):
247
  if len(_velos) <= 1:
248
  continue
249
 
@@ -317,7 +239,7 @@ font = load_google_font('Saira Extra Condensed', weight='medium')
317
 
318
 
319
  def create_pitcher_overview_card(id, season, dpi=300):
320
- data = get_card_data(id, start_date=date(season, 1, 1), end_date=date(season, 12, 31), game_kind='Regular Season', min_pitches=100)
321
 
322
  fig = plt.figure(figsize=(1080/300, 1350/300), dpi=dpi)
323
  gs = fig.add_gridspec(8, 6, height_ratios=[1, 1, 1.5, 6, 1, 3, 1, 0.5])
 
12
 
13
  from data import data_df
14
  from convert import ball_kind_code_to_color, get_text_color_from_color
15
+ from stats import filter_data_by_date_and_game_kind, compute_team_games, compute_pitch_stats
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
+ mpl.use('Agg')
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
 
21
+ def get_pitcher_stats(id, lr=None, game_kind=None, start_date=None, end_date=None, min_ip=1, min_pitches=1, pitch_class_type='specific'):
22
  source_data = data_df.filter(pl.col('ballKind_code') != '-')
23
 
24
+ # if start_date is not None:
25
+ # source_data = source_data.filter(pl.col('date') >= start_date)
26
+ # if end_date is not None:
27
+ # source_data = source_data.filter(pl.col('date') <= end_date)
28
+ #
29
+ # if game_kind is not None:
30
+ # source_data = source_data.filter(pl.col('coarse_game_kind') == game_kind)
31
+ source_data = filter_data_by_date_and_game_kind(source_data, start_date=start_date, end_date=end_date, game_kind=game_kind)
32
 
33
  source_data = (
34
  compute_team_games(source_data)
 
43
  else:
44
  source_data = source_data.with_columns((pl.col('IP') >= min_ip).alias('qualified'))
45
 
 
46
  if lr is not None:
47
  source_data = source_data.filter(pl.col('batLR') == lr)
48
 
49
+ pitch_stats = compute_pitch_stats(source_data, player_type='pitcher', pitch_class_type=pitch_class_type, min_pitches=min_pitches).filter(pl.col('pitId') == id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
  pitch_shapes = (
52
  source_data
 
56
  pl.col('y').is_not_null() &
57
  (pl.col('ballSpeed') > 0)
58
  )
59
+ [['pitId', 'general_ballKind_code', 'ballKind_code', 'ballSpeed', 'x', 'y']]
60
  )
61
 
62
  pitcher_stats = (
 
146
  ax.add_patch(plt.Rectangle((-40, 75), width=80, height=100, facecolor='ivory', edgecolor='darkgray'))
147
  ax.add_patch(plt.Polygon([(0, -10), (45, -30), (51, -50), (-51, -50), (-45, -30), (0, -10)], facecolor='snow', edgecolor='darkgray'))
148
 
149
+ for (pitch,), _locs in locs.sort(pl.len().over('general_ballKind_code'), descending=True).group_by('general_ballKind_code', maintain_order=True):
150
  if len(_locs) <= 2:
151
  continue
152
 
 
165
 
166
  def plot_velo(ax, velos):
167
  trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)
168
+ for (pitch,), _velos in velos.group_by('general_ballKind_code'):
169
  if len(_velos) <= 1:
170
  continue
171
 
 
239
 
240
 
241
  def create_pitcher_overview_card(id, season, dpi=300):
242
+ data = get_card_data(id, start_date=date(season, 1, 1), end_date=date(season, 12, 31), game_kind='Regular Season', min_pitches=100, pitch_class_type='general')
243
 
244
  fig = plt.figure(figsize=(1080/300, 1350/300), dpi=dpi)
245
  gs = fig.add_gridspec(8, 6, height_ratios=[1, 1, 1.5, 6, 1, 3, 1, 0.5])
stats.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import polars as pl
2
+
3
def filter_data_by_date_and_game_kind(data, start_date=None, end_date=None, game_kind=None):
    """Narrow `data` to a date window and/or a single coarse game kind.

    Every criterion is optional; passing `None` skips it.

    Args:
        data: Frame with a `date` column and a `coarse_game_kind` column.
        start_date: Inclusive lower bound on `date`.
        end_date: Inclusive upper bound on `date`.
        game_kind: Value that `coarse_game_kind` must equal.

    Returns:
        The filtered frame, or `data` itself when no criterion is given.
    """
    predicates = []
    if start_date is not None:
        predicates.append(pl.col('date') >= start_date)
    if end_date is not None:
        predicates.append(pl.col('date') <= end_date)
    if game_kind is not None:
        predicates.append(pl.col('coarse_game_kind') == game_kind)

    # Apply the criteria one at a time, in the order they were collected.
    for predicate in predicates:
        data = data.filter(predicate)
    return data
11
+
12
def compute_team_games(data):
    """Attach each team's total number of games to every row of `data`.

    A team's total is the number of distinct `gameId` values in which it
    appears as either `HomeTeamNameES` or `VisitorTeamNameES`.

    The totals are counted over both sides at once, so a team that only
    hosted (or only visited) within `data` still gets a count. The previous
    implementation inner-joined separate home and visitor counts, which
    dropped such teams and, through the later joins, every row involving them.

    Args:
        data: Frame with `gameId`, `HomeTeamNameES` and `VisitorTeamNameES`
            columns.

    Returns:
        `data` with two columns appended: `home_games` (total games of the
        row's home team) and `visitor_games` (total games of the row's
        visiting team). Pre-existing columns with those names are replaced.
    """
    # One (team, gameId) record per side of every row; a team cannot be on
    # both sides of the same game, so distinct gameIds per team is the total.
    appearances = pl.concat([
        data.select(pl.col('HomeTeamNameES').alias('team'), pl.col('gameId')),
        data.select(pl.col('VisitorTeamNameES').alias('team'), pl.col('gameId')),
    ])
    team_games = (
        appearances
        .group_by('team')
        .agg(pl.col('gameId').n_unique().alias('games'))
    )

    return (
        data
        # Drop stale count columns (if any) so the joins below cannot clash.
        .select(pl.exclude(['home_games', 'visitor_games']))
        .join(
            team_games.rename({'games': 'home_games'}),
            left_on='HomeTeamNameES',
            right_on='team'
        )
        .join(
            team_games.rename({'games': 'visitor_games'}),
            left_on='VisitorTeamNameES',
            right_on='team'
        )
    )
53
+
54
+
55
def compute_pitch_stats(data, player_type, pitch_class_type, min_pitches=1):
    """Aggregate per-player, per-pitch-type rate stats and their percentiles.

    Rows are grouped by player id and pitch code. Each group gets a pitch
    count, usage share, SwStr%, Whiff%, CSW%, batted-ball shares (GB%, FB%,
    LD%) and a `<stat>_pctl` column per rate stat. Percentiles are ranked
    among groups with at least `min_pitches` pitches.

    Args:
        data: Frame with the id column (`pitId` or `batId`), the pitch code
            and name columns, `pitcher_name`, the struct column `aux_bresult`
            (with a `batType` field) and the boolean-like `whiff`, `pitch`,
            `swing` and `csw` columns.
        player_type: `'pitcher'` groups by `pitId`, `'batter'` by `batId`.
        pitch_class_type: `'specific'` groups by `ballKind_code`; `'general'`
            groups by `general_ballKind_code`. In the general case the
            code/name columns are renamed to `ballKind_code` / `ballKind` in
            the result so callers see the same schema.
        min_pitches: Minimum pitch count for a group to be `qualified`.

    Returns:
        One row per (player, pitch type), sorted by player id and then by
        descending pitch count.

    Raises:
        ValueError: If `player_type` or `pitch_class_type` is not one of the
            accepted values.
    """
    if player_type not in ('pitcher', 'batter'):
        raise ValueError(f"player_type must be 'pitcher' or 'batter', got {player_type!r}")
    if pitch_class_type not in ('general', 'specific'):
        raise ValueError(f"pitch_class_type must be 'general' or 'specific', got {pitch_class_type!r}")

    is_specific = pitch_class_type == 'specific'
    id_col = 'pitId' if player_type == 'pitcher' else 'batId'
    pitch_col = 'ballKind_code' if is_specific else 'general_ballKind_code'
    pitch_name_col = 'ballKind' if is_specific else 'general_ballKind'
    bat_types = ('G', 'F', 'B', 'P', 'L')
    rate_stats = ['SwStr%', 'Whiff%', 'CSW%', 'GB%', 'FB%', 'LD%']

    shares = (
        data
        .group_by(id_col, pitch_col)
        .agg(
            # NOTE(review): for player_type='batter' this is merely the first
            # pitcher faced in the group -- confirm whether a batter name
            # column should be used instead.
            pl.first('pitcher_name'),
            # Keep the general name alongside the specific one.
            *([pl.first('general_ballKind')] if is_specific else []),
            pl.first(pitch_name_col),
            pl.len().alias('count'),
            pl.col('aux_bresult').struct.field('batType').drop_nulls().value_counts(normalize=True),
            (pl.col('whiff').sum() / pl.col('pitch').sum()).alias('SwStr%'),
            (pl.col('whiff').sum() / pl.col('swing').sum()).alias('Whiff%'),
            (pl.col('csw').sum() / pl.col('pitch').sum()).alias('CSW%')
        )
        .with_columns(
            # Share within the grouped player. This was hard-coded to 'pitId',
            # which does not exist after grouping by 'batId'.
            (pl.col('count') / pl.sum('count').over(id_col)).alias('usage'),
            (pl.col('count') >= min_pitches).alias('qualified')
        )
        # Spread the per-group batType proportions into one column per type.
        .explode('batType')
        .unnest('batType')
        .pivot(on='batType', values='proportion')
        .fill_null(0)
    )

    # A batted-ball type that never occurs in `data` yields no pivot column;
    # treat its share as zero instead of failing on a missing column.
    absent = [pl.lit(0.0).alias(t) for t in bat_types if t not in shares.columns]
    if absent:
        shares = shares.with_columns(absent)

    return (
        shares
        .with_columns(
            (pl.col('G') + pl.col('B')).alias('GB%'),
            (pl.col('F') + pl.col('P')).alias('FB%'),
            pl.col('L').alias('LD%').round(2),
        )
        # Remove the raw per-type shares. The pivot can also emit a column
        # named 'null' (presumably for groups without batted balls); exclude
        # it only when present rather than requiring it to exist.
        .select(pl.exclude([*bat_types, 'null']))
        .with_columns([
            (
                pl.when(pl.col('qualified')).then(pl.col(stat)).rank()
                / pl.when(pl.col('qualified')).then(pl.col(stat)).count()
            ).alias(f'{stat}_pctl')
            for stat in rate_stats
        ])
        .rename({} if is_specific else {pitch_col: 'ballKind_code', pitch_name_col: 'ballKind'})
        .sort(id_col, 'count', descending=[False, True])
    )