patrickramos committed on
Commit
a8b6a3f
·
1 Parent(s): 53e0878
Files changed (4) hide show
  1. app.py +12 -0
  2. convert.py +259 -0
  3. data.py +168 -0
  4. pitcher_overview.py +36 -0
app.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ from data import data_df
4
+ from pitcher_overview import create_pitcher_overview
5
+
6
+
7
# Outer Gradio container; the pitcher overview UI is built while its context is active.
demo = gr.Blocks()
with demo:
    create_pitcher_overview(data_df)

# Launch the app only when this file is run as a script.
if __name__ == '__main__':
    demo.launch()
convert.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Lookup from a numeric team `globalId` to a short team code.
# NOTE(review): the codes look like team initials plus 'PL'/'CL' league
# entries -- confirm against the data source before relying on that reading.
aux_global_id_to_code = {
    7911: 'G',
    7912: 'S',
    7913: 'DB',
    7914: 'D',
    7915: 'T',
    7916: 'C',
    7917: 'F',
    7918: 'E',
    7919: 'L',
    7920: 'M',
    7921: 'B',
    7922: 'H',
    7925: 'PL',
    7926: 'CL'
}
17
+
18
# Lookup from a numeric pitch-type id to a human-readable pitch name.
# -1 maps to the placeholder '-'. Keys must stay aligned with `ball_kind_code`,
# which maps the same ids to abbreviations.
ball_kind = {
    -1: '-',
    31: 'Fastball (4-seam)',
    32: 'Slider',
    33: 'Vertical Slider',
    34: 'Slurve',
    35: 'Curve',
    36: 'Slow Curve',
    37: 'Power Curve',
    38: 'Knuckle Curve',
    39: 'Forkball',
    40: 'Splitter',
    41: 'Changeup',
    42: 'Sinker',
    43: 'Screwball',
    44: 'Palmball',
    45: 'Knuckleball',
    46: 'Shootball',
    47: 'Fastball (2-seam)',
    48: 'Fastball (1-seam)',
    49: 'Cutter',
    50: 'Eephus', # technically "super" eephus but I haven't encountered a normal one yet
    51: 'Hard Sinker',
    52: 'Hard Slider',
}
43
+
44
# Lookup from a numeric pitch-type id to a two-letter pitch abbreviation.
# Uses the same key set as `ball_kind`; -1 maps to the placeholder '-'.
ball_kind_code = {
    -1: '-',
    31: 'FF',
    32: 'SL',
    33: 'VS',
    34: 'SV',
    35: 'CU',
    36: 'SC',
    37: 'PC',
    38: 'KC',
    39: 'FO',
    40: 'FS',
    41: 'CH',
    42: 'SI',
    43: 'SB',
    44: 'PB',
    45: 'KN',
    46: 'SH',
    47: 'FT',
    48: 'FW',
    49: 'FC',
    50: 'EP', # technically "super" eephus but I haven't encountered a normal one yet
    51: 'HS',
    52: 'HL'
}
69
+
70
# Lookup from a numeric handedness code to 'l' / 'r'.
# NOTE(review): presumably 1 = left and 2 = right -- confirm against the data source.
lr = {1: 'l', 2: 'r'}
71
+
72
# Lookup from a numeric per-pitch result code to an English label.
# The key space has gaps (e.g. 127, 132, 136-138), and some labels repeat
# ('Balk' for 120 and 135, 'Groundout (Double play)' for 111 and 140).
# NOTE(review): a strict lookup against this table will fail on any code not
# listed here -- confirm the table is complete for the data being loaded.
presult = {
    0: 'None',
    101: 'Foul',
    102: 'Single',
    103: 'Hit by pitch',
    104: 'Double',
    105: 'Triple',
    106: 'Home run',
    107: 'Error',
    108: 'Groundout',
    109: 'Flyout',
    110: 'Lineout',
    111: 'Groundout (Double play)',
    112: 'Foul fly',
    113: 'Foul line (?)',
    114: 'Sacrifice bunt',
    115: 'Sacrifice fly',
    116: 'Swinging strike',
    117: 'Looking strike',
    118: 'Ball',
    119: 'Walk',
    120: 'Balk',
    121: 'Batter interference',
    122: 'Catcher interference',
    123: 'Uncaught third strike',
    124: 'Sacrifice hit error',
    125: 'Sacrifice fly, error',
    126: "Fielder's choice",
    128: "Sacrifice fielder's choice",
    129: 'Bunt strikeout',
    130: 'Swinging strikeout',
    131: 'Looking strikeout',
    133: 'Inside-the-park home run',
    134: 'Pitcher delay',
    135: 'Balk',
    139: 'Intentional walk',
    140: 'Groundout (Double play)',
    141: 'Unknown'
}
111
+
112
# Lookup from a numeric batting-result code to a Japanese result label.
# Trailing comments are the author's shorthand for the observed outcome
# (result type, batted-ball type, fielder position); a '?' marks a guess.
# Blank lines mark gaps in the key space where no code has been mapped.
# NOTE(review): key 35 is labelled as a fly ball although its neighbours
# (34, 36) are walk / hit-by-pitch results -- confirm it is intentional.
bresult = {
    0: '空振り三振',
    1: '単打', # 1b gb p
    2: '単打', # 1b gb c
    3: '単打', # 1b gb 1b
    4: '単打', # 1b gb 2b
    5: '単打', # 1b gb 3b? ld
    6: '単打', # 1b gb ss
    7: '単打', # 1b gb lf? ld
    8: '単打', # 1b gb cf
    9: '単打', # 1b gb rf
    10: '2塁打', # 2b gb p

    12: '2塁打', # 2b gb 1b
    13: '2塁打', # 2b gb 2b
    14: '2塁打', # 2b gb 3b? ld
    15: '2塁打', # 2b gb ss
    16: '2塁打', # 2b gb lf? ld
    17: '2塁打', # 2b gb cf
    18: '2塁打', # 2b gb rf

    22: '3塁打', # 3b gb 2b

    25: '3塁打', # 3b gb lf? ld
    26: '3塁打', # 3b gb cf
    27: '3塁打', # 3b gb rf
    28: '本塁打(ランニング)', # ihr lf?
    29: '本塁打(ランニング)', # ihr cf?
    30: '本塁打(ランニング)', # ihr rf?

    31: '空振り三振',
    32: '振逃げ',
    33: '振逃げ',
    34: '四球',
    35: 'フライ', # fb cf
    36: '死球',
    37: '打撃妨害',
    38: '守備妨害',

    39: '犠打野選',
    40: 'ゴロ', #gb p
    41: 'ゴロ', #gb c
    42: 'ゴロ', #gb 1b
    43: 'ゴロ', #gb 2b
    44: 'ゴロ', #gb 3b
    45: 'ゴロ', #gb ss
    46: 'ゴロ', #gb lf
    47: 'ゴロ', #gb cf
    48: 'ゴロ', #gb rf

    49: 'フライ', # fb p
    50: 'フライ', # fb c
    51: 'フライ', # fb 1b
    52: 'フライ', # fb 2b
    53: 'フライ', # fb 3b
    54: 'フライ', # fb ss
    55: 'フライ', # fb lf
    56: 'フライ', # fb cf
    57: 'フライ', # fb rf
    58: 'ライナー', # ld p
    60: 'ライナー', # ld 1b
    61: 'ライナー', # ld 2b
    62: 'ライナー', # ld 3b
    63: 'ライナー', # ld ss
    64: 'ライナー', # ld lf

    66: 'ライナー', # ld rf
    67: '犠打',
    68: '犠打',
    69: '犠打',
    70: '犠打',
    71: '犠打',
    79: '犠飛', # sac fly lf
    80: '犠飛', # sac fly cf
    81:'犠飛', # sac fly rf
    82: '邪飛',
    83: '邪飛',
    84: '邪飛',
    85: '邪飛',
    86: '邪飛',
    87: '邪飛',
    88: '邪飛',

    90: '邪飛',
    91: 'ゴロ(併殺打)',
    92: 'ゴロ(併殺打)',
    93: 'ゴロ(併殺打)',
    94: 'ゴロ(併殺打)',
    95: 'ゴロ(併殺打)',
    96: 'ゴロ(併殺打)',

    100: '失策出塁',
    101: '失策出塁',
    102: '失策出塁',
    103: '失策出塁',
    104: '失策出塁',
    105: '失策出塁',
    106: '失策出塁',
    107: '失策出塁',
    108: '失策出塁',
    109: '野選',
    110: '野選',
    111: '野選',
    112: '野選',
    113: '野選',
    114: '野選',
    115: '犠打失策',
    116: '犠飛失策',

    208: '本塁打', # hr lf?

    210: '本塁打', # hr cf?
    212: '本塁打', # hr rf?

    214: '見送り三振',
    215: '犠打野選',
    216: '犠打野選',

    218: '犠打野選',

    229: '振逃げ',

    234: '邪直',

    236: '邪直',

    241: '3バント失敗',

    258: '規則違反',

    267: '打撃妨害',

    338: '?'
}
246
+
247
# Lookup from a numeric game-kind id to an English competition name.
# NOTE(review): a strict lookup against this table will fail on any id not
# listed here -- confirm no other game kinds occur in the loaded seasons.
game_kind = {
    1: 'CL Regular Season',
    2: 'PL Regular Season',
    3: 'Nippon Series',
    4: 'All-Star Game',
    5: 'Spring Training',
    11: 'Farm Championship',
    26: 'Interleague',
    35: 'CL Climax Series First Stage',
    36: 'CL Climax Series Final Stage',
    37: 'PL Climax Series First Stage',
    38: 'PL Climax Series Final Stage'
}
data.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import polars as pl
2
+ from glob import glob
3
+ import os
4
+ from tqdm.auto import tqdm
5
+
6
+ from convert import aux_global_id_to_code, presult, ball_kind, ball_kind_code, lr, game_kind
7
+
8
# Root folder of the collected parquet files; each season lives in a
# sub-folder named after the year.
DATA_PATH = os.path.expanduser('~/Documents/npb_data_collector/npb')
# Seasons to load, listed explicitly (same values as range(2021, 2025 + 1)).
SEASONS = [2021, 2022, 2023, 2024, 2025]

# Accumulators: each starts empty and grows by one season per loop iteration.
data_df = pl.DataFrame()
text_df = pl.DataFrame()
aux_df = pl.DataFrame()
sched_df = pl.DataFrame()
aux_sched_df = pl.DataFrame()

for season in tqdm(SEASONS):
    season_dir = os.path.join(DATA_PATH, str(season))

    # Pitch-by-pitch data.
    data_df = pl.concat(
        (data_df, pl.read_parquet(os.path.join(season_dir, 'pbp_data.parquet')))
    )

    # Play-by-play text.
    text_df = pl.concat(
        (text_df, pl.read_parquet(os.path.join(season_dir, 'pbp_text.parquet')))
    )

    # Auxiliary play-by-play feed; 'diagonal_relaxed' lets seasons differ in
    # columns and dtypes.
    aux_df = pl.concat(
        (aux_df, pl.read_parquet(os.path.join(season_dir, 'pbp_aux.parquet'))),
        how='diagonal_relaxed',
    )

    # Schedule.
    sched_df = pl.concat(
        (sched_df, pl.read_parquet(os.path.join(season_dir, 'schedule.parquet')))
    )

    # Auxiliary schedule.
    aux_sched_df = pl.concat(
        (aux_sched_df, pl.read_parquet(os.path.join(season_dir, 'aux_schedule.parquet')))
    )
33
+
34
+
35
+ # sched_df = sched_df.
36
+
37
# Normalise the auxiliary play-by-play feed and derive string keys
# ('universal_code', 'inning_code', 'pa_code') built from date, teams,
# inning, half-inning and plate-appearance counters.
aux_df = (
    aux_df
    # Drop rows whose type is 'RUNNER'.
    .filter(pl.col('type') != 'RUNNER')
    # Attach the game date from the auxiliary schedule.
    .join(aux_sched_df[['gameGlobalId', 'gameDate']], on='gameGlobalId')
    .with_columns(
        # Reformat the date as YYYYMMDD.
        pl.col('gameDate').str.to_date().dt.strftime('%Y%m%d'),
        # Replace the home / visitor structs by their short team code.
        pl.col('home').struct.field('globalId').replace_strict(aux_global_id_to_code).alias('home'),
        pl.col('visitor').struct.field('globalId').replace_strict(aux_global_id_to_code).alias('visitor'),
        # '1' for the top of an inning, '2' otherwise.
        pl.when(pl.col('tob') == 'Top').then(pl.lit('1')).otherwise(pl.lit('2')).alias('tob_code'),
    )
    .filter(
        # pl.col('pitch').struct.field('count') > 0

        # either one alone should be enough but let's use them together to be safe
        ~((pl.col('code') == 98) & (pl.col('id') == 1))
    )
    .with_columns(
        # Running count, per half-inning, of rows whose pitch count is 1
        # (presumably the first pitch of each plate appearance).
        (pl.col('pitch').struct.field('count') == 1).cum_sum().over(['gameGlobalId', 'inning', 'tob']).alias('pa_count')
    )
    .with_columns(
        # Flag every row of a plate appearance that contains one of these codes.
        # NOTE(review): presumably intentional-walk event codes -- confirm.
        pl.col('code').is_in([6402, 6404, 6406, 6405]).any().over(['gameGlobalId', 'inning', 'tob', 'pa_count']).alias('ibb')
    )
    .with_columns(
        # Same running count as pa_count, but rows flagged 'ibb' contribute null
        # and therefore do not advance the counter.
        pl.when(~pl.col('ibb')).then(pl.col('pitch').struct.field('count') == 1).cum_sum().over(['gameGlobalId', 'inning', 'tob']).alias('new_pa_count')
    )
    .with_columns(
        # Number of rows in each renumbered plate appearance.
        pl.len().over(['gameGlobalId', 'inning', 'tob', 'new_pa_count']).alias('pa_pitches'),
        # Highest renumbered plate-appearance index in the half-inning.
        pl.max('new_pa_count').over(['gameGlobalId', 'inning', 'tob']).alias('inning_pas')
    )
    .with_columns(
        # Pitch-level key: date_visitor_home_<inning><half><pa>_<pitch count>.
        (
            pl.col('gameDate') + '_' + \
            pl.col('visitor') + '_' + \
            pl.col('home') + '_' + \
            pl.col('inning').str.zfill(2) + pl.when(pl.col('tob') == 'Top').then(pl.lit('1')).otherwise(pl.lit('2')) + pl.col('new_pa_count').cast(pl.String).str.zfill(2) + '_' +\
            pl.col('pitch').struct.field('count').cast(pl.String)
        ).alias('universal_code'),
        # Half-inning key: date_visitor_home_<inning><half>.
        (
            pl.col('gameDate') + '_' + \
            pl.col('visitor') + '_' + \
            pl.col('home') + '_' + \
            pl.col('inning').str.zfill(2) + pl.when(pl.col('tob') == 'Top').then(pl.lit('1')).otherwise(pl.lit('2'))
        ).alias('inning_code'),
        # Plate-appearance key: date_visitor_home_<inning><half><pa>.
        (
            pl.col('gameDate') + '_' + \
            pl.col('visitor') + '_' + \
            pl.col('home') + '_' + \
            pl.col('inning').str.zfill(2) + pl.when(pl.col('tob') == 'Top').then(pl.lit('1')).otherwise(pl.lit('2')) + pl.col('new_pa_count').cast(pl.String).str.zfill(2)
        ).alias('pa_code')
    )
)
88
+
89
# Clean the pitch-by-pitch frame, renumber plate appearances (skipping those
# flagged as intentional walks), build the same string keys as the auxiliary
# feed, join the auxiliary batting result and game kind, and decode the
# numeric columns into labels.
data_df = (
    data_df
    .with_columns(
        # Numeric columns are cast to Int32 before any arithmetic or lookups.
        *[
            pl.col(col).cast(pl.Int32)
            for col
            in ['gameId', 'ballKind', 'ballSpeed', 'x', 'y', 'presult', 'bresult', 'battedX', 'battedY']
        ],
        pl.col('UpdatedAt').str.to_datetime(),
        # fiveDigitSerialNumber = 3 characters of half-inning + 2 characters of batter index.
        pl.col('fiveDigitSerialNumber').str.slice(offset=0, length=3).alias('half_inning'),
        pl.col('fiveDigitSerialNumber').str.slice(offset=3, length=2).alias('batter'),
    )
    .with_columns(
        # Rows per plate appearance whose presult is not 0.
        (~pl.col('presult').is_in([0])).sum().over(['gameId', 'fiveDigitSerialNumber']).alias('pa_pitches'),
        # Flag every row of a plate appearance containing presult 139.
        pl.col('presult').is_in([139]).any().over(['gameId', 'fiveDigitSerialNumber']).alias('ibb')
    )
    .filter(
        (pl.col('pa_pitches') > 0)
    )
    .with_columns(
        # No alias: this overwrites 'batter', nulling it for flagged plate appearances.
        pl.when(~pl.col('ibb')).then(pl.col('batter'))
    )
    .with_columns(
        # Dense-rank the remaining batters within the half-inning to renumber them.
        pl.when(~pl.col('ibb')).then(pl.col('batter').rank('dense')).over(['gameId', 'half_inning']).cast(pl.String).str.zfill(2).alias('new_batter')
    )
    .with_columns(
        (pl.col('half_inning') + pl.col('new_batter')).alias('newFiveDigitSerialNumber')
    )
    # Highest renumbered batter index in the half-inning.
    .with_columns(pl.max('new_batter').cast(pl.Int32).over(['gameId', pl.col('newFiveDigitSerialNumber').str.slice(offset=0, length=3)]).alias('inning_pas'))
    .join(sched_df[['GameID', 'HomeTeamNameES', 'VisitorTeamNameES']].rename({'GameID': 'gameId'}), on='gameId')
    .with_columns(pl.col('UpdatedAt').dt.strftime('%Y%m%d').alias('date'))
    .with_columns(
        # Pitch-level key. The alias is applied to the complete expression,
        # including the ball-count suffix, so the column name used by the join
        # below is explicit.
        (
            pl.col('date') + '_' + pl.col('VisitorTeamNameES') + '_' + pl.col('HomeTeamNameES') + '_'
            + pl.col('newFiveDigitSerialNumber') + '_' + pl.col('atBatBallCount')
        ).alias('universal_code'),
        (pl.col('date') + '_' + pl.col('VisitorTeamNameES') + '_' + pl.col('HomeTeamNameES') + '_' + pl.col('newFiveDigitSerialNumber').str.slice(offset=0, length=3)).alias('inning_code'),
        (pl.col('date') + '_' + pl.col('VisitorTeamNameES') + '_' + pl.col('HomeTeamNameES') + '_' + pl.col('newFiveDigitSerialNumber')).alias('pa_code')
    )
    # Bring in the auxiliary batting result and counters for unflagged rows.
    .join(
        (
            aux_df.filter(~pl.col('ibb'))[['universal_code', 'battingResult', 'inning_pas', 'pa_pitches']]
            .rename({'battingResult': 'aux_bresult', 'inning_pas': 'aux_inning_pas', 'pa_pitches': 'aux_pa_pitches'})
        ),
        on='universal_code',
        how='left'
    )
    # Attach the game kind id, one row per game after de-duplication.
    .join(
        text_df[['GameID', 'GameKindID']].with_columns(
            pl.col('GameID').cast(pl.Int32),
            pl.col('GameKindID').cast(pl.Int32),
        ).unique(),
        how='left',
        left_on='gameId',
        right_on='GameID'
    )
    .with_columns(pl.col('GameKindID').replace_strict(game_kind).alias('GameKindName'))
    .with_columns(
        # Keep the auxiliary result only when both feeds agree on the number of
        # plate appearances in the half-inning and pitches in the plate
        # appearance; otherwise null. The string passed to then() is parsed
        # as a column name.
        pl.when((pl.col('inning_pas') == pl.col('aux_inning_pas')) & (pl.col('pa_pitches') == pl.col('aux_pa_pitches')))
        .then('aux_bresult')
        .alias('aux_bresult'),

        # Coordinate transforms: x -> 100 - x, y -> 250 - y.
        pl.col('x').add(-100).mul(-1),
        pl.col('y').neg().add(250),
        # Decode numeric ids into labels; replace_strict raises on unmapped values.
        pl.col('presult').replace_strict(presult),
        pl.col('ballKind').replace_strict(ball_kind),
        pl.col('ballKind').replace_strict(ball_kind_code).alias('ballKind_code'),
        pl.col('batLR').replace_strict(lr),

        # Coarse bucket: regular-season and interleague games -> 'Regular Season';
        # anything other than spring training / all-star -> 'Postseason';
        # those two keep their own name (string parsed as a column name).
        pl.when(pl.col('GameKindName').str.contains('Regular Season') | (pl.col('GameKindName') == 'Interleague'))
        .then(pl.lit('Regular Season'))
        .when(~pl.col('GameKindName').is_in(['Spring Training', 'All-Star Game']))
        .then(pl.lit('Postseason'))
        .otherwise('GameKindName')
        .alias('coarse_game_kind')
    )
)
pitcher_overview.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ from data import SEASONS
4
+
5
def dummy(*inputs):
    """Placeholder callback: hand the received values back unchanged, as a tuple."""
    echoed = inputs
    return echoed
7
+
8
def adjust_season_end_based_on_season_start(season_start, season_end):
    """Return the season-end value that keeps the range valid (end >= start).

    Args:
        season_start: Currently selected first season, or None if unset.
        season_end: Currently selected last season, or None if unset.

    Returns:
        ``season_end`` raised to ``season_start`` when it is smaller. When either
        value is None, ``season_end`` is returned unchanged because there is
        nothing to reconcile and ``max`` cannot compare None with a number.
    """
    if season_start is None or season_end is None:
        return season_end
    return max(season_start, season_end)
10
+
11
def adjust_season_start_based_on_season_end(season_end, season_start):
    """Return the season-start value that keeps the range valid (start <= end).

    Args:
        season_end: Currently selected last season, or None if unset.
        season_start: Currently selected first season, or None if unset.

    Returns:
        ``season_start`` lowered to ``season_end`` when it is larger. When either
        value is None, ``season_start`` is returned unchanged because there is
        nothing to reconcile and ``min`` cannot compare None with a number.
    """
    if season_start is None or season_end is None:
        return season_start
    return min(season_start, season_end)
13
+
14
+
15
def create_pitcher_overview(data_df):
    """Build the pitcher overview UI and return it as a Gradio Blocks object.

    Args:
        data_df: Frame with a 'pitId' column; its unique values, sorted,
            populate the 'Name' dropdown.

    Returns:
        The ``gr.Blocks`` container holding the controls.
    """
    with gr.Blocks() as app:
        gr.Markdown('Test')

        name = gr.Dropdown(sorted(data_df['pitId'].unique().to_list()), label='Name')
        season_start = gr.Dropdown(SEASONS, label='Season start')
        season_end = gr.Dropdown(SEASONS, label='Season end')

        # Keep the range consistent: changing one bound adjusts the other.
        season_start.input(adjust_season_end_based_on_season_start, inputs=[season_start, season_end], outputs=season_end)
        season_end.input(adjust_season_start_based_on_season_end, inputs=[season_end, season_start], outputs=season_start)

        game_type = gr.Dropdown(['Spring Training', 'Regular Season', 'Postseason'], label='Game Type')

        generate = gr.Button('Generate')

        # Placeholder wiring: the button echoes the current selections back
        # into the same components.
        dummy_io = [name, season_start, season_end, game_type]
        generate.click(dummy, inputs=dummy_io, outputs=dummy_io)

    return app
34
+
35
if __name__ == '__main__':
    # create_pitcher_overview needs the play-by-play frame. The `data` module is
    # already imported at the top of this file for SEASONS, so this local import
    # only binds the extra name when the file is run as a script.
    from data import data_df

    create_pitcher_overview(data_df).launch()