Spaces:
Running
Running
Commit
·
b421bc5
1
Parent(s):
0ed953a
Add working app
Browse files- README.md +4 -4
- convert.py +30 -0
- data.py +67 -6
- pitcher_overview.py +54 -17
- plotting.py +396 -0
README.md
CHANGED
@@ -1,11 +1,11 @@
|
|
1 |
---
|
2 |
title: Npb Data App
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
sdk_version: 5.37.0
|
8 |
-
app_file:
|
9 |
pinned: false
|
10 |
---
|
11 |
|
|
|
1 |
---
|
2 |
title: Npb Data App
|
3 |
+
emoji: ⚾️
|
4 |
+
colorFrom: white
|
5 |
+
colorTo: blue
|
6 |
sdk: gradio
|
7 |
sdk_version: 5.37.0
|
8 |
+
app_file: pitcher_overview.py
|
9 |
pinned: false
|
10 |
---
|
11 |
|
convert.py
CHANGED
@@ -257,3 +257,33 @@ game_kind = {
|
|
257 |
37: 'PL Climax Series First Stage',
|
258 |
38: 'PL Climax Series Final Stage'
|
259 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
257 |
37: 'PL Climax Series First Stage',
|
258 |
38: 'PL Climax Series Final Stage'
|
259 |
}
|
260 |
+
|
261 |
+
# Colour used when drawing each pitch-type code. An empty string marks a code
# that has not been given a dedicated colour yet.
ball_kind_code_to_color = {
    '-': '',
    'FF': 'crimson',
    'SL': 'gold',
    'VS': '',
    'SV': '',
    'CU': 'paleturquoise',
    'SC': 'royalblue',
    'PC': '',
    'KC': 'rebeccapurple',
    'FO': 'darkturquoise',
    'FS': 'cadetblue',
    'CH': 'mediumseagreen',
    'SI': '',
    'SB': '',
    'PB': '',
    'SH': 'tomato',
    'FT': '',
    'FW': '',
    'FC': 'sienna',
    'EP': '',  # technically "super" eephus but I haven't encountered a normal one yet
    'HS': '',
    'HL': '',
}
# Every code without a dedicated colour falls back to 'C0' so lookups never
# return an empty colour string.
ball_kind_code_to_color = {k: v if v else 'C0' for k, v in ball_kind_code_to_color.items()}
|
286 |
+
def get_text_color_from_color(color):
    """Return the text colour to draw on top of a ``color`` background.

    'gold' and 'paleturquoise' get black text; every other background gets
    white text.
    """
    if color in ('gold', 'paleturquoise'):
        return 'black'
    return 'white'
|
data.py
CHANGED
@@ -1,13 +1,19 @@
|
|
1 |
import polars as pl
|
2 |
import os
|
3 |
from tqdm.auto import tqdm
|
|
|
|
|
4 |
|
5 |
from convert import aux_global_id_to_code, presult, ball_kind, ball_kind_code, lr, game_kind
|
6 |
|
7 |
-
|
8 |
-
|
|
|
|
|
|
|
|
|
|
|
9 |
SEASONS = [2021, 2022, 2023, 2024, 2025]
|
10 |
-
# SEASONS = [2024]
|
11 |
|
12 |
data_df = pl.DataFrame()
|
13 |
text_df = pl.DataFrame()
|
@@ -31,8 +37,60 @@ for season in tqdm(SEASONS):
|
|
31 |
_aux_sched_df = pl.read_parquet(os.path.join(DATA_PATH, str(season), 'aux_schedule.parquet'))
|
32 |
aux_sched_df = pl.concat((aux_sched_df, _aux_sched_df))
|
33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
|
37 |
aux_df = (
|
38 |
aux_df
|
@@ -131,8 +189,8 @@ data_df = (
|
|
131 |
on='universal_code',
|
132 |
how='left'
|
133 |
)
|
134 |
-
.
|
135 |
-
|
136 |
)
|
137 |
.join(
|
138 |
text_df[['GameID', 'GameKindID']].with_columns(
|
@@ -176,3 +234,6 @@ data_df = (
|
|
176 |
(pl.col('whiff') | pl.col('presult').is_in(['Looking strike', 'Uncaught third strike', 'Looking strikeout'])).alias('csw')
|
177 |
)
|
178 |
)
|
|
|
|
|
|
|
|
1 |
import polars as pl
|
2 |
import os
|
3 |
from tqdm.auto import tqdm
|
4 |
+
import pykakasi
|
5 |
+
from huggingface_hub import snapshot_download
|
6 |
|
7 |
from convert import aux_global_id_to_code, presult, ball_kind, ball_kind_code, lr, game_kind
|
8 |
|
9 |
+
|
10 |
+
# Fetch a snapshot of the dataset repo; every parquet file read in this module
# is resolved relative to the directory returned here.
DATA_PATH = snapshot_download(
    repo_id='Ramos-Ramos/npb_data_app',
    repo_type='dataset',
    local_dir='./files',
    cache_dir='./.cache'
)
|
16 |
SEASONS = [2021, 2022, 2023, 2024, 2025]
|
|
|
17 |
|
18 |
data_df = pl.DataFrame()
|
19 |
text_df = pl.DataFrame()
|
|
|
37 |
_aux_sched_df = pl.read_parquet(os.path.join(DATA_PATH, str(season), 'aux_schedule.parquet'))
|
38 |
aux_sched_df = pl.concat((aux_sched_df, _aux_sched_df))
|
39 |
|
40 |
+
# --- Attach a romanized name (`name_en`) to every player -------------------
players_df = pl.read_parquet(os.path.join(DATA_PATH, 'players.parquet'))
kana_df = pl.read_parquet(os.path.join(DATA_PATH, 'players_kana.parquet'))

# Romanize the kana reading of each name with pykakasi (Hepburn), then
# lowercase it so the substitutions below can use plain lowercase patterns.
kks = pykakasi.kakasi()
kana_df = (
    kana_df
    .with_columns(
        pl.col('name').str.normalize('NFKC'),
        (
            pl.col('name_kana')
            .map_elements(
                lambda name: ''.join([word['hepburn'].capitalize() for word in kks.convert(name)]),
                return_dtype=pl.String
            )
            .alias('name_en')
        )
    )
    .with_columns(pl.col('name_en').str.to_lowercase())
)

# Shorten long vowels. Order matters: the specific patterns run before the
# generic 'ou' -> 'oh' rule.
# NOTE(review): `str.replace` only rewrites the FIRST match in each name, so a
# name containing the same pattern twice is only partly rewritten -- confirm
# whether `str.replace_all` was intended.
for old_part, new_part in [
    ('you', 'yo'),
    ('kou', 'ko'),
    ('gou', 'go'),
    ('shou', 'sho'),
    ('jou', 'jo'),
    ('rou', 'ro'),
    ('ou', 'oh'),
    ('shuu', 'shu'),
    ('ryuu', 'ryu'),
    ('yuu', 'yu'),
    ('oo', 'o')  # messes with someone whose name ends in koo
]:
    kana_df = kana_df.with_columns(pl.col('name_en').str.replace(old_part, new_part))

kana_df = kana_df.with_columns(pl.col('name_en').str.to_titlecase())

# The two sources can spell the same name with different kanji variants. For
# names that do not already match a kana_df entry, swap in the variant
# character and leave already-matching names untouched.
players_df = players_df.with_columns(pl.col('playerName').str.normalize('NFKC'))
for old_char, new_char in [
    ('崎', '﨑'),
    ('高', '髙'),
    ('徳', '德'),
    ('濱', '濵'),
    ('瀬', '瀨')
]:
    players_df = (
        players_df.with_columns(
            pl.when(~pl.col('playerName').is_in(kana_df['name']))
            .then(pl.col('playerName').str.replace(old_char, new_char))
            .otherwise(pl.col('playerName'))
        )
    )

# Left join: players without a kana entry keep a null `name_en`.
players_df = players_df.join(kana_df, left_on='playerName', right_on='name', how='left')
|
94 |
|
95 |
aux_df = (
|
96 |
aux_df
|
|
|
189 |
on='universal_code',
|
190 |
how='left'
|
191 |
)
|
192 |
+
.join(
|
193 |
+
players_df.rename({'name_en': 'pitcher_name'}), left_on='pitId', right_on='playerId', how='left'
|
194 |
)
|
195 |
.join(
|
196 |
text_df[['GameID', 'GameKindID']].with_columns(
|
|
|
234 |
(pl.col('whiff') | pl.col('presult').is_in(['Looking strike', 'Uncaught third strike', 'Looking strikeout'])).alias('csw')
|
235 |
)
|
236 |
)
|
237 |
+
|
238 |
+
if __name__ == '__main__':
    # Debug hook: running this module directly drops into the debugger once
    # all frames are built, so they can be inspected interactively.
    breakpoint()
|
pitcher_overview.py
CHANGED
@@ -1,36 +1,73 @@
|
|
1 |
import gradio as gr
|
|
|
|
|
2 |
|
3 |
-
from data import SEASONS
|
|
|
|
|
4 |
|
5 |
def dummy(*inputs):
|
6 |
return inputs
|
7 |
|
8 |
-
def
|
9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
-
def
|
12 |
-
return
|
|
|
|
|
|
|
13 |
|
14 |
|
15 |
def create_pitcher_overview(data_df):
|
16 |
with gr.Blocks() as app:
|
17 |
-
gr.Markdown('
|
18 |
|
19 |
-
|
20 |
-
|
21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
-
|
24 |
-
|
|
|
|
|
|
|
|
|
25 |
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
|
30 |
-
|
31 |
-
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
return app
|
34 |
|
35 |
if __name__ == '__main__':
|
36 |
-
create_pitcher_overview().launch()
|
|
|
1 |
import gradio as gr
|
2 |
+
import matplotlib.pyplot as plt
|
3 |
+
import polars as pl
|
4 |
|
5 |
+
from data import SEASONS, data_df
|
6 |
+
|
7 |
+
from plotting import create_pitcher_overview_card
|
8 |
|
9 |
def dummy(*inputs):
    """Return the positional arguments unchanged, as a tuple."""
    return inputs
|
11 |
|
12 |
+
def gr_create_pitcher_overview_card(name, season):
    """Gradio handler: render a pitcher's overview card to a PNG file.

    Args:
        name: Value of the "Name" dropdown (a `pitcher_name` from `data_df`).
        season: Value of the "Season" dropdown.

    Returns:
        Path of the PNG file containing the rendered card.

    Raises:
        gr.Error: If either dropdown is empty, or `name` maps to zero or to
            more than one pitcher ID.
    """
    import tempfile

    # An unselected dropdown arrives as None; fail with a readable message
    # instead of an opaque error further down.
    if name is None or season is None:
        raise gr.Error('Please select both a name and a season.')

    pit_id = data_df.filter(pl.col('pitcher_name') == name)['pitId'].unique()
    if len(pit_id) == 0:
        raise gr.Error(f"No data found for {name}. If the name looks strangely spelled or formatted there's a possibility that's what's causing the error.")
    if len(pit_id) > 1:
        raise gr.Error(f'Multiple IDs for {name}')

    fig = create_pitcher_overview_card(pit_id.item(), season=season, dpi=300)
    try:
        # A unique file per request: a fixed 'tmp.png' would be overwritten by
        # concurrent users between savefig and Gradio reading it back.
        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
            out_path = tmp.name
        fig.savefig(out_path, bbox_inches='tight')
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)
    return out_path
|
24 |
|
25 |
+
# def adjust_season_end_based_on_season_start(season_start, season_end):
|
26 |
+
# return max(season_start, season_end)
|
27 |
+
#
|
28 |
+
# def adjust_season_start_based_on_season_end(season_end, season_start):
|
29 |
+
# return min(season_start, season_end)
|
30 |
|
31 |
|
32 |
def create_pitcher_overview(data_df):
    """Build the "Pitcher overview" Gradio app.

    Args:
        data_df: Frame with a `pitcher_name` column; its distinct non-null
            values populate the name dropdown.

    Returns:
        The `gr.Blocks` app (not yet launched).
    """
    with gr.Blocks() as app:
        gr.Markdown('Pitcher overview')

        with gr.Row():
            with gr.Column():
                # Pitchers without a romanized name have a null pitcher_name;
                # drop them so the dropdown has no empty choice.
                names = data_df['pitcher_name'].drop_nulls().unique().sort().to_list()
                name = gr.Dropdown(names, label='Name')
                season = gr.Dropdown(SEASONS, label='Season')
                view = gr.Button('View')
                gr.Markdown(
                    '''
                    **Limitations**
                    - Foreign players' names are in Hepburn romanization. Contact me if you need a card for a foreign player.

                    **To-do**
                    - Fix names of foreign players
                    - Add team insignias
                    - Measure percentiles per pitcher handedness
                    - Allow for arbitrary date ranges
                    - Improve readability of pitch velocities

                    Last updated: 2025-07-19
                    '''
                )

            with gr.Column():
                overview_card = gr.Image(label='Overview')

        view.click(gr_create_pitcher_overview_card, inputs=[name, season], outputs=overview_card)

    return app
|
71 |
|
72 |
if __name__ == '__main__':
    # Entry point named by `app_file` in the Space's README front matter.
    create_pitcher_overview(data_df).launch()
|
plotting.py
ADDED
@@ -0,0 +1,396 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import matplotlib as mpl
|
2 |
+
import matplotlib.pyplot as plt
|
3 |
+
from matplotlib import transforms
|
4 |
+
from matplotlib.colors import LinearSegmentedColormap
|
5 |
+
import polars as pl
|
6 |
+
from pyfonts import load_google_font
|
7 |
+
from scipy.stats import gaussian_kde
|
8 |
+
import numpy as np
|
9 |
+
|
10 |
+
from types import SimpleNamespace
|
11 |
+
from datetime import date
|
12 |
+
|
13 |
+
from data import data_df
|
14 |
+
from convert import ball_kind_code_to_color, get_text_color_from_color
|
15 |
+
|
16 |
+
# Select matplotlib's 'Agg' backend before any figure is created.
mpl.use('Agg')
|
17 |
+
|
18 |
+
def compute_team_games(data):
    """Annotate every row with the total game count of both teams involved.

    On return, `home_games` holds the number of distinct `gameId`s in `data`
    played by the row's home team (home and away combined), and
    `visitor_games` holds the same for the visiting team.

    All joins are inner joins: a team that appears only as home or only as
    visitor in `data` has no total, and its rows are dropped.
    """
    # Step 1: per-venue counts (home games per home team, away games per
    # visiting team).
    data = (
        data
        .with_columns(
            pl.col('gameId').unique().len().over('HomeTeamNameES').alias('home_games'),
            pl.col('gameId').unique().len().over('VisitorTeamNameES').alias('visitor_games')
        )
    )

    # Step 2: one row per team with home + away = total games.
    game_data = (
        data
        .group_by('HomeTeamNameES')
        .first()
        [['HomeTeamNameES', 'home_games']]
        .rename({'HomeTeamNameES': 'team'})
        .join(
            (
                data
                .group_by('VisitorTeamNameES')
                .first()
                [['VisitorTeamNameES', 'visitor_games']]
                .rename({'VisitorTeamNameES': 'team'})
            ),
            on='team',
        )
        .with_columns((pl.col('home_games')+pl.col('visitor_games')).alias('games'))
    )

    # Step 3: replace the per-venue counts with the per-team totals.
    return (
        data
        .drop('home_games', 'visitor_games')
        .join(
            game_data[['team', 'games']].rename({'games': 'home_games'}),
            left_on='HomeTeamNameES',
            right_on='team'
        )
        .join(
            game_data[['team', 'games']].rename({'games': 'visitor_games'}),
            left_on='VisitorTeamNameES',
            right_on='team'
        )
    )
|
59 |
+
|
60 |
+
|
61 |
+
def get_pitcher_stats(id, lr=None, game_kind=None, start_date=None, end_date=None, min_ip=1, min_pitches=1):
    """Compute per-pitch-type and overall stats for one pitcher.

    Percentiles are ranked against every pitcher left in the filtered data;
    only the rows for `id` are returned.

    Args:
        id: `pitId` of the pitcher to return.
        lr: If given, keep only pitches whose `batLR` equals this value.
        game_kind: If given, keep only rows with this `coarse_game_kind`.
        start_date: If given, inclusive lower bound on `date`.
        end_date: If given, inclusive upper bound on `date`.
        min_ip: Minimum `IP` for a pitcher to be marked qualified, or the
            string 'qualified' to require `IP` >= the team's game count.
            `IP` here is the number of distinct `inning_code`s the pitcher
            appears in.
        min_pitches: Minimum pitch count for a pitch type to be qualified.

    Returns:
        SimpleNamespace with `pitcher_stats` (one row per pitcher),
        `pitch_stats` (one row per pitch type, most used first) and
        `pitch_shapes` (per-pitch `ballSpeed`, `x`, `y` rows).
    """
    # '-' is the placeholder code for rows without a pitch type.
    source_data = data_df.filter(pl.col('ballKind_code') != '-')

    if start_date is not None:
        source_data = source_data.filter(pl.col('date') >= start_date)
    if end_date is not None:
        source_data = source_data.filter(pl.col('date') <= end_date)

    if game_kind is not None:
        source_data = source_data.filter(pl.col('coarse_game_kind') == game_kind)

    source_data = (
        compute_team_games(source_data)
        .with_columns(
            # Game count of the pitcher's own team.
            # NOTE(review): presumably a half_inning ending in '1' means the
            # home team is pitching -- confirm against the data.
            pl.when(pl.col('half_inning').str.ends_with('1')).then(pl.col('home_games')).otherwise(pl.col('visitor_games')).first().over('pitId').alias('games'),
            pl.col('inning_code').unique().len().over('pitId').alias('IP')
        )
    )

    if min_ip == 'qualified':
        source_data = source_data.with_columns((pl.col('IP') >= pl.col('games')).alias('qualified'))
    else:
        source_data = source_data.with_columns((pl.col('IP') >= min_ip).alias('qualified'))

    # The handedness filter comes after `IP`/`qualified`, so qualification is
    # based on all batters faced, not just one side.
    if lr is not None:
        source_data = source_data.filter(pl.col('batLR') == lr)

    # The boolean `pitch`, `whiff`, `swing` and `csw` columns used below are
    # expected to exist on `data_df` already.
    pitch_stats = (
        source_data
        .group_by('pitId', 'ballKind_code')
        .agg(
            pl.len().alias('count'),
            pl.col('aux_bresult').struct.field('batType').drop_nulls().value_counts(normalize=True),
            (pl.col('whiff').sum() / pl.col('pitch').sum()).alias('SwStr%'),
            (pl.col('whiff').sum() / pl.col('swing').sum()).alias('Whiff%'),
            (pl.col('csw').sum() / pl.col('pitch').sum()).alias('CSW%')
        )
        .with_columns(
            (pl.col('count')/pl.sum('count').over('pitId')).alias('usage'),
            (pl.col('count') >= min_pitches).alias('qualified')
        )
        # Turn the batted-ball-type proportions into one column per type.
        .explode('batType')
        .unnest('batType')
        .pivot(on='batType', values='proportion')
        .fill_null(0)
        .with_columns(
            (pl.col('G') + pl.col('B')).alias('GB%'),
            (pl.col('F') + pl.col('P')).alias('FB%'),
            pl.col('L').alias('LD%').round(2),
        )
        .drop('G', 'F', 'B', 'P', 'L')
        # Percentile = rank among qualified rows / number of qualified rows.
        # NOTE(review): ranks are taken across all pitch types together, not
        # per pitch type -- confirm this is intended.
        .with_columns(
            (pl.when(pl.col('qualified')).then(pl.col(stat)).rank()/pl.when(pl.col('qualified')).then(pl.col(stat)).count()).alias(f'{stat}_pctl')
            for stat in ['SwStr%', 'Whiff%', 'CSW%', 'GB%']
        )
        .sort('pitId', 'count', descending=[False, True])
        .filter(pl.col('pitId') == id)
    )

    pitch_shapes = (
        source_data
        .filter(
            (pl.col('pitId') == id) &
            pl.col('x').is_not_null() &
            pl.col('y').is_not_null() &
            (pl.col('ballSpeed') > 0)
        )
        [['pitId', 'ballKind_code', 'ballSpeed', 'x', 'y']]
    )

    pitcher_stats = (
        source_data
        .group_by('pitId')
        .agg(
            pl.col('pitcher_name').first(),
            (pl.when(pl.col('presult').str.contains('strikeout')).then(1).otherwise(0).sum() / pl.col('pa_code').unique().len()).alias('K%'),
            (pl.when(pl.col('presult') == 'Walk').then(1).otherwise(0).sum() / pl.col('pa_code').unique().len()).alias('BB%'),
            (pl.col('csw').sum() / pl.col('pitch').sum()).alias('CSW%'),
            pl.col('aux_bresult').struct.field('batType').drop_nulls().value_counts(normalize=True),
            pl.first('qualified')
        )
        .explode('batType')
        .unnest('batType')
        .pivot(on='batType', values='proportion')
        .fill_null(0)
        .with_columns(
            (pl.col('G') + pl.col('B')).alias('GB%'),
            (pl.col('F') + pl.col('P')).alias('FB%'),
            pl.col('L').alias('LD%'),
        )
        .drop('G', 'F', 'B', 'P', 'L')
        # BB% is ranked descending so that fewer walks gives a higher percentile.
        .with_columns(
            (pl.when(pl.col('qualified')).then(pl.col(stat)).rank(descending=(stat == 'BB%'))/pl.when(pl.col('qualified')).then(pl.col(stat)).count()).alias(f'{stat}_pctl')
            for stat in ['CSW%', 'K%', 'BB%', 'GB%']
        )
        .filter(pl.col('pitId') == id)
    )

    return SimpleNamespace(pitcher_stats=pitcher_stats, pitch_stats=pitch_stats, pitch_shapes=pitch_shapes)
|
169 |
+
|
170 |
+
|
171 |
+
def get_card_data(id, **kwargs):
    """Gather a pitcher's stats overall and split by batter handedness.

    Calls `get_pitcher_stats` three times (no `lr` filter, 'l', 'r') with the
    same `kwargs` and merges the results. Columns from the handedness splits
    carry a `_left` / `_right` suffix.

    Returns:
        SimpleNamespace with `pitcher_stats`, `pitch_stats`,
        `both_pitch_shapes`, `left_pitch_shapes` and `right_pitch_shapes`.
    """
    both = get_pitcher_stats(id, **kwargs)
    left = get_pitcher_stats(id, 'l', **kwargs)
    right = get_pitcher_stats(id, 'r', **kwargs)

    # Inner joins: the result is empty if any of the three views has no row
    # for this pitcher.
    pitcher_stats = (
        both.pitcher_stats
        .join(left.pitcher_stats, on='pitId', suffix='_left')
        .join(right.pitcher_stats, on='pitId', suffix='_right')
    )
    # Full joins keep pitch types missing from one split; their stats are
    # filled with 0.
    pitch_stats = (
        both.pitch_stats
        .join(left.pitch_stats, on='ballKind_code', how='full', suffix='_left')
        .join(right.pitch_stats, on='ballKind_code', how='full', suffix='_right')
        .fill_null(0)
    )
    return SimpleNamespace(
        pitcher_stats=pitcher_stats,
        pitch_stats=pitch_stats,
        both_pitch_shapes=both.pitch_shapes,
        left_pitch_shapes=left.pitch_shapes,
        right_pitch_shapes=right.pitch_shapes
    )
|
182 |
+
|
183 |
+
|
184 |
+
def plot_arsenal(ax, pitches):
    """Draw one coloured, labelled dot per pitch type along a row.

    Args:
        ax: Matplotlib axes to draw on.
        pitches: Sequence of pitch-type codes, drawn left to right.
    """
    ax.set_xlim(0, 11)
    x = np.arange(len(pitches)) + 0.5
    y = np.zeros(len(pitches))
    # Look each colour up once and reuse it for the dot and its label.
    colors = [ball_kind_code_to_color.get(pitch, 'C0') for pitch in pitches]
    ax.scatter(x, y, c=colors, s=170)
    for x_pos, pitch, color in zip(x, pitches, colors):
        ax.text(x=x_pos, y=0, s=pitch, horizontalalignment='center', verticalalignment='center', font=font, color=get_text_color_from_color(color))
|
192 |
+
|
193 |
+
|
194 |
+
def plot_usage(ax, usages):
    """Draw pitch usage as a single horizontal stacked bar.

    Args:
        ax: Matplotlib axes to draw on.
        usages: Two-column frame of (pitch code, usage fraction) rows.
    """
    left = 0
    height = 0.8
    for pitch, usage in usages.iter_rows():
        # Same fallback as the other plot helpers, so an unmapped code cannot
        # raise KeyError here.
        color = ball_kind_code_to_color.get(pitch, 'C0')
        ax.barh(0, usage, height=height, left=left, color=color)
        # Only label segments wide enough to hold the text.
        if usage > 0.1:
            ax.text(left+usage/2, 0, f'{usage:.0%}', horizontalalignment='center', verticalalignment='center', size=8, font=font, color=get_text_color_from_color(color))
        left += usage
    ax.set_xlim(0, 1)
    # Extra headroom above the bar leaves space for the panel caption.
    ax.set_ylim(-height/2, height/2*2.75)
|
205 |
+
|
206 |
+
|
207 |
+
# Evaluation grid for pitch-location densities: integer steps over
# x in [-100, 100] and y in [0, 250].
x_range = np.arange(-100, 100+1)
y_range = np.arange(0, 250+1)
X, Y = np.meshgrid(x_range, y_range)


def fit_pred_kde(data):
    """Fit a Gaussian KDE to `data` and evaluate it on the (X, Y) grid.

    Args:
        data: Array-like of shape (2, n): x coordinates in the first row,
            y coordinates in the second.

    Returns:
        Array with the same shape as `X` holding the estimated density at
        every grid point.
    """
    kde = gaussian_kde(data)
    # Stack the flattened grids into the (2, n_points) layout gaussian_kde
    # expects. np.vstack is used because np.concat only exists in NumPy >= 2.0.
    grid_points = np.vstack((X.ravel(), Y.ravel()))
    Z = kde(grid_points).reshape(X.shape)
    return Z
|
216 |
+
|
217 |
+
|
218 |
+
def plot_loc(ax, locs):
    """Draw a location chart with one density contour per pitch type.

    Args:
        ax: Matplotlib axes to draw on.
        locs: Frame with `ballKind_code`, `x` and `y` columns, one row per
            pitch.
    """
    ax.set_aspect('equal', adjustable='datalim')
    ax.set_ylim(-52, 252)
    # Background: nested rectangles, plus a pentagon below them.
    # NOTE(review): presumably the zone and home plate -- confirm.
    ax.add_patch(plt.Rectangle((-100, 0), width=200, height=250, facecolor='darkgray', edgecolor='dimgray'))
    ax.add_patch(plt.Rectangle((-80, 25), width=160, height=200, facecolor='gainsboro', edgecolor='dimgray'))
    ax.add_patch(plt.Rectangle((-60, 50), width=120, height=150, fill=False, edgecolor='yellowgreen', linestyle=':'))
    ax.add_patch(plt.Rectangle((-40, 75), width=80, height=100, facecolor='ivory', edgecolor='darkgray'))
    ax.add_patch(plt.Polygon([(0, -10), (45, -30), (51, -50), (-51, -50), (-45, -30), (0, -10)], facecolor='snow', edgecolor='darkgray'))

    # Most-thrown pitch types are drawn first, so rarer ones end up on top.
    for (pitch,), _locs in locs.sort(pl.len().over('ballKind_code'), descending=True).group_by('ballKind_code', maintain_order=True):
        if len(_locs) <= 2:
            continue

        try:
            Z = fit_pred_kde(_locs[['x', 'y']].to_numpy().T)
        except np.linalg.LinAlgError:
            # Degenerate locations (e.g. all identical or collinear) give a
            # singular covariance; skip this pitch type rather than failing
            # the whole card.
            continue
        Z = Z / Z.sum()

        # Density threshold `t` such that grid cells with density >= t hold
        # roughly 68% of the total mass: walk the densities in ascending
        # order until the cumulative mass is closest to 1 - 0.68.
        sorted_Z = np.sort(Z.ravel())
        t = sorted_Z[np.argmin(np.abs(sorted_Z.cumsum() - (1-0.68)))]

        color = ball_kind_code_to_color.get(pitch, 'C0')
        ax.contourf(X, Y, Z, levels=[t, 1], colors=color, alpha=0.5)
        ax.contour(X, Y, Z, levels=t.reshape(1), colors=color, alpha=0.75)
|
242 |
+
|
243 |
+
|
244 |
+
def plot_velo(ax, velos):
    """Draw one horizontal half-violin of `ballSpeed` per pitch type.

    Args:
        ax: Matplotlib axes to draw on.
        velos: Frame with `ballKind_code` and `ballSpeed` columns, one row
            per pitch.
    """
    # x in data coordinates, y in axes coordinates: each label sits at its
    # mean speed, vertically centred in the panel.
    trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)
    for (pitch,), _velos in velos.group_by('ballKind_code'):
        if len(_velos) <= 1:
            continue

        violin = ax.violinplot(_velos['ballSpeed'], orientation='horizontal', side='high', showextrema=False)
        color = ball_kind_code_to_color.get(pitch, 'C0')
        for _violin in violin['bodies']:
            _violin.set_facecolor(color)
        mean = _velos['ballSpeed'].mean()
        ax.text(mean, 0.5, round(mean), horizontalalignment='center', verticalalignment='center', color='gray', alpha=0.75, font=font, transform=trans)
|
255 |
+
|
256 |
+
|
257 |
+
# Colormap running dodgerblue -> snow -> crimson; the stat tables index it by
# percentile to shade qualified cells.
stat_cmap = LinearSegmentedColormap.from_list('stat', colors=['dodgerblue', 'snow', 'crimson'])
|
258 |
+
|
259 |
+
|
260 |
+
def plot_pitch_stats(ax, stats, stat_names):
    """Draw a table of per-pitch-type stats, shaded by percentile.

    Layout: one column per pitch type with a coloured header, one row per
    stat. Unqualified pitch types are greyed out instead of shaded.

    Args:
        ax: Matplotlib axes to draw on.
        stats: Frame with one row per pitch type, containing
            `ballKind_code`, `qualified`, and `<stat>` / `<stat>_pctl`
            columns for every name in `stat_names`.
        stat_names: Stat column names to show, top to bottom.
    """
    ax.set_aspect('equal', adjustable='datalim')

    table = mpl.table.Table(ax)
    rows = len(stat_names) + 1
    cols = len(stats) + 1

    cell_height = 1/rows
    cell_width = 1/cols

    # First column: stat names.
    for row, stat in enumerate(stat_names, start=1):
        table.add_cell(row=row, col=0, width=cell_width, height=cell_height, text=stat, loc='center', fontproperties=font, edgecolor='white')

    for col, pitch in enumerate(stats['ballKind_code'], start=1):
        # Header cell in the pitch type's own colour.
        color = ball_kind_code_to_color.get(pitch, 'C0')
        cell = table.add_cell(row=0, col=col, width=cell_width, height=cell_height, text=pitch, loc='center', fontproperties=font, facecolor=color, edgecolor='white')
        cell.get_text().set_color(get_text_color_from_color(color))

        _stats = stats.filter(pl.col('ballKind_code') == pitch)
        qualified = _stats['qualified'].item()
        for row, stat_name in enumerate(stat_names, start=1):
            stat = _stats[stat_name].item()
            stat_pctl = _stats[f'{stat_name}_pctl'].item()
            # The colormap is called on a list so the percentile is always
            # treated as a float position in [0, 1]; element 1 is its colour.
            cell = table.add_cell(row=row, col=col, width=cell_width, height=cell_height, text=f'{stat:.0%}', loc='center', fontproperties=font, facecolor=(stat_cmap([0, stat_pctl, 1])[1] if qualified else 'gainsboro'), edgecolor='white')
            if not qualified:
                cell.get_text().set_color('gray')
    ax.add_artist(table)
|
292 |
+
|
293 |
+
|
294 |
+
def plot_pitcher_stats(ax, stats, stat_names):
    """Draw a one-row table of overall pitcher stats, shaded by percentile.

    Each stat takes two adjacent cells: its name, then its value. An
    unqualified pitcher's values are greyed out instead of shaded.

    Args:
        ax: Matplotlib axes to draw on.
        stats: Single-row frame containing `qualified` and `<stat>` /
            `<stat>_pctl` columns for every name in `stat_names`.
        stat_names: Stat column names to show, left to right.
    """
    ax.set_aspect('equal', adjustable='datalim')

    table = mpl.table.Table(ax)

    cell_height = 1
    cell_width = 1/(len(stat_names)*2)

    qualified = stats['qualified'].item()

    for i, stat_name in enumerate(stat_names):
        stat = stats[stat_name].item()
        stat_pctl = stats[f'{stat_name}_pctl'].item()

        table.add_cell(row=0, col=i*2, width=cell_width, height=cell_height, text=stat_name, loc='center', fontproperties=font, edgecolor='white')
        # The colormap is called on a list so the percentile is always
        # treated as a float position in [0, 1]; element 1 is its colour.
        cell = table.add_cell(row=0, col=i*2+1, width=cell_width, height=cell_height, text=f'{stat:.0%}', loc='center', fontproperties=font, facecolor=(stat_cmap([0, stat_pctl, 1])[1] if qualified else 'gainsboro'), edgecolor='white')
        if not qualified:
            cell.get_text().set_color('gray')
    ax.add_artist(table)
|
314 |
+
|
315 |
+
|
316 |
+
# Typeface shared by every text element and table cell drawn in this module.
font = load_google_font('Saira Extra Condensed', weight='medium')
|
317 |
+
|
318 |
+
|
319 |
+
def create_pitcher_overview_card(id, season, dpi=300):
    """Build the overview-card figure for one pitcher's regular season.

    The card stacks, top to bottom: title, arsenal, usage vs. LHH/RHH,
    location vs. LHH/RHH, velocity, per-pitch stat table, overall stat table
    and credits.

    Args:
        id: `pitId` of the pitcher.
        season: Year; data is limited to Jan 1 - Dec 31 of that year.
        dpi: Resolution of the created figure.

    Returns:
        The matplotlib Figure. The caller is responsible for saving and
        closing it.

    Raises:
        ValueError: If the filtered data does not yield exactly one overall
            stats row for the pitcher.
    """
    data = get_card_data(id, start_date=date(season, 1, 1), end_date=date(season, 12, 31), game_kind='Regular Season', min_pitches=100)

    # Check before creating the figure, so a pitcher without usable data
    # fails with a clear message and does not leave an orphaned figure.
    n_rows = data.pitcher_stats.height
    if n_rows != 1:
        raise ValueError(f'Expected exactly one regular-season stats row for pitcher {id} in {season}, got {n_rows}')

    fig = plt.figure(figsize=(1080/300, 1350/300), dpi=dpi)
    gs = fig.add_gridspec(8, 6, height_ratios=[1, 1, 1.5, 6, 1, 3, 1, 0.5])

    # Title row: name on the left, season and game type on the right.
    title_ax = fig.add_subplot(gs[0, :])
    title_ax.text(x=0, y=0, s=data.pitcher_stats['pitcher_name'].item().upper(), verticalalignment='baseline', font=font, size=20)
    title_ax.text(x=0.95, y=0, s=season, horizontalalignment='right', verticalalignment='baseline', font=font, size=20)
    title_ax.text(x=1, y=0.5, s='REG', horizontalalignment='right', verticalalignment='center', font=font, size=10, rotation='vertical')

    arsenal_ax = fig.add_subplot(gs[1, :])
    plot_arsenal(arsenal_ax, data.pitch_stats['ballKind_code'])

    # Usage, split by batter handedness.
    usage_l_ax = fig.add_subplot(gs[2, :3])
    plot_usage(usage_l_ax, data.pitch_stats[['ballKind_code', 'usage_left']])
    usage_l_ax.text(0, 1, 'LHH usage', horizontalalignment='left', verticalalignment='top', linespacing=0.5, color='gray', font=font, size=10, transform=usage_l_ax.transAxes)

    usage_r_ax = fig.add_subplot(gs[2, 3:])
    plot_usage(usage_r_ax, data.pitch_stats[['ballKind_code', 'usage_right']])
    usage_r_ax.text(0, 1, 'RHH usage', horizontalalignment='left', verticalalignment='top', linespacing=0.5, color='gray', font=font, size=10, transform=usage_r_ax.transAxes)

    # Location, split by batter handedness.
    loc_l_ax = fig.add_subplot(gs[3, :3])
    loc_l_ax.text(0, 1, 'LHH\nloc', verticalalignment='top', horizontalalignment='left', color='gray', font=font, size=10, transform=loc_l_ax.transAxes)
    plot_loc(loc_l_ax, data.left_pitch_shapes)

    loc_r_ax = fig.add_subplot(gs[3, 3:])
    loc_r_ax.text(0, 1, 'RHH\nloc', verticalalignment='top', horizontalalignment='left', color='gray', font=font, size=10, transform=loc_r_ax.transAxes)
    plot_loc(loc_r_ax, data.right_pitch_shapes)

    velo_ax = fig.add_subplot(gs[4, :])
    plot_velo(velo_ax, data.both_pitch_shapes)
    velo_ax.text(0, 1, 'Velo', verticalalignment='top', horizontalalignment='left', color='gray', font=font, size=10, transform=velo_ax.transAxes)

    pitch_stats_ax = fig.add_subplot(gs[5, :])
    plot_pitch_stats(pitch_stats_ax, data.pitch_stats, ['CSW%', 'GB%'])

    pitcher_stats_ax = fig.add_subplot(gs[6, :])
    plot_pitcher_stats(pitcher_stats_ax, data.pitcher_stats, ['CSW%', 'K%', 'BB%', 'GB%'])

    credits_ax = fig.add_subplot(gs[7, :])
    credits_ax.text(x=0, y=0.5, s='Data: SPAIA, Sanspo', verticalalignment='center', font=font, size=7)
    credits_ax.text(x=1, y=0.5, s='@yakyucosmo', horizontalalignment='right', verticalalignment='center', font=font, size=7)

    # Hide frames, ticks and tick labels on every panel.
    for ax in [
        title_ax,
        arsenal_ax,
        usage_l_ax, usage_r_ax,
        loc_l_ax, loc_r_ax,
        velo_ax,
        pitch_stats_ax,
        pitcher_stats_ax,
        credits_ax
    ]:
        ax.axis('off')
        ax.tick_params(
            axis='both',
            which='both',
            length=0,
            labelbottom=False,
            labelleft=False
        )

    return fig
|
395 |
+
# fig = create_card('1600153', season=2023, dpi=300)
|
396 |
+
# plt.show()
|