File size: 7,363 Bytes
a8b6a3f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import polars as pl
from glob import glob
import os
from tqdm.auto import tqdm

from convert import aux_global_id_to_code, presult, ball_kind, ball_kind_code, lr, game_kind

DATA_PATH = os.path.expanduser('~/Documents/npb_data_collector/npb')
# SEASONS = list(range(2021, 2025+1))
SEASONS = [2021, 2022, 2023, 2024, 2025]

data_df = pl.DataFrame()
text_df = pl.DataFrame()
aux_df = pl.DataFrame()
sched_df = pl.DataFrame()
aux_sched_df = pl.DataFrame()

for season in tqdm(SEASONS):
  _data_df = pl.read_parquet(os.path.join(DATA_PATH, str(season), 'pbp_data.parquet'))
  data_df = pl.concat((data_df, _data_df))

  _text_df = pl.read_parquet(os.path.join(DATA_PATH, str(season), 'pbp_text.parquet'))
  text_df = pl.concat((text_df, _text_df))

  _aux_df = pl.read_parquet(os.path.join(DATA_PATH, str(season), 'pbp_aux.parquet'))
  aux_df = pl.concat((aux_df, _aux_df), how='diagonal_relaxed')

  _sched_df = pl.read_parquet(os.path.join(DATA_PATH, str(season), 'schedule.parquet'))
  sched_df = pl.concat((sched_df, _sched_df))

  _aux_sched_df = pl.read_parquet(os.path.join(DATA_PATH, str(season), 'aux_schedule.parquet'))
  aux_sched_df = pl.concat((aux_sched_df, _aux_sched_df))


# sched_df = sched_df.

aux_df = (
    aux_df
    .filter(pl.col('type') != 'RUNNER')
    .join(aux_sched_df[['gameGlobalId', 'gameDate']], on='gameGlobalId')
    .with_columns(
        pl.col('gameDate').str.to_date().dt.strftime('%Y%m%d'),
        pl.col('home').struct.field('globalId').replace_strict(aux_global_id_to_code).alias('home'),
        pl.col('visitor').struct.field('globalId').replace_strict(aux_global_id_to_code).alias('visitor'),
        pl.when(pl.col('tob') == 'Top').then(pl.lit('1')).otherwise(pl.lit('2')).alias('tob_code'),
    )
    .filter(
        # pl.col('pitch').struct.field('count') > 0

        # either one alone should be enough but let's use them together to be safe
        ~((pl.col('code') == 98) & (pl.col('id') == 1))
    )
    .with_columns(
        (pl.col('pitch').struct.field('count') == 1).cum_sum().over(['gameGlobalId', 'inning', 'tob']).alias('pa_count')
    )
    .with_columns(
        pl.col('code').is_in([6402, 6404, 6406, 6405]).any().over(['gameGlobalId', 'inning', 'tob', 'pa_count']).alias('ibb')
    )
    .with_columns(
        pl.when(~pl.col('ibb')).then(pl.col('pitch').struct.field('count') == 1).cum_sum().over(['gameGlobalId', 'inning', 'tob']).alias('new_pa_count')
    )
    .with_columns(
        pl.len().over(['gameGlobalId', 'inning', 'tob', 'new_pa_count']).alias('pa_pitches'),
        pl.max('new_pa_count').over(['gameGlobalId', 'inning', 'tob']).alias('inning_pas')
    )
    .with_columns(
        (
            pl.col('gameDate') + '_' + \
            pl.col('visitor') + '_' + \
            pl.col('home') + '_' + \
            pl.col('inning').str.zfill(2) + pl.when(pl.col('tob') == 'Top').then(pl.lit('1')).otherwise(pl.lit('2')) + pl.col('new_pa_count').cast(pl.String).str.zfill(2) + '_' +\
            pl.col('pitch').struct.field('count').cast(pl.String)
        ).alias('universal_code'),
        (
            pl.col('gameDate') + '_' + \
            pl.col('visitor') + '_' + \
            pl.col('home') + '_' + \
            pl.col('inning').str.zfill(2) + pl.when(pl.col('tob') == 'Top').then(pl.lit('1')).otherwise(pl.lit('2'))
        ).alias('inning_code'),
        (
            pl.col('gameDate') + '_' + \
            pl.col('visitor') + '_' + \
            pl.col('home') + '_' + \
            pl.col('inning').str.zfill(2) + pl.when(pl.col('tob') == 'Top').then(pl.lit('1')).otherwise(pl.lit('2')) + pl.col('new_pa_count').cast(pl.String).str.zfill(2)
        ).alias('pa_code')
    )
)

data_df = data_df

data_df = (
    data_df
    .with_columns(
        *[
            pl.col(col).cast(pl.Int32)
            for col
            in ['gameId', 'ballKind', 'ballSpeed', 'x', 'y', 'presult', 'bresult', 'battedX', 'battedY']
        ],
        pl.col('UpdatedAt').str.to_datetime(),
        pl.col('fiveDigitSerialNumber').str.slice(offset=0, length=3).alias('half_inning'),
        pl.col('fiveDigitSerialNumber').str.slice(offset=3, length=2).alias('batter'),
    )
    .with_columns(
        # pl.count('ID').over(['gameId', 'fiveDigitSerialNumber']).alias('pa_pitches')
        (~pl.col('presult').is_in([0])).sum().over(['gameId', 'fiveDigitSerialNumber']).alias('pa_pitches'),
        pl.col('presult').is_in([139]).any().over(['gameId', 'fiveDigitSerialNumber']).alias('ibb')
    )
    .filter(
        (pl.col('pa_pitches') > 0)
    )
    .with_columns(
        pl.when(~pl.col('ibb')).then(pl.col('batter'))
    )
    .with_columns(
        pl.when(~pl.col('ibb')).then(pl.col('batter').rank('dense')).over(['gameId', 'half_inning']).cast(pl.String).str.zfill(2).alias('new_batter')
    )
    .with_columns(
        (pl.col('half_inning') + pl.col('new_batter')).alias('newFiveDigitSerialNumber')
    )
    .with_columns(pl.max('new_batter').cast(pl.Int32).over(['gameId', pl.col('newFiveDigitSerialNumber').str.slice(offset=0, length=3)]).alias('inning_pas'))
    .join(sched_df[['GameID', 'HomeTeamNameES', 'VisitorTeamNameES']].rename({'GameID': 'gameId'}), on='gameId')
    .with_columns(pl.col('UpdatedAt').dt.strftime('%Y%m%d').alias('date'))
    .with_columns(
        (pl.col('date') + '_' + pl.col('VisitorTeamNameES') + '_' + pl.col('HomeTeamNameES') + '_' + pl.col('newFiveDigitSerialNumber')).alias('universal_code') + '_' + pl.col('atBatBallCount'),
        (pl.col('date') + '_' + pl.col('VisitorTeamNameES') + '_' + pl.col('HomeTeamNameES') + '_' + pl.col('newFiveDigitSerialNumber').str.slice(offset=0, length=3)).alias('inning_code'),
        (pl.col('date') + '_' + pl.col('VisitorTeamNameES') + '_' + pl.col('HomeTeamNameES') + '_' + pl.col('newFiveDigitSerialNumber')).alias('pa_code')
    )
    .join(
        (
            aux_df.filter(~pl.col('ibb'))[['universal_code', 'battingResult', 'inning_pas', 'pa_pitches']]
            .rename({'battingResult': 'aux_bresult', 'inning_pas': 'aux_inning_pas', 'pa_pitches': 'aux_pa_pitches'})
        ),
        on='universal_code',
        how='left'
    )
    .with_columns(

    )
    .join(
        text_df[['GameID', 'GameKindID']].with_columns(
            pl.col('GameID').cast(pl.Int32),
            pl.col('GameKindID').cast(pl.Int32),
        ).unique(),
        how='left',
        left_on='gameId',
        right_on='GameID'
    )
    .with_columns(pl.col('GameKindID').replace_strict(game_kind).alias('GameKindName'))
    .with_columns(
        pl.when((pl.col('inning_pas') == pl.col('aux_inning_pas')) & (pl.col('pa_pitches') == pl.col('aux_pa_pitches')))
        .then('aux_bresult')
        .alias('aux_bresult'),

        pl.col('x').add(-100).mul(-1),
        pl.col('y').neg().add(250),
        pl.col('presult').replace_strict(presult),
        pl.col('ballKind').replace_strict(ball_kind),
        pl.col('ballKind').replace_strict(ball_kind_code).alias('ballKind_code'),
        pl.col('batLR').replace_strict(lr),
        
        pl.when(pl.col('GameKindName').str.contains('Regular Season') | (pl.col('GameKindName') == 'Interleague'))
        .then(pl.lit('Regular Season'))
        .when(~pl.col('GameKindName').is_in(['Spring Training', 'All-Star Game']))
        .then(pl.lit('Postseason'))
        .otherwise('GameKindName')
        .alias('coarse_game_kind')
    )
)