|
import datetime
import io
from multiprocessing import Pool
from typing import Optional
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pydantic import BaseModel, Field

from domain.teams import ALL_TEAMS, NFLTeam
|
|
|
|
|
# Toggles the multiprocessing.Pool path in scrape_all_team_injury_report.
# NOTE(review): disabled by default — presumably to simplify debugging of the
# per-team scrapes; confirm before enabling in production.
MULTIPROCESSING_ENABLED = False
|
|
|
# Practice-week day index (Monday == 0 ... Sunday == 6), keyed by both the
# three-letter abbreviation and the full weekday name.
PRACTICE_WEEK = {
    abbrev: index
    for index, abbrev in enumerate(("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"))
}
PRACTICE_WEEK.update(
    {
        full_name: index
        for index, full_name in enumerate(
            ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
        )
    }
)
|
|
|
|
|
# Full weekday name -> three-letter abbreviation (always the first three
# characters of the full name).
DAY_OF_WEEK_STRING_MAPPING = {
    full_name: full_name[:3]
    for full_name in (
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday",
        "Sunday",
    )
}
|
|
|
|
|
# Monday of week 1 of the 2023 NFL regular season; week numbers are derived
# from days elapsed since this date.
WEEK_1_BEGIN_DATE = datetime.datetime(2023, 9, 4)
# Captured once at import time — a long-running process will keep a stale date.
CURRENT_DATE = datetime.datetime.now()
# 1-based week number, clamped to at least 1 before the season starts.
CURRENT_WEEK = max(1, int(1 + (CURRENT_DATE - WEEK_1_BEGIN_DATE).days / 7))
# NOTE(review): season (and WEEK_1_BEGIN_DATE above) are hard-coded and must
# be updated each year.
CURRENT_SEASON = 2023
|
|
|
|
|
class PracticeReportRawRow(BaseModel):
    """One validated row of a team's scraped injury/practice report.

    Field names mirror the scraped table's column headers (hence the
    capitalized names). Day-of-week columns are optional because a report
    only carries the days published so far in the practice week.
    """

    Team: str
    Player: str
    Position: str
    Injury: str
    Sun: Optional[str] = None
    Mon: Optional[str] = None
    Tue: Optional[str] = None
    Wed: Optional[str] = None
    Thu: Optional[str] = None
    Fri: Optional[str] = None
    Sat: Optional[str] = None
    game_status: str = Field(alias="Game Status")

    @classmethod
    def replace_nan(cls, value):
        """Return "" for a float NaN (pandas' missing-cell marker); pass any
        other value through unchanged."""
        # Fix: first parameter was named `self` on a @classmethod, and the
        # `-> str` annotation was wrong (non-NaN values pass through as-is).
        if isinstance(value, float) and np.isnan(value):
            return ""
        return value

    @classmethod
    def from_raw(cls, input_dict) -> "PracticeReportRawRow":
        """Build a row from a raw DataFrame record: shorten full day names
        (e.g. "Monday" -> "Mon") and blank out NaN cells before validation."""
        return cls(**{DAY_OF_WEEK_STRING_MAPPING.get(k, k): cls.replace_nan(v) for k, v in input_dict.items()})
|
|
|
|
|
def get_injury_report_dataframe(team: NFLTeam) -> pd.DataFrame:
    """Download and parse the current-week injury-report table for *team*.

    Builds the REG-season week URL from the team's base injury-report URL,
    fetches the page, verifies the first club name on the page matches the
    requested team, and returns the first HTML table as a DataFrame.

    Raises:
        requests.HTTPError: if the HTTP request fails.
        ValueError: if no club-name spans are found or the first club name
            does not match *team*.
    """
    injury_report_url = urljoin(team.injury_report_url, f"week/REG-{CURRENT_WEEK}")
    # Fix: no timeout meant a stalled server could hang the scrape forever.
    report_request = requests.get(injury_report_url, timeout=30)
    report_request.raise_for_status()
    # Name the parser explicitly so parsing doesn't depend on which bs4
    # backend happens to be installed.
    report_soup = BeautifulSoup(report_request.content, "html.parser")
    team_names_spans = report_soup.find_all("span", {"class": "nfl-o-injury-report__club-name"})
    # Fix: validation via `assert` is stripped under `python -O`; raise instead.
    if not team_names_spans:
        raise ValueError(f"No club names found at {injury_report_url}")
    team_names_str = [x.get_text() for x in team_names_spans]
    if team_names_str[0] != team.team_full_name:
        raise ValueError(f"Expected report for {team.team_full_name}, got {team_names_str[0]}")
    tables = report_soup.find_all("table")
    # pandas deprecated passing literal HTML to read_html; wrap in a buffer.
    df_report = pd.read_html(io.StringIO(str(tables)))[0]
    return df_report
|
|
|
|
|
def scrape_team_injury_report(team: NFLTeam) -> pd.DataFrame:
    """Scrape one team's injury report and normalize its day columns.

    On any scraping failure an empty DataFrame is returned (best-effort).
    Day-of-week columns that survive validation are renumbered "1".."k" in
    their original order, and the name of the final practice day is stored
    in a "Last Practice Day" column.
    """
    print(f"Scraping Injury Report for: {team.team_full_name}")
    try:
        raw_report = get_injury_report_dataframe(team)
    except Exception:
        print(f"Failed to scrape practice report for: {team.team_full_name}")
        return pd.DataFrame()

    validated_rows = []
    for record in raw_report.to_dict("records"):
        record["Team"] = team.team_full_name
        validated_rows.append(PracticeReportRawRow.from_raw(record))
    report_df = pd.DataFrame([row.dict() for row in validated_rows])

    # Discard day columns that carry no data at all.
    report_df.dropna(axis=1, how="all", inplace=True)

    # Renumber the surviving day-of-week columns 1..k (left to right) and
    # remember which weekday came last.
    rename_map = {}
    last_practice_day = None
    for column in report_df.columns:
        if column in PRACTICE_WEEK:
            rename_map[column] = str(len(rename_map) + 1)
            last_practice_day = column
    report_df.rename(columns=rename_map, inplace=True)

    report_df["Last Practice Day"] = last_practice_day
    return report_df
|
|
|
|
|
def scrape_all_team_injury_report() -> pd.DataFrame:
    """Scrape every team's injury report and concatenate the results.

    Uses a multiprocessing pool when MULTIPROCESSING_ENABLED is set;
    otherwise scrapes teams sequentially.
    """
    if not MULTIPROCESSING_ENABLED:
        team_frames = [scrape_team_injury_report(team) for team in ALL_TEAMS]
    else:
        with Pool() as worker_pool:
            team_frames = worker_pool.map(scrape_team_injury_report, ALL_TEAMS)
    return pd.concat(team_frames)
|
|