################################### # regenerate_49ers_data.py ################################### import pandas as pd import random import uuid from faker import Faker import os # CONFIG: Where your input CSVs live INPUT_DIR = os.path.dirname(os.path.abspath(__file__)) # Uses the current script's directory COMMUNITIES_FILE = "49ers_fan_communities_clean_GOOD.csv" ROSTER_FILE = "49ers roster - Sheet1.csv" SCHEDULE_FILE = "nfl-2024-san-francisco-49ers-with-results.csv" # CONFIG: Output directory for final CSVs OUTPUT_DIR = os.path.join(INPUT_DIR, "niners_output") os.makedirs(OUTPUT_DIR, exist_ok=True) NUM_FANS = 2500 # We want 2500 synthetic fans # ------------------------------------------------------------ # 1. READ REAL CSVs # ------------------------------------------------------------ def load_real_data(): # Adjust columns/types based on your actual CSV structure df_communities = pd.read_csv(os.path.join(INPUT_DIR, COMMUNITIES_FILE)) df_roster = pd.read_csv(os.path.join(INPUT_DIR, ROSTER_FILE)) df_schedule = pd.read_csv(os.path.join(INPUT_DIR, SCHEDULE_FILE)) # Optional: rename columns or add IDs if your CSVs don't have them # For example, ensure df_roster has "player_id" column for each player if "player_id" not in df_roster.columns: df_roster["player_id"] = [str(uuid.uuid4()) for _ in range(len(df_roster))] # If df_schedule lacks a unique "game_id," add one: if "game_id" not in df_schedule.columns: df_schedule["game_id"] = [str(uuid.uuid4()) for _ in range(len(df_schedule))] # If df_communities lacks a "community_id," add one: if "community_id" not in df_communities.columns: df_communities["community_id"] = [str(uuid.uuid4()) for _ in range(len(df_communities))] return df_communities, df_roster, df_schedule # ------------------------------------------------------------ # 2. GENERATE 2,500 FANS (FAKE DATA) # ------------------------------------------------------------ def generate_synthetic_fans(num_fans: int) -> pd.DataFrame: """ Create a DataFrame of synthetic fans. Each fan has: - fan_id (UUID) - first_name - last_name - email - favorite_players (list of player_ids) - community_memberships (list of community_ids) """ fake = Faker() fans_list = [] for _ in range(num_fans): fan_id = str(uuid.uuid4()) first_name = fake.first_name() last_name = fake.last_name() email = fake.email() fans_list.append({ "fan_id": fan_id, "first_name": first_name, "last_name": last_name, "email": email, # We'll assign favorite_players & community_memberships below "favorite_players": [], "community_memberships": [] }) return pd.DataFrame(fans_list) # ------------------------------------------------------------ # 3. ASSIGN RANDOM FAVORITE PLAYERS AND COMMUNITIES # ------------------------------------------------------------ def assign_relationships(df_fans: pd.DataFrame, df_roster: pd.DataFrame, df_communities: pd.DataFrame): """ - Pick 1-3 random favorite players for each fan from the real roster - Assign 0 or 1 community to each fan from the real communities """ player_ids = df_roster["player_id"].tolist() community_ids = df_communities["community_id"].tolist() for idx, fan in df_fans.iterrows(): # Choose 1-3 players if len(player_ids) > 0: num_players = random.randint(1, 3) chosen_players = random.sample(player_ids, k=num_players) else: chosen_players = [] # 50% chance to join a community chosen_community = [] if len(community_ids) > 0 and random.choice([True, False]): chosen_community = [random.choice(community_ids)] # Update the row's columns df_fans.at[idx, "favorite_players"] = chosen_players df_fans.at[idx, "community_memberships"] = chosen_community # ------------------------------------------------------------ # 4. MAIN PIPELINE # ------------------------------------------------------------ def main(): # 4.1. Load real data df_communities, df_roster, df_schedule = load_real_data() # 4.2. Generate 2,500 synthetic fans df_fans = generate_synthetic_fans(NUM_FANS) # 4.3. Assign random relationships assign_relationships(df_fans, df_roster, df_communities) # 4.4. Export everything to CSV # (If you'd like to keep the original real-data files as is, # you can simply re-write them or rename them. Below we do an explicit "to_csv".) df_communities.to_csv(os.path.join(OUTPUT_DIR, "fan_communities.csv"), index=False) df_roster.to_csv(os.path.join(OUTPUT_DIR, "roster.csv"), index=False) df_schedule.to_csv(os.path.join(OUTPUT_DIR, "schedule.csv"), index=False) df_fans.to_csv(os.path.join(OUTPUT_DIR, "fans.csv"), index=False) print(f"Data generation complete! Files are in {OUTPUT_DIR}") print(" - fan_communities.csv (REAL)") print(" - roster.csv (REAL)") print(" - schedule.csv (REAL)") print(" - fans.csv (SYNTHETIC + relationships)") if __name__ == "__main__": main()