Spaces:
Runtime error
Runtime error
File size: 5,335 Bytes
06cb2a3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
###################################
# regenerate_49ers_data.py
###################################
import pandas as pd
import random
import uuid
from faker import Faker
import os
# --- Input locations --------------------------------------------------------
# The source CSVs are expected to sit in the same directory as this script.
INPUT_DIR = os.path.dirname(os.path.abspath(__file__))

COMMUNITIES_FILE = "49ers_fan_communities_clean_GOOD.csv"
ROSTER_FILE = "49ers roster - Sheet1.csv"
SCHEDULE_FILE = "nfl-2024-san-francisco-49ers-with-results.csv"

# --- Generation settings ----------------------------------------------------
# Number of synthetic fan records to create.
NUM_FANS = 2500

# --- Output location --------------------------------------------------------
# Final CSVs are written to a sub-folder of the input directory; the folder
# is created at import time if it does not exist yet.
OUTPUT_DIR = os.path.join(INPUT_DIR, "niners_output")
os.makedirs(OUTPUT_DIR, exist_ok=True)
# ------------------------------------------------------------
# 1. READ REAL CSVs
# ------------------------------------------------------------
def load_real_data():
    """
    Read the three real input CSVs from INPUT_DIR and make sure each one
    carries an ID column.

    A column of fresh UUID strings is added when the expected ID column is
    absent:
    - roster      -> "player_id"
    - schedule    -> "game_id"
    - communities -> "community_id"

    Returns:
        Tuple (df_communities, df_roster, df_schedule).
    """
    # Files are read in the order communities, roster, schedule.
    communities, roster, schedule = (
        pd.read_csv(os.path.join(INPUT_DIR, csv_name))
        for csv_name in (COMMUNITIES_FILE, ROSTER_FILE, SCHEDULE_FILE)
    )

    # Existing ID columns are left untouched; only missing ones are filled.
    id_columns = (
        (roster, "player_id"),
        (schedule, "game_id"),
        (communities, "community_id"),
    )
    for frame, id_column in id_columns:
        if id_column in frame.columns:
            continue
        frame[id_column] = [str(uuid.uuid4()) for _ in frame.index]

    return communities, roster, schedule
# ------------------------------------------------------------
# 2. GENERATE 2,500 FANS (FAKE DATA)
# ------------------------------------------------------------
def generate_synthetic_fans(num_fans: int) -> pd.DataFrame:
    """
    Create a DataFrame of synthetic fans.

    Each fan has:
    - fan_id (UUID string)
    - first_name
    - last_name
    - email
    - favorite_players (empty list, to be filled with player_ids later)
    - community_memberships (empty list, to be filled with community_ids later)

    Args:
        num_fans: Number of fan rows to generate. Zero (or a negative
            number) yields an empty frame.

    Returns:
        A DataFrame with one row per fan. The six columns above are always
        present, even when no rows are generated, so downstream code and the
        exported CSV keep a stable schema.
    """
    columns = [
        "fan_id",
        "first_name",
        "last_name",
        "email",
        "favorite_players",
        "community_memberships",
    ]
    fake = Faker()
    fans_list = [
        {
            "fan_id": str(uuid.uuid4()),
            "first_name": fake.first_name(),
            "last_name": fake.last_name(),
            "email": fake.email(),
            # Each row gets its own fresh list objects; relationships are
            # assigned in a later step.
            "favorite_players": [],
            "community_memberships": [],
        }
        for _ in range(num_fans)
    ]
    # Passing the column labels explicitly keeps the schema when fans_list
    # is empty (a bare pd.DataFrame([]) would have no columns at all).
    return pd.DataFrame(fans_list, columns=columns)
# ------------------------------------------------------------
# 3. ASSIGN RANDOM FAVORITE PLAYERS AND COMMUNITIES
# ------------------------------------------------------------
def assign_relationships(df_fans: pd.DataFrame,
                         df_roster: pd.DataFrame,
                         df_communities: pd.DataFrame):
    """
    Fill in each fan's favorite players and community membership, in place.

    - Pick 1-3 distinct random favorite players for each fan from the roster.
      The count is capped at the roster size, so a roster with fewer than
      three players does not make random.sample raise ValueError.
    - Assign 0 or 1 community to each fan (50% chance of joining one).

    Args:
        df_fans: Fans table. Its "favorite_players" and
            "community_memberships" columns are overwritten (and created
            if missing) with one list per row.
        df_roster: Roster table; must have a "player_id" column.
        df_communities: Communities table; must have a "community_id" column.

    Returns:
        None. df_fans is modified in place.
    """
    player_ids = df_roster["player_id"].tolist()
    community_ids = df_communities["community_id"].tolist()

    # random.sample cannot draw more items than the population holds.
    max_favorites = min(3, len(player_ids))

    favorite_players = []
    community_memberships = []
    for _ in range(len(df_fans)):
        # Choose 1-3 players (fewer if the roster is smaller than that)
        if player_ids:
            num_players = random.randint(1, max_favorites)
            chosen_players = random.sample(player_ids, k=num_players)
        else:
            chosen_players = []

        # 50% chance to join a community
        chosen_community = []
        if community_ids and random.choice([True, False]):
            chosen_community = [random.choice(community_ids)]

        favorite_players.append(chosen_players)
        community_memberships.append(chosen_community)

    # Assign whole columns at once; dtype=object keeps each cell a list.
    df_fans["favorite_players"] = pd.Series(
        favorite_players, index=df_fans.index, dtype=object
    )
    df_fans["community_memberships"] = pd.Series(
        community_memberships, index=df_fans.index, dtype=object
    )
# ------------------------------------------------------------
# 4. MAIN PIPELINE
# ------------------------------------------------------------
def main():
    """
    Run the whole pipeline: load the real CSVs, generate NUM_FANS synthetic
    fans, link them to players and communities, then write all four tables
    to OUTPUT_DIR and print a short summary.
    """
    # 4.1. Real data: communities, roster and schedule
    df_communities, df_roster, df_schedule = load_real_data()

    # 4.2 / 4.3. Synthetic fans, then their random relationships
    df_fans = generate_synthetic_fans(NUM_FANS)
    assign_relationships(df_fans, df_roster, df_communities)

    # 4.4. Export every table; the real-data tables are re-written under
    # new file names next to the synthetic fans table.
    exports = (
        ("fan_communities.csv", df_communities),
        ("roster.csv", df_roster),
        ("schedule.csv", df_schedule),
        ("fans.csv", df_fans),
    )
    for filename, frame in exports:
        frame.to_csv(os.path.join(OUTPUT_DIR, filename), index=False)

    print(f"Data generation complete! Files are in {OUTPUT_DIR}")
    summary_lines = (
        " - fan_communities.csv (REAL)",
        " - roster.csv (REAL)",
        " - schedule.csv (REAL)",
        " - fans.csv (SYNTHETIC + relationships)",
    )
    for line in summary_lines:
        print(line)


if __name__ == "__main__":
    main()
|