File size: 5,335 Bytes
06cb2a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
###################################
# regenerate_49ers_data.py
###################################

import pandas as pd
import random
import uuid
from faker import Faker
import os

# CONFIG: Where your input CSVs live.
# INPUT_DIR is the directory that contains this script; the three file names
# below are joined onto it when the CSVs are read.
INPUT_DIR = os.path.dirname(os.path.abspath(__file__))  # Uses the current script's directory
COMMUNITIES_FILE = "49ers_fan_communities_clean_GOOD.csv"
ROSTER_FILE = "49ers roster - Sheet1.csv"
SCHEDULE_FILE = "nfl-2024-san-francisco-49ers-with-results.csv"

# CONFIG: Output directory for final CSVs.
# NOTE: the directory is created as soon as this module is executed or
# imported, not only when main() runs; exist_ok=True means an already
# existing directory is not an error.
OUTPUT_DIR = os.path.join(INPUT_DIR, "niners_output")
os.makedirs(OUTPUT_DIR, exist_ok=True)

NUM_FANS = 2500  # We want 2500 synthetic fans

# ------------------------------------------------------------
# 1. READ REAL CSVs
# ------------------------------------------------------------
def load_real_data():
    """
    Read the communities, roster and schedule CSVs from INPUT_DIR.

    Any frame that has no ID column yet ("player_id" for the roster,
    "game_id" for the schedule, "community_id" for the communities) gets one
    filled with a fresh random UUID string per row.

    Returns:
        A tuple (df_communities, df_roster, df_schedule).
    """
    def _read(filename):
        # All input files sit side by side in INPUT_DIR.
        return pd.read_csv(os.path.join(INPUT_DIR, filename))

    df_communities = _read(COMMUNITIES_FILE)
    df_roster = _read(ROSTER_FILE)
    df_schedule = _read(SCHEDULE_FILE)

    # Backfill missing ID columns so later steps can always reference rows by ID.
    id_columns = (
        (df_roster, "player_id"),
        (df_schedule, "game_id"),
        (df_communities, "community_id"),
    )
    for frame, id_column in id_columns:
        if id_column in frame.columns:
            continue
        frame[id_column] = [str(uuid.uuid4()) for _ in frame.index]

    return df_communities, df_roster, df_schedule

# ------------------------------------------------------------
# 2. GENERATE 2,500 FANS (FAKE DATA)
# ------------------------------------------------------------
def generate_synthetic_fans(num_fans: int) -> pd.DataFrame:
    """
    Build a DataFrame with one row per synthetic fan.

    Columns:
      - fan_id: random UUID string
      - first_name, last_name, email: values produced by Faker
      - favorite_players: empty list placeholder (filled in a later step)
      - community_memberships: empty list placeholder (filled in a later step)

    Args:
        num_fans: Number of fan rows to generate.

    Returns:
        A DataFrame with ``num_fans`` rows.
    """
    fake = Faker()
    records = [
        {
            "fan_id": str(uuid.uuid4()),
            "first_name": fake.first_name(),
            "last_name": fake.last_name(),
            "email": fake.email(),
            # Each fan gets its own fresh list objects; relationships are
            # assigned afterwards.
            "favorite_players": [],
            "community_memberships": [],
        }
        for _ in range(num_fans)
    ]
    return pd.DataFrame(records)

# ------------------------------------------------------------
# 3. ASSIGN RANDOM FAVORITE PLAYERS AND COMMUNITIES
# ------------------------------------------------------------
def assign_relationships(df_fans: pd.DataFrame,
                         df_roster: pd.DataFrame,
                         df_communities: pd.DataFrame):
    """
    Fill in each fan's favorite players and community membership at random.

    - Picks 1-3 distinct favorite players per fan from the roster. If the
      roster has fewer than 3 players the upper bound is lowered to the roster
      size; an empty roster yields an empty list.
    - With a 50% chance, assigns exactly one community to the fan; otherwise
      (or when there are no communities) the membership list is empty.

    Args:
        df_fans: Fans table; its "favorite_players" and
            "community_memberships" cells are overwritten in place.
        df_roster: Roster table; must contain a "player_id" column.
        df_communities: Communities table; must contain a "community_id"
            column.

    Returns:
        None. ``df_fans`` is mutated in place.
    """
    player_ids = df_roster["player_id"].tolist()
    community_ids = df_communities["community_id"].tolist()

    # random.sample raises ValueError when k exceeds the population size, so
    # never ask for more favorites than there are players on the roster.
    max_favorites = min(3, len(player_ids))

    # Only the index labels are needed; the row contents are never read.
    for idx in df_fans.index:
        if player_ids:
            num_players = random.randint(1, max_favorites)
            chosen_players = random.sample(player_ids, k=num_players)
        else:
            chosen_players = []

        # 50% chance to join a community
        chosen_community = []
        if community_ids and random.choice([True, False]):
            chosen_community = [random.choice(community_ids)]

        # .at stores the list object itself in the (object-dtype) cell.
        df_fans.at[idx, "favorite_players"] = chosen_players
        df_fans.at[idx, "community_memberships"] = chosen_community

# ------------------------------------------------------------
# 4. MAIN PIPELINE
# ------------------------------------------------------------
def main():
    """
    Run the full pipeline: load the real CSVs, generate NUM_FANS synthetic
    fans, assign random relationships, and write all four tables to
    OUTPUT_DIR as CSV files, then print a short summary.
    """
    # Load real data, then build the synthetic fans and link them to it.
    df_communities, df_roster, df_schedule = load_real_data()
    df_fans = generate_synthetic_fans(NUM_FANS)
    assign_relationships(df_fans, df_roster, df_communities)

    # Export everything to CSV. The real-data tables are re-written under
    # normalized file names alongside the synthetic fans table.
    exports = (
        ("fan_communities.csv", df_communities),
        ("roster.csv", df_roster),
        ("schedule.csv", df_schedule),
        ("fans.csv", df_fans),
    )
    for filename, frame in exports:
        frame.to_csv(os.path.join(OUTPUT_DIR, filename), index=False)

    print(f"Data generation complete! Files are in {OUTPUT_DIR}")
    summary_lines = (
        " - fan_communities.csv  (REAL)",
        " - roster.csv           (REAL)",
        " - schedule.csv         (REAL)",
        " - fans.csv             (SYNTHETIC + relationships)",
    )
    for line in summary_lines:
        print(line)

# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()