Spaces:

aliss77777
/

IFX-sandbox

Runtime error

App Files Files Community

IFX-sandbox / data /data_generation.py

aliss77777

Upload folder using huggingface_hub

06cb2a3 verified 4 months ago

raw

history blame contribute delete

5.34 kB

	###################################
	# regenerate_49ers_data.py
	###################################

	import pandas as pd
	import random
	import uuid
	from faker import Faker
	import os

	# CONFIG: Where your input CSVs live.
	# The three *_FILE names below are joined onto INPUT_DIR when the CSVs are read.
	INPUT_DIR = os.path.dirname(os.path.abspath(__file__)) # Directory containing this script file
	COMMUNITIES_FILE = "49ers_fan_communities_clean_GOOD.csv"
	ROSTER_FILE = "49ers roster - Sheet1.csv"
	SCHEDULE_FILE = "nfl-2024-san-francisco-49ers-with-results.csv"

	# CONFIG: Output directory for final CSVs.
	# NOTE: the directory is created as soon as this module is executed or imported;
	# exist_ok=True makes that a no-op when it already exists.
	OUTPUT_DIR = os.path.join(INPUT_DIR, "niners_output")
	os.makedirs(OUTPUT_DIR, exist_ok=True)

	NUM_FANS = 2500 # Number of synthetic fans generated by main()

	# ------------------------------------------------------------
	# 1. READ REAL CSVs
	# ------------------------------------------------------------
	def load_real_data():
	    """Read the three real-data CSVs and make sure each carries an ID column.

	    The communities, roster and schedule files are read from INPUT_DIR.
	    Any frame missing its ID column ("community_id", "player_id" or
	    "game_id") gets one filled with a fresh random UUID string per row;
	    an ID column that is already present is left untouched.

	    Returns:
	        A (df_communities, df_roster, df_schedule) tuple of DataFrames.
	    """
	    df_communities, df_roster, df_schedule = (
	        pd.read_csv(os.path.join(INPUT_DIR, csv_name))
	        for csv_name in (COMMUNITIES_FILE, ROSTER_FILE, SCHEDULE_FILE)
	    )

	    # Each table paired with the ID column later steps expect it to have.
	    required_ids = (
	        (df_roster, "player_id"),
	        (df_schedule, "game_id"),
	        (df_communities, "community_id"),
	    )
	    for frame, id_column in required_ids:
	        if id_column in frame.columns:
	            continue
	        frame[id_column] = [str(uuid.uuid4()) for _ in frame.index]

	    return df_communities, df_roster, df_schedule

	# ------------------------------------------------------------
	# 2. GENERATE 2,500 FANS (FAKE DATA)
	# ------------------------------------------------------------
	def generate_synthetic_fans(num_fans: int) -> pd.DataFrame:
	    """Create a DataFrame of synthetic fans.

	    Each row has:
	    - fan_id (random UUID string)
	    - first_name, last_name, email (each drawn independently from Faker)
	    - favorite_players (starts as an empty list)
	    - community_memberships (starts as an empty list)

	    Args:
	        num_fans: Number of fan rows to generate. Zero or a negative value
	            yields a frame with no rows.

	    Returns:
	        A DataFrame with one row per fan. The six columns above are always
	        present, even when there are no rows, so downstream code and the
	        exported CSV keep a stable schema.
	    """
	    columns = [
	        "fan_id",
	        "first_name",
	        "last_name",
	        "email",
	        "favorite_players",
	        "community_memberships",
	    ]
	    fake = Faker()
	    fans_list = [
	        {
	            "fan_id": str(uuid.uuid4()),
	            "first_name": fake.first_name(),
	            "last_name": fake.last_name(),
	            "email": fake.email(),
	            # Placeholders; the relationship-assignment step fills these in.
	            "favorite_players": [],
	            "community_memberships": [],
	        }
	        for _ in range(num_fans)
	    ]

	    # Passing columns= explicitly keeps the schema when fans_list is empty;
	    # pd.DataFrame([]) alone would produce a frame with no columns at all.
	    return pd.DataFrame(fans_list, columns=columns)

	# ------------------------------------------------------------
	# 3. ASSIGN RANDOM FAVORITE PLAYERS AND COMMUNITIES
	# ------------------------------------------------------------
	def assign_relationships(df_fans: pd.DataFrame,
	                         df_roster: pd.DataFrame,
	                         df_communities: pd.DataFrame):
	    """Randomly link every fan to favorite players and, maybe, a community.

	    - Each fan gets 1-3 distinct entries sampled from df_roster["player_id"].
	      The upper bound shrinks to the roster size when the roster has fewer
	      than three players; an empty roster gives an empty list.
	    - Each fan has a 50% chance of getting one entry picked from
	      df_communities["community_id"]; otherwise (or when there are no
	      communities) the list is empty.

	    Args:
	        df_fans: Fans table. Modified in place: the "favorite_players" and
	            "community_memberships" columns are (re)written with one list
	            per row.
	        df_roster: Table providing the "player_id" column.
	        df_communities: Table providing the "community_id" column.

	    Returns:
	        None. The result is written into df_fans.
	    """
	    player_ids = df_roster["player_id"].tolist()
	    community_ids = df_communities["community_id"].tolist()

	    # random.sample raises ValueError when k exceeds the population size,
	    # so never ask for more favorites than there are players.
	    max_favorites = min(3, len(player_ids))

	    favorites = []
	    memberships = []
	    for _ in range(len(df_fans)):
	        # Choose 1-3 players (fewer if the roster is that small)
	        if player_ids:
	            num_players = random.randint(1, max_favorites)
	            chosen_players = random.sample(player_ids, k=num_players)
	        else:
	            chosen_players = []

	        # 50% chance to join a community
	        chosen_community = []
	        if community_ids and random.choice([True, False]):
	            chosen_community = [random.choice(community_ids)]

	        favorites.append(chosen_players)
	        memberships.append(chosen_community)

	    # Assign both columns in one go. dtype=object keeps each cell a Python
	    # list, and reusing df_fans.index means no realignment takes place.
	    df_fans["favorite_players"] = pd.Series(
	        favorites, index=df_fans.index, dtype=object
	    )
	    df_fans["community_memberships"] = pd.Series(
	        memberships, index=df_fans.index, dtype=object
	    )

	# ------------------------------------------------------------
	# 4. MAIN PIPELINE
	# ------------------------------------------------------------
	def main():
	    """Run the whole pipeline: load real data, build fans, link them, export.

	    Writes fan_communities.csv, roster.csv, schedule.csv and fans.csv into
	    OUTPUT_DIR (without the DataFrame index) and prints a short summary.
	    """
	    # Step 1: the real communities / roster / schedule tables
	    df_communities, df_roster, df_schedule = load_real_data()

	    # Steps 2-3: synthetic fans, then random player/community links (in place)
	    df_fans = generate_synthetic_fans(NUM_FANS)
	    assign_relationships(df_fans, df_roster, df_communities)

	    # Step 4: export every table. The real tables are written out again
	    # under new names rather than reusing the input files.
	    exports = (
	        (df_communities, "fan_communities.csv", " - fan_communities.csv (REAL)"),
	        (df_roster, "roster.csv", " - roster.csv (REAL)"),
	        (df_schedule, "schedule.csv", " - schedule.csv (REAL)"),
	        (df_fans, "fans.csv", " - fans.csv (SYNTHETIC + relationships)"),
	    )
	    for frame, csv_name, _ in exports:
	        frame.to_csv(os.path.join(OUTPUT_DIR, csv_name), index=False)

	    print(f"Data generation complete! Files are in {OUTPUT_DIR}")
	    for _, _, summary_line in exports:
	        print(summary_line)


	if __name__ == "__main__":
	    main()