Spaces:

neuronslabs
/

comfyCausalAI

Sleeping

App Files Files Community

comfyCausalAI / data_generator.py

rknl

updated

2130e8d verified about 1 year ago

raw

history blame

10.7 kB

	import pandas as pd
	import numpy as np
	from faker import Faker
	from datetime import datetime, timedelta, date
	import random

	def generate_synthetic_data(num_customers=1000):
	    """
	    Generate synthetic customer data for e-commerce analysis.

	    Builds one record per customer with demographics (age, gender, region,
	    city, preferred language), contact details and purchase behaviour
	    (order count, average order value, loyalty level, payment method,
	    browsing device, product interests). Faker with the ``uk_UA`` locale
	    supplies names, cities, dates and phone digits.

	    Every random source used here (Faker, NumPy's global generator and a
	    private ``random.Random``) is seeded with 42, so the output is meant to
	    be reproducible for a given ``num_customers``. Seeding Faker and NumPy
	    resets their shared/global state as a side effect.

	    Args:
	        num_customers (int): The number of customer records to generate
	            (default: 1000). Zero or a negative value yields an empty
	            DataFrame that still carries every column.

	    Returns:
	        pandas.DataFrame: One row per customer with the columns listed in
	        ``COLUMNS``. Inactive customers (about 40%) have ``total_orders``
	        0, ``average_order_value`` 0 and ``last_order_date`` None.
	    """
	    # Set up Faker for Ukrainian locale
	    fake = Faker('uk_UA')
	    Faker.seed(42)
	    np.random.seed(42)
	    # Private seeded generator for the stdlib-random draws in the helpers,
	    # so phone prefixes and e-mail domains are reproducible too, without
	    # touching the global `random` state.
	    rng = random.Random(42)

	    # Define constants
	    START_DATE = date(2019, 1, 1)
	    END_DATE = date(2024, 7, 31)
	    EMAIL_DOMAINS = ['gmail.com', 'ukr.net', 'i.ua', 'meta.ua', 'yahoo.com']
	    PAYMENT_METHODS = ['Credit Card', 'Cash on Delivery', 'Bank Transfer', 'PayPal']
	    DEVICES = ['Web', 'Mobile', 'App']
	    ALL_CATEGORIES = ['Electronics', 'Home Appliances', 'Computers', 'Smartphones', 'TV & Audio']
	    # Fixed output schema; passing it to DataFrame keeps the columns present
	    # even when no rows are generated.
	    COLUMNS = [
	        'customer_id',
	        'name',
	        'email',
	        'age',
	        'gender',
	        'region',
	        'city',
	        'registration_date',
	        'phone_number',
	        'preferred_language',
	        'newsletter_subscription',
	        'preferred_payment_method',
	        'loyalty_level',
	        'main_browsing_device',
	        'product_categories_of_interest',
	        'average_order_value',
	        'total_orders',
	        'last_order_date',
	    ]

	    # Helper functions
	    def clamp(value, low=0, high=1):
	        """Restrict value to the closed interval [low, high]."""
	        return max(low, min(high, value))

	    def normalize(weights):
	        """Scale non-negative weights so they sum to 1."""
	        total = sum(weights)
	        return [w / total for w in weights]

	    def generate_phone_number():
	        """Generate a Ukrainian-style phone number (+380, 2-digit code, Faker digits)."""
	        return f"+380{rng.randint(50, 99)}{fake.msisdn()[6:]}"

	    def generate_email(name):
	        """Generate an email address based on the customer's name."""
	        username = name.lower().replace(' ', '.').replace('\'', '')
	        domain = rng.choice(EMAIL_DOMAINS)
	        return f"{username}@{domain}"

	    # Define regions and their characteristics
	    REGIONS = {
	        'Київська': {'avg_age': 40, 'urbanization': 0.8, 'tech_adoption': 0.7},
	        'Львівська': {'avg_age': 38, 'urbanization': 0.7, 'tech_adoption': 0.6},
	        'Харківська': {'avg_age': 42, 'urbanization': 0.8, 'tech_adoption': 0.65},
	        'Одеська': {'avg_age': 41, 'urbanization': 0.7, 'tech_adoption': 0.6},
	        'Дніпропетровська': {'avg_age': 43, 'urbanization': 0.75, 'tech_adoption': 0.6},
	        'Запорізька': {'avg_age': 44, 'urbanization': 0.7, 'tech_adoption': 0.55},
	        'Вінницька': {'avg_age': 42, 'urbanization': 0.6, 'tech_adoption': 0.5},
	        'Полтавська': {'avg_age': 43, 'urbanization': 0.65, 'tech_adoption': 0.55},
	        'Чернігівська': {'avg_age': 45, 'urbanization': 0.6, 'tech_adoption': 0.5},
	        'Сумська': {'avg_age': 44, 'urbanization': 0.65, 'tech_adoption': 0.5},
	    }
	    region_names = list(REGIONS.keys())

	    # Generate initial customer data
	    data = []
	    for i in range(num_customers):
	        customer_id = f"C{str(i+1).zfill(6)}"

	        # Region and City
	        region = np.random.choice(region_names)
	        region_info = REGIONS[region]
	        # Urban/rural is decided from the region's base urbanization,
	        # before the per-customer noise below is applied.
	        is_urban = np.random.random() < region_info['urbanization']
	        city = fake.city()
	        if not is_urban:
	            city = f"смт {city}"

	        # Age (dependent on region), plus a little noise, kept within 18..80
	        age = int(np.random.normal(region_info['avg_age'], 10))
	        age_noise = np.random.normal(0, 2)
	        age = clamp(int(age + age_noise), 18, 80)

	        # Per-customer noisy copies of the regional rates; held in locals so
	        # the shared REGIONS table is never modified.
	        urbanization = clamp(region_info['urbanization'] + np.random.normal(0, 0.05))
	        tech_adoption = clamp(region_info['tech_adoption'] + np.random.normal(0, 0.05))

	        # Gender (slight dependency on age and region)
	        gender_prob = 0.49 + 0.02 * (age - 40) / 40  # Slight increase in male probability with age
	        gender_prob += 0.02 * (urbanization - 0.7) / 0.3  # Slight increase in urban areas
	        gender = np.random.choice(
	            ['Male', 'Female', 'Other'],
	            p=[gender_prob, 1 - gender_prob - 0.01, 0.01],
	        )

	        # Preferred Language (dependent on age and region)
	        ukrainian_prob = 0.8 - 0.2 * (age - 40) / 40  # Younger people more likely to prefer Ukrainian
	        ukrainian_prob += 0.1 * (1 - urbanization)  # Rural areas more likely to prefer Ukrainian
	        ukrainian_prob = clamp(ukrainian_prob)
	        preferred_language = np.random.choice(
	            ['Ukrainian', 'Russian'],
	            p=[ukrainian_prob, 1 - ukrainian_prob],
	        )

	        # Registration date
	        registration_date = fake.date_between(start_date=START_DATE, end_date=END_DATE)

	        # Determine if the customer is active (has made orders)
	        is_active = np.random.random() < 0.6  # 60% chance of being an active customer

	        if is_active:
	            # Total orders: a Poisson base scaled by multiplicative factors.
	            base_orders = np.random.poisson(5)
	            order_multiplier = 1 + 0.2 * (age - 40) / 40  # Age factor
	            # Each further factor compounds with `*=`; a plain `=` here
	            # would discard the age factor.
	            order_multiplier *= 1 + 0.1 * (tech_adoption - 0.6) / 0.2  # Tech adoption factor
	            order_multiplier *= 1.1 if gender == 'Female' else 0.9  # Gender factor
	            order_multiplier *= 1.1 if preferred_language == 'Ukrainian' else 0.9  # Language factor
	            total_orders = max(1, int(base_orders * order_multiplier))  # At least 1 order for active customers

	            # Add noise to total orders
	            total_orders_noise = np.random.poisson(2)
	            total_orders = max(1, total_orders + total_orders_noise)

	            # Average order value: a gamma base scaled the same way.
	            base_aov = np.random.gamma(shape=5, scale=100)
	            aov_multiplier = 1 + 0.3 * (age - 40) / 40  # Age factor
	            aov_multiplier *= 1 + 0.2 * (urbanization - 0.7) / 0.3  # Urbanization factor
	            aov_multiplier *= 1.1 if gender == 'Male' else 0.9  # Gender factor
	            average_order_value = base_aov * aov_multiplier

	            # Add noise to average order value
	            aov_noise = np.random.normal(0, average_order_value * 0.1)  # 10% noise
	            average_order_value = max(0, average_order_value + aov_noise)

	            # Last order date (never earlier than registration)
	            last_order_date = fake.date_between(start_date=registration_date, end_date=END_DATE)
	        else:
	            total_orders = 0
	            average_order_value = 0
	            last_order_date = None

	        # Loyalty level based on total orders, then jittered by -1, 0 or 1
	        loyalty_level = clamp(int(total_orders / 2), 1, 5)
	        loyalty_noise = np.random.randint(-1, 2)
	        loyalty_level = clamp(loyalty_level + loyalty_noise, 1, 5)

	        # Newsletter subscription (dependent on age, loyalty, and tech adoption)
	        newsletter_prob = 0.5 + 0.1 * loyalty_level / 5 - 0.2 * (age - 40) / 40 + 0.2 * tech_adoption
	        newsletter_noise = np.random.normal(0, 0.1)
	        newsletter_prob = clamp(newsletter_prob + newsletter_noise)
	        newsletter_subscription = np.random.random() < newsletter_prob

	        # Preferred payment method (dependent on age and urbanization)
	        payment_probs = [
	            0.5 - 0.2 * (age - 40) / 40 + 0.2 * urbanization,  # Credit Card
	            0.3 + 0.2 * (age - 40) / 40 - 0.2 * urbanization,  # Cash on Delivery
	            0.15,  # Bank Transfer
	            0.05 + 0.1 * tech_adoption,  # PayPal
	        ]
	        payment_probs = normalize([clamp(p) for p in payment_probs])
	        preferred_payment_method = np.random.choice(PAYMENT_METHODS, p=payment_probs)

	        # Add some inconsistency to preferred payment method
	        if np.random.random() < 0.1:  # 10% chance of a uniformly random preference
	            preferred_payment_method = np.random.choice(PAYMENT_METHODS)

	        # Main browsing device (dependent on age and tech adoption)
	        device_probs = [
	            0.4 + 0.3 * (age - 40) / 40 - 0.2 * tech_adoption,  # Web
	            0.4 - 0.2 * (age - 40) / 40 + 0.1 * tech_adoption,  # Mobile
	            0.2 - 0.1 * (age - 40) / 40 + 0.1 * tech_adoption,  # App
	        ]
	        device_probs = normalize([clamp(p) for p in device_probs])

	        # Add noise to main browsing device probabilities, then re-normalize
	        device_noise = np.random.normal(0, 0.05, size=3)
	        device_probs = normalize([clamp(p + n) for p, n in zip(device_probs, device_noise)])

	        main_browsing_device = np.random.choice(DEVICES, p=device_probs)

	        # Product categories (dependent on age, gender, and browsing device)
	        category_probs = [0.2] * len(ALL_CATEGORIES)
	        if age < 30:
	            category_probs[2] += 0.1  # Increase Computers
	            category_probs[3] += 0.1  # Increase Smartphones
	        elif age > 60:
	            category_probs[1] += 0.1  # Increase Home Appliances
	            category_probs[4] += 0.1  # Increase TV & Audio
	        if gender == 'Male':
	            category_probs[0] += 0.05  # Slight increase in Electronics
	            category_probs[2] += 0.05  # Slight increase in Computers
	        if main_browsing_device == 'Mobile':
	            category_probs[3] += 0.1  # Increase Smartphones
	        category_probs = normalize(category_probs)
	        num_categories = np.random.randint(1, 4)  # 1 to 3 distinct categories
	        product_categories = np.random.choice(
	            ALL_CATEGORIES, size=num_categories, replace=False, p=category_probs
	        )

	        # Generate the name once so the e-mail address is derived from the
	        # same person that appears in the `name` column.
	        name = fake.name()

	        data.append({
	            'customer_id': customer_id,
	            'name': name,
	            'email': generate_email(name),
	            'age': age,
	            'gender': gender,
	            'region': region,
	            'city': city,
	            'registration_date': registration_date,
	            'phone_number': generate_phone_number(),
	            'preferred_language': preferred_language,
	            'newsletter_subscription': newsletter_subscription,
	            'preferred_payment_method': preferred_payment_method,
	            'loyalty_level': loyalty_level,
	            'main_browsing_device': main_browsing_device,
	            'product_categories_of_interest': ', '.join(product_categories),
	            'average_order_value': round(average_order_value, 2),
	            'total_orders': total_orders,
	            'last_order_date': last_order_date,
	        })

	    # Create DataFrame
	    df = pd.DataFrame(data, columns=COLUMNS)
	    return df

	if __name__ == "__main__":
	    # Script entry point: build the default-sized dataset and print its
	    # first rows as a quick visual check.
	    print(generate_synthetic_data().head())