# comfyCausalAI / data_generator.py
# rknl's picture
# updated
# 2130e8d verified
# raw
# history blame
# 10.7 kB
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta, date
import random
# Per-region priors that condition each synthetic customer's attributes.
_REGIONS = {
    'Київська': {'avg_age': 40, 'urbanization': 0.8, 'tech_adoption': 0.7},
    'Львівська': {'avg_age': 38, 'urbanization': 0.7, 'tech_adoption': 0.6},
    'Харківська': {'avg_age': 42, 'urbanization': 0.8, 'tech_adoption': 0.65},
    'Одеська': {'avg_age': 41, 'urbanization': 0.7, 'tech_adoption': 0.6},
    'Дніпропетровська': {'avg_age': 43, 'urbanization': 0.75, 'tech_adoption': 0.6},
    'Запорізька': {'avg_age': 44, 'urbanization': 0.7, 'tech_adoption': 0.55},
    'Вінницька': {'avg_age': 42, 'urbanization': 0.6, 'tech_adoption': 0.5},
    'Полтавська': {'avg_age': 43, 'urbanization': 0.65, 'tech_adoption': 0.55},
    'Чернігівська': {'avg_age': 45, 'urbanization': 0.6, 'tech_adoption': 0.5},
    'Сумська': {'avg_age': 44, 'urbanization': 0.65, 'tech_adoption': 0.5},
}

_EMAIL_DOMAINS = ['gmail.com', 'ukr.net', 'i.ua', 'meta.ua', 'yahoo.com']
_PAYMENT_METHODS = ['Credit Card', 'Cash on Delivery', 'Bank Transfer', 'PayPal']
_DEVICES = ['Web', 'Mobile', 'App']
_CATEGORIES = ['Electronics', 'Home Appliances', 'Computers', 'Smartphones', 'TV & Audio']

# Output schema; passed to the DataFrame constructor so that an empty result
# (num_customers == 0) still carries every column.
_COLUMNS = [
    'customer_id',
    'name',
    'email',
    'age',
    'gender',
    'region',
    'city',
    'registration_date',
    'phone_number',
    'preferred_language',
    'newsletter_subscription',
    'preferred_payment_method',
    'loyalty_level',
    'main_browsing_device',
    'product_categories_of_interest',
    'average_order_value',
    'total_orders',
    'last_order_date',
]


def _clamp(value, low=0.0, high=1.0):
    """Restrict *value* to the closed interval [low, high]."""
    return max(low, min(high, value))


def _normalized(weights):
    """Scale *weights* so that they sum to 1 (the total is computed once)."""
    total = sum(weights)
    return [w / total for w in weights]


def generate_synthetic_data(num_customers=1000):
    """
    Generate synthetic customer data for e-commerce analysis.

    Each record combines Faker-generated identity fields (Ukrainian locale)
    with attributes sampled from NumPy distributions whose parameters depend
    on the customer's region, age, gender and other previously drawn fields.

    Every random source is seeded with 42 on each call (Faker's shared
    generator, NumPy's global generator, and a private ``random.Random`` used
    for phone prefixes and e-mail domains). Note that seeding Faker and NumPy
    affects their global state.

    Args:
        num_customers (int): The number of customer records to generate (default: 1000)

    Returns:
        pandas.DataFrame: One row per customer with the columns listed in
        ``_COLUMNS``. When ``num_customers`` is 0 the frame is empty but still
        has all columns.
    """
    # Set up Faker for Ukrainian locale
    fake = Faker('uk_UA')
    Faker.seed(42)
    np.random.seed(42)
    # Dedicated seeded generator: the module-level ``random`` functions are
    # never seeded here, which previously made phone numbers and e-mail
    # domains differ from run to run.
    rng = random.Random(42)

    start_date = date(2019, 1, 1)
    end_date = date(2024, 7, 31)

    def generate_phone_number():
        """Build a +380-prefixed number: two-digit code plus 7 Faker digits."""
        return f"+380{rng.randint(50, 99)}{fake.msisdn()[6:]}"

    def generate_email(name):
        """Derive an e-mail address from *name* (lower-cased, dots for spaces)."""
        username = name.lower().replace(' ', '.').replace('\'', '')
        domain = rng.choice(_EMAIL_DOMAINS)
        return f"{username}@{domain}"

    region_names = list(_REGIONS)
    data = []
    for i in range(num_customers):
        customer_id = f"C{str(i+1).zfill(6)}"

        # Region and city; non-urban customers get the "смт" settlement prefix.
        region = np.random.choice(region_names)
        region_info = _REGIONS[region]
        is_urban = np.random.random() < region_info['urbanization']
        city = fake.city()
        if not is_urban:
            city = f"смт {city}"

        # Age (dependent on region), with extra noise, clamped to 18..80.
        age = int(np.random.normal(region_info['avg_age'], 10))
        age_noise = np.random.normal(0, 2)
        age = _clamp(int(age + age_noise), 18, 80)
        # Signed distance from age 40, in units of 40 years; reused below.
        age_factor = (age - 40) / 40

        # Per-customer noisy copies of the regional rates (the shared table
        # itself is never modified).
        urbanization = _clamp(region_info['urbanization'] + np.random.normal(0, 0.05))
        tech_adoption = _clamp(region_info['tech_adoption'] + np.random.normal(0, 0.05))

        # Gender: male probability rises slightly with age and urbanization;
        # 'Other' has a fixed 1% share.
        male_prob = 0.49 + 0.02 * age_factor
        male_prob += 0.02 * (urbanization - 0.7) / 0.3
        gender = np.random.choice(
            ['Male', 'Female', 'Other'],
            p=[male_prob, 1 - male_prob - 0.01, 0.01],
        )

        # Preferred language: Ukrainian is more likely for younger and for
        # less urbanized customers. The raw value can exceed 1, hence the clamp.
        ukrainian_prob = _clamp(0.8 - 0.2 * age_factor + 0.1 * (1 - urbanization))
        preferred_language = np.random.choice(
            ['Ukrainian', 'Russian'],
            p=[ukrainian_prob, 1 - ukrainian_prob],
        )

        # Registration date
        registration_date = fake.date_between(start_date=start_date, end_date=end_date)

        # 60% of customers are active, i.e. have placed at least one order.
        is_active = np.random.random() < 0.6
        if is_active:
            base_orders = np.random.poisson(5)
            order_multiplier = 1 + 0.2 * age_factor  # Age factor
            order_multiplier *= 1 + 0.1 * (tech_adoption - 0.6) / 0.2  # Tech adoption factor
            order_multiplier *= 1.1 if gender == 'Female' else 0.9  # Gender factor
            order_multiplier *= 1.1 if preferred_language == 'Ukrainian' else 0.9  # Language factor
            total_orders = max(1, int(base_orders * order_multiplier))
            # Additive Poisson noise; active customers keep at least 1 order.
            total_orders = max(1, total_orders + np.random.poisson(2))

            base_aov = np.random.gamma(shape=5, scale=100)
            aov_multiplier = 1 + 0.3 * age_factor  # Age factor
            aov_multiplier *= 1 + 0.2 * (urbanization - 0.7) / 0.3  # Urbanization factor
            aov_multiplier *= 1.1 if gender == 'Male' else 0.9  # Gender factor
            average_order_value = base_aov * aov_multiplier
            # 10% relative Gaussian noise, floored at zero.
            aov_noise = np.random.normal(0, average_order_value * 0.1)
            average_order_value = max(0, average_order_value + aov_noise)

            # The last order can never precede registration.
            last_order_date = fake.date_between(start_date=registration_date, end_date=end_date)
        else:
            total_orders = 0
            average_order_value = 0
            last_order_date = None

        # Loyalty level: half the order count, clamped to 1..5, then shifted
        # by -1, 0 or +1 and clamped again.
        loyalty_level = _clamp(int(total_orders / 2), 1, 5)
        loyalty_level = _clamp(loyalty_level + np.random.randint(-1, 2), 1, 5)

        # Newsletter subscription (dependent on age, loyalty, and tech adoption)
        newsletter_prob = 0.5 + 0.1 * loyalty_level / 5 - 0.2 * age_factor + 0.2 * tech_adoption
        newsletter_prob = _clamp(newsletter_prob + np.random.normal(0, 0.1))
        newsletter_subscription = np.random.random() < newsletter_prob

        # Preferred payment method (dependent on age and urbanization)
        payment_probs = _normalized([
            _clamp(0.5 - 0.2 * age_factor + 0.2 * urbanization),  # Credit Card
            _clamp(0.3 + 0.2 * age_factor - 0.2 * urbanization),  # Cash on Delivery
            0.15,                                                 # Bank Transfer
            _clamp(0.05 + 0.1 * tech_adoption),                   # PayPal
        ])
        preferred_payment_method = np.random.choice(_PAYMENT_METHODS, p=payment_probs)
        # 10% of customers get a uniformly random method instead.
        if np.random.random() < 0.1:
            preferred_payment_method = np.random.choice(_PAYMENT_METHODS)

        # Main browsing device (dependent on age and tech adoption)
        device_probs = _normalized([
            _clamp(0.4 + 0.3 * age_factor - 0.2 * tech_adoption),  # Web
            _clamp(0.4 - 0.2 * age_factor + 0.1 * tech_adoption),  # Mobile
            _clamp(0.2 - 0.1 * age_factor + 0.1 * tech_adoption),  # App
        ])
        # Perturb the probabilities, then re-clamp and re-normalize so they
        # remain a valid distribution.
        device_noise = np.random.normal(0, 0.05, size=3)
        device_probs = _normalized(
            [_clamp(p + n) for p, n in zip(device_probs, device_noise)]
        )
        main_browsing_device = np.random.choice(_DEVICES, p=device_probs)

        # Product categories (dependent on age, gender, and browsing device).
        # Weights are indexed in the order of _CATEGORIES.
        category_probs = [0.2] * len(_CATEGORIES)
        if age < 30:
            category_probs[2] += 0.1  # Computers
            category_probs[3] += 0.1  # Smartphones
        elif age > 60:
            category_probs[1] += 0.1  # Home Appliances
            category_probs[4] += 0.1  # TV & Audio
        if gender == 'Male':
            category_probs[0] += 0.05  # Electronics
            category_probs[2] += 0.05  # Computers
        if main_browsing_device == 'Mobile':
            category_probs[3] += 0.1  # Smartphones
        category_probs = _normalized(category_probs)
        num_categories = np.random.randint(1, 4)  # 1, 2 or 3 categories
        product_categories = np.random.choice(
            _CATEGORIES, size=num_categories, replace=False, p=category_probs
        )

        # Draw the name once so that the e-mail address is derived from the
        # same person the record describes.
        name = fake.name()
        email = generate_email(name)

        data.append({
            'customer_id': customer_id,
            'name': name,
            'email': email,
            'age': age,
            'gender': gender,
            'region': region,
            'city': city,
            'registration_date': registration_date,
            'phone_number': generate_phone_number(),
            'preferred_language': preferred_language,
            'newsletter_subscription': newsletter_subscription,
            'preferred_payment_method': preferred_payment_method,
            'loyalty_level': loyalty_level,
            'main_browsing_device': main_browsing_device,
            'product_categories_of_interest': ', '.join(product_categories),
            'average_order_value': round(average_order_value, 2),
            'total_orders': total_orders,
            'last_order_date': last_order_date,
        })

    # Explicit columns keep the schema stable even when no rows were generated.
    return pd.DataFrame(data, columns=_COLUMNS)
if __name__ == "__main__":
    # Smoke run: build the default-sized dataset and show its first rows.
    df = generate_synthetic_data()
    preview = df.head()
    print(preview)