# comfyCausalAI / data_generator.py
# Uploaded by rknl; commit 5fc7138 ("app push", verified); file size 8.36 kB.
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta, date
import random
def generate_synthetic_data(num_customers=1000, seed=42):
    """Generate a DataFrame of synthetic customer records (Ukrainian locale).

    Each record is drawn with mild, hand-tuned dependencies between fields:
    age depends on the region, gender and language depend on age and region,
    order statistics depend on age/region/gender/language, and so on.

    Args:
        num_customers: Number of customer rows to generate.
        seed: Seed applied to Faker, NumPy's global RNG and the stdlib
            ``random`` module so that repeated calls yield the same data.

    Returns:
        pandas.DataFrame with one row per customer and the columns
        ``customer_id``, ``name``, ``email``, ``age``, ``gender``, ``region``,
        ``city``, ``registration_date``, ``phone_number``,
        ``preferred_language``, ``newsletter_subscription``,
        ``preferred_payment_method``, ``loyalty_level``,
        ``main_browsing_device``, ``product_categories_of_interest``,
        ``average_order_value``, ``total_orders`` and ``last_order_date``
        (``None`` for customers without orders).
    """
    # Set up Faker for Ukrainian locale
    fake = Faker('uk_UA')
    Faker.seed(seed)
    np.random.seed(seed)
    # The helpers below draw from the stdlib ``random`` module, which is not
    # covered by the Faker/NumPy seeds; seed it too for reproducible output.
    random.seed(seed)

    # Define constants
    NUM_CUSTOMERS = num_customers
    START_DATE = date(2019, 1, 1)
    END_DATE = date(2024, 7, 31)

    # Helper functions
    def generate_phone_number():
        """Return a '+380' number: 2-digit operator code plus 7 trailing msisdn digits."""
        return f"+380{random.randint(50, 99)}{fake.msisdn()[6:]}"

    def generate_email(name):
        """Derive an email address from *name* with a randomly chosen domain."""
        username = name.lower().replace(' ', '.').replace('\'', '')
        domain = random.choice(['gmail.com', 'ukr.net', 'i.ua', 'meta.ua', 'yahoo.com'])
        return f"{username}@{domain}"

    def clamp_unit(values):
        """Clamp every value into the closed interval [0, 1]."""
        return [max(0, min(v, 1)) for v in values]

    def normalize(weights):
        """Scale *weights* so they sum to 1 (sum computed once)."""
        total = sum(weights)
        return [w / total for w in weights]

    # Define regions and their characteristics
    REGIONS = {
        'Київська': {'avg_age': 40, 'urbanization': 0.8, 'tech_adoption': 0.7},
        'Львівська': {'avg_age': 38, 'urbanization': 0.7, 'tech_adoption': 0.6},
        'Харківська': {'avg_age': 42, 'urbanization': 0.8, 'tech_adoption': 0.65},
        'Одеська': {'avg_age': 41, 'urbanization': 0.7, 'tech_adoption': 0.6},
        'Дніпропетровська': {'avg_age': 43, 'urbanization': 0.75, 'tech_adoption': 0.6},
        'Запорізька': {'avg_age': 44, 'urbanization': 0.7, 'tech_adoption': 0.55},
        'Вінницька': {'avg_age': 42, 'urbanization': 0.6, 'tech_adoption': 0.5},
        'Полтавська': {'avg_age': 43, 'urbanization': 0.65, 'tech_adoption': 0.55},
        'Чернігівська': {'avg_age': 45, 'urbanization': 0.6, 'tech_adoption': 0.5},
        'Сумська': {'avg_age': 44, 'urbanization': 0.65, 'tech_adoption': 0.5},
    }
    region_names = list(REGIONS.keys())  # hoisted: invariant across customers

    # Generate initial customer data
    data = []
    for i in range(NUM_CUSTOMERS):
        customer_id = f"C{str(i+1).zfill(6)}"

        # Region and City
        region = np.random.choice(region_names)
        region_info = REGIONS[region]
        is_urban = np.random.random() < region_info['urbanization']
        city = fake.city()
        if not is_urban:
            city = f"смт {city}"

        # Age (dependent on region)
        age = int(np.random.normal(region_info['avg_age'], 10))
        age = max(18, min(80, age))  # Clamp between 18 and 80

        # Gender (slight dependency on age and region)
        gender_prob = 0.49 + 0.02 * (age - 40) / 40  # Slight increase in male probability with age
        gender_prob += 0.02 * (region_info['urbanization'] - 0.7) / 0.3  # Slight increase in urban areas
        gender = np.random.choice(
            ['Male', 'Female', 'Other'],
            p=[gender_prob, 1 - gender_prob - 0.01, 0.01],
        )

        # Preferred Language (dependent on age and region)
        ukrainian_prob = 0.8 - 0.2 * (age - 40) / 40  # Younger people more likely to prefer Ukrainian
        ukrainian_prob += 0.1 * (1 - region_info['urbanization'])  # Rural areas more likely to prefer Ukrainian
        ukrainian_prob = min(1, max(0, ukrainian_prob))
        preferred_language = np.random.choice(
            ['Ukrainian', 'Russian'],
            p=[ukrainian_prob, 1 - ukrainian_prob],
        )

        # Registration date
        registration_date = fake.date_between(start_date=START_DATE, end_date=END_DATE)

        # Determine if the customer is active (has made orders)
        is_active = np.random.random() < 0.6  # 60% chance of being an active customer
        if is_active:
            # Total orders and average order value (dependent on various factors)
            base_orders = np.random.poisson(5)
            order_multiplier = 1 + 0.2 * (age - 40) / 40  # Age factor
            order_multiplier *= 1 + 0.1 * (region_info['tech_adoption'] - 0.6) / 0.2  # Tech adoption factor
            order_multiplier *= 1.1 if gender == 'Female' else 0.9  # Gender factor
            order_multiplier *= 1.1 if preferred_language == 'Ukrainian' else 0.9  # Language factor
            # Ensure at least 1 order for active customers
            total_orders = max(1, int(base_orders * order_multiplier))

            base_aov = np.random.gamma(shape=5, scale=100)
            aov_multiplier = 1 + 0.3 * (age - 40) / 40  # Age factor
            aov_multiplier *= 1 + 0.2 * (region_info['urbanization'] - 0.7) / 0.3  # Urbanization factor
            aov_multiplier *= 1.1 if gender == 'Male' else 0.9  # Gender factor
            average_order_value = base_aov * aov_multiplier

            # Last order date (never earlier than the registration date)
            last_order_date = fake.date_between(start_date=registration_date, end_date=END_DATE)
        else:
            total_orders = 0
            average_order_value = 0
            last_order_date = None

        # Loyalty level based on total orders: one level per two orders,
        # clamped to 1..5. Parenthesised explicitly -- the previous
        # `total_orders+1 / 2` evaluated `1 / 2` first and so never halved.
        loyalty_level = min(5, max(1, (total_orders + 1) // 2))

        # Newsletter subscription (dependent on age, loyalty, and tech adoption)
        newsletter_prob = (
            0.5
            + 0.1 * loyalty_level / 5
            - 0.2 * (age - 40) / 40
            + 0.2 * region_info['tech_adoption']
        )
        newsletter_subscription = np.random.random() < newsletter_prob

        # Preferred payment method (dependent on age and urbanization)
        payment_probs = normalize(clamp_unit([
            0.5 - 0.2 * (age - 40) / 40 + 0.2 * region_info['urbanization'],  # Credit Card
            0.3 + 0.2 * (age - 40) / 40 - 0.2 * region_info['urbanization'],  # Cash on Delivery
            0.15,  # Bank Transfer
            0.05 + 0.1 * region_info['tech_adoption'],  # PayPal
        ]))
        preferred_payment_method = np.random.choice(
            ['Credit Card', 'Cash on Delivery', 'Bank Transfer', 'PayPal'],
            p=payment_probs,
        )

        # Main browsing device (dependent on age and tech adoption)
        device_probs = normalize(clamp_unit([
            0.4 + 0.3 * (age - 40) / 40 - 0.2 * region_info['tech_adoption'],  # Web
            0.4 - 0.2 * (age - 40) / 40 + 0.1 * region_info['tech_adoption'],  # Mobile
            0.2 - 0.1 * (age - 40) / 40 + 0.1 * region_info['tech_adoption'],  # App
        ]))
        main_browsing_device = np.random.choice(['Web', 'Mobile', 'App'], p=device_probs)

        # Product categories (dependent on age, gender, and browsing device)
        all_categories = ['Electronics', 'Home Appliances', 'Computers', 'Smartphones', 'TV & Audio']
        category_probs = [0.2] * 5
        if age < 30:
            category_probs[2] += 0.1  # Increase Computers
            category_probs[3] += 0.1  # Increase Smartphones
        elif age > 60:
            category_probs[1] += 0.1  # Increase Home Appliances
            category_probs[4] += 0.1  # Increase TV & Audio
        if gender == 'Male':
            category_probs[0] += 0.05  # Slight increase in Electronics
            category_probs[2] += 0.05  # Slight increase in Computers
        if main_browsing_device == 'Mobile':
            category_probs[3] += 0.1  # Increase Smartphones
        category_probs = normalize(category_probs)
        num_categories = np.random.randint(1, 4)  # 1 to 3 categories (upper bound exclusive)
        product_categories = np.random.choice(
            all_categories, size=num_categories, replace=False, p=category_probs
        )

        # Draw the name once so the email is derived from the stored name.
        name = fake.name()

        data.append({
            'customer_id': customer_id,
            'name': name,
            'email': generate_email(name),
            'age': age,
            'gender': gender,
            'region': region,
            'city': city,
            'registration_date': registration_date,
            'phone_number': generate_phone_number(),
            'preferred_language': preferred_language,
            'newsletter_subscription': newsletter_subscription,
            'preferred_payment_method': preferred_payment_method,
            'loyalty_level': loyalty_level,
            'main_browsing_device': main_browsing_device,
            'product_categories_of_interest': ', '.join(product_categories),
            'average_order_value': round(average_order_value, 2),
            'total_orders': total_orders,
            'last_order_date': last_order_date,
        })

    # Create DataFrame
    df = pd.DataFrame(data)
    return df
if __name__ == "__main__":
    # Script entry point: build the default-sized dataset and print its
    # first rows as a quick visual check.
    customers = generate_synthetic_data()
    preview = customers.head()
    print(preview)