Spaces:
Sleeping
Sleeping
File size: 10,699 Bytes
5fc7138 2130e8d 5fc7138 2130e8d 5fc7138 2130e8d 5fc7138 2130e8d 5fc7138 2130e8d 5fc7138 2130e8d 5fc7138 2130e8d 5fc7138 2130e8d 5fc7138 2130e8d 5fc7138 2130e8d 5fc7138 2130e8d 5fc7138 2130e8d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 |
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta, date
import random
def generate_synthetic_data(num_customers=1000):
    """
    Generate synthetic customer data for e-commerce analysis.

    Builds one record per customer with demographics (age, gender, region,
    city, preferred language), contact details, purchase history and
    preferences. Faker with the ``uk_UA`` locale supplies names, cities,
    dates and phone digits; NumPy supplies the statistical draws.

    The Faker shared generator and NumPy's global generator are both
    re-seeded with 42 on every call (a side effect on global state), and the
    stdlib-``random`` helpers use a private generator seeded the same way, so
    repeated calls with the same ``num_customers`` are intended to produce
    the same data.

    Args:
        num_customers (int): The number of customer records to generate
            (default: 1000).

    Returns:
        pandas.DataFrame: One row per customer with the columns
        ``customer_id``, ``name``, ``email``, ``age``, ``gender``, ``region``,
        ``city``, ``registration_date``, ``phone_number``,
        ``preferred_language``, ``newsletter_subscription``,
        ``preferred_payment_method``, ``loyalty_level``,
        ``main_browsing_device``, ``product_categories_of_interest``,
        ``average_order_value``, ``total_orders`` and ``last_order_date``.
        Inactive customers have ``total_orders == 0``,
        ``average_order_value == 0`` and ``last_order_date`` of ``None``.
    """
    # Set up Faker for Ukrainian locale and fix every source of randomness.
    fake = Faker('uk_UA')
    Faker.seed(42)
    np.random.seed(42)
    # Private generator for the stdlib-random draws below. Without it the
    # phone prefixes and e-mail domains differed between runs even though
    # Faker and NumPy were seeded; a local instance also leaves the global
    # `random` state untouched.
    py_rng = random.Random(42)

    # Define constants
    START_DATE = date(2019, 1, 1)
    END_DATE = date(2024, 7, 31)
    EMAIL_DOMAINS = ['gmail.com', 'ukr.net', 'i.ua', 'meta.ua', 'yahoo.com']
    PAYMENT_METHODS = ['Credit Card', 'Cash on Delivery', 'Bank Transfer', 'PayPal']
    DEVICES = ['Web', 'Mobile', 'App']
    ALL_CATEGORIES = ['Electronics', 'Home Appliances', 'Computers', 'Smartphones', 'TV & Audio']

    # Helper functions
    def generate_phone_number():
        """Generate a Ukrainian-style phone number: +380, 2-digit prefix, msisdn tail."""
        # NOTE(review): assumes fake.msisdn()[6:] leaves 7 digits (13-digit
        # msisdn) so the result has 9 digits after +380 -- confirm with Faker.
        return f"+380{py_rng.randint(50, 99)}{fake.msisdn()[6:]}"

    def generate_email(name):
        """Generate an email address based on the customer's name."""
        username = name.lower().replace(' ', '.').replace("'", '')
        domain = py_rng.choice(EMAIL_DOMAINS)
        return f"{username}@{domain}"

    # Define regions and their characteristics
    REGIONS = {
        'Київська': {'avg_age': 40, 'urbanization': 0.8, 'tech_adoption': 0.7},
        'Львівська': {'avg_age': 38, 'urbanization': 0.7, 'tech_adoption': 0.6},
        'Харківська': {'avg_age': 42, 'urbanization': 0.8, 'tech_adoption': 0.65},
        'Одеська': {'avg_age': 41, 'urbanization': 0.7, 'tech_adoption': 0.6},
        'Дніпропетровська': {'avg_age': 43, 'urbanization': 0.75, 'tech_adoption': 0.6},
        'Запорізька': {'avg_age': 44, 'urbanization': 0.7, 'tech_adoption': 0.55},
        'Вінницька': {'avg_age': 42, 'urbanization': 0.6, 'tech_adoption': 0.5},
        'Полтавська': {'avg_age': 43, 'urbanization': 0.65, 'tech_adoption': 0.55},
        'Чернігівська': {'avg_age': 45, 'urbanization': 0.6, 'tech_adoption': 0.5},
        'Сумська': {'avg_age': 44, 'urbanization': 0.65, 'tech_adoption': 0.5},
    }
    region_names = list(REGIONS)  # loop-invariant, built once

    # Generate initial customer data
    data = []
    for i in range(num_customers):
        customer_id = f"C{str(i + 1).zfill(6)}"

        # Region and City
        region = np.random.choice(region_names)
        # Work on a copy: per-customer noise below must not leak into REGIONS.
        region_info = REGIONS[region].copy()
        is_urban = np.random.random() < region_info['urbanization']
        city = fake.city()
        if not is_urban:
            # "смт" marks the city as an urban-type settlement (non-urban customer).
            city = f"смт {city}"

        # Age (dependent on region), plus a little extra noise, clamped to 18..80
        age = int(np.random.normal(region_info['avg_age'], 10))
        age_noise = np.random.normal(0, 2)
        age = max(18, min(80, int(age + age_noise)))
        # Signed distance from age 40, scaled so that age 80 maps to +1.
        age_factor = (age - 40) / 40

        # Add noise to urbanization and tech adoption, keeping both in [0, 1]
        urbanization_noise = np.random.normal(0, 0.05)
        tech_adoption_noise = np.random.normal(0, 0.05)
        region_info['urbanization'] = max(0, min(1, region_info['urbanization'] + urbanization_noise))
        region_info['tech_adoption'] = max(0, min(1, region_info['tech_adoption'] + tech_adoption_noise))
        urbanization = region_info['urbanization']
        tech_adoption = region_info['tech_adoption']

        # Gender (slight dependency on age and region); 'Other' is a fixed 1%
        gender_prob = 0.49 + 0.02 * age_factor
        gender_prob += 0.02 * (urbanization - 0.7) / 0.3
        gender = np.random.choice(
            ['Male', 'Female', 'Other'],
            p=[gender_prob, 1 - gender_prob - 0.01, 0.01],
        )

        # Preferred Language (younger and more rural customers lean Ukrainian)
        ukrainian_prob = 0.8 - 0.2 * age_factor
        ukrainian_prob += 0.1 * (1 - urbanization)
        ukrainian_prob = min(1, max(0, ukrainian_prob))
        preferred_language = np.random.choice(
            ['Ukrainian', 'Russian'],
            p=[ukrainian_prob, 1 - ukrainian_prob],
        )

        # Registration date
        registration_date = fake.date_between(start_date=START_DATE, end_date=END_DATE)

        # Determine if the customer is active (has made orders): 60% chance
        is_active = np.random.random() < 0.6
        if is_active:
            # Total orders: Poisson base scaled by age, tech adoption,
            # gender and language factors.
            base_orders = np.random.poisson(5)
            order_multiplier = 1 + 0.2 * age_factor
            order_multiplier *= 1 + 0.1 * (tech_adoption - 0.6) / 0.2
            order_multiplier *= 1.1 if gender == 'Female' else 0.9
            order_multiplier *= 1.1 if preferred_language == 'Ukrainian' else 0.9
            # Active customers always have at least one order.
            total_orders = max(1, int(base_orders * order_multiplier))
            total_orders_noise = np.random.poisson(2)
            total_orders = max(1, total_orders + total_orders_noise)

            # Average order value: gamma base scaled by age, urbanization
            # and gender, then ~10% Gaussian noise, floored at 0.
            base_aov = np.random.gamma(shape=5, scale=100)
            aov_multiplier = 1 + 0.3 * age_factor
            aov_multiplier *= 1 + 0.2 * (urbanization - 0.7) / 0.3
            aov_multiplier *= 1.1 if gender == 'Male' else 0.9
            average_order_value = base_aov * aov_multiplier
            aov_noise = np.random.normal(0, average_order_value * 0.1)
            average_order_value = max(0, average_order_value + aov_noise)

            # Last order date can never precede registration.
            last_order_date = fake.date_between(start_date=registration_date, end_date=END_DATE)
        else:
            total_orders = 0
            average_order_value = 0
            last_order_date = None

        # Loyalty level (1..5) from total orders, jittered by -1, 0 or +1
        loyalty_level = min(5, max(1, int(total_orders / 2)))
        loyalty_noise = np.random.randint(-1, 2)
        loyalty_level = max(1, min(5, loyalty_level + loyalty_noise))

        # Newsletter subscription (dependent on age, loyalty, and tech adoption)
        newsletter_prob = 0.5 + 0.1 * loyalty_level / 5 - 0.2 * age_factor + 0.2 * tech_adoption
        newsletter_noise = np.random.normal(0, 0.1)
        newsletter_prob = max(0, min(1, newsletter_prob + newsletter_noise))
        newsletter_subscription = np.random.random() < newsletter_prob

        # Preferred payment method (dependent on age and urbanization)
        payment_probs = [
            0.5 - 0.2 * age_factor + 0.2 * urbanization,  # Credit Card
            0.3 + 0.2 * age_factor - 0.2 * urbanization,  # Cash on Delivery
            0.15,  # Bank Transfer
            0.05 + 0.1 * tech_adoption,  # PayPal
        ]
        # Clamp to [0, 1], then renormalize so the weights sum to 1.
        payment_probs = [max(0, min(p, 1)) for p in payment_probs]
        payment_total = sum(payment_probs)
        payment_probs = [p / payment_total for p in payment_probs]
        preferred_payment_method = np.random.choice(PAYMENT_METHODS, p=payment_probs)
        # 10% of customers get a uniformly random method instead.
        if np.random.random() < 0.1:
            preferred_payment_method = np.random.choice(PAYMENT_METHODS)

        # Main browsing device (dependent on age and tech adoption)
        device_probs = [
            0.4 + 0.3 * age_factor - 0.2 * tech_adoption,  # Web
            0.4 - 0.2 * age_factor + 0.1 * tech_adoption,  # Mobile
            0.2 - 0.1 * age_factor + 0.1 * tech_adoption,  # App
        ]
        device_probs = [max(0, min(p, 1)) for p in device_probs]
        device_total = sum(device_probs)
        device_probs = [p / device_total for p in device_probs]
        # Perturb the normalized weights, then clamp and renormalize again.
        device_noise = np.random.normal(0, 0.05, size=3)
        device_probs = [max(0, min(1, p + n)) for p, n in zip(device_probs, device_noise)]
        device_total = sum(device_probs)
        device_probs = [p / device_total for p in device_probs]
        main_browsing_device = np.random.choice(DEVICES, p=device_probs)

        # Product categories (dependent on age, gender, and browsing device);
        # indices refer to positions in ALL_CATEGORIES.
        category_probs = [0.2] * len(ALL_CATEGORIES)
        if age < 30:
            category_probs[2] += 0.1  # Increase Computers
            category_probs[3] += 0.1  # Increase Smartphones
        elif age > 60:
            category_probs[1] += 0.1  # Increase Home Appliances
            category_probs[4] += 0.1  # Increase TV & Audio
        if gender == 'Male':
            category_probs[0] += 0.05  # Slight increase in Electronics
            category_probs[2] += 0.05  # Slight increase in Computers
        if main_browsing_device == 'Mobile':
            category_probs[3] += 0.1  # Increase Smartphones
        category_total = sum(category_probs)
        category_probs = [p / category_total for p in category_probs]
        # Between 1 and 3 distinct categories per customer.
        num_categories = np.random.randint(1, 4)
        product_categories = np.random.choice(
            ALL_CATEGORIES, size=num_categories, replace=False, p=category_probs
        )

        # Draw the name once so the e-mail address is derived from the same
        # person shown in the `name` column (previously two unrelated names).
        name = fake.name()

        data.append({
            'customer_id': customer_id,
            'name': name,
            'email': generate_email(name),
            'age': age,
            'gender': gender,
            'region': region,
            'city': city,
            'registration_date': registration_date,
            'phone_number': generate_phone_number(),
            'preferred_language': preferred_language,
            'newsletter_subscription': newsletter_subscription,
            'preferred_payment_method': preferred_payment_method,
            'loyalty_level': loyalty_level,
            'main_browsing_device': main_browsing_device,
            'product_categories_of_interest': ', '.join(product_categories),
            'average_order_value': round(average_order_value, 2),
            'total_orders': total_orders,
            'last_order_date': last_order_date,
        })

    # Create DataFrame
    return pd.DataFrame(data)
if __name__ == "__main__":
    # Script entry point: build the default-size dataset and preview
    # its first rows on stdout.
    df = generate_synthetic_data()
    preview = df.head()
    print(preview)
|