# File size: 8,357 Bytes
# 5fc7138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta, date
import random

def generate_synthetic_data(num_customers=1000, seed=42):
    """Build a DataFrame of synthetic Ukrainian customer records.

    Each row describes one customer: identity fields produced by Faker
    (``uk_UA`` locale), demographic attributes drawn from per-region
    parameters, and purchasing attributes whose distributions depend on
    the demographics.

    Args:
        num_customers: Number of customer rows to generate.
        seed: Seed applied to Faker, NumPy's global RNG and the stdlib
            ``random`` module so repeated calls produce the same data.

    Returns:
        A ``pandas.DataFrame`` with one row per customer and the columns
        ``customer_id``, ``name``, ``email``, ``age``, ``gender``,
        ``region``, ``city``, ``registration_date``, ``phone_number``,
        ``preferred_language``, ``newsletter_subscription``,
        ``preferred_payment_method``, ``loyalty_level``,
        ``main_browsing_device``, ``product_categories_of_interest``,
        ``average_order_value``, ``total_orders`` and ``last_order_date``.
    """
    # Set up Faker for Ukrainian locale
    fake = Faker('uk_UA')

    # Seed every random source used below. The helpers call the stdlib
    # ``random`` module directly, so it needs its own seed for the output
    # to be reproducible.
    Faker.seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # Define constants
    START_DATE = date(2019, 1, 1)
    END_DATE = date(2024, 7, 31)

    # Helper functions
    def generate_phone_number():
        # "+380" + two-digit prefix + the trailing digits of a Faker MSISDN.
        return f"+380{random.randint(50, 99)}{fake.msisdn()[6:]}"

    def generate_email(name):
        username = name.lower().replace(' ', '.').replace('\'', '')
        domain = random.choice(['gmail.com', 'ukr.net', 'i.ua', 'meta.ua', 'yahoo.com'])
        return f"{username}@{domain}"

    def normalize(weights):
        # Clamp each weight into [0, 1], then rescale so they sum to 1.
        clamped = [max(0, min(w, 1)) for w in weights]
        total = sum(clamped)
        return [w / total for w in clamped]

    # Define regions and their characteristics
    REGIONS = {
        'Київська': {'avg_age': 40, 'urbanization': 0.8, 'tech_adoption': 0.7},
        'Львівська': {'avg_age': 38, 'urbanization': 0.7, 'tech_adoption': 0.6},
        'Харківська': {'avg_age': 42, 'urbanization': 0.8, 'tech_adoption': 0.65},
        'Одеська': {'avg_age': 41, 'urbanization': 0.7, 'tech_adoption': 0.6},
        'Дніпропетровська': {'avg_age': 43, 'urbanization': 0.75, 'tech_adoption': 0.6},
        'Запорізька': {'avg_age': 44, 'urbanization': 0.7, 'tech_adoption': 0.55},
        'Вінницька': {'avg_age': 42, 'urbanization': 0.6, 'tech_adoption': 0.5},
        'Полтавська': {'avg_age': 43, 'urbanization': 0.65, 'tech_adoption': 0.55},
        'Чернігівська': {'avg_age': 45, 'urbanization': 0.6, 'tech_adoption': 0.5},
        'Сумська': {'avg_age': 44, 'urbanization': 0.65, 'tech_adoption': 0.5}
    }
    region_names = list(REGIONS)
    all_categories = ['Electronics', 'Home Appliances', 'Computers', 'Smartphones', 'TV & Audio']

    # Generate initial customer data
    data = []
    for i in range(num_customers):
        customer_id = f"C{str(i+1).zfill(6)}"

        # Region and City
        region = np.random.choice(region_names)
        region_info = REGIONS[region]
        is_urban = np.random.random() < region_info['urbanization']
        city = fake.city()
        if not is_urban:
            city = f"смт {city}"

        # Age (dependent on region), clamped between 18 and 80
        age = int(np.random.normal(region_info['avg_age'], 10))
        age = max(18, min(80, age))
        age_factor = (age - 40) / 40

        # Gender (slight dependency on age and region)
        gender_prob = 0.49 + 0.02 * age_factor  # Slight increase in male probability with age
        gender_prob += 0.02 * (region_info['urbanization'] - 0.7) / 0.3  # Slight increase in urban areas
        gender = np.random.choice(['Male', 'Female', 'Other'], p=[gender_prob, 1-gender_prob-0.01, 0.01])

        # Preferred Language (dependent on age and region)
        ukrainian_prob = 0.8 - 0.2 * age_factor  # Younger people more likely to prefer Ukrainian
        ukrainian_prob += 0.1 * (1 - region_info['urbanization'])  # Rural areas more likely to prefer Ukrainian
        ukrainian_prob = min(1, max(0, ukrainian_prob))
        preferred_language = np.random.choice(['Ukrainian', 'Russian'], p=[ukrainian_prob, 1 - ukrainian_prob])

        # Registration date
        registration_date = fake.date_between(start_date=START_DATE, end_date=END_DATE)

        # Determine if the customer is active (has made orders)
        is_active = np.random.random() < 0.6  # 60% chance of being an active customer

        if is_active:
            # Total orders and average order value (dependent on various factors)
            base_orders = np.random.poisson(5)
            order_multiplier = 1 + 0.2 * age_factor  # Age factor
            order_multiplier *= 1 + 0.1 * (region_info['tech_adoption'] - 0.6) / 0.2  # Tech adoption factor
            order_multiplier *= 1.1 if gender == 'Female' else 0.9  # Gender factor
            order_multiplier *= 1.1 if preferred_language == 'Ukrainian' else 0.9  # Language factor
            total_orders = max(1, int(base_orders * order_multiplier))  # Ensure at least 1 order for active customers

            base_aov = np.random.gamma(shape=5, scale=100)
            aov_multiplier = 1 + 0.3 * age_factor  # Age factor
            aov_multiplier *= 1 + 0.2 * (region_info['urbanization'] - 0.7) / 0.3  # Urbanization factor
            aov_multiplier *= 1.1 if gender == 'Male' else 0.9  # Gender factor
            average_order_value = base_aov * aov_multiplier

            # Last order date (never earlier than the registration date)
            last_order_date = fake.date_between(start_date=registration_date, end_date=END_DATE)
        else:
            total_orders = 0
            average_order_value = 0.0
            last_order_date = None

        # Loyalty level: one level per two orders, clamped to 1..5.
        # The parentheses matter: `total_orders + 1 / 2` would add 0.5 and
        # collapse to the raw order count.
        loyalty_level = min(5, max(1, (total_orders + 1) // 2))

        # Newsletter subscription (dependent on age, loyalty, and tech adoption)
        newsletter_prob = 0.5 + 0.1 * loyalty_level / 5 - 0.2 * age_factor + 0.2 * region_info['tech_adoption']
        newsletter_subscription = np.random.random() < newsletter_prob

        # Preferred payment method (dependent on age and urbanization)
        payment_probs = normalize([
            0.5 - 0.2 * age_factor + 0.2 * region_info['urbanization'],  # Credit Card
            0.3 + 0.2 * age_factor - 0.2 * region_info['urbanization'],  # Cash on Delivery
            0.15,  # Bank Transfer
            0.05 + 0.1 * region_info['tech_adoption']  # PayPal
        ])
        preferred_payment_method = np.random.choice(
            ['Credit Card', 'Cash on Delivery', 'Bank Transfer', 'PayPal'],
            p=payment_probs
        )

        # Main browsing device (dependent on age and tech adoption)
        device_probs = normalize([
            0.4 + 0.3 * age_factor - 0.2 * region_info['tech_adoption'],  # Web
            0.4 - 0.2 * age_factor + 0.1 * region_info['tech_adoption'],  # Mobile
            0.2 - 0.1 * age_factor + 0.1 * region_info['tech_adoption']   # App
        ])
        main_browsing_device = np.random.choice(['Web', 'Mobile', 'App'], p=device_probs)

        # Product categories (dependent on age, gender, and browsing device)
        category_probs = [0.2] * len(all_categories)
        if age < 30:
            category_probs[2] += 0.1  # Increase Computers
            category_probs[3] += 0.1  # Increase Smartphones
        elif age > 60:
            category_probs[1] += 0.1  # Increase Home Appliances
            category_probs[4] += 0.1  # Increase TV & Audio
        if gender == 'Male':
            category_probs[0] += 0.05  # Slight increase in Electronics
            category_probs[2] += 0.05  # Slight increase in Computers
        if main_browsing_device == 'Mobile':
            category_probs[3] += 0.1  # Increase Smartphones
        category_total = sum(category_probs)
        category_probs = [p / category_total for p in category_probs]
        num_categories = np.random.randint(1, 4)  # 1 to 3 categories (upper bound exclusive)
        product_categories = np.random.choice(all_categories, size=num_categories, replace=False, p=category_probs)

        # Draw the name once so the e-mail address is derived from the
        # same person shown in the `name` column.
        name = fake.name()

        data.append({
            'customer_id': customer_id,
            'name': name,
            'email': generate_email(name),
            'age': age,
            'gender': gender,
            'region': region,
            'city': city,
            'registration_date': registration_date,
            'phone_number': generate_phone_number(),
            'preferred_language': preferred_language,
            'newsletter_subscription': newsletter_subscription,
            'preferred_payment_method': preferred_payment_method,
            'loyalty_level': loyalty_level,
            'main_browsing_device': main_browsing_device,
            'product_categories_of_interest': ', '.join(product_categories),
            'average_order_value': round(average_order_value, 2),
            'total_orders': total_orders,
            'last_order_date': last_order_date
        })

    # Create DataFrame
    return pd.DataFrame(data)

if __name__ == "__main__":
    # Script entry point: build the default-sized dataset and print
    # a preview of its first rows.
    print(generate_synthetic_data().head())