File size: 10,699 Bytes
5fc7138
 
 
 
 
 
 
2130e8d
 
 
 
 
 
 
 
 
 
 
 
 
5fc7138
 
 
 
 
 
 
 
 
 
 
 
2130e8d
5fc7138
 
 
2130e8d
5fc7138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2130e8d
5fc7138
 
 
 
 
 
 
2130e8d
 
 
 
 
 
 
 
5fc7138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2130e8d
 
 
 
5fc7138
 
 
 
 
 
2130e8d
 
 
 
5fc7138
 
 
 
 
 
 
 
2130e8d
 
 
 
 
5fc7138
 
 
2130e8d
 
5fc7138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2130e8d
 
 
 
5fc7138
 
 
 
 
 
 
 
2130e8d
 
 
 
 
 
5fc7138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2130e8d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta, date
import random

def generate_synthetic_data(num_customers=1000):
    """
    Generate synthetic customer data for e-commerce analysis.

    Each record combines Faker-generated identity fields (name, city, phone,
    dates) for the ``uk_UA`` locale with attributes sampled from NumPy
    distributions whose parameters depend on the customer's region, age and
    gender.

    Faker, NumPy and a private ``random.Random`` instance are all seeded
    with 42 on every call, so the output is intended to be reproducible for
    a given ``num_customers`` (assuming the installed Faker/NumPy versions
    are themselves deterministic for a fixed seed). Note that seeding Faker
    and NumPy touches their shared/global generators.

    Args:
        num_customers (int): The number of customer records to generate
            (default: 1000). Values <= 0 yield an empty DataFrame.

    Returns:
        pandas.DataFrame: One row per customer with the columns
        ``customer_id``, ``name``, ``email``, ``age``, ``gender``,
        ``region``, ``city``, ``registration_date``, ``phone_number``,
        ``preferred_language``, ``newsletter_subscription``,
        ``preferred_payment_method``, ``loyalty_level``,
        ``main_browsing_device``, ``product_categories_of_interest``,
        ``average_order_value``, ``total_orders`` and ``last_order_date``.
        Inactive customers have ``total_orders == 0``,
        ``average_order_value == 0`` and ``last_order_date`` set to None.
    """
    # Set up Faker for Ukrainian locale
    fake = Faker('uk_UA')
    Faker.seed(42)
    np.random.seed(42)
    # The stdlib ``random`` module is not affected by the two seeds above.
    # A dedicated seeded generator keeps phone prefixes and email domains
    # reproducible without mutating the global ``random`` state.
    rng = random.Random(42)

    # Registration and order dates are drawn from this window.
    START_DATE = date(2019, 1, 1)
    END_DATE = date(2024, 7, 31)

    # Helper functions
    def generate_phone_number():
        """Generate a Ukrainian-style phone number: +380, 2-digit code, msisdn tail."""
        return f"+380{rng.randint(50, 99)}{fake.msisdn()[6:]}"

    def generate_email(name):
        """Generate an email address based on the customer's name."""
        username = name.lower().replace(' ', '.').replace('\'', '')
        domain = rng.choice(['gmail.com', 'ukr.net', 'i.ua', 'meta.ua', 'yahoo.com'])
        return f"{username}@{domain}"

    # Define regions and their characteristics
    REGIONS = {
        'Київська': {'avg_age': 40, 'urbanization': 0.8, 'tech_adoption': 0.7},
        'Львівська': {'avg_age': 38, 'urbanization': 0.7, 'tech_adoption': 0.6},
        'Харківська': {'avg_age': 42, 'urbanization': 0.8, 'tech_adoption': 0.65},
        'Одеська': {'avg_age': 41, 'urbanization': 0.7, 'tech_adoption': 0.6},
        'Дніпропетровська': {'avg_age': 43, 'urbanization': 0.75, 'tech_adoption': 0.6},
        'Запорізька': {'avg_age': 44, 'urbanization': 0.7, 'tech_adoption': 0.55},
        'Вінницька': {'avg_age': 42, 'urbanization': 0.6, 'tech_adoption': 0.5},
        'Полтавська': {'avg_age': 43, 'urbanization': 0.65, 'tech_adoption': 0.55},
        'Чернігівська': {'avg_age': 45, 'urbanization': 0.6, 'tech_adoption': 0.5},
        'Сумська': {'avg_age': 44, 'urbanization': 0.65, 'tech_adoption': 0.5}
    }

    # Loop-invariant choice lists, built once instead of per customer.
    REGION_NAMES = list(REGIONS)
    PAYMENT_METHODS = ['Credit Card', 'Cash on Delivery', 'Bank Transfer', 'PayPal']
    DEVICES = ['Web', 'Mobile', 'App']
    ALL_CATEGORIES = ['Electronics', 'Home Appliances', 'Computers', 'Smartphones', 'TV & Audio']

    # Generate initial customer data
    data = []
    for i in range(num_customers):
        customer_id = f"C{str(i+1).zfill(6)}"

        # Region and City
        region = np.random.choice(REGION_NAMES)
        # Copy so the per-customer noise below never leaks into REGIONS.
        region_info = REGIONS[region].copy()
        is_urban = np.random.random() < region_info['urbanization']
        city = fake.city()
        if not is_urban:
            # "смт" marks an urban-type settlement rather than a city.
            city = f"смт {city}"

        # Age (dependent on region), with extra noise, clamped to 18..80
        age = int(np.random.normal(region_info['avg_age'], 10))
        age_noise = np.random.normal(0, 2)  # Add noise with mean 0 and std dev 2
        age = max(18, min(80, int(age + age_noise)))

        # Add noise to urbanization and tech adoption (kept within [0, 1])
        urbanization_noise = np.random.normal(0, 0.05)
        tech_adoption_noise = np.random.normal(0, 0.05)
        region_info['urbanization'] = max(0, min(1, region_info['urbanization'] + urbanization_noise))
        region_info['tech_adoption'] = max(0, min(1, region_info['tech_adoption'] + tech_adoption_noise))

        # Gender (slight dependency on age and region)
        gender_prob = 0.49 + 0.02 * (age - 40) / 40  # Slight increase in male probability with age
        gender_prob += 0.02 * (region_info['urbanization'] - 0.7) / 0.3  # Slight increase in urban areas
        gender = np.random.choice(['Male', 'Female', 'Other'], p=[gender_prob, 1 - gender_prob - 0.01, 0.01])

        # Preferred Language (dependent on age and region)
        ukrainian_prob = 0.8 - 0.2 * (age - 40) / 40  # Younger people more likely to prefer Ukrainian
        ukrainian_prob += 0.1 * (1 - region_info['urbanization'])  # Rural areas more likely to prefer Ukrainian
        ukrainian_prob = min(1, max(0, ukrainian_prob))  # Clamp once so both entries sum to 1
        preferred_language = np.random.choice(['Ukrainian', 'Russian'], p=[ukrainian_prob, 1 - ukrainian_prob])

        # Registration date
        registration_date = fake.date_between(start_date=START_DATE, end_date=END_DATE)

        # Determine if the customer is active (has made orders)
        is_active = np.random.random() < 0.6  # 60% chance of being an active customer

        if is_active:
            # Total orders and average order value (dependent on various factors)
            base_orders = np.random.poisson(5)
            order_multiplier = 1 + 0.2 * (age - 40) / 40  # Age factor
            order_multiplier *= 1 + 0.1 * (region_info['tech_adoption'] - 0.6) / 0.2  # Tech adoption factor
            order_multiplier *= 1.1 if gender == 'Female' else 0.9  # Gender factor
            order_multiplier *= 1.1 if preferred_language == 'Ukrainian' else 0.9  # Language factor
            total_orders = max(1, int(base_orders * order_multiplier))  # Ensure at least 1 order for active customers

            # Add noise to total orders
            total_orders_noise = np.random.poisson(2)
            total_orders = max(1, total_orders + total_orders_noise)

            base_aov = np.random.gamma(shape=5, scale=100)
            aov_multiplier = 1 + 0.3 * (age - 40) / 40  # Age factor
            aov_multiplier *= 1 + 0.2 * (region_info['urbanization'] - 0.7) / 0.3  # Urbanization factor
            aov_multiplier *= 1.1 if gender == 'Male' else 0.9  # Gender factor
            average_order_value = base_aov * aov_multiplier

            # Add noise to average order value
            aov_noise = np.random.normal(0, average_order_value * 0.1)  # 10% noise
            average_order_value = max(0, average_order_value + aov_noise)

            # Last order date (never earlier than the registration date)
            last_order_date = fake.date_between(start_date=registration_date, end_date=END_DATE)
        else:
            total_orders = 0
            average_order_value = 0
            last_order_date = None

        # Loyalty level based on total orders (clamped to 1..5)
        loyalty_level = min(5, max(1, int(total_orders / 2)))

        # Add some randomness to loyalty level
        loyalty_noise = np.random.randint(-1, 2)  # -1, 0, or 1
        loyalty_level = max(1, min(5, loyalty_level + loyalty_noise))

        # Newsletter subscription (dependent on age, loyalty, and tech adoption)
        newsletter_prob = 0.5 + 0.1 * loyalty_level / 5 - 0.2 * (age - 40) / 40 + 0.2 * region_info['tech_adoption']
        newsletter_noise = np.random.normal(0, 0.1)
        newsletter_prob = max(0, min(1, newsletter_prob + newsletter_noise))
        newsletter_subscription = np.random.random() < newsletter_prob

        # Preferred payment method (dependent on age and urbanization)
        payment_probs = [
            0.5 - 0.2 * (age - 40) / 40 + 0.2 * region_info['urbanization'],  # Credit Card
            0.3 + 0.2 * (age - 40) / 40 - 0.2 * region_info['urbanization'],  # Cash on Delivery
            0.15,  # Bank Transfer
            0.05 + 0.1 * region_info['tech_adoption']  # PayPal
        ]
        # Clamp, then renormalise; the constant 0.15 entry keeps the sum positive.
        payment_probs = [max(0, min(p, 1)) for p in payment_probs]
        payment_total = sum(payment_probs)
        payment_probs = [p / payment_total for p in payment_probs]
        preferred_payment_method = np.random.choice(PAYMENT_METHODS, p=payment_probs)

        # Add some inconsistency to preferred payment method
        if np.random.random() < 0.1:  # 10% chance of inconsistent preference
            preferred_payment_method = np.random.choice(PAYMENT_METHODS)

        # Main browsing device (dependent on age and tech adoption)
        device_probs = [
            0.4 + 0.3 * (age - 40) / 40 - 0.2 * region_info['tech_adoption'],  # Web
            0.4 - 0.2 * (age - 40) / 40 + 0.1 * region_info['tech_adoption'],  # Mobile
            0.2 - 0.1 * (age - 40) / 40 + 0.1 * region_info['tech_adoption']   # App
        ]
        device_probs = [max(0, min(p, 1)) for p in device_probs]
        device_total = sum(device_probs)
        device_probs = [p / device_total for p in device_probs]

        # Add noise to main browsing device probabilities, then renormalise
        device_noise = np.random.normal(0, 0.05, size=3)
        device_probs = [max(0, min(1, p + n)) for p, n in zip(device_probs, device_noise)]
        device_total = sum(device_probs)
        device_probs = [p / device_total for p in device_probs]

        main_browsing_device = np.random.choice(DEVICES, p=device_probs)

        # Product categories (dependent on age, gender, and browsing device)
        category_probs = [0.2] * len(ALL_CATEGORIES)
        if age < 30:
            category_probs[2] += 0.1  # Increase Computers
            category_probs[3] += 0.1  # Increase Smartphones
        elif age > 60:
            category_probs[1] += 0.1  # Increase Home Appliances
            category_probs[4] += 0.1  # Increase TV & Audio
        if gender == 'Male':
            category_probs[0] += 0.05  # Slight increase in Electronics
            category_probs[2] += 0.05  # Slight increase in Computers
        if main_browsing_device == 'Mobile':
            category_probs[3] += 0.1  # Increase Smartphones
        category_total = sum(category_probs)
        category_probs = [p / category_total for p in category_probs]
        num_categories = np.random.randint(1, 4)  # 1 to 3 distinct categories
        product_categories = np.random.choice(ALL_CATEGORIES, size=num_categories, replace=False, p=category_probs)

        # Draw the name once so the email is derived from the same person
        # that ends up in the 'name' column.
        name = fake.name()

        data.append({
            'customer_id': customer_id,
            'name': name,
            'email': generate_email(name),
            'age': age,
            'gender': gender,
            'region': region,
            'city': city,
            'registration_date': registration_date,
            'phone_number': generate_phone_number(),
            'preferred_language': preferred_language,
            'newsletter_subscription': newsletter_subscription,
            'preferred_payment_method': preferred_payment_method,
            'loyalty_level': loyalty_level,
            'main_browsing_device': main_browsing_device,
            'product_categories_of_interest': ', '.join(product_categories),
            'average_order_value': round(average_order_value, 2),
            'total_orders': total_orders,
            'last_order_date': last_order_date
        })

    # Create DataFrame
    return pd.DataFrame(data)

if __name__ == "__main__":
    # Script entry point: build the default-sized dataset and preview
    # its first rows on stdout.
    customers = generate_synthetic_data()
    preview = customers.head()
    print(preview)