rknl committed on
Commit
2130e8d
·
verified ·
1 Parent(s): bf52b89
Files changed (5) hide show
  1. README.md +66 -2
  2. app.py +425 -46
  3. data_generator.py +50 -4
  4. rct_analyzer.py +37 -26
  5. rct_simulator.py +69 -14
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: ComfyCausalAI
3
  emoji: 🐨
4
  colorFrom: blue
5
  colorTo: yellow
@@ -9,4 +9,68 @@ app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Example2
3
  emoji: 🐨
4
  colorFrom: blue
5
  colorTo: yellow
 
9
  pinned: false
10
  ---
11
 
12
+ # Causal AI - Synthetic Customer Data Generator and RCT Simulator
13
+
14
+ ## Project Overview
15
+
16
+ This project provides a comprehensive toolkit for generating synthetic e-commerce customer data, simulating Randomized Controlled Trials (RCTs), and analyzing the results using causal inference techniques. It's designed to help data scientists and analysts explore the impact of various discount strategies on customer behavior and business metrics.
17
+
18
+ ## Key Features
19
+
20
+ 1. **Synthetic Data Generation**: Create realistic customer profiles with various attributes.
21
+ 2. **RCT Simulation**: Run experiments with different discount levels.
22
+ 3. **Results Analysis**: Analyze the impact of discounts on key business metrics.
23
+ 4. **Uplift Modeling**: Build and evaluate uplift models to predict individual treatment effects.
24
+ 5. **Targeted Policy Simulation**: Test targeting strategies based on uplift model predictions.
25
+
26
+ ## Project Structure
27
+
28
+ - `app.py`: Main application file with Gradio interface
29
+ - `data_generator.py`: Synthetic customer data generation
30
+ - `rct_simulator.py`: Randomized Control Trial simulation
31
+ - `rct_analyzer.py`: Analysis of RCT results
32
+ - `requirements.txt`: List of required Python packages
33
+
34
+ ## Installation
35
+
36
+ 1. Clone the repository:
37
+ ```
38
+ git clone https://github.com/neurons-lab/causal-ai.git
39
+ cd causal-ai
40
+ ```
41
+
42
+ 2. Create a virtual environment (optional but recommended):
43
+ ```
44
+ python -m venv venv
45
+ source venv/bin/activate # On Windows, use `venv\Scripts\activate`
46
+ ```
47
+
48
+ 3. Install the required packages:
49
+ ```
50
+ pip install -r requirements.txt
51
+ ```
52
+
53
+ ## Usage
54
+
55
+ Run the main application:
56
+
57
+ ```
58
+ python app.py
59
+ ```
60
+
61
+ This will start the Gradio interface, which you can access through your web browser. The interface is divided into several tabs:
62
+
63
+ 1. **Generate Customer Data**: Create synthetic customer profiles.
64
+ 2. **Run RCT Simulation**: Simulate a Randomized Controlled Trial with different discount levels.
65
+ 3. **Analyze RCT Results**: View and analyze the results of the RCT.
66
+ 4. **Exploratory Data Analysis**: Explore the impact of discounts on different customer segments.
67
+ 5. **Build Uplift Model**: Create uplift models to predict individual treatment effects.
68
+ 6. **Run Targeting Policy**: Test targeting strategies based on uplift model predictions.
69
+
70
+ ## Customization
71
+
72
+ You can customize various aspects of the simulation:
73
+
74
+ - Modify the `electronics_products` list in `rct_simulator.py` to change the available products.
75
+ - Adjust the `calculate_purchase_probability` function in `rct_simulator.py` to alter how customer attributes affect purchase likelihood.
76
+ - Update the `REGIONS` dictionary in `data_generator.py` to change the demographic characteristics of different regions.
app.py CHANGED
@@ -6,18 +6,34 @@ import numpy as np
6
  import plotly
7
  import plotly.graph_objs as go
8
  from plotly.subplots import make_subplots
 
9
  from sklearn.preprocessing import StandardScaler
10
  from causalml.inference.meta import BaseTClassifier
11
  from sklearn.ensemble import RandomForestClassifier
12
  from data_generator import generate_synthetic_data
13
- from rct_simulator import run_rct_simulation
14
  from rct_analyzer import analyze_rct_results
 
 
15
 
16
  # Global variables to store generated data and RCT results
17
  generated_data = None
18
  rct_results = None
 
 
19
 
20
  def perform_eda(discount_level):
 
 
 
 
 
 
 
 
 
 
 
 
21
  global rct_results, generated_data
22
  if rct_results is None or generated_data is None:
23
  return "Please generate customer data and run RCT simulation first.", None, None, None, None
@@ -47,6 +63,19 @@ def perform_eda(discount_level):
47
  newsletter_results, payment_results, newsletter_fig, payment_fig)
48
 
49
  def analyze_feature(df, feature):
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  control_df = df[df['variant'] == 'Control']
51
  treatment_df = df[df['variant'] != 'Control']
52
 
@@ -67,6 +96,17 @@ def analyze_feature(df, feature):
67
  return results
68
 
69
  def create_bar_plot(data, feature, discount_level):
 
 
 
 
 
 
 
 
 
 
 
70
  fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
71
 
72
  data[feature] = data[feature].astype(str) # Ensure the feature is treated as a string
@@ -86,8 +126,16 @@ def create_bar_plot(data, feature, discount_level):
86
  plt.tight_layout()
87
  return fig
88
 
89
-
90
  def generate_and_display_data(num_customers):
 
 
 
 
 
 
 
 
 
91
  global generated_data
92
  generated_data = generate_synthetic_data(num_customers=num_customers)
93
 
@@ -106,6 +154,15 @@ def generate_and_display_data(num_customers):
106
  f"Generated {num_customers} records. Displaying samples of 10 rows for each dataset.")
107
 
108
  def run_and_display_rct(experiment_duration):
 
 
 
 
 
 
 
 
 
109
  global generated_data, rct_results
110
  if generated_data is None:
111
  return None, None, "Please generate customer data first."
@@ -120,6 +177,12 @@ def run_and_display_rct(experiment_duration):
120
  f"Ran RCT simulation for {experiment_duration} days. Displaying samples of 10 rows for each dataset.")
121
 
122
  def analyze_and_display_results():
 
 
 
 
 
 
123
  global rct_results
124
  if rct_results is None:
125
  return None, None, None, "Please run the RCT simulation first."
@@ -129,6 +192,21 @@ def analyze_and_display_results():
129
  return overall_df, variant_df, fig, "Analysis complete. Displaying results and visualizations."
130
 
131
  def build_uplift_model(data, features, treatment, control):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  # Prepare the data
133
  treatment_data = data[data['variant'] == treatment]
134
  control_data = data[data['variant'] == control]
@@ -174,50 +252,325 @@ def build_uplift_model(data, features, treatment, control):
174
  elif uplift_scores.shape[1] == 1:
175
  uplift_scores = uplift_scores.flatten()
176
 
177
- return uplift_scores, feature_importance_df
178
 
179
- def build_model_and_display(selected_features, treatment):
180
- global rct_results, generated_data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  if rct_results is None or generated_data is None:
182
  return "Please generate customer data and run RCT simulation first.", None, None
183
-
184
  transactions_df, variant_assignments_df = rct_results
185
 
186
  # Prepare the data
187
  df_with_variant = pd.merge(generated_data, variant_assignments_df, on='customer_id', how='inner')
188
  transactions_df['purchase'] = 1
189
  final_df = pd.merge(df_with_variant, transactions_df, on=['customer_id', 'variant'], how='left')
190
- columns_to_fill = ['purchase', 'price', 'discounted_price', 'cost', 'profit']
191
- final_df[columns_to_fill] = final_df[columns_to_fill].fillna(0)
192
-
193
- # Build the model
194
- uplift_scores, feature_importance_df = build_uplift_model(final_df, selected_features, treatment, 'Control')
195
-
196
- # Calculate statistics
197
- stats = pd.DataFrame({
198
- 'Metric': ['Mean', 'Std', 'Min', 'Max'],
199
- 'Value': [
200
- np.mean(uplift_scores),
201
- np.std(uplift_scores),
202
- np.min(uplift_scores),
203
- np.max(uplift_scores)
204
- ]
205
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
  # Create feature importance plot
208
- fig, ax = plt.subplots(figsize=(10, 6))
209
- sns.barplot(x='importance', y='feature', data=feature_importance_df.head(10), ax=ax)
210
- ax.set_title(f'Top 10 Feature Importance for {treatment} vs Control')
 
 
 
 
 
 
211
  ax.set_xlabel('Importance')
212
  ax.set_ylabel('Feature')
213
  plt.tight_layout()
214
 
215
- info = f"Uplift model built using {len(selected_features)} features.\n"
216
- info += f"Treatment: {treatment} vs Control\n"
217
- info += f"Number of samples: {len(uplift_scores)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
 
219
- return info, stats, fig
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
  with gr.Blocks() as demo:
222
  gr.Markdown("# Causal AI - Synthetic Customer Data Generator and RCT Simulator")
223
 
@@ -227,7 +580,7 @@ with gr.Blocks() as demo:
227
  gr.Markdown("First we generate some basic attributes that are defined when the customer first registers, such as Name, City or Preferred Language.")
228
  gr.Markdown("Then we add some extra information that is usually the result of the customer past behavior, such as Loyalty Level, Past Purchases or Categories of interest.")
229
  gr.Markdown("## Select the number of customers that you want to Generate")
230
- num_customers_input = gr.Slider(minimum=10000, maximum=500000, value=50000, step=1000, label="Number of Customer Records")
231
  generate_btn = gr.Button("Generate Customer Data")
232
  gr.Markdown("## Basic Customer Info Sample")
233
  basic_info_output = gr.DataFrame()
@@ -301,34 +654,60 @@ with gr.Blocks() as demo:
301
  )
302
 
303
  with gr.Tab("Build Uplift Model"):
304
- gr.Markdown("## Build Uplift Model")
305
 
306
  # Feature selection
307
  feature_checklist = gr.CheckboxGroup(
308
  choices=['age', 'gender', 'region', 'preferred_language', 'newsletter_subscription',
309
  'preferred_payment_method', 'loyalty_level', 'main_browsing_device',
310
  'average_order_value', 'total_orders'],
311
- label="Select features for the model",
312
  value=['age', 'gender', 'loyalty_level', 'average_order_value', 'total_orders']
313
  )
314
 
315
- # Dropdown for selecting treatment
316
- treatment_dropdown = gr.Dropdown(
317
- choices=['5% discount', '10% discount', '15% discount'],
318
- label="Select treatment",
319
- value='10% discount'
320
- )
321
-
322
- build_model_btn = gr.Button("Build Uplift Model")
323
 
324
  model_info = gr.Textbox(label="Model Information")
325
- uplift_stats = gr.Dataframe(label="Uplift Score Statistics")
326
- feature_importance_plot = gr.Plot(label="Feature Importance")
327
 
328
  build_model_btn.click(
329
- fn=build_model_and_display,
330
- inputs=[feature_checklist, treatment_dropdown],
331
- outputs=[model_info, uplift_stats, feature_importance_plot]
332
  )
333
 
334
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  import plotly
7
  import plotly.graph_objs as go
8
  from plotly.subplots import make_subplots
9
+ import plotly.subplots as sp
10
  from sklearn.preprocessing import StandardScaler
11
  from causalml.inference.meta import BaseTClassifier
12
  from sklearn.ensemble import RandomForestClassifier
13
  from data_generator import generate_synthetic_data
 
14
  from rct_analyzer import analyze_rct_results
15
+ from sklearn.model_selection import train_test_split
16
+ from rct_simulator import run_rct_simulation, electronics_products, calculate_purchase_probability
17
 
18
  # Global variables to store generated data and RCT results
19
  generated_data = None
20
  rct_results = None
21
+ uplift_models = {}
22
+ last_used_features = None
23
 
24
  def perform_eda(discount_level):
25
+ """
26
+ Perform Exploratory Data Analysis on the RCT results for a specific discount level.
27
+
28
+ This function analyzes the impact of newsletter subscription and preferred payment method
29
+ on purchase behavior and profitability for the selected discount level.
30
+
31
+ Args:
32
+ discount_level (str): The discount level to analyze ('5% discount', '10% discount', or '15% discount')
33
+
34
+ Returns:
35
+ tuple: Contains EDA results, including newsletter and payment method analysis dataframes and plots
36
+ """
37
  global rct_results, generated_data
38
  if rct_results is None or generated_data is None:
39
  return "Please generate customer data and run RCT simulation first.", None, None, None, None
 
63
  newsletter_results, payment_results, newsletter_fig, payment_fig)
64
 
65
  def analyze_feature(df, feature):
66
+ """
67
+ Analyze the impact of a specific feature on purchase behavior and profitability.
68
+
69
+ This function calculates incremental purchases and profits for different values of the feature,
70
+ comparing the treatment group (discount) to the control group.
71
+
72
+ Args:
73
+ df (pandas.DataFrame): The dataset containing customer and transaction data
74
+ feature (str): The feature to analyze (e.g., 'newsletter_subscription' or 'preferred_payment_method')
75
+
76
+ Returns:
77
+ pandas.DataFrame: Results of the feature analysis, including incremental purchases and profits
78
+ """
79
  control_df = df[df['variant'] == 'Control']
80
  treatment_df = df[df['variant'] != 'Control']
81
 
 
96
  return results
97
 
98
  def create_bar_plot(data, feature, discount_level):
99
+ """
100
+ Create a bar plot to visualize the impact of a feature on incremental purchases and profits.
101
+
102
+ Args:
103
+ data (pandas.DataFrame): The data to plot
104
+ feature (str): The feature being analyzed
105
+ discount_level (str): The discount level being analyzed
106
+
107
+ Returns:
108
+ matplotlib.figure.Figure: The created bar plot
109
+ """
110
  fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
111
 
112
  data[feature] = data[feature].astype(str) # Ensure the feature is treated as a string
 
126
  plt.tight_layout()
127
  return fig
128
 
 
129
  def generate_and_display_data(num_customers):
130
+ """
131
+ Generate synthetic customer data and display samples of basic and extra customer information.
132
+
133
+ Args:
134
+ num_customers (int): The number of customer records to generate
135
+
136
+ Returns:
137
+ tuple: Contains sample dataframes of basic and extra customer information, and generation info
138
+ """
139
  global generated_data
140
  generated_data = generate_synthetic_data(num_customers=num_customers)
141
 
 
154
  f"Generated {num_customers} records. Displaying samples of 10 rows for each dataset.")
155
 
156
  def run_and_display_rct(experiment_duration):
157
+ """
158
+ Run a Randomized Control Trial (RCT) simulation and display sample results.
159
+
160
+ Args:
161
+ experiment_duration (int): The duration of the experiment in days
162
+
163
+ Returns:
164
+ tuple: Contains sample dataframes of variant assignments and transactions, and simulation info
165
+ """
166
  global generated_data, rct_results
167
  if generated_data is None:
168
  return None, None, "Please generate customer data first."
 
177
  f"Ran RCT simulation for {experiment_duration} days. Displaying samples of 10 rows for each dataset.")
178
 
179
  def analyze_and_display_results():
180
+ """
181
+ Analyze the results of the RCT simulation and display overall metrics, variant metrics, and visualizations.
182
+
183
+ Returns:
184
+ tuple: Contains overall metrics dataframe, variant metrics dataframe, visualization, and analysis info
185
+ """
186
  global rct_results
187
  if rct_results is None:
188
  return None, None, None, "Please run the RCT simulation first."
 
192
  return overall_df, variant_df, fig, "Analysis complete. Displaying results and visualizations."
193
 
194
  def build_uplift_model(data, features, treatment, control):
195
+ """
196
+ Build an uplift model to predict the impact of a treatment on customer behavior.
197
+
198
+ This function prepares the data, creates dummy variables for categorical features,
199
+ standardizes numerical features, and fits a RandomForestClassifier and a BaseTClassifier model.
200
+
201
+ Args:
202
+ data (pandas.DataFrame): The dataset containing customer and transaction data
203
+ features (list): List of features to use in the model
204
+ treatment (str): The treatment variant (e.g., '10% discount')
205
+ control (str): The control variant
206
+
207
+ Returns:
208
+ tuple: Contains the fitted model, uplift scores, feature importance dataframe, and prepared features
209
+ """
210
  # Prepare the data
211
  treatment_data = data[data['variant'] == treatment]
212
  control_data = data[data['variant'] == control]
 
252
  elif uplift_scores.shape[1] == 1:
253
  uplift_scores = uplift_scores.flatten()
254
 
255
+ return model, uplift_scores, feature_importance_df, X
256
 
257
def calculate_incremental_metrics(data, uplift_scores, treatment, threshold):
    """
    Estimate incremental purchases and profit from targeting customers above an uplift threshold.

    Customers whose uplift score is strictly greater than ``threshold`` form the targeted
    group. Within that group, the mean 'purchase' and 'profit' of the rows assigned to
    ``treatment`` are compared with those of the rows assigned to 'Control', and the
    difference is scaled by the size of the targeted group.

    Args:
        data (pandas.DataFrame): Rows with 'variant', 'purchase' and 'profit' columns
        uplift_scores (numpy.array): One uplift score per row of ``data``, in row order
        treatment (str): The treatment variant (e.g., '10% discount')
        threshold (float): The uplift score threshold for targeting

    Returns:
        tuple: (incremental purchases, incremental profit). Both are 0.0 when no row is
        targeted. Either value is NaN when the targeted group contains no treated rows or
        no control rows, because the corresponding mean is undefined.
    """
    # The boolean mask is applied to the rows of `data`, so the scores must line up with them.
    targeted = data[uplift_scores > threshold]
    if targeted.empty:
        # Targeting nobody yields no incremental effect; avoid NaN from means of empty groups.
        return 0.0, 0.0

    targeted_treated = targeted[targeted['variant'] == treatment]
    targeted_control = targeted[targeted['variant'] == 'Control']

    n_targeted = len(targeted)
    inc_purchases = (targeted_treated['purchase'].mean() - targeted_control['purchase'].mean()) * n_targeted
    inc_profit = (targeted_treated['profit'].mean() - targeted_control['profit'].mean()) * n_targeted

    return inc_purchases, inc_profit
281
+
282
+
283
def build_models_and_display(selected_features):
    """
    Build uplift models for all discount levels and display results.

    For each of the 5%, 10% and 15% discount treatments, an uplift model is fitted against
    'Control' on a 50% customer-level train split, scored on the held-out split, and plotted
    as an incremental-profit vs incremental-purchases curve for both splits. The fitted
    models and the feature list are stored in the module-level ``uplift_models`` and
    ``last_used_features`` so that later steps can reuse them.

    Args:
        selected_features (list): List of features to use in the models

    Returns:
        tuple: (model information text, matplotlib feature-importance figure, plotly uplift
        figure). When prerequisites are missing, the text explains why and both figures
        are None.
    """
    global rct_results, generated_data, uplift_models, last_used_features
    if rct_results is None or generated_data is None:
        return "Please generate customer data and run RCT simulation first.", None, None
    if not selected_features:
        return "Please select at least one feature.", None, None

    transactions_df, variant_assignments_df = rct_results

    # Prepare the data
    df_with_variant = pd.merge(generated_data, variant_assignments_df, on='customer_id', how='inner')
    # Flag purchases on a copy so the cached RCT results held in `rct_results` stay untouched.
    purchases_df = transactions_df.assign(purchase=1)
    final_df = pd.merge(df_with_variant, purchases_df, on=['customer_id', 'variant'], how='left')
    outcome_columns = ['purchase', 'price', 'discounted_price', 'cost', 'profit']
    # Customers without a matching transaction get zero purchase/revenue/profit.
    final_df[outcome_columns] = final_df[outcome_columns].fillna(0)

    # Perform train/test split at customer ID level
    train_ids, test_ids = train_test_split(final_df['customer_id'].unique(), test_size=0.5, random_state=42)
    train_df = final_df[final_df['customer_id'].isin(train_ids)]
    test_df = final_df[final_df['customer_id'].isin(test_ids)]

    treatments = ['5% discount', '10% discount', '15% discount']
    colors = ['blue', 'green', 'purple']

    all_feature_importance = []
    uplift_models = {}  # Store models for each treatment

    # Create subplots for train and test
    fig_uplift = make_subplots(rows=2, cols=1, subplot_titles=("Train Set Performance", "Test Set Performance"),
                               vertical_spacing=0.1, row_heights=[0.5, 0.5])

    # One-hot encoding of the test set does not depend on the treatment, so do it once.
    categorical_features = [f for f in selected_features if test_df[f].dtype == 'object']
    test_dummies = pd.get_dummies(test_df[selected_features], columns=categorical_features)

    for treatment, color in zip(treatments, colors):
        model, train_uplift_scores, feature_importance_df, X_train = build_uplift_model(
            train_df, selected_features, treatment, 'Control')
        uplift_models[treatment] = model  # Store the model

        feature_importance_df['treatment'] = treatment
        all_feature_importance.append(feature_importance_df)

        # Align test columns with the training design matrix; unseen dummies become 0.
        X_test = test_dummies.reindex(columns=X_train.columns, fill_value=0)

        # NOTE(review): the scaler is fitted on the test split itself, and the 'uint8' check
        # assumes get_dummies emits uint8 columns (newer pandas emits bool) - confirm this
        # matches the preprocessing inside build_uplift_model.
        scaler = StandardScaler()
        X_test.loc[:, X_test.dtypes != 'uint8'] = scaler.fit_transform(X_test.loc[:, X_test.dtypes != 'uint8'])

        test_uplift_scores = model.predict(X_test.values)
        if test_uplift_scores.ndim == 2:
            test_uplift_scores = test_uplift_scores[:, 1] - test_uplift_scores[:, 0] if test_uplift_scores.shape[1] == 2 else test_uplift_scores.flatten()

        for i, (dataset, uplift_scores) in enumerate([(train_df, train_uplift_scores), (test_df, test_uplift_scores)]):
            thresholds = np.linspace(np.min(uplift_scores), np.max(uplift_scores), 100)
            inc_purchases, inc_profits = zip(*[calculate_incremental_metrics(dataset, uplift_scores, treatment, threshold) for threshold in thresholds])

            fig_uplift.add_trace(go.Scatter(x=inc_purchases, y=inc_profits, mode='lines', name=f'{treatment} Model', line=dict(color=color, width=2)), row=i+1, col=1)
            # Straight line from the origin to the lowest-threshold point serves as the random baseline.
            fig_uplift.add_trace(go.Scatter(x=[0, inc_purchases[0]], y=[0, inc_profits[0]], mode='lines', name=f'{treatment} Random', line=dict(color=color, width=2, dash='dash')), row=i+1, col=1)

    # Create feature importance plot
    fig_importance, ax = plt.subplots(figsize=(12, 8))
    combined_feature_importance = pd.concat(all_feature_importance)
    treatment_order = ['5% discount', '10% discount', '15% discount']
    # Features are ordered by their importance in the 5% discount model.
    feature_order = combined_feature_importance[combined_feature_importance['treatment'] == '5% discount'].sort_values('importance', ascending=False)['feature'].unique()

    sns.barplot(x='importance', y='feature', hue='treatment', data=combined_feature_importance,
                hue_order=treatment_order, order=feature_order, ax=ax)

    ax.set_title('Feature Importance for All Treatments vs Control (Train Set)')
    ax.set_xlabel('Importance')
    ax.set_ylabel('Feature')
    plt.tight_layout()

    # Improve uplift plot appearance
    fig_uplift.update_layout(
        height=1200, width=1000,
        title={'text': 'Incremental Profit vs Incremental Purchases (All Treatments)', 'y':0.98, 'x':0.5, 'xanchor': 'center', 'yanchor': 'top'},
        legend=dict(orientation='v', yanchor='bottom', y=0.02, xanchor='left', x=0.02, bgcolor='rgba(255, 255, 255, 0.5)')
    )

    for i in range(1, 3):
        fig_uplift.update_xaxes(title_text="Incremental Purchases", row=i, col=1)
        fig_uplift.update_yaxes(title_text="Incremental Profit", row=i, col=1)
        fig_uplift.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgray', row=i, col=1)
        fig_uplift.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray', row=i, col=1)

    last_used_features = selected_features  # Store the last used features

    info = f"Uplift models built using {len(selected_features)} features.\n"
    info += "Treatments: 5%, 10%, and 15% discounts vs Control\n"
    info += f"Number of samples: Train set - {len(train_df)}, Test set - {len(test_df)}\n"
    info += "Displaying results for both Train and Test sets"

    return info, fig_importance, fig_uplift
382
+
383
+
384
def run_targeting_policy(discount_level, target_percentage, experiment_duration):
    """
    Run a targeting policy experiment based on the uplift model predictions.

    Scores every generated customer with the stored uplift model for ``discount_level``,
    randomly assigns each customer to one of five equally likely variants (Control, the
    three flat discounts, and 'Targeted'), and gives the discount inside the 'Targeted'
    variant only to customers whose score exceeds the threshold derived from
    ``target_percentage``. Purchases are then simulated.

    Args:
        discount_level (str): The discount level to apply (e.g., '10% discount')
        target_percentage (float): The percentage of customers to target with the discount
        experiment_duration (int): The duration of the experiment in days

    Returns:
        tuple: (assignment DataFrame, transactions DataFrame, info message). When the
        prerequisites are missing, both DataFrames are None and the message explains why,
        so the tuple always has three elements.
    """
    global generated_data, uplift_models, last_used_features
    if generated_data is None or not uplift_models or not last_used_features:
        return None, None, "Please generate customer data and build uplift models first."

    # Prepare the data
    df = generated_data.copy()

    # Use the uplift model to make predictions
    model = uplift_models.get(discount_level)
    if model is None:
        return None, None, f"No uplift model found for {discount_level}. Please build the model first."

    # Prepare features for prediction
    # NOTE(review): columns are not re-aligned to the training design matrix here; this
    # presumably relies on the full customer base exposing the same categories - confirm.
    X = pd.get_dummies(df[last_used_features], columns=[f for f in last_used_features if df[f].dtype == 'object'])

    # Standardize numerical features (skipped when only categorical features were selected,
    # since the scaler cannot be fitted on zero columns)
    numerical_features = [f for f in last_used_features if df[f].dtype in ['int64', 'float64']]
    if numerical_features:
        scaler = StandardScaler()
        X[numerical_features] = scaler.fit_transform(X[numerical_features])

    # Get uplift scores
    uplift_scores = model.predict(X.values)
    if uplift_scores.ndim == 2:
        uplift_scores = uplift_scores[:, 1] - uplift_scores[:, 0] if uplift_scores.shape[1] == 2 else uplift_scores.flatten()

    # Calculate the threshold based on the target percentage
    model_threshold = get_threshold_for_percentage(uplift_scores, target_percentage)

    # Assign variants
    all_variants = ['Control', '5% discount', '10% discount', '15% discount', 'Targeted']
    variant_probabilities = [0.2] * 5  # Equal probability for all variants

    df['experiment_variant'] = np.random.choice(all_variants, size=len(df), p=variant_probabilities)

    # For customers in the Targeted group, determine if they get a discount
    df['gets_targeted_discount'] = (df['experiment_variant'] == 'Targeted') & (uplift_scores > model_threshold)

    # Run simulation
    transactions_df = run_targeted_simulation(df, experiment_duration, discount_level)

    return df, transactions_df, f"Ran targeting policy simulation for {experiment_duration} days with {discount_level} and targeting {target_percentage}% of the audience."
440
+
441
def run_targeted_simulation(df, experiment_duration, discount_level):
    """
    Run a targeted simulation based on the assigned variants and targeting flags.

    For every customer, a Poisson-distributed number of purchase opportunities is drawn
    (mean ``experiment_duration / 10``). Each opportunity picks a random product and turns
    into a transaction with the probability given by ``calculate_purchase_probability``
    for the customer's effective discount.

    Args:
        df (pandas.DataFrame): Customer data with 'customer_id', 'experiment_variant' and
            'gets_targeted_discount' columns
        experiment_duration (int): The duration of the experiment in days
        discount_level (str): The discount applied to targeted customers (e.g., '10% discount')

    Returns:
        pandas.DataFrame: Simulated transactions with columns customer_id, variant, product,
        price, discounted_price, cost and profit. The columns are present even when no
        transaction was generated, so downstream group-bys keep working.
    """
    columns = ['customer_id', 'variant', 'product', 'price', 'discounted_price', 'cost', 'profit']

    def discount_fraction(label):
        # '10% discount' -> 0.10
        return float(label.split('%')[0]) / 100

    transactions = []
    for _, customer in df.iterrows():
        variant = customer['experiment_variant']
        if variant == 'Targeted':
            # Only customers selected by the uplift model receive the discount.
            discount = discount_fraction(discount_level) if customer['gets_targeted_discount'] else 0
        elif '%' in variant:
            discount = discount_fraction(variant)
        else:
            discount = 0

        # Simulate purchases
        num_purchases = np.random.poisson(experiment_duration / 10)
        for _ in range(num_purchases):
            product = np.random.choice(electronics_products)
            if np.random.random() < calculate_purchase_probability(customer, discount):
                price = product['price']
                discounted_price = price * (1 - discount)
                cost = product['cost']
                transactions.append({
                    'customer_id': customer['customer_id'],
                    'variant': variant,
                    'product': product['name'],
                    'price': price,
                    'discounted_price': discounted_price,
                    'cost': cost,
                    'profit': discounted_price - cost,
                })

    return pd.DataFrame(transactions, columns=columns)
486
+
487
def analyze_targeting_results(assignment_df, transactions_df):
    """
    Analyze the results of the targeting policy experiment.

    Computes, per variant, the number of assigned and purchasing customers, revenue, profit,
    conversion rate, average revenue/profit per assigned customer, and the difference in
    purchasing customers and profit relative to the 'Control' variant. The incremental
    values are raw differences of totals and are not normalized by group size.

    Args:
        assignment_df (pandas.DataFrame): Variant assignments with 'experiment_variant' and
            'customer_id' columns
        transactions_df (pandas.DataFrame): Transactions with 'variant', 'customer_id',
            'discounted_price' and 'profit' columns; may be empty

    Returns:
        tuple: Contains a DataFrame with variant metrics and a plotly Figure object

    Raises:
        ValueError: If no customer was assigned to the 'Control' variant, since incremental
            metrics cannot be computed without a baseline.
    """
    # Calculate metrics for assigned customers
    assigned_customers = assignment_df.groupby('experiment_variant')['customer_id'].nunique().reset_index()
    assigned_customers.columns = ['variant', 'assigned_customers']

    # Calculate metrics for purchases
    if transactions_df.empty:
        # No purchases at all (possibly a column-less frame): use an empty, typed table so
        # the merge below still yields zero-filled metrics for every variant.
        purchase_metrics = pd.DataFrame({
            'variant': pd.Series(dtype='object'),
            'purchasing_customers': pd.Series(dtype='float64'),
            'revenue': pd.Series(dtype='float64'),
            'profit': pd.Series(dtype='float64'),
        })
    else:
        purchase_metrics = transactions_df.groupby('variant').agg(
            purchasing_customers=('customer_id', 'nunique'),
            revenue=('discounted_price', 'sum'),
            profit=('profit', 'sum'),
        ).reset_index()

    # Merge assigned customers with purchase metrics
    variant_metrics = pd.merge(assigned_customers, purchase_metrics, on='variant', how='left')
    variant_metrics = variant_metrics.fillna(0)  # Fill NaN values with 0 for variants with no purchases

    # Calculate additional metrics
    assigned = variant_metrics['assigned_customers']
    variant_metrics['conversion_rate'] = variant_metrics['purchasing_customers'] / assigned
    variant_metrics['avg_revenue_per_customer'] = variant_metrics['revenue'] / assigned
    variant_metrics['avg_profit_per_customer'] = variant_metrics['profit'] / assigned

    # Calculate incremental metrics compared to control
    control_rows = variant_metrics[variant_metrics['variant'] == 'Control']
    if control_rows.empty:
        raise ValueError("No customers were assigned to the 'Control' variant; cannot compute incremental metrics.")
    control_metrics = control_rows.iloc[0]
    variant_metrics['incremental_purchases'] = variant_metrics['purchasing_customers'] - control_metrics['purchasing_customers']
    variant_metrics['incremental_profit'] = variant_metrics['profit'] - control_metrics['profit']

    # Create visualization: one labelled marker per variant
    fig = go.Figure()

    colors = {'Control': 'blue', '5% discount': 'green', '10% discount': 'orange',
              '15% discount': 'red', 'Targeted': 'purple'}

    for row in variant_metrics.itertuples(index=False):
        fig.add_trace(go.Scatter(
            x=[row.incremental_purchases],
            y=[row.incremental_profit],
            mode='markers+text',
            name=row.variant,
            text=[row.variant],
            textposition="top center",
            marker=dict(size=12, color=colors.get(row.variant, 'gray'))
        ))

    fig.update_layout(
        title='Incremental Profit vs Incremental Purchases by Variant',
        xaxis_title='Incremental Purchases',
        yaxis_title='Incremental Profit',
        showlegend=True
    )

    return variant_metrics, fig
553
 
554
def get_threshold_for_percentage(uplift_scores, percentage):
    """
    Calculate the threshold that targets the specified percentage of the audience.

    The threshold is meant to be used with a strict comparison (``score > threshold``).
    The number of customers to target is ``floor(len(uplift_scores) * percentage / 100)``;
    the returned value is the highest score that is *not* in that top group, so the strict
    comparison selects exactly the top group (fewer if scores are tied at the boundary).

    Args:
        uplift_scores (numpy.array): Array of uplift scores for all customers
        percentage (float): The desired percentage of customers to target

    Returns:
        float: The uplift score threshold that targets the specified percentage of customers.
        Slightly below the minimum score when everyone is targeted (percentage >= 100) and
        slightly above the maximum score when nobody is targeted (percentage rounds down to
        zero customers).

    Raises:
        ValueError: If ``uplift_scores`` is empty.
    """
    scores = np.asarray(uplift_scores, dtype=float).ravel()
    if scores.size == 0:
        raise ValueError("uplift_scores must contain at least one score")

    n_target = int(scores.size * percentage / 100)
    if n_target >= scores.size:
        return np.min(scores) - 1e-10  # Return a value slightly lower than the minimum
    if n_target <= 0:
        # Also covers tiny percentages that round down to zero customers; a negative index
        # here would wrap around to the minimum score and target almost everyone.
        return np.max(scores) + 1e-10  # Return a value slightly higher than the maximum

    sorted_scores = np.sort(scores)[::-1]  # Sort in descending order
    # First score outside the top n_target: `score > threshold` keeps exactly the top group.
    return sorted_scores[n_target]
573
+
574
  with gr.Blocks() as demo:
575
  gr.Markdown("# Causal AI - Synthetic Customer Data Generator and RCT Simulator")
576
 
 
580
  gr.Markdown("First we generate some basic attributes that are defined when the customer first registers, such as Name, City or Preferred Language.")
581
  gr.Markdown("Then we add some extra information that is usually the result of the customer past behavior, such as Loyalty Level, Past Purchases or Categories of interest.")
582
  gr.Markdown("## Select the number of customers that you want to Generate")
583
+ num_customers_input = gr.Slider(minimum=10000, maximum=500000, value=200000, step=1000, label="Number of Customer Records")
584
  generate_btn = gr.Button("Generate Customer Data")
585
  gr.Markdown("## Basic Customer Info Sample")
586
  basic_info_output = gr.DataFrame()
 
654
  )
655
 
656
  with gr.Tab("Build Uplift Model"):
657
+ gr.Markdown("## Build Uplift Models for All Discount Levels")
658
 
659
  # Feature selection
660
  feature_checklist = gr.CheckboxGroup(
661
  choices=['age', 'gender', 'region', 'preferred_language', 'newsletter_subscription',
662
  'preferred_payment_method', 'loyalty_level', 'main_browsing_device',
663
  'average_order_value', 'total_orders'],
664
+ label="Select features for the models",
665
  value=['age', 'gender', 'loyalty_level', 'average_order_value', 'total_orders']
666
  )
667
 
668
+ build_model_btn = gr.Button("Build Uplift Models")
 
 
 
 
 
 
 
669
 
670
  model_info = gr.Textbox(label="Model Information")
671
+ feature_importance_plot = gr.Plot(label="Feature Importance for All Models (Train Set)")
672
+ uplift_plot = gr.Plot(label="Incremental Profit vs Incremental Purchases (All Models)")
673
 
674
  build_model_btn.click(
675
+ fn=build_models_and_display,
676
+ inputs=[feature_checklist],
677
+ outputs=[model_info, feature_importance_plot, uplift_plot]
678
  )
679
 
680
+ with gr.Tab("Run Targeting Policy"):
681
+ gr.Markdown("# Run Targeting Policy Experiment")
682
+ gr.Markdown("In this section, we run an experiment using a targeted policy based on the uplift models.")
683
+
684
+ discount_level = gr.Dropdown(
685
+ choices=['5% discount', '10% discount', '15% discount'],
686
+ label="Select discount level for targeting",
687
+ value='10% discount',
688
+ interactive=True
689
+ )
690
+ target_percentage = gr.Slider(minimum=0, maximum=100, value=40, step=1, label="Percentage of Audience Targeted", interactive=True)
691
+ experiment_duration = gr.Slider(minimum=10, maximum=60, value=30, step=1, label="Experiment Duration (days)", interactive=True)
692
+
693
+ run_targeting_btn = gr.Button("Run Targeting Policy Experiment")
694
+ targeting_info = gr.Textbox(label="Targeting Experiment Info")
695
+
696
+ gr.Markdown("## Experiment Results")
697
+ targeting_results = gr.DataFrame(label="Targeting Experiment Results")
698
+ targeting_plot = gr.Plot(label="Incremental Profit vs Incremental Purchases by Variant")
699
+
700
+ def run_and_analyze_targeting(discount, percentage, duration):
701
+ assignment_df, transactions_df, info = run_targeting_policy(discount, percentage, duration)
702
+ if transactions_df is None:
703
+ return info, None, None
704
+ results, plot = analyze_targeting_results(assignment_df, transactions_df)
705
+ return info, results, plot
706
+
707
+ run_targeting_btn.click(
708
+ fn=run_and_analyze_targeting,
709
+ inputs=[discount_level, target_percentage, experiment_duration],
710
+ outputs=[targeting_info, targeting_results, targeting_plot]
711
+ )
712
+
713
+ demo.launch()
data_generator.py CHANGED
@@ -5,6 +5,19 @@ from datetime import datetime, timedelta, date
5
  import random
6
 
7
  def generate_synthetic_data(num_customers=1000):
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  # Set up Faker for Ukrainian locale
9
  fake = Faker('uk_UA')
10
  Faker.seed(42)
@@ -17,9 +30,11 @@ def generate_synthetic_data(num_customers=1000):
17
 
18
  # Helper functions
19
  def generate_phone_number():
 
20
  return f"+380{random.randint(50, 99)}{fake.msisdn()[6:]}"
21
 
22
  def generate_email(name):
 
23
  username = name.lower().replace(' ', '.').replace('\'', '')
24
  domain = random.choice(['gmail.com', 'ukr.net', 'i.ua', 'meta.ua', 'yahoo.com'])
25
  return f"{username}@{domain}"
@@ -45,7 +60,7 @@ def generate_synthetic_data(num_customers=1000):
45
 
46
  # Region and City
47
  region = np.random.choice(list(REGIONS.keys()))
48
- region_info = REGIONS[region]
49
  is_urban = np.random.random() < region_info['urbanization']
50
  city = fake.city()
51
  if not is_urban:
@@ -53,7 +68,14 @@ def generate_synthetic_data(num_customers=1000):
53
 
54
  # Age (dependent on region)
55
  age = int(np.random.normal(region_info['avg_age'], 10))
56
- age = max(18, min(80, age)) # Clamp between 18 and 80
 
 
 
 
 
 
 
57
 
58
  # Gender (slight dependency on age and region)
59
  gender_prob = 0.49 + 0.02 * (age - 40) / 40 # Slight increase in male probability with age
@@ -80,12 +102,20 @@ def generate_synthetic_data(num_customers=1000):
80
  order_multiplier *= 1.1 if preferred_language == 'Ukrainian' else 0.9 # Language factor
81
  total_orders = max(1, int(base_orders * order_multiplier)) # Ensure at least 1 order for active customers
82
 
 
 
 
 
83
  base_aov = np.random.gamma(shape=5, scale=100)
84
  aov_multiplier = 1 + 0.3 * (age - 40) / 40 # Age factor
85
  aov_multiplier *= 1 + 0.2 * (region_info['urbanization'] - 0.7) / 0.3 # Urbanization factor
86
  aov_multiplier *= 1.1 if gender == 'Male' else 0.9 # Gender factor
87
  average_order_value = base_aov * aov_multiplier
88
 
 
 
 
 
89
  # Last order date
90
  last_order_date = fake.date_between(start_date=registration_date, end_date=END_DATE)
91
  else:
@@ -94,10 +124,16 @@ def generate_synthetic_data(num_customers=1000):
94
  last_order_date = None
95
 
96
  # Loyalty level based on total orders
97
- loyalty_level = min(5, max(1, int(total_orders+1 / 2)))
 
 
 
 
98
 
99
  # Newsletter subscription (dependent on age, loyalty, and tech adoption)
100
  newsletter_prob = 0.5 + 0.1 * loyalty_level / 5 - 0.2 * (age - 40) / 40 + 0.2 * region_info['tech_adoption']
 
 
101
  newsletter_subscription = np.random.random() < newsletter_prob
102
 
103
  # Preferred payment method (dependent on age and urbanization)
@@ -114,6 +150,10 @@ def generate_synthetic_data(num_customers=1000):
114
  p=payment_probs
115
  )
116
 
 
 
 
 
117
  # Main browsing device (dependent on age and tech adoption)
118
  device_probs = [
119
  0.4 + 0.3 * (age - 40) / 40 - 0.2 * region_info['tech_adoption'], # Web
@@ -122,6 +162,12 @@ def generate_synthetic_data(num_customers=1000):
122
  ]
123
  device_probs = [max(0, min(p, 1)) for p in device_probs]
124
  device_probs = [p / sum(device_probs) for p in device_probs]
 
 
 
 
 
 
125
  main_browsing_device = np.random.choice(['Web', 'Mobile', 'App'], p=device_probs)
126
 
127
  # Product categories (dependent on age, gender, and browsing device)
@@ -169,4 +215,4 @@ def generate_synthetic_data(num_customers=1000):
169
 
170
  if __name__ == "__main__":
171
  df = generate_synthetic_data()
172
- print(df.head())
 
5
  import random
6
 
7
  def generate_synthetic_data(num_customers=1000):
8
+ """
9
+ Generate synthetic customer data for e-commerce analysis.
10
+
11
+ This function creates a dataset of customers with various attributes such as
12
+ demographics, purchase history, and preferences. It uses the Faker library to
13
+ generate realistic-looking data for Ukrainian customers.
14
+
15
+ Args:
16
+ num_customers (int): The number of customer records to generate (default: 1000)
17
+
18
+ Returns:
19
+ pandas.DataFrame: A DataFrame containing the generated customer data
20
+ """
21
  # Set up Faker for Ukrainian locale
22
  fake = Faker('uk_UA')
23
  Faker.seed(42)
 
30
 
31
  # Helper functions
32
  def generate_phone_number():
33
+ """Generate a realistic Ukrainian phone number."""
34
  return f"+380{random.randint(50, 99)}{fake.msisdn()[6:]}"
35
 
36
  def generate_email(name):
37
+ """Generate an email address based on the customer's name."""
38
  username = name.lower().replace(' ', '.').replace('\'', '')
39
  domain = random.choice(['gmail.com', 'ukr.net', 'i.ua', 'meta.ua', 'yahoo.com'])
40
  return f"{username}@{domain}"
 
60
 
61
  # Region and City
62
  region = np.random.choice(list(REGIONS.keys()))
63
+ region_info = REGIONS[region].copy() # Create a copy to avoid modifying the original
64
  is_urban = np.random.random() < region_info['urbanization']
65
  city = fake.city()
66
  if not is_urban:
 
68
 
69
  # Age (dependent on region)
70
  age = int(np.random.normal(region_info['avg_age'], 10))
71
+ age_noise = np.random.normal(0, 2) # Add noise with mean 0 and std dev 2
72
+ age = max(18, min(80, int(age + age_noise)))
73
+
74
+ # Add noise to urbanization and tech adoption
75
+ urbanization_noise = np.random.normal(0, 0.05)
76
+ tech_adoption_noise = np.random.normal(0, 0.05)
77
+ region_info['urbanization'] = max(0, min(1, region_info['urbanization'] + urbanization_noise))
78
+ region_info['tech_adoption'] = max(0, min(1, region_info['tech_adoption'] + tech_adoption_noise))
79
 
80
  # Gender (slight dependency on age and region)
81
  gender_prob = 0.49 + 0.02 * (age - 40) / 40 # Slight increase in male probability with age
 
102
  order_multiplier *= 1.1 if preferred_language == 'Ukrainian' else 0.9 # Language factor
103
  total_orders = max(1, int(base_orders * order_multiplier)) # Ensure at least 1 order for active customers
104
 
105
+ # Add noise to total orders
106
+ total_orders_noise = np.random.poisson(2)
107
+ total_orders = max(1, total_orders + total_orders_noise)
108
+
109
  base_aov = np.random.gamma(shape=5, scale=100)
110
  aov_multiplier = 1 + 0.3 * (age - 40) / 40 # Age factor
111
  aov_multiplier *= 1 + 0.2 * (region_info['urbanization'] - 0.7) / 0.3 # Urbanization factor
112
  aov_multiplier *= 1.1 if gender == 'Male' else 0.9 # Gender factor
113
  average_order_value = base_aov * aov_multiplier
114
 
115
+ # Add noise to average order value
116
+ aov_noise = np.random.normal(0, average_order_value * 0.1) # 10% noise
117
+ average_order_value = max(0, average_order_value + aov_noise)
118
+
119
  # Last order date
120
  last_order_date = fake.date_between(start_date=registration_date, end_date=END_DATE)
121
  else:
 
124
  last_order_date = None
125
 
126
  # Loyalty level based on total orders
127
+ loyalty_level = min(5, max(1, int(total_orders / 2)))
128
+
129
+ # Add some randomness to loyalty level
130
+ loyalty_noise = np.random.randint(-1, 2) # -1, 0, or 1
131
+ loyalty_level = max(1, min(5, loyalty_level + loyalty_noise))
132
 
133
  # Newsletter subscription (dependent on age, loyalty, and tech adoption)
134
  newsletter_prob = 0.5 + 0.1 * loyalty_level / 5 - 0.2 * (age - 40) / 40 + 0.2 * region_info['tech_adoption']
135
+ newsletter_noise = np.random.normal(0, 0.1)
136
+ newsletter_prob = max(0, min(1, newsletter_prob + newsletter_noise))
137
  newsletter_subscription = np.random.random() < newsletter_prob
138
 
139
  # Preferred payment method (dependent on age and urbanization)
 
150
  p=payment_probs
151
  )
152
 
153
+ # Add some inconsistency to preferred payment method
154
+ if np.random.random() < 0.1: # 10% chance of inconsistent preference
155
+ preferred_payment_method = np.random.choice(['Credit Card', 'Cash on Delivery', 'Bank Transfer', 'PayPal'])
156
+
157
  # Main browsing device (dependent on age and tech adoption)
158
  device_probs = [
159
  0.4 + 0.3 * (age - 40) / 40 - 0.2 * region_info['tech_adoption'], # Web
 
162
  ]
163
  device_probs = [max(0, min(p, 1)) for p in device_probs]
164
  device_probs = [p / sum(device_probs) for p in device_probs]
165
+
166
+ # Add noise to main browsing device probabilities
167
+ device_noise = np.random.normal(0, 0.05, size=3)
168
+ device_probs = [max(0, min(1, p + n)) for p, n in zip(device_probs, device_noise)]
169
+ device_probs = [p / sum(device_probs) for p in device_probs]
170
+
171
  main_browsing_device = np.random.choice(['Web', 'Mobile', 'App'], p=device_probs)
172
 
173
  # Product categories (dependent on age, gender, and browsing device)
 
215
 
216
  if __name__ == "__main__":
217
  df = generate_synthetic_data()
218
+ print(df.head())
rct_analyzer.py CHANGED
@@ -2,6 +2,18 @@ import pandas as pd
2
  import matplotlib.pyplot as plt
3
 
4
  def calculate_metrics(df):
 
 
 
 
 
 
 
 
 
 
 
 
5
  total_customers = len(df['customer_id'].unique())
6
  total_purchases = len(df)
7
  total_revenue = df['discounted_price'].sum()
@@ -19,6 +31,19 @@ def calculate_metrics(df):
19
  })
20
 
21
  def analyze_rct_results(transactions_df, variant_assignments_df):
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  overall_metrics = calculate_metrics(transactions_df)
23
  variant_metrics = transactions_df.groupby('variant').apply(calculate_metrics).reset_index()
24
 
@@ -36,34 +61,20 @@ def analyze_rct_results(transactions_df, variant_assignments_df):
36
  variant_metrics['variant'] = pd.Categorical(variant_metrics['variant'], categories=variant_order, ordered=True)
37
  variant_metrics = variant_metrics.sort_values('variant')
38
 
39
- # # Create plots
40
- # fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 12))
41
 
42
- # # Create plots
43
- fig, ax2 = plt.subplots(1, 1, figsize=(10, 6))
44
- #
45
  # Incremental Total Profit vs Incremental Total Purchases
46
  non_control = variant_metrics[variant_metrics['variant'] != 'Control']
47
- # ax1.scatter(non_control['Incremental Purchases'], non_control['Incremental Profit'])
48
- # for _, row in non_control.iterrows():
49
- # ax1.annotate(row['variant'], (row['Incremental Purchases'], row['Incremental Profit']))
50
- # ax1.set_xlabel('Incremental Total Purchases')
51
- # ax1.set_ylabel('Incremental Total Profit')
52
- # ax1.set_title('Incremental Total Profit vs Incremental Total Purchases')
53
- # ax1.axhline(y=0, color='r', linestyle='--')
54
- # ax1.axvline(x=0, color='r', linestyle='--')
55
- # ax1.grid(True, linestyle=':', alpha=0.7)
56
-
57
- # # Incremental Total Profit per Incremental Purchases vs Incremental Total Purchases
58
- ax2.scatter(non_control['Incremental Purchases'], non_control['Profit per Incremental Purchase'])
59
  for _, row in non_control.iterrows():
60
- ax2.annotate(row['variant'], (row['Incremental Purchases'], row['Profit per Incremental Purchase']))
61
- ax2.set_xlabel('Incremental Total Purchases')
62
- ax2.set_ylabel('Incremental Total Profit per Incremental Purchase')
63
- ax2.set_title('Incremental Profit per Purchase vs Incremental Purchases')
64
- ax2.axhline(y=0, color='r', linestyle='--')
65
- ax2.axvline(x=0, color='r', linestyle='--')
66
- ax2.grid(True, linestyle=':', alpha=0.7)
67
-
68
  plt.tight_layout()
69
- return overall_df, variant_metrics, fig
 
2
  import matplotlib.pyplot as plt
3
 
4
  def calculate_metrics(df):
5
+ """
6
+ Calculate key metrics from the RCT results.
7
+
8
+ This function computes various metrics such as total customers, purchases,
9
+ revenue, profit, conversion rate, and average order value.
10
+
11
+ Args:
12
+ df (pandas.DataFrame): The DataFrame containing RCT results
13
+
14
+ Returns:
15
+ pandas.Series: A series containing calculated metrics
16
+ """
17
  total_customers = len(df['customer_id'].unique())
18
  total_purchases = len(df)
19
  total_revenue = df['discounted_price'].sum()
 
31
  })
32
 
33
  def analyze_rct_results(transactions_df, variant_assignments_df):
34
+ """
35
+ Analyze the results of the Randomized Control Trial (RCT).
36
+
37
+ This function calculates overall metrics, metrics per variant, and creates
38
+ visualizations to compare the performance of different discount levels.
39
+
40
+ Args:
41
+ transactions_df (pandas.DataFrame): DataFrame containing transaction data
42
+ variant_assignments_df (pandas.DataFrame): DataFrame containing variant assignments
43
+
44
+ Returns:
45
+ tuple: Contains overall metrics DataFrame, variant metrics DataFrame, and a matplotlib Figure
46
+ """
47
  overall_metrics = calculate_metrics(transactions_df)
48
  variant_metrics = transactions_df.groupby('variant').apply(calculate_metrics).reset_index()
49
 
 
61
  variant_metrics['variant'] = pd.Categorical(variant_metrics['variant'], categories=variant_order, ordered=True)
62
  variant_metrics = variant_metrics.sort_values('variant')
63
 
64
+ # Create plots
65
+ fig, ax1 = plt.subplots(1, 1, figsize=(10, 6))
66
 
 
 
 
67
  # Incremental Total Profit vs Incremental Total Purchases
68
  non_control = variant_metrics[variant_metrics['variant'] != 'Control']
69
+ ax1.scatter(non_control['Incremental Purchases'], non_control['Incremental Profit'])
 
 
 
 
 
 
 
 
 
 
 
70
  for _, row in non_control.iterrows():
71
+ ax1.annotate(row['variant'], (row['Incremental Purchases'], row['Incremental Profit']))
72
+ ax1.set_xlabel('Incremental Total Purchases')
73
+ ax1.set_ylabel('Incremental Total Profit')
74
+ ax1.set_title('Incremental Total Profit vs Incremental Total Purchases')
75
+ ax1.axhline(y=0, color='r', linestyle='--')
76
+ ax1.axvline(x=0, color='r', linestyle='--')
77
+ ax1.grid(True, linestyle=':', alpha=0.7)
78
+
79
  plt.tight_layout()
80
+ return overall_df, variant_metrics, fig
rct_simulator.py CHANGED
@@ -31,8 +31,21 @@ electronics_products = [
31
  variants = ['Control', '5% discount', '10% discount', '15% discount']
32
  discount_rates = [0, 0.05, 0.10, 0.15]
33
 
34
- # Function to calculate purchase probability with increased feature dependency
35
  def calculate_purchase_probability(customer, discount, base_prob=0.1):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  prob = base_prob
37
 
38
  # Age factor (younger customers more sensitive to discounts)
@@ -74,33 +87,70 @@ def calculate_purchase_probability(customer, discount, base_prob=0.1):
74
  # Adjust probability based on discount with increased sensitivity
75
  discount_sensitivity = 1 + age_factor - loyalty_factor + (0.5 if customer['newsletter_subscription'] else 0)
76
  if discount == 0.05:
77
- prob *= (1 + discount * 3.5 * discount_sensitivity)
78
  elif discount == 0.1:
79
- prob *= (1 + discount * 4.5 * discount_sensitivity)
80
  elif discount == 0.15:
81
- prob *= (1 + discount * 4.3 * discount_sensitivity)
82
 
83
- return min(max(prob, 0), 1) # Ensure probability is between 0 and 1
 
 
 
 
84
 
85
- # Function to simulate purchases
86
  def simulate_purchase(customer, variant_index, product):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  discount = discount_rates[variant_index]
88
  prob = calculate_purchase_probability(customer, discount)
89
 
90
  if np.random.random() < prob:
91
- discounted_price = product['price'] * (1 - discount)
 
 
 
 
 
 
 
92
  return {
93
  'customer_id': customer['customer_id'],
94
  'variant': variants[variant_index],
95
  'product': product['name'],
96
  'price': product['price'],
97
  'discounted_price': discounted_price,
98
- 'cost': product['cost'],
99
- 'profit': discounted_price - product['cost']
100
  }
101
  return None
102
 
103
  def run_rct_simulation(df, experiment_duration=30):
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  # Set random seed for reproducibility
105
  np.random.seed(42)
106
  random.seed(42)
@@ -110,10 +160,14 @@ def run_rct_simulation(df, experiment_duration=30):
110
  end_date = start_date + timedelta(days=experiment_duration)
111
 
112
  results = []
113
- variant_assignments = [] # New list to store variant assignments
114
 
115
  for _, customer in df.iterrows():
116
- variant_index = np.random.randint(0, 4) # Randomly assign variant
 
 
 
 
117
 
118
  # Record variant assignment for all eligible customers
119
  variant_assignments.append({
@@ -121,8 +175,9 @@ def run_rct_simulation(df, experiment_duration=30):
121
  'variant': variants[variant_index]
122
  })
123
 
124
- # Simulate multiple purchase opportunities
125
- for _ in range(round(experiment_duration/10)):
 
126
  product = random.choice(electronics_products)
127
  purchase = simulate_purchase(customer, variant_index, product)
128
  if purchase:
@@ -135,4 +190,4 @@ def run_rct_simulation(df, experiment_duration=30):
135
  # Create DataFrame from variant assignments
136
  variant_assignments_df = pd.DataFrame(variant_assignments)
137
 
138
- return transactions_df, variant_assignments_df
 
31
  variants = ['Control', '5% discount', '10% discount', '15% discount']
32
  discount_rates = [0, 0.05, 0.10, 0.15]
33
 
 
34
  def calculate_purchase_probability(customer, discount, base_prob=0.1):
35
+ """
36
+ Calculate the probability of a customer making a purchase based on various factors.
37
+
38
+ This function considers customer attributes such as age, loyalty, past behavior,
39
+ and the applied discount to determine the likelihood of a purchase.
40
+
41
+ Args:
42
+ customer (dict): A dictionary containing customer attributes
43
+ discount (float): The discount rate applied (e.g., 0.05 for 5% discount)
44
+ base_prob (float): The base probability of purchase (default: 0.1)
45
+
46
+ Returns:
47
+ float: The calculated probability of purchase
48
+ """
49
  prob = base_prob
50
 
51
  # Age factor (younger customers more sensitive to discounts)
 
87
  # Adjust probability based on discount with increased sensitivity
88
  discount_sensitivity = 1 + age_factor - loyalty_factor + (0.5 if customer['newsletter_subscription'] else 0)
89
  if discount == 0.05:
90
+ prob *= (1 + discount * 3.5 * discount_sensitivity)
91
  elif discount == 0.1:
92
+ prob *= (1 + discount * 4.5 * discount_sensitivity)
93
  elif discount == 0.15:
94
+ prob *= (1 + discount * 4.3 * discount_sensitivity)
95
 
96
+ # Add random noise to the probability
97
+ noise = np.random.normal(0, 0.02) # Add noise with mean 0 and std dev 0.02
98
+ prob = max(0, min(1, prob + noise))
99
+
100
+ return prob
101
 
 
102
def simulate_purchase(customer, variant_index, product):
    """
    Simulate a single purchase opportunity for one customer.

    The discount for the customer's variant is looked up, a purchase
    probability is computed for it, and a uniform random draw decides whether
    the purchase happens. When it does, noise is added to the discounted
    price and to the product cost, and both are floored at zero.

    Args:
        customer: Customer record; must provide 'customer_id' and whatever
            attributes calculate_purchase_probability reads
        variant_index (int): Index into the module-level variants / discount_rates lists
        product (dict): Product information with 'name', 'price' and 'cost'

    Returns:
        dict or None: Purchase details (customer_id, variant, product, price,
        discounted_price, cost, profit) if a purchase is made, None otherwise
    """
    rate = discount_rates[variant_index]
    purchase_probability = calculate_purchase_probability(customer, rate)

    # Guard clause: the random draw did not fall below the purchase probability.
    if not (np.random.random() < purchase_probability):
        return None

    list_price = product['price']
    unit_cost = product['cost']

    # Price noise is drawn before cost noise; keep this order so the random
    # stream is consumed the same way under a fixed seed.
    noisy_price = list_price * (1 - rate) + np.random.normal(0, list_price * 0.05)  # 5% noise
    final_price = max(0, noisy_price)

    noisy_cost = unit_cost + np.random.normal(0, unit_cost * 0.03)  # 3% noise
    final_cost = max(0, noisy_cost)

    return {
        'customer_id': customer['customer_id'],
        'variant': variants[variant_index],
        'product': product['name'],
        'price': list_price,
        'discounted_price': final_price,
        'cost': final_cost,
        'profit': final_price - final_cost,
    }
139
 
140
  def run_rct_simulation(df, experiment_duration=30):
141
+ """
142
+ Run a Randomized Control Trial (RCT) simulation.
143
+
144
+ This function simulates an RCT by assigning customers to different variants
145
+ and simulating purchases over the experiment duration.
146
+
147
+ Args:
148
+ df (pandas.DataFrame): The customer data
149
+ experiment_duration (int): The duration of the experiment in days (default: 30)
150
+
151
+ Returns:
152
+ tuple: Contains two DataFrames - transactions and variant assignments
153
+ """
154
  # Set random seed for reproducibility
155
  np.random.seed(42)
156
  random.seed(42)
 
160
  end_date = start_date + timedelta(days=experiment_duration)
161
 
162
  results = []
163
+ variant_assignments = []
164
 
165
  for _, customer in df.iterrows():
166
+ # Add some randomness to variant assignment
167
+ if np.random.random() < 0.05: # 5% chance of random assignment
168
+ variant_index = np.random.randint(0, 4)
169
+ else:
170
+ variant_index = np.random.randint(0, 4) # Original random assignment
171
 
172
  # Record variant assignment for all eligible customers
173
  variant_assignments.append({
 
175
  'variant': variants[variant_index]
176
  })
177
 
178
+ # Simulate multiple purchase opportunities with varying frequency
179
+ num_opportunities = np.random.poisson(experiment_duration / 10)
180
+ for _ in range(num_opportunities):
181
  product = random.choice(electronics_products)
182
  purchase = simulate_purchase(customer, variant_index, product)
183
  if purchase:
 
190
  # Create DataFrame from variant assignments
191
  variant_assignments_df = pd.DataFrame(variant_assignments)
192
 
193
+ return transactions_df, variant_assignments_df