# NOTE: This listing appears to have been captured from a Hugging Face Spaces
# file viewer (Space status: Sleeping; file size: 16,772 bytes). The viewer's
# commit-hash and line-number gutters that were captured here are not part of
# the program.
# main.py
# ----------------------------------------------------------------------------
# Import necessary libraries
# ----------------------------------------------------------------------------
# pip install gradio numpy pandas matplotlib scipy transformers torch sentencepiece
# ----------------------------------------------------------------------------
import gradio as gr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm
from transformers import pipeline
import warnings
import os
# Suppress warnings for a cleaner output
warnings.filterwarnings("ignore")
# Set Matplotlib backend to a non-interactive one to avoid display issues in some environments
plt.switch_backend('Agg')
# ----------------------------------------------------------------------------
# Global Variables and Initial Setup
# ----------------------------------------------------------------------------
# Load the Hugging Face text2text pipeline used for the plain-language
# explanations. Loading is best-effort: on any failure the generator is set to
# None and the rest of the app keeps working with explanations disabled.
try:
    explanation_generator = pipeline(
        'text2text-generation',
        model='google/flan-t5-small',
    )
except Exception as load_error:
    explanation_generator = None
    print(f"Could not load Hugging Face model. Explanations will be disabled. Error: {load_error}")
else:
    print("Hugging Face model loaded successfully.")
# Demonstration dataset: uncertain costs (in thousands of $) of the individual
# tasks in a project. It is written to disk as a single-column CSV so the
# "Project Cost Estimation" example can be loaded like an uploaded file.
SAMPLE_CSV_PATH = 'sample_project_costs.csv'
sample_project_costs = pd.Series(
    [12, 15, 10, 13, 18, 9, 22, 14, 16, 11, 17, 20],
    name='task_cost_thousands',
).to_frame()
sample_project_costs.to_csv(SAMPLE_CSV_PATH, index=False)
# ----------------------------------------------------------------------------
# Core Logic Functions
# ----------------------------------------------------------------------------
def create_error_plot(message):
    """Build a blank Matplotlib figure that displays an error message.

    Args:
        message (str): Text drawn in red, centered on the figure.

    Returns:
        matplotlib.figure.Figure: A figure without axis ticks that contains
        only the message.
    """
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.text(0.5, 0.5, message, ha='center', va='center', wrap=True, color='red', fontsize=12)
    ax.set_xticks([])
    ax.set_yticks([])
    fig.tight_layout()
    # pyplot keeps every figure it creates alive until it is closed. Closing
    # here only drops pyplot's reference; the returned Figure object can still
    # be rendered by the caller, and repeated calls no longer accumulate
    # figures in memory.
    plt.close(fig)
    return fig
def process_input_data(file_obj, example_choice, manual_mean, manual_std):
    """
    Load, validate and summarise the user's input for the simulation.

    Input sources are tried in priority order:
    File Upload > Example Dataset > Manual Entry.
    File and example data must be a single column of numbers with at least
    two non-missing values and a non-zero spread.

    Args:
        file_obj: The uploaded file from gr.File (an object exposing its path
            as ``.name``, or the path itself), or None.
        example_choice (str): The name of the chosen example dataset, or None.
        manual_mean (float): Manually entered mean, or None.
        manual_std (float): Manually entered standard deviation, or None.

    Returns:
        tuple: A tuple containing:
            - A pandas DataFrame with the processed data (one numeric column
              for file/example input, or one row with ``mean`` and ``std``
              columns for manual input); None on failure.
            - A Matplotlib figure showing the data distribution, or an error
              plot on failure.
            - A string with summary statistics; None on failure.
            - A string with a validation message.
    """
    def _failure(plot_message, status_message=None):
        # Every failure returns None as data so that a later simulation run
        # cannot silently reuse input that did not validate.
        if status_message is None:
            status_message = plot_message
        return None, create_error_plot(plot_message), None, status_message

    data = None
    source_info = ""

    # 1. Prioritize input source
    if file_obj is not None:
        # Accept either an object that exposes the path as `.name` or a bare
        # path, so the function does not depend on how gr.File delivers it.
        csv_path = getattr(file_obj, "name", file_obj)
        try:
            data = pd.read_csv(csv_path)
            source_info = f"from uploaded file: {os.path.basename(csv_path)}"
        except Exception as e:
            return _failure(
                f"Error reading file: {e}",
                f"Error reading file: {e}. Please ensure it's a valid CSV.",
            )
    elif example_choice == "Project Cost Estimation":
        data = pd.read_csv(SAMPLE_CSV_PATH)
        source_info = "from the 'Project Cost Estimation' example"
    elif manual_mean is not None and manual_std is not None:
        # `not (x > 0)` instead of `x <= 0` so that NaN is rejected as well.
        if not manual_std > 0:
            return _failure(
                "Standard Deviation must be positive.",
                "Manual Input Error: Standard Deviation must be positive.",
            )
        stats_text = (f"Source: Manual Input\n"
                      f"Mean: {manual_mean:.2f}\n"
                      f"Standard Deviation: {manual_std:.2f}")
        fig, ax = plt.subplots()
        ax.text(0.5, 0.5, 'Manual input:\nNo data to plot.\nSimulation will use\nthe provided Mean/Std.',
                ha='center', va='center', fontsize=12)
        ax.set_xticks([])
        ax.set_yticks([])
        fig.tight_layout()
        plt.close(fig)  # drop pyplot's reference; the Figure stays renderable
        # The one-row mean/std frame is how the simulation step recognises
        # manual parameters.
        manual_df = pd.DataFrame({'mean': [manual_mean], 'std': [manual_std]})
        return manual_df, fig, stats_text, "Manual parameters accepted. Ready to run simulation."

    if data is None:
        return _failure(
            "No data source provided.",
            "No data source provided. Please upload a file, choose an example, or enter parameters.",
        )

    # 2. Validate data structure
    if data.shape[1] != 1 or not pd.api.types.is_numeric_dtype(data.iloc[:, 0]):
        error_msg = (f"Data Error: The data {source_info} is not compatible. "
                     "The app requires a CSV with a single column of numerical data. "
                     f"Detected {data.shape[1]} columns.")
        return _failure(error_msg)

    # 3. Process valid data
    series = data.iloc[:, 0].dropna()
    if len(series) < 2:
        # With fewer than two values the sample standard deviation is NaN,
        # which would slip past the zero check below and poison the plots.
        error_msg = ("Data Error: At least two numeric values are required "
                     "to estimate a standard deviation.")
        return _failure(error_msg)
    mean = series.mean()
    std = series.std()
    if std == 0:
        error_msg = "Data Error: All values are the same. Standard deviation is zero, cannot simulate uncertainty."
        return _failure(error_msg)

    # 4. Generate visualization and stats
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.hist(series, bins='auto', density=True, alpha=0.7, label='Input Data Distribution')
    # Read the limits from this Axes rather than from pyplot's implicit
    # "current axes" state.
    xmin, xmax = ax.get_xlim()
    x = np.linspace(xmin, xmax, 100)
    p = norm.pdf(x, mean, std)
    ax.plot(x, p, 'k', linewidth=2, label='Fitted Normal Curve')
    ax.set_title("Distribution of Input Data")
    ax.set_xlabel(series.name)
    ax.set_ylabel("Density")
    ax.legend()
    ax.grid(True, linestyle='--', alpha=0.6)
    fig.tight_layout()
    plt.close(fig)  # drop pyplot's reference; the Figure stays renderable

    stats_text = (f"Source: {source_info}\n"
                  f"Number of Data Points: {len(series)}\n"
                  f"Mean: {mean:.2f}\n"
                  f"Standard Deviation: {std:.2f}\n"
                  f"Min: {series.min():.2f}\n"
                  f"Max: {series.max():.2f}")
    validation_message = "Data loaded and validated successfully! Ready to run the simulation."
    return data, fig, stats_text, validation_message
def run_monte_carlo_simulation(data, num_simulations, target_value):
    """
    Run a normal-distribution Monte Carlo simulation and summarise the outcome.

    Args:
        data (pandas.DataFrame): Either a single row with ``mean`` and ``std``
            columns (manual parameters) or a frame whose first column holds
            the sample the mean and standard deviation are estimated from.
            None means no input has been prepared.
        num_simulations: Number of random draws; coerced to int.
        target_value (float): Optional threshold. When given, the share of
            simulated outcomes less than or equal to it is reported.

    Returns:
        tuple: (histogram figure, CDF figure, summary text). On invalid input
        both figures are the same error plot and the text starts with
        "Simulation failed".
    """
    def _failure(message):
        # The summary text must keep the "Simulation failed" marker: the
        # explanation step looks for it to detect unsuccessful runs.
        error_plot = create_error_plot(message)
        return error_plot, error_plot, "Simulation failed. See plot for details."

    if data is None or data.shape[1] == 0:
        return _failure("ERROR: No valid data available.\n"
                        "Please go to Step 1 & 2 and click 'Prepare Simulation' first.")

    try:
        num_simulations = int(num_simulations)
    except (TypeError, ValueError):
        num_simulations = 0
    if num_simulations < 1:
        return _failure("ERROR: The number of simulations must be a positive whole number.")

    # Manual parameters arrive as a one-row frame with 'mean' and 'std'
    # columns; anything else is treated as a sample in the first column.
    if 'mean' in data.columns and 'std' in data.columns and data.shape[0] == 1:
        mean = data['mean'].iloc[0]
        std = data['std'].iloc[0]
        data_name = "Value"
    else:
        series = data.iloc[:, 0]
        mean = series.mean()
        std = series.std()
        data_name = series.name

    # A NaN or non-positive spread (e.g. a one-value sample) would either
    # raise inside NumPy/Matplotlib or yield plots full of NaN.
    if not (np.isfinite(mean) and np.isfinite(std) and std > 0):
        return _failure("ERROR: The input does not define a valid distribution.\n"
                        "A finite mean and a positive standard deviation are required.")

    simulation_results = np.random.normal(mean, std, num_simulations)
    sim_mean = np.mean(simulation_results)
    p5, p50, p95 = np.percentile(simulation_results, [5, 50, 95])

    # --- Histogram of simulated outcomes ---
    fig_hist, ax_hist = plt.subplots(figsize=(8, 5))
    ax_hist.hist(simulation_results, bins=50, density=True, alpha=0.8, color='skyblue', edgecolor='black')
    ax_hist.axvline(sim_mean, color='red', linestyle='--', linewidth=2, label=f'Mean: {sim_mean:.2f}')
    ax_hist.axvline(p5, color='green', linestyle=':', linewidth=2, label=f'5th Percentile (P5): {p5:.2f}')
    ax_hist.axvline(p95, color='green', linestyle=':', linewidth=2, label=f'95th Percentile (P95): {p95:.2f}')
    ax_hist.set_title(f'Monte Carlo Simulation Results ({num_simulations:,} Iterations)', fontsize=14)
    ax_hist.set_xlabel(f'Simulated {data_name}')
    ax_hist.set_ylabel('Probability Density')
    ax_hist.legend()
    ax_hist.grid(True, linestyle='--', alpha=0.6)
    fig_hist.tight_layout()

    # --- Empirical cumulative distribution ---
    fig_cdf, ax_cdf = plt.subplots(figsize=(8, 5))
    sorted_results = np.sort(simulation_results)
    # Evenly spaced cumulative probabilities from 0 to 1; linspace avoids the
    # division by zero that `arange(n) / (n - 1)` hits for a single draw.
    yvals = np.linspace(0.0, 1.0, len(sorted_results))
    ax_cdf.plot(sorted_results, yvals, label='CDF')
    ax_cdf.plot(p5, 0.05, 'go', ms=8, label=f'P5: {p5:.2f}')
    ax_cdf.plot(p50, 0.50, 'ro', ms=8, label=f'Median (P50): {p50:.2f}')
    ax_cdf.plot(p95, 0.95, 'go', ms=8, label=f'P95: {p95:.2f}')
    ax_cdf.set_title('Cumulative Distribution Function (CDF)', fontsize=14)
    ax_cdf.set_xlabel(f'Simulated {data_name}')
    ax_cdf.set_ylabel('Cumulative Probability')
    ax_cdf.grid(True, linestyle='--', alpha=0.6)
    ax_cdf.legend()
    fig_cdf.tight_layout()

    # Drop pyplot's references so repeated runs do not accumulate figures;
    # the returned Figure objects can still be rendered.
    plt.close(fig_hist)
    plt.close(fig_cdf)

    results_summary = (
        f"Simulation Summary ({num_simulations:,} iterations):\n"
        f"--------------------------------------------------\n"
        f"Mean (Average Outcome): {sim_mean:.2f}\n"
        f"Standard Deviation: {np.std(simulation_results):.2f}\n\n"
        f"Percentiles (Confidence Range):\n"
        f" - 5th Percentile (P5): {p5:.2f}\n"
        f" - 50th Percentile (Median): {p50:.2f}\n"
        f" - 95th Percentile (P95): {p95:.2f}\n"
        f"This means there is a 90% probability the outcome will be between {p5:.2f} and {p95:.2f}.\n\n"
    )
    if target_value is not None:
        # Share of simulated outcomes at or below the target, in percent.
        prob_achieved = np.mean(simulation_results <= target_value) * 100
        results_summary += (
            f"Probability Analysis:\n"
            f" - Probability of outcome being less than or equal to {target_value:.2f}: {prob_achieved:.2f}%\n"
        )
    return fig_hist, fig_cdf, results_summary
def generate_explanation(results_summary):
    """
    Ask the Hugging Face model for a plain-language reading of the results.

    Args:
        results_summary (str): The summary text produced by the simulation.

    Returns:
        str: The generated explanation, or a message describing why no
        explanation could be produced.
    """
    if explanation_generator is None:
        return "LLM model not loaded. Cannot generate explanation."

    # An empty summary, or one carrying a failure marker, means there is no
    # successful simulation to explain.
    no_results_message = "Could not generate explanation. Please run a successful simulation first."
    if not results_summary:
        return no_results_message
    failure_markers = ("Please process valid data", "Simulation failed")
    if any(marker in results_summary for marker in failure_markers):
        return no_results_message

    # The empty first and last entries reproduce the prompt's leading and
    # trailing newline.
    prompt = "\n".join((
        "",
        "Explain the following Monte Carlo simulation results to a non-technical manager.",
        "Focus on what the numbers mean in terms of risk and decision-making. Be concise and clear.",
        "Results:",
        f"{results_summary}",
        "Explanation:",
        "",
    ))

    try:
        generated = explanation_generator(prompt, max_length=200, num_beams=3, no_repeat_ngram_size=2)
        first_candidate = generated[0]
        return first_candidate['generated_text']
    except Exception as e:
        return f"Error generating explanation: {e}"
# ----------------------------------------------------------------------------
# Gradio UI Layout
# ----------------------------------------------------------------------------
# Build the whole interface inside one Blocks context: an intro text, three
# rows (data input/preparation, simulation controls/results, explanation) and
# the button callbacks that connect them.
# NOTE(review): the container nesting below is reconstructed from the section
# comments and widget grouping; confirm it against the rendered layout.
with gr.Blocks(theme=gr.themes.Soft(), title="Monte Carlo Simulation Explorer") as app:
    # Introductory text shown at the top of the page.
    gr.Markdown(
        """
# Welcome to the Monte Carlo Simulation Explorer!
This tool helps you understand and perform a Monte Carlo simulation, a powerful technique for modeling uncertainty.
**How it works:** Instead of guessing a single outcome, you provide a range of possible inputs (or a distribution). The simulation then runs thousands of trials with random values from that input, creating a probability distribution of all possible outcomes.
**Get started:**
1. **Provide Data:** Use one of the methods in the "Data Collection" box below.
2. **Prepare Simulation:** Click the "Prepare Simulation" button to validate and visualize your input.
3. **Run Simulation:** Adjust the settings and click "Run Simulation".
4. **Interpret:** Analyze the resulting plots and get an AI-powered explanation.
"""
    )
    # --- Row 1: Data Input and Preparation ---
    with gr.Row():
        # Left column: the three alternative input sources, one per tab.
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown("### 1. Data Collection")
                gr.Markdown("Choose **one** method below.")
                with gr.Tabs():
                    with gr.TabItem("Upload File"):
                        file_input = gr.File(label="Upload a Single-Column CSV File", file_types=[".csv"])
                    with gr.TabItem("Use Example"):
                        example_input = gr.Dropdown(
                            ["Project Cost Estimation"], label="Select an Example Dataset"
                        )
                    with gr.TabItem("Manual Input"):
                        gr.Markdown("Define a normal distribution manually.")
                        manual_mean_input = gr.Number(label="Mean (Average)")
                        manual_std_input = gr.Number(label="Standard Deviation (Spread)")
                prepare_button = gr.Button("Prepare Simulation", variant="secondary")
        # Right column: read-only feedback produced by process_input_data.
        with gr.Column(scale=2):
            with gr.Group():
                gr.Markdown("### 2. Preparation & Visualization")
                validation_output = gr.Textbox(label="Validation Status", interactive=False, lines=3)
                input_stats_output = gr.Textbox(label="Input Data Statistics", interactive=False, lines=6)
                input_plot_output = gr.Plot(label="Input Data Distribution")
    # --- Row 2: Simulation Controls and Results ---
    with gr.Row():
        with gr.Group():
            gr.Markdown("### 3. Simulation Run & Results")
            with gr.Row():
                # Narrow column with the run settings.
                with gr.Column(scale=1, min_width=250):
                    gr.Markdown("**Simulation Settings**")
                    num_simulations_input = gr.Slider(
                        minimum=1000, maximum=50000, value=10000, step=1000,
                        label="Number of Simulations"
                    )
                    target_value_input = gr.Number(
                        label="Target Value (Optional)",
                        info="Calculate the probability of the result being <= this value."
                    )
                    run_button = gr.Button("Run Simulation", variant="primary")
                # Wide column with one tab per simulation output.
                with gr.Column(scale=3):
                    with gr.Tabs():
                        with gr.TabItem("Results Histogram"):
                            results_plot_output = gr.Plot(label="Simulation Outcome Distribution")
                        with gr.TabItem("Cumulative Probability (CDF)"):
                            cdf_plot_output = gr.Plot(label="Cumulative Distribution Function")
                        with gr.TabItem("Numerical Summary"):
                            results_summary_output = gr.Textbox(label="Detailed Results", interactive=False, lines=12)
    # --- Row 3: AI-Powered Explanation ---
    with gr.Row():
        with gr.Group():
            gr.Markdown("### 4. AI-Powered Explanation")
            explain_button = gr.Button("Explain the Takeaways", variant="secondary")
            explanation_output = gr.Textbox(
                label="Key Takeaways from the LLM",
                interactive=False,
                lines=5,
                placeholder="Click the button above to generate an explanation of the results..."
            )
    # ----------------------------------------------------------------------------
    # Define UI Component Interactions
    # ----------------------------------------------------------------------------
    # Holds the data returned by process_input_data so that the simulation
    # step can read it when "Run Simulation" is clicked.
    processed_data_state = gr.State()
    # "Prepare Simulation": validate the chosen input, store it in the state
    # and show the input plot, statistics and validation message.
    prepare_button.click(
        fn=process_input_data,
        inputs=[file_input, example_input, manual_mean_input, manual_std_input],
        outputs=[processed_data_state, input_plot_output, input_stats_output, validation_output]
    )
    # "Run Simulation": simulate from the stored data and fill the three
    # result tabs.
    run_button.click(
        fn=run_monte_carlo_simulation,
        inputs=[processed_data_state, num_simulations_input, target_value_input],
        outputs=[results_plot_output, cdf_plot_output, results_summary_output]
    )
    # "Explain the Takeaways": pass the numerical summary text to the model.
    explain_button.click(
        fn=generate_explanation,
        inputs=[results_summary_output],
        outputs=[explanation_output]
    )
# ----------------------------------------------------------------------------
# Launch the Gradio App
# ----------------------------------------------------------------------------
# Start the Gradio server only when this file is run as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    app.launch(debug=True)
# End of main.py