Spaces:
Runtime error
Runtime error
cosmoruler
committed on
Commit
·
5269c7e
1
Parent(s):
532a561
first draft
Browse files- README.md +202 -2
- analyze.py +126 -0
- app.py +234 -0
- config.py +62 -0
- requirements.txt +11 -0
- test_setup.py +38 -0
- upload.py +40 -0
README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
colorFrom: purple
|
5 |
colorTo: green
|
6 |
sdk: gradio
|
@@ -9,4 +9,204 @@ app_file: app.py
|
|
9 |
pinned: false
|
10 |
---
|
11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: AI Data Analysis with SmoLagent
|
3 |
+
emoji: 🤖
|
4 |
colorFrom: purple
|
5 |
colorTo: green
|
6 |
sdk: gradio
|
|
|
9 |
pinned: false
|
10 |
---
|
11 |
|
12 |
+
# AI Data Analysis with SmoLagent
|
13 |
+
|
14 |
+
An intelligent data analysis application that uses SmoLagent for AI-powered insights on CSV data.
|
15 |
+
|
16 |
+
## Features
|
17 |
+
|
18 |
+
🤖 **AI-Powered Analysis**: Uses SmoLagent for natural language queries about your data
|
19 |
+
📊 **Interactive Visualizations**: Correlation heatmaps, distribution plots, and more
|
20 |
+
📈 **Statistical Analysis**: Comprehensive statistical summaries and insights
|
21 |
+
🌐 **Web Interface**: User-friendly Gradio interface for easy interaction
|
22 |
+
🔧 **Flexible**: Works with various LLM models (OpenAI, local models, etc.)
|
23 |
+
|
24 |
+
## Quick Start
|
25 |
+
|
26 |
+
1. **Install Dependencies**
|
27 |
+
|
28 |
+
```bash
|
29 |
+
pip install -r requirements.txt
|
30 |
+
```
|
31 |
+
|
32 |
+
2. **Configure Your Model** (Optional for basic features)
|
33 |
+
|
34 |
+
- Edit `config.py` to set up your preferred LLM
|
35 |
+
- Supported: OpenAI, Ollama, Hugging Face, and more
|
36 |
+
|
37 |
+
3. **Run the Application**
|
38 |
+
|
39 |
+
```bash
|
40 |
+
python app.py
|
41 |
+
```
|
42 |
+
|
43 |
+
4. **Access the Interface**
|
44 |
+
- Open your browser to the displayed URL (usually http://localhost:7860)
|
45 |
+
|
46 |
+
## Files Overview
|
47 |
+
|
48 |
+
- `app.py` - Main Gradio application with AI analysis features
|
49 |
+
- `upload.py` - Data loading and exploration script
|
50 |
+
- `analyze.py` - Example script showing SmoLagent usage
|
51 |
+
- `config.py` - Configuration file for model setup
|
52 |
+
- `requirements.txt` - Python dependencies
|
53 |
+
|
54 |
+
## Model Configuration
|
55 |
+
|
56 |
+
### OpenAI Models
|
57 |
+
|
58 |
+
```python
|
59 |
+
from smolagents.models import OpenAIServerModel
|
60 |
+
model = OpenAIServerModel(
|
61 |
+
model_id="gpt-4",
|
62 |
+
api_key="your-openai-api-key"
|
63 |
+
)
|
64 |
+
```
|
65 |
+
|
66 |
+
### Local Models (Ollama)
|
67 |
+
|
68 |
+
```python
|
69 |
+
from smolagents.models import LiteLLMModel
|
70 |
+
model = LiteLLMModel(
|
71 |
+
model_id="ollama/llama2",
|
72 |
+
api_base="http://localhost:11434"
|
73 |
+
)
|
74 |
+
```
|
75 |
+
|
76 |
+
### Hugging Face Models
|
77 |
+
|
78 |
+
```python
|
79 |
+
from smolagents.models import HfApiModel
|
80 |
+
model = HfApiModel(
|
81 |
+
model_id="microsoft/DialoGPT-medium",
|
82 |
+
token="your-hf-token"
|
83 |
+
)
|
84 |
+
```
|
85 |
+
|
86 |
+
## Usage Examples
|
87 |
+
|
88 |
+
### Basic Data Exploration
|
89 |
+
|
90 |
+
```bash
|
91 |
+
python upload.py # Load and explore your CSV data
|
92 |
+
```
|
93 |
+
|
94 |
+
### Interactive Analysis
|
95 |
+
|
96 |
+
```bash
|
97 |
+
python app.py # Start the web interface
|
98 |
+
```
|
99 |
+
|
100 |
+
### Programmatic Analysis
|
101 |
+
|
102 |
+
```bash
|
103 |
+
python analyze.py # Run example analysis scripts
|
104 |
+
```
|
105 |
+
|
106 |
+
## Features Available
|
107 |
+
|
108 |
+
### 1. Data Overview Tab
|
109 |
+
|
110 |
+
- Dataset shape and structure
|
111 |
+
- Column information and data types
|
112 |
+
- Missing value analysis
|
113 |
+
- Memory usage statistics
|
114 |
+
|
115 |
+
### 2. Basic Statistics Tab
|
116 |
+
|
117 |
+
- Descriptive statistics for all columns
|
118 |
+
- Summary statistics (mean, median, std, etc.)
|
119 |
+
- Data distribution insights
|
120 |
+
|
121 |
+
### 3. Visualizations Tab
|
122 |
+
|
123 |
+
- **Correlation Heatmap**: Shows relationships between numerical variables
|
124 |
+
- **Distribution Plots**: Histograms for all numerical columns
|
125 |
+
|
126 |
+
### 4. AI Analysis Tab
|
127 |
+
|
128 |
+
- Natural language queries about your data
|
129 |
+
- AI-powered insights and recommendations
|
130 |
+
- Automated pattern detection
|
131 |
+
- Outlier identification
|
132 |
+
|
133 |
+
## Example AI Queries
|
134 |
+
|
135 |
+
Ask SmoLagent questions like:
|
136 |
+
|
137 |
+
- "What are the main trends in this data?"
|
138 |
+
- "Find any outliers or anomalies"
|
139 |
+
- "Suggest the best features for prediction"
|
140 |
+
- "Identify data quality issues"
|
141 |
+
- "Perform clustering analysis"
|
142 |
+
- "Find seasonal patterns"
|
143 |
+
|
144 |
+
## Data Requirements
|
145 |
+
|
146 |
+
- CSV format
|
147 |
+
- Update the file path in `config.py` or `upload.py`
|
148 |
+
- Supports various data types (numerical, categorical, datetime)
|
149 |
+
|
150 |
+
## Troubleshooting
|
151 |
+
|
152 |
+
### Common Issues:
|
153 |
+
|
154 |
+
1. **File Not Found Error**
|
155 |
+
|
156 |
+
- Check the CSV file path in `config.py`
|
157 |
+
- Ensure the file exists and is accessible
|
158 |
+
|
159 |
+
2. **Model Configuration Error**
|
160 |
+
|
161 |
+
- Verify your API keys in `config.py`
|
162 |
+
- Check model availability and configuration
|
163 |
+
|
164 |
+
3. **Dependency Issues**
|
165 |
+
- Run `pip install -r requirements.txt`
|
166 |
+
- Ensure Python 3.8+ is installed
|
167 |
+
|
168 |
+
### Getting Help:
|
169 |
+
|
170 |
+
- Check the console output for detailed error messages
|
171 |
+
- Verify your model configuration in `config.py`
|
172 |
+
- Ensure your CSV file is properly formatted
|
173 |
+
|
174 |
+
## Advanced Usage
|
175 |
+
|
176 |
+
### Custom Analysis Functions
|
177 |
+
|
178 |
+
You can extend the application by adding custom analysis functions:
|
179 |
+
|
180 |
+
```python
|
181 |
+
def custom_analysis(df):
|
182 |
+
# Your custom analysis logic here
|
183 |
+
return results
|
184 |
+
```
|
185 |
+
|
186 |
+
### Adding New Visualizations
|
187 |
+
|
188 |
+
Add new plotting functions to create additional visualizations:
|
189 |
+
|
190 |
+
```python
|
191 |
+
def create_custom_plot(df):
|
192 |
+
# Your plotting logic here
|
193 |
+
return plot_image
|
194 |
+
```
|
195 |
+
|
196 |
+
## Dependencies
|
197 |
+
|
198 |
+
- smolagents - AI agent framework
|
199 |
+
- gradio - Web interface
|
200 |
+
- pandas - Data manipulation
|
201 |
+
- numpy - Numerical computing
|
202 |
+
- matplotlib/seaborn - Plotting
|
203 |
+
- plotly - Interactive visualizations
|
204 |
+
- scikit-learn - Machine learning tools
|
205 |
+
|
206 |
+
## License
|
207 |
+
|
208 |
+
This project is open source and available under the MIT License.
|
209 |
+
|
210 |
+
## Configuration Reference
|
211 |
+
|
212 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
analyze.py
ADDED
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Example script demonstrating SmoLagent data analysis
|
3 |
+
===================================================
|
4 |
+
|
5 |
+
This script shows how to use SmoLagent for automated data analysis
|
6 |
+
"""
|
7 |
+
|
8 |
+
import pandas as pd
|
9 |
+
from smolagents import CodeAgent, PythonCodeTool
|
10 |
+
import matplotlib.pyplot as plt
|
11 |
+
import seaborn as sns
|
12 |
+
|
13 |
+
# Configuration
|
14 |
+
CSV_FILE_PATH = "C:/Users/Cosmo/Desktop/NTU Peak Singtel/outsystems_sample_logs_6months.csv"
|
15 |
+
|
16 |
+
def simple_data_analysis(csv_path=None):
    """Load a CSV file, print a textual summary and save basic plots.

    No AI agent is involved; this is plain pandas/matplotlib/seaborn.

    Args:
        csv_path: Path of the CSV file to analyse. When omitted, the
            module-level ``CSV_FILE_PATH`` is used.

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` if the file could not
        be read.
    """
    if csv_path is None:
        csv_path = CSV_FILE_PATH

    print("=== LOADING DATA ===")
    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        print(f"❌ Error loading data: {e}")
        return None
    print(f"✅ Data loaded successfully! Shape: {df.shape}")

    print("\n=== BASIC INFO ===")
    print(f"Columns: {list(df.columns)}")
    print(f"Data types:\n{df.dtypes}")
    print(f"\nMissing values:\n{df.isnull().sum()}")

    print("\n=== STATISTICAL SUMMARY ===")
    print(df.describe())

    # Plots are only meaningful for numeric columns.
    numeric_columns = df.select_dtypes(include=['number']).columns

    if len(numeric_columns) > 0:
        print(f"\n=== CREATING PLOTS FOR {len(numeric_columns)} NUMERIC COLUMNS ===")

        # Distribution plots: a fixed 2x3 grid, so at most six columns fit.
        plt.figure(figsize=(15, 10))
        for i, col in enumerate(numeric_columns[:6]):
            plt.subplot(2, 3, i + 1)
            df[col].hist(bins=30, alpha=0.7)
            plt.title(f'Distribution of {col}')
            plt.xlabel(col)
            plt.ylabel('Frequency')

        plt.tight_layout()
        plt.savefig('distributions.png', dpi=300, bbox_inches='tight')
        plt.show()
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close()
        print("✅ Distribution plots saved as 'distributions.png'")

    # A correlation matrix needs at least two numeric columns.
    if len(numeric_columns) > 1:
        plt.figure(figsize=(12, 8))
        correlation_matrix = df[numeric_columns].corr()
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
        plt.title('Correlation Heatmap')
        plt.tight_layout()
        plt.savefig('correlation_heatmap.png', dpi=300, bbox_inches='tight')
        plt.show()
        plt.close()
        print("✅ Correlation heatmap saved as 'correlation_heatmap.png'")

    return df
|
66 |
+
|
67 |
+
def analyze_with_smolagent_example():
    """Print a walkthrough of how SmoLagent could be used on the CSV data.

    The agent itself is not created here: the model/agent construction is
    left as a commented template to be enabled once a model is configured.
    Any exception raised while loading the data is reported instead of
    propagated.
    """
    print("\n=== SMOLAGENT ANALYSIS EXAMPLE ===")
    print("Note: This requires proper model configuration in config.py")

    # Template only - a model has to be configured before the agent can run.
    try:
        # Uncomment and configure based on your model choice:

        # For OpenAI:
        # from smolagents.models import OpenAIServerModel
        # model = OpenAIServerModel(model_id="gpt-4", api_key="your-api-key")

        # For local Ollama:
        # from smolagents.models import LiteLLMModel
        # model = LiteLLMModel(model_id="ollama/llama2", api_base="http://localhost:11434")

        # Create agent
        # python_tool = PythonCodeTool()
        # agent = CodeAgent(tools=[python_tool], model=model)

        # The data the (commented-out) agent call below would analyse.
        df = pd.read_csv(CSV_FILE_PATH)

        # Sample prompts, shown to the user as a numbered list.
        example_queries = (
            "Analyze the distribution of numerical columns and identify any outliers",
            "Find correlations between variables and suggest interesting patterns",
            "Perform clustering analysis on the data",
            "Identify trends and seasonality in time-series data",
            "Suggest data quality improvements",
        )

        print("Example queries you can ask SmoLagent:")
        print("\n".join(
            f"{number}. {text}"
            for number, text in enumerate(example_queries, 1)
        ))

        for instruction in (
            "\nTo use SmoLagent:",
            "1. Configure your model in config.py",
            "2. Uncomment the model initialization code above",
            "3. Run the agent with your queries",
        ):
            print(instruction)

        # Example usage (commented out until model is configured):
        # response = agent.run(f"Analyze this dataset: {df.head().to_string()}")
        # print(f"AI Analysis: {response}")

    except Exception as e:
        print(f"SmoLagent setup needed: {e}")
|
115 |
+
|
116 |
+
if __name__ == "__main__":
    # First the plain pandas/matplotlib pass over the CSV file ...
    df = simple_data_analysis()

    # ... then the printed guide on enabling the SmoLagent-based analysis.
    analyze_with_smolagent_example()

    # Closing hints for the user.
    for hint in (
        "\n=== NEXT STEPS ===",
        "1. Configure your AI model in config.py",
        "2. Run 'python app.py' to start the Gradio interface",
        "3. Use the web interface for interactive analysis",
    ):
        print(hint)
|
app.py
CHANGED
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
import matplotlib.pyplot as plt
|
5 |
+
import seaborn as sns
|
6 |
+
import plotly.express as px
|
7 |
+
import plotly.graph_objects as go
|
8 |
+
from smolagents import CodeAgent, DuckDuckGoSearchTool, PythonCodeTool
|
9 |
+
from smolagents.models import OpenAIServerModel
|
10 |
+
import io
|
11 |
+
import base64
|
12 |
+
from PIL import Image
|
13 |
+
|
14 |
+
# Configure the CSV file path
|
15 |
+
CSV_FILE_PATH = "C:/Users/Cosmo/Desktop/NTU Peak Singtel/outsystems_sample_logs_6months.csv"
|
16 |
+
|
17 |
+
class DataAnalysisAgent:
    """Hold a CSV-backed DataFrame and expose summary, plotting and AI helpers.

    The DataFrame is read from the module-level ``CSV_FILE_PATH`` when the
    object is created. AI analysis is only available once an agent has been
    assigned to ``agent``; until then the related method returns an
    explanatory message instead of failing.
    """

    # Agent used for natural-language analysis; None until a model is configured.
    agent = None
    # DataFrame read from CSV_FILE_PATH; None when nothing could be loaded.
    df = None

    def __init__(self):
        """Create the tools, leave the agent unconfigured and load the CSV data."""
        # NOTE(review): confirm that the installed smolagents release exports
        # PythonCodeTool - a failing import at module level would stop the app.
        self.python_tool = PythonCodeTool()
        self.search_tool = DuckDuckGoSearchTool()

        # Always define the attribute so analyze_with_smolagent() can test it.
        # (Previously it was only assigned inside an `except` branch that could
        # never run, which made every AI query fail with AttributeError.)
        self.agent = None
        # To enable AI analysis, build a model and an agent here, e.g.:
        # model = OpenAIServerModel(model_id="gpt-4", api_key="your-api-key")
        # self.agent = CodeAgent(tools=[self.python_tool, self.search_tool], model=model)

        self.df = None
        self.load_data()

    def load_data(self):
        """Read ``CSV_FILE_PATH`` into ``self.df``.

        Returns:
            A human-readable status message (success with the shape, or the
            error text). ``self.df`` is left untouched on failure.
        """
        try:
            self.df = pd.read_csv(CSV_FILE_PATH)
        except Exception as e:
            return f"Error loading data: {str(e)}"
        return f"Data loaded successfully! Shape: {self.df.shape}"

    def get_data_overview(self):
        """Return a dict describing the dataset, or a message if none is loaded.

        The dict has the keys ``shape``, ``columns``, ``dtypes``,
        ``missing_values`` and ``memory_usage`` (formatted in MB).
        """
        if self.df is None:
            return "No data loaded"

        overview = {
            "shape": self.df.shape,
            "columns": list(self.df.columns),
            "dtypes": self.df.dtypes.to_dict(),
            "missing_values": self.df.isnull().sum().to_dict(),
            "memory_usage": f"{self.df.memory_usage(deep=True).sum() / 1024**2:.2f} MB"
        }

        return overview

    def generate_basic_stats(self):
        """Return ``describe(include='all')`` as an HTML table, or a message."""
        if self.df is None:
            return "No data loaded"

        return self.df.describe(include='all').to_html()

    @staticmethod
    def _figure_to_png_buffer(fig):
        """Render *fig* to an in-memory PNG, close it and return the rewound buffer."""
        img_buffer = io.BytesIO()
        try:
            fig.savefig(img_buffer, format='png', dpi=300, bbox_inches='tight')
        finally:
            # Close exactly the figure we rendered, even if saving failed.
            plt.close(fig)
        img_buffer.seek(0)
        return img_buffer

    def create_correlation_heatmap(self):
        """Create a correlation heatmap for the numerical columns.

        Returns:
            ``None`` when no data is loaded, a message string when fewer than
            two numerical columns exist, otherwise an ``io.BytesIO`` holding
            the PNG image.
        """
        if self.df is None:
            return None

        numeric_cols = self.df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) < 2:
            return "Not enough numerical columns for correlation analysis"

        fig, ax = plt.subplots(figsize=(12, 8))
        correlation_matrix = self.df[numeric_cols].corr()
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
        ax.set_title('Correlation Heatmap')
        fig.tight_layout()

        return self._figure_to_png_buffer(fig)

    def create_distribution_plots(self):
        """Create one histogram per numerical column on a grid up to 3 wide.

        Returns:
            ``None`` when no data is loaded, a message string when there are
            no numerical columns, otherwise an ``io.BytesIO`` holding the PNG.
        """
        if self.df is None:
            return None

        numeric_cols = self.df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) == 0:
            return "No numerical columns found"

        n_cols = min(3, len(numeric_cols))
        n_rows = (len(numeric_cols) + n_cols - 1) // n_cols  # ceiling division

        # squeeze=False always yields a 2-D array of axes, so one flatten()
        # covers the single-plot, single-row and grid cases alike.
        fig, axes = plt.subplots(
            n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False
        )
        axes = axes.flatten()

        for ax, col in zip(axes, numeric_cols):
            self.df[col].hist(bins=30, ax=ax, alpha=0.7)
            ax.set_title(f'Distribution of {col}')
            ax.set_xlabel(col)
            ax.set_ylabel('Frequency')

        # Hide the unused cells of the last grid row.
        for ax in axes[len(numeric_cols):]:
            ax.set_visible(False)

        fig.tight_layout()

        return self._figure_to_png_buffer(fig)

    def analyze_with_smolagent(self, query):
        """Ask the configured agent to analyse the data for a user query.

        Args:
            query: The user's question in natural language.

        Returns:
            The agent's response; or a message string when no agent is
            configured, no data is loaded, or the agent raised an error.
        """
        if self.agent is None:
            return "SmoLagent not configured. Please set up your LLM model."
        if self.df is None:
            # Without data there is no context to build the prompt from.
            return "No data loaded"

        # Compact description of the dataset that is embedded in the prompt.
        data_context = f"""
        Dataset shape: {self.df.shape}
        Columns: {list(self.df.columns)}
        Data types: {self.df.dtypes.to_dict()}
        First few rows: {self.df.head().to_string()}
        """

        prompt = f"""
        You have access to a pandas DataFrame with the following information:
        {data_context}

        User query: {query}

        Please analyze the data and provide insights. Use the PythonCodeTool to write and execute code for analysis.
        """

        try:
            response = self.agent.run(prompt)
            return response
        except Exception as e:
            return f"Error in SmoLagent analysis: {str(e)}"
|
157 |
+
|
158 |
+
# Initialize the agent
|
159 |
+
data_agent = DataAnalysisAgent()
|
160 |
+
|
161 |
+
def analyze_data_overview():
    """Gradio callback: return the dataset overview rendered as text."""
    return str(data_agent.get_data_overview())
|
165 |
+
|
166 |
+
def generate_statistics():
    """Gradio callback: return the statistical summary produced by the agent."""
    stats_markup = data_agent.generate_basic_stats()
    return stats_markup
|
169 |
+
|
170 |
+
def create_correlation_plot():
    """Gradio callback: return the correlation heatmap as a PIL image.

    Returns:
        The rendered image, or ``None`` when the agent produced no image
        (no data loaded, or a message string explaining why no plot exists).
    """
    img_buffer = data_agent.create_correlation_heatmap()
    # None means "no data"; a str is an explanatory message. Neither can be
    # opened as an image, so show nothing instead of crashing in Image.open.
    if img_buffer is None or isinstance(img_buffer, str):
        return None
    return Image.open(img_buffer)
|
176 |
+
|
177 |
+
def create_distribution_plot():
    """Gradio callback: return the distribution plots as a PIL image.

    Returns:
        The rendered image, or ``None`` when the agent produced no image
        (no data loaded, or a message string explaining why no plot exists).
    """
    img_buffer = data_agent.create_distribution_plots()
    # None means "no data"; a str is an explanatory message. Neither can be
    # opened as an image, so show nothing instead of crashing in Image.open.
    if img_buffer is None or isinstance(img_buffer, str):
        return None
    return Image.open(img_buffer)
|
183 |
+
|
184 |
+
def smolagent_analysis(query):
    """Gradio callback: forward the user's question to the data agent."""
    answer = data_agent.analyze_with_smolagent(query)
    return answer
|
187 |
+
|
188 |
+
# Create Gradio interface
|
189 |
+
# Gradio UI: one tab per analysis feature, each wired to its callback above.
with gr.Blocks(title="AI Data Analysis with SmoLagent") as demo:
    gr.Markdown("# AI Data Analysis Dashboard")
    gr.Markdown("Analyze your CSV data using AI-powered insights with SmoLagent")

    # Tab 1: textual overview of the dataset.
    with gr.Tab("Data Overview"):
        gr.Markdown("## Dataset Overview")
        overview_btn = gr.Button("Get Data Overview")
        overview_output = gr.Textbox(label="Dataset Information", lines=10)
        overview_btn.click(fn=analyze_data_overview, outputs=overview_output)

    # Tab 2: describe() table rendered as HTML.
    with gr.Tab("Basic Statistics"):
        gr.Markdown("## Statistical Summary")
        stats_btn = gr.Button("Generate Statistics")
        stats_output = gr.HTML(label="Statistical Summary")
        stats_btn.click(fn=generate_statistics, outputs=stats_output)

    # Tab 3: the two plot types side by side.
    with gr.Tab("Visualizations"):
        gr.Markdown("## Data Visualizations")

        with gr.Row():
            corr_btn = gr.Button("Generate Correlation Heatmap")
            dist_btn = gr.Button("Generate Distribution Plots")

        with gr.Row():
            corr_plot = gr.Image(label="Correlation Heatmap")
            dist_plot = gr.Image(label="Distribution Plots")

        corr_btn.click(fn=create_correlation_plot, outputs=corr_plot)
        dist_btn.click(fn=create_distribution_plot, outputs=dist_plot)

    # Tab 4: free-text question answered by the agent.
    with gr.Tab("AI Analysis"):
        gr.Markdown("## SmoLagent AI Analysis")
        gr.Markdown("Ask questions about your data and get AI-powered insights")

        query_input = gr.Textbox(
            label="Enter your analysis question",
            placeholder="e.g., 'What are the main trends in this data?' or 'Find outliers and anomalies'",
            lines=3,
        )
        analyze_btn = gr.Button("Analyze with AI")
        ai_output = gr.Textbox(label="AI Analysis Results", lines=15)

        analyze_btn.click(
            fn=smolagent_analysis,
            inputs=query_input,
            outputs=ai_output,
        )

# Start the web server only when run as a script.
if __name__ == "__main__":
    demo.launch()
|
config.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Configuration file for SmoLagent setup

"""
SmoLagent Configuration Guide
============================

To use the AI analysis features, you need to configure a Language Model.
SmoLagent supports various models including:

1. OpenAI Models (GPT-3.5, GPT-4)
2. Local models (Ollama, LMStudio)
3. Hugging Face models
4. Other API-compatible models

Example configurations:

# For OpenAI:
from smolagents.models import OpenAIServerModel
model = OpenAIServerModel(
    model_id="gpt-4",
    api_key="your-openai-api-key"
)

# For local Ollama:
from smolagents.models import LiteLLMModel
model = LiteLLMModel(
    model_id="ollama/llama2",
    api_base="http://localhost:11434"
)

# For Hugging Face:
from smolagents.models import HfApiModel
model = HfApiModel(
    model_id="microsoft/DialoGPT-medium",
    token="your-hf-token"
)

Instructions:
1. Choose your preferred model from above
2. Get the necessary API keys/tokens
3. Update the model configuration in app.py
4. Replace the placeholder model initialization with your chosen configuration
"""

import os

# CSV file path configuration.
# The CSV_FILE_PATH environment variable, when set, overrides the default
# below, so the machine-specific path does not have to be edited in code.
CSV_FILE_PATH = os.environ.get(
    "CSV_FILE_PATH",
    "C:/Users/Cosmo/Desktop/NTU Peak Singtel/outsystems_sample_logs_6months.csv",
)

# Model configuration (update with your preferred settings)
MODEL_CONFIG = {
    "provider": "openai",  # Change to your preferred provider
    "model_id": "gpt-4",  # Change to your preferred model
    # Prefer supplying the key through the OPENAI_API_KEY environment variable
    # instead of committing a real key; the placeholder remains the fallback.
    "api_key": os.environ.get("OPENAI_API_KEY", "your-api-key-here"),
    "api_base": None,  # For local models, set the base URL
}

# Analysis settings
ANALYSIS_SETTINGS = {
    "max_rows_display": 1000,
    # NOTE(review): matplotlib >= 3.6 names this style "seaborn-v0_8"; confirm
    # the value before passing it to plt.style.use().
    "plot_style": "seaborn",
    "figure_size": (12, 8),
    "dpi": 300,
}
|
requirements.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
smolagents>=0.3.0
|
2 |
+
gradio>=5.37.0
|
3 |
+
pandas>=2.0.0
|
4 |
+
numpy>=1.24.0
|
5 |
+
matplotlib>=3.7.0
|
6 |
+
seaborn>=0.12.0
|
7 |
+
plotly>=5.15.0
|
8 |
+
Pillow>=10.0.0
|
9 |
+
scikit-learn>=1.3.0
|
10 |
+
openai>=1.0.0
|
11 |
+
requests>=2.31.0
|
test_setup.py
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Setup verification script: reports the interpreter, checks that the key
# packages can be imported and that the CSV data file can be read.
import sys
import os

print("Python version:", sys.version)
print("Current directory:", os.getcwd())

# Each import is probed separately so one missing package does not hide the
# status of the others.
try:
    import pandas as pd
    print("✅ Pandas imported successfully")
except ImportError as e:
    print("❌ Pandas import failed:", e)

try:
    import smolagents
    print("✅ SmoLagents imported successfully")
except ImportError as e:
    print("❌ SmoLagents import failed:", e)

try:
    import gradio as gr
    print("✅ Gradio imported successfully")
except ImportError as e:
    print("❌ Gradio import failed:", e)

# Check if CSV file exists
csv_path = "C:/Users/Cosmo/Desktop/NTU Peak Singtel/outsystems_sample_logs_6months.csv"
if os.path.exists(csv_path):
    print(f"✅ CSV file found at: {csv_path}")
    try:
        # If pandas failed to import above, the resulting NameError is
        # reported here like any other loading problem.
        df = pd.read_csv(csv_path)
        print(f"✅ CSV loaded successfully. Shape: {df.shape}")
        print(f"Columns: {list(df.columns)}")
    except Exception as e:
        print(f"❌ Error loading CSV: {e}")
else:
    print(f"❌ CSV file not found at: {csv_path}")
    print("Please check the file path and ensure the file exists.")

print("\n🎉 Setup verification complete!")
|
upload.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import os
|
3 |
+
|
4 |
+
# Replace 'your_file.csv' with your CSV file path
|
5 |
+
csv_file_path = "C:/Users/Cosmo/Desktop/NTU Peak Singtel/outsystems_sample_logs_6months.csv"
|
6 |
+
|
7 |
+
def load_and_explore_data(path=None):
    """Load a CSV file and print an exploratory summary of it.

    Prints the shape, column names, first five rows, dtypes, missing-value
    counts and ``describe()`` statistics.

    Args:
        path: Path of the CSV file to load. When omitted, the module-level
            ``csv_file_path`` is used.

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` when the file does not
        exist or could not be read.
    """
    if path is None:
        path = csv_file_path

    try:
        # Report a missing file explicitly rather than via a pandas traceback.
        if not os.path.exists(path):
            print(f"Error: File not found at {path}")
            return None

        # Read the CSV file into a DataFrame
        df = pd.read_csv(path)

        print("=== DATA LOADED SUCCESSFULLY ===")
        print(f"Dataset shape: {df.shape}")
        print(f"Columns: {list(df.columns)}")
        print("\n=== FIRST 5 ROWS ===")
        print(df.head())

        print("\n=== DATA TYPES ===")
        print(df.dtypes)

        print("\n=== MISSING VALUES ===")
        print(df.isnull().sum())

        print("\n=== BASIC STATISTICS ===")
        print(df.describe())

        return df

    except Exception as e:
        print(f"Error loading data: {str(e)}")
        return None


if __name__ == "__main__":
    df = load_and_explore_data()
|