import os
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from smolagents import CodeAgent, DuckDuckGoSearchTool

warnings.filterwarnings('ignore')

# Replace 'your_file.csv' with your CSV file path
csv_file_path = "C:/Users/Cosmo/Desktop/NTU Peak Singtel/outsystems_sample_logs_6months.csv"


def set_csv_file_path(new_path):
    """Update the CSV file path."""
    global csv_file_path
    csv_file_path = new_path
    print(f"✅ CSV file path updated to: {csv_file_path}")


def get_csv_file_path():
    """Get the current CSV file path."""
    return csv_file_path
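

# Minimal usage sketch for the path helpers above; it is defined here but never called.
# The path used is a hypothetical placeholder, not a file shipped with this script.
def _example_path_setup():
    """Show how the module-level CSV path can be changed programmatically (sketch)."""
    set_csv_file_path("data/outsystems_sample_logs_6months.csv")  # placeholder path
    print(get_csv_file_path())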


class EnhancedDataExplorer:
    """Enhanced data explorer with SmoLagent AI capabilities."""

    def __init__(self, csv_path=csv_file_path):
        self.csv_path = csv_path
        self.df = None
        self.agent = None
        print("🚀 Enhanced Data Explorer initialized!")
        print("💡 AI setup will be done when first needed (option 4)")
        # Don't call setup_agent() here to avoid hanging

    def setup_agent(self):
        """Set up the SmoLagent AI agent, trying several model backends."""
        print("🤖 Setting up SmoLagent AI agent...")
        print("🔄 Trying multiple model configurations...")
        try:
            # Try Ollama via the ollama package first (fast and local)
            try:
                print("🔄 Attempting Ollama setup...")
                import ollama

                # Quick check that Ollama is available (without a generation test)
                models = ollama.list()
                if models and 'models' in models and len(models['models']) > 0:
                    print("✅ Ollama is running and accessible!")
                    print(f"📦 Found model: {models['models'][0].get('name', 'llama2')}")
                else:
                    raise Exception("No models found")

                # Custom wrapper so Ollama can be used as a smolagents-compatible model
                class OllamaModel:
                    def __init__(self, model_name="llama2"):
                        self.model_name = model_name
                        import ollama
                        self.ollama = ollama

                    def __call__(self, messages, **kwargs):
                        try:
                            # Convert the incoming messages to a single Ollama prompt
                            if isinstance(messages, str):
                                prompt = messages
                            elif isinstance(messages, list):
                                # Handle different message formats
                                if len(messages) > 0 and isinstance(messages[0], dict):
                                    # Extract content from message dictionaries
                                    prompt = "\n".join([
                                        msg.get('content', str(msg)) if isinstance(msg, dict) else str(msg)
                                        for msg in messages
                                    ])
                                else:
                                    prompt = "\n".join([str(msg) for msg in messages])
                            else:
                                prompt = str(messages)

                            # Run generation in a worker thread so we can enforce a
                            # 30-second timeout (signal-based timeouts don't work on Windows)
                            import threading
                            result = {'response': None, 'error': None}

                            def generate_with_timeout():
                                try:
                                    response = self.ollama.generate(model=self.model_name, prompt=prompt)
                                    result['response'] = response['response']
                                except Exception as e:
                                    result['error'] = str(e)

                            thread = threading.Thread(target=generate_with_timeout)
                            thread.daemon = True
                            thread.start()
                            thread.join(timeout=30)  # 30-second timeout

                            if thread.is_alive():
                                return "Error: Ollama response timed out after 30 seconds. Try a simpler query."
                            elif result['error']:
                                return f"Error generating response with Ollama: {result['error']}"
                            elif result['response']:
                                return result['response']
                            else:
                                return "Error: No response received from Ollama"
                        except Exception as e:
                            return f"Error generating response with Ollama: {e}"

                    def generate(self, messages, **kwargs):
                        """Alternative method name that some agent versions expect."""
                        return self.__call__(messages, **kwargs)

                model = OllamaModel("llama2")
                self.agent = CodeAgent(
                    tools=[DuckDuckGoSearchTool()],
                    model=model
                )
                print("✅ SmoLagent configured successfully with Ollama!")
                print("💡 Local AI model ready for analysis (with 30s timeout)")
                return
            except Exception as e:
                print(f"⚠️ Ollama setup failed: {e}")
                print("💡 Make sure Ollama is running: ollama serve")
            # Try OpenAI if an API key is available
            try:
                print("🔍 Checking for OpenAI API key...")
                # OpenAIServerModel is the OpenAI-compatible model class in recent smolagents releases
                from smolagents import OpenAIServerModel
                if os.getenv('OPENAI_API_KEY'):
                    model = OpenAIServerModel(model_id="gpt-3.5-turbo", api_key=os.getenv('OPENAI_API_KEY'))
                    self.agent = CodeAgent(
                        tools=[DuckDuckGoSearchTool()],
                        model=model
                    )
                    print("✅ SmoLagent configured successfully with OpenAI!")
                    return
                else:
                    print("⚠️ OpenAI API key not found")
            except Exception as e:
                print(f"⚠️ OpenAI setup failed: {e}")

            # Fall back to a (smaller) HuggingFace Transformers model
            try:
                print("🔄 Attempting HuggingFace Transformers model...")
                from smolagents import TransformersModel
                model = TransformersModel(model_id="microsoft/DialoGPT-small")  # Smaller model
                self.agent = CodeAgent(
                    tools=[DuckDuckGoSearchTool()],
                    model=model
                )
                print("✅ SmoLagent configured successfully with HuggingFace model!")
                print("💡 Note: First use may take time to download the model")
                return
            except Exception as e:
                print(f"⚠️ HuggingFace setup failed: {e}")
                print("   Make sure transformers is installed: pip install 'smolagents[transformers]'")

            # If all models fail
            print("\n❌ No AI model could be configured.")
            print("🔧 To fix this:")
            print("   1. For local AI: Install Ollama and run 'ollama serve'")
            print("   2. For OpenAI: Set the OPENAI_API_KEY environment variable")
            print("   3. For basic use: pip install 'smolagents[transformers]'")
            print("\n✅ You can still use all non-AI features!")
            self.agent = None
        except Exception as e:
            print(f"⚠️ Agent setup failed: {e}")
            print("💡 Try using: python fixed_upload.py")
            self.agent = None

    def configure_model_helper(self):
        """Helper function to guide model configuration."""
        print("\n🤖 AI Model Configuration Helper")
        print("=" * 40)
        print("1. OpenAI (Recommended - Most capable)")
        print("2. Ollama (Free - Runs locally)")
        print("3. Hugging Face (Free - API based)")
        print("4. Skip AI features")
        choice = input("Choose your model (1-4): ").strip()

        if choice == "1":
            print("\n📝 OpenAI Setup:")
            print("1. Get an API key from: https://platform.openai.com/")
            print("2. Set the environment variable: OPENAI_API_KEY=your_key")
            print("3. Or edit the setup_agent() method with your key")
        elif choice == "2":
            print("\n📝 Ollama Setup:")
            print("1. Install Ollama from: https://ollama.ai/")
            print("2. Run: ollama pull llama2")
            print("3. Start the server: ollama serve")
            print("4. This script already includes an Ollama wrapper in setup_agent()")
            print("5. Just make sure Ollama is running and try the AI analysis!")
        elif choice == "3":
            print("\n📝 Hugging Face Setup:")
            print("1. Create an account at: https://huggingface.co/")
            print("2. Get a token from: https://huggingface.co/settings/tokens")
            print("3. Set the environment variable: HF_TOKEN=your_token")
            print("4. setup_agent() will fall back to a Transformers model automatically")
        elif choice == "4":
            print("✅ You can still use all non-AI features!")

        print("\n💡 Tip: Set environment variables in your system or use a .env file")
        return choice

    def load_data(self):
        """Load the CSV data (keeping the original functionality)."""
        print(f"\n📂 Loading data from: {self.csv_path}")
        try:
            # Check that the file exists
            if not os.path.exists(self.csv_path):
                print(f"❌ Error: File not found at {self.csv_path}")
                print("💡 Use option 7 to change the file path")
                return None

            # Read the CSV file into a DataFrame
            self.df = pd.read_csv(self.csv_path)
            print("=== DATA LOADED SUCCESSFULLY ===")
            print(f"📁 File: {os.path.basename(self.csv_path)}")
            print(f"📊 Dataset shape: {self.df.shape}")
            print(f"📋 Columns: {list(self.df.columns)}")
            print("\n=== FIRST 5 ROWS ===")
            print(self.df.head())
            print("\n=== DATA TYPES ===")
            print(self.df.dtypes)
            print("\n=== MISSING VALUES ===")
            print(self.df.isnull().sum())
            print("\n=== BASIC STATISTICS ===")
            print(self.df.describe())
            return self.df
        except Exception as e:
            print(f"Error loading data: {str(e)}")
            return None

    def create_visualizations(self):
        """Create basic visualizations."""
        if self.df is None:
            print("❌ No data loaded. Run load_data() first.")
            return
        try:
            # Set up the plotting style
            plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'default')

            # Get the numeric columns
            numeric_cols = self.df.select_dtypes(include=[np.number]).columns
            if len(numeric_cols) == 0:
                print("⚠️ No numeric columns found for visualization")
                return

            print(f"\n=== CREATING VISUALIZATIONS FOR {len(numeric_cols)} NUMERIC COLUMNS ===")

            # 1. Distribution plots
            n_cols = min(3, len(numeric_cols))
            n_rows = (len(numeric_cols) + n_cols - 1) // n_cols
            plt.figure(figsize=(15, 5 * n_rows))
            for i, col in enumerate(numeric_cols):
                plt.subplot(n_rows, n_cols, i + 1)
                self.df[col].hist(bins=30, alpha=0.7, edgecolor='black')
                plt.title(f'Distribution of {col}')
                plt.xlabel(col)
                plt.ylabel('Frequency')
            plt.tight_layout()
            plt.savefig('data_distributions.png', dpi=300, bbox_inches='tight')
            plt.show()
            print("✅ Distribution plots saved as 'data_distributions.png'")

            # 2. Correlation heatmap (if more than one numeric column)
            if len(numeric_cols) > 1:
                plt.figure(figsize=(12, 8))
                correlation_matrix = self.df[numeric_cols].corr()
                sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
                            square=True, linewidths=0.5)
                plt.title('Correlation Heatmap')
                plt.tight_layout()
                plt.savefig('correlation_heatmap.png', dpi=300, bbox_inches='tight')
                plt.show()
                print("✅ Correlation heatmap saved as 'correlation_heatmap.png'")
        except Exception as e:
            print(f"❌ Error creating visualizations: {e}")

    def analyze_data_quality(self):
        """Analyze data quality issues."""
        if self.df is None:
            print("❌ No data loaded. Run load_data() first.")
            return

        print("\n=== DATA QUALITY ANALYSIS ===")

        # Missing data analysis
        missing_data = self.df.isnull().sum()
        missing_percentage = (missing_data / len(self.df)) * 100
        quality_report = pd.DataFrame({
            'Column': self.df.columns,
            'Missing_Count': missing_data.values,
            'Missing_Percentage': missing_percentage.values,
            'Data_Type': self.df.dtypes.values
        })
        print("Missing Data Summary:")
        print(quality_report[quality_report['Missing_Count'] > 0])

        # Duplicate rows
        duplicates = self.df.duplicated().sum()
        print(f"\nDuplicate rows: {duplicates}")

        # Memory usage
        memory_usage = self.df.memory_usage(deep=True).sum() / 1024**2
        print(f"Memory usage: {memory_usage:.2f} MB")
        return quality_report

    def ai_analysis(self, query):
        """Use SmoLagent for AI-powered analysis."""
        print("\n🔍 Checking prerequisites for AI analysis...")
        if self.agent is None:
            print("❌ AI agent not configured. Please set up SmoLagent first.")
            print("💡 Try running one of these alternatives:")
            print("   • python fixed_upload.py")
            print("   • python quick_ai_demo.py")
            return
        if self.df is None:
            print("❌ No data loaded. Please load data first!")
            print("💡 Choose option 1 in the main menu to load your data.")
            return

        print("✅ Data loaded successfully")
        print("✅ AI agent configured")
        print(f"✅ Processing query: '{query}'")

        # Prepare context about the dataset
        try:
            data_context = f"""
            Dataset Analysis Request:
            - Dataset Shape: {self.df.shape}
            - Columns: {list(self.df.columns)}
            - Data Types: {dict(self.df.dtypes)}
            - Missing Values: {dict(self.df.isnull().sum())}

            Sample Data:
            {self.df.head(3).to_string()}

            Statistical Summary:
            {self.df.describe().to_string()}

            User Question: {query}
            """
            print("\n🤖 SmoLagent is analyzing your data...")
            print("⏳ This may take 5-15 seconds...")

            # Run the agent with the data context and query
            response = self.agent.run(data_context)

            print("\n" + "=" * 60)
            print("✅ AI ANALYSIS COMPLETE")
            print("=" * 60)
            print(response)
            print("=" * 60)
            return response
        except Exception as e:
            print(f"\n❌ AI analysis failed: {e}")
            print("\n💡 Troubleshooting suggestions:")
            print("   • Check your internet connection")
            print("   • Try: python fixed_upload.py")
            print("   • Use basic analysis features (options 2-3)")
            return None

    def check_status(self):
        """Check the status of the data and AI setup."""
        print("\n🔍 SYSTEM STATUS CHECK")
        print("=" * 40)

        # Check the file path
        print(f"📁 CSV File: {self.csv_path}")
        if os.path.exists(self.csv_path):
            print(f"✅ File exists: {os.path.basename(self.csv_path)}")
        else:
            print("❌ File not found")

        # Check the data status
        if self.df is not None:
            print(f"✅ Data loaded: {self.df.shape[0]} rows, {self.df.shape[1]} columns")
            print(f"📋 Columns: {list(self.df.columns)}")
        else:
            print("❌ No data loaded")

        # Check the AI agent status
        if self.agent is not None:
            print("✅ AI agent configured and ready")
        else:
            print("❌ AI agent not configured")
        print("=" * 40)

    def change_csv_file(self, new_path=None):
        """Change the CSV file path."""
        if new_path is None:
            print(f"\n📁 Current file path: {self.csv_path}")
            new_path = input("Enter new CSV file path: ").strip()
        if os.path.exists(new_path):
            self.csv_path = new_path
            self.df = None  # Clear current data
            print(f"✅ CSV file path updated to: {self.csv_path}")
            print("💡 Data cleared. Use option 1 to load the new file.")
        else:
            print(f"❌ File not found: {new_path}")
            print("💡 Please check the file path and try again.")

    def interactive_menu(self):
        """Interactive menu for data exploration."""
        # Show the initial status
        self.check_status()

        while True:
            print("\n" + "=" * 50)
            print("🤖 ENHANCED DATA EXPLORER WITH AI")
            print("=" * 50)
            print("1. Load and explore data")
            print("2. Create visualizations")
            print("3. Analyze data quality")
            print("4. AI-powered analysis")
            print("5. Show data summary")
            print("6. Check system status")
            print("7. Change CSV file path")
            print("8. Exit")
            print("=" * 50)
            print(f"📁 Current file: {os.path.basename(self.csv_path)}")
            choice = input("Enter your choice (1-8): ").strip()

            if choice == '1':
                self.load_data()
            elif choice == '2':
                self.create_visualizations()
            elif choice == '3':
                self.analyze_data_quality()
            elif choice == '4':
                if self.df is None:
                    print("\n❌ No data loaded. Please load data first!")
                    print("💡 Choose option 1 to load your data before using AI analysis.")
                    input("\nPress Enter to continue...")
                else:
                    # Set up the AI on demand if not already done
                    if self.agent is None:
                        print("\n🤖 Setting up AI for first use...")
                        self.setup_agent()
                    if self.agent is None:
                        print("\n❌ AI features not available. Please configure a model first.")
                        print("Edit the setup_agent() method to add your API keys.")
                        self.configure_model_helper()
                    else:
                        print("\n🤖 AI Analysis - Ask me anything about your data!")
                        print("Example queries:")
                        print("   • 'What are the main trends in this data?'")
                        print("   • 'Find any outliers or anomalies'")
                        print("   • 'Suggest data quality improvements'")
                        print("   • 'Perform correlation analysis'")
                        print("   • 'Identify seasonal patterns'")
                        print("   • 'Recommend preprocessing steps'")
                        query = input("\n❓ Your question: ").strip()
                        if query:
                            self.ai_analysis(query)
                            # Wait for the user to read the results before returning to the menu
                            input("\nPress Enter to return to main menu...")
                        else:
                            print("❌ No question entered.")
                            input("\nPress Enter to continue...")
            elif choice == '5':
                if self.df is not None:
                    print("\n📊 Dataset Summary:")
                    print(f"Shape: {self.df.shape}")
                    print(f"Columns: {list(self.df.columns)}")
                    print(f"Memory: {self.df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
                else:
                    print("❌ No data loaded.")
            elif choice == '6':
                self.check_status()
            elif choice == '7':
                self.change_csv_file()
            elif choice == '8':
                print("👋 Goodbye!")
                break
            else:
                print("❌ Invalid choice. Please try again.")


def load_and_explore_data():
    """Load and explore the CSV data (keeping the original function)."""
    print(f"\n📂 Loading data from: {csv_file_path}")
    try:
        # Check that the file exists
        if not os.path.exists(csv_file_path):
            print(f"❌ Error: File not found at {csv_file_path}")
            print("💡 Update the csv_file_path variable at the top of this file")
            return None

        # Read the CSV file into a DataFrame
        df = pd.read_csv(csv_file_path)
        print("=== DATA LOADED SUCCESSFULLY ===")
        print(f"📁 File: {os.path.basename(csv_file_path)}")
        print(f"📊 Dataset shape: {df.shape}")
        print(f"📋 Columns: {list(df.columns)}")
        print("\n=== FIRST 5 ROWS ===")
        print(df.head())
        print("\n=== DATA TYPES ===")
        print(df.dtypes)
        print("\n=== MISSING VALUES ===")
        print(df.isnull().sum())
        print("\n=== BASIC STATISTICS ===")
        print(df.describe())
        return df
    except Exception as e:
        print(f"Error loading data: {str(e)}")
        return None
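

# A small non-interactive usage sketch of the class above, intended for calling from
# another module or a notebook rather than through the menu. The default CSV path is
# a hypothetical placeholder; pass your own file. Defined only, never called here.
def run_batch_exploration(csv_path="data/sample_logs.csv"):
    """Load a CSV, print the quality report, and save the standard plots (sketch)."""
    explorer = EnhancedDataExplorer(csv_path)
    if explorer.load_data() is None:
        return None
    explorer.analyze_data_quality()   # prints the missing-value/duplicate summary
    explorer.create_visualizations()  # saves data_distributions.png / correlation_heatmap.png
    return explorer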


if __name__ == "__main__":
    print("🚀 Enhanced Data Explorer with SmoLagent AI")
    print("Choose your preferred mode:")
    print("1. Original function (load_and_explore_data)")
    print("2. Enhanced interactive mode with AI")
    mode = input("Enter mode (1 or 2): ").strip()

    if mode == "1":
        # Run the original function
        df = load_and_explore_data()
    elif mode == "2":
        # Run the enhanced mode with AI capabilities
        explorer = EnhancedDataExplorer()
        explorer.interactive_menu()
    else:
        print("Invalid choice. Running original function...")
        df = load_and_explore_data()