# Singtel_Use_Case1 / upload.py
# cosmoruler
# stuck already
# db6dcad
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from smolagents import CodeAgent, DuckDuckGoSearchTool
import warnings
# Suppress every warning category for the whole process so library warnings
# do not clutter the interactive console output.
warnings.filterwarnings('ignore')
# Default CSV file analysed by this script. Edit this path, or call
# set_csv_file_path() at runtime, to point at a different file.
csv_file_path = "C:/Users/Cosmo/Desktop/NTU Peak Singtel/outsystems_sample_logs_6months.csv"
def set_csv_file_path(new_path):
    """Rebind the module-level ``csv_file_path`` to *new_path* and report it.

    Args:
        new_path: Path that subsequent reads of ``csv_file_path`` should see.

    Returns:
        None. A confirmation line is printed to stdout.
    """
    global csv_file_path
    csv_file_path = new_path
    # The global now holds new_path, so formatting either name prints the same text.
    print(f"βœ… CSV file path updated to: {new_path}")
def get_csv_file_path():
    """Return the value currently bound to the module-level ``csv_file_path``."""
    current_path = csv_file_path
    return current_path
class EnhancedDataExplorer:
    """Interactive CSV explorer with optional SmoLagent-backed AI analysis.

    Attributes:
        csv_path: Path of the CSV file that ``load_data`` reads.
        df: The loaded ``pandas.DataFrame``, or ``None`` until ``load_data`` succeeds.
        agent: The configured ``CodeAgent``, or ``None`` until ``setup_agent`` succeeds.
    """

    def __init__(self, csv_path=None):
        """Create an explorer without loading data or configuring the AI agent.

        Args:
            csv_path: CSV file to explore. When ``None`` (the default) the
                module-level ``csv_file_path`` is read at call time, so a prior
                ``set_csv_file_path()`` call is honoured.
        """
        # Resolve the default lazily: a default bound in the signature would be
        # frozen when the class is defined and ignore later path updates.
        self.csv_path = csv_file_path if csv_path is None else csv_path
        self.df = None
        self.agent = None
        print("πŸš€ Enhanced Data Explorer initialized!")
        print("πŸ’‘ AI setup will be done when first needed (option 4)")
        # setup_agent() is deliberately not called here to avoid hanging at start-up.

    def setup_agent(self):
        """Configure ``self.agent`` using the first backend that works.

        Backends are tried in order: local Ollama, OpenAI (requires the
        ``OPENAI_API_KEY`` environment variable), then a HuggingFace
        Transformers model. ``self.agent`` is set to ``None`` when none of
        them can be configured.
        """
        print("πŸ€– Setting up SmoLagent AI agent...")
        print("πŸ”„ Trying multiple model configurations...")
        try:
            backends = (self._try_ollama, self._try_openai, self._try_transformers)
            for configure_backend in backends:
                if configure_backend():
                    return
            # If all models fail
            print("\n❌ No AI model could be configured.")
            print("πŸ“‹ To fix this:")
            print(" 1. For local AI: Install Ollama and run 'ollama serve'")
            print(" 2. For OpenAI: Set OPENAI_API_KEY environment variable")
            print(" 3. For basic use: pip install 'smolagents[transformers]'")
            print("\nβœ… You can still use all non-AI features!")
            self.agent = None
        except Exception as e:
            # Last-resort boundary: each backend helper already handles its own errors.
            print(f"⚠️ Agent setup failed: {e}")
            print("πŸ’‘ Try using: python fixed_upload.py")
            self.agent = None

    def _try_ollama(self):
        """Try the local Ollama backend; return True when ``self.agent`` was set."""
        try:
            print("πŸ”„ Attempting Ollama setup...")
            import ollama
            # Quick availability test: list installed models without generating text.
            models = ollama.list()
            if models and 'models' in models and len(models['models']) > 0:
                print("βœ… Ollama is running and accessible!")
                print(f"πŸ“¦ Found model: {models['models'][0].get('name', 'llama2')}")
            else:
                raise Exception("No models found")
            # NOTE(review): the model used is always "llama2", regardless of
            # which model was reported above — confirm that is intended.
            model = _OllamaModel("llama2")
            self.agent = CodeAgent(
                tools=[DuckDuckGoSearchTool()],
                model=model
            )
            print("βœ… SmoLagent configured successfully with Ollama!")
            print("πŸ’‘ Local AI model ready for analysis (with 30s timeout)")
            return True
        except Exception as e:
            print(f"⚠️ Ollama setup failed: {e}")
            print("πŸ’‘ Make sure Ollama is running: ollama serve")
            return False

    def _try_openai(self):
        """Try the OpenAI backend; return True when ``self.agent`` was set."""
        try:
            print("πŸ”„ Checking for OpenAI API key...")
            from smolagents import OpenAIModel
            if os.getenv('OPENAI_API_KEY'):
                model = OpenAIModel(model_id="gpt-3.5-turbo")
                self.agent = CodeAgent(
                    tools=[DuckDuckGoSearchTool()],
                    model=model
                )
                print("βœ… SmoLagent configured successfully with OpenAI!")
                return True
            print("⚠️ OpenAI API key not found")
        except Exception as e:
            print(f"⚠️ OpenAI setup failed: {e}")
        return False

    def _try_transformers(self):
        """Try the HuggingFace Transformers backend; return True when ``self.agent`` was set."""
        try:
            print("πŸ”„ Attempting HuggingFace Transformers model...")
            from smolagents import TransformersModel
            model = TransformersModel(model_id="microsoft/DialoGPT-small")  # Smaller model
            self.agent = CodeAgent(
                tools=[DuckDuckGoSearchTool()],
                model=model
            )
            print("βœ… SmoLagent configured successfully with HuggingFace model!")
            print("πŸ’‘ Note: First use may take time to download model")
            return True
        except Exception as e:
            print(f"⚠️ HuggingFace setup failed: {e}")
            print(" Make sure transformers are installed: pip install 'smolagents[transformers]'")
            return False

    def configure_model_helper(self):
        """Print setup guidance for a backend chosen interactively.

        Returns:
            The raw menu choice the user typed (whitespace stripped).
        """
        print("\nπŸ€– AI Model Configuration Helper")
        print("=" * 40)
        print("1. OpenAI (Recommended - Most capable)")
        print("2. Ollama (Free - Runs locally)")
        print("3. Hugging Face (Free - API based)")
        print("4. Skip AI features")
        choice = input("Choose your model (1-4): ").strip()
        # Guidance text per menu entry; an unknown choice prints only the tip below.
        guidance = {
            "1": (
                "\nπŸ“ OpenAI Setup:",
                "1. Get API key from: https://platform.openai.com/",
                "2. Set environment variable: OPENAI_API_KEY=your_key",
                "3. Or edit the setup_agent() method with your key",
            ),
            "2": (
                "\nπŸ“ Ollama Setup:",
                "1. Install Ollama from: https://ollama.ai/",
                "2. Run: ollama pull llama2",
                "3. Start server: ollama serve",
                "4. Script is already configured to use SmoLagents' native OllamaModel",
                "5. Just make sure Ollama is running and try the AI analysis!",
            ),
            "3": (
                "\nπŸ“ Hugging Face Setup:",
                "1. Create account at: https://huggingface.co/",
                "2. Get token from: https://huggingface.co/settings/tokens",
                "3. Set environment variable: HF_TOKEN=your_token",
                "4. Uncomment HF lines in setup_agent() method",
            ),
            "4": (
                "βœ… You can still use all non-AI features!",
            ),
        }
        for line in guidance.get(choice, ()):
            print(line)
        print("\nπŸ’‘ Tip: Set environment variables in your system or use a .env file")
        return choice

    def load_data(self):
        """Read ``self.csv_path`` into ``self.df`` and print an overview.

        Returns:
            The loaded DataFrame, or ``None`` when the file is missing or
            cannot be read (the error is printed, not raised).
        """
        print(f"\nπŸ“ Loading data from: {self.csv_path}")
        try:
            # Check if file exists
            if not os.path.exists(self.csv_path):
                print(f"❌ Error: File not found at {self.csv_path}")
                print("πŸ’‘ Use option 7 to change the file path")
                return None
            # Read the CSV file into a DataFrame
            self.df = pd.read_csv(self.csv_path)
            print("=== DATA LOADED SUCCESSFULLY ===")
            print(f"πŸ“ File: {os.path.basename(self.csv_path)}")
            print(f"πŸ“Š Dataset shape: {self.df.shape}")
            print(f"πŸ“‹ Columns: {list(self.df.columns)}")
            print("\n=== FIRST 5 ROWS ===")
            print(self.df.head())
            print("\n=== DATA TYPES ===")
            print(self.df.dtypes)
            print("\n=== MISSING VALUES ===")
            print(self.df.isnull().sum())
            print("\n=== BASIC STATISTICS ===")
            print(self.df.describe())
            return self.df
        except Exception as e:
            print(f"Error loading data: {str(e)}")
            return None

    def create_visualizations(self):
        """Plot histograms and a correlation heatmap for the numeric columns.

        Saves ``data_distributions.png`` and, when there is more than one
        numeric column, ``correlation_heatmap.png`` in the working directory.
        Errors are printed rather than raised.
        """
        if self.df is None:
            print("❌ No data loaded. Run load_data() first.")
            return
        try:
            # Set up plotting style
            plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'default')
            # Get numeric columns
            numeric_cols = self.df.select_dtypes(include=[np.number]).columns
            if len(numeric_cols) == 0:
                print("⚠️ No numeric columns found for visualization")
                return
            print(f"\n=== CREATING VISUALIZATIONS FOR {len(numeric_cols)} NUMERIC COLUMNS ===")
            # 1. Distribution plots, laid out on a grid of at most three columns
            n_cols = min(3, len(numeric_cols))
            n_rows = (len(numeric_cols) + n_cols - 1) // n_cols  # ceiling division
            dist_fig = plt.figure(figsize=(15, 5*n_rows))
            for i, col in enumerate(numeric_cols):
                plt.subplot(n_rows, n_cols, i+1)
                self.df[col].hist(bins=30, alpha=0.7, edgecolor='black')
                plt.title(f'Distribution of {col}')
                plt.xlabel(col)
                plt.ylabel('Frequency')
            plt.tight_layout()
            plt.savefig('data_distributions.png', dpi=300, bbox_inches='tight')
            plt.show()
            # Release the figure so repeated menu use does not accumulate open figures.
            plt.close(dist_fig)
            print("βœ… Distribution plots saved as 'data_distributions.png'")
            # 2. Correlation heatmap (if more than 1 numeric column)
            if len(numeric_cols) > 1:
                heatmap_fig = plt.figure(figsize=(12, 8))
                correlation_matrix = self.df[numeric_cols].corr()
                sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
                            square=True, linewidths=0.5)
                plt.title('Correlation Heatmap')
                plt.tight_layout()
                plt.savefig('correlation_heatmap.png', dpi=300, bbox_inches='tight')
                plt.show()
                plt.close(heatmap_fig)
                print("βœ… Correlation heatmap saved as 'correlation_heatmap.png'")
        except Exception as e:
            print(f"❌ Error creating visualizations: {e}")

    def analyze_data_quality(self):
        """Print missing-value, duplicate-row and memory statistics.

        Returns:
            A DataFrame with one row per column (``Column``, ``Missing_Count``,
            ``Missing_Percentage``, ``Data_Type``), or ``None`` when no data
            is loaded.
        """
        if self.df is None:
            print("❌ No data loaded. Run load_data() first.")
            return
        print("\n=== DATA QUALITY ANALYSIS ===")
        # Missing data analysis
        missing_data = self.df.isnull().sum()
        missing_percentage = (missing_data / len(self.df)) * 100
        quality_report = pd.DataFrame({
            'Column': self.df.columns,
            'Missing_Count': missing_data.values,
            'Missing_Percentage': missing_percentage.values,
            'Data_Type': self.df.dtypes.values
        })
        print("Missing Data Summary:")
        print(quality_report[quality_report['Missing_Count'] > 0])
        # Duplicate rows
        duplicates = self.df.duplicated().sum()
        print(f"\nDuplicate rows: {duplicates}")
        # Memory usage, converted from bytes to MB
        memory_usage = self.df.memory_usage(deep=True).sum() / 1024**2
        print(f"Memory usage: {memory_usage:.2f} MB")
        return quality_report

    def ai_analysis(self, query):
        """Send a textual summary of the dataset plus *query* to the agent.

        Args:
            query: The user's question about the data.

        Returns:
            The agent's response, or ``None`` when the agent or data is
            missing or the agent call fails (the error is printed).
        """
        print("\nπŸ” Checking prerequisites for AI analysis...")
        if self.agent is None:
            print("❌ AI agent not configured. Please set up SmoLagent first.")
            print("πŸ’‘ Try running one of these alternatives:")
            print(" β€’ python fixed_upload.py")
            print(" β€’ python quick_ai_demo.py")
            return
        if self.df is None:
            print("❌ No data loaded. Please load data first!")
            print("πŸ’‘ Choose option 1 in the main menu to load your data.")
            return
        print("βœ… Data loaded successfully")
        print("βœ… AI agent configured")
        print(f"βœ… Processing query: '{query}'")
        try:
            # Prepare context about the dataset. Only this text reaches the
            # agent; the DataFrame object itself is not passed.
            data_context = (
                "\n"
                "Dataset Analysis Request:\n"
                f"- Dataset Shape: {self.df.shape}\n"
                f"- Columns: {list(self.df.columns)}\n"
                f"- Data Types: {dict(self.df.dtypes)}\n"
                f"- Missing Values: {dict(self.df.isnull().sum())}\n"
                "Sample Data:\n"
                f"{self.df.head(3).to_string()}\n"
                "Statistical Summary:\n"
                f"{self.df.describe().to_string()}\n"
                f"User Question: {query}\n"
            )
            print("\nπŸ€– SmoLagent is analyzing your data...")
            print("⏳ This may take 5-15 seconds...")
            # Use the agent with the data context and query
            response = self.agent.run(data_context)
            print("\n" + "="*60)
            print("βœ… AI ANALYSIS COMPLETE")
            print("="*60)
            print(response)
            print("="*60)
            return response
        except Exception as e:
            print(f"\n❌ AI analysis failed: {e}")
            print("\nπŸ’‘ Troubleshooting suggestions:")
            print(" β€’ Check your internet connection")
            print(" β€’ Try: python fixed_upload.py")
            print(" β€’ Use basic analysis features (options 2-3)")
            return None

    def check_status(self):
        """Print whether the CSV file exists, data is loaded and the agent is set."""
        print("\nπŸ” SYSTEM STATUS CHECK")
        print("="*40)
        # Check file path
        print(f"πŸ“ CSV File: {self.csv_path}")
        if os.path.exists(self.csv_path):
            print(f"βœ… File exists: {os.path.basename(self.csv_path)}")
        else:
            print("❌ File not found")
        # Check data status
        if self.df is not None:
            print(f"βœ… Data loaded: {self.df.shape[0]} rows, {self.df.shape[1]} columns")
            print(f"πŸ“‹ Columns: {list(self.df.columns)}")
        else:
            print("❌ No data loaded")
        # Check AI agent status
        if self.agent is not None:
            print("βœ… AI agent configured and ready")
        else:
            print("❌ AI agent not configured")
        print("="*40)

    def change_csv_file(self, new_path=None):
        """Point the explorer at a different CSV file and drop any loaded data.

        Args:
            new_path: New file path. When ``None`` the user is prompted for one.

        The path is accepted only when it names an existing regular file;
        otherwise the current path and loaded data are left untouched.
        """
        if new_path is None:
            print(f"\nπŸ“ Current file path: {self.csv_path}")
            new_path = input("Enter new CSV file path: ").strip()
        # isfile (not exists): a directory must not replace a usable path.
        if os.path.isfile(new_path):
            self.csv_path = new_path
            self.df = None  # Clear current data
            print(f"βœ… CSV file path updated to: {self.csv_path}")
            print("πŸ’‘ Data cleared. Use option 1 to load the new file.")
        else:
            print(f"❌ File not found: {new_path}")
            print("πŸ’‘ Please check the file path and try again.")

    def interactive_menu(self):
        """Run the menu loop until the user exits.

        Exits cleanly on option 8, on end-of-input (``EOFError``) and on
        Ctrl+C (``KeyboardInterrupt``) instead of ending with a traceback.
        """
        # Show initial status
        self.check_status()
        while True:
            try:
                keep_running = self._menu_once()
            except (EOFError, KeyboardInterrupt):
                print("\nπŸ‘‹ Goodbye!")
                break
            if not keep_running:
                break

    def _menu_once(self):
        """Show the menu, run one selected action; return False when the user exits."""
        print("\n" + "="*50)
        print("πŸ€– ENHANCED DATA EXPLORER WITH AI")
        print("="*50)
        print("1. Load and explore data")
        print("2. Create visualizations")
        print("3. Analyze data quality")
        print("4. AI-powered analysis")
        print("5. Show data summary")
        print("6. Check system status")
        print("7. Change CSV file path")
        print("8. Exit")
        print("="*50)
        print(f"πŸ“ Current file: {os.path.basename(self.csv_path)}")
        choice = input("Enter your choice (1-8): ").strip()
        if choice == '1':
            self.load_data()
        elif choice == '2':
            self.create_visualizations()
        elif choice == '3':
            self.analyze_data_quality()
        elif choice == '4':
            self._run_ai_option()
        elif choice == '5':
            self._show_summary()
        elif choice == '6':
            self.check_status()
        elif choice == '7':
            self.change_csv_file()
        elif choice == '8':
            print("πŸ‘‹ Goodbye!")
            return False
        else:
            print("❌ Invalid choice. Please try again.")
        return True

    def _show_summary(self):
        """Print shape, column names and memory footprint of the loaded data."""
        if self.df is not None:
            print("\nπŸ“Š Dataset Summary:")
            print(f"Shape: {self.df.shape}")
            print(f"Columns: {list(self.df.columns)}")
            print(f"Memory: {self.df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
        else:
            print("❌ No data loaded.")

    def _run_ai_option(self):
        """Handle menu option 4: set up the agent on demand, then ask one question."""
        if self.df is None:
            print("\n❌ No data loaded. Please load data first!")
            print("πŸ’‘ Choose option 1 to load your data before using AI analysis.")
            input("\nPress Enter to continue...")
            return
        # Setup AI on demand if not already done
        if self.agent is None:
            print("\nπŸ€– Setting up AI for first use...")
            self.setup_agent()
        if self.agent is None:
            print("\n❌ AI features not available. Please configure a model first.")
            print("Edit the setup_agent() method to add your API keys.")
            self.configure_model_helper()
            return
        print("\nπŸ€– AI Analysis - Ask me anything about your data!")
        print("Example queries:")
        print(" β€’ 'What are the main trends in this data?'")
        print(" β€’ 'Find any outliers or anomalies'")
        print(" β€’ 'Suggest data quality improvements'")
        print(" β€’ 'Perform correlation analysis'")
        print(" β€’ 'Identify seasonal patterns'")
        print(" β€’ 'Recommend preprocessing steps'")
        query = input("\nπŸ’¬ Your question: ").strip()
        if query:
            self.ai_analysis(query)
            # Wait for user to read the results before returning to menu
            input("\nπŸ“‹ Press Enter to return to main menu...")
        else:
            print("❌ No question entered.")
            input("\nPress Enter to continue...")


class _OllamaModel:
    """Callable adapter that forwards prompts to a local Ollama server.

    Failures are reported by returning a string starting with ``"Error"``
    rather than by raising.

    NOTE(review): this returns a plain ``str``; CodeAgent may expect a
    ChatMessage-like object from its model — confirm against the installed
    smolagents version.
    """

    def __init__(self, model_name="llama2", timeout=30):
        """Store the model name and the per-request timeout in seconds."""
        # Imported lazily so the module still works without the ollama package.
        import ollama
        self.model_name = model_name
        self.timeout = timeout
        self.ollama = ollama

    @staticmethod
    def _content_to_text(content):
        """Flatten one message ``content`` value (string or list of parts) to text."""
        if isinstance(content, list):
            # Joining non-string parts directly would raise TypeError in str.join.
            return "\n".join(
                str(part.get('text', '')) if isinstance(part, dict) else str(part)
                for part in content
            )
        return str(content)

    @classmethod
    def _build_prompt(cls, messages):
        """Convert a string, a list of messages, or any other object to one prompt."""
        if isinstance(messages, str):
            return messages
        if isinstance(messages, list):
            return "\n".join(
                cls._content_to_text(msg.get('content', msg)) if isinstance(msg, dict) else str(msg)
                for msg in messages
            )
        return str(messages)

    def __call__(self, messages, **kwargs):
        """Generate a response for *messages*, giving up after ``self.timeout`` seconds.

        Extra keyword arguments are accepted and ignored.
        """
        import threading
        try:
            prompt = self._build_prompt(messages)
            result = {'response': None, 'error': None}

            def generate_with_timeout():
                try:
                    response = self.ollama.generate(model=self.model_name, prompt=prompt)
                    result['response'] = response['response']
                except Exception as e:
                    result['error'] = str(e)

            # A daemon worker thread provides the timeout; a stuck request is
            # abandoned rather than cancelled and cannot block interpreter exit.
            worker = threading.Thread(target=generate_with_timeout, daemon=True)
            worker.start()
            worker.join(timeout=self.timeout)
            if worker.is_alive():
                return f"Error: Ollama response timed out after {self.timeout} seconds. Try a simpler query."
            if result['error']:
                return f"Error generating response with Ollama: {result['error']}"
            if result['response']:
                return result['response']
            return "Error: No response received from Ollama"
        except Exception as e:
            return f"Error generating response with Ollama: {e}"

    def generate(self, messages, **kwargs):
        """Alternative method name that might be expected; same as calling the instance."""
        return self.__call__(messages, **kwargs)
def load_and_explore_data():
    """Read the CSV at the module-level ``csv_file_path`` and print an overview.

    Returns:
        The loaded DataFrame, or ``None`` when the file does not exist or
        cannot be read (the error is printed, not raised).
    """
    source_path = csv_file_path
    print(f"\nπŸ“ Loading data from: {source_path}")
    try:
        # Guard clause: nothing to read when the path does not exist.
        if not os.path.exists(source_path):
            print(f"❌ Error: File not found at {source_path}")
            print("πŸ’‘ Update the csv_file_path variable at the top of this file")
            return None

        frame = pd.read_csv(source_path)

        print("=== DATA LOADED SUCCESSFULLY ===")
        print(f"πŸ“ File: {os.path.basename(source_path)}")
        print(f"πŸ“Š Dataset shape: {frame.shape}")
        print(f"πŸ“‹ Columns: {list(frame.columns)}")

        # Each report is a heading followed by a value computed just before printing.
        reports = (
            ("\n=== FIRST 5 ROWS ===", lambda: frame.head()),
            ("\n=== DATA TYPES ===", lambda: frame.dtypes),
            ("\n=== MISSING VALUES ===", lambda: frame.isnull().sum()),
            ("\n=== BASIC STATISTICS ===", lambda: frame.describe()),
        )
        for heading, compute in reports:
            print(heading)
            print(compute())
        return frame
    except Exception as err:
        print(f"Error loading data: {err!s}")
        return None
if __name__ == "__main__":
    # Script entry point: show the banner, then run the mode the user picks.
    for banner_line in (
        "πŸš€ Enhanced Data Explorer with SmoLagent AI",
        "Choose your preferred mode:",
        "1. Original function (load_and_explore_data)",
        "2. Enhanced interactive mode with AI",
    ):
        print(banner_line)
    mode = input("Enter mode (1 or 2): ").strip()
    if mode == "2":
        # Enhanced mode with AI capabilities
        explorer = EnhancedDataExplorer()
        explorer.interactive_menu()
    else:
        # Mode "1" and any unrecognised input both run the original function;
        # only the unrecognised case prints the notice first.
        if mode != "1":
            print("Invalid choice. Running original function...")
        df = load_and_explore_data()