Spaces:

NTU-Peak-2
/

Singtel_Use_Case1

Runtime error

Singtel_Use_Case1 / upload.py

cosmoruler

stuck already

db6dcad about 1 month ago

24.1 kB

	import pandas as pd
	import os
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	from smolagents import CodeAgent, DuckDuckGoSearchTool
	import warnings
	warnings.filterwarnings('ignore')

	# Default CSV file used by this script. The value below is a hard-coded
	# absolute Windows path; edit it to point at your own CSV file.
	# set_csv_file_path() rebinds this module-level name at runtime.
	csv_file_path = "C:/Users/Cosmo/Desktop/NTU Peak Singtel/outsystems_sample_logs_6months.csv"

	def set_csv_file_path(new_path):
	    """Rebind the module-level ``csv_file_path`` and report the new value.

	    Args:
	        new_path: Path stored as the new default CSV location. It is not
	            validated here.
	    """
	    global csv_file_path
	    csv_file_path = new_path
	    confirmation = f"✅ CSV file path updated to: {csv_file_path}"
	    print(confirmation)

	def get_csv_file_path():
	    """Return the value currently bound to the module-level ``csv_file_path``."""
	    current_path = csv_file_path
	    return current_path

	class EnhancedDataExplorer:
	    """Interactive CSV explorer with optional SmoLagent AI analysis.

	    Loads a CSV file into a pandas DataFrame, prints summaries, saves basic
	    plots and, once a model backend could be configured, forwards free-form
	    questions about the data to a smolagents ``CodeAgent``.
	    """

	    # Setup instructions printed by configure_model_helper(), keyed by the
	    # menu choice the user typed.
	    _SETUP_GUIDES = {
	        "1": (
	            "\n📝 OpenAI Setup:",
	            "1. Get API key from: https://platform.openai.com/",
	            "2. Set environment variable: OPENAI_API_KEY=your_key",
	            "3. Or edit the setup_agent() method with your key",
	        ),
	        "2": (
	            "\n📝 Ollama Setup:",
	            "1. Install Ollama from: https://ollama.ai/",
	            "2. Run: ollama pull llama2",
	            "3. Start server: ollama serve",
	            "4. Script is already configured to use SmoLagents' native OllamaModel",
	            "5. Just make sure Ollama is running and try the AI analysis!",
	        ),
	        "3": (
	            "\n📝 Hugging Face Setup:",
	            "1. Create account at: https://huggingface.co/",
	            "2. Get token from: https://huggingface.co/settings/tokens",
	            "3. Set environment variable: HF_TOKEN=your_token",
	            "4. Uncomment HF lines in setup_agent() method",
	        ),
	        "4": (
	            "✅ You can still use all non-AI features!",
	        ),
	    }

	    class _OllamaModel:
	        """Callable adapter that lets a local Ollama model back a CodeAgent."""

	        def __init__(self, model_name="llama2", timeout=30, client=None):
	            """Remember which model to query and through which client.

	            Args:
	                model_name: Name of the Ollama model to query.
	                timeout: Seconds to wait for one generation before giving up.
	                client: Object exposing ``generate(model=..., prompt=...)``.
	                    The ``ollama`` package is imported when omitted.
	            """
	            if client is None:
	                import ollama as client
	            self.model_name = model_name
	            self.timeout = timeout
	            self.ollama = client

	        @staticmethod
	        def _message_text(message):
	            """Return the text carried by a single chat message."""
	            if isinstance(message, dict):
	                content = message.get('content', message)
	            else:
	                # NOTE(review): newer smolagents releases appear to pass
	                # message objects exposing `.content` rather than dicts --
	                # confirm against the installed version. Plain strings have
	                # no such attribute and fall through unchanged.
	                content = getattr(message, 'content', message)
	            if content is None:
	                return ""
	            if isinstance(content, list):
	                # Multi-part content: keep the textual parts only. Joining
	                # the raw list would raise TypeError.
	                texts = [
	                    str(part.get('text') or '') if isinstance(part, dict) else str(part)
	                    for part in content
	                ]
	                return "\n".join(text for text in texts if text)
	            return str(content)

	        @classmethod
	        def _to_prompt(cls, messages):
	            """Flatten a string, a message list or any other object into one prompt."""
	            if isinstance(messages, str):
	                return messages
	            if isinstance(messages, (list, tuple)):
	                return "\n".join(cls._message_text(message) for message in messages)
	            return str(messages)

	        @staticmethod
	        def _as_chat_message(text):
	            """Wrap *text* in a smolagents ChatMessage when that class is importable.

	            NOTE(review): current smolagents agents appear to read
	            ``.content`` from the model's return value, so a bare string
	            fails there -- confirm against the installed version. The bare
	            string is returned when ChatMessage cannot be imported.
	            """
	            try:
	                from smolagents.models import ChatMessage
	            except ImportError:
	                return text
	            return ChatMessage(role="assistant", content=text)

	        def _generate_with_timeout(self, request):
	            """Run one Ollama generation and return its text or an error string."""
	            # A worker thread is used instead of signal alarms so that the
	            # timeout also works on Windows.
	            import threading

	            outcome = {'response': None, 'error': None}

	            def worker():
	                try:
	                    outcome['response'] = self.ollama.generate(**request)['response']
	                except Exception as e:
	                    outcome['error'] = str(e)

	            thread = threading.Thread(target=worker, daemon=True)
	            thread.start()
	            thread.join(timeout=self.timeout)

	            if thread.is_alive():
	                return (
	                    f"Error: Ollama response timed out after {self.timeout} seconds. "
	                    "Try a simpler query."
	                )
	            if outcome['error']:
	                return f"Error generating response with Ollama: {outcome['error']}"
	            if outcome['response']:
	                return outcome['response']
	            return "Error: No response received from Ollama"

	        def __call__(self, messages, stop_sequences=None, **kwargs):
	            """Generate a reply for *messages*.

	            Args:
	                messages: A prompt string, a list of chat messages, or any
	                    object (converted with ``str``).
	                stop_sequences: Optional strings forwarded to Ollama as the
	                    ``stop`` generation option.
	                **kwargs: Accepted for call compatibility and ignored.

	            Returns:
	                The generated text, or a string starting with ``Error`` when
	                generation failed; wrapped in a ChatMessage when available.
	            """
	            try:
	                request = {
	                    'model': self.model_name,
	                    'prompt': self._to_prompt(messages),
	                }
	                if stop_sequences:
	                    request['options'] = {'stop': list(stop_sequences)}
	                text = self._generate_with_timeout(request)
	            except Exception as e:
	                text = f"Error generating response with Ollama: {e}"
	            return self._as_chat_message(text)

	        def generate(self, messages, **kwargs):
	            """Alias of ``__call__`` for callers that use a ``generate`` method."""
	            return self.__call__(messages, **kwargs)

	    def __init__(self, csv_path=None):
	        """Create an explorer without loading data or configuring AI.

	        Args:
	            csv_path: Path of the CSV file to explore. When omitted, the
	                module-level ``csv_file_path`` is read at call time.
	        """
	        # A `csv_path=csv_file_path` default would be frozen when the class
	        # is defined, so later set_csv_file_path() calls would be ignored.
	        self.csv_path = csv_file_path if csv_path is None else csv_path
	        self.df = None      # DataFrame set by load_data()
	        self.agent = None   # CodeAgent set by setup_agent()
	        print("🚀 Enhanced Data Explorer initialized!")
	        print("💡 AI setup will be done when first needed (option 4)")
	        # setup_agent() is deliberately not called here to avoid hanging.

	    def _use_model(self, model):
	        """Build a CodeAgent around *model* and store it on ``self.agent``."""
	        self.agent = CodeAgent(
	            tools=[DuckDuckGoSearchTool()],
	            model=model
	        )

	    def _try_ollama(self):
	        """Configure the agent with a local Ollama model; return True on success."""
	        try:
	            print("🔄 Attempting Ollama setup...")
	            import ollama

	            # Listing models is a quick availability check that needs no generation.
	            listing = ollama.list()
	            installed = listing['models'] if listing and 'models' in listing else []
	            if not installed:
	                raise RuntimeError("No models found")

	            # Entries expose the model name under 'model' or 'name'
	            # depending on the ollama client version.
	            names = [
	                name
	                for name in ((entry.get('model') or entry.get('name')) for entry in installed)
	                if name
	            ]
	            # Prefer llama2 when installed, otherwise use whatever is
	            # actually available instead of a model that may not exist.
	            chosen = next(
	                (name for name in names if name.split(':')[0] == 'llama2'),
	                names[0] if names else 'llama2',
	            )
	            print("✅ Ollama is running and accessible!")
	            print(f"📦 Found model: {chosen}")

	            self._use_model(self._OllamaModel(chosen, client=ollama))
	            print("✅ SmoLagent configured successfully with Ollama!")
	            print("💡 Local AI model ready for analysis (with 30s timeout)")
	            return True
	        except Exception as e:
	            print(f"⚠️ Ollama setup failed: {e}")
	            print("💡 Make sure Ollama is running: ollama serve")
	            return False

	    def _try_openai(self):
	        """Configure the agent with OpenAI when OPENAI_API_KEY is set; return True on success."""
	        try:
	            print("🔄 Checking for OpenAI API key...")
	            from smolagents import OpenAIModel
	            if not os.getenv('OPENAI_API_KEY'):
	                print("⚠️ OpenAI API key not found")
	                return False
	            self._use_model(OpenAIModel(model_id="gpt-3.5-turbo"))
	            print("✅ SmoLagent configured successfully with OpenAI!")
	            return True
	        except Exception as e:
	            print(f"⚠️ OpenAI setup failed: {e}")
	            return False

	    def _try_transformers(self):
	        """Configure the agent with a small HuggingFace model; return True on success."""
	        try:
	            print("🔄 Attempting HuggingFace Transformers model...")
	            from smolagents import TransformersModel
	            self._use_model(TransformersModel(model_id="microsoft/DialoGPT-small"))
	            print("✅ SmoLagent configured successfully with HuggingFace model!")
	            print("💡 Note: First use may take time to download model")
	            return True
	        except Exception as e:
	            print(f"⚠️ HuggingFace setup failed: {e}")
	            print(" Make sure transformers are installed: pip install 'smolagents[transformers]'")
	            return False

	    def setup_agent(self):
	        """Configure ``self.agent`` with the first model backend that works.

	        Backends are tried in order: local Ollama, OpenAI (requires the
	        OPENAI_API_KEY environment variable), then a HuggingFace
	        Transformers model. When none works, ``self.agent`` is set to None
	        and setup guidance is printed.
	        """
	        print("🤖 Setting up SmoLagent AI agent...")
	        print("🔄 Trying multiple model configurations...")

	        try:
	            backends = (self._try_ollama, self._try_openai, self._try_transformers)
	            for attempt in backends:
	                if attempt():
	                    return

	            print("\n❌ No AI model could be configured.")
	            print("📋 To fix this:")
	            print(" 1. For local AI: Install Ollama and run 'ollama serve'")
	            print(" 2. For OpenAI: Set OPENAI_API_KEY environment variable")
	            print(" 3. For basic use: pip install 'smolagents[transformers]'")
	            print("\n✅ You can still use all non-AI features!")
	            self.agent = None

	        except Exception as e:
	            print(f"⚠️ Agent setup failed: {e}")
	            print("💡 Try using: python fixed_upload.py")
	            self.agent = None

	    def configure_model_helper(self):
	        """Print setup instructions for the backend the user picks.

	        Returns:
	            The stripped text the user typed at the prompt.
	        """
	        print("\n🤖 AI Model Configuration Helper")
	        print("=" * 40)
	        print("1. OpenAI (Recommended - Most capable)")
	        print("2. Ollama (Free - Runs locally)")
	        print("3. Hugging Face (Free - API based)")
	        print("4. Skip AI features")

	        choice = input("Choose your model (1-4): ").strip()

	        # Unknown choices print no guide, only the closing tip.
	        for line in self._SETUP_GUIDES.get(choice, ()):
	            print(line)

	        print("\n💡 Tip: Set environment variables in your system or use a .env file")
	        return choice

	    def load_data(self):
	        """Read ``self.csv_path`` into ``self.df`` and print an overview.

	        Returns:
	            The loaded DataFrame, or None when the file is missing or
	            cannot be read.
	        """
	        print(f"\n📁 Loading data from: {self.csv_path}")

	        try:
	            if not os.path.exists(self.csv_path):
	                print(f"❌ Error: File not found at {self.csv_path}")
	                print("💡 Use option 7 to change the file path")
	                return None

	            self.df = pd.read_csv(self.csv_path)

	            print("=== DATA LOADED SUCCESSFULLY ===")
	            print(f"📁 File: {os.path.basename(self.csv_path)}")
	            print(f"📊 Dataset shape: {self.df.shape}")
	            print(f"📋 Columns: {list(self.df.columns)}")

	            sections = (
	                ("\n=== FIRST 5 ROWS ===", self.df.head),
	                ("\n=== DATA TYPES ===", lambda: self.df.dtypes),
	                ("\n=== MISSING VALUES ===", lambda: self.df.isnull().sum()),
	                ("\n=== BASIC STATISTICS ===", self.df.describe),
	            )
	            for heading, render in sections:
	                print(heading)
	                print(render())

	            return self.df

	        except Exception as e:
	            print(f"Error loading data: {str(e)}")
	            return None

	    def _plot_distributions(self, numeric_cols):
	        """Save and show one histogram per numeric column."""
	        n_cols = min(3, len(numeric_cols))
	        n_rows = (len(numeric_cols) + n_cols - 1) // n_cols  # ceiling division

	        fig = plt.figure(figsize=(15, 5 * n_rows))
	        try:
	            for position, col in enumerate(numeric_cols, start=1):
	                plt.subplot(n_rows, n_cols, position)
	                self.df[col].hist(bins=30, alpha=0.7, edgecolor='black')
	                plt.title(f'Distribution of {col}')
	                plt.xlabel(col)
	                plt.ylabel('Frequency')

	            plt.tight_layout()
	            plt.savefig('data_distributions.png', dpi=300, bbox_inches='tight')
	            plt.show()
	        finally:
	            # Release the figure so repeated menu calls do not accumulate them.
	            plt.close(fig)
	        print("✅ Distribution plots saved as 'data_distributions.png'")

	    def _plot_correlation(self, numeric_cols):
	        """Save and show a correlation heatmap of the numeric columns."""
	        fig = plt.figure(figsize=(12, 8))
	        try:
	            correlation_matrix = self.df[numeric_cols].corr()
	            sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
	                        square=True, linewidths=0.5)
	            plt.title('Correlation Heatmap')
	            plt.tight_layout()
	            plt.savefig('correlation_heatmap.png', dpi=300, bbox_inches='tight')
	            plt.show()
	        finally:
	            plt.close(fig)
	        print("✅ Correlation heatmap saved as 'correlation_heatmap.png'")

	    def create_visualizations(self):
	        """Plot distributions and, with two or more numeric columns, correlations.

	        Images are written to 'data_distributions.png' and
	        'correlation_heatmap.png' in the current directory. Plotting errors
	        are reported on stdout rather than raised.
	        """
	        if self.df is None:
	            print("❌ No data loaded. Run load_data() first.")
	            return

	        try:
	            plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'default')

	            numeric_cols = self.df.select_dtypes(include=[np.number]).columns
	            if len(numeric_cols) == 0:
	                print("⚠️ No numeric columns found for visualization")
	                return

	            print(f"\n=== CREATING VISUALIZATIONS FOR {len(numeric_cols)} NUMERIC COLUMNS ===")

	            self._plot_distributions(numeric_cols)
	            if len(numeric_cols) > 1:
	                self._plot_correlation(numeric_cols)

	        except Exception as e:
	            print(f"❌ Error creating visualizations: {e}")

	    def analyze_data_quality(self):
	        """Print missing-value, duplicate and memory statistics.

	        Returns:
	            A DataFrame with one row per column (Column, Missing_Count,
	            Missing_Percentage, Data_Type), or None when no data is loaded.
	        """
	        if self.df is None:
	            print("❌ No data loaded. Run load_data() first.")
	            return

	        print("\n=== DATA QUALITY ANALYSIS ===")

	        missing_data = self.df.isnull().sum()
	        missing_percentage = (missing_data / len(self.df)) * 100

	        quality_report = pd.DataFrame({
	            'Column': self.df.columns,
	            'Missing_Count': missing_data.values,
	            'Missing_Percentage': missing_percentage.values,
	            'Data_Type': self.df.dtypes.values
	        })

	        print("Missing Data Summary:")
	        print(quality_report[quality_report['Missing_Count'] > 0])

	        duplicates = self.df.duplicated().sum()
	        print(f"\nDuplicate rows: {duplicates}")

	        memory_usage = self.df.memory_usage(deep=True).sum() / 1024**2
	        print(f"Memory usage: {memory_usage:.2f} MB")

	        return quality_report

	    def _build_context(self, query):
	        """Return the prompt describing the loaded dataset followed by *query*."""
	        frame = self.df
	        return "\n".join([
	            "",
	            "Dataset Analysis Request:",
	            f"- Dataset Shape: {frame.shape}",
	            f"- Columns: {list(frame.columns)}",
	            f"- Data Types: {dict(frame.dtypes)}",
	            f"- Missing Values: {dict(frame.isnull().sum())}",
	            "",
	            "Sample Data:",
	            frame.head(3).to_string(),
	            "",
	            "Statistical Summary:",
	            frame.describe().to_string(),
	            "",
	            f"User Question: {query}",
	            "",
	        ])

	    def ai_analysis(self, query):
	        """Ask the configured agent a question about the loaded data.

	        Args:
	            query: Free-form question from the user.

	        Returns:
	            The agent's response, or None when the agent or the data is
	            missing or the agent raised an error.
	        """
	        print(f"\n🔍 Checking prerequisites for AI analysis...")

	        if self.agent is None:
	            print("❌ AI agent not configured. Please set up SmoLagent first.")
	            print("💡 Try running one of these alternatives:")
	            print(" • python fixed_upload.py")
	            print(" • python quick_ai_demo.py")
	            return

	        if self.df is None:
	            print("❌ No data loaded. Please load data first!")
	            print("💡 Choose option 1 in the main menu to load your data.")
	            return

	        print("✅ Data loaded successfully")
	        print("✅ AI agent configured")
	        print(f"✅ Processing query: '{query}'")

	        try:
	            data_context = self._build_context(query)

	            print(f"\n🤖 SmoLagent is analyzing your data...")
	            print("⏳ This may take 5-15 seconds...")

	            response = self.agent.run(data_context)

	            print("\n" + "="*60)
	            print("✅ AI ANALYSIS COMPLETE")
	            print("="*60)
	            print(response)
	            print("="*60)
	            return response

	        except Exception as e:
	            print(f"\n❌ AI analysis failed: {e}")
	            print("\n💡 Troubleshooting suggestions:")
	            print(" • Check your internet connection")
	            print(" • Try: python fixed_upload.py")
	            print(" • Use basic analysis features (options 2-3)")
	            return None

	    def check_status(self):
	        """Print whether the CSV file exists, data is loaded and AI is configured."""
	        print("\n🔍 SYSTEM STATUS CHECK")
	        print("="*40)

	        print(f"📁 CSV File: {self.csv_path}")
	        if os.path.exists(self.csv_path):
	            print(f"✅ File exists: {os.path.basename(self.csv_path)}")
	        else:
	            print(f"❌ File not found")

	        if self.df is not None:
	            print(f"✅ Data loaded: {self.df.shape[0]} rows, {self.df.shape[1]} columns")
	            print(f"📋 Columns: {list(self.df.columns)}")
	        else:
	            print("❌ No data loaded")

	        if self.agent is not None:
	            print("✅ AI agent configured and ready")
	        else:
	            print("❌ AI agent not configured")

	        print("="*40)

	    def change_csv_file(self, new_path=None):
	        """Point the explorer at another CSV file and drop any loaded data.

	        Args:
	            new_path: Path of the new file. When omitted, the user is
	                prompted for it.
	        """
	        if new_path is None:
	            print(f"\n📁 Current file path: {self.csv_path}")
	            # Paths pasted from a file manager are often wrapped in quotes.
	            new_path = input("Enter new CSV file path: ").strip().strip('"\'')

	        # isfile() rather than exists(): a directory cannot be loaded as CSV.
	        if os.path.isfile(new_path):
	            self.csv_path = new_path
	            self.df = None  # Clear data that belonged to the previous file
	            print(f"✅ CSV file path updated to: {self.csv_path}")
	            print("💡 Data cleared. Use option 1 to load the new file.")
	        else:
	            print(f"❌ File not found: {new_path}")
	            print("💡 Please check the file path and try again.")

	    def _show_summary(self):
	        """Print shape, columns and memory footprint of the loaded data."""
	        if self.df is None:
	            print("❌ No data loaded.")
	            return
	        print(f"\n📊 Dataset Summary:")
	        print(f"Shape: {self.df.shape}")
	        print(f"Columns: {list(self.df.columns)}")
	        print(f"Memory: {self.df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

	    def _run_ai_option(self):
	        """Handle menu option 4: set up AI on demand, then ask one question."""
	        if self.df is None:
	            print("\n❌ No data loaded. Please load data first!")
	            print("💡 Choose option 1 to load your data before using AI analysis.")
	            input("\nPress Enter to continue...")
	            return

	        # AI setup is deferred until first use.
	        if self.agent is None:
	            print("\n🤖 Setting up AI for first use...")
	            self.setup_agent()

	        if self.agent is None:
	            print("\n❌ AI features not available. Please configure a model first.")
	            print("Edit the setup_agent() method to add your API keys.")
	            self.configure_model_helper()
	            return

	        print("\n🤖 AI Analysis - Ask me anything about your data!")
	        print("Example queries:")
	        print(" • 'What are the main trends in this data?'")
	        print(" • 'Find any outliers or anomalies'")
	        print(" • 'Suggest data quality improvements'")
	        print(" • 'Perform correlation analysis'")
	        print(" • 'Identify seasonal patterns'")
	        print(" • 'Recommend preprocessing steps'")

	        query = input("\n💬 Your question: ").strip()
	        if query:
	            self.ai_analysis(query)
	            # Let the user read the results before the menu is redrawn.
	            input("\n📋 Press Enter to return to main menu...")
	        else:
	            print("❌ No question entered.")
	            input("\nPress Enter to continue...")

	    def interactive_menu(self):
	        """Run the text menu until the user exits.

	        End-of-input and Ctrl+C end the menu with a goodbye message instead
	        of a traceback.
	        """
	        self.check_status()

	        actions = {
	            '1': self.load_data,
	            '2': self.create_visualizations,
	            '3': self.analyze_data_quality,
	            '4': self._run_ai_option,
	            '5': self._show_summary,
	            '6': self.check_status,
	            '7': self.change_csv_file,
	        }

	        try:
	            while True:
	                print("\n" + "="*50)
	                print("🤖 ENHANCED DATA EXPLORER WITH AI")
	                print("="*50)
	                print("1. Load and explore data")
	                print("2. Create visualizations")
	                print("3. Analyze data quality")
	                print("4. AI-powered analysis")
	                print("5. Show data summary")
	                print("6. Check system status")
	                print("7. Change CSV file path")
	                print("8. Exit")
	                print("="*50)
	                print(f"📁 Current file: {os.path.basename(self.csv_path)}")

	                choice = input("Enter your choice (1-8): ").strip()

	                if choice == '8':
	                    print("👋 Goodbye!")
	                    break

	                action = actions.get(choice)
	                if action is None:
	                    print("❌ Invalid choice. Please try again.")
	                else:
	                    action()
	        except (EOFError, KeyboardInterrupt):
	            # stdin closed (non-interactive run) or the user pressed Ctrl+C.
	            print("\n👋 Goodbye!")

	def load_and_explore_data():
	    """Read the CSV at the module-level ``csv_file_path`` and print an overview.

	    Returns:
	        The loaded DataFrame, or None when the file does not exist or
	        reading it fails (the error is printed, not raised).
	    """
	    print(f"\n📁 Loading data from: {csv_file_path}")

	    try:
	        # Guard clause: nothing to read when the path does not exist.
	        if not os.path.exists(csv_file_path):
	            print(f"❌ Error: File not found at {csv_file_path}")
	            print("💡 Update the csv_file_path variable at the top of this file")
	            return None

	        frame = pd.read_csv(csv_file_path)

	        print("=== DATA LOADED SUCCESSFULLY ===")
	        print(f"📁 File: {os.path.basename(csv_file_path)}")
	        print(f"📊 Dataset shape: {frame.shape}")
	        print(f"📋 Columns: {list(frame.columns)}")

	        # Each overview section is a heading plus a lazily computed table.
	        overview = (
	            ("\n=== FIRST 5 ROWS ===", frame.head),
	            ("\n=== DATA TYPES ===", lambda: frame.dtypes),
	            ("\n=== MISSING VALUES ===", lambda: frame.isnull().sum()),
	            ("\n=== BASIC STATISTICS ===", frame.describe),
	        )
	        for heading, render in overview:
	            print(heading)
	            print(render())

	        return frame

	    except Exception as e:
	        print(f"Error loading data: {str(e)}")
	        return None

	if __name__ == "__main__":
	    # Script entry point: ask which mode to run. Anything other than "2"
	    # runs the original loader, with a notice when the input was not "1".
	    print("🚀 Enhanced Data Explorer with SmoLagent AI")
	    print("Choose your preferred mode:")
	    print("1. Original function (load_and_explore_data)")
	    print("2. Enhanced interactive mode with AI")

	    mode = input("Enter mode (1 or 2): ").strip()

	    if mode == "2":
	        # Enhanced mode with AI capabilities
	        explorer = EnhancedDataExplorer()
	        explorer.interactive_menu()
	    else:
	        if mode != "1":
	            print("Invalid choice. Running original function...")
	        df = load_and_explore_data()