import pandas as pd import os import numpy as np import matplotlib.pyplot as plt import seaborn as sns from smolagents import CodeAgent, DuckDuckGoSearchTool import warnings warnings.filterwarnings('ignore') # Replace 'your_file.csv' with your CSV file path csv_file_path = "C:/Users/Cosmo/Desktop/NTU Peak Singtel/outsystems_sample_logs_6months.csv" def set_csv_file_path(new_path): """Update the CSV file path""" global csv_file_path csv_file_path = new_path print(f"āœ… CSV file path updated to: {csv_file_path}") def get_csv_file_path(): """Get the current CSV file path""" return csv_file_path class EnhancedDataExplorer: """Enhanced data explorer with SmoLagent AI capabilities""" def __init__(self, csv_path=csv_file_path): self.csv_path = csv_path self.df = None self.agent = None print("šŸš€ Enhanced Data Explorer initialized!") print("šŸ’” AI setup will be done when first needed (option 4)") # Don't call setup_agent() here to avoid hanging def setup_agent(self): """Setup SmoLagent AI agent with simple configuration""" print("šŸ¤– Setting up SmoLagent AI agent...") print("šŸ”„ Trying multiple model configurations...") try: # Try with Ollama using direct ollama package (fast and local) try: print("šŸ”„ Attempting Ollama setup...") import ollama # Quick test if Ollama is available (without generation test) models = ollama.list() if models and 'models' in models and len(models['models']) > 0: print("āœ… Ollama is running and accessible!") print(f"šŸ“¦ Found model: {models['models'][0].get('name', 'llama2')}") else: raise Exception("No models found") # Create a custom model class for Ollama compatible with smolagents class OllamaModel: def __init__(self, model_name="llama2"): self.model_name = model_name import ollama self.ollama = ollama def __call__(self, messages, **kwargs): try: # Convert messages to Ollama format if isinstance(messages, str): prompt = messages elif isinstance(messages, list): # Handle different message formats if len(messages) > 0 and isinstance(messages[0], dict): # Extract content from message dictionaries prompt = "\n".join([ msg.get('content', str(msg)) if isinstance(msg, dict) else str(msg) for msg in messages ]) else: prompt = "\n".join([str(msg) for msg in messages]) else: prompt = str(messages) # Add timeout to prevent hanging import signal import time def timeout_handler(signum, frame): raise TimeoutError("Ollama response timeout") # Set a 30-second timeout for Windows (using threading instead) import threading result = {'response': None, 'error': None} def generate_with_timeout(): try: response = self.ollama.generate(model=self.model_name, prompt=prompt) result['response'] = response['response'] except Exception as e: result['error'] = str(e) thread = threading.Thread(target=generate_with_timeout) thread.daemon = True thread.start() thread.join(timeout=30) # 30 second timeout if thread.is_alive(): return "Error: Ollama response timed out after 30 seconds. Try a simpler query." elif result['error']: return f"Error generating response with Ollama: {result['error']}" elif result['response']: return result['response'] else: return "Error: No response received from Ollama" except Exception as e: return f"Error generating response with Ollama: {e}" def generate(self, messages, **kwargs): """Alternative method name that might be expected""" return self.__call__(messages, **kwargs) model = OllamaModel("llama2") self.agent = CodeAgent( tools=[DuckDuckGoSearchTool()], model=model ) print("āœ… SmoLagent configured successfully with Ollama!") print("šŸ’” Local AI model ready for analysis (with 30s timeout)") return except Exception as e: print(f"āš ļø Ollama setup failed: {e}") print("šŸ’” Make sure Ollama is running: ollama serve") # Try OpenAI if API key is available try: print("šŸ”„ Checking for OpenAI API key...") import os from smolagents import OpenAIModel if os.getenv('OPENAI_API_KEY'): model = OpenAIModel(model_id="gpt-3.5-turbo") self.agent = CodeAgent( tools=[DuckDuckGoSearchTool()], model=model ) print("āœ… SmoLagent configured successfully with OpenAI!") return else: print("āš ļø OpenAI API key not found") except Exception as e: print(f"āš ļø OpenAI setup failed: {e}") # Fallback to Transformers model (smaller version) try: print("šŸ”„ Attempting HuggingFace Transformers model...") from smolagents import TransformersModel model = TransformersModel(model_id="microsoft/DialoGPT-small") # Smaller model self.agent = CodeAgent( tools=[DuckDuckGoSearchTool()], model=model ) print("āœ… SmoLagent configured successfully with HuggingFace model!") print("šŸ’” Note: First use may take time to download model") return except Exception as e: print(f"āš ļø HuggingFace setup failed: {e}") print(" Make sure transformers are installed: pip install 'smolagents[transformers]'") # If all models fail print("\nāŒ No AI model could be configured.") print("šŸ“‹ To fix this:") print(" 1. For local AI: Install Ollama and run 'ollama serve'") print(" 2. For OpenAI: Set OPENAI_API_KEY environment variable") print(" 3. For basic use: pip install 'smolagents[transformers]'") print("\nāœ… You can still use all non-AI features!") self.agent = None except Exception as e: print(f"āš ļø Agent setup failed: {e}") print("šŸ’” Try using: python fixed_upload.py") self.agent = None def configure_model_helper(self): """Helper function to guide model configuration""" print("\nšŸ¤– AI Model Configuration Helper") print("=" * 40) print("1. OpenAI (Recommended - Most capable)") print("2. Ollama (Free - Runs locally)") print("3. Hugging Face (Free - API based)") print("4. Skip AI features") choice = input("Choose your model (1-4): ").strip() if choice == "1": print("\nšŸ“ OpenAI Setup:") print("1. Get API key from: https://platform.openai.com/") print("2. Set environment variable: OPENAI_API_KEY=your_key") print("3. Or edit the setup_agent() method with your key") elif choice == "2": print("\nšŸ“ Ollama Setup:") print("1. Install Ollama from: https://ollama.ai/") print("2. Run: ollama pull llama2") print("3. Start server: ollama serve") print("4. Script is already configured to use SmoLagents' native OllamaModel") print("5. Just make sure Ollama is running and try the AI analysis!") elif choice == "3": print("\nšŸ“ Hugging Face Setup:") print("1. Create account at: https://huggingface.co/") print("2. Get token from: https://huggingface.co/settings/tokens") print("3. Set environment variable: HF_TOKEN=your_token") print("4. Uncomment HF lines in setup_agent() method") elif choice == "4": print("āœ… You can still use all non-AI features!") print("\nšŸ’” Tip: Set environment variables in your system or use a .env file") return choice def load_data(self): """Load the CSV data (keeping your original functionality)""" print(f"\nšŸ“ Loading data from: {self.csv_path}") try: # Check if file exists if not os.path.exists(self.csv_path): print(f"āŒ Error: File not found at {self.csv_path}") print("šŸ’” Use option 7 to change the file path") return None # Read the CSV file into a DataFrame self.df = pd.read_csv(self.csv_path) print("=== DATA LOADED SUCCESSFULLY ===") print(f"šŸ“ File: {os.path.basename(self.csv_path)}") print(f"šŸ“Š Dataset shape: {self.df.shape}") print(f"šŸ“‹ Columns: {list(self.df.columns)}") print("\n=== FIRST 5 ROWS ===") print(self.df.head()) print("\n=== DATA TYPES ===") print(self.df.dtypes) print("\n=== MISSING VALUES ===") print(self.df.isnull().sum()) print("\n=== BASIC STATISTICS ===") print(self.df.describe()) return self.df except Exception as e: print(f"Error loading data: {str(e)}") return None def create_visualizations(self): """Create basic visualizations""" if self.df is None: print("āŒ No data loaded. Run load_data() first.") return try: # Set up plotting style plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'default') # Get numeric columns numeric_cols = self.df.select_dtypes(include=[np.number]).columns if len(numeric_cols) == 0: print("āš ļø No numeric columns found for visualization") return print(f"\n=== CREATING VISUALIZATIONS FOR {len(numeric_cols)} NUMERIC COLUMNS ===") # 1. Distribution plots n_cols = min(3, len(numeric_cols)) n_rows = (len(numeric_cols) + n_cols - 1) // n_cols plt.figure(figsize=(15, 5*n_rows)) for i, col in enumerate(numeric_cols): plt.subplot(n_rows, n_cols, i+1) self.df[col].hist(bins=30, alpha=0.7, edgecolor='black') plt.title(f'Distribution of {col}') plt.xlabel(col) plt.ylabel('Frequency') plt.tight_layout() plt.savefig('data_distributions.png', dpi=300, bbox_inches='tight') plt.show() print("āœ… Distribution plots saved as 'data_distributions.png'") # 2. Correlation heatmap (if more than 1 numeric column) if len(numeric_cols) > 1: plt.figure(figsize=(12, 8)) correlation_matrix = self.df[numeric_cols].corr() sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, square=True, linewidths=0.5) plt.title('Correlation Heatmap') plt.tight_layout() plt.savefig('correlation_heatmap.png', dpi=300, bbox_inches='tight') plt.show() print("āœ… Correlation heatmap saved as 'correlation_heatmap.png'") except Exception as e: print(f"āŒ Error creating visualizations: {e}") def analyze_data_quality(self): """Analyze data quality issues""" if self.df is None: print("āŒ No data loaded. Run load_data() first.") return print("\n=== DATA QUALITY ANALYSIS ===") # Missing data analysis missing_data = self.df.isnull().sum() missing_percentage = (missing_data / len(self.df)) * 100 quality_report = pd.DataFrame({ 'Column': self.df.columns, 'Missing_Count': missing_data.values, 'Missing_Percentage': missing_percentage.values, 'Data_Type': self.df.dtypes.values }) print("Missing Data Summary:") print(quality_report[quality_report['Missing_Count'] > 0]) # Duplicate rows duplicates = self.df.duplicated().sum() print(f"\nDuplicate rows: {duplicates}") # Memory usage memory_usage = self.df.memory_usage(deep=True).sum() / 1024**2 print(f"Memory usage: {memory_usage:.2f} MB") return quality_report def ai_analysis(self, query): """Use SmoLagent for AI-powered analysis""" print(f"\nšŸ” Checking prerequisites for AI analysis...") if self.agent is None: print("āŒ AI agent not configured. Please set up SmoLagent first.") print("šŸ’” Try running one of these alternatives:") print(" • python fixed_upload.py") print(" • python quick_ai_demo.py") return if self.df is None: print("āŒ No data loaded. Please load data first!") print("šŸ’” Choose option 1 in the main menu to load your data.") return print("āœ… Data loaded successfully") print("āœ… AI agent configured") print(f"āœ… Processing query: '{query}'") # Prepare context about the dataset try: data_context = f""" Dataset Analysis Request: - Dataset Shape: {self.df.shape} - Columns: {list(self.df.columns)} - Data Types: {dict(self.df.dtypes)} - Missing Values: {dict(self.df.isnull().sum())} Sample Data: {self.df.head(3).to_string()} Statistical Summary: {self.df.describe().to_string()} User Question: {query} """ print(f"\nšŸ¤– SmoLagent is analyzing your data...") print("ā³ This may take 5-15 seconds...") # Use the agent with the data context and query response = self.agent.run(data_context) print("\n" + "="*60) print("āœ… AI ANALYSIS COMPLETE") print("="*60) print(response) print("="*60) return response except Exception as e: print(f"\nāŒ AI analysis failed: {e}") print("\nšŸ’” Troubleshooting suggestions:") print(" • Check your internet connection") print(" • Try: python fixed_upload.py") print(" • Use basic analysis features (options 2-3)") return None def check_status(self): """Check the status of data and AI setup""" print("\nšŸ” SYSTEM STATUS CHECK") print("="*40) # Check file path print(f"šŸ“ CSV File: {self.csv_path}") if os.path.exists(self.csv_path): print(f"āœ… File exists: {os.path.basename(self.csv_path)}") else: print(f"āŒ File not found") # Check data status if self.df is not None: print(f"āœ… Data loaded: {self.df.shape[0]} rows, {self.df.shape[1]} columns") print(f"šŸ“‹ Columns: {list(self.df.columns)}") else: print("āŒ No data loaded") # Check AI agent status if self.agent is not None: print("āœ… AI agent configured and ready") else: print("āŒ AI agent not configured") print("="*40) def change_csv_file(self, new_path=None): """Change the CSV file path""" if new_path is None: print(f"\nšŸ“ Current file path: {self.csv_path}") new_path = input("Enter new CSV file path: ").strip() if os.path.exists(new_path): self.csv_path = new_path self.df = None # Clear current data print(f"āœ… CSV file path updated to: {self.csv_path}") print("šŸ’” Data cleared. Use option 1 to load the new file.") else: print(f"āŒ File not found: {new_path}") print("šŸ’” Please check the file path and try again.") def interactive_menu(self): """Interactive menu for data exploration""" # Show initial status self.check_status() while True: print("\n" + "="*50) print("šŸ¤– ENHANCED DATA EXPLORER WITH AI") print("="*50) print("1. Load and explore data") print("2. Create visualizations") print("3. Analyze data quality") print("4. AI-powered analysis") print("5. Show data summary") print("6. Check system status") print("7. Change CSV file path") print("8. Exit") print("="*50) print(f"šŸ“ Current file: {os.path.basename(self.csv_path)}") choice = input("Enter your choice (1-8): ").strip() if choice == '1': self.load_data() elif choice == '2': self.create_visualizations() elif choice == '3': self.analyze_data_quality() elif choice == '4': if self.df is None: print("\nāŒ No data loaded. Please load data first!") print("šŸ’” Choose option 1 to load your data before using AI analysis.") input("\nPress Enter to continue...") else: # Setup AI on demand if not already done if self.agent is None: print("\nšŸ¤– Setting up AI for first use...") self.setup_agent() if self.agent is None: print("\nāŒ AI features not available. Please configure a model first.") print("Edit the setup_agent() method to add your API keys.") self.configure_model_helper() else: print("\nšŸ¤– AI Analysis - Ask me anything about your data!") print("Example queries:") print(" • 'What are the main trends in this data?'") print(" • 'Find any outliers or anomalies'") print(" • 'Suggest data quality improvements'") print(" • 'Perform correlation analysis'") print(" • 'Identify seasonal patterns'") print(" • 'Recommend preprocessing steps'") query = input("\nšŸ’¬ Your question: ").strip() if query: self.ai_analysis(query) # Wait for user to read the results before returning to menu input("\nšŸ“‹ Press Enter to return to main menu...") else: print("āŒ No question entered.") input("\nPress Enter to continue...") elif choice == '5': if self.df is not None: print(f"\nšŸ“Š Dataset Summary:") print(f"Shape: {self.df.shape}") print(f"Columns: {list(self.df.columns)}") print(f"Memory: {self.df.memory_usage(deep=True).sum() / 1024**2:.2f} MB") else: print("āŒ No data loaded.") elif choice == '6': self.check_status() elif choice == '7': self.change_csv_file() elif choice == '8': print("šŸ‘‹ Goodbye!") break else: print("āŒ Invalid choice. Please try again.") def load_and_explore_data(): """Load and explore the CSV data (keeping your original function)""" print(f"\nšŸ“ Loading data from: {csv_file_path}") try: # Check if file exists if not os.path.exists(csv_file_path): print(f"āŒ Error: File not found at {csv_file_path}") print("šŸ’” Update the csv_file_path variable at the top of this file") return None # Read the CSV file into a DataFrame df = pd.read_csv(csv_file_path) print("=== DATA LOADED SUCCESSFULLY ===") print(f"šŸ“ File: {os.path.basename(csv_file_path)}") print(f"šŸ“Š Dataset shape: {df.shape}") print(f"šŸ“‹ Columns: {list(df.columns)}") print("\n=== FIRST 5 ROWS ===") print(df.head()) print("\n=== DATA TYPES ===") print(df.dtypes) print("\n=== MISSING VALUES ===") print(df.isnull().sum()) print("\n=== BASIC STATISTICS ===") print(df.describe()) return df except Exception as e: print(f"Error loading data: {str(e)}") return None if __name__ == "__main__": print("šŸš€ Enhanced Data Explorer with SmoLagent AI") print("Choose your preferred mode:") print("1. Original function (load_and_explore_data)") print("2. Enhanced interactive mode with AI") mode = input("Enter mode (1 or 2): ").strip() if mode == "1": # Run your original function df = load_and_explore_data() elif mode == "2": # Run enhanced mode with AI capabilities explorer = EnhancedDataExplorer() explorer.interactive_menu() else: print("Invalid choice. Running original function...") df = load_and_explore_data()