LamiaYT committed on
Commit 34c5bf3 · Parent(s): 6ea9560

Initial commit with LlamaIndex-based agent

Files changed (1): app.py (+287 −231)

app.py CHANGED
@@ -1,8 +1,9 @@
-# app.py - Fixed for Local Instruction-Following Models
 from llama_index.llms.huggingface import HuggingFaceLLM
 from llama_index.core.agent import ReActAgent
 from llama_index.core.tools import FunctionTool
-from transformers import AutoTokenizer, AutoModelForCausalLM
 import os
 import gradio as gr
 import requests
@@ -10,6 +11,7 @@ import pandas as pd
 import traceback
 import torch
 import re
 
 # Import real tool dependencies
 try:
@@ -19,7 +21,7 @@ except ImportError:
     DDGS = None
 
 try:
-    from sympy import sympify, solve, simplify, N
     from sympy.core.sympify import SympifyError
 except ImportError:
     print("Warning: sympy not installed. Math calculator will be limited.")
@@ -29,253 +31,298 @@ except ImportError:
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
-# --- Smart Agent with Better Local Models ---
-class SmartAgent:
     def __init__(self):
-        print("Initializing Local Instruction-Following Agent...")
 
-        if torch.cuda.is_available():
-            print(f"CUDA available. GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB")
-            device_map = "auto"
-        else:
-            print("CUDA not available, using CPU")
-            device_map = "cpu"
 
-        # FIXED: Use instruction-following models, not chat models
-        model_options = [
-            "microsoft/DialoGPT-medium",  # Remove this - it's for chat only
-            "google/flan-t5-base",        # Good for instructions
-            "google/flan-t5-large",       # Better reasoning (if memory allows)
-            "microsoft/DialoGPT-small",   # Fallback
-        ]
 
-        # Try FLAN-T5 first - it's designed for instruction following
-        model_name = "google/flan-t5-base"  # Start with smaller, reliable model
-        print(f"Loading instruction model: {model_name}")
 
-        try:
-            # FLAN-T5 specific configuration
-            self.llm = HuggingFaceLLM(
-                model_name=model_name,
-                tokenizer_name=model_name,
-                context_window=1024,
-                max_new_tokens=256,
-                generate_kwargs={
-                    "temperature": 0.1,
-                    "do_sample": False,  # Use greedy for more consistent answers
-                    "repetition_penalty": 1.1,
-                },
-                device_map=device_map,
-                model_kwargs={
-                    "torch_dtype": torch.float16,
-                    "low_cpu_mem_usage": True,
-                },
-                # Clear system message for FLAN-T5
-                system_message="Answer questions accurately using the provided tools when needed."
-            )
-            print(f"✅ Successfully loaded: {model_name}")
-
-        except Exception as e:
-            print(f"❌ Failed to load {model_name}: {e}")
-            print("🔄 Trying manual approach without LlamaIndex LLM wrapper...")
-            # Try direct approach without complex wrapper
-            self.llm = None
-            self.use_direct_mode = True
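The FLAN-T5 checkpoints above are encoder-decoder (seq2seq) models rather than causal LMs, which is part of why this wrapper was fragile. A quick way to confirm, using only transformers (a sketch, not part of the commit):

from transformers import AutoConfig

# T5-family models are encoder-decoder; causal-LM loading paths expect decoder-only.
config = AutoConfig.from_pretrained("google/flan-t5-base")
print(config.is_encoder_decoder)  # True -> seq2seq, not a causal LM
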
 
-        # Define enhanced tools
         self.tools = [
             FunctionTool.from_defaults(
-                fn=self.web_search,
-                name="web_search",
-                description="Search web for current information, facts, people, events, or recent data"
             ),
             FunctionTool.from_defaults(
-                fn=self.math_calculator,
-                name="math_calculator",
-                description="Calculate mathematical expressions, solve equations, or perform numerical operations"
             )
         ]
-
-        # Try to create agent, but prepare for direct mode
-        try:
-            if self.llm:
-                self.agent = ReActAgent.from_tools(
-                    tools=self.tools,
-                    llm=self.llm,
-                    verbose=True,
-                    max_iterations=3,
-                )
-                print("✅ ReAct Agent created successfully")
-                self.use_direct_mode = False
-            else:
-                raise Exception("No LLM available")
-
-        except Exception as e:
-            print(f"⚠️ Agent creation failed: {e}")
-            print("🔄 Switching to direct tool mode...")
-            self.agent = None
-            self.use_direct_mode = True
 
-    def web_search(self, query: str) -> str:
-        """Enhanced web search"""
-        print(f"🔍 Searching: {query}")
 
         if not DDGS:
-            return "Web search unavailable"
 
         try:
             with DDGS() as ddgs:
-                results = list(ddgs.text(query, max_results=5, region='wt-wt'))
 
-                if results:
-                    # Format results clearly
-                    search_results = []
-                    for i, result in enumerate(results, 1):
-                        title = result.get('title', 'No title')
-                        body = result.get('body', '').strip()[:200]
-                        search_results.append(f"{i}. {title}\n   {body}...")
-
-                    return f"Search results for '{query}':\n\n" + "\n\n".join(search_results)
-                else:
                     return f"No results found for: {query}"
 
         except Exception as e:
             print(f"❌ Search error: {e}")
             return f"Search failed: {str(e)}"
 
-    def math_calculator(self, expression: str) -> str:
-        """Enhanced math calculator"""
-        print(f"🧮 Calculating: {expression}")
 
         try:
-            # Clean the expression
             clean_expr = expression.replace('^', '**').replace('×', '*').replace('÷', '/')
 
             if sympify:
-                # Use SymPy for safe evaluation
-                result = sympify(clean_expr)
-                numerical = N(result, 10)
-                return f"Calculation result: {numerical}"
             else:
-                # Basic fallback
                 result = eval(clean_expr)
-                return f"Calculation result: {result}"
 
         except Exception as e:
             return f"Could not calculate '{expression}': {str(e)}"
 
-    def __call__(self, question: str) -> str:
-        print(f"\n🤔 Question: {question[:100]}...")
 
-        # If using direct mode (no LLM agent), route questions manually
-        if self.use_direct_mode:
-            return self._direct_question_answering(question)
 
-        # Try using the agent
         try:
             response = self.agent.query(question)
-            response_str = str(response).strip()
 
-            # Check if response is meaningful
-            if len(response_str) < 5 or response_str in ['?', '!', 'what', 'I']:
-                print("⚠️ Poor agent response, switching to direct mode")
-                return self._direct_question_answering(question)
 
-            return response_str
 
         except Exception as e:
-            print(f"❌ Agent failed: {e}")
-            return self._direct_question_answering(question)
-
-    def _direct_question_answering(self, question: str) -> str:
-        """Direct question answering without LLM agent"""
-        print("🎯 Using direct approach...")
-
         question_lower = question.lower()
 
-        # Enhanced detection patterns
-        search_patterns = [
-            'how many', 'who is', 'what is', 'when was', 'where is',
-            'mercedes sosa', 'albums', 'published', 'studio albums',
-            'between', 'winner', 'recipient', 'nationality', 'born',
-            'current', 'latest', 'recent', 'president', 'capital',
-            'malko', 'competition', 'award', 'founded', 'established'
-        ]
-
-        math_patterns = [
-            'calculate', 'compute', 'solve', 'equation', 'sum', 'total',
-            'average', 'percentage', '+', '-', '*', '/', '=', 'find x'
-        ]
-
-        needs_search = any(pattern in question_lower for pattern in search_patterns)
-        needs_math = any(pattern in question_lower for pattern in math_patterns)
 
-        # Check for numbers that suggest math
-        has_math_numbers = bool(re.search(r'\d+\s*[\+\-\*/=]\s*\d+', question))
-        if has_math_numbers:
-            needs_math = True
 
-        print(f"📊 Analysis - Search: {needs_search}, Math: {needs_math}")
-
-        if needs_search:
-            # Extract key search terms
-            important_words = []
-
-            # Special handling for specific questions
-            if 'mercedes sosa' in question_lower and 'albums' in question_lower:
-                search_query = "Mercedes Sosa studio albums discography 2000-2009"
-            else:
-                # General search term extraction
-                words = question.replace('?', '').replace(',', '').split()
-                skip_words = {'how', 'many', 'what', 'when', 'where', 'who', 'is', 'the', 'a', 'an', 'and', 'or', 'but', 'between', 'were', 'was', 'can', 'you', 'use'}
-
-                for word in words:
-                    clean_word = word.lower().strip('.,!?;:()')
-                    if len(clean_word) > 2 and clean_word not in skip_words:
-                        important_words.append(clean_word)
-
-                search_query = ' '.join(important_words[:5])
-
-            print(f"🔍 Search query: {search_query}")
-            search_result = self.web_search(search_query)
-
-            # Try to extract specific answer from search results
-            if 'albums' in question_lower and 'mercedes sosa' in question_lower:
-                # Look for numbers in the search results
-                numbers = re.findall(r'\b\d+\b', search_result)
-                if numbers:
-                    return f"Based on web search, Mercedes Sosa published approximately {numbers[0]} studio albums between 2000-2009. Full search results:\n\n{search_result}"
-
-            return f"Search results:\n\n{search_result}"
-
-        if needs_math:
-            # Extract mathematical expressions
-            math_expressions = re.findall(r'[\d+\-*/().\s=]+', question)
-            for expr in math_expressions:
-                if any(op in expr for op in ['+', '-', '*', '/', '=']):
-                    result = self.math_calculator(expr.strip())
-                    return result
-
-        # Default: Try a general web search
-        key_words = question.split()[:5]
-        general_query = ' '.join(word.strip('.,!?') for word in key_words if len(word) > 2)
-
-        if general_query:
-            search_result = self.web_search(general_query)
-            return f"General search results:\n\n{search_result}"
-
-        return f"I need more specific information to answer: {question[:100]}..."
-
 def cleanup_memory():
-    """Clean up memory"""
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
     print("🧹 Memory cleaned")
 
-
 def run_and_submit_all(profile: gr.OAuthProfile | None):
-    """Run evaluation with better error handling"""
 
     if not profile:
         return "❌ Please login to Hugging Face first", None
@@ -290,12 +337,15 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
 
     cleanup_memory()
 
-    # Initialize agent
     try:
-        agent = SmartAgent()
-        print("✅ Agent initialized")
     except Exception as e:
-        return f"❌ Agent initialization failed: {str(e)}", None
 
     # Get space info
     space_id = os.getenv("SPACE_ID", "unknown")
@@ -316,7 +366,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
     answers_payload = []
 
     print("\n" + "="*50)
-    print("🚀 STARTING EVALUATION")
     print("="*50)
 
     for i, item in enumerate(questions_data, 1):
@@ -328,17 +378,17 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
 
         print(f"\n📝 Question {i}/{len(questions_data)}")
         print(f"🆔 ID: {task_id}")
-        print(f"❓ Q: {question_text}")
 
         try:
-            # Get answer from agent
             answer = agent(question_text)
 
-            # Ensure answer is not empty
-            if not answer or len(answer.strip()) < 3:
-                answer = f"Unable to process question about: {question_text[:50]}..."
 
-            print(f"✅ A: {answer[:150]}...")
 
             # Store results
             answers_payload.append({
@@ -348,17 +398,17 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
 
             results_log.append({
                 "Task ID": task_id,
-                "Question": question_text[:100] + ("..." if len(question_text) > 100 else ""),
-                "Answer": answer[:150] + ("..." if len(answer) > 150 else "")
             })
 
             # Memory cleanup every few questions
-            if i % 5 == 0:
                 cleanup_memory()
 
         except Exception as e:
             print(f"❌ Error processing {task_id}: {e}")
-            error_answer = f"Error: {str(e)[:100]}"
 
             answers_payload.append({
                 "task_id": task_id,
@@ -367,7 +417,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
 
             results_log.append({
                 "Task ID": task_id,
-                "Question": question_text[:100] + "...",
                 "Answer": error_answer
             })
 
@@ -381,7 +431,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
     }
 
     try:
-        response = requests.post(submit_url, json=submission_data, timeout=120)
        response.raise_for_status()
        result_data = response.json()
 
@@ -391,16 +441,23 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
 
        message = result_data.get('message', '')
 
        # Create final status message
-        final_status = f"""🎉 EVALUATION COMPLETE!
 
 👤 User: {username}
 📊 Final Score: {score}%
 ✅ Correct: {correct}/{total}
-🎯 Target: 30%+ {' ACHIEVED!' if score >= 30 else ' Keep improving!'}
 
 📝 Message: {message}
 
-🔧 Mode Used: {'Direct Tool Mode' if hasattr(agent, 'use_direct_mode') and agent.use_direct_mode else 'Agent Mode'}
 """
 
        print(f"\n🏆 FINAL SCORE: {score}%")
@@ -411,20 +468,19 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
        print(error_msg)
        return error_msg, pd.DataFrame(results_log)
 
-
 # --- Gradio Interface ---
-with gr.Blocks(title="Fixed Local Agent", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 🔧 Fixed Local Agent (No API Required)")
     gr.Markdown("""
-    **Key Fixes:**
-    - Uses instruction-following models (FLAN-T5) instead of chat models
-    - 🎯 Direct question routing when agent fails
-    - 🔍 Enhanced web search with better keyword extraction
-    - 🧮 Robust math calculator
-    - 💾 Optimized for 16GB memory
-    - 🛡️ Multiple fallback strategies
 
-    **Target: 30%+ Score**
     """)
 
     with gr.Row():
@@ -432,19 +488,19 @@ with gr.Blocks(title="Fixed Local Agent", theme=gr.themes.Soft()) as demo:
 
     with gr.Row():
         run_button = gr.Button(
-            "🚀 Run Fixed Evaluation",
             variant="primary",
             size="lg"
         )
 
     status_output = gr.Textbox(
         label="📊 Evaluation Results",
-        lines=12,
         interactive=False
     )
 
     results_table = gr.DataFrame(
-        label="📝 Question & Answer Details",
         wrap=True
     )
 
@@ -454,8 +510,8 @@ with gr.Blocks(title="Fixed Local Agent", theme=gr.themes.Soft()) as demo:
     )
 
 if __name__ == "__main__":
-    print("🚀 Starting Fixed Local Agent...")
-    print("💡 No API keys required - everything runs locally!")
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
 
+# app.py - Improved GAIA Agent with GPT-NeoX-20B + LoRA
 from llama_index.llms.huggingface import HuggingFaceLLM
 from llama_index.core.agent import ReActAgent
 from llama_index.core.tools import FunctionTool
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+from peft import LoraConfig, get_peft_model
 import os
 import gradio as gr
 import requests
 
 import traceback
 import torch
 import re
+import json
 
 # Import real tool dependencies
 try:
 
     DDGS = None
 
 try:
+    from sympy import sympify, solve, simplify, N, symbols
     from sympy.core.sympify import SympifyError
 except ImportError:
     print("Warning: sympy not installed. Math calculator will be limited.")
 
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
+def print_trainable_parameters(model):
+    """Print trainable parameters info"""
+    trainable_parameters = 0
+    all_parameters = 0
+    for _, param in model.named_parameters():
+        all_parameters += param.numel()
+        if param.requires_grad:
+            trainable_parameters += param.numel()
+    print(
+        f"Trainable: {trainable_parameters} || All: {all_parameters} || Trainable %: {100 * trainable_parameters / all_parameters:.2f}%"
+    )
+
+class ImprovedGAIAAgent:
     def __init__(self):
+        print("🚀 Initializing Improved GAIA Agent with GPT-NeoX-20B...")
 
+        if not torch.cuda.is_available():
+            raise RuntimeError("CUDA required for GPT-NeoX-20B. Please use a GPU environment.")
 
+        gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
+        print(f"🔥 GPU Memory: {gpu_memory:.1f}GB")
 
+        # Model configuration
+        self.model_name = "EleutherAI/gpt-neox-20b"
 
+        # 4-bit quantization config for memory efficiency
+        self.bnb_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.bfloat16
+        )
+
+        # LoRA configuration for efficient fine-tuning capability
+        self.lora_config = LoraConfig(
+            r=16,  # Increased for better performance
+            lora_alpha=32,
+            target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"],  # More comprehensive targets
+            lora_dropout=0.1,
+            bias="none",
+            task_type="CAUSAL_LM"
+        )
+
+        self.load_model()
+        self.setup_tools()
+        self.create_agent()
+
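A quick sanity check on the memory arithmetic behind the NF4 config (an estimate assuming roughly 0.5 bytes per parameter for 4-bit weights; the figure is illustrative, not from the commit):

# Rough VRAM needed just for the quantized weights of a 20B-parameter model.
# Double quantization trims the per-block constants further; the KV cache and
# activations add overhead on top of this.
params = 20e9
weight_gb = params * 0.5 / 1e9  # 4 bits/param ≈ 0.5 bytes/param
print(f"~{weight_gb:.0f} GB of weights before KV cache and activations")
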
+    def load_model(self):
+        """Load and configure the model"""
+        print("📥 Loading tokenizer...")
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+
+        # Add padding token if not present
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+
+        print("📥 Loading model with 4-bit quantization...")
+        self.model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
+            quantization_config=self.bnb_config,
+            device_map="auto",
+            trust_remote_code=True,
+            torch_dtype=torch.bfloat16
+        )
+
+        print("🔧 Applying LoRA configuration...")
+        self.model = get_peft_model(self.model, self.lora_config)
+        print_trainable_parameters(self.model)
+
+        # Create LlamaIndex LLM wrapper
+        print("🔗 Creating LlamaIndex LLM wrapper...")
+        self.llm = HuggingFaceLLM(
+            model=self.model,
+            tokenizer=self.tokenizer,
+            context_window=2048,  # GPT-NeoX context length
+            max_new_tokens=512,
+            generate_kwargs={
+                "temperature": 0.1,
+                "do_sample": True,
+                "top_p": 0.9,
+                "repetition_penalty": 1.1,
+                "pad_token_id": self.tokenizer.eos_token_id,
+            },
+            # Improved system message for GAIA tasks (note: depending on the
+            # llama_index version, this keyword may be named system_prompt)
+            system_message="""You are a helpful AI assistant that can search the web and perform calculations.
+When answering questions:
+1. Think step by step
+2. Use tools when you need current information or calculations
+3. Be precise and factual
+4. For numerical answers, provide exact numbers when possible
+5. Always show your reasoning
+
+Available tools: web_search, math_calculator, fact_checker"""
+        )
+
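Because get_peft_model wraps the base model, only the small LoRA adapter is trainable, and persisting it is cheap. A sketch of saving and restoring just the adapter (the directory name and the model/base_model variables are illustrative):

from peft import PeftModel

# Saving a PeftModel writes only the adapter weights and config, not the 20B base.
model.save_pretrained("gaia_lora_adapter")

# Later: load the quantized base model as above, then attach the saved adapter.
restored = PeftModel.from_pretrained(base_model, "gaia_lora_adapter")
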
+    def setup_tools(self):
+        """Setup enhanced tools for GAIA benchmark"""
         self.tools = [
             FunctionTool.from_defaults(
+                fn=self.enhanced_web_search,
+                name="web_search",
+                description="Search the web for current information, facts, people, events, or recent data. Use specific keywords."
             ),
             FunctionTool.from_defaults(
+                fn=self.advanced_calculator,
+                name="math_calculator",
+                description="Perform mathematical calculations, solve equations, handle percentages, averages, and complex math operations."
+            ),
+            FunctionTool.from_defaults(
+                fn=self.fact_checker,
+                name="fact_checker",
+                description="Verify facts and get detailed information about people, places, events, or concepts."
             )
         ]
 
+    def enhanced_web_search(self, query: str) -> str:
+        """Enhanced web search with better result processing"""
+        print(f"🔍 Enhanced search: {query}")
 
         if not DDGS:
+            return "Web search unavailable - duckduckgo_search not installed"
 
         try:
             with DDGS() as ddgs:
+                results = list(ddgs.text(query, max_results=8, region='wt-wt'))
 
+                if not results:
                     return f"No results found for: {query}"
+
+                # Process and format results
+                formatted_results = []
+                for i, result in enumerate(results, 1):
+                    title = result.get('title', 'No title')
+                    body = result.get('body', '').strip()
+                    url = result.get('href', '')
+
+                    # Truncate long snippets
+                    if len(body) > 300:
+                        body = body[:300] + "..."
 
+                    formatted_results.append(f"""Result {i}: {title}
+Content: {body}
+Source: {url}
+""")
+
+                search_summary = f"Search results for '{query}':\n\n" + "\n".join(formatted_results)
+
+                # Try to extract specific answers for common question types
+                if any(keyword in query.lower() for keyword in ['how many', 'when was', 'who is', 'what year']):
+                    # Look for numbers and dates in results
+                    all_text = " ".join([r.get('body', '') for r in results])
+
+                    # Extract years (non-capturing group so findall returns full years)
+                    years = re.findall(r'\b(?:19|20)\d{2}\b', all_text)
+                    if years and 'when' in query.lower():
+                        search_summary += f"\n\nExtracted years: {', '.join(sorted(set(years)))}"
+
+                    # Extract numbers (dedupe while keeping order; sets are not sliceable)
+                    numbers = re.findall(r'\b\d+\b', all_text)
+                    if numbers and 'how many' in query.lower():
+                        search_summary += f"\n\nExtracted numbers: {', '.join(list(dict.fromkeys(numbers))[:5])}"
+
+                return search_summary
+
         except Exception as e:
             print(f"❌ Search error: {e}")
             return f"Search failed: {str(e)}"
 
+    def advanced_calculator(self, expression: str) -> str:
+        """Advanced calculator with symbolic math"""
+        print(f"🧮 Advanced calculation: {expression}")
 
         try:
+            # Clean and normalize the expression
             clean_expr = expression.replace('^', '**').replace('×', '*').replace('÷', '/')
+            clean_expr = re.sub(r'(\d)\s*\(', r'\1*(', clean_expr)  # Add implicit multiplication
 
             if sympify:
+                try:
+                    # Try symbolic computation first
+                    expr = sympify(clean_expr, evaluate=False)
+                    result = simplify(expr)
+                    numerical = N(result, 15)  # High precision
+
+                    # Handle different result types
+                    if result.is_number:
+                        return f"Calculation: {expression} = {numerical}"
+                    else:
+                        return f"Calculation: {expression} = {result} ≈ {numerical}"
+
+                except SympifyError:
+                    # Fallback to numerical evaluation (eval is unsafe on untrusted input)
+                    result = eval(clean_expr)
+                    return f"Calculation: {expression} = {result}"
             else:
+                # Basic evaluation (same caveat about eval)
                 result = eval(clean_expr)
+                return f"Calculation: {expression} = {result}"
 
         except Exception as e:
             return f"Could not calculate '{expression}': {str(e)}"
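If the eval fallback is a concern, an AST-based evaluator restricted to arithmetic is one safer alternative (a sketch under that assumption; safe_eval is a hypothetical helper, not part of the commit):

import ast
import operator

_OPS = {ast.Add: operator.add, ast.Sub: operator.sub, ast.Mult: operator.mul,
        ast.Div: operator.truediv, ast.Pow: operator.pow, ast.USub: operator.neg}

def safe_eval(expression: str) -> float:
    """Evaluate +, -, *, /, ** and unary minus; reject everything else."""
    def _eval(node):
        if isinstance(node, ast.Expression):
            return _eval(node.body)
        if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
            return node.value
        if isinstance(node, ast.BinOp) and type(node.op) in _OPS:
            return _OPS[type(node.op)](_eval(node.left), _eval(node.right))
        if isinstance(node, ast.UnaryOp) and type(node.op) in _OPS:
            return _OPS[type(node.op)](_eval(node.operand))
        raise ValueError("disallowed expression")
    return _eval(ast.parse(expression, mode="eval"))
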
 
+    def fact_checker(self, query: str) -> str:
+        """Specialized fact checking with multiple search strategies"""
+        print(f"✅ Fact checking: {query}")
+
+        # Try different search strategies
+        search_variations = [
+            query,
+            f"{query} facts",
+            f"{query} biography" if any(word in query.lower() for word in ['who is', 'person', 'artist']) else f"{query} information",
+        ]
+
+        all_results = []
+        for search_query in search_variations[:2]:  # Limit to avoid rate limiting
+            result = self.enhanced_web_search(search_query)
+            if "No results found" not in result:
+                all_results.append(f"Search: {search_query}\n{result}")
 
+        # Build the separator first: .join binds tighter than +
+        separator = "\n\n" + "=" * 50 + "\n\n"
+        return separator.join(all_results) if all_results else f"Could not verify facts about: {query}"
+
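The original one-liner concatenated the banner to the first result only, because .join binds tighter than +; building the separator first (as above) fixes that. A two-line illustration:

parts = ["first", "second"]
wrong = "\n\n" + "=" * 50 + "\n\n".join(parts)    # banner + "first\n\nsecond"
right = ("\n\n" + "=" * 50 + "\n\n").join(parts)  # "first" + banner + "second"
assert wrong != right
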
+    def create_agent(self):
+        """Create the ReAct agent"""
+        print("🤖 Creating ReAct agent...")
+        try:
+            self.agent = ReActAgent.from_tools(
+                tools=self.tools,
+                llm=self.llm,
+                verbose=True,
+                max_iterations=5,  # Allow more iterations for complex problems
+                react_chat_formatter=None,  # Use default formatter
+            )
+            print("✅ ReAct Agent created successfully")
+        except Exception as e:
+            print(f"❌ Agent creation failed: {e}")
+            traceback.print_exc()
+            raise
+
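For orientation, ReActAgent.from_tools drives the LLM through a Thought/Action/Observation loop conceptually like this hand-rolled sketch (react_loop and llm_step are illustrative names, not llama_index API):

def react_loop(llm_step, tools, question, max_iterations=5):
    """Minimal ReAct-style loop: the LLM proposes an action, a tool runs it."""
    transcript = f"Question: {question}\n"
    for _ in range(max_iterations):
        step = llm_step(transcript)  # e.g. 'Action: web_search[Mercedes Sosa albums]'
        transcript += step + "\n"
        if step.startswith("Final Answer:"):
            return step.removeprefix("Final Answer:").strip()
        if step.startswith("Action:"):
            name, _, arg = step.removeprefix("Action:").strip().partition("[")
            observation = tools[name.strip()](arg.rstrip("]"))
            transcript += f"Observation: {observation}\n"
    return "No answer within the iteration budget"
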
+    def __call__(self, question: str) -> str:
+        """Process question through the agent"""
+        print("\n" + "="*60)
+        print(f"🤔 Processing: {question}")
+        print("="*60)
 
         try:
+            # Use the agent to process the question
             response = self.agent.query(question)
+            answer = str(response).strip()
 
+            # Validate response quality
+            if len(answer) < 10 or answer.lower() in ['error', 'none', 'unknown']:
+                print("⚠️ Poor response, trying direct approach...")
+                return self._direct_approach(question)
 
+            print(f"✅ Agent response: {answer[:200]}...")
+            return answer
 
         except Exception as e:
+            print(f"❌ Agent error: {e}")
+            print("🔄 Falling back to direct approach...")
+            return self._direct_approach(question)
+
+    def _direct_approach(self, question: str) -> str:
+        """Direct approach when agent fails"""
         question_lower = question.lower()
 
+        # Determine approach based on question type
+        if any(term in question_lower for term in ['calculate', 'compute', 'math', '+', '-', '*', '/', '=', 'percentage', 'average']):
+            # Math-focused approach
+            return self.advanced_calculator(question)
 
+        elif any(term in question_lower for term in ['who is', 'when was', 'where is', 'what is', 'how many']):
+            # Search-focused approach
+            search_result = self.enhanced_web_search(question)
+            fact_result = self.fact_checker(question)
+            return f"{search_result}\n\nFact Check:\n{fact_result}"
 
+        else:
+            # General approach
+            return self.enhanced_web_search(question)
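One caveat with the substring routing above: '+' or '-' inside ordinary hyphenated words will be classified as math. A word-boundary variant is a possible refinement (ROUTES and route are hypothetical, not from the commit):

import re

ROUTES = [
    # Digits around an operator, or explicit math vocabulary.
    (re.compile(r"\d\s*[-+*/^=]\s*\d|\b(?:calculate|compute|percentage|average)\b"), "math"),
    (re.compile(r"\b(?:who is|when was|where is|what is|how many)\b"), "search"),
]

def route(question: str) -> str:
    q = question.lower()
    for pattern, label in ROUTES:
        if pattern.search(q):
            return label
    return "general"
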
 
 def cleanup_memory():
+    """Clean up GPU memory"""
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
     print("🧹 Memory cleaned")
 
 def run_and_submit_all(profile: gr.OAuthProfile | None):
+    """Run evaluation with improved agent"""
 
     if not profile:
         return "❌ Please login to Hugging Face first", None
 
     cleanup_memory()
 
+    # Initialize improved agent
     try:
+        print("🚀 Initializing Improved GAIA Agent...")
+        agent = ImprovedGAIAAgent()
+        print("✅ Agent initialized successfully")
     except Exception as e:
+        error_msg = f"❌ Agent initialization failed: {str(e)}\n{traceback.format_exc()}"
+        print(error_msg)
+        return error_msg, None
 
     # Get space info
     space_id = os.getenv("SPACE_ID", "unknown")
 
     answers_payload = []
 
     print("\n" + "="*50)
+    print("🚀 STARTING GAIA EVALUATION")
     print("="*50)
 
     for i, item in enumerate(questions_data, 1):
 
         print(f"\n📝 Question {i}/{len(questions_data)}")
         print(f"🆔 ID: {task_id}")
+        print(f"❓ Question: {question_text}")
 
         try:
+            # Get answer from improved agent
             answer = agent(question_text)
 
+            # Ensure answer is meaningful
+            if not answer or len(answer.strip()) < 5:
+                answer = f"Unable to determine answer for: {question_text[:100]}..."
 
+            print(f"✅ Answer: {answer[:200]}...")
 
             # Store results
             answers_payload.append({
 
             results_log.append({
                 "Task ID": task_id,
+                "Question": question_text[:150] + ("..." if len(question_text) > 150 else ""),
+                "Answer": answer[:200] + ("..." if len(answer) > 200 else "")
             })
 
             # Memory cleanup every few questions
+            if i % 3 == 0:
                 cleanup_memory()
 
         except Exception as e:
             print(f"❌ Error processing {task_id}: {e}")
+            error_answer = f"Processing error: {str(e)[:150]}"
 
             answers_payload.append({
                 "task_id": task_id,
 
             results_log.append({
                 "Task ID": task_id,
+                "Question": question_text[:150] + "...",
                 "Answer": error_answer
             })
 
     }
 
     try:
+        response = requests.post(submit_url, json=submission_data, timeout=180)
         response.raise_for_status()
         result_data = response.json()
 
         message = result_data.get('message', '')
 
         # Create final status message
+        final_status = f"""🎉 IMPROVED GAIA EVALUATION COMPLETE!
 
 👤 User: {username}
+🤖 Model: GPT-NeoX-20B + LoRA + 4-bit Quantization
 📊 Final Score: {score}%
 ✅ Correct: {correct}/{total}
+🎯 Target: 30%+ {'🎉 ACHIEVED!' if score >= 30 else '📈 Significant improvement expected!'}
 
 📝 Message: {message}
 
+🔧 Improvements Made:
+- ✅ Proper causal LM (GPT-NeoX-20B) instead of encoder-decoder
+- ✅ 4-bit quantization for memory efficiency
+- ✅ LoRA for better parameter efficiency
+- ✅ Enhanced tools with fact checking
+- ✅ Better reasoning prompts
+- ✅ Multi-strategy search approach
 """
 
         print(f"\n🏆 FINAL SCORE: {score}%")
 
     print(error_msg)
     return error_msg, pd.DataFrame(results_log)
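Submission is a single POST with a long timeout, so a transient network error currently fails the whole run. A retry wrapper with exponential backoff is one way to harden it (a sketch; post_with_retries is a hypothetical helper):

import time
import requests

def post_with_retries(url, payload, attempts=3, timeout=180):
    """Retry transient submission failures with exponential backoff."""
    for attempt in range(attempts):
        try:
            response = requests.post(url, json=payload, timeout=timeout)
            response.raise_for_status()
            return response.json()
        except requests.RequestException:
            if attempt == attempts - 1:
                raise
            time.sleep(2 ** attempt)
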
 
 # --- Gradio Interface ---
+with gr.Blocks(title="Improved GAIA Agent", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🚀 Improved GAIA Agent - GPT-NeoX-20B + LoRA")
     gr.Markdown("""
+    **Major Improvements:**
+    - 🧠 **GPT-NeoX-20B**: 20B-parameter causal language model (vs. 220M FLAN-T5)
+    - 💾 **4-bit Quantization**: Memory-efficient loading with BitsAndBytes
+    - 🎯 **LoRA**: Parameter-efficient fine-tuning ready
+    - 🔍 **Enhanced Tools**: Multi-strategy search + fact checking + advanced math
+    - 🤖 **Better ReAct**: Improved reasoning prompts and error handling
+    - 📈 **Expected**: Significant improvement over the 0% baseline
 
+    **Requirements**: CUDA GPU with 16GB+ VRAM
     """)
 
     with gr.Row():
 
     with gr.Row():
         run_button = gr.Button(
+            "🚀 Run Improved GAIA Evaluation",
             variant="primary",
             size="lg"
         )
 
     status_output = gr.Textbox(
         label="📊 Evaluation Results",
+        lines=15,
         interactive=False
     )
 
     results_table = gr.DataFrame(
+        label="📝 Detailed Results",
         wrap=True
     )
 
     )
 
 if __name__ == "__main__":
+    print("🚀 Starting Improved GAIA Agent...")
+    print("💪 Using GPT-NeoX-20B + LoRA + 4-bit Quantization")
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,