tatianija committed on
Commit
b0ffe80
·
verified ·
1 Parent(s): 5d98e50

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +495 -0
app.py CHANGED
@@ -1,3 +1,498 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  def _detect_and_process_direct_attachments(self, file_name: str) -> Tuple[List[str], List[str], List[str]]:
2
  """
3
  Detect and process a single attachment directly attached to a question (not as a URL).
 
1
+ import os
2
+ import gradio as gr
3
+ import requests
4
+ import inspect
5
+ import time
6
+ import pandas as pd
7
+ from smolagents import DuckDuckGoSearchTool
8
+ import threading
9
+ from typing import Dict, List, Optional, Tuple, Union
10
+ import json
11
+ from huggingface_hub import InferenceClient
12
+ import base64
13
+ from PIL import Image
14
+ import io
15
+ import tempfile
16
+ import urllib.parse
17
+ from pathlib import Path
18
+ import re
19
+ from bs4 import BeautifulSoup
20
+ import mimetypes
21
+
22
+ # --- Constants ---
23
+ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
24
+
25
+ # --- Global Cache for Answers ---
26
+ cached_answers = {}
27
+ cached_questions = []
28
+ processing_status = {"is_processing": False, "progress": 0, "total": 0}
29
+
30
+ # --- Web Content Fetcher ---
31
class WebContentFetcher:
    """Fetch web pages referenced in questions and reduce them to plain text."""

    def __init__(self, debug: bool = True):
        self.debug = debug
        self.session = requests.Session()
        # Browser-like UA: some sites reject the default requests user agent.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def extract_urls_from_text(self, text: str) -> List[str]:
        """Extract URLs from text using regex."""
        url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
        urls = re.findall(url_pattern, text)
        return list(set(urls))  # Remove duplicates (order is not preserved)

    def _error_result(self, url: str, error_msg: str) -> Dict[str, str]:
        """Build the uniform error payload returned on any fetch failure."""
        if self.debug:
            print(error_msg)
        return {
            'url': url,
            'content_type': 'error',
            'title': f"Error fetching {url}",
            'content': '',
            'error': error_msg
        }

    def fetch_url_content(self, url: str) -> Dict[str, str]:
        """
        Fetch content from a URL and extract text, handling different content types.
        Returns a dictionary with 'content', 'title', 'content_type', and 'error' keys.
        """
        try:
            # Clean the URL
            url = url.strip()
            if not url.startswith(('http://', 'https://')):
                url = 'https://' + url

            if self.debug:
                print(f"Fetching URL: {url}")

            response = self.session.get(url, timeout=30, allow_redirects=True)
            response.raise_for_status()

            content_type = response.headers.get('content-type', '').lower()

            result = {
                'url': url,
                'content_type': content_type,
                'title': '',
                'content': '',
                'error': None
            }

            # Handle different content types
            if 'text/html' in content_type:
                # Parse HTML content
                soup = BeautifulSoup(response.content, 'html.parser')

                # Extract title
                title_tag = soup.find('title')
                result['title'] = title_tag.get_text().strip() if title_tag else 'No title'

                # Remove script and style elements
                for script in soup(["script", "style"]):
                    script.decompose()

                # Extract text content and collapse whitespace
                text_content = soup.get_text()
                lines = (line.strip() for line in text_content.splitlines())
                chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
                text_content = ' '.join(chunk for chunk in chunks if chunk)

                # Limit content length so downstream prompts stay bounded
                if len(text_content) > 8000:
                    text_content = text_content[:8000] + "... (truncated)"

                result['content'] = text_content

            elif 'text/plain' in content_type:
                # Handle plain text
                text_content = response.text
                if len(text_content) > 8000:
                    text_content = text_content[:8000] + "... (truncated)"
                result['content'] = text_content
                result['title'] = f"Text document from {url}"

            elif 'application/json' in content_type:
                # Handle JSON content; pretty-print when it parses
                try:
                    json_data = response.json()
                    result['content'] = json.dumps(json_data, indent=2)[:8000]
                    result['title'] = f"JSON document from {url}"
                except ValueError:
                    # Invalid JSON body despite the content type: keep raw text.
                    # (Was a bare `except:` — narrowed to the parse failure.)
                    result['content'] = response.text[:8000]
                    result['title'] = f"JSON document from {url}"

            elif any(x in content_type for x in ['application/pdf', 'application/msword', 'application/vnd.openxmlformats']):
                # Handle document files
                result['content'] = f"Document file detected ({content_type}). Content extraction for this file type is not implemented."
                result['title'] = f"Document from {url}"

            else:
                # Handle other content types
                if response.text:
                    content = response.text[:8000]
                    result['content'] = content
                    result['title'] = f"Content from {url}"
                else:
                    result['content'] = f"Non-text content detected ({content_type})"
                    result['title'] = f"File from {url}"

            if self.debug:
                print(f"Successfully fetched content from {url}: {len(result['content'])} characters")

            return result

        except requests.exceptions.RequestException as e:
            return self._error_result(url, f"Failed to fetch {url}: {str(e)}")
        except Exception as e:
            return self._error_result(url, f"Unexpected error fetching {url}: {str(e)}")

    def fetch_multiple_urls(self, urls: List[str]) -> List[Dict[str, str]]:
        """Fetch content from multiple URLs (capped at 5, 1s pause between)."""
        results = []
        for url in urls[:5]:  # Limit to 5 URLs to avoid excessive processing
            result = self.fetch_url_content(url)
            results.append(result)
            time.sleep(1)  # Be respectful to servers
        return results
168
+
169
+ # --- File Processing Utility ---
170
def save_attachment_to_file(attachment_data: Union[str, bytes, dict], temp_dir: str, file_name: str = None) -> Optional[str]:
    """
    Save attachment data to a temporary file.

    Args:
        attachment_data: Raw payload. May be a dict (with 'data'/'type'/'name'
            or 'content'/'mime_type'/'filename' keys), a string (base64 or
            plain text), or raw bytes.
        temp_dir: Directory in which to create the file.
        file_name: Preferred file name; a timestamp-based name is generated
            if omitted.

    Returns:
        The local file path if successful, None otherwise.
    """
    try:
        # Determine file name and extension
        if not file_name:
            file_name = f"attachment_{int(time.time())}"

        # Handle different data types
        if isinstance(attachment_data, dict):
            # Handle dict with file data (two known key layouts)
            if 'data' in attachment_data:
                file_data = attachment_data['data']
                file_type = attachment_data.get('type', '').lower()
                original_name = attachment_data.get('name', file_name)
            elif 'content' in attachment_data:
                file_data = attachment_data['content']
                file_type = attachment_data.get('mime_type', '').lower()
                original_name = attachment_data.get('filename', file_name)
            else:
                # Unknown layout: stringify the dict and save that
                file_data = str(attachment_data)
                file_type = ''
                original_name = file_name

            # Prefer the name carried by the attachment itself
            if original_name and original_name != file_name:
                file_name = original_name

        elif isinstance(attachment_data, str):
            # Could be base64 encoded data or plain text
            file_data = attachment_data
            file_type = ''

        elif isinstance(attachment_data, bytes):
            # Binary data
            file_data = attachment_data
            file_type = ''

        else:
            print(f"Unknown attachment data type: {type(attachment_data)}")
            return None

        # Ensure file has an extension (downstream tools route on it)
        if '.' not in file_name:
            # Try to determine extension from the declared MIME type
            if 'image' in file_type:
                if 'jpeg' in file_type or 'jpg' in file_type:
                    file_name += '.jpg'
                elif 'png' in file_type:
                    file_name += '.png'
                else:
                    file_name += '.img'
            elif 'audio' in file_type:
                if 'mp3' in file_type:
                    file_name += '.mp3'
                elif 'wav' in file_type:
                    file_name += '.wav'
                else:
                    file_name += '.audio'
            elif 'python' in file_type or 'text' in file_type:
                file_name += '.py'
            else:
                file_name += '.file'

        file_path = os.path.join(temp_dir, file_name)

        # Save the file
        if isinstance(file_data, str):
            # Heuristic: long strings ending in '=' padding are treated as base64
            try:
                if len(file_data) > 100 and '=' in file_data[-5:]:
                    decoded_data = base64.b64decode(file_data)
                    with open(file_path, 'wb') as f:
                        f.write(decoded_data)
                else:
                    # Plain text
                    with open(file_path, 'w', encoding='utf-8') as f:
                        f.write(file_data)
            except Exception:
                # Decode (or write) failed: fall back to saving as text.
                # (Was a bare `except:` — narrowed so Ctrl-C still propagates.)
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(file_data)
        else:
            # Binary data
            with open(file_path, 'wb') as f:
                f.write(file_data)

        print(f"Saved attachment: {file_path}")
        return file_path

    except Exception as e:
        # Best-effort helper: callers treat None as "no attachment saved"
        print(f"Failed to save attachment: {e}")
        return None
274
+
275
+
276
+
277
+ # --- Code Processing Tool ---
278
class CodeAnalysisTool:
    """LLM-backed analyzer that summarizes a Python source file."""

    def __init__(self, model_name: str = "meta-llama/Llama-3.1-8B-Instruct"):
        # Chat-capable client; "sambanova" provider matches the agent's main client.
        self.client = InferenceClient(model=model_name, provider="sambanova")

    def analyze_code(self, code_path: str) -> str:
        """
        Analyze Python code and return insights.

        Reads the file at code_path, truncates it to 5000 chars, and asks the
        model for a summary. Returns the model's reply, or an error string on
        any failure (file read or API call).
        """
        try:
            with open(code_path, 'r', encoding='utf-8') as f:
                code_content = f.read()

            # Limit code length for analysis (keeps the prompt within budget)
            if len(code_content) > 5000:
                code_content = code_content[:5000] + "\n... (truncated)"

            analysis_prompt = f"""Analyze this Python code and provide a concise summary of:
1. What the code does (main functionality)
2. Key functions/classes
3. Any notable patterns or issues
4. Input/output behavior if applicable

Code:
```python
{code_content}
```







Provide a brief, focused analysis:"""

            # Single-turn chat; low temperature for a focused, stable summary
            messages = [{"role": "user", "content": analysis_prompt}]
            response = self.client.chat_completion(
                messages=messages,
                max_tokens=500,
                temperature=0.3
            )

            return response.choices[0].message.content.strip()

        except Exception as e:
            # Best-effort: callers receive the error text instead of an exception
            return f"Code analysis failed: {e}"
333
+
334
+ # --- Image Processing Tool ---
335
class ImageAnalysisTool:
    """Vision helpers: image captioning/description and printed-text OCR."""

    def __init__(self, model_name: str = "microsoft/Florence-2-large"):
        self.client = InferenceClient(model=model_name)

    def analyze_image(self, image_path: str, prompt: str = "Describe this image in detail") -> str:
        """
        Analyze an image and return a description.

        Tries Florence-2 first, then falls back to BLIP captioning. Returns an
        error string (never raises) when both attempts fail or the file is
        unreadable. NOTE(review): `prompt` is accepted but not forwarded to the
        model — confirm whether callers rely on it.
        """
        # Read the file up front: the original code referenced `image_bytes`
        # in the fallback even when open() had failed (unbound-name bug).
        try:
            with open(image_path, "rb") as f:
                image_bytes = f.read()
        except Exception as e:
            return f"Image analysis failed: {e}"

        try:
            # Use the vision model to analyze the image
            response = self.client.image_to_text(
                image=image_bytes,
                model="microsoft/Florence-2-large"
            )
            return response.get("generated_text", "Could not analyze image")
        except Exception as e:
            try:
                # Fallback: use a different vision model
                response = self.client.image_to_text(
                    image=image_bytes,
                    model="Salesforce/blip-image-captioning-large"
                )
                return response.get("generated_text", f"Image analysis error: {e}")
            except Exception:
                # Report the *first* failure, as the original did
                return f"Image analysis failed: {e}"

    def extract_text_from_image(self, image_path: str) -> str:
        """
        Extract text from an image using OCR.

        Returns the recognized text, or an error string on any failure.
        """
        try:
            with open(image_path, "rb") as f:
                image_bytes = f.read()

            # Use an OCR model (printed text)
            response = self.client.image_to_text(
                image=image_bytes,
                model="microsoft/trocr-base-printed"
            )

            return response.get("generated_text", "No text found in image")

        except Exception as e:
            return f"OCR failed: {e}"
385
+
386
+ # --- Audio Processing Tool ---
387
class AudioTranscriptionTool:
    """Speech-to-text helper with a Whisper primary and wav2vec2 fallback."""

    def __init__(self, model_name: str = "openai/whisper-large-v3"):
        self.client = InferenceClient(model=model_name)

    def transcribe_audio(self, audio_path: str) -> str:
        """
        Transcribe audio file to text.

        Returns the transcription, or an error string (never raises) when the
        file is unreadable or both ASR attempts fail.
        """
        # Read the file up front: the original code referenced `audio_bytes`
        # in the fallback even when open() had failed (unbound-name bug).
        try:
            with open(audio_path, "rb") as f:
                audio_bytes = f.read()
        except Exception as e:
            return f"Audio transcription failed: {e}"

        try:
            # Use Whisper for transcription (client's default model)
            response = self.client.automatic_speech_recognition(
                audio=audio_bytes
            )
            return response.get("text", "Could not transcribe audio")
        except Exception as e:
            try:
                # Fallback to a different ASR model
                response = self.client.automatic_speech_recognition(
                    audio=audio_bytes,
                    model="facebook/wav2vec2-large-960h-lv60-self"
                )
                return response.get("text", f"Audio transcription error: {e}")
            except Exception:
                # Report the *first* failure, as the original did
                return f"Audio transcription failed: {e}"
416
+
417
+ # --- Enhanced Intelligent Agent with Direct Attachment Processing ---
418
+ class IntelligentAgent:
419
+ def __init__(self, debug: bool = True, model_name: str = "meta-llama/Llama-3.1-8B-Instruct"):
420
+ self.search = DuckDuckGoSearchTool()
421
+ self.client = InferenceClient(model=model_name, provider="sambanova")
422
+ self.image_tool = ImageAnalysisTool()
423
+ self.audio_tool = AudioTranscriptionTool()
424
+ self.code_tool = CodeAnalysisTool(model_name)
425
+ self.web_fetcher = WebContentFetcher(debug)
426
+ self.debug = debug
427
+ if self.debug:
428
+ print(f"IntelligentAgent initialized with model: {model_name}")
429
+
430
+ def _chat_completion(self, prompt: str, max_tokens: int = 500, temperature: float = 0.3) -> str:
431
+ """
432
+ Use chat completion instead of text generation to avoid provider compatibility issues.
433
+ """
434
+ try:
435
+ messages = [{"role": "user", "content": prompt}]
436
+
437
+ # Try chat completion first
438
+ try:
439
+ response = self.client.chat_completion(
440
+ messages=messages,
441
+ max_tokens=max_tokens,
442
+ temperature=temperature
443
+ )
444
+ return response.choices[0].message.content.strip()
445
+ except Exception as chat_error:
446
+ if self.debug:
447
+ print(f"Chat completion failed: {chat_error}, trying text generation...")
448
+
449
+ # Fallback to text generation
450
+ response = self.client.conversational(
451
+ prompt,
452
+ max_new_tokens=max_tokens,
453
+ temperature=temperature,
454
+ do_sample=temperature > 0
455
+ )
456
+ return response.strip()
457
+
458
+ except Exception as e:
459
+ if self.debug:
460
+ print(f"Both chat completion and text generation failed: {e}")
461
+ raise e
462
+
463
+ def _extract_and_process_urls(self, question_text: str) -> str:
464
+ """
465
+ Extract URLs from question text and fetch their content.
466
+ Returns formatted content from all URLs.
467
+ """
468
+ urls = self.web_fetcher.extract_urls_from_text(question_text)
469
+
470
+ if not urls:
471
+ return ""
472
+
473
+ if self.debug:
474
+ print(f"...Found {len(urls)} URLs in question: {urls}")
475
+
476
+ url_contents = self.web_fetcher.fetch_multiple_urls(urls)
477
+
478
+ if not url_contents:
479
+ return ""
480
+
481
+ # Format the content
482
+ formatted_content = []
483
+ for content_data in url_contents:
484
+ if content_data['error']:
485
+ formatted_content.append(f"URL: {content_data['url']}\nError: {content_data['error']}")
486
+ else:
487
+ formatted_content.append(
488
+ f"URL: {content_data['url']}\n"
489
+ f"Title: {content_data['title']}\n"
490
+ f"Content Type: {content_data['content_type']}\n"
491
+ f"Content: {content_data['content']}"
492
+ )
493
+
494
+ return "\n\n" + "="*50 + "\n".join(formatted_content) + "\n" + "="*50
495
+
496
  def _detect_and_process_direct_attachments(self, file_name: str) -> Tuple[List[str], List[str], List[str]]:
497
  """
498
  Detect and process a single attachment directly attached to a question (not as a URL).