Final_Assignment_Template

Sleeping

App Files Files Community

tatianija committed on Jun 27

Commit

958c53e

verified ·

1 Parent(s): 48cda19

Update app.py

Browse files

Files changed (1) hide show

app.py +225 -28

app.py CHANGED Viewed

@@ -15,6 +15,9 @@ import io
 import tempfile
 import urllib.parse
 from pathlib import Path
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
@@ -24,6 +27,145 @@ cached_answers = {}
 cached_questions = []
 processing_status = {"is_processing": False, "progress": 0, "total": 0}
 # --- File Download Utility ---
 def download_attachment(url: str, temp_dir: str) -> Optional[str]:
     """
@@ -197,7 +339,7 @@ class AudioTranscriptionTool:
             except:
                 return f"Audio transcription failed: {e}"
-# --- Enhanced Intelligent Agent with Media Processing ---
 class IntelligentAgent:
     def __init__(self, debug: bool = True, model_name: str = "meta-llama/Llama-3.1-8B-Instruct"):
         self.search = DuckDuckGoSearchTool()
@@ -205,6 +347,7 @@ class IntelligentAgent:
         self.image_tool = ImageAnalysisTool()
         self.audio_tool = AudioTranscriptionTool()
         self.code_tool = CodeAnalysisTool(model_name)
         self.debug = debug
         if self.debug:
             print(f"IntelligentAgent initialized with model: {model_name}")
@@ -242,6 +385,39 @@ class IntelligentAgent:
                 print(f"Both chat completion and text generation failed: {e}")
             raise e
     def _detect_and_download_attachments(self, question_data: dict) -> Tuple[List[str], List[str], List[str]]:
         """
         Detect and download attachments from question data.
@@ -268,12 +444,17 @@ class IntelligentAgent:
                 elif isinstance(field_data, str):
                     attachments.append(field_data)
-        # Also check if the question text contains URLs
         question_text = question_data.get('question', '')
         if 'http' in question_text:
-            import re
             urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', question_text)
-            attachments.extend(urls)
         # Download and categorize attachments
         for attachment in attachments:
@@ -376,9 +557,9 @@ class IntelligentAgent:
         return "\n\n".join(attachment_content) if attachment_content else ""
-    def _should_search(self, question: str, attachment_context: str = "") -> bool:
         """
-        Use LLM to determine if search is needed for the question, considering attachment context.
         Returns True if search is recommended, False otherwise.
         """
         decision_prompt = f"""Analyze this question and decide if it requires real-time information, recent data, or specific facts that might not be in your training data.
@@ -400,19 +581,22 @@ SEARCH IS NOT NEEDED for:
 - How-to instructions for common tasks
 - Creative writing or opinion-based responses
 - Questions that can be answered from attached files (code, images, audio)
 - Code analysis, debugging, or explanation questions
-- Questions about uploaded content
 Question: "{question}"
 {f"Attachment Context Available: {attachment_context[:500]}..." if attachment_context else "No attachment context available."}
 Respond with only "SEARCH" or "NO_SEARCH" followed by a brief reason (max 20 words).
 Example responses:
 - "SEARCH - Current weather data needed"
 - "NO_SEARCH - Mathematical concept, general knowledge sufficient"
-- "NO_SEARCH - Can be answered from attached code/image content"
 """
         try:
@@ -429,15 +613,23 @@ Example responses:
         except Exception as e:
             if self.debug:
-                print(f"Error in search decision: {e}, defaulting to no search for attachment questions")
-            # Default to no search if decision fails and there are attachments
-            return len(attachment_context) == 0
-    def _answer_with_llm(self, question: str, attachment_context: str = "") -> str:
         """
-        Generate answer using LLM without search, considering attachment context.
         """
-        context_section = f"\n\nAttachment Context:\n{attachment_context}" if attachment_context else ""
         answer_prompt = f"""You are a general AI assistant. I will ask you a question.
         YOUR ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
@@ -456,9 +648,9 @@ Answer:"""
         except Exception as e:
             return f"Sorry, I encountered an error generating the response: {e}"
-    def _answer_with_search(self, question: str, attachment_context: str = "") -> str:
         """
-        Generate answer using search results and LLM, considering attachment context.
         """
         try:
             # Perform search
@@ -469,7 +661,7 @@ Answer:"""
                 print(f"Search results type: {type(search_results)}")
             if not search_results:
-                return "No search results found. Let me try to answer based on my knowledge:\n\n" + self._answer_with_llm(question, attachment_context)
             # Format search results - handle different result formats
             if isinstance(search_results, str):
@@ -490,12 +682,20 @@ Answer:"""
                 search_context = "\n\n".join(formatted_results)
-            # Generate answer using search context and attachment context
-            context_section = f"\n\nAttachment Context:\n{attachment_context}" if attachment_context else ""
             answer_prompt = f"""You are a general AI assistant. I will ask you a question.
-            Based on the search results and the context section below, provide an answer to the question.
-            If the search results don't fully answer the question, you can supplement with your general knowledge.
             Your ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
             Do not add dot if your answer is a number.
             If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
@@ -505,10 +705,7 @@ Answer:"""
 Question: {question}
-Search Results:
-{search_context}
-{context_section}
 Answer:"""
@@ -538,16 +735,16 @@ Answer:"""
                     return "Search completed but no usable results found."
         except Exception as e:
-            return f"Search failed: {e}. Let me try to answer based on my knowledge:\n\n" + self._answer_with_llm(question, attachment_context)
     def process_question_with_attachments(self, question_data: dict) -> str:
         """
-        Process a question that may have attachments.
         """
         question_text = question_data.get('question', '')
         if self.debug:
-            print(f"Processing question with potential attachments: {question_text[:100]}...")
         try:
             # Detect and download attachments

 import tempfile
 import urllib.parse
 from pathlib import Path
+import re
+from bs4 import BeautifulSoup
+import mimetypes
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 cached_questions = []
 processing_status = {"is_processing": False, "progress": 0, "total": 0}
+# --- Web Content Fetcher ---
+class WebContentFetcher:
+    """Fetch web resources referenced in a question and reduce them to text.
+    A single ``requests.Session`` with a browser-like User-Agent is reused for
+    every request. Fetch failures are reported through the ``error`` key of the
+    returned dictionary instead of being raised.
+    """
+    # Pattern used to spot http(s) URLs inside free text.
+    URL_PATTERN = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
+    # Characters the pattern accepts but that normally end a sentence, not a URL.
+    TRAILING_PUNCTUATION = ".,;:!?'"
+    MAX_CONTENT_CHARS = 8000  # upper bound on extracted text per URL
+    MAX_URLS = 5  # upper bound on URLs fetched per call
+    REQUEST_TIMEOUT = 30  # seconds, passed to session.get
+    REQUEST_DELAY = 1  # seconds to wait between consecutive requests
+    def __init__(self, debug: bool = True):
+        """Create the HTTP session; ``debug`` enables progress printing."""
+        self.debug = debug
+        self.session = requests.Session()
+        self.session.headers.update({
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        })
+    def _trim_trailing_punctuation(self, url: str) -> str:
+        """Drop sentence punctuation and unbalanced ')' captured after a URL."""
+        while url:
+            trimmed = url.rstrip(self.TRAILING_PUNCTUATION)
+            # Keep ')' that closes a '(' inside the URL (e.g. wiki titles).
+            if trimmed.endswith(')') and trimmed.count(')') > trimmed.count('('):
+                trimmed = trimmed[:-1]
+            if trimmed == url:
+                break
+            url = trimmed
+        return url
+    def _truncate(self, text: str, mark: bool = True) -> str:
+        """Cap ``text`` at MAX_CONTENT_CHARS, optionally appending a marker."""
+        if len(text) <= self.MAX_CONTENT_CHARS:
+            return text
+        suffix = "... (truncated)" if mark else ""
+        return text[:self.MAX_CONTENT_CHARS] + suffix
+    def _error_result(self, url: str, error_msg: str) -> Dict[str, Optional[str]]:
+        """Print (in debug mode) and build the result dictionary for a failed fetch."""
+        if self.debug:
+            print(error_msg)
+        return {
+            'url': url,
+            'content_type': 'error',
+            'title': f"Error fetching {url}",
+            'content': '',
+            'error': error_msg
+        }
+    def extract_urls_from_text(self, text: str) -> List[str]:
+        """Extract unique URLs from text, in order of first appearance.
+        Trailing sentence punctuation matched by the pattern is removed so that
+        "see https://example.com/page." yields "https://example.com/page".
+        Returns an empty list for empty or missing text.
+        """
+        if not text:
+            return []
+        candidates = (self._trim_trailing_punctuation(match) for match in re.findall(self.URL_PATTERN, text))
+        # dict.fromkeys de-duplicates while preserving order, so a later cap on
+        # the number of URLs always keeps the ones mentioned first.
+        return list(dict.fromkeys(url for url in candidates if url))
+    def fetch_url_content(self, url: str) -> Dict[str, Optional[str]]:
+        """
+        Fetch content from a URL and extract text, handling different content types.
+        Returns a dictionary with 'url', 'content', 'title', 'content_type', and
+        'error' keys. 'error' is None on success and a message on failure.
+        """
+        try:
+            # Clean the URL and default to https when no scheme is given
+            url = url.strip()
+            if not url.startswith(('http://', 'https://')):
+                url = 'https://' + url
+            if self.debug:
+                print(f"Fetching URL: {url}")
+            response = self.session.get(url, timeout=self.REQUEST_TIMEOUT, allow_redirects=True)
+            response.raise_for_status()
+            content_type = response.headers.get('content-type', '').lower()
+            result = {
+                'url': url,
+                'content_type': content_type,
+                'title': '',
+                'content': '',
+                'error': None
+            }
+            if 'text/html' in content_type:
+                soup = BeautifulSoup(response.content, 'html.parser')
+                title_tag = soup.find('title')
+                result['title'] = title_tag.get_text().strip() if title_tag else 'No title'
+                # Script and style bodies are not readable page text
+                for tag in soup(["script", "style"]):
+                    tag.decompose()
+                # Collapse the page text into single-space separated chunks
+                lines = (line.strip() for line in soup.get_text().splitlines())
+                chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+                result['content'] = self._truncate(' '.join(chunk for chunk in chunks if chunk))
+            elif 'text/plain' in content_type:
+                result['content'] = self._truncate(response.text)
+                result['title'] = f"Text document from {url}"
+            elif 'application/json' in content_type:
+                result['title'] = f"JSON document from {url}"
+                try:
+                    result['content'] = self._truncate(json.dumps(response.json(), indent=2), mark=False)
+                except ValueError:
+                    # Body is not valid JSON despite the header; keep the raw text
+                    result['content'] = self._truncate(response.text, mark=False)
+            elif any(x in content_type for x in ['application/pdf', 'application/msword', 'application/vnd.openxmlformats']):
+                result['content'] = f"Document file detected ({content_type}). Content extraction for this file type is not implemented."
+                result['title'] = f"Document from {url}"
+            elif response.text:
+                result['content'] = self._truncate(response.text, mark=False)
+                result['title'] = f"Content from {url}"
+            else:
+                result['content'] = f"Non-text content detected ({content_type})"
+                result['title'] = f"File from {url}"
+            if self.debug:
+                print(f"Successfully fetched content from {url}: {len(result['content'])} characters")
+            return result
+        except requests.exceptions.RequestException as e:
+            return self._error_result(url, f"Failed to fetch {url}: {str(e)}")
+        except Exception as e:
+            # Best-effort fetcher: parsing problems are reported, not raised
+            return self._error_result(url, f"Unexpected error fetching {url}: {str(e)}")
+    def fetch_multiple_urls(self, urls: List[str]) -> List[Dict[str, Optional[str]]]:
+        """Fetch content from up to MAX_URLS URLs, pausing between requests."""
+        results = []
+        for index, url in enumerate(urls[:self.MAX_URLS]):
+            if index:
+                time.sleep(self.REQUEST_DELAY)  # Be respectful to servers; no wait after the last URL
+            results.append(self.fetch_url_content(url))
+        return results
 # --- File Download Utility ---
 def download_attachment(url: str, temp_dir: str) -> Optional[str]:
     """
             except:
                 return f"Audio transcription failed: {e}"
+# --- Enhanced Intelligent Agent with URL Processing ---
 class IntelligentAgent:
     def __init__(self, debug: bool = True, model_name: str = "meta-llama/Llama-3.1-8B-Instruct"):
         self.search = DuckDuckGoSearchTool()
         self.image_tool = ImageAnalysisTool()
         self.audio_tool = AudioTranscriptionTool()
         self.code_tool = CodeAnalysisTool(model_name)
+        self.web_fetcher = WebContentFetcher(debug)
         self.debug = debug
         if self.debug:
             print(f"IntelligentAgent initialized with model: {model_name}")
                 print(f"Both chat completion and text generation failed: {e}")
             raise e
+    def _extract_and_process_urls(self, question_text: str) -> str:
+        """
+        Extract URLs from question text and fetch their content.
+        Returns one formatted string with a section per URL, framed and
+        separated by rule lines of 50 '=' characters, or "" when the text
+        contains no URLs or nothing was fetched.
+        """
+        urls = self.web_fetcher.extract_urls_from_text(question_text)
+        if not urls:
+            return ""
+        if self.debug:
+            print(f"Found {len(urls)} URLs in question: {urls}")
+        url_contents = self.web_fetcher.fetch_multiple_urls(urls)
+        if not url_contents:
+            return ""
+        # Format the content: failed fetches show the error, others the text
+        formatted_content = []
+        for content_data in url_contents:
+            if content_data['error']:
+                formatted_content.append(f"URL: {content_data['url']}\nError: {content_data['error']}")
+            else:
+                formatted_content.append(
+                    f"URL: {content_data['url']}\n"
+                    f"Title: {content_data['title']}\n"
+                    f"Content Type: {content_data['content_type']}\n"
+                    f"Content: {content_data['content']}"
+                )
+        # Build the joiner explicitly: "="*50 + "\n".join(...) joins entries
+        # with a bare newline and glues the rule onto the first entry.
+        separator = "=" * 50
+        body = f"\n{separator}\n".join(formatted_content)
+        return f"\n\n{separator}\n{body}\n{separator}"
     def _detect_and_download_attachments(self, question_data: dict) -> Tuple[List[str], List[str], List[str]]:
         """
         Detect and download attachments from question data.
                 elif isinstance(field_data, str):
                     attachments.append(field_data)
+        # Also check if the question text contains file URLs (not web URLs)
         question_text = question_data.get('question', '')
         if 'http' in question_text:
+            # Only consider URLs that likely point to files, not web pages
             urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', question_text)
+            for url in urls:
+                # Check if URL likely points to a file (has file extension)
+                parsed = urllib.parse.urlparse(url)
+                path = parsed.path.lower()
+                if any(path.endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.gif', '.mp3', '.wav', '.py', '.txt', '.pdf']):
+                    attachments.append(url)
         # Download and categorize attachments
         for attachment in attachments:
         return "\n\n".join(attachment_content) if attachment_content else ""
+    def _should_search(self, question: str, attachment_context: str = "", url_context: str = "") -> bool:
         """
+        Use LLM to determine if search is needed for the question, considering attachment and URL context.
         Returns True if search is recommended, False otherwise.
         """
         decision_prompt = f"""Analyze this question and decide if it requires real-time information, recent data, or specific facts that might not be in your training data.
 - How-to instructions for common tasks
 - Creative writing or opinion-based responses
 - Questions that can be answered from attached files (code, images, audio)
+- Questions that can be answered from URL content provided
 - Code analysis, debugging, or explanation questions
+- Questions about uploaded or linked content
 Question: "{question}"
 {f"Attachment Context Available: {attachment_context[:500]}..." if attachment_context else "No attachment context available."}
+{f"URL Content Available: {url_context[:500]}..." if url_context else "No URL content available."}
 Respond with only "SEARCH" or "NO_SEARCH" followed by a brief reason (max 20 words).
 Example responses:
 - "SEARCH - Current weather data needed"
 - "NO_SEARCH - Mathematical concept, general knowledge sufficient"
+- "NO_SEARCH - Can be answered from attached code/image/URL content"
 """
         try:
         except Exception as e:
             if self.debug:
+                print(f"Error in search decision: {e}, defaulting to no search for questions with context")
+            # Default to no search if decision fails and there is context available
+            return len(attachment_context) == 0 and len(url_context) == 0
+    def _answer_with_llm(self, question: str, attachment_context: str = "", url_context: str = "") -> str:
         """
+        Generate answer using LLM without search, considering attachment and URL context.
         """
+        context_sections = []
+        if attachment_context:
+            context_sections.append(f"Attachment Context:\n{attachment_context}")
+        if url_context:
+            context_sections.append(f"URL Content:\n{url_context}")
+        context_section = "\n\n".join(context_sections) if context_sections else ""
         answer_prompt = f"""You are a general AI assistant. I will ask you a question.
         YOUR ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
         except Exception as e:
             return f"Sorry, I encountered an error generating the response: {e}"
+    def _answer_with_search(self, question: str, attachment_context: str = "", url_context: str = "") -> str:
         """
+        Generate answer using search results and LLM, considering attachment and URL context.
         """
         try:
             # Perform search
                 print(f"Search results type: {type(search_results)}")
             if not search_results:
+                return "No search results found. Let me try to answer based on my knowledge:\n\n" + self._answer_with_llm(question, attachment_context, url_context)
             # Format search results - handle different result formats
             if isinstance(search_results, str):
                 search_context = "\n\n".join(formatted_results)
+            # Generate answer using search context, attachment context, and URL context
+            context_sections = [f"Search Results:\n{search_context}"]
+            if attachment_context:
+                context_sections.append(f"Attachment Context:\n{attachment_context}")
+            if url_context:
+                context_sections.append(f"URL Content:\n{url_context}")
+            full_context = "\n\n".join(context_sections)
             answer_prompt = f"""You are a general AI assistant. I will ask you a question.
+            Based on the search results and the context sections below, provide an answer to the question.
+            If the search results don't fully answer the question, you can supplement with information from other context sections or your general knowledge.
             Your ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
             Do not add dot if your answer is a number.
             If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
 Question: {question}
+{full_context}
 Answer:"""
                     return "Search completed but no usable results found."
         except Exception as e:
+            return f"Search failed: {e}. Let me try to answer based on my knowledge:\n\n" + self._answer_with_llm(question, attachment_context, url_context)
     def process_question_with_attachments(self, question_data: dict) -> str:
         """
+        Process a question that may have attachments and URLs.
         """
         question_text = question_data.get('question', '')
         if self.debug:
+            print(f"Processing question with potential attachments and URLs: {question_text[:100]}...")
         try:
             # Detect and download attachments