Final_Assignment_Template

Sleeping

App Files Files Community

tatianija committed on Jun 26

Commit

430ca10

verified ·

1 Parent(s): 718ab42

Update app.py

Browse files

Files changed (1) hide show

app.py +130 -4

app.py CHANGED Viewed

@@ -6,9 +6,12 @@ import time
 import pandas as pd
 from smolagents import DuckDuckGoSearchTool
 import threading
-from typing import Dict, List, Optional, Tuple
 import json
 from huggingface_hub import InferenceClient
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
@@ -18,12 +21,96 @@ cached_answers = {}
 cached_questions = []
 processing_status = {"is_processing": False, "progress": 0, "total": 0}
-# --- Intelligent Agent with Conditional Search ---
 class IntelligentAgent:
     def __init__(self, debug: bool = True, model_name: str = "meta-llama/Llama-3.1-8B-Instruct"):
         self.search = DuckDuckGoSearchTool()
-        self.client = InferenceClient(model=model_name,
-                                      provider = "sambanova")
         self.debug = debug
         if self.debug:
             print(f"IntelligentAgent initialized with model: {model_name}")
@@ -61,6 +148,45 @@ class IntelligentAgent:
                 print(f"Both chat completion and text generation failed: {e}")
             raise e
     def _should_search(self, question: str) -> bool:
         """
         Use LLM to determine if search is needed for the question.

 import pandas as pd
 from smolagents import DuckDuckGoSearchTool
 import threading
+from typing import Dict, List, Optional, Tuple, Union
 import json
 from huggingface_hub import InferenceClient
+import base64
+from PIL import Image
+import io
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 cached_questions = []
 processing_status = {"is_processing": False, "progress": 0, "total": 0}
+# --- Image Processing Tool ---
+class ImageAnalysisTool:
+    """Caption images and read text from them through ``InferenceClient``.
+
+    ``model_name`` selects the primary captioning model. A fixed fallback
+    captioning model and a fixed OCR model are used for the other calls.
+    Every public method returns a string and reports failures as text
+    instead of raising.
+    """
+    FALLBACK_CAPTION_MODEL = "Salesforce/blip-image-captioning-large"
+    OCR_MODEL = "microsoft/trocr-base-printed"
+    def __init__(self, model_name: str = "microsoft/Florence-2-large"):
+        # Keep the configured name so analyze_image() honours it rather than
+        # a hard-coded model id.
+        self.model_name = model_name
+        self.client = InferenceClient(model=model_name)
+    @staticmethod
+    def _generated_text(response, default: str) -> str:
+        """Return the generated text carried by an ``image_to_text`` response.
+
+        Accepts a plain string, an object exposing ``generated_text``, or a
+        mapping with a ``"generated_text"`` key; falls back to ``default``
+        when no text is present.
+        """
+        # NOTE(review): some huggingface_hub releases appear to return a bare
+        # string here instead of an output object - confirm for the pinned version.
+        if isinstance(response, str):
+            return response
+        text = getattr(response, "generated_text", None)
+        if text is None and isinstance(response, dict):
+            text = response.get("generated_text")
+        return default if text is None else text
+    def analyze_image(self, image_path: str, prompt: str = "Describe this image in detail") -> str:
+        """
+        Analyze an image and return a description.
+
+        Args:
+            image_path: Path of the image file to read.
+            prompt: Accepted for interface compatibility; it is not forwarded
+                to the model because ``image_to_text`` is called without it.
+
+        Returns:
+            The generated description, or a message starting with
+            ``"Image analysis failed:"`` when the file cannot be read or
+            both the primary and the fallback model fail.
+        """
+        # Read the file first: if this fails there is nothing to send to the
+        # fallback model either, so report the failure straight away.
+        try:
+            with open(image_path, "rb") as f:
+                image_bytes = f.read()
+        except (OSError, TypeError, ValueError) as e:
+            return f"Image analysis failed: {e}"
+        try:
+            response = self.client.image_to_text(
+                image=image_bytes,
+                model=self.model_name
+            )
+            return self._generated_text(response, "Could not analyze image")
+        except Exception as e:
+            try:
+                # Fallback: use a different vision model
+                response = self.client.image_to_text(
+                    image=image_bytes,
+                    model=self.FALLBACK_CAPTION_MODEL
+                )
+                return self._generated_text(response, f"Image analysis error: {e}")
+            except Exception:
+                # Report the primary error; it is the more informative one.
+                return f"Image analysis failed: {e}"
+    def extract_text_from_image(self, image_path: str) -> str:
+        """
+        Extract text from an image using OCR.
+
+        Returns:
+            The recognised text, ``"No text found in image"`` when the
+            response carries none, or ``"OCR failed: ..."`` on any error.
+        """
+        try:
+            with open(image_path, "rb") as f:
+                image_bytes = f.read()
+            # Use an OCR model
+            response = self.client.image_to_text(
+                image=image_bytes,
+                model=self.OCR_MODEL
+            )
+            return self._generated_text(response, "No text found in image")
+        except Exception as e:
+            return f"OCR failed: {e}"
+# --- Audio Processing Tool ---
+class AudioTranscriptionTool:
+    """Transcribe audio files to text through ``InferenceClient``.
+
+    The client is created for ``model_name``; a fixed second model is tried
+    when the first call fails. Failures are reported as text, not raised.
+    """
+    FALLBACK_ASR_MODEL = "facebook/wav2vec2-large-960h-lv60-self"
+    def __init__(self, model_name: str = "openai/whisper-large-v3"):
+        self.client = InferenceClient(model=model_name)
+    @staticmethod
+    def _transcript(response, default: str) -> str:
+        """Return the transcript carried by a speech-recognition response.
+
+        Accepts a plain string, an object exposing ``text``, or a mapping
+        with a ``"text"`` key; falls back to ``default`` when none is present.
+        """
+        # NOTE(review): some huggingface_hub releases appear to return a bare
+        # string here instead of an output object - confirm for the pinned version.
+        if isinstance(response, str):
+            return response
+        text = getattr(response, "text", None)
+        if text is None and isinstance(response, dict):
+            text = response.get("text")
+        return default if text is None else text
+    def transcribe_audio(self, audio_path: str) -> str:
+        """
+        Transcribe audio file to text.
+
+        Args:
+            audio_path: Path of the audio file to read.
+
+        Returns:
+            The transcript, or a message starting with
+            ``"Audio transcription failed:"`` when the file cannot be read
+            or both the primary and the fallback model fail.
+        """
+        # Read the file first: without the bytes the fallback model has
+        # nothing to work on, so report the failure straight away.
+        try:
+            with open(audio_path, "rb") as f:
+                audio_bytes = f.read()
+        except (OSError, TypeError, ValueError) as e:
+            return f"Audio transcription failed: {e}"
+        try:
+            # Uses the model the client was created with
+            response = self.client.automatic_speech_recognition(
+                audio=audio_bytes
+            )
+            return self._transcript(response, "Could not transcribe audio")
+        except Exception as e:
+            try:
+                # Fallback to a different ASR model
+                response = self.client.automatic_speech_recognition(
+                    audio=audio_bytes,
+                    model=self.FALLBACK_ASR_MODEL
+                )
+                return self._transcript(response, f"Audio transcription error: {e}")
+            except Exception:
+                # Report the primary error; it is the more informative one.
+                return f"Audio transcription failed: {e}"
+# --- Enhanced Intelligent Agent with Media Processing ---
 class IntelligentAgent:
     def __init__(self, debug: bool = True, model_name: str = "meta-llama/Llama-3.1-8B-Instruct"):
         self.search = DuckDuckGoSearchTool()
+        self.client = InferenceClient(model=model_name, provider="sambanova")
+        self.image_tool = ImageAnalysisTool()
+        self.audio_tool = AudioTranscriptionTool()
         self.debug = debug
         if self.debug:
             print(f"IntelligentAgent initialized with model: {model_name}")
                 print(f"Both chat completion and text generation failed: {e}")
             raise e
+    def _process_media_files(self, image_files: List[str] = None, audio_files: List[str] = None) -> str:
+        """
+        Turn attached image and audio files into one block of text.
+
+        Images are handled first, then audio. Entries that are empty or do
+        not exist on disk are skipped. Each image contributes its analysis
+        and, when text was extracted, that text; each audio file contributes
+        its transcription. An exception raised while handling a file is
+        recorded as an error entry. Entries are joined by blank lines; the
+        result is an empty string when nothing was produced.
+        """
+        sections = []
+        # Images: description first, then any text read from the image
+        for path in image_files or []:
+            if not path or not os.path.exists(path):
+                continue
+            try:
+                caption = self.image_tool.analyze_image(path)
+                sections.append(f"Image Analysis: {caption}")
+                ocr_text = self.image_tool.extract_text_from_image(path)
+                if ocr_text and "No text found" not in ocr_text:
+                    sections.append(f"Text from Image: {ocr_text}")
+            except Exception as err:
+                sections.append(f"Error processing image {path}: {err}")
+        # Audio: one transcription entry per file
+        for path in audio_files or []:
+            if not path or not os.path.exists(path):
+                continue
+            try:
+                transcript = self.audio_tool.transcribe_audio(path)
+                sections.append(f"Audio Transcription: {transcript}")
+            except Exception as err:
+                sections.append(f"Error processing audio {path}: {err}")
+        # Joining an empty list already yields ""
+        return "\n\n".join(sections)
+    def _should_search(self, question: str, media_context: str
     def _should_search(self, question: str) -> bool:
         """
         Use LLM to determine if search is needed for the question.