tatianija committed on
Commit
430ca10
·
verified ·
1 Parent(s): 718ab42

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +130 -4
app.py CHANGED
@@ -6,9 +6,12 @@ import time
6
  import pandas as pd
7
  from smolagents import DuckDuckGoSearchTool
8
  import threading
9
- from typing import Dict, List, Optional, Tuple
10
  import json
11
  from huggingface_hub import InferenceClient
 
 
 
12
 
13
  # --- Constants ---
14
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
@@ -18,12 +21,96 @@ cached_answers = {}
18
  cached_questions = []
19
  processing_status = {"is_processing": False, "progress": 0, "total": 0}
20
 
21
- # --- Intelligent Agent with Conditional Search ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  class IntelligentAgent:
23
  def __init__(self, debug: bool = True, model_name: str = "meta-llama/Llama-3.1-8B-Instruct"):
24
  self.search = DuckDuckGoSearchTool()
25
- self.client = InferenceClient(model=model_name,
26
- provider = "sambanova")
 
27
  self.debug = debug
28
  if self.debug:
29
  print(f"IntelligentAgent initialized with model: {model_name}")
@@ -61,6 +148,45 @@ class IntelligentAgent:
61
  print(f"Both chat completion and text generation failed: {e}")
62
  raise e
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  def _should_search(self, question: str) -> bool:
65
  """
66
  Use LLM to determine if search is needed for the question.
 
6
  import pandas as pd
7
  from smolagents import DuckDuckGoSearchTool
8
  import threading
9
+ from typing import Dict, List, Optional, Tuple, Union
10
  import json
11
  from huggingface_hub import InferenceClient
12
+ import base64
13
+ from PIL import Image
14
+ import io
15
 
16
  # --- Constants ---
17
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
21
  cached_questions = []
22
  processing_status = {"is_processing": False, "progress": 0, "total": 0}
23
 
24
+ # --- Image Processing Tool ---
25
class ImageAnalysisTool:
    """Caption images and read printed text from them through an InferenceClient.

    Both public methods always return a string and never raise: failures are
    reported as human-readable messages so the result can be embedded directly
    in a prompt.
    """

    # Captioning model tried when the primary model call fails.
    FALLBACK_CAPTION_MODEL = "Salesforce/blip-image-captioning-large"
    # Model used by extract_text_from_image.
    OCR_MODEL = "microsoft/trocr-base-printed"

    def __init__(self, model_name: str = "microsoft/Florence-2-large"):
        """Create the inference client.

        Args:
            model_name: Primary captioning model. It is remembered so that
                analyze_image uses the model the caller asked for.
        """
        self.model_name = model_name
        self.client = InferenceClient(model=model_name)

    @staticmethod
    def _generated_text(response, default: str) -> str:
        """Pull the generated text out of an image-to-text response.

        Accepts a plain string, a dict-like object, or an object exposing a
        ``generated_text`` attribute.
        NOTE(review): which of these shapes is returned depends on the
        installed huggingface_hub version - confirm against the pinned release.
        """
        if isinstance(response, str):
            return response
        if hasattr(response, "get"):
            return response.get("generated_text", default)
        return getattr(response, "generated_text", default)

    def analyze_image(self, image_path: str, prompt: str = "Describe this image in detail") -> str:
        """Analyze an image and return a description.

        Args:
            image_path: Path of the image file to read.
            prompt: Kept for interface compatibility; it is not forwarded to
                the model call.

        Returns:
            The generated caption, or a message starting with
            "Image analysis failed:" when the file cannot be read or both the
            primary and the fallback model fail.
        """
        # Read the file on its own: if this fails there is nothing to send to
        # either model, so the fallback must not be attempted.
        try:
            with open(image_path, "rb") as f:
                image_bytes = f.read()
        except OSError as e:
            return f"Image analysis failed: {e}"

        try:
            response = self.client.image_to_text(
                image=image_bytes,
                model=self.model_name,
            )
            return self._generated_text(response, "Could not analyze image")
        except Exception as e:
            try:
                # Fallback: use a different vision model.
                response = self.client.image_to_text(
                    image=image_bytes,
                    model=self.FALLBACK_CAPTION_MODEL,
                )
                return self._generated_text(response, f"Image analysis error: {e}")
            except Exception:
                # Report the primary failure; it is the more relevant error.
                return f"Image analysis failed: {e}"

    def extract_text_from_image(self, image_path: str) -> str:
        """Extract text from an image using OCR.

        Args:
            image_path: Path of the image file to read.

        Returns:
            The recognised text, "No text found in image" when the response
            carries no text, or a message starting with "OCR failed:" on any
            error.
        """
        try:
            with open(image_path, "rb") as f:
                image_bytes = f.read()

            response = self.client.image_to_text(
                image=image_bytes,
                model=self.OCR_MODEL,
            )
            return self._generated_text(response, "No text found in image")
        except Exception as e:
            return f"OCR failed: {e}"
75
+
76
+ # --- Audio Processing Tool ---
77
class AudioTranscriptionTool:
    """Transcribe audio files to text through an InferenceClient.

    transcribe_audio always returns a string and never raises: failures are
    reported as human-readable messages.
    """

    # Speech-recognition model tried when the primary call fails.
    FALLBACK_MODEL = "facebook/wav2vec2-large-960h-lv60-self"

    def __init__(self, model_name: str = "openai/whisper-large-v3"):
        """Create the inference client bound to ``model_name``."""
        self.client = InferenceClient(model=model_name)

    @staticmethod
    def _transcript(response, default: str) -> str:
        """Pull the transcript out of a speech-recognition response.

        Accepts a plain string, a dict-like object, or an object exposing a
        ``text`` attribute.
        NOTE(review): the exact response shape depends on the installed
        huggingface_hub version - confirm against the pinned release.
        """
        if isinstance(response, str):
            return response
        if hasattr(response, "get"):
            return response.get("text", default)
        return getattr(response, "text", default)

    def transcribe_audio(self, audio_path: str) -> str:
        """Transcribe audio file to text.

        Args:
            audio_path: Path of the audio file to read.

        Returns:
            The transcript, or a message starting with
            "Audio transcription failed:" when the file cannot be read or both
            the primary and the fallback model fail.
        """
        # Read the file on its own: if this fails there is nothing to send to
        # either model, so the fallback must not be attempted.
        try:
            with open(audio_path, "rb") as f:
                audio_bytes = f.read()
        except OSError as e:
            return f"Audio transcription failed: {e}"

        try:
            # Primary attempt uses the model the client was created with.
            response = self.client.automatic_speech_recognition(
                audio=audio_bytes,
            )
            return self._transcript(response, "Could not transcribe audio")
        except Exception as e:
            try:
                # Fallback to a different ASR model.
                response = self.client.automatic_speech_recognition(
                    audio=audio_bytes,
                    model=self.FALLBACK_MODEL,
                )
                return self._transcript(response, f"Audio transcription error: {e}")
            except Exception:
                # Report the primary failure; it is the more relevant error.
                return f"Audio transcription failed: {e}"
106
+
107
+ # --- Enhanced Intelligent Agent with Media Processing ---
108
  class IntelligentAgent:
109
  def __init__(self, debug: bool = True, model_name: str = "meta-llama/Llama-3.1-8B-Instruct"):
110
  self.search = DuckDuckGoSearchTool()
111
+ self.client = InferenceClient(model=model_name, provider="sambanova")
112
+ self.image_tool = ImageAnalysisTool()
113
+ self.audio_tool = AudioTranscriptionTool()
114
  self.debug = debug
115
  if self.debug:
116
  print(f"IntelligentAgent initialized with model: {model_name}")
 
148
  print(f"Both chat completion and text generation failed: {e}")
149
  raise e
150
 
151
def _process_media_files(self, image_files: Optional[List[str]] = None, audio_files: Optional[List[str]] = None) -> str:
    """Process attached media files and return their content as text.

    Args:
        image_files: Paths of images to caption and OCR. ``None``, empty
            entries and paths that do not exist are skipped.
        audio_files: Paths of audio files to transcribe. ``None``, empty
            entries and paths that do not exist are skipped.

    Returns:
        One labelled section per result, joined by blank lines, or an empty
        string when nothing was processed.
    """
    media_content: List[str] = []

    # Process images
    for image_file in image_files or []:
        if not image_file or not os.path.exists(image_file):
            continue
        try:
            # Analyze the image
            image_description = self.image_tool.analyze_image(image_file)
            media_content.append(f"Image Analysis: {image_description}")

            # Try to extract text from image
            extracted_text = self.image_tool.extract_text_from_image(image_file)
            if not extracted_text or "No text found" in extracted_text:
                continue
            if extracted_text.startswith("OCR failed"):
                # The OCR helper reports errors as a string; label it as an
                # error so it is not mistaken for text read from the image.
                media_content.append(f"Error processing image {image_file}: {extracted_text}")
            else:
                media_content.append(f"Text from Image: {extracted_text}")
        except Exception as e:
            media_content.append(f"Error processing image {image_file}: {e}")

    # Process audio files
    for audio_file in audio_files or []:
        if not audio_file or not os.path.exists(audio_file):
            continue
        try:
            # Transcribe the audio
            transcription = self.audio_tool.transcribe_audio(audio_file)
            media_content.append(f"Audio Transcription: {transcription}")
        except Exception as e:
            media_content.append(f"Error processing audio {audio_file}: {e}")

    return "\n\n".join(media_content)
187
+
188
+ def _should_search(self, question: str, media_context: str
189
+
190
  def _should_search(self, question: str) -> bool:
191
  """
192
  Use LLM to determine if search is needed for the question.