tatianija committed on
Commit
b0ffe80
·
verified ·
1 Parent(s): 5d98e50

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +495 -0
app.py CHANGED
@@ -1,3 +1,498 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  def _detect_and_process_direct_attachments(self, file_name: str) -> Tuple[List[str], List[str], List[str]]:
2
  """
3
  Detect and process a single attachment directly attached to a question (not as a URL).
 
1
+ import os
2
+ import gradio as gr
3
+ import requests
4
+ import inspect
5
+ import time
6
+ import pandas as pd
7
+ from smolagents import DuckDuckGoSearchTool
8
+ import threading
9
+ from typing import Dict, List, Optional, Tuple, Union
10
+ import json
11
+ from huggingface_hub import InferenceClient
12
+ import base64
13
+ from PIL import Image
14
+ import io
15
+ import tempfile
16
+ import urllib.parse
17
+ from pathlib import Path
18
+ import re
19
+ from bs4 import BeautifulSoup
20
+ import mimetypes
21
+
22
+ # --- Constants ---
23
+ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
24
+
25
+ # --- Global Cache for Answers ---
26
+ cached_answers = {}
27
+ cached_questions = []
28
+ processing_status = {"is_processing": False, "progress": 0, "total": 0}
29
+
30
+ # --- Web Content Fetcher ---
31
class WebContentFetcher:
    """Fetch web pages referenced in questions and reduce them to plain text."""

    def __init__(self, debug: bool = True):
        self.debug = debug
        self.session = requests.Session()
        # Browser-like UA: some sites reject the default requests user agent.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def extract_urls_from_text(self, text: str) -> List[str]:
        """Extract URLs from text using regex."""
        url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
        urls = re.findall(url_pattern, text)
        return list(set(urls))  # Remove duplicates (order is not preserved)

    def _error_result(self, url: str, error_msg: str) -> Dict[str, str]:
        """Build the uniform error payload returned on any fetch failure."""
        if self.debug:
            print(error_msg)
        return {
            'url': url,
            'content_type': 'error',
            'title': f"Error fetching {url}",
            'content': '',
            'error': error_msg
        }

    def fetch_url_content(self, url: str) -> Dict[str, str]:
        """
        Fetch content from a URL and extract text, handling different content types.
        Returns a dictionary with 'content', 'title', 'content_type', and 'error' keys.
        """
        try:
            # Clean the URL
            url = url.strip()
            if not url.startswith(('http://', 'https://')):
                url = 'https://' + url

            if self.debug:
                print(f"Fetching URL: {url}")

            response = self.session.get(url, timeout=30, allow_redirects=True)
            response.raise_for_status()

            content_type = response.headers.get('content-type', '').lower()

            result = {
                'url': url,
                'content_type': content_type,
                'title': '',
                'content': '',
                'error': None
            }

            # Handle different content types
            if 'text/html' in content_type:
                # Parse HTML content
                soup = BeautifulSoup(response.content, 'html.parser')

                # Extract title
                title_tag = soup.find('title')
                result['title'] = title_tag.get_text().strip() if title_tag else 'No title'

                # Remove script and style elements
                for script in soup(["script", "style"]):
                    script.decompose()

                # Extract text content and collapse whitespace
                text_content = soup.get_text()
                lines = (line.strip() for line in text_content.splitlines())
                chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
                text_content = ' '.join(chunk for chunk in chunks if chunk)

                # Limit content length so downstream prompts stay bounded
                if len(text_content) > 8000:
                    text_content = text_content[:8000] + "... (truncated)"

                result['content'] = text_content

            elif 'text/plain' in content_type:
                # Handle plain text
                text_content = response.text
                if len(text_content) > 8000:
                    text_content = text_content[:8000] + "... (truncated)"
                result['content'] = text_content
                result['title'] = f"Text document from {url}"

            elif 'application/json' in content_type:
                # Handle JSON content; pretty-print when it parses
                try:
                    json_data = response.json()
                    result['content'] = json.dumps(json_data, indent=2)[:8000]
                    result['title'] = f"JSON document from {url}"
                except ValueError:
                    # Invalid JSON body despite the content type: keep raw text.
                    # (Was a bare `except:` — narrowed to the parse failure.)
                    result['content'] = response.text[:8000]
                    result['title'] = f"JSON document from {url}"

            elif any(x in content_type for x in ['application/pdf', 'application/msword', 'application/vnd.openxmlformats']):
                # Handle document files
                result['content'] = f"Document file detected ({content_type}). Content extraction for this file type is not implemented."
                result['title'] = f"Document from {url}"

            else:
                # Handle other content types
                if response.text:
                    content = response.text[:8000]
                    result['content'] = content
                    result['title'] = f"Content from {url}"
                else:
                    result['content'] = f"Non-text content detected ({content_type})"
                    result['title'] = f"File from {url}"

            if self.debug:
                print(f"Successfully fetched content from {url}: {len(result['content'])} characters")

            return result

        except requests.exceptions.RequestException as e:
            return self._error_result(url, f"Failed to fetch {url}: {str(e)}")
        except Exception as e:
            return self._error_result(url, f"Unexpected error fetching {url}: {str(e)}")

    def fetch_multiple_urls(self, urls: List[str]) -> List[Dict[str, str]]:
        """Fetch content from multiple URLs (capped at 5, 1s pause between)."""
        results = []
        for url in urls[:5]:  # Limit to 5 URLs to avoid excessive processing
            result = self.fetch_url_content(url)
            results.append(result)
            time.sleep(1)  # Be respectful to servers
        return results
168
+
169
+ # --- File Processing Utility ---
170
def save_attachment_to_file(attachment_data: Union[str, bytes, dict], temp_dir: str, file_name: str = None) -> Optional[str]:
    """
    Save attachment data to a temporary file.

    Args:
        attachment_data: Raw payload. May be a dict (with 'data'/'type'/'name'
            or 'content'/'mime_type'/'filename' keys), a string (base64 or
            plain text), or raw bytes.
        temp_dir: Directory in which to create the file.
        file_name: Preferred file name; a timestamp-based name is generated
            if omitted.

    Returns:
        The local file path if successful, None otherwise.
    """
    try:
        # Determine file name and extension
        if not file_name:
            file_name = f"attachment_{int(time.time())}"

        # Handle different data types
        if isinstance(attachment_data, dict):
            # Handle dict with file data (two known key layouts)
            if 'data' in attachment_data:
                file_data = attachment_data['data']
                file_type = attachment_data.get('type', '').lower()
                original_name = attachment_data.get('name', file_name)
            elif 'content' in attachment_data:
                file_data = attachment_data['content']
                file_type = attachment_data.get('mime_type', '').lower()
                original_name = attachment_data.get('filename', file_name)
            else:
                # Unknown layout: stringify the dict and save that
                file_data = str(attachment_data)
                file_type = ''
                original_name = file_name

            # Prefer the name carried by the attachment itself
            if original_name and original_name != file_name:
                file_name = original_name

        elif isinstance(attachment_data, str):
            # Could be base64 encoded data or plain text
            file_data = attachment_data
            file_type = ''

        elif isinstance(attachment_data, bytes):
            # Binary data
            file_data = attachment_data
            file_type = ''

        else:
            print(f"Unknown attachment data type: {type(attachment_data)}")
            return None

        # Ensure file has an extension (downstream tools route on it)
        if '.' not in file_name:
            # Try to determine extension from the declared MIME type
            if 'image' in file_type:
                if 'jpeg' in file_type or 'jpg' in file_type:
                    file_name += '.jpg'
                elif 'png' in file_type:
                    file_name += '.png'
                else:
                    file_name += '.img'
            elif 'audio' in file_type:
                if 'mp3' in file_type:
                    file_name += '.mp3'
                elif 'wav' in file_type:
                    file_name += '.wav'
                else:
                    file_name += '.audio'
            elif 'python' in file_type or 'text' in file_type:
                file_name += '.py'
            else:
                file_name += '.file'

        file_path = os.path.join(temp_dir, file_name)

        # Save the file
        if isinstance(file_data, str):
            # Heuristic: long strings ending in '=' padding are treated as base64
            try:
                if len(file_data) > 100 and '=' in file_data[-5:]:
                    decoded_data = base64.b64decode(file_data)
                    with open(file_path, 'wb') as f:
                        f.write(decoded_data)
                else:
                    # Plain text
                    with open(file_path, 'w', encoding='utf-8') as f:
                        f.write(file_data)
            except Exception:
                # Decode (or write) failed: fall back to saving as text.
                # (Was a bare `except:` — narrowed so Ctrl-C still propagates.)
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(file_data)
        else:
            # Binary data
            with open(file_path, 'wb') as f:
                f.write(file_data)

        print(f"Saved attachment: {file_path}")
        return file_path

    except Exception as e:
        # Best-effort helper: callers treat None as "no attachment saved"
        print(f"Failed to save attachment: {e}")
        return None
274
+
275
+
276
+
277
+ # --- Code Processing Tool ---
278
class CodeAnalysisTool:
    """LLM-backed analyzer that summarizes a Python source file."""

    def __init__(self, model_name: str = "meta-llama/Llama-3.1-8B-Instruct"):
        # Chat-capable client; "sambanova" provider matches the agent's main client.
        self.client = InferenceClient(model=model_name, provider="sambanova")

    def analyze_code(self, code_path: str) -> str:
        """
        Analyze Python code and return insights.

        Reads the file at code_path, truncates it to 5000 chars, and asks the
        model for a summary. Returns the model's reply, or an error string on
        any failure (file read or API call).
        """
        try:
            with open(code_path, 'r', encoding='utf-8') as f:
                code_content = f.read()

            # Limit code length for analysis (keeps the prompt within budget)
            if len(code_content) > 5000:
                code_content = code_content[:5000] + "\n... (truncated)"

            analysis_prompt = f"""Analyze this Python code and provide a concise summary of:
1. What the code does (main functionality)
2. Key functions/classes
3. Any notable patterns or issues
4. Input/output behavior if applicable

Code:
```python
{code_content}
```







Provide a brief, focused analysis:"""

            # Single-turn chat; low temperature for a focused, stable summary
            messages = [{"role": "user", "content": analysis_prompt}]
            response = self.client.chat_completion(
                messages=messages,
                max_tokens=500,
                temperature=0.3
            )

            return response.choices[0].message.content.strip()

        except Exception as e:
            # Best-effort: callers receive the error text instead of an exception
            return f"Code analysis failed: {e}"
333
+
334
+ # --- Image Processing Tool ---
335
class ImageAnalysisTool:
    """Vision helpers: image captioning/description and printed-text OCR."""

    def __init__(self, model_name: str = "microsoft/Florence-2-large"):
        self.client = InferenceClient(model=model_name)

    def analyze_image(self, image_path: str, prompt: str = "Describe this image in detail") -> str:
        """
        Analyze an image and return a description.

        Tries Florence-2 first, then falls back to BLIP captioning. Returns an
        error string (never raises) when both attempts fail or the file is
        unreadable. NOTE(review): `prompt` is accepted but not forwarded to the
        model — confirm whether callers rely on it.
        """
        # Read the file up front: the original code referenced `image_bytes`
        # in the fallback even when open() had failed (unbound-name bug).
        try:
            with open(image_path, "rb") as f:
                image_bytes = f.read()
        except Exception as e:
            return f"Image analysis failed: {e}"

        try:
            # Use the vision model to analyze the image
            response = self.client.image_to_text(
                image=image_bytes,
                model="microsoft/Florence-2-large"
            )
            return response.get("generated_text", "Could not analyze image")
        except Exception as e:
            try:
                # Fallback: use a different vision model
                response = self.client.image_to_text(
                    image=image_bytes,
                    model="Salesforce/blip-image-captioning-large"
                )
                return response.get("generated_text", f"Image analysis error: {e}")
            except Exception:
                # Report the *first* failure, as the original did
                return f"Image analysis failed: {e}"

    def extract_text_from_image(self, image_path: str) -> str:
        """
        Extract text from an image using OCR.

        Returns the recognized text, or an error string on any failure.
        """
        try:
            with open(image_path, "rb") as f:
                image_bytes = f.read()

            # Use an OCR model (printed text)
            response = self.client.image_to_text(
                image=image_bytes,
                model="microsoft/trocr-base-printed"
            )

            return response.get("generated_text", "No text found in image")

        except Exception as e:
            return f"OCR failed: {e}"
385
+
386
+ # --- Audio Processing Tool ---
387
class AudioTranscriptionTool:
    """Speech-to-text helper with a Whisper primary and wav2vec2 fallback."""

    def __init__(self, model_name: str = "openai/whisper-large-v3"):
        self.client = InferenceClient(model=model_name)

    def transcribe_audio(self, audio_path: str) -> str:
        """
        Transcribe audio file to text.

        Returns the transcription, or an error string (never raises) when the
        file is unreadable or both ASR attempts fail.
        """
        # Read the file up front: the original code referenced `audio_bytes`
        # in the fallback even when open() had failed (unbound-name bug).
        try:
            with open(audio_path, "rb") as f:
                audio_bytes = f.read()
        except Exception as e:
            return f"Audio transcription failed: {e}"

        try:
            # Use Whisper for transcription (client's default model)
            response = self.client.automatic_speech_recognition(
                audio=audio_bytes
            )
            return response.get("text", "Could not transcribe audio")
        except Exception as e:
            try:
                # Fallback to a different ASR model
                response = self.client.automatic_speech_recognition(
                    audio=audio_bytes,
                    model="facebook/wav2vec2-large-960h-lv60-self"
                )
                return response.get("text", f"Audio transcription error: {e}")
            except Exception:
                # Report the *first* failure, as the original did
                return f"Audio transcription failed: {e}"
416
+
417
+ # --- Enhanced Intelligent Agent with Direct Attachment Processing ---
418
+ class IntelligentAgent:
419
+ def __init__(self, debug: bool = True, model_name: str = "meta-llama/Llama-3.1-8B-Instruct"):
420
+ self.search = DuckDuckGoSearchTool()
421
+ self.client = InferenceClient(model=model_name, provider="sambanova")
422
+ self.image_tool = ImageAnalysisTool()
423
+ self.audio_tool = AudioTranscriptionTool()
424
+ self.code_tool = CodeAnalysisTool(model_name)
425
+ self.web_fetcher = WebContentFetcher(debug)
426
+ self.debug = debug
427
+ if self.debug:
428
+ print(f"IntelligentAgent initialized with model: {model_name}")
429
+
430
+ def _chat_completion(self, prompt: str, max_tokens: int = 500, temperature: float = 0.3) -> str:
431
+ """
432
+ Use chat completion instead of text generation to avoid provider compatibility issues.
433
+ """
434
+ try:
435
+ messages = [{"role": "user", "content": prompt}]
436
+
437
+ # Try chat completion first
438
+ try:
439
+ response = self.client.chat_completion(
440
+ messages=messages,
441
+ max_tokens=max_tokens,
442
+ temperature=temperature
443
+ )
444
+ return response.choices[0].message.content.strip()
445
+ except Exception as chat_error:
446
+ if self.debug:
447
+ print(f"Chat completion failed: {chat_error}, trying text generation...")
448
+
449
+ # Fallback to text generation
450
+ response = self.client.conversational(
451
+ prompt,
452
+ max_new_tokens=max_tokens,
453
+ temperature=temperature,
454
+ do_sample=temperature > 0
455
+ )
456
+ return response.strip()
457
+
458
+ except Exception as e:
459
+ if self.debug:
460
+ print(f"Both chat completion and text generation failed: {e}")
461
+ raise e
462
+
463
+ def _extract_and_process_urls(self, question_text: str) -> str:
464
+ """
465
+ Extract URLs from question text and fetch their content.
466
+ Returns formatted content from all URLs.
467
+ """
468
+ urls = self.web_fetcher.extract_urls_from_text(question_text)
469
+
470
+ if not urls:
471
+ return ""
472
+
473
+ if self.debug:
474
+ print(f"...Found {len(urls)} URLs in question: {urls}")
475
+
476
+ url_contents = self.web_fetcher.fetch_multiple_urls(urls)
477
+
478
+ if not url_contents:
479
+ return ""
480
+
481
+ # Format the content
482
+ formatted_content = []
483
+ for content_data in url_contents:
484
+ if content_data['error']:
485
+ formatted_content.append(f"URL: {content_data['url']}\nError: {content_data['error']}")
486
+ else:
487
+ formatted_content.append(
488
+ f"URL: {content_data['url']}\n"
489
+ f"Title: {content_data['title']}\n"
490
+ f"Content Type: {content_data['content_type']}\n"
491
+ f"Content: {content_data['content']}"
492
+ )
493
+
494
+ return "\n\n" + "="*50 + "\n".join(formatted_content) + "\n" + "="*50
495
+
496
  def _detect_and_process_direct_attachments(self, file_name: str) -> Tuple[List[str], List[str], List[str]]:
497
  """
498
  Detect and process a single attachment directly attached to a question (not as a URL).