tatianija committed on
Commit
958c53e
·
verified ·
1 Parent(s): 48cda19

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +225 -28
app.py CHANGED
@@ -15,6 +15,9 @@ import io
15
  import tempfile
16
  import urllib.parse
17
  from pathlib import Path
 
 
 
18
 
19
  # --- Constants ---
20
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
@@ -24,6 +27,145 @@ cached_answers = {}
24
  cached_questions = []
25
  processing_status = {"is_processing": False, "progress": 0, "total": 0}
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  # --- File Download Utility ---
28
  def download_attachment(url: str, temp_dir: str) -> Optional[str]:
29
  """
@@ -197,7 +339,7 @@ class AudioTranscriptionTool:
197
  except:
198
  return f"Audio transcription failed: {e}"
199
 
200
- # --- Enhanced Intelligent Agent with Media Processing ---
201
  class IntelligentAgent:
202
  def __init__(self, debug: bool = True, model_name: str = "meta-llama/Llama-3.1-8B-Instruct"):
203
  self.search = DuckDuckGoSearchTool()
@@ -205,6 +347,7 @@ class IntelligentAgent:
205
  self.image_tool = ImageAnalysisTool()
206
  self.audio_tool = AudioTranscriptionTool()
207
  self.code_tool = CodeAnalysisTool(model_name)
 
208
  self.debug = debug
209
  if self.debug:
210
  print(f"IntelligentAgent initialized with model: {model_name}")
@@ -242,6 +385,39 @@ class IntelligentAgent:
242
  print(f"Both chat completion and text generation failed: {e}")
243
  raise e
244
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
  def _detect_and_download_attachments(self, question_data: dict) -> Tuple[List[str], List[str], List[str]]:
246
  """
247
  Detect and download attachments from question data.
@@ -268,12 +444,17 @@ class IntelligentAgent:
268
  elif isinstance(field_data, str):
269
  attachments.append(field_data)
270
 
271
- # Also check if the question text contains URLs
272
  question_text = question_data.get('question', '')
273
  if 'http' in question_text:
274
- import re
275
  urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', question_text)
276
- attachments.extend(urls)
 
 
 
 
 
277
 
278
  # Download and categorize attachments
279
  for attachment in attachments:
@@ -376,9 +557,9 @@ class IntelligentAgent:
376
 
377
  return "\n\n".join(attachment_content) if attachment_content else ""
378
 
379
- def _should_search(self, question: str, attachment_context: str = "") -> bool:
380
  """
381
- Use LLM to determine if search is needed for the question, considering attachment context.
382
  Returns True if search is recommended, False otherwise.
383
  """
384
  decision_prompt = f"""Analyze this question and decide if it requires real-time information, recent data, or specific facts that might not be in your training data.
@@ -400,19 +581,22 @@ SEARCH IS NOT NEEDED for:
400
  - How-to instructions for common tasks
401
  - Creative writing or opinion-based responses
402
  - Questions that can be answered from attached files (code, images, audio)
 
403
  - Code analysis, debugging, or explanation questions
404
- - Questions about uploaded content
405
 
406
  Question: "{question}"
407
 
408
  {f"Attachment Context Available: {attachment_context[:500]}..." if attachment_context else "No attachment context available."}
409
 
 
 
410
  Respond with only "SEARCH" or "NO_SEARCH" followed by a brief reason (max 20 words).
411
 
412
  Example responses:
413
  - "SEARCH - Current weather data needed"
414
  - "NO_SEARCH - Mathematical concept, general knowledge sufficient"
415
- - "NO_SEARCH - Can be answered from attached code/image content"
416
  """
417
 
418
  try:
@@ -429,15 +613,23 @@ Example responses:
429
 
430
  except Exception as e:
431
  if self.debug:
432
- print(f"Error in search decision: {e}, defaulting to no search for attachment questions")
433
- # Default to no search if decision fails and there are attachments
434
- return len(attachment_context) == 0
435
 
436
- def _answer_with_llm(self, question: str, attachment_context: str = "") -> str:
437
  """
438
- Generate answer using LLM without search, considering attachment context.
439
  """
440
- context_section = f"\n\nAttachment Context:\n{attachment_context}" if attachment_context else ""
 
 
 
 
 
 
 
 
441
 
442
  answer_prompt = f"""You are a general AI assistant. I will ask you a question.
443
  YOUR ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
@@ -456,9 +648,9 @@ Answer:"""
456
  except Exception as e:
457
  return f"Sorry, I encountered an error generating the response: {e}"
458
 
459
- def _answer_with_search(self, question: str, attachment_context: str = "") -> str:
460
  """
461
- Generate answer using search results and LLM, considering attachment context.
462
  """
463
  try:
464
  # Perform search
@@ -469,7 +661,7 @@ Answer:"""
469
  print(f"Search results type: {type(search_results)}")
470
 
471
  if not search_results:
472
- return "No search results found. Let me try to answer based on my knowledge:\n\n" + self._answer_with_llm(question, attachment_context)
473
 
474
  # Format search results - handle different result formats
475
  if isinstance(search_results, str):
@@ -490,12 +682,20 @@ Answer:"""
490
 
491
  search_context = "\n\n".join(formatted_results)
492
 
493
- # Generate answer using search context and attachment context
494
- context_section = f"\n\nAttachment Context:\n{attachment_context}" if attachment_context else ""
 
 
 
 
 
 
 
 
495
 
496
  answer_prompt = f"""You are a general AI assistant. I will ask you a question.
497
- Based on the search results and the context section below, provide an answer to the question.
498
- If the search results don't fully answer the question, you can supplement with your general knowledge.
499
  Your ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
500
  Do not add dot if your answer is a number.
501
  If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
@@ -505,10 +705,7 @@ Answer:"""
505
 
506
  Question: {question}
507
 
508
- Search Results:
509
- {search_context}
510
-
511
- {context_section}
512
 
513
  Answer:"""
514
 
@@ -538,16 +735,16 @@ Answer:"""
538
  return "Search completed but no usable results found."
539
 
540
  except Exception as e:
541
- return f"Search failed: {e}. Let me try to answer based on my knowledge:\n\n" + self._answer_with_llm(question, attachment_context)
542
 
543
  def process_question_with_attachments(self, question_data: dict) -> str:
544
  """
545
- Process a question that may have attachments.
546
  """
547
  question_text = question_data.get('question', '')
548
 
549
  if self.debug:
550
- print(f"Processing question with potential attachments: {question_text[:100]}...")
551
 
552
  try:
553
  # Detect and download attachments
 
15
  import tempfile
16
  import urllib.parse
17
  from pathlib import Path
18
+ import re
19
+ from bs4 import BeautifulSoup
20
+ import mimetypes
21
 
22
  # --- Constants ---
23
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
27
  cached_questions = []
28
  processing_status = {"is_processing": False, "progress": 0, "total": 0}
29
 
30
+ # --- Web Content Fetcher ---
31
class WebContentFetcher:
    """Fetch web pages referenced in a question and reduce them to plain text.

    A single ``requests.Session`` with a browser-like User-Agent is reused for
    every request made through one instance.
    """

    # Maximum number of characters of extracted content kept per URL.
    MAX_CONTENT_CHARS = 8000
    # Maximum number of URLs fetched by one call to fetch_multiple_urls().
    MAX_URLS = 5
    # Pause between two consecutive requests, in seconds.
    REQUEST_DELAY_SECONDS = 1

    # Compiled once; the same pattern the rest of the file uses for URLs.
    _URL_PATTERN = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    # Characters that end a sentence or clause rather than a URL.
    _TRAILING_PUNCTUATION = '.,;:!?\'"'
    # Content-type prefixes whose payload must not be decoded as text.
    _BINARY_PREFIXES = ('image/', 'audio/', 'video/')

    def __init__(self, debug: bool = True):
        """Create the HTTP session; ``debug`` enables progress printing."""
        self.debug = debug
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    @classmethod
    def _strip_trailing_punctuation(cls, url: str) -> str:
        """Drop sentence punctuation and unbalanced ')' from the end of ``url``."""
        while url:
            last = url[-1]
            if last in cls._TRAILING_PUNCTUATION:
                url = url[:-1]
            elif last == ')' and url.count(')') > url.count('('):
                # Keep balanced parentheses, e.g. ".../Python_(language)".
                url = url[:-1]
            else:
                break
        return url

    def _truncate(self, text: str, mark: bool = False) -> str:
        """Cut ``text`` to MAX_CONTENT_CHARS, optionally appending a marker."""
        if len(text) <= self.MAX_CONTENT_CHARS:
            return text
        shortened = text[:self.MAX_CONTENT_CHARS]
        return shortened + "... (truncated)" if mark else shortened

    @staticmethod
    def _error_result(url: str, error_msg: str) -> Dict[str, Optional[str]]:
        """Build the result dictionary returned when a fetch fails."""
        return {
            'url': url,
            'content_type': 'error',
            'title': f"Error fetching {url}",
            'content': '',
            'error': error_msg
        }

    def extract_urls_from_text(self, text: str) -> List[str]:
        """Extract the distinct URLs found in ``text``.

        Trailing sentence punctuation is removed from each match, duplicates
        are dropped, and the order of first appearance is preserved so that
        any later cap on the number of URLs is deterministic.
        Returns an empty list for empty or missing text.
        """
        if not text:
            return []
        cleaned = (
            self._strip_trailing_punctuation(match)
            for match in self._URL_PATTERN.findall(text)
        )
        # Discard matches that were nothing but a scheme plus punctuation.
        usable = (url for url in cleaned if url.split('://', 1)[-1])
        return list(dict.fromkeys(usable))

    def fetch_url_content(self, url: str) -> Dict[str, Optional[str]]:
        """Fetch ``url`` and extract text according to its content type.

        Returns a dictionary with 'url', 'content_type', 'title', 'content'
        and 'error' keys. 'error' is None on success; on failure it holds a
        message and 'content_type' is the literal string 'error'. This method
        does not raise: every exception is converted into an error result.
        """
        try:
            # Clean the URL and default to https when no scheme is given.
            url = url.strip()
            if not url.startswith(('http://', 'https://')):
                url = 'https://' + url

            if self.debug:
                print(f"Fetching URL: {url}")

            response = self.session.get(url, timeout=30, allow_redirects=True)
            response.raise_for_status()

            content_type = response.headers.get('content-type', '').lower()

            result = {
                'url': url,
                'content_type': content_type,
                'title': '',
                'content': '',
                'error': None
            }

            if 'text/html' in content_type:
                soup = BeautifulSoup(response.content, 'html.parser')

                title_tag = soup.find('title')
                result['title'] = title_tag.get_text().strip() if title_tag else 'No title'

                # Scripts and styles are markup noise, not page text.
                for tag in soup(["script", "style"]):
                    tag.decompose()

                # Collapse every run of whitespace into a single space.
                text_content = ' '.join(soup.get_text().split())
                result['content'] = self._truncate(text_content, mark=True)

            elif 'text/plain' in content_type:
                result['content'] = self._truncate(response.text, mark=True)
                result['title'] = f"Text document from {url}"

            elif 'application/json' in content_type:
                try:
                    result['content'] = self._truncate(json.dumps(response.json(), indent=2))
                except ValueError:
                    # Body is not valid JSON despite the header; keep it raw.
                    result['content'] = self._truncate(response.text)
                result['title'] = f"JSON document from {url}"

            elif any(x in content_type for x in ['application/pdf', 'application/msword', 'application/vnd.openxmlformats']):
                result['content'] = f"Document file detected ({content_type}). Content extraction for this file type is not implemented."
                result['title'] = f"Document from {url}"

            elif content_type.startswith(self._BINARY_PREFIXES):
                # Decoding media bytes as text would only produce garbage.
                result['content'] = f"Non-text content detected ({content_type})"
                result['title'] = f"File from {url}"

            else:
                if response.text:
                    result['content'] = self._truncate(response.text)
                    result['title'] = f"Content from {url}"
                else:
                    result['content'] = f"Non-text content detected ({content_type})"
                    result['title'] = f"File from {url}"

            if self.debug:
                print(f"Successfully fetched content from {url}: {len(result['content'])} characters")

            return result

        except requests.exceptions.RequestException as e:
            error_msg = f"Failed to fetch {url}: {str(e)}"
            if self.debug:
                print(error_msg)
            return self._error_result(url, error_msg)
        except Exception as e:
            # Best-effort boundary: parsing problems must not abort the caller.
            error_msg = f"Unexpected error fetching {url}: {str(e)}"
            if self.debug:
                print(error_msg)
            return self._error_result(url, error_msg)

    def fetch_multiple_urls(self, urls: List[str]) -> List[Dict[str, Optional[str]]]:
        """Fetch up to MAX_URLS of ``urls`` in order, pausing between requests.

        Returns one result dictionary per fetched URL, in input order.
        """
        results = []
        for index, url in enumerate((urls or [])[:self.MAX_URLS]):
            if index:
                # Be respectful to servers, but do not stall after the last one.
                time.sleep(self.REQUEST_DELAY_SECONDS)
            results.append(self.fetch_url_content(url))
        return results
168
+
169
  # --- File Download Utility ---
170
  def download_attachment(url: str, temp_dir: str) -> Optional[str]:
171
  """
 
339
  except:
340
  return f"Audio transcription failed: {e}"
341
 
342
+ # --- Enhanced Intelligent Agent with URL Processing ---
343
  class IntelligentAgent:
344
  def __init__(self, debug: bool = True, model_name: str = "meta-llama/Llama-3.1-8B-Instruct"):
345
  self.search = DuckDuckGoSearchTool()
 
347
  self.image_tool = ImageAnalysisTool()
348
  self.audio_tool = AudioTranscriptionTool()
349
  self.code_tool = CodeAnalysisTool(model_name)
350
+ self.web_fetcher = WebContentFetcher(debug)
351
  self.debug = debug
352
  if self.debug:
353
  print(f"IntelligentAgent initialized with model: {model_name}")
 
385
  print(f"Both chat completion and text generation failed: {e}")
386
  raise e
387
 
388
+ def _extract_and_process_urls(self, question_text: str) -> str:
389
+ """
390
+ Extract URLs from question text and fetch their content.
391
+ Returns formatted content from all URLs.
392
+ """
393
+ urls = self.web_fetcher.extract_urls_from_text(question_text)
394
+
395
+ if not urls:
396
+ return ""
397
+
398
+ if self.debug:
399
+ print(f"Found {len(urls)} URLs in question: {urls}")
400
+
401
+ url_contents = self.web_fetcher.fetch_multiple_urls(urls)
402
+
403
+ if not url_contents:
404
+ return ""
405
+
406
+ # Format the content
407
+ formatted_content = []
408
+ for content_data in url_contents:
409
+ if content_data['error']:
410
+ formatted_content.append(f"URL: {content_data['url']}\nError: {content_data['error']}")
411
+ else:
412
+ formatted_content.append(
413
+ f"URL: {content_data['url']}\n"
414
+ f"Title: {content_data['title']}\n"
415
+ f"Content Type: {content_data['content_type']}\n"
416
+ f"Content: {content_data['content']}"
417
+ )
418
+
419
+ return "\n\n" + "="*50 + "\n".join(formatted_content) + "\n" + "="*50
420
+
421
  def _detect_and_download_attachments(self, question_data: dict) -> Tuple[List[str], List[str], List[str]]:
422
  """
423
  Detect and download attachments from question data.
 
444
  elif isinstance(field_data, str):
445
  attachments.append(field_data)
446
 
447
+ # Also check if the question text contains file URLs (not web URLs)
448
  question_text = question_data.get('question', '')
449
  if 'http' in question_text:
450
+ # Only consider URLs that likely point to files, not web pages
451
  urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', question_text)
452
+ for url in urls:
453
+ # Check if URL likely points to a file (has file extension)
454
+ parsed = urllib.parse.urlparse(url)
455
+ path = parsed.path.lower()
456
+ if any(path.endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.gif', '.mp3', '.wav', '.py', '.txt', '.pdf']):
457
+ attachments.append(url)
458
 
459
  # Download and categorize attachments
460
  for attachment in attachments:
 
557
 
558
  return "\n\n".join(attachment_content) if attachment_content else ""
559
 
560
+ def _should_search(self, question: str, attachment_context: str = "", url_context: str = "") -> bool:
561
  """
562
+ Use LLM to determine if search is needed for the question, considering attachment and URL context.
563
  Returns True if search is recommended, False otherwise.
564
  """
565
  decision_prompt = f"""Analyze this question and decide if it requires real-time information, recent data, or specific facts that might not be in your training data.
 
581
  - How-to instructions for common tasks
582
  - Creative writing or opinion-based responses
583
  - Questions that can be answered from attached files (code, images, audio)
584
+ - Questions that can be answered from URL content provided
585
  - Code analysis, debugging, or explanation questions
586
+ - Questions about uploaded or linked content
587
 
588
  Question: "{question}"
589
 
590
  {f"Attachment Context Available: {attachment_context[:500]}..." if attachment_context else "No attachment context available."}
591
 
592
+ {f"URL Content Available: {url_context[:500]}..." if url_context else "No URL content available."}
593
+
594
  Respond with only "SEARCH" or "NO_SEARCH" followed by a brief reason (max 20 words).
595
 
596
  Example responses:
597
  - "SEARCH - Current weather data needed"
598
  - "NO_SEARCH - Mathematical concept, general knowledge sufficient"
599
+ - "NO_SEARCH - Can be answered from attached code/image/URL content"
600
  """
601
 
602
  try:
 
613
 
614
  except Exception as e:
615
  if self.debug:
616
+ print(f"Error in search decision: {e}, defaulting to no search for questions with context")
617
+ # Default to no search if decision fails and there is context available
618
+ return len(attachment_context) == 0 and len(url_context) == 0
619
 
620
+ def _answer_with_llm(self, question: str, attachment_context: str = "", url_context: str = "") -> str:
621
  """
622
+ Generate answer using LLM without search, considering attachment and URL context.
623
  """
624
+ context_sections = []
625
+
626
+ if attachment_context:
627
+ context_sections.append(f"Attachment Context:\n{attachment_context}")
628
+
629
+ if url_context:
630
+ context_sections.append(f"URL Content:\n{url_context}")
631
+
632
+ context_section = "\n\n".join(context_sections) if context_sections else ""
633
 
634
  answer_prompt = f"""You are a general AI assistant. I will ask you a question.
635
  YOUR ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
 
648
  except Exception as e:
649
  return f"Sorry, I encountered an error generating the response: {e}"
650
 
651
+ def _answer_with_search(self, question: str, attachment_context: str = "", url_context: str = "") -> str:
652
  """
653
+ Generate answer using search results and LLM, considering attachment and URL context.
654
  """
655
  try:
656
  # Perform search
 
661
  print(f"Search results type: {type(search_results)}")
662
 
663
  if not search_results:
664
+ return "No search results found. Let me try to answer based on my knowledge:\n\n" + self._answer_with_llm(question, attachment_context, url_context)
665
 
666
  # Format search results - handle different result formats
667
  if isinstance(search_results, str):
 
682
 
683
  search_context = "\n\n".join(formatted_results)
684
 
685
+ # Generate answer using search context, attachment context, and URL context
686
+ context_sections = [f"Search Results:\n{search_context}"]
687
+
688
+ if attachment_context:
689
+ context_sections.append(f"Attachment Context:\n{attachment_context}")
690
+
691
+ if url_context:
692
+ context_sections.append(f"URL Content:\n{url_context}")
693
+
694
+ full_context = "\n\n".join(context_sections)
695
 
696
  answer_prompt = f"""You are a general AI assistant. I will ask you a question.
697
+ Based on the search results and the context sections below, provide an answer to the question.
698
+ If the search results don't fully answer the question, you can supplement with information from other context sections or your general knowledge.
699
  Your ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
700
  Do not add dot if your answer is a number.
701
  If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
 
705
 
706
  Question: {question}
707
 
708
+ {full_context}
 
 
 
709
 
710
  Answer:"""
711
 
 
735
  return "Search completed but no usable results found."
736
 
737
  except Exception as e:
738
+ return f"Search failed: {e}. Let me try to answer based on my knowledge:\n\n" + self._answer_with_llm(question, attachment_context, url_context)
739
 
740
  def process_question_with_attachments(self, question_data: dict) -> str:
741
  """
742
+ Process a question that may have attachments and URLs.
743
  """
744
  question_text = question_data.get('question', '')
745
 
746
  if self.debug:
747
+ print(f"Processing question with potential attachments and URLs: {question_text[:100]}...")
748
 
749
  try:
750
  # Detect and download attachments