wt002 committed
Commit 2eb3b6b · verified · 1 Parent(s): 84992c5

Update app.py

Files changed (1)
  1. app.py +311 -175
app.py CHANGED
@@ -1,223 +1,359 @@
 import os
 import gradio as gr
 import requests
-import inspect
 import pandas as pd
-from smolagents import tool, Tool, CodeAgent, DuckDuckGoSearchTool, HfApiModel, VisitWebpageTool, SpeechToTextTool, FinalAnswerTool
-from dotenv import load_dotenv
 import heapq
 from collections import Counter
-import re
 from io import BytesIO
 from youtube_transcript_api import YouTubeTranscriptApi
-from langchain_community.tools.tavily_search import TavilySearchResults
-from langchain_community.document_loaders import WikipediaLoader
-from langchain_community.utilities import WikipediaAPIWrapper
-from langchain_community.document_loaders import ArxivLoader

-# (Keep Constants as is)
-# --- Constants ---
-DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

-#Load environment variables
 load_dotenv()
-
-
-
-
-from smolagents import Tool
-from langchain_community.document_loaders import WikipediaLoader
-
 class WikiSearchTool(Tool):
     name = "wiki_search"
-    description = "Search Wikipedia for a query and return up to 2 results."
-    inputs = {
-        "query": {"type": "string", "description": "The search term for Wikipedia."}
-    }
     output_type = "string"

     def forward(self, query: str) -> str:
-        search_docs = WikipediaLoader(query=query, load_max_docs=2).load()
-
-        formatted_search_docs = "\n\n---\n\n".join(
-            [
-                f'<Document source="{doc.metadata.get("source", "Wikipedia")}" page="{doc.metadata.get("page", "")}"/>\n{doc.page_content}\n</Document>'
-                for doc in search_docs
-            ]
-        )
-        return formatted_search_docs
-
-
-
-
-class StringReverseTool(Tool):
-    name = "reverse_message"
-    description = "When you received a strange text, try to reverse it and perform action described in reversed message."
-    inputs = {
-        "message": {
-            "type": "string",
-            "description": "A message, which looks like strange and can be reversed to get actions to execute."
-        }
-    }
     output_type = "string"

-    def __init__(self):
-        return
-
-    def forward(self, message: str):
-        return message[::-1]
-
-class KeywordsExtractorTool(Tool):
-    """Extracts top 5 keywords from a given text based on frequency."""
-
-    name = "keywords_extractor"
-    description = "This tool returns the 5 most frequent keywords occur in provided block of text."
-
-    inputs = {
-        "text": {
-            "type": "string",
-            "description": "Text to analyze for keywords.",
-        }
-    }
-    output_type = "string"

-    def forward(self, text: str) -> str:
         try:
-            all_words = re.findall(r'\b\w+\b', text.lower())
-            conjunctions = {'a', 'and', 'of', 'is', 'in', 'to', 'the'}
-            filtered_words = []
-            for w in all_words:
-                if w not in conjunctions:
-                    filtered_words.push(w)
-            word_counts = Counter(filtered_words)
-            k = 5
-            return heapq.nlargest(k, word_counts.items(), key=lambda x: x[1])
         except Exception as e:
-            return f"Error during extracting most common words: {e}"
-
-@tool
-def parse_excel_to_json(task_id: str) -> dict:
-    """
-    For a given task_id fetch and parse an Excel file and save parsed data in structured JSON file.
-    Args:
-        task_id: An task ID to fetch.

-    Returns:
-        {
-            "task_id": str,
-            "sheets": {
-                "SheetName1": [ {col1: val1, col2: val2, ...}, ... ],
-                ...
-            },
-            "status": "Success" | "Error"
-        }
-    """
-    url = f"https://agents-course-unit4-scoring.hf.space/files/{task_id}"
-
-    try:
-        response = requests.get(url, timeout=100)
-        if response.status_code != 200:
-            return {"task_id": task_id, "sheets": {}, "status": f"{response.status_code} - Failed"}
-
-        xls_content = pd.ExcelFile(BytesIO(response.content))
-        json_sheets = {}
-
-        for sheet in xls_content.sheet_names:
-            df = xls_content.parse(sheet)
-            df = df.dropna(how="all")
-            rows = df.head(20).to_dict(orient="records")
-            json_sheets[sheet] = rows
-
-        return {
-            "task_id": task_id,
-            "sheets": json_sheets,
-            "status": "Success"
-        }
-
-    except Exception as e:
-        return {
-            "task_id": task_id,
-            "sheets": {},
-            "status": f"Error in parsing Excel file: {str(e)}"
-        }
-

 class VideoTranscriptionTool(Tool):
-    """Fetch transcripts from YouTube videos"""
     name = "transcript_video"
-    description = "Fetch text transcript from YouTube movies with optional timestamps"
     inputs = {
-        "url": {"type": "string", "description": "YouTube video URL or ID"},
-        "include_timestamps": {"type": "boolean", "description": "If timestamps should be included in output", "nullable": True}
     }
     output_type = "string"

     def forward(self, url: str, include_timestamps: bool = False) -> str:
-
-        if "youtube.com/watch" in url:
-            video_id = url.split("v=")[1].split("&")[0]
-        elif "youtu.be/" in url:
-            video_id = url.split("youtu.be/")[1].split("?")[0]
-        elif len(url.strip()) == 11: # Direct ID
-            video_id = url.strip()
-        else:
-            return f"YouTube URL or ID: {url} is invalid!"
-
         try:
-            transcription = YouTubeTranscriptApi.get_transcript(video_id)

             if include_timestamps:
-                formatted_transcription = []
-                for part in transcription:
-                    timestamp = f"{int(part['start']//60)}:{int(part['start']%60):02d}"
-                    formatted_transcription.append(f"[{timestamp}] {part['text']}")
-                return "\n".join(formatted_transcription)
             else:
-                return " ".join([part['text'] for part in transcription])

         except Exception as e:
-            return f"Error in extracting YouTube transcript: {str(e)}"

-class BasicAgent:
     def __init__(self):
-        token = os.environ.get("HF_API_TOKEN")
-        model = HfApiModel(
-            temperature=0.1,
-            token=token
         )
-
-        search_tool = DuckDuckGoSearchTool()
-        wiki_search_tool = WikiSearchTool()
-        str_reverse_tool = StringReverseTool()
-        keywords_extract_tool = KeywordsExtractorTool()
-        speech_to_text_tool = SpeechToTextTool()
-        visit_webpage_tool = VisitWebpageTool()
-        final_answer_tool = FinalAnswerTool()
-        video_transcription_tool = VideoTranscriptionTool()
-
-        system_prompt = f"""
-        You are my general AI assistant. Your task is to answer the question I asked.
-        First, provide an explanation of your reasoning, step by step, to arrive at the answer.
-        Then, return your final answer in a single line, formatted as follows: "FINAL ANSWER: [YOUR FINAL ANSWER]".
-        [YOUR FINAL ANSWER] should be a number, a string, or a comma-separated list of numbers and/or strings, depending on the question.
-        If the answer is a number, do not use commas or units (e.g., $, %) unless specified.
-        If the answer is a string, do not use articles or abbreviations (e.g., for cities), and write digits in plain text unless specified.
-        If the answer is a comma-separated list, apply the above rules for each element based on whether it is a number or a string.
         """
-        self.agent = CodeAgent(
-            model=model,
-            tools=[search_tool, wiki_search_tool, str_reverse_tool, keywords_extract_tool, speech_to_text_tool, visit_webpage_tool, final_answer_tool, parse_excel_to_json, video_transcription_tool],
             add_base_tools=True
         )
-        self.agent.prompt_templates["system_prompt"] = self.agent.prompt_templates["system_prompt"] + system_prompt

     def __call__(self, question: str) -> str:
-        print(f"Agent received question (first 50 chars): {question[:50]}...")
-        answer = self.agent.run(question)
-        print(f"Agent returning answer: {answer}")
-        return answer
-

 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """
 import os
+import re
 import gradio as gr
 import requests
 import pandas as pd
 import heapq
 from collections import Counter
 from io import BytesIO
 from youtube_transcript_api import YouTubeTranscriptApi
+from smolagents import tool, Tool, CodeAgent, DuckDuckGoSearchTool, HfApiModel, VisitWebpageTool, SpeechToTextTool, FinalAnswerTool
+from langchain_community.document_loaders import WikipediaLoader, PyPDFLoader, TextLoader
+from dotenv import load_dotenv
+import tempfile
+import mimetypes
+import logging

+# --- Initialize logging ---
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)

+# --- Load environment variables ---
 load_dotenv()
+HF_API_TOKEN = os.getenv("HF_API_TOKEN")
+if not HF_API_TOKEN:
+    logger.error("HF_API_TOKEN not found in environment variables! Please set it to use the HfApiModel.")
+    # Exit or raise an error if the token is critical for functionality
+    # sys.exit(1) # Uncomment if you want to exit the script if token is missing
+
+# --- Utility Functions ---
+def extract_youtube_id(url: str) -> str:
+    """Extract YouTube ID from various URL formats"""
+    patterns = [
+        r'(?:https?:\/\/)?(?:www\.)?youtube\.com\/watch\?v=([^&]+)',
+        r'(?:https?:\/\/)?youtu\.be\/([^?]+)',
+        r'([a-zA-Z0-9_-]{11})' # Catches just the ID if provided directly
+    ]
+    for pattern in patterns:
+        match = re.search(pattern, url)
+        if match:
+            return match.group(1)
+    return ""
+
+# --- Enhanced Tools ---
 class WikiSearchTool(Tool):
+    """Enhanced Wikipedia search with better formatting and error handling"""
     name = "wiki_search"
+    description = "Search Wikipedia for a query. Returns up to 2 results with metadata."
+    inputs = {"query": {"type": "string", "description": "Search term for Wikipedia"}}
     output_type = "string"

     def forward(self, query: str) -> str:
+        try:
+            logger.info(f"Searching Wikipedia for: {query}")
+            docs = WikipediaLoader(query=query, load_max_docs=2).load()
+            if not docs:
+                logger.info(f"No Wikipedia articles found for: {query}")
+                return "No Wikipedia articles found."
+
+            formatted_results = []
+            for i, doc in enumerate(docs):
+                # Limit page content length to avoid overwhelming the model, but provide enough context
+                summary = doc.page_content[:1000] + "..." if len(doc.page_content) > 1000 else doc.page_content
+                formatted_results.append(
+                    f"--- Wikipedia Result {i+1} ---\n"
+                    f"Title: {doc.metadata.get('title', 'N/A')}\n"
+                    f"URL: {doc.metadata.get('source', 'N/A')}\n"
+                    f"Summary: {summary}\n"
+                )
+            return "\n\n".join(formatted_results)
+        except Exception as e:
+            logger.error(f"Wikipedia search error for '{query}': {e}")
+            return f"Wikipedia search error: {str(e)}"
+
+class FileAnalysisTool(Tool):
+    """Universal file analyzer for text/PDF/Excel files"""
+    name = "file_analysis"
+    description = "Analyze text, PDF, and Excel files. Returns extracted content."
+    inputs = {"file_path": {"type": "string", "description": "Path to the local file"}}
     output_type = "string"

+    def forward(self, file_path: str) -> str:
+        if not os.path.exists(file_path):
+            return f"File not found: {file_path}"
+
         try:
+            mime_type, _ = mimetypes.guess_type(file_path)
+            logger.info(f"Analyzing file: {file_path} with MIME type: {mime_type}")
+
+            if mime_type == "application/pdf":
+                return self._process_pdf(file_path)
+            elif mime_type in ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "application/vnd.ms-excel"]:
+                return self._process_excel(file_path)
+            elif mime_type and ("text" in mime_type or "csv" in mime_type):
+                return self._process_text(file_path)
+            else:
+                return f"Unsupported file type for analysis: {mime_type}. Only PDF, Excel, and text/CSV files are supported."
         except Exception as e:
+            logger.error(f"File analysis error for '{file_path}': {e}")
+            return f"File analysis error: {str(e)}"
+
+    def _process_pdf(self, path: str) -> str:
+        loader = PyPDFLoader(path)
+        docs = loader.load()
+        content = "\n\n".join([doc.page_content for doc in docs])
+        # Truncate to avoid excessive token usage, provide a warning if truncated
+        if len(content) > 8000:
+            logger.warning(f"PDF content truncated from {len(content)} to 8000 characters for {path}")
+            return content[:8000] + "\n... [Content truncated]"
+        return content
+
+    def _process_excel(self, path: str) -> str:
+        df = pd.read_excel(path)
+        # Provide a sample of the data and its basic info
+        info = BytesIO()
+        df.info(buf=info)
+        info_str = info.getvalue().decode('utf-8')

+        return (f"Excel file loaded. First 10 rows:\n{df.head(10).to_markdown()}\n\n"
+                f"DataFrame Info:\n{info_str}")

+    def _process_text(self, path: str) -> str:
+        with open(path, 'r', encoding='utf-8') as f:
+            content = f.read()
+        if len(content) > 8000:
+            logger.warning(f"Text file content truncated from {len(content)} to 8000 characters for {path}")
+            return content[:8000] + "\n... [Content truncated]"
+        return content

 class VideoTranscriptionTool(Tool):
+    """Enhanced YouTube transcription with multilingual support and better output"""
     name = "transcript_video"
+    description = "Fetch YouTube video transcripts with optional timestamps. Supports English, French, Spanish, German."
     inputs = {
+        "url": {"type": "string", "description": "YouTube URL or ID"},
+        "include_timestamps": {"type": "boolean", "description": "Include timestamps? (default: False)"}
     }
     output_type = "string"

     def forward(self, url: str, include_timestamps: bool = False) -> str:
         try:
+            video_id = extract_youtube_id(url)
+            if not video_id:
+                return "Invalid YouTube URL or ID format. Please provide a valid YouTube URL or an 11-character video ID."
+
+            logger.info(f"Attempting to transcribe video ID: {video_id}")
+            transcript = YouTubeTranscriptApi.get_transcript(
+                video_id,
+                languages=['en', 'fr', 'es', 'de'] # Prioritize common languages
+            )
+
+            if not transcript:
+                return f"No transcript found for video ID: {video_id} in supported languages (en, fr, es, de)."

             if include_timestamps:
+                formatted_transcript = "\n".join(
+                    f"[{int(seg['start']//60):02d}:{int(seg['start']%60):02d}] {seg['text']}"
+                    for seg in transcript
+                )
             else:
+                formatted_transcript = " ".join(seg['text'] for seg in transcript)
+
+            return formatted_transcript
+        except Exception as e:
+            logger.error(f"Transcription error for '{url}': {e}")
+            return f"Transcription error: {str(e)}. This might be due to no available transcript or an unsupported video."

+class DataAnalysisTool(Tool):
+    """Perform data analysis using pandas on structured data (CSV/Excel)"""
+    name = "data_analysis"
+    description = "Analyze CSV/Excel data using pandas operations. Supported operations: 'describe', 'groupby:column:aggfunc' (e.g., 'groupby:Category:mean')."
+    inputs = {
+        "file_path": {"type": "string", "description": "Path to the local data file (CSV or Excel)"},
+        "operation": {"type": "string", "description": "Pandas operation (e.g., 'describe', 'groupby:column_name:mean')"}
+    }
+    output_type = "string"
+
+    def forward(self, file_path: str, operation: str) -> str:
+        if not os.path.exists(file_path):
+            return f"File not found: {file_path}"
+
+        try:
+            if file_path.endswith('.csv'):
+                df = pd.read_csv(file_path)
+            elif file_path.endswith('.xlsx') or file_path.endswith('.xls'):
+                df = pd.read_excel(file_path)
+            else:
+                return "Unsupported file format for data analysis. Please provide a .csv or .xlsx file."
+
+            logger.info(f"Performing data analysis operation '{operation}' on {file_path}")
+
+            if operation == "describe":
+                return "Descriptive Statistics:\n" + str(df.describe())
+            elif operation.startswith("groupby:"):
+                parts = operation.split(":")
+                if len(parts) == 3:
+                    _, col, agg = parts
+                    if col not in df.columns:
+                        return f"Column '{col}' not found in the DataFrame."
+                    try:
+                        result = df.groupby(col).agg(agg)
+                        return f"Groupby operation '{agg}' on column '{col}':\n" + str(result)
+                    except Exception as agg_e:
+                        return f"Error performing aggregation '{agg}' on column '{col}': {str(agg_e)}"
+                else:
+                    return "Invalid 'groupby' operation format. Use 'groupby:column_name:agg_function'."
+            else:
+                return "Unsupported operation. Try: 'describe' or 'groupby:column_name:agg_function'."
         except Exception as e:
+            logger.error(f"Data analysis error for '{file_path}' with operation '{operation}': {e}")
+            return f"Data analysis error: {str(e)}. Please check file content and operation."

+# --- Agent Initialization ---
+class ResearchAgent:
     def __init__(self):
+        self.model = HfApiModel(
+            temperature=0.0, # Deterministic output for factual question answering
+            token=HF_API_TOKEN,
+            max_tokens=2000
         )
+
+        self.tools = self._initialize_tools()
+        self.agent = self._create_agent()
+
+    def _initialize_tools(self) -> list:
+        """Initialize all tools with enhanced capabilities"""
+        return [
+            DuckDuckGoSearchTool(),
+            WikiSearchTool(),
+            VisitWebpageTool(),
+            SpeechToTextTool(), # Might be less relevant for a text-based research agent but kept if needed
+            FinalAnswerTool(),
+            VideoTranscriptionTool(),
+            FileAnalysisTool(),
+            DataAnalysisTool(),
+            self._create_excel_download_tool(), # Renamed for clarity
+            self._create_keywords_tool()
+        ]
+
+    def _create_excel_download_tool(self):
+        """Tool to download and parse Excel files from a specific URL"""
+        @tool
+        def download_and_parse_excel(task_id: str) -> dict:
+            """
+            Downloads an Excel file from a predefined URL using a task_id and parses its content.
+            Returns a dictionary with status and a data sample (first 10 rows).
+            """
+            try:
+                url = f"https://agents-course-unit4-scoring.hf.space/files/{task_id}"
+                logger.info(f"Attempting to download Excel from: {url}")
+                response = requests.get(url, timeout=60) # Increased timeout for larger files
+                response.raise_for_status() # Raise an exception for HTTP errors (4xx or 5xx)
+
+                with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
+                    tmp.write(response.content)
+                    temp_file_path = tmp.name
+
+                df = pd.read_excel(temp_file_path)
+                os.unlink(temp_file_path) # Clean up the temporary file
+
+                logger.info(f"Successfully downloaded and parsed Excel for task_id: {task_id}")
+                return {
+                    "task_id": task_id,
+                    "data_sample": df.head(10).to_dict(orient="records"), # Reduced to 10 for conciseness
+                    "status": "Success",
+                    "columns": df.columns.tolist(), # Added column names for context
+                    "shape": df.shape # Added shape for context
+                }
+            except requests.exceptions.RequestException as req_err:
+                logger.error(f"Network or HTTP error downloading Excel for task_id '{task_id}': {req_err}")
+                return {"status": f"Download error: {str(req_err)}"}
+            except Exception as e:
+                logger.error(f"Error parsing Excel for task_id '{task_id}': {e}")
+                return {"status": f"Parsing error: {str(e)}"}
+        return download_and_parse_excel
+
+    def _create_keywords_tool(self):
+        """Keywords extractor with TF-IDF like scoring (basic frequency for now)"""
+        @tool
+        def extract_keywords(text: str, top_n: int = 5) -> list:
+            """
+            Extracts the most frequent keywords from a given text, excluding common stopwords.
+            Args:
+                text (str): The input text to extract keywords from.
+                top_n (int): The number of top keywords to return.
+            Returns:
+                list: A list of the most frequent keywords.
+            """
+            if not text:
+                return []
+
+            # Use a more comprehensive list of English stopwords
+            stopwords = set([
+                "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it",
+                "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these",
+                "they", "this", "to", "was", "will", "with", "he", "she", "it's", "i", "we", "you", "my",
+                "your", "our", "us", "him", "her", "his", "hers", "its", "them", "their", "what", "when",
+                "where", "why", "how", "which", "who", "whom", "can", "could", "would", "should", "may",
+                "might", "must", "have", "has", "had", "do", "does", "did", "am", "are", "is", "were", "been",
+                "being", "from", "up", "down", "out", "off", "over", "under", "again", "further", "then",
+                "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few",
+                "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so",
+                "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"
+            ])
+
+            words = re.findall(r'\b\w+\b', text.lower()) # Relaxed regex to capture all words
+            filtered = [w for w in words if w not in stopwords and len(w) > 2] # Filter words less than 3 chars
+            counter = Counter(filtered)
+            return [word for word, _ in counter.most_common(top_n)]
+        return extract_keywords
+
+    def _create_agent(self) -> CodeAgent:
+        """Create agent with improved system prompt"""
+        system_prompt = """
+        You are an advanced, helpful, and highly analytical research assistant. Your goal is to provide accurate, comprehensive, and well-structured answers to user queries, leveraging all available tools efficiently.
+
+        **Follow this robust process:**
+
+        1. **Understand the User's Need:** Carefully analyze the user's question, including any attached files or specific requests (e.g., "summarize," "analyze data," "find facts").
+        2. **Formulate a Detailed Plan:** Before acting, create a clear, step-by-step plan. This plan should outline:
+            * What information needs to be gathered.
+            * Which tools are most appropriate for each step (e.g., `duckduckgo_search` for general web search, `wiki_search` for encyclopedic facts, `transcript_video` for YouTube, `file_analysis` or `data_analysis` for local files).
+            * How you will combine information from different sources.
+            * How you will verify or synthesize the findings.
+        3. **Execute the Plan Using Tools:** Call the necessary tools, providing clear and correct arguments. If a tool fails, try to understand why and adapt your plan (e.g., try a different search query or tool).
+        4. **Synthesize and Verify Information:** Once you have gathered sufficient information, synthesize it into a coherent answer. Do not just list facts; explain their significance and how they relate to the original question. If there are contradictions or uncertainties, mention them.
+        5. **Formulate the Final Answer:**
+            * Present your answer clearly and concisely.
+            * Always begin your ultimate response with "FINAL ANSWER:".
+            * If the answer is a single number, provide only the number.
+            * If the answer is a list, provide comma-separated values.
+            * For complex answers, use structured formats like bullet points or JSON where appropriate to enhance readability.
+            * **Crucially, always include sources or references (e.g., URLs, Wikipedia titles, file names) where you obtained the information.** This builds trust and allows for verification.
+            * If you used `file_analysis` or `data_analysis` tools on an uploaded file, explicitly state that you analyzed the provided file.
+
+        **Important Considerations:**
+        * **Prioritize:** If the query involves a specific file, start by analyzing that file if appropriate.
+        * **Limitations:** If you cannot answer a question with the available tools, state that clearly.
+        * **Conciseness:** Be as concise as possible while providing a complete and accurate answer.
         """
+        agent = CodeAgent(
+            model=self.model,
+            tools=self.tools,
             add_base_tools=True
         )
+        agent.prompt_templates["system_prompt"] = system_prompt
+        return agent

     def __call__(self, question: str) -> str:
+        logger.info(f"Received question: {question[:200]}...") # Log more of the question
+        try:
+            response = self.agent.run(question)
+            logger.info(f"Response generated successfully for question: {question[:200]}")
+            return response
+        except Exception as e:
+            logger.error(f"Agent execution failed for question '{question[:100]}': {str(e)}", exc_info=True) # Log full traceback
+            return f"Error processing your request: {str(e)}. Please try again or rephrase your question."

 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """