wt002 committed on
Commit b54639b · verified · 1 Parent(s): 0ed442f

Update app.py

Files changed (1)
  1. app.py +507 -161
app.py CHANGED
@@ -1,204 +1,508 @@
  import os
  import gradio as gr
  import requests
- import inspect
  import pandas as pd
- from smolagents import tool, Tool, CodeAgent, DuckDuckGoSearchTool, HfApiModel, VisitWebpageTool, SpeechToTextTool, FinalAnswerTool
- from dotenv import load_dotenv
  import heapq
  from collections import Counter
- import re
  from io import BytesIO
  from youtube_transcript_api import YouTubeTranscriptApi
- from langchain_community.tools.tavily_search import TavilySearchResults
- from langchain_community.document_loaders import WikipediaLoader
- from langchain_community.utilities import WikipediaAPIWrapper
- from langchain_community.document_loaders import ArxivLoader
-
- # (Keep Constants as is)
- # --- Constants ---
- DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
-
- #Load environment variables
  load_dotenv()

-
-
- from smolagents import Tool
- from langchain_community.document_loaders import WikipediaLoader
-
  class WikiSearchTool(Tool):
      name = "wiki_search"
-     description = "Search Wikipedia for a query and return up to 2 results."
-     inputs = {
-         "query": {"type": "string", "description": "The search term for Wikipedia."}
-     }
      output_type = "string"

      def forward(self, query: str) -> str:
-         search_docs = WikipediaLoader(query=query, load_max_docs=2).load()

-         formatted_search_docs = "\n\n---\n\n".join(
-             [
-                 f'<Document source="{doc.metadata.get("source", "Wikipedia")}" page="{doc.metadata.get("page", "")}"/>\n{doc.page_content}\n</Document>'
-                 for doc in search_docs
-             ]
-         )
-         return formatted_search_docs

-

- class StringReverseTool(Tool):
-     name = "reverse_message"
-     description = "When you received a strange text, try to reverse it and perform action described in reversed message."
      inputs = {
-         "message": {
-             "type": "string",
-             "description": "A message, which looks like strange and can be reversed to get actions to execute."
-         }
      }
      output_type = "string"

-     def __init__(self):
-         return
-
-     def forward(self, message: str):
-         return message[::-1]
-
- class KeywordsExtractorTool(Tool):
-     """Extracts top 5 keywords from a given text based on frequency."""

-     name = "keywords_extractor"
-     description = "This tool returns the 5 most frequent keywords occur in provided block of text."

      inputs = {
-         "text": {
-             "type": "string",
-             "description": "Text to analyze for keywords.",
-         }
      }
      output_type = "string"

-     def forward(self, text: str) -> str:
          try:
-             all_words = re.findall(r'\b\w+\b', text.lower())
-             conjunctions = {'a', 'and', 'of', 'is', 'in', 'to', 'the'}
-             filtered_words = []
-             for w in all_words:
-                 if w not in conjunctions:
-                     filtered_words.push(w)
-             word_counts = Counter(filtered_words)
-             k = 5
-             return heapq.nlargest(k, word_counts.items(), key=lambda x: x[1])
          except Exception as e:
-             return f"Error during extracting most common words: {e}"
-
-

- @tool
- def parse_excel_to_json(task_id: str) -> dict:
      """
-     For a given task_id fetch and parse an Excel file and save parsed data in structured JSON file.
-     Args:
-         task_id: An task ID to fetch.
-
-     Returns:
-         {
-             "task_id": str,
-             "sheets": {
-                 "SheetName1": [ {col1: val1, col2: val2, ...}, ... ],
-                 ...
-             },
-             "status": "Success" | "Error"
-         }
      """
-     url = f"https://agents-course-unit4-scoring.hf.space/files/{task_id}"
-
-     try:
-         response = requests.get(url, timeout=100)
-         if response.status_code != 200:
-             return {"task_id": task_id, "sheets": {}, "status": f"{response.status_code} - Failed"}
-
-         xls_content = pd.ExcelFile(BytesIO(response.content))
-         json_sheets = {}
-
-         for sheet in xls_content.sheet_names:
-             df = xls_content.parse(sheet)
-             df = df.dropna(how="all")
-             rows = df.head(20).to_dict(orient="records")
-             json_sheets[sheet] = rows
-
-         return {
-             "task_id": task_id,
-             "sheets": json_sheets,
-             "status": "Success"
-         }
-
-     except Exception as e:
-         return {
-             "task_id": task_id,
-             "sheets": {},
-             "status": f"Error in parsing Excel file: {str(e)}"
-         }
-
-
-
- class VideoTranscriptionTool(Tool):
-     """Fetch transcripts from YouTube videos"""
-     name = "transcript_video"
-     description = "Fetch text transcript from YouTube movies with optional timestamps"
      inputs = {
-         "url": {"type": "string", "description": "YouTube video URL or ID"},
-         "include_timestamps": {"type": "boolean", "description": "If timestamps should be included in output", "nullable": True}
      }
      output_type = "string"

-     def forward(self, url: str, include_timestamps: bool = False) -> str:

-         if "youtube.com/watch" in url:
-             video_id = url.split("v=")[1].split("&")[0]
-         elif "youtu.be/" in url:
-             video_id = url.split("youtu.be/")[1].split("?")[0]
-         elif len(url.strip()) == 11: # Direct ID
-             video_id = url.strip()
-         else:
-             return f"YouTube URL or ID: {url} is invalid!"

          try:
-             transcription = YouTubeTranscriptApi.get_transcript(video_id)
-
-             if include_timestamps:
-                 formatted_transcription = []
-                 for part in transcription:
-                     timestamp = f"{int(part['start']//60)}:{int(part['start']%60):02d}"
-                     formatted_transcription.append(f"[{timestamp}] {part['text']}")
-                 return "\n".join(formatted_transcription)
          else:
-             return " ".join([part['text'] for part in transcription])

          except Exception as e:
-             return f"Error in extracting YouTube transcript: {str(e)}"

  class BasicAgent:
      def __init__(self):
-         token = os.environ.get("HF_API_TOKEN")
-         model = HfApiModel(
-             temperature=0.1,
-             token=token
          )

-         search_tool = DuckDuckGoSearchTool()
-         wiki_search_tool = WikiSearchTool()
-         str_reverse_tool = StringReverseTool()
-         keywords_extract_tool = KeywordsExtractorTool()
-         speech_to_text_tool = SpeechToTextTool()
-         visit_webpage_tool = VisitWebpageTool()
-         final_answer_tool = FinalAnswerTool()
-         video_transcription_tool = VideoTranscriptionTool()
-
-         system_prompt = f"""
  You are an advanced, helpful, and highly analytical research assistant. Your goal is to provide accurate, comprehensive, and well-structured answers to user queries, leveraging all available tools efficiently.

  **Follow this robust process:**
@@ -206,7 +510,14 @@ You are an advanced, helpful, and highly analytical research assistant. Your goa
  1. **Understand the User's Need:** Carefully analyze the user's question, including any attached files or specific requests (e.g., "summarize," "analyze data," "find facts").
  2. **Formulate a Detailed Plan:** Before acting, create a clear, step-by-step plan. This plan should outline:
      * What information needs to be gathered.
-     * Which tools are most appropriate for each step (e.g., `duckduckgo_search` for general web search, `wiki_search` for encyclopedic facts, `transcript_video` for YouTube, `file_analysis` or `data_analysis` for local files).
      * How you will combine information from different sources.
      * How you will verify or synthesize the findings.
  3. **Execute the Plan Using Tools:** Call the necessary tools, providing clear and correct arguments. If a tool fails, try to understand why and adapt your plan (e.g., try a different search query or tool).
@@ -217,28 +528,63 @@ You are an advanced, helpful, and highly analytical research assistant. Your goa
      * If the answer is a single number, provide only the number.
      * If the answer is a list, provide comma-separated values.
      * For complex answers, use structured formats like bullet points or JSON where appropriate to enhance readability.
-     * **Crucially, always include sources or references (e.g., URLs, Wikipedia titles, file names) where you obtained the information.** This builds trust and allows for verification.
      * If you used `file_analysis` or `data_analysis` tools on an uploaded file, explicitly state that you analyzed the provided file.

  **Important Considerations:**
-     * **Prioritize:** If the query involves a specific file, start by analyzing that file if appropriate.
      * **Limitations:** If you cannot answer a question with the available tools, state that clearly.
      * **Conciseness:** Be as concise as possible while providing an accurate answer.
  """
-         self.agent = CodeAgent(
-             model=model,
-             tools=[search_tool, wiki_search_tool, str_reverse_tool, keywords_extract_tool, speech_to_text_tool, visit_webpage_tool, \
-                    final_answer_tool, parse_excel_to_json, video_transcription_tool],
-             add_base_tools=True
          )
-         self.agent.prompt_templates["system_prompt"] = self.agent.prompt_templates["system_prompt"] + system_prompt

      def __call__(self, question: str) -> str:
          print(f"Agent received question (first 50 chars): {question[:50]}...")
          answer = self.agent.run(question)
          print(f"Agent returning answer: {answer}")
          return answer
-

  def run_and_submit_all( profile: gr.OAuthProfile | None):
      """
 
  import os
+ import re
  import gradio as gr
  import requests
  import pandas as pd
  import heapq
  from collections import Counter
  from io import BytesIO
  from youtube_transcript_api import YouTubeTranscriptApi
+ from smolagents import tool, Tool, CodeAgent, DuckDuckGoSearchTool, HfApiModel, VisitWebpageTool, SpeechToTextTool, FinalAnswerTool
+ from langchain_community.document_loaders import WikipediaLoader, PyPDFLoader, TextLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_community.vectorstores import DocArrayInMemorySearch
+ from langchain_core.documents import Document
+ from dotenv import load_dotenv
+ import tempfile
+ import mimetypes
+ import logging
+ import uuid
+ # For timeout functionality
+ import concurrent.futures
+ import time
+
+ # --- Initialize logging ---
+ LOG_FILE_PATH = "agent_activity.log"
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+     filename=LOG_FILE_PATH,
+     filemode='a'
+ )
+ logger = logging.getLogger(__name__)
+
+ # --- Load environment variables ---
  load_dotenv()
+ HF_API_TOKEN = os.getenv("HF_API_TOKEN")
+ HF_EMBEDDING_MODEL_ID = os.getenv("HF_EMBEDDING_MODEL_ID", "sentence-transformers/all-MiniLM-L6-v2")
+
+ if not HF_API_TOKEN:
+     logger.error("HF_API_TOKEN not found in environment variables! Please set it to use the HfApiModel.")
+
+ # --- Global Vector Store and Embeddings ---
+ try:
+     embeddings = HuggingFaceEmbeddings(model_name=HF_EMBEDDING_MODEL_ID)
+     logger.info(f"Initialized HuggingFaceEmbeddings with model: {HF_EMBEDDING_MODEL_ID}")
+ except Exception as e:
+     logger.error(f"Failed to initialize HuggingFaceEmbeddings: {e}. Please ensure the model_id is correct and dependencies are installed.")
+     embeddings = None
+
+ vectorstore = DocArrayInMemorySearch(embedding_function=embeddings) if embeddings else None
+ text_splitter = RecursiveCharacterTextSplitter(
+     chunk_size=1000,
+     chunk_overlap=200,
+     length_function=len,
+     is_separator_regex=False,
+ )
+ logger.info("Initialized in-memory DocArrayInMemorySearch vector store and RecursiveCharacterTextSplitter.")
+
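As a rough sketch of what these chunking settings do (the sample text is invented; exact chunk boundaries depend on the separators found):

sample_chunks = text_splitter.split_text("lorem ipsum " * 250)  # roughly 3,000 characters
print(len(sample_chunks), [len(c) for c in sample_chunks])
# expect a few chunks of up to ~1,000 characters, with ~200 characters shared between neighbours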
+
+ # --- Utility Functions ---
+ def extract_youtube_id(url: str) -> str:
+     """Extract YouTube ID from various URL formats"""
+     patterns = [
+         r'(?:https?:\/\/)?(?:www\.)?youtube\.com\/watch\?v=([^&]+)',
+         r'(?:https?:\/\/)?youtu\.be\/([^?]+)',
+         r'([a-zA-Z0-9_-]{11})'
+     ]
+     for pattern in patterns:
+         match = re.search(pattern, url)
+         if match:
+             return match.group(1)
+     return ""
+
+ def add_document_to_vector_store(content: str, source: str, metadata: dict = None):
76
+ """
77
+ Adds content to the global vector store.
78
+ Chunks the content and creates LangChain Documents.
79
+ """
80
+ if vectorstore is None:
81
+ logger.warning("Vector store not initialized. Cannot add document.")
82
+ return
83
 
84
+ try:
85
+ chunks = text_splitter.split_text(content)
86
+ docs = []
87
+ for i, chunk in enumerate(chunks):
88
+ doc_metadata = {"source": source, "chunk_index": i}
89
+ if metadata:
90
+ doc_metadata.update(metadata)
91
+ docs.append(Document(page_content=chunk, metadata=doc_metadata))
92
+
93
+ vectorstore.add_documents(docs)
94
+ logger.info(f"Added {len(docs)} chunks from '{source}' to the vector store.")
95
+ except Exception as e:
96
+ logger.error(f"Error adding document from '{source}' to vector store: {e}")

+ # --- Enhanced Tools ---
  class WikiSearchTool(Tool):
+     """Enhanced Wikipedia search with better formatting and error handling"""
      name = "wiki_search"
+     description = "Search Wikipedia for a query. Returns up to 2 results with metadata."
+     inputs = {"query": {"type": "string", "description": "Search term for Wikipedia"}}
      output_type = "string"

      def forward(self, query: str) -> str:
+         try:
+             logger.info(f"Searching Wikipedia for: {query}")
+             docs = WikipediaLoader(query=query, load_max_docs=2).load()
+             if not docs:
+                 logger.info(f"No Wikipedia articles found for: {query}")
+                 return "No Wikipedia articles found."
+
+             formatted_results = []
+             for i, doc in enumerate(docs):
+                 summary = doc.page_content[:1000] + "..." if len(doc.page_content) > 1000 else doc.page_content
+
+                 add_document_to_vector_store(
+                     content=doc.page_content,
+                     source=doc.metadata.get('source', 'Wikipedia'),
+                     metadata={"title": doc.metadata.get('title', 'N/A')}
+                 )
+
+                 formatted_results.append(
+                     f"--- Wikipedia Result {i+1} ---\n"
+                     f"Title: {doc.metadata.get('title', 'N/A')}\n"
+                     f"URL: {doc.metadata.get('source', 'N/A')}\n"
+                     f"Summary: {summary}\n"
+                 )
+             return "\n\n".join(formatted_results)
+         except Exception as e:
+             logger.error(f"Wikipedia search error for '{query}': {e}")
+             return f"Wikipedia search error: {str(e)}"
+
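For a sense of the formatted output, a direct call could look like this (query chosen arbitrarily; requires network access and indexes the fetched articles as a side effect):

wiki = WikiSearchTool()
print(wiki.forward("Ada Lovelace")[:400])  # first lines of the "--- Wikipedia Result 1 ---" block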
+ class FileAnalysisTool(Tool):
+     """Universal file analyzer for text/PDF/Excel files. Content added to vector store."""
+     name = "file_analysis"
+     description = "Analyze text, PDF, and Excel files. Returns extracted content. Text and PDF content is also indexed for future retrieval."
+     inputs = {"file_path": {"type": "string", "description": "Path to the local file"}}
+     output_type = "string"

+     def forward(self, file_path: str) -> str:
+         if not os.path.exists(file_path):
+             return f"File not found: {file_path}"

+         try:
+             mime_type, _ = mimetypes.guess_type(file_path)
+             logger.info(f"Analyzing file: {file_path} with MIME type: {mime_type}")
+
+             content = ""
+             if mime_type == "application/pdf":
+                 content = self._process_pdf(file_path)
+             elif mime_type in ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "application/vnd.ms-excel"]:
+                 content = self._process_excel(file_path)
+             elif mime_type and ("text" in mime_type or "csv" in mime_type):
+                 content = self._process_text(file_path)
+             else:
+                 return f"Unsupported file type for analysis: {mime_type}. Only PDF, Excel, and text/CSV files are supported."

+             if mime_type in ["application/pdf", "text/plain", "text/csv"]:
+                 add_document_to_vector_store(
+                     content=content,
+                     source=f"file:{os.path.basename(file_path)}",
+                     metadata={"file_path": file_path, "mime_type": mime_type}
+                 )
+
+             return content

+         except Exception as e:
+             logger.error(f"File analysis error for '{file_path}': {e}")
+             return f"File analysis error: {str(e)}"
+
+     def _process_pdf(self, path: str) -> str:
+         loader = PyPDFLoader(path)
+         docs = loader.load()
+         content = "\n\n".join([doc.page_content for doc in docs])
+         if len(content) > 8000:
+             logger.warning(f"PDF content truncated from {len(content)} to 8000 characters for {path}")
+             return content[:8000] + "\n... [Content truncated]"
+         return content
+
+     def _process_excel(self, path: str) -> str:
+         df = pd.read_excel(path)
+         info = BytesIO()
+         df.info(buf=info)
+         info_str = info.getvalue().decode('utf-8')
+
+         return (f"Excel file loaded. First 10 rows:\n{df.head(10).to_markdown()}\n\n"
+                 f"DataFrame Info:\n{info_str}")
+
+     def _process_text(self, path: str) -> str:
+         with open(path, 'r', encoding='utf-8') as f:
+             content = f.read()
+         if len(content) > 8000:
+             logger.warning(f"Text file content truncated from {len(content)} to 8000 characters for {path}")
+             return content[:8000] + "\n... [Content truncated]"
+         return content
+
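The routing above keys off mimetypes.guess_type; a quick illustration with invented file names (the Excel mapping can vary by platform):

import mimetypes
for name in ("notes.txt", "report.pdf", "data.csv"):
    print(name, mimetypes.guess_type(name)[0])
# typically: text/plain, application/pdf, text/csv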
+ class VideoTranscriptionTool(Tool):
+     """Enhanced YouTube transcription with multilingual support and better output. Transcribed content is added to vector store."""
+     name = "transcript_video"
+     description = "Fetch YouTube video transcripts with optional timestamps. Supports English, French, Spanish, German. Transcribed text is indexed for future retrieval."
      inputs = {
+         "url": {"type": "string", "description": "YouTube URL or ID"},
+         "include_timestamps": {"type": "boolean", "description": "Include timestamps? (default: False)"}
      }
      output_type = "string"

+     def forward(self, url: str, include_timestamps: bool = False) -> str:
+         try:
+             video_id = extract_youtube_id(url)
+             if not video_id:
+                 return "Invalid YouTube URL or ID format. Please provide a valid YouTube URL or an 11-character video ID."
+
+             logger.info(f"Attempting to transcribe video ID: {video_id}")
+             transcript_list = YouTubeTranscriptApi.get_transcript(
+                 video_id,
+                 languages=['en', 'fr', 'es', 'de']
+             )
+
+             if not transcript_list:
+                 return f"No transcript found for video ID: {video_id} in supported languages (en, fr, es, de)."
+
+             full_transcript_text = " ".join(seg['text'] for seg in transcript_list)
+
+             add_document_to_vector_store(
+                 content=full_transcript_text,
+                 source=f"youtube_video:{video_id}",
+                 metadata={"video_url": url}
+             )

+             if include_timestamps:
+                 formatted_transcript = "\n".join(
+                     f"[{int(seg['start']//60):02d}:{int(seg['start']%60):02d}] {seg['text']}"
+                     for seg in transcript_list
+                 )
+             else:
+                 formatted_transcript = full_transcript_text
+
+             return formatted_transcript
+         except Exception as e:
+             logger.error(f"Transcription error for '{url}': {e}")
+             return f"Transcription error: {str(e)}. This might be due to no available transcript or an unsupported video."

+ class DataAnalysisTool(Tool):
+     """Perform data analysis using pandas on structured data (CSV/Excel)"""
+     name = "data_analysis"
+     description = "Analyze CSV/Excel data using pandas operations. Supported operations: 'describe', 'groupby:column:aggfunc' (e.g., 'groupby:Category:mean'). Outputs are NOT added to vector store."
      inputs = {
+         "file_path": {"type": "string", "description": "Path to the local data file (CSV or Excel)"},
+         "operation": {"type": "string", "description": "Pandas operation (e.g., 'describe', 'groupby:column_name:agg_function')"}
      }
      output_type = "string"

+     def forward(self, file_path: str, operation: str) -> str:
+         if not os.path.exists(file_path):
+             return f"File not found: {file_path}"
+
          try:
+             if file_path.endswith('.csv'):
+                 df = pd.read_csv(file_path)
+             elif file_path.endswith('.xlsx') or file_path.endswith('.xls'):
+                 df = pd.read_excel(file_path)
+             else:
+                 return "Unsupported file format for data analysis. Please provide a .csv or .xlsx file."
+
+             logger.info(f"Performing data analysis operation '{operation}' on {file_path}")
+
+             if operation == "describe":
+                 return "Descriptive Statistics:\n" + str(df.describe())
+             elif operation.startswith("groupby:"):
+                 parts = operation.split(":")
+                 if len(parts) == 3:
+                     _, col, agg = parts
+                     if col not in df.columns:
+                         return f"Column '{col}' not found in the DataFrame."
+                     try:
+                         result = df.groupby(col).agg(agg)
+                         return f"Groupby operation '{agg}' on column '{col}':\n" + str(result)
+                     except Exception as agg_e:
+                         return f"Error performing aggregation '{agg}' on column '{col}': {str(agg_e)}"
+                 else:
+                     return "Invalid 'groupby' operation format. Use 'groupby:column_name:agg_function'."
+             else:
+                 return "Unsupported operation. Try: 'describe' or 'groupby:column_name:agg_function'."
          except Exception as e:
+             logger.error(f"Data analysis error for '{file_path}' with operation '{operation}': {e}")
+             return f"Data analysis error: {str(e)}. Please check file content and operation."

+ class RetrievalTool(Tool):
      """
+     Retrieves relevant information from the in-memory vector store based on a query.
+     This tool allows the agent to access previously processed documents and transcripts.
      """
+     name = "retrieve_from_vector_store"
+     description = "Search for relevant information within previously processed documents and transcripts using a semantic query. Returns top K relevant chunks."
      inputs = {
+         "query": {"type": "string", "description": "The semantic query to search the vector store."},
+         "k": {"type": "integer", "description": "Number of top results to retrieve (default: 3)", "default": 3}
      }
      output_type = "string"

+     def forward(self, query: str, k: int = 3) -> str:
+         if vectorstore is None:
+             return "Vector store is not initialized. No documents available for retrieval."
+
+         try:
+             logger.info(f"Retrieving {k} chunks from DocArrayInMemorySearch for query: {query}")
+             retrieved_docs = vectorstore.similarity_search(query, k=k)
+
+             if not retrieved_docs:
+                 return "No relevant information found in the vector store for this query."
+
+             formatted_results = []
+             for i, doc in enumerate(retrieved_docs):
+                 source = doc.metadata.get('source', 'Unknown Source')
+                 title = doc.metadata.get('title', 'N/A')
+                 chunk_index = doc.metadata.get('chunk_index', 'N/A')
+                 formatted_results.append(
+                     f"--- Retrieved Document Chunk {i+1} ---\n"
+                     f"Source: {source} (Chunk: {chunk_index})\n"
+                     f"Title: {title}\n"
+                     f"Content: {doc.page_content}\n"
+                 )
+             return "\n\n".join(formatted_results)
+         except Exception as e:
+             logger.error(f"Error retrieving from vector store for query '{query}': {e}")
+             return f"Error retrieving from vector store: {str(e)}"

+ class ChessAnalysisAPITool(Tool):
+     """
+     Analyzes a chess position provided in FEN format using a remote chess engine API (chess-api.com).
+     """
+     name = "analyze_chess_position_api"
+     description = (
+         "Analyze a chess position provided in FEN (Forsyth-Edwards Notation) format using an online engine. "
+         "Returns the best move in algebraic notation for the current player, along with evaluation."
+         "Note: This tool cannot interpret chess positions directly from images. "
+         "The FEN string must be provided by the user."
+     )
+     inputs = {
+         "fen_string": {"type": "string", "description": "The chess position in FEN format. Example: 'rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1'"},
+         "depth": {"type": "integer", "description": "The analysis depth for the chess engine (higher means better, but slower; max ~18 for this API; default: 15)", "default": 15}
+     }
+     output_type = "string"

+     def forward(self, fen_string: str, depth: int = 15) -> str:
+         actual_depth = min(depth, 18)
+
          try:
+             logger.info(f"Analyzing FEN: {fen_string} at depth {actual_depth} using chess-api.com.")
+
+             response = requests.post(
+                 "https://chess-api.com/v1",
+                 json={"fen": fen_string, "depth": actual_depth}
+             )
+             response.raise_for_status()
+             data = response.json()
+
+             if data.get("type") == "bestmove":
+                 move_san = data.get("san", data.get("move"))
+                 evaluation = data.get("eval")
+                 mate_in_moves = data.get("mate")
+
+                 result = f"Best move: **{move_san}** (UCI: {data.get('move')})\n"
+
+                 if mate_in_moves is not None:
+                     player_to_move = "White" if data.get("turn") == 'w' else "Black"
+                     result += f"Forced mate for {player_to_move} in {abs(mate_in_moves)} moves.\n"
+                 elif evaluation is not None:
+                     eval_str = ""
+                     if evaluation >= 1000:
+                         eval_str = "Decisive advantage for White"
+                     elif evaluation <= -1000:
+                         eval_str = "Decisive advantage for Black"
+                     elif evaluation > 0:
+                         eval_str = f"White is up by {evaluation} centipawns"
+                     elif evaluation < 0:
+                         eval_str = f"Black is up by {abs(evaluation)} centipawns"
+                     else:
+                         eval_str = "Even position"
+                     result += f"Evaluation: {eval_str} (Depth: {data.get('depth')})\n"
+
+                 result += "(Source: chess-api.com - Stockfish 17 NNUE)"
+                 return result
              else:
+                 return f"Chess API response: {data.get('text', 'No best move found or error.')}"

+         except requests.exceptions.RequestException as e:
+             logger.error(f"Error communicating with remote chess analysis API for FEN '{fen_string}': {e}")
+             return f"Error contacting remote chess analysis API: {str(e)}. Please try again later."
          except Exception as e:
+             logger.error(f"An unexpected error occurred during remote chess analysis for FEN '{fen_string}': {e}")
+             return f"An unexpected error occurred during chess analysis: {str(e)}"

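A minimal sketch of a call, using the starting-position FEN from the tool's own input description (needs network access to chess-api.com):

chess_tool = ChessAnalysisAPITool()
print(chess_tool.forward("rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1", depth=12))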
 
396
+ # --- Agent Initialization ---
397
  class BasicAgent:
398
  def __init__(self):
399
+ self.model = HfApiModel(
400
+ temperature=0.0,
401
+ os.environ.get("HF_API_TOKEN"),
402
+ max_tokens=2000
403
  )
404
+
405
+ self.tools = self._initialize_tools()
406
+ self.agent = self._create_agent()
407
+
408
+ def _initialize_tools(self) -> list:
409
+ """Initialize all tools with enhanced capabilities"""
410
+ base_tools = [
411
+ DuckDuckGoSearchTool(),
412
+ WikiSearchTool(),
413
+ VisitWebpageTool(),
414
+ SpeechToTextTool(),
415
+ FinalAnswerTool(),
416
+ VideoTranscriptionTool(),
417
+ FileAnalysisTool(),
418
+ DataAnalysisTool(),
419
+ self._create_excel_download_tool(),
420
+ self._create_keywords_tool(),
421
+ ChessAnalysisAPITool(),
422
+ ]
423
+
424
+ if vectorstore and embeddings:
425
+ logger.info("Adding RetrievalTool to the agent's tools.")
426
+ base_tools.append(RetrievalTool())
427
+ else:
428
+ logger.warning("RetrievalTool not added because vector store or embeddings are not initialized.")
429
 
430
+ return base_tools
431
+
+     def _create_excel_download_tool(self):
+         """Tool to download and parse Excel files from a specific URL"""
+         @tool
+         def download_and_parse_excel(task_id: str) -> dict:
+             """
+             Downloads an Excel file from a predefined URL using a task_id and parses its content.
+             Returns a dictionary with status and data (first 10 rows), columns, and shape.
+             """
+             try:
+                 url = f"https://agents-course-unit4-scoring.hf.space/files/{task_id}"
+                 logger.info(f"Attempting to download Excel from: {url}")
+                 response = requests.get(url, timeout=60)
+                 response.raise_for_status()
+
+                 with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
+                     tmp.write(response.content)
+                     temp_file_path = tmp.name
+
+                 df = pd.read_excel(temp_file_path)
+                 os.unlink(temp_file_path)
+
+                 logger.info(f"Successfully downloaded and parsed Excel for task_id: {task_id}")
+                 return {
+                     "task_id": task_id,
+                     "data_sample": df.head(10).to_dict(orient="records"),
+                     "status": "Success",
+                     "columns": df.columns.tolist(),
+                     "shape": df.shape
+                 }
+             except requests.exceptions.RequestException as req_err:
+                 logger.error(f"Network or HTTP error downloading Excel for task_id '{task_id}': {req_err}")
+                 return {"status": f"Download error: {str(req_err)}"}
+             except Exception as e:
+                 logger.error(f"Error parsing Excel for task_id '{task_id}': {e}")
+                 return {"status": f"Parsing error: {str(e)}"}
+         return download_and_parse_excel
+
+ def _create_keywords_tool(self):
470
+ """Keywords extractor with TF-IDF like scoring (basic frequency for now)"""
471
+ @tool
472
+ def extract_keywords(text: str, top_n: int = 5) -> list:
473
+ """
474
+ Extracts the most frequent keywords from a given text, excluding common stopwords.
475
+ Args:
476
+ text (str): The input text to extract keywords from.
477
+ top_n (int): The number of top keywords to return.
478
+ Returns:
479
+ list: A list of the most frequent keywords.
480
+ """
481
+ if not text:
482
+ return []
483
+
484
+ stopwords = set([
485
+ "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it",
486
+ "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these",
487
+ "they", "this", "to", "was", "will", "with", "he", "she", "it's", "i", "we", "you", "my",
488
+ "your", "our", "us", "him", "her", "his", "hers", "its", "them", "their", "what", "when",
489
+ "where", "why", "how", "which", "who", "whom", "can", "could", "would", "should", "may",
490
+ "might", "must", "have", "has", "had", "do", "does", "did", "am", "are", "is", "were", "been",
491
+ "being", "from", "up", "down", "out", "off", "over", "under", "again", "further", "then",
492
+ "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few",
493
+ "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so",
494
+ "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"
495
+ ])
496
+
497
+ words = re.findall(r'\b\w+\b', text.lower())
498
+ filtered = [w for w in words if w not in stopwords and len(w) > 2]
499
+ counter = Counter(filtered)
500
+ return [word for word, _ in counter.most_common(top_n)]
501
+ return extract_keywords
502
+
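A standalone sketch of the frequency logic the extractor relies on (the stopword set here is a trimmed stand-in for the full list above):

from collections import Counter
import re

sample = "Chess openings, chess tactics and chess endgames shape every game of chess."
stop = {"and", "of", "the", "every"}
words = [w for w in re.findall(r'\b\w+\b', sample.lower()) if w not in stop and len(w) > 2]
print(Counter(words).most_common(3))  # "chess" ranks first with 4; ties keep their original order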
+     def _create_agent(self) -> CodeAgent:
+         """Create agent with improved system prompt"""
+         system_prompt = """
  You are an advanced, helpful, and highly analytical research assistant. Your goal is to provide accurate, comprehensive, and well-structured answers to user queries, leveraging all available tools efficiently.

  **Follow this robust process:**

  1. **Understand the User's Need:** Carefully analyze the user's question, including any attached files or specific requests (e.g., "summarize," "analyze data," "find facts").
  2. **Formulate a Detailed Plan:** Before acting, create a clear, step-by-step plan. This plan should outline:
      * What information needs to be gathered.
+     * Which tools are most appropriate for each step.
+         * Use `retrieve_from_vector_store` first if the query seems to be related to previously processed information (e.g., "What did we learn about X from the uploaded document?").
+         * Use `duckduckgo_search` for general web search.
+         * Use `wiki_search` for encyclopedic facts.
+         * Use `transcript_video` for YouTube video content.
+         * Use `file_analysis` to inspect content of local files.
+         * Use `data_analysis` for structured analysis of CSV/Excel files.
+         * Use `analyze_chess_position_api` if the user provides a FEN string for a chess position and asks for the best move.
      * How you will combine information from different sources.
      * How you will verify or synthesize the findings.
  3. **Execute the Plan Using Tools:** Call the necessary tools, providing clear and correct arguments. If a tool fails, try to understand why and adapt your plan (e.g., try a different search query or tool).
      * If the answer is a single number, provide only the number.
      * If the answer is a list, provide comma-separated values.
      * For complex answers, use structured formats like bullet points or JSON where appropriate to enhance readability.
+     * **Crucially, always include sources or references (e.g., URLs, Wikipedia titles, file names, "Internal Knowledge Base", "Remote Chess API") where you obtained the information.** This builds trust and allows for verification.
      * If you used `file_analysis` or `data_analysis` tools on an uploaded file, explicitly state that you analyzed the provided file.

  **Important Considerations:**
+     * **Prioritize:** If the query involves a specific file, start by analyzing that file if appropriate. If the query seems to refer to previously processed data, try `retrieve_from_vector_store` first.
      * **Limitations:** If you cannot answer a question with the available tools, state that clearly.
      * **Conciseness:** Be as concise as possible while providing an accurate answer.
  """
+         agent = CodeAgent(
+             model=self.model,
+             tools=self.tools,
+             add_base_tools=True,
+             max_steps=15  # <--- Added this to limit agent's internal reasoning/tool-use steps
+         )
+         agent.prompt_templates["system_prompt"] = system_prompt
+         return agent

      def __call__(self, question: str) -> str:
+         logger.info(f"Agent received question (first 50 chars): {question[:50]}...")
+         try:
+             global vectorstore
+             if embeddings:
+                 vectorstore = DocArrayInMemorySearch(embedding_function=embeddings)
+                 logger.info("DocArrayInMemorySearch re-initialized for new session.")
+             else:
+                 logger.warning("Embeddings not initialized, cannot re-initialize DocArrayInMemorySearch.")
+                 return "Error: Embedding model not loaded, cannot process request."
+
+             # --- Implement a timeout for the agent's run method ---
+             # Max time in seconds for the agent to respond
+             AGENT_TIMEOUT_SECONDS = 120
+
+             with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+                 future = executor.submit(self.agent.run, question)
+                 try:
+                     response = future.result(timeout=AGENT_TIMEOUT_SECONDS)
+                 except concurrent.futures.TimeoutError:
+                     logger.warning(f"Agent execution timed out after {AGENT_TIMEOUT_SECONDS} seconds for question: {question[:100]}...")
+                     future.cancel()  # Cancel the future if it's still running
+                     return "Error: The agent took too long to respond and timed out. Please try again with a simpler query or check the input."
+                 except Exception as e:
+                     # Catch any other exceptions that might occur during agent.run
+                     logger.error(f"Agent execution failed during run for question '{question[:100]}': {str(e)}", exc_info=True)
+                     return f"Error processing your request: {str(e)}. Please try again or rephrase your question."
+
+             logger.info(f"Response generated successfully for question: {question[:200]}")
+             return response
+         except Exception as e:
+             # This outer catch is for issues before agent.run is called or unhandled by the ThreadPoolExecutor
+             logger.error(f"Agent setup or execution failed (outer catch) for question '{question[:100]}': {str(e)}", exc_info=True)
+             return f"Error processing your request: {str(e)}. Please try again or rephrase your question."
+
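A minimal local smoke test for the class as a whole might look like this (assumes HF_API_TOKEN is set and the required packages are installed):

if __name__ == "__main__":
    agent = BasicAgent()
    print(agent("What is the capital of France?"))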

  def run_and_submit_all( profile: gr.OAuthProfile | None):
      """