Update app.py
app.py
CHANGED
@@ -9,60 +9,22 @@ from io import BytesIO
 from youtube_transcript_api import YouTubeTranscriptApi
 from smolagents import tool, Tool, CodeAgent, DuckDuckGoSearchTool, HfApiModel, VisitWebpageTool, SpeechToTextTool, FinalAnswerTool
 from langchain_community.document_loaders import WikipediaLoader, PyPDFLoader, TextLoader
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-# Use the new import for HuggingFaceEmbeddings
-from langchain_huggingface import HuggingFaceEmbeddings  # <--- IMPORTANT: Updated import
-from langchain_community.vectorstores import DocArrayInMemorySearch
-from langchain_core.documents import Document
 from dotenv import load_dotenv
 import tempfile
 import mimetypes
 import logging
-import uuid
-# For timeout functionality
-import concurrent.futures
-import time
-
-# Import DocList from docarray
-from docarray import DocList  # <--- IMPORTANT: Added this import
-

 # --- Initialize logging ---
-
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-    filename=LOG_FILE_PATH,
-    filemode='a'
-)
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)

 # --- Load environment variables ---
 load_dotenv()
 HF_API_TOKEN = os.getenv("HF_API_TOKEN")
-HF_EMBEDDING_MODEL_ID = os.getenv("HF_EMBEDDING_MODEL_ID", "sentence-transformers/all-MiniLM-L6-v2")
-
 if not HF_API_TOKEN:
     logger.error("HF_API_TOKEN not found in environment variables! Please set it to use the HfApiModel.")
-
-#
-try:
-    embeddings = HuggingFaceEmbeddings(model_name=HF_EMBEDDING_MODEL_ID)
-    logger.info(f"Initialized HuggingFaceEmbeddings with model: {HF_EMBEDDING_MODEL_ID}")
-except Exception as e:
-    logger.error(f"Failed to initialize HuggingFaceEmbeddings: {e}. Please ensure the model_id is correct and dependencies are installed.")
-    embeddings = None
-
-# Initialize DocArrayInMemorySearch WITH the required arguments: doc_index and embedding
-vectorstore = DocArrayInMemorySearch(doc_index=DocList(), embedding=embeddings) if embeddings else None  # <--- FIXED THIS LINE
-text_splitter = RecursiveCharacterTextSplitter(
-    chunk_size=1000,
-    chunk_overlap=200,
-    length_function=len,
-    is_separator_regex=False,
-)
-logger.info("Initialized in-memory DocArrayInMemorySearch vector store and RecursiveCharacterTextSplitter.")
-
+    # Exit or raise an error if the token is critical for functionality
+    # sys.exit(1)  # Uncomment if you want to exit the script if token is missing

 # --- Utility Functions ---
 def extract_youtube_id(url: str) -> str:
@@ -70,7 +32,7 @@ def extract_youtube_id(url: str) -> str:
     patterns = [
         r'(?:https?:\/\/)?(?:www\.)?youtube\.com\/watch\?v=([^&]+)',
         r'(?:https?:\/\/)?youtu\.be\/([^?]+)',
-        r'([a-zA-Z0-9_-]{11})'
+        r'([a-zA-Z0-9_-]{11})'  # Catches just the ID if provided directly
     ]
     for pattern in patterns:
         match = re.search(pattern, url)
@@ -78,31 +40,6 @@ def extract_youtube_id(url: str) -> str:
             return match.group(1)
     return ""

-def add_document_to_vector_store(content: str, source: str, metadata: dict = None):
-    """
-    Adds content to the global vector store.
-    Chunks the content and creates LangChain Documents.
-    """
-    if vectorstore is None:
-        logger.warning("Vector store not initialized. Cannot add document.")
-        return
-
-    try:
-        chunks = text_splitter.split_text(content)
-        docs = []
-        for i, chunk in enumerate(chunks):
-            doc_metadata = {"source": source, "chunk_index": i}
-            if metadata:
-                doc_metadata.update(metadata)
-            docs.append(Document(page_content=chunk, metadata=doc_metadata))
-
-        # When vectorstore is initialized with embedding, add_documents might not need it again.
-        # But explicitly passing it is safer if there are multiple ways to initialize.
-        vectorstore.add_documents(docs)  # No `embedding` argument needed here if initialized in __init__
-        logger.info(f"Added {len(docs)} chunks from '{source}' to the vector store.")
-    except Exception as e:
-        logger.error(f"Error adding document from '{source}' to vector store: {e}")
-
 # --- Enhanced Tools ---
 class WikiSearchTool(Tool):
     """Enhanced Wikipedia search with better formatting and error handling"""
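For reference, the in-memory retrieval pipeline deleted above (embeddings, text splitter, vector store, and the add_document_to_vector_store helper) boils down to a handful of calls. A minimal sketch, assuming langchain-huggingface, langchain-community, docarray and sentence-transformers are installed; it uses DocArrayInMemorySearch.from_documents rather than the direct constructor call the removed code made:

# Minimal sketch of the retrieval setup removed by this commit (assumptions noted above).
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import DocArrayInMemorySearch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

def build_store(content: str, source: str) -> DocArrayInMemorySearch:
    # Chunk the raw text and wrap each chunk in a LangChain Document
    docs = [
        Document(page_content=chunk, metadata={"source": source, "chunk_index": i})
        for i, chunk in enumerate(splitter.split_text(content))
    ]
    # from_documents embeds the chunks and returns a ready-to-query store
    return DocArrayInMemorySearch.from_documents(docs, embeddings)

store = build_store("some long document text ...", source="example")
for doc in store.similarity_search("query about the document", k=3):
    print(doc.metadata["chunk_index"], doc.page_content[:80])

Using from_documents side-steps the doc_index/embedding constructor arguments the removed module-level initialization had to pass explicitly.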
@@ -121,14 +58,8 @@ class WikiSearchTool(Tool):

             formatted_results = []
             for i, doc in enumerate(docs):
+                # Limit page content length to avoid overwhelming the model, but provide enough context
                 summary = doc.page_content[:1000] + "..." if len(doc.page_content) > 1000 else doc.page_content
-
-                add_document_to_vector_store(
-                    content=doc.page_content,
-                    source=doc.metadata.get('source', 'Wikipedia'),
-                    metadata={"title": doc.metadata.get('title', 'N/A')}
-                )
-
                 formatted_results.append(
                     f"--- Wikipedia Result {i+1} ---\n"
                     f"Title: {doc.metadata.get('title', 'N/A')}\n"
@@ -141,9 +72,9 @@ class WikiSearchTool(Tool):
             return f"Wikipedia search error: {str(e)}"

 class FileAnalysisTool(Tool):
-    """Universal file analyzer for text/PDF/Excel files
+    """Universal file analyzer for text/PDF/Excel files"""
     name = "file_analysis"
-    description = "Analyze text, PDF, and Excel files. Returns extracted content.
+    description = "Analyze text, PDF, and Excel files. Returns extracted content."
     inputs = {"file_path": {"type": "string", "description": "Path to the local file"}}
     output_type = "string"

@@ -155,25 +86,14 @@ class FileAnalysisTool(Tool):
             mime_type, _ = mimetypes.guess_type(file_path)
             logger.info(f"Analyzing file: {file_path} with MIME type: {mime_type}")

-            content = ""
             if mime_type == "application/pdf":
-                content = self._process_pdf(file_path)
+                return self._process_pdf(file_path)
             elif mime_type in ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "application/vnd.ms-excel"]:
-                content = self._process_excel(file_path)
+                return self._process_excel(file_path)
             elif mime_type and ("text" in mime_type or "csv" in mime_type):
-                content = self._process_text(file_path)
+                return self._process_text(file_path)
             else:
                 return f"Unsupported file type for analysis: {mime_type}. Only PDF, Excel, and text/CSV files are supported."
-
-            if mime_type in ["application/pdf", "text/plain", "text/csv"]:
-                add_document_to_vector_store(
-                    content=content,
-                    source=f"file:{os.path.basename(file_path)}",
-                    metadata={"file_path": file_path, "mime_type": mime_type}
-                )
-
-            return content
-
         except Exception as e:
             logger.error(f"File analysis error for '{file_path}': {e}")
             return f"File analysis error: {str(e)}"
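The new branches above return self._process_pdf, self._process_excel and self._process_text directly. _process_text itself is not shown anywhere in this diff; a plausible stand-alone stand-in, assuming the same 8000-character truncation convention as _process_pdf, could look like this:

import logging

logger = logging.getLogger(__name__)

# Hypothetical stand-in for FileAnalysisTool._process_text; not part of this diff.
def process_text(path: str) -> str:
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        content = f.read()
    # Mirror the 8000-character truncation used by _process_pdf
    if len(content) > 8000:
        logger.warning(f"Text content truncated from {len(content)} to 8000 characters for {path}")
        return content[:8000] + "\n... [Content truncated]"
    return content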
@@ -182,6 +102,7 @@ class FileAnalysisTool(Tool):
         loader = PyPDFLoader(path)
         docs = loader.load()
         content = "\n\n".join([doc.page_content for doc in docs])
+        # Truncate to avoid excessive token usage, provide a warning if truncated
         if len(content) > 8000:
             logger.warning(f"PDF content truncated from {len(content)} to 8000 characters for {path}")
             return content[:8000] + "\n... [Content truncated]"
@@ -189,6 +110,7 @@ class FileAnalysisTool(Tool):

     def _process_excel(self, path: str) -> str:
         df = pd.read_excel(path)
+        # Provide a sample of the data and its basic info
         info = BytesIO()
         df.info(buf=info)
         info_str = info.getvalue().decode('utf-8')
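Note that DataFrame.info() writes text to its buffer, so the unchanged BytesIO/decode lines above would typically raise a TypeError at runtime; io.StringIO is the usual buffer for capturing that output. A small sketch of the common pattern (not what the file currently does):

from io import StringIO

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
buf = StringIO()           # text buffer, since DataFrame.info() writes strings
df.info(buf=buf)
info_str = buf.getvalue()  # no decode step needed
print(info_str)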
@@ -205,9 +127,9 @@ class FileAnalysisTool(Tool):
         return content

 class VideoTranscriptionTool(Tool):
-    """Enhanced YouTube transcription with multilingual support and better output
+    """Enhanced YouTube transcription with multilingual support and better output"""
     name = "transcript_video"
-    description = "Fetch YouTube video transcripts with optional timestamps. Supports English, French, Spanish, German.
+    description = "Fetch YouTube video transcripts with optional timestamps. Supports English, French, Spanish, German."
     inputs = {
         "url": {"type": "string", "description": "YouTube URL or ID"},
         "include_timestamps": {"type": "boolean", "description": "Include timestamps? (default: False)"}
@@ -221,29 +143,21 @@ class VideoTranscriptionTool(Tool):
                 return "Invalid YouTube URL or ID format. Please provide a valid YouTube URL or an 11-character video ID."

             logger.info(f"Attempting to transcribe video ID: {video_id}")
-            transcript_list = YouTubeTranscriptApi.get_transcript(
+            transcript = YouTubeTranscriptApi.get_transcript(
                 video_id,
-                languages=['en', 'fr', 'es', 'de']
+                languages=['en', 'fr', 'es', 'de']  # Prioritize common languages
             )

-            if not transcript_list:
+            if not transcript:
                 return f"No transcript found for video ID: {video_id} in supported languages (en, fr, es, de)."

-            full_transcript_text = " ".join(seg['text'] for seg in transcript_list)
-
-            add_document_to_vector_store(
-                content=full_transcript_text,
-                source=f"youtube_video:{video_id}",
-                metadata={"video_url": url}
-            )
-
             if include_timestamps:
                 formatted_transcript = "\n".join(
                     f"[{int(seg['start']//60):02d}:{int(seg['start']%60):02d}] {seg['text']}"
-                    for seg in transcript_list
+                    for seg in transcript
                 )
             else:
-                formatted_transcript = full_transcript_text
+                formatted_transcript = " ".join(seg['text'] for seg in transcript)

             return formatted_transcript
         except Exception as e:
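YouTubeTranscriptApi.get_transcript returns a list of segment dicts with 'text', 'start' and 'duration' keys, which is what the timestamp formatting above relies on. A tiny standalone sketch (the video ID is purely illustrative):

from youtube_transcript_api import YouTubeTranscriptApi

segments = YouTubeTranscriptApi.get_transcript("dQw4w9WgXcQ", languages=['en'])  # illustrative video ID
for seg in segments[:3]:
    minutes, seconds = int(seg['start'] // 60), int(seg['start'] % 60)
    print(f"[{minutes:02d}:{seconds:02d}] {seg['text']}")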
@@ -253,10 +167,10 @@ class VideoTranscriptionTool(Tool):
 class DataAnalysisTool(Tool):
     """Perform data analysis using pandas on structured data (CSV/Excel)"""
     name = "data_analysis"
-    description = "Analyze CSV/Excel data using pandas operations. Supported operations: 'describe', 'groupby:column:aggfunc' (e.g., 'groupby:Category:mean').
+    description = "Analyze CSV/Excel data using pandas operations. Supported operations: 'describe', 'groupby:column:aggfunc' (e.g., 'groupby:Category:mean')."
     inputs = {
         "file_path": {"type": "string", "description": "Path to the local data file (CSV or Excel)"},
-        "operation": {"type": "string", "description": "Pandas operation (e.g., 'describe', 'groupby:column_name:
+        "operation": {"type": "string", "description": "Pandas operation (e.g., 'describe', 'groupby:column_name:mean')"}
     }
     output_type = "string"

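The forward body of DataAnalysisTool falls outside this diff's hunks, so the handling of the 'describe' and 'groupby:column:aggfunc' operation strings is not visible here. A hypothetical dispatcher matching the grammar in the description might look like:

import pandas as pd

def run_operation(df: pd.DataFrame, operation: str) -> str:
    # Hypothetical dispatcher for the operation grammar described above.
    if operation == "describe":
        return df.describe().to_string()
    if operation.startswith("groupby:"):
        _, column, aggfunc = operation.split(":", 2)   # e.g. "groupby:Category:mean"
        return df.groupby(column).agg(aggfunc).to_string()
    return f"Unsupported operation: {operation}"

df = pd.DataFrame({"Category": ["a", "a", "b"], "value": [1, 2, 3]})
print(run_operation(df, "groupby:Category:mean"))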
@@ -295,152 +209,32 @@ class DataAnalysisTool(Tool):
             logger.error(f"Data analysis error for '{file_path}' with operation '{operation}': {e}")
             return f"Data analysis error: {str(e)}. Please check file content and operation."

-class RetrievalTool(Tool):
-    """
-    Retrieves relevant information from the in-memory vector store based on a query.
-    This tool allows the agent to access previously processed documents and transcripts.
-    """
-    name = "retrieve_from_vector_store"
-    description = "Search for relevant information within previously processed documents and transcripts using a semantic query. Returns top K relevant chunks."
-    inputs = {
-        "query": {"type": "string", "description": "The semantic query to search the vector store."},
-        "k": {"type": "integer", "description": "Number of top results to retrieve (default: 3)", "default": 3}
-    }
-    output_type = "string"
-
-    def forward(self, query: str, k: int = 3) -> str:
-        if vectorstore is None or embeddings is None:
-            return "Vector store is not initialized or embeddings are missing. No documents available for retrieval."
-
-        try:
-            logger.info(f"Retrieving {k} chunks from DocArrayInMemorySearch for query: {query}")
-            # Ensure similarity_search uses the vectorstore's internal embedding if initialized correctly
-            # or if it takes an explicit embedding argument here.
-            # With DocArrayInMemorySearch initialized with `embedding=embeddings`, this call should be fine.
-            retrieved_docs = vectorstore.similarity_search(query, k=k)
-
-            if not retrieved_docs:
-                return "No relevant information found in the vector store for this query."
-
-            formatted_results = []
-            for i, doc in enumerate(retrieved_docs):
-                source = doc.metadata.get('source', 'Unknown Source')
-                title = doc.metadata.get('title', 'N/A')
-                chunk_index = doc.metadata.get('chunk_index', 'N/A')
-                formatted_results.append(
-                    f"--- Retrieved Document Chunk {i+1} ---\n"
-                    f"Source: {source} (Chunk: {chunk_index})\n"
-                    f"Title: {title}\n"
-                    f"Content: {doc.page_content}\n"
-                )
-            return "\n\n".join(formatted_results)
-        except Exception as e:
-            logger.error(f"Error retrieving from vector store for query '{query}': {e}")
-            return f"Error retrieving from vector store: {str(e)}"
-
-class ChessAnalysisAPITool(Tool):
-    """
-    Analyzes a chess position provided in FEN format using a remote chess engine API (chess-api.com).
-    """
-    name = "analyze_chess_position_api"
-    description = (
-        "Analyze a chess position provided in FEN (Forsyth-Edwards Notation) format using an online engine. "
-        "Returns the best move in algebraic notation for the current player, along with evaluation."
-        "Note: This tool cannot interpret chess positions directly from images. "
-        "The FEN string must be provided by the user."
-    )
-    inputs = {
-        "fen_string": {"type": "string", "description": "The chess position in FEN format. Example: 'rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1'"},
-        "depth": {"type": "integer", "description": "The analysis depth for the chess engine (higher means better, but slower; max ~18 for this API; default: 15)", "default": 15}
-    }
-    output_type = "string"
-
-    def forward(self, fen_string: str, depth: int = 15) -> str:
-        actual_depth = min(depth, 18)
-
-        try:
-            logger.info(f"Analyzing FEN: {fen_string} at depth {actual_depth} using chess-api.com.")
-
-            response = requests.post(
-                "https://chess-api.com/v1",
-                json={"fen": fen_string, "depth": actual_depth}
-            )
-            response.raise_for_status()
-            data = response.json()
-
-            if data.get("type") == "bestmove":
-                move_san = data.get("san", data.get("move"))
-                evaluation = data.get("eval")
-                mate_in_moves = data.get("mate")
-
-                result = f"Best move: **{move_san}** (UCI: {data.get('move')})\n"
-
-                if mate_in_moves is not None:
-                    player_to_move = "White" if data.get("turn") == 'w' else "Black"
-                    result += f"Forced mate for {player_to_move} in {abs(mate_in_moves)} moves.\n"
-                elif evaluation is not None:
-                    eval_str = ""
-                    if evaluation >= 1000:
-                        eval_str = "Decisive advantage for White"
-                    elif evaluation <= -1000:
-                        eval_str = "Decisive advantage for Black"
-                    elif evaluation > 0:
-                        eval_str = f"White is up by {evaluation} centipawns"
-                    elif evaluation < 0:
-                        eval_str = f"Black is up by {abs(evaluation)} centipawns"
-                    else:
-                        eval_str = "Even position"
-                    result += f"Evaluation: {eval_str} (Depth: {data.get('depth')})\n"
-
-                result += "(Source: chess-api.com - Stockfish 17 NNUE)"
-                return result
-            else:
-                return f"Chess API response: {data.get('text', 'No best move found or error.')}"
-
-        except requests.exceptions.RequestException as e:
-            logger.error(f"Error communicating with remote chess analysis API for FEN '{fen_string}': {e}")
-            return f"Error contacting remote chess analysis API: {str(e)}. Please try again later."
-        except Exception as e:
-            logger.error(f"An unexpected error occurred during remote chess analysis for FEN '{fen_string}': {e}")
-            return f"An unexpected error occurred during chess analysis: {str(e)}"
-
-
 # --- Agent Initialization ---
-class BasicAgent:
+class BasicAgent:
     def __init__(self):
         self.model = HfApiModel(
-            temperature=0.
-            token=
+            temperature=0.1,  # Slightly increased temperature for more creative responses if appropriate
+            token=HF_API_TOKEN,
             max_tokens=2000
         )

         self.tools = self._initialize_tools()
         self.agent = self._create_agent()
-

     def _initialize_tools(self) -> list:
         """Initialize all tools with enhanced capabilities"""
-        base_tools = [
+        return [
             DuckDuckGoSearchTool(),
             WikiSearchTool(),
             VisitWebpageTool(),
-            SpeechToTextTool(),
+            SpeechToTextTool(),  # Might be less relevant for a text-based research agent but kept if needed
             FinalAnswerTool(),
             VideoTranscriptionTool(),
             FileAnalysisTool(),
             DataAnalysisTool(),
-            self._create_excel_download_tool(),
-            self._create_keywords_tool()
-            ChessAnalysisAPITool(),
+            self._create_excel_download_tool(),  # Renamed for clarity
+            self._create_keywords_tool()
         ]
-
-        if vectorstore and embeddings:
-            logger.info("Adding RetrievalTool to the agent's tools.")
-            base_tools.append(RetrievalTool())
-        else:
-            logger.warning("RetrievalTool not added because vector store or embeddings are not initialized.")
-
-        return base_tools

     def _create_excel_download_tool(self):
         """Tool to download and parse Excel files from a specific URL"""
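The removed ChessAnalysisAPITool reduces to a single POST against chess-api.com. A condensed sketch based only on the deleted code (the request and response field names are taken from that code, not independently verified against the API's documentation):

import requests

fen = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1"
resp = requests.post("https://chess-api.com/v1", json={"fen": fen, "depth": 15}, timeout=30)
resp.raise_for_status()
data = resp.json()
if data.get("type") == "bestmove":
    # 'san', 'move' and 'eval' are the fields the removed tool read from the response
    print("Best move:", data.get("san", data.get("move")), "eval:", data.get("eval"))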
@@ -448,28 +242,28 @@ class BasicAgent:
         def download_and_parse_excel(task_id: str) -> dict:
             """
             Downloads an Excel file from a predefined URL using a task_id and parses its content.
-            Returns a dictionary with status and data (first
+            Returns a dictionary with status and data (first 20 rows).
             """
             try:
                 url = f"https://agents-course-unit4-scoring.hf.space/files/{task_id}"
                 logger.info(f"Attempting to download Excel from: {url}")
-                response = requests.get(url, timeout=60)
-                response.raise_for_status()
+                response = requests.get(url, timeout=60)  # Increased timeout for larger files
+                response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx)

                 with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
                     tmp.write(response.content)
                     temp_file_path = tmp.name

                 df = pd.read_excel(temp_file_path)
-                os.unlink(temp_file_path)
+                os.unlink(temp_file_path)  # Clean up the temporary file

                 logger.info(f"Successfully downloaded and parsed Excel for task_id: {task_id}")
                 return {
                     "task_id": task_id,
-                    "data_sample": df.head(10).to_dict(orient="records"),
+                    "data_sample": df.head(10).to_dict(orient="records"),  # Reduced to 10 for conciseness
                     "status": "Success",
-                    "columns": df.columns.tolist(),
-                    "shape": df.shape
+                    "columns": df.columns.tolist(),  # Added column names for context
+                    "shape": df.shape  # Added shape for context
                 }
             except requests.exceptions.RequestException as req_err:
                 logger.error(f"Network or HTTP error downloading Excel for task_id '{task_id}': {req_err}")
@@ -494,6 +288,7 @@ class BasicAgent:
             if not text:
                 return []

+            # Use a more comprehensive list of English stopwords
             stopwords = set([
                 "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it",
                 "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these",
@@ -507,8 +302,8 @@ class BasicAgent:
                 "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"
             ])

-            words = re.findall(r'\b\w+\b', text.lower())
-            filtered = [w for w in words if w not in stopwords and len(w) > 2]
+            words = re.findall(r'\b\w+\b', text.lower())  # Relaxed regex to capture all words
+            filtered = [w for w in words if w not in stopwords and len(w) > 2]  # Filter words less than 3 chars
             counter = Counter(filtered)
             return [word for word, _ in counter.most_common(top_n)]
         return extract_keywords
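As a quick sanity check of the keyword logic above (lowercase, drop stopwords and words of two characters or fewer, then count), here is a minimal reproduction outside the tool, with a deliberately tiny stopword set:

import re
from collections import Counter

stopwords = {"the", "on", "with"}  # tiny illustrative subset of the real list
text = "The cat sat on the mat with the cat"
words = re.findall(r'\b\w+\b', text.lower())
filtered = [w for w in words if w not in stopwords and len(w) > 2]
print([word for word, _ in Counter(filtered).most_common(3)])  # ['cat', 'sat', 'mat']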
@@ -523,14 +318,7 @@ You are an advanced, helpful, and highly analytical research assistant. Your goa
 1. **Understand the User's Need:** Carefully analyze the user's question, including any attached files or specific requests (e.g., "summarize," "analyze data," "find facts").
 2. **Formulate a Detailed Plan:** Before acting, create a clear, step-by-step plan. This plan should outline:
     * What information needs to be gathered.
-    * Which tools are most appropriate for each step.
-        * Use `retrieve_from_vector_store` first if the query seems to be related to previously processed information (e.g., "What did we learn about X from the uploaded document?").
-        * Use `duckduckgo_search` for general web search.
-        * Use `wiki_search` for encyclopedic facts.
-        * Use `transcript_video` for YouTube video content.
-        * Use `file_analysis` to inspect content of local files.
-        * Use `data_analysis` for structured analysis of CSV/Excel files.
-        * Use `analyze_chess_position_api` if the user provides a FEN string for a chess position and asks for the best move.
+    * Which tools are most appropriate for each step (e.g., `duckduckgo_search` for general web search, `wiki_search` for encyclopedic facts, `transcript_video` for YouTube, `file_analysis` or `data_analysis` for local files).
     * How you will combine information from different sources.
     * How you will verify or synthesize the findings.
 3. **Execute the Plan Using Tools:** Call the necessary tools, providing clear and correct arguments. If a tool fails, try to understand why and adapt your plan (e.g., try a different search query or tool).
@@ -541,64 +329,31 @@ You are an advanced, helpful, and highly analytical research assistant. Your goa
     * If the answer is a single number, provide only the number.
     * If the answer is a list, provide comma-separated values.
     * For complex answers, use structured formats like bullet points or JSON where appropriate to enhance readability.
-
+    #* **Crucially, always include sources or references (e.g., URLs, Wikipedia titles, file names) where you obtained the information.** This builds trust and allows for verification.
     * If you used `file_analysis` or `data_analysis` tools on an uploaded file, explicitly state that you analyzed the provided file.

 **Important Considerations:**
-    * **Prioritize:** If the query involves a specific file, start by analyzing that file if appropriate.
+    * **Prioritize:** If the query involves a specific file, start by analyzing that file if appropriate.
+    * **Ambiguity:** If the question is ambiguous, ask for clarification.
     * **Limitations:** If you cannot answer a question with the available tools, state that clearly.
-    * **Conciseness:** Be as concise as possible while providing
+    * **Conciseness:** Be as concise as possible while providing a complete and accurate answer.
 """
         agent = CodeAgent(
             model=self.model,
             tools=self.tools,
-            add_base_tools=True
-            max_steps=15  # <--- Added this to limit agent's internal reasoning/tool-use steps
+            add_base_tools=True
         )
         agent.prompt_templates["system_prompt"] = system_prompt
         return agent

-
     def __call__(self, question: str) -> str:
-        logger.info(f"Received question: {question[:200]}...")
-        print(f"Agent received question (first 50 chars): {question[:50]}...")
-
+        logger.info(f"Received question: {question[:200]}...")  # Log more of the question
         try:
-
-            # Re-initialize vectorstore for a new session without arguments
-            # This relies on the add_documents and similarity_search methods getting the embedding
-            if embeddings:
-                vectorstore = DocArrayInMemorySearch()  # <--- REVERTED TO THIS SIMPLE INIT HERE TOO
-                logger.info("DocArrayInMemorySearch re-initialized for new session.")
-            else:
-                logger.warning("Embeddings not initialized, cannot re-initialize DocArrayInMemorySearch.")
-                return "Error: Embedding model not loaded, cannot process request."
-
-
-
-            # --- Implement a timeout for the agent's run method ---
-            AGENT_TIMEOUT_SECONDS = 120  # Max time in seconds for the agent to respond
-
-            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
-                future = executor.submit(self.agent.run, question)
-                try:
-                    response = future.result(timeout=AGENT_TIMEOUT_SECONDS)
-                except concurrent.futures.TimeoutError:
-                    logger.warning(f"Agent execution timed out after {AGENT_TIMEOUT_SECONDS} seconds for question: {question[:100]}...")
-                    future.cancel()  # Cancel the future if it's still running
-                    return "Error: The agent took too long to respond and timed out. Please try again with a simpler query or check the input."
-                except Exception as e:
-                    # Catch any other exceptions that might occur during agent.run
-                    logger.error(f"Agent execution failed during run for question '{question[:100]}': {str(e)}", exc_info=True)
-                    return f"Error processing your request: {str(e)}. Please try again or rephrase your question."
-
+            response = self.agent.run(question)
             logger.info(f"Response generated successfully for question: {question[:200]}")
-            # print statement for immediate console feedback of the final answer
-            print(f"Agent returning answer: {response}")
             return response
         except Exception as e:
-
-            logger.error(f"Agent setup or execution failed (outer catch) for question '{question[:100]}': {str(e)}", exc_info=True)
+            logger.error(f"Agent execution failed for question '{question[:100]}': {str(e)}", exc_info=True)  # Log full traceback
             return f"Error processing your request: {str(e)}. Please try again or rephrase your question."


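The deleted __call__ body wrapped self.agent.run in a ThreadPoolExecutor to enforce a hard timeout, which the simplified version drops. The pattern, reduced to a generic helper (the 120-second limit is carried over from the removed code):

import concurrent.futures

AGENT_TIMEOUT_SECONDS = 120  # limit carried over from the removed code

def run_with_timeout(fn, *args, timeout: int = AGENT_TIMEOUT_SECONDS):
    """Run fn(*args) in a worker thread and report a timeout after `timeout` seconds."""
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        future = executor.submit(fn, *args)
        try:
            return future.result(timeout=timeout)
        except concurrent.futures.TimeoutError:
            future.cancel()  # best effort; a thread that is already running cannot be interrupted
            return "Error: The agent took too long to respond and timed out."

# e.g. response = run_with_timeout(self.agent.run, question)  # inside BasicAgent.__call__

One caveat the removed code shared: exiting the with-block waits for the worker thread to finish, so a runaway agent.run still blocks the caller even after the timeout fires; the timeout only controls when the error message is produced.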