Spaces:
Sleeping
Sleeping
Arxiv
Browse files- requirements.txt +11 -14
- tools.py +33 -55
requirements.txt
CHANGED
@@ -1,19 +1,16 @@
|
|
|
|
|
|
1 |
gradio
|
2 |
-
requests
|
3 |
-
pillow
|
4 |
-
pytesseract
|
5 |
-
langgraph
|
6 |
langchain
|
7 |
-
openai
|
8 |
-
pandas
|
9 |
-
langchain_openai
|
10 |
langchain_community
|
|
|
|
|
11 |
openai
|
12 |
-
duckduckgo-search
|
13 |
-
regex
|
14 |
-
pytesseract
|
15 |
openpyxl
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
|
|
|
|
|
|
|
1 |
+
arxiv
|
2 |
+
duckduckgo-search
|
3 |
gradio
|
|
|
|
|
|
|
|
|
4 |
langchain
|
|
|
|
|
|
|
5 |
langchain_community
|
6 |
+
langchain_openai
|
7 |
+
langgraph
|
8 |
openai
|
|
|
|
|
|
|
9 |
openpyxl
|
10 |
+
pandas
|
11 |
+
pillow
|
12 |
+
PyMuPDF
|
13 |
+
pytesseract
|
14 |
+
regex
|
15 |
+
requests
|
16 |
+
wikipedia
|
tools.py
CHANGED
@@ -10,6 +10,9 @@ import os
|
|
10 |
from duckduckgo_search import DDGS
|
11 |
from langchain_core.tools import tool
|
12 |
from langchain_community.document_loaders import WikipediaLoader, ArxivLoader
|
|
|
|
|
|
|
13 |
|
14 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
15 |
|
@@ -272,7 +275,7 @@ def wikipedia_search_tool(wiki_query: str) -> str:
|
|
272 |
return error_msg
|
273 |
|
274 |
@tool
|
275 |
-
def arxiv_search_tool(
|
276 |
"""
|
277 |
TOOL NAME: ArXiv Academic Search Tool
|
278 |
|
@@ -285,62 +288,37 @@ def arxiv_search_tool(arxiv_query: str) -> str:
|
|
285 |
- "What are recent studies on climate change?"
|
286 |
- "Search for papers on quantum computing"
|
287 |
"""
|
288 |
-
print(f"DEBUG: reached arxiv_search_tool with query: {arxiv_query}")
|
289 |
try:
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
-
|
311 |
-
|
312 |
-
|
313 |
-
|
314 |
-
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
-
print(f"DEBUG: Using title: {title}")
|
319 |
-
|
320 |
-
# Trim content to key information only (reduced from 2000 to 800 characters)
|
321 |
-
content = doc.page_content[:800] if len(doc.page_content) > 800 else doc.page_content
|
322 |
-
|
323 |
-
# Add document but keep it concise
|
324 |
-
result += f"\n\nArXiv Result {counter}: {title}\nAbstract/Summary: {content}..."
|
325 |
-
counter += 1
|
326 |
-
|
327 |
-
# Stop after 2 documents to keep response manageable
|
328 |
-
if counter > 2:
|
329 |
-
break
|
330 |
-
|
331 |
-
if not result.strip():
|
332 |
-
return "No ArXiv results found for the given query. [END_OF_SEARCH]"
|
333 |
-
|
334 |
-
# Add clear end marker
|
335 |
-
result += "\n\n[END_OF_SEARCH] - ArXiv search complete. Use this information to answer the question."
|
336 |
-
|
337 |
-
print(f"DEBUG: Final ArXiv result length: {len(result)}")
|
338 |
-
return result
|
339 |
-
|
340 |
except Exception as e:
|
341 |
-
|
342 |
-
print(f"DEBUG: {error_msg}")
|
343 |
-
return error_msg
|
344 |
|
345 |
|
346 |
from langchain_openai import ChatOpenAI
|
|
|
10 |
from duckduckgo_search import DDGS
|
11 |
from langchain_core.tools import tool
|
12 |
from langchain_community.document_loaders import WikipediaLoader, ArxivLoader
|
13 |
+
import arxiv
|
14 |
+
import fitz # PyMuPDF
|
15 |
+
import tempfile
|
16 |
|
17 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
18 |
|
|
|
275 |
return error_msg
|
276 |
|
277 |
@tool
def arxiv_search_tool(query: str) -> str:
    """
    TOOL NAME: ArXiv Academic Search Tool

    Search arXiv for the single most relevant paper matching *query*,
    download its PDF, extract the text with PyMuPDF, and return a
    trimmed plain-text summary.

    Example queries:
    - "What are recent studies on climate change?"
    - "Search for papers on quantum computing"

    Args:
        query: Free-text topic or question to search arXiv for.

    Returns:
        A string containing the paper title and up to 3000 characters of
        extracted text, always terminated with the "[END_OF_SEARCH]"
        marker so the calling agent knows the tool output is complete.
        On any failure, an error string (also with the marker) is
        returned instead of raising — this is a tool boundary.
    """
    try:
        # Search arXiv for the top result. arxiv.Client() is the supported
        # entry point; Search.results() is deprecated in recent releases
        # of the arxiv package.
        search = arxiv.Search(query=query, max_results=1, sort_by=arxiv.SortCriterion.Relevance)
        result = next(arxiv.Client().results(search), None)

        if not result:
            return "No results found. [END_OF_SEARCH]"

        # Download the PDF; bounded wait so a stalled server cannot hang
        # the agent indefinitely.
        response = requests.get(result.pdf_url, timeout=30)
        response.raise_for_status()

        # Open the PDF directly from memory. This avoids the original
        # temp-file round trip, which (a) leaked the fitz Document and
        # (b) breaks on Windows, where a NamedTemporaryFile that is still
        # open cannot be reopened by another reader. The context manager
        # guarantees the document is closed.
        with fitz.open(stream=response.content, filetype="pdf") as doc:
            # join() runs at C speed; `text += page.get_text()` in a loop
            # is quadratic in the worst case.
            text = "".join(page.get_text() for page in doc)

        # Collapse all whitespace runs, then trim to a manageable size.
        text = " ".join(text.split())
        summary = text[:3000] + "..." if len(text) > 3000 else text

        return f"Title: {result.title}\n\nSummary:\n{summary}\n\n[END_OF_SEARCH]"

    except Exception as e:
        # Report the failure to the agent rather than raising.
        return f"Error fetching arXiv content: {e} [END_OF_SEARCH]"
|
|
|
|
322 |
|
323 |
|
324 |
from langchain_openai import ChatOpenAI
|