naman1102 committed on
Commit
1ae0aa0
·
1 Parent(s): ebf7d5c
Files changed (2) hide show
  1. requirements.txt +11 -14
  2. tools.py +33 -55
requirements.txt CHANGED
@@ -1,19 +1,16 @@
 
 
1
  gradio
2
- requests
3
- pillow
4
- pytesseract
5
- langgraph
6
  langchain
7
- openai
8
- pandas
9
- langchain_openai
10
  langchain_community
 
 
11
  openai
12
- duckduckgo-search
13
- regex
14
- pytesseract
15
  openpyxl
16
- duckduckgo-search
17
- wikipedia
18
- arxiv
19
- PyMuPDF
 
 
 
 
1
+ arxiv
2
+ duckduckgo-search
3
  gradio
 
 
 
 
4
  langchain
 
 
 
5
  langchain_community
6
+ langchain_openai
7
+ langgraph
8
  openai
 
 
 
9
  openpyxl
10
+ pandas
11
+ pillow
12
+ PyMuPDF
13
+ pytesseract
14
+ regex
15
+ requests
16
+ wikipedia
tools.py CHANGED
@@ -10,6 +10,9 @@ import os
10
  from duckduckgo_search import DDGS
11
  from langchain_core.tools import tool
12
  from langchain_community.document_loaders import WikipediaLoader, ArxivLoader
 
 
 
13
 
14
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
15
 
@@ -272,7 +275,7 @@ def wikipedia_search_tool(wiki_query: str) -> str:
272
  return error_msg
273
 
274
  @tool
275
- def arxiv_search_tool(arxiv_query: str) -> str:
276
  """
277
  TOOL NAME: ArXiv Academic Search Tool
278
 
@@ -285,62 +288,37 @@ def arxiv_search_tool(arxiv_query: str) -> str:
285
  - "What are recent studies on climate change?"
286
  - "Search for papers on quantum computing"
287
  """
288
- print(f"DEBUG: reached arxiv_search_tool with query: {arxiv_query}")
289
  try:
290
- docs = ArxivLoader(query=arxiv_query, load_max_docs=3).load() # Reduced from 5 to 3
291
- print(f"DEBUG: ArxivLoader returned {len(docs)} documents")
292
-
293
- result = ""
294
- counter = 1
295
- for doc in docs:
296
- print(f"DEBUG: Processing document {counter}")
297
- print(f"DEBUG: Document metadata: {doc.metadata}")
298
- print(f"DEBUG: Document content length: {len(doc.page_content)}")
299
-
300
- # Handle different metadata structures
301
- title = "Unknown Title"
302
- if hasattr(doc, 'metadata') and doc.metadata:
303
- # Try different possible title keys
304
- if 'title' in doc.metadata:
305
- title = doc.metadata['title']
306
- elif 'Title' in doc.metadata:
307
- title = doc.metadata['Title']
308
- elif 'entry_id' in doc.metadata:
309
- title = doc.metadata['entry_id']
310
- elif 'summary' in doc.metadata:
311
- title = f"ArXiv Paper {counter}"
312
- else:
313
- # Use first available key as title
314
- if doc.metadata:
315
- first_key = list(doc.metadata.keys())[0]
316
- title = f"{first_key}: {doc.metadata[first_key]}"
317
-
318
- print(f"DEBUG: Using title: {title}")
319
-
320
- # Trim content to key information only (reduced from 2000 to 800 characters)
321
- content = doc.page_content[:800] if len(doc.page_content) > 800 else doc.page_content
322
-
323
- # Add document but keep it concise
324
- result += f"\n\nArXiv Result {counter}: {title}\nAbstract/Summary: {content}..."
325
- counter += 1
326
-
327
- # Stop after 2 documents to keep response manageable
328
- if counter > 2:
329
- break
330
-
331
- if not result.strip():
332
- return "No ArXiv results found for the given query. [END_OF_SEARCH]"
333
-
334
- # Add clear end marker
335
- result += "\n\n[END_OF_SEARCH] - ArXiv search complete. Use this information to answer the question."
336
-
337
- print(f"DEBUG: Final ArXiv result length: {len(result)}")
338
- return result
339
-
340
  except Exception as e:
341
- error_msg = f"Error during Arxiv search: {str(e)} [END_OF_SEARCH]"
342
- print(f"DEBUG: {error_msg}")
343
- return error_msg
344
 
345
 
346
  from langchain_openai import ChatOpenAI
 
10
  from duckduckgo_search import DDGS
11
  from langchain_core.tools import tool
12
  from langchain_community.document_loaders import WikipediaLoader, ArxivLoader
13
+ import arxiv
14
+ import fitz # PyMuPDF
15
+ import tempfile
16
 
17
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
18
 
 
275
  return error_msg
276
 
277
  @tool
278
+ def arxiv_search_tool(query: str) -> str:
279
  """
280
  TOOL NAME: ArXiv Academic Search Tool
281
 
 
288
  - "What are recent studies on climate change?"
289
  - "Search for papers on quantum computing"
290
  """
 
291
  try:
292
+ # Search arXiv for the top result
293
+ search = arxiv.Search(query=query, max_results=1, sort_by=arxiv.SortCriterion.Relevance)
294
+ result = next(search.results(), None)
295
+
296
+ if not result:
297
+ return "No results found. [END_OF_SEARCH]"
298
+
299
+ # Download PDF
300
+ pdf_url = result.pdf_url
301
+ response = requests.get(pdf_url)
302
+ response.raise_for_status()
303
+
304
+ # Save and open PDF
305
+ with tempfile.NamedTemporaryFile(suffix=".pdf", delete=True) as tmp:
306
+ tmp.write(response.content)
307
+ tmp.flush()
308
+
309
+ doc = fitz.open(tmp.name)
310
+ text = ""
311
+ for page in doc:
312
+ text += page.get_text()
313
+
314
+ # Clean and trim text
315
+ text = " ".join(text.split())
316
+ summary = text[:3000] + "..." if len(text) > 3000 else text
317
+
318
+ return f"Title: {result.title}\n\nSummary:\n{summary}\n\n[END_OF_SEARCH]"
319
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
  except Exception as e:
321
+ return f"Error fetching arXiv content: {e} [END_OF_SEARCH]"
 
 
322
 
323
 
324
  from langchain_openai import ChatOpenAI