import json

from langchain_community.document_loaders import ArxivLoader, WikipediaLoader
from markitdown import MarkItDown
from smolagents import tool

# Shared MarkItDown converter used by the file/URL tools below;
# enable_plugins=True lets optional third-party converter plugins register.
md = MarkItDown(enable_plugins=True)


@tool
def arxiv_search(query: str) -> str:
    """Search arXiv for a query and return a maximum of 3 results.

    Only the first 1000 characters of each document are included.

    Args:
        query: The search query.
    """
    search_docs = ArxivLoader(query=query, load_max_docs=3).load()
    formatted_search_docs = "\n\n---\n\n".join(
        [
            f'<Document source="{doc.metadata["source"]}" page="{doc.metadata.get("page", "")}">\n{doc.page_content[:1000]}\n</Document>'
            for doc in search_docs
        ]
    )
    return formatted_search_docs


@tool
def read_excel_content_to_markdown_content(file_location: str) -> str:
    """Read the content of an Excel file and convert it to markdown content.

    Args:
        file_location: The path to the Excel file.
    """
    result = md.convert(file_location)
    return result.text_content


@tool
def read_pdf_content_to_markdown(file_location: str) -> str:
    """Read the content of a PDF file and convert it to markdown.

    Args:
        file_location: The path to the PDF file.
    """
    result = md.convert(file_location)
    return result.text_content


@tool
def get_audio_transcription(file_path: str) -> str:
    """Get the transcription of an audio file from its file path.

    Args:
        file_path: The path of the audio file.
    """
    result = md.convert(file_path)
    return result.text_content


@tool
def get_python_file_content(file_name: str) -> str:
    """Return the content of the named Python file.

    Args:
        file_name: The name (or path) of the file.
    """
    with open(file_name, "r") as f:
        return f.read()


@tool
def visit_webpage_to_markdown(url: str) -> str:
    """Visit a web page and return its content in markdown format.

    Args:
        url: The URL of the web page.
    """
    result = md.convert(url)
    return result.text_content


@tool
def extract_markdown_tables_from_markdown_content(markdown_content: str) -> str:
    """Extract the markdown tables from a markdown string and return them as structured JSON.

    Args:
        markdown_content: The markdown string containing the table(s).
    """
    # Imported lazily so the dependency is only required when this tool is used.
    from mrkdwn_analysis import MarkdownAnalyzer

    analyzer = MarkdownAnalyzer.from_string(markdown_content)
    analyzer.analyse()
    return json.dumps(analyzer.identify_tables())


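# Example (illustrative): chaining the Excel reader with the table extractor.
# "report.xlsx" is a hypothetical file name used purely for illustration.
#
#   table_markdown = read_excel_content_to_markdown_content("report.xlsx")
#   print(extract_markdown_tables_from_markdown_content(table_markdown))

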
@tool
def wiki_search(query: str) -> str:
    """Search Wikipedia for a query and return a maximum of 2 results.

    Args:
        query: The search query.
    """
    search_docs = WikipediaLoader(query=query, load_max_docs=2).load()
    formatted_search_docs = "\n\n---\n\n".join(
        [
            f'<Document source="{doc.metadata["source"]}" page="{doc.metadata.get("page", "")}">\n{doc.page_content}\n</Document>'
            for doc in search_docs
        ]
    )
    return formatted_search_docs
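
# --- Usage sketch (illustrative, not part of the original tool set) ---
# A minimal example of wiring these tools into a smolagents CodeAgent.
# InferenceClientModel is an assumption about the model backend (older
# smolagents releases expose the same backend as HfApiModel); swap in
# whatever model your project actually uses.
if __name__ == "__main__":
    from smolagents import CodeAgent, InferenceClientModel  # assumed backend

    agent = CodeAgent(
        tools=[
            arxiv_search,
            wiki_search,
            visit_webpage_to_markdown,
            read_excel_content_to_markdown_content,
            read_pdf_content_to_markdown,
            get_audio_transcription,
            get_python_file_content,
            extract_markdown_tables_from_markdown_content,
        ],
        model=InferenceClientModel(),
    )
    print(agent.run("Summarize a recent arXiv paper about LLM agents."))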