Spaces:
Sleeping
Sleeping
from smolagents import Tool, tool | |
from youtube_transcript_api import YouTubeTranscriptApi | |
def fetch_webpage(url: str, convert_to_markdown: bool = True) -> str:
    """
    Visit a website / url and fetch the content of the webpage.
    If markdown conversion is enabled, script and style elements are removed and the page is returned as Markdown; otherwise the raw, unfiltered HTML is returned.

    Args:
        url (str): The URL to fetch.
        convert_to_markdown (bool): If True, convert the HTML content to Markdown format. else return the raw HTML.
    Returns:
        str: The page content as Markdown, or the raw HTML when convert_to_markdown is False.
    """
    import requests
    from bs4 import BeautifulSoup
    from markdownify import markdownify as md

    # Send a desktop-browser User-Agent string with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url, timeout=30, headers=headers)

    if not convert_to_markdown:
        return response.text

    soup = BeautifulSoup(response.text, "html.parser")
    # Remove script and style elements from the parsed tree before conversion.
    for element in soup(["script", "style"]):
        element.extract()

    # Default to converting the whole cleaned document. Always convert from the
    # cleaned tree (never from response.text) so the elements removed above
    # stay removed, including on Wikipedia pages without a main container.
    source = soup
    if "wikipedia.org" in url:
        # For Wikipedia only keep the main content when it can be located.
        main_content = soup.find("main", {"id": "content"})
        if main_content:
            source = main_content

    return md(str(source), strip=['script', 'style'], heading_style="ATX").strip()
def read_file_tool(file_path: str) -> str:
    """
    Tool to read a text file and return its content.

    The file is decoded as UTF-8 regardless of the platform's locale
    settings, so the same file yields the same text on every machine.
    Any failure (missing file, permission problem, undecodable bytes, ...)
    is reported as a message string instead of being raised.

    Args:
        file_path (str): Path to the file to read.
    Returns:
        str: Content of the file or error message.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()
    except Exception as e:
        # Deliberately broad: the caller always gets a string back,
        # never an exception.
        return f"Error reading file: {e}"
def get_youtube_transcript(video_id: str) -> str:
    """
    Retrieve the transcript text of a YouTube video identified by its video ID.

    Args:
        video_id (str): The ID of the YouTube video. Pass in the video ID, NOT the video URL. For a video with the URL https://www.youtube.com/watch?v=12345 the ID is 12345.
    Returns:
        str: The transcript as one string, with the transcript lines joined by newline characters.
    """
    # Fetch the transcript and convert it to a list of plain entries.
    entries = YouTubeTranscriptApi().fetch(video_id).to_raw_data()
    # Each entry looks like {'text': 'Hey there', 'start': 0.0, 'duration': 1.54};
    # only the 'text' value of each entry is kept, one per output line.
    spoken_lines = (entry['text'] for entry in entries)
    return "\n".join(spoken_lines)