from smolagents import tool
from youtube_transcript_api import YouTubeTranscriptApi

@tool
def fetch_webpage(url: str, convert_to_markdown: bool = True) -> str:
    """
    Visit a website / URL and fetch the content of the webpage.
    If Markdown conversion is enabled, script and style tags are removed and the text content
    is returned as Markdown; otherwise the raw, unfiltered HTML is returned.

    Args:
        url (str): The URL to fetch.
        convert_to_markdown (bool): If True, convert the HTML content to Markdown format; otherwise return the raw HTML.

    Returns:
        str: The content of the webpage, as Markdown or as raw HTML.
    """
    import requests
    from bs4 import BeautifulSoup
    from markdownify import markdownify as md

    content = None
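    # Use a browser-like User-Agent header, since some sites block the default requests User-Agent.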
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url, timeout=30, headers=headers)
    
    if convert_to_markdown:
        soup = BeautifulSoup(response.text, "html.parser")

        # Remove script and style tags
        for tag in soup(["script", "style"]):
            tag.extract()

        # For Wikipedia, keep only the main content
        if "wikipedia.org" in url:
            main_content = soup.find("main", {"id": "content"})
            if main_content:
                content = md(str(main_content), strip=['script', 'style'], heading_style="ATX").strip()
            else:
                content = md(response.text, strip=['script', 'style'], heading_style="ATX").strip()
        else:
            # Fallback for all other sites: convert the whole cleaned document (not extensively tested)
            content = md(str(soup), strip=['script', 'style'], heading_style="ATX").strip()
    else:
        content = response.text

    return content


@tool
def read_file_tool(file_path: str) -> str:
    """
    Tool to read a file and return its content.

    Args:
        file_path (str): Path to the file to read.

    Returns:
        str: Content of the file or error message.
    """
    try:
        with open(file_path, "r") as file:
            return file.read()
    except Exception as e:
        return f"Error reading file: {str(e)}"
    

@tool
def get_youtube_transcript(video_id: str) -> str:
    """
    Fetches the transcript of a YouTube video given its video ID.

    Args:
        video_id (str): The ID of the YouTube video. Pass in the video ID, NOT the video URL. For a video with the URL https://www.youtube.com/watch?v=12345 the ID is 12345.

    Returns:
        str: The transcript of the YouTube video, as a single string with each line separated by a newline character.
    """
    # Initialize the YouTubeTranscriptApi
    ytt_api = YouTubeTranscriptApi()
    fetched_transcript = ytt_api.fetch(video_id)
    raw_data = fetched_transcript.to_raw_data()
    # The raw data is a list of dicts such as
    # [{'text': 'Hey there', 'start': 0.0, 'duration': 1.54},
    #  {'text': 'how are you', 'start': 1.54, 'duration': 4.16}, ...]
    # We return only the 'text' field of each entry, one per line.
    transcript = "\n".join([item['text'] for item in raw_data])
    return transcript
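

# --- Usage sketch (illustrative addition, not part of the original tools) ---
# Tools defined with smolagents' @tool decorator are callable objects, so they can be
# smoke-tested directly before handing them to an agent. The agent wiring below is left
# commented out because the model class (e.g. InferenceClientModel) and its configuration
# depend on the installed smolagents version and on available credentials.
if __name__ == "__main__":
    # Call a tool directly, like a plain function.
    page = fetch_webpage(url="https://en.wikipedia.org/wiki/Python_(programming_language)")
    print(page[:500])

    # Replace "VIDEO_ID" with a real YouTube video ID before running.
    # print(get_youtube_transcript(video_id="VIDEO_ID"))

    # Optional: register the tools with a smolagents agent.
    # from smolagents import CodeAgent, InferenceClientModel
    # agent = CodeAgent(
    #     tools=[fetch_webpage, read_file_tool, get_youtube_transcript],
    #     model=InferenceClientModel(),
    # )
    # agent.run("Summarize the Wikipedia article on Python (programming language).")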