File size: 1,938 Bytes
396f5a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
from smolagents import tool

@tool
def fetch_webpage(url: str, convert_to_markdown: bool = True) -> str:
    """
    Visits a website and fetches the content of a given URL / webpage.
    If markdown conversion is enabled, script and style elements are removed and the remaining content is returned as Markdown; otherwise the raw, unfiltered HTML is returned.
    Args:
        url (str): The URL to fetch.
        convert_to_markdown (bool): If True, convert the HTML content to Markdown format. Otherwise return the raw HTML.
    Returns:
        str: The page content as Markdown when convert_to_markdown is True, otherwise the raw HTML text of the response.
    """
    # Imports are kept function-local, as in the original
    # (presumably so the tool stays self-contained for smolagents - confirm).
    import requests
    from bs4 import BeautifulSoup
    from markdownify import markdownify as md

    # Identify as a desktop Chrome browser via the User-Agent header.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url, timeout=30, headers=headers)
    # NOTE: the HTTP status code is not checked here; the body of an error
    # response is processed exactly like any other page.

    if not convert_to_markdown:
        return response.text

    soup = BeautifulSoup(response.text, "html.parser")
    # Remove <script> and <style> elements together with their contents so
    # that no JavaScript/CSS text ends up in the Markdown output.
    for element in soup(["script", "style"]):
        element.extract()

    # By default convert the whole (cleaned) document. For Wikipedia, narrow
    # the conversion to the main content container when the page has one.
    root = soup
    if "wikipedia.org" in url:
        main_content = soup.find("main", {"id": "content"})
        if main_content is not None:
            root = main_content

    # Always convert the cleaned tree (never response.text), so the
    # script/style elements removed above stay removed on every path.
    return md(str(root), strip=['script', 'style'], heading_style="ATX").strip()