File size: 3,736 Bytes
7acb2e7
 
 
 
 
bbc85bc
7acb2e7
 
 
 
 
 
 
 
 
 
 
 
5675d05
 
7acb2e7
 
 
 
 
 
 
 
 
 
5675d05
7acb2e7
5675d05
7acb2e7
 
 
5675d05
 
7acb2e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# smolagents tools to fetch HTML content from a URL and to search the web
from smolagents import tool
import requests
from markdownify import markdownify as md
from bs4 import BeautifulSoup
from common.mylogger import save_file_with_timestamp, mylog

@tool
def fetch_webpage(url: str, convert_to_markdown: bool = True) -> str:
    """
    Fetches the content of a given URL.

    If markdown conversion is enabled, script and style tags are removed and
    the text content is returned as Markdown; otherwise the raw unfiltered
    HTML is returned.

    Args:

        url (str): The URL to fetch.

        convert_to_markdown (bool): If True, convert the HTML content to Markdown format. else return the raw HTML.

    Returns:

        str: The content of the URL (Markdown or raw HTML).

    """
    response = requests.get(url, timeout=30)
    if convert_to_markdown:
        soup = BeautifulSoup(response.text, "html.parser")
        # remove script and style tags
        for tag in soup(["script", "style"]):
            tag.extract()

        # for wikipedia only keep the main content (falls back to whole page)
        root = soup
        if "wikipedia.org" in url:
            main_content = soup.find("main", {"id": "content"})
            if main_content:
                root = main_content

        # BUG FIX: the original only assigned `content` inside the wikipedia
        # branch, so any other URL returned None when converting to markdown.
        content = md(str(root), strip=['script', 'style'], heading_style="ATX").strip()
    else:
        content = response.text

    save_file_with_timestamp(content, "webpage", ".md" if convert_to_markdown else ".html")

    return content

@tool
# this tool allows web search on a local SearXNG instance
def search_web(query: str, num_results: int = 5) -> list:
    """

    Perform a web search using local SearXNG instance.

    Args:

        query (str): The search query.

        num_results (int): The number of results to return.

    Returns:

        list: A list of search results sorted by score with {url, title, content, score} for each result.

    """
    # local metasearch engine searxng, running on localhost:8888
    searxng_url = "http://localhost:8888/search"
    params = {"q": query, "format": 'json'}
    # timeout added so a down or hung SearXNG instance cannot block forever
    # (consistent with fetch_webpage)
    response = requests.get(searxng_url, params=params, timeout=30)
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return []

    # keep only the 'results' array, truncated to num_results
    results = response.json().get("results", [])[:num_results]
    # for each result keep only the url, title, content and score
    return [
        {
            "url": result.get("url"),
            "title": result.get("title"),
            "content": result.get("content"),
            "score": result.get("score"),
        }
        for result in results
    ]

if __name__ == "__main__":

    # Smoke-test the web search tool.
    try:
        print(search_web("What is the capital of France?", 3))
    except Exception as e:
        print(f"An error occurred: {e}")

    # Smoke-test the webpage fetcher on a Wikipedia article.
    try:
        video_id = "L1vXCYZAYYM"  # Replace with your YouTube video ID
        video_url = "https://www.youtube.com/watch?v=" + video_id
        url = "https://en.wikipedia.org/wiki/Malko_Competition"
        # page_content = fetch_webpage(video_url)
        page_content = fetch_webpage(url, convert_to_markdown=True)
        print(page_content.encode("utf-8"))
    except Exception as e:
        print(f"An error occurred: {e}")