from bs4 import BeautifulSoup
import os
from ..utils import get_relevant_images, extract_title

class TavilyExtract:

    def __init__(self, link, session=None):
        self.link = link
        self.session = session
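        # Imported here rather than at module level, so the `tavily` package is
        # only required when this scraper is actually instantiated.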
        from tavily import TavilyClient
        self.tavily_client = TavilyClient(api_key=self.get_api_key())

    def get_api_key(self) -> str:
        """

        Gets the Tavily API key

        Returns:

        Api key (str)

        """
        try:
            api_key = os.environ["TAVILY_API_KEY"]
        except KeyError:
            raise Exception(
                "Tavily API key not found. Please set the TAVILY_API_KEY environment variable.")
        return api_key

    def scrape(self) -> tuple:
        """

        This function extracts content from a specified link using the Tavily Python SDK, the title and

        images from the link are extracted using the functions from `gpt_researcher/scraper/utils.py`.



        Returns:

          The `scrape` method returns a tuple containing the extracted content, a list of image URLs, and

        the title of the webpage specified by the `self.link` attribute. It uses the Tavily Python SDK to

        extract and clean content from the webpage. If any exception occurs during the process, an error

        message is printed and an empty result is returned.

        """

        try:
            response = self.tavily_client.extract(urls=self.link)
            if response['failed_results']:
                return "", [], ""

            # Parse the HTML content of the response to create a BeautifulSoup object for the utility functions
            response_bs = self.session.get(self.link, timeout=4)
            soup = BeautifulSoup(
                response_bs.content, "lxml", from_encoding=response_bs.encoding
            )

            # Since only a single link is provided to tavily_client, the results will contain only one entry.
            content = response['results'][0]['raw_content']

            # Get relevant images using the utility function
            image_urls = get_relevant_images(soup, self.link)

            # Extract the title using the utility function
            title = extract_title(soup)

            return content, image_urls, title

        except Exception as e:
            print("Error! : " + str(e))
            return "", [], ""