#
# SPDX-FileCopyrightText: Hadad <[email protected]>
# SPDX-License-Identifier: Apache-2.0
#

import aiohttp  # Import the aiohttp library to perform asynchronous HTTP requests
import asyncio  # Import asyncio library to handle asynchronous operations and implement delay mechanisms
from src.utils.ip_generator import generate_ip  # Import function to generate random IP addresses for request headers

# Define the main SearchTools class that provides web searching and URL reading capabilities
class SearchTools:
    """
    A class providing tools to perform web searches and read content from URLs using various search engines
    and a reader API service. All operations are asynchronous and retry failed requests indefinitely, with a
    fixed delay between attempts, rather than surfacing errors to the caller.

    Attributes:
        searxng_url (str): Base URL for the SearXNG search proxy service that handles Google and other search engines.
        baidu_url (str): Base URL for Baidu search engine for Chinese language searches.
        timeout (int): Timeout duration in seconds for HTTP requests to prevent indefinite hanging.
        reader_api (str): Base URL for the reader API service used to extract clean content from URLs.

    Methods:
        read_url(url): Asynchronously reads and returns the textual content of the specified URL using the reader API.
        search(query, engine): Asynchronously performs a web search with the given query on the specified search engine,
                               returning the raw HTML response text.
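
    Example:
        A minimal usage sketch (awaited from inside a running event loop; the
        query and URL below are illustrative placeholders):

            tools = SearchTools()
            results = await tools.search("open source search", engine="google")
            page = await tools.read_url("https://example.com")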
    """

    # Constructor method to initialize the SearchTools instance with all necessary configuration values
    def __init__(self):
        """
        Initialize the SearchTools instance with predefined URLs and timeout settings.
        This method sets up all the base URLs and configuration parameters needed for web searching and content reading.
        """
        # Set the base URL for SearXNG search proxy service which provides access to multiple search engines
        self.searxng_url = "https://paulgo.io/search"
        # Set the base URL for Baidu search engine for handling Chinese language queries
        self.baidu_url = "https://www.baidu.com/s"
        # Set timeout duration to 30 seconds to balance between allowing slow responses and preventing infinite waits
        self.timeout = 30
        # Set the reader API endpoint that converts web pages into clean, readable text format
        self.reader_api = "https://r.jina.ai/"

    # Private helper method that implements the core retry logic for all HTTP requests
    async def _fetch_with_retry(self, session, method, url, **kwargs):
        """
        Helper method to perform HTTP requests with infinite retry until a valid response is obtained.
        This method ensures that connections never fail permanently and will keep trying until success.

        Args:
            session (aiohttp.ClientSession): The aiohttp session object to use for making HTTP requests.
            method (str): HTTP method to use for the request (e.g., 'get', 'post', 'put', 'delete').
            url (str): The complete URL to send the request to.
            **kwargs: Additional keyword arguments to pass to the aiohttp request method (headers, data, etc.).

        Returns:
            str: The response text content when a successful request is finally achieved.
        """
        # Loop indefinitely; the only exit is returning a successful response
        while True:
            # Use a try-except block to catch any type of exception that might occur during the request
            try:
                # Make the actual HTTP request using the provided session, method, URL and additional arguments
                async with session.request(method, url, **kwargs) as response:
                    # Check if the response status indicates success, raise exception if it's an error status
                    response.raise_for_status()
                    # Return the text content of the successful response
                    return await response.text()
            # Catch any exception that occurs during the request process
            except Exception:
                # Retry on any exception without stopping the loop or raising the error
                # Wait for 5 seconds before attempting the next retry to avoid overwhelming the server
                await asyncio.sleep(5)

    # Public method to read and extract content from any given URL
    async def read_url(self, url: str) -> str:
        """
        Asynchronously read and retrieve the textual content of a given URL using the reader API with infinite retry.
        This method will keep trying until it successfully retrieves the content from the specified URL.

        Args:
            url (str): The complete URL of the webpage to read content from.

        Returns:
            str: The clean textual content extracted from the URL by the reader API service.
        """
        # Prepare the POST data payload containing the target URL for the reader API
        data = {"url": url}
        # Create an aiohttp client session with the configured timeout settings
        async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=self.timeout)) as session:
            # Use the retry helper method to POST the URL to the reader API and get the content
            return await self._fetch_with_retry(session, 'post', self.reader_api, data=data)

    # Public method to perform web searches using different search engines
    async def search(self, query: str, engine: str = "google") -> str:
        """
        Asynchronously perform a web search for the given query using the specified search engine with infinite retry.
        This method will keep trying until it successfully retrieves search results from the chosen search engine.

        Args:
            query (str): The search query string containing the terms to search for.
            engine (str, optional): The search engine to use. "baidu" queries Baidu directly; "google" uses the
                                    SearXNG Google prefix; any other value falls back to the SearXNG Bing prefix.
                                    Defaults to "google" if not specified.

        Returns:
            str: The raw HTML content of the search results page from the specified search engine.
        """
        # Check if the user wants to use Baidu search engine for the query
        if engine == "baidu":
            # Construct the full URL by combining reader API, Baidu URL and the search query parameter
            url = f"{self.reader_api}{self.baidu_url}?wd={query}"
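            # For example, a query like "天气" yields a URL of this form (illustrative):
            #   https://r.jina.ai/https://www.baidu.com/s?wd=天气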
            # Set HTTP headers specific to Baidu search results extraction
            headers = {
                # Target the main content container where Baidu displays search results
                "X-Target-Selector": "#content_left",
                "X-Forwarded-For": generate_ip()  # Random IP address to simulate different client origins
            }
        # Handle all other search engines (Google, Bing, etc.) through SearXNG proxy
        else:
            # Choose the SearXNG bang prefix: Google for "google", Bing for anything else
            prefix = "!go" if engine == "google" else "!bi"
            # Construct the full URL by combining reader API, SearXNG URL, prefix and query
            url = f"{self.reader_api}{self.searxng_url}?q={prefix} {query}"
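            # For example, engine="bing" with query "python" yields (illustrative):
            #   https://r.jina.ai/https://paulgo.io/search?q=!bi python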
            # Set HTTP headers specific to SearXNG search results extraction
            headers = {
                # Target the URLs container where SearXNG displays search result links
                "X-Target-Selector": "#urls",
                "X-Forwarded-For": generate_ip()  # Random IP address to simulate different client origins
            }
        # Create an aiohttp client session with the configured timeout settings
        async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=self.timeout)) as session:
            # Use the retry helper method to GET the search results and return the HTML content
            return await self._fetch_with_retry(session, 'get', url, headers=headers)
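

# A minimal usage sketch for running this module directly. The query and URL
# below are illustrative placeholders, and running from the project root is
# assumed so the src.utils.ip_generator import resolves.
if __name__ == "__main__":
    async def _demo():
        tools = SearchTools()
        # Search Google (via the SearXNG proxy) and print a preview of the raw results
        results = await tools.search("aiohttp retry pattern", engine="google")
        print(results[:500])
        # Read a page through the reader API and print a preview of its text
        page = await tools.read_url("https://example.com")
        print(page[:500])

    asyncio.run(_demo())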