#
# SPDX-FileCopyrightText: Hadad <[email protected]>
# SPDX-License-Identifier: Apache-2.0
#
import aiohttp  # Import the aiohttp library to perform asynchronous HTTP requests
import asyncio  # Import asyncio to handle asynchronous operations and implement retry delays
from urllib.parse import quote  # Import quote to percent-encode search queries embedded in URLs
from src.utils.ip_generator import generate_ip  # Import function to generate random IP addresses for request headers

# Define the main SearchTools class that provides web searching and URL reading capabilities
class SearchTools:
""" | |
A comprehensive class providing tools to perform web searches and read content from URLs using various search engines | |
and a reader API service. This class implements full asynchronous operations with robust retry mechanisms to ensure | |
connections remain active even when encountering errors. | |
Attributes: | |
searxng_url (str): Base URL for the SearXNG search proxy service that handles Google and other search engines. | |
baidu_url (str): Base URL for Baidu search engine for Chinese language searches. | |
timeout (int): Timeout duration in seconds for HTTP requests to prevent indefinite hanging. | |
reader_api (str): Base URL for the reader API service used to extract clean content from URLs. | |
Methods: | |
read_url(url): Asynchronously reads and returns the textual content of the specified URL using the reader API. | |
search(query, engine): Asynchronously performs a web search with the given query on the specified search engine, | |
returning the raw HTML response text. | |
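
    Example (illustrative usage sketch; actual responses depend on network availability):
        tools = SearchTools()
        results = await tools.search("deep learning", engine="google")
        article = await tools.read_url("https://example.com/article")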
""" | |
    # Constructor method to initialize the SearchTools instance with all necessary configuration values
    def __init__(self):
        """
        Initialize the SearchTools instance with predefined URLs and timeout settings.
        This method sets up all the base URLs and configuration parameters needed for web searching and content reading.
        """
        # Set the base URL for the SearXNG search proxy service, which provides access to multiple search engines
        self.searxng_url = "https://paulgo.io/search"
        # Set the base URL for the Baidu search engine for handling Chinese-language queries
        self.baidu_url = "https://www.baidu.com/s"
        # Set the timeout to 30 seconds to balance allowing slow responses against preventing infinite waits
        self.timeout = 30
        # Set the reader API endpoint that converts web pages into clean, readable text
        self.reader_api = "https://r.jina.ai/"
    # Private helper method that implements the core retry logic for all HTTP requests
    async def _fetch_with_retry(self, session, method, url, **kwargs):
        """
        Perform an HTTP request, retrying indefinitely until a successful response is obtained.
        This method ensures that connections never fail permanently and will keep trying until success.

        Args:
            session (aiohttp.ClientSession): The aiohttp session object to use for making HTTP requests.
            method (str): HTTP method to use for the request (e.g., 'get', 'post', 'put', 'delete').
            url (str): The complete URL to send the request to.
            **kwargs: Additional keyword arguments to pass to the aiohttp request method (headers, data, etc.).

        Returns:
            str: The response text content once a successful request is achieved.
        """
        # Loop until a successful response is received
        while True:
            # Catch any exception that might occur during the request
            try:
                # Make the HTTP request using the provided session, method, URL, and additional arguments
                async with session.request(method, url, **kwargs) as response:
                    # Raise an exception if the response has an error status (4xx/5xx)
                    response.raise_for_status()
                    # Return the text content of the successful response
                    return await response.text()
            # Any failure (connection error, timeout, bad status) falls through to a retry
            except Exception:
                # Wait 5 seconds before the next attempt to avoid overwhelming the server
                await asyncio.sleep(5)
    # Public method to read and extract content from any given URL
    async def read_url(self, url: str) -> str:
        """
        Asynchronously read and retrieve the textual content of a given URL using the reader API, with infinite retry.
        This method will keep trying until it successfully retrieves the content from the specified URL.

        Args:
            url (str): The complete URL of the webpage to read content from.

        Returns:
            str: The clean textual content extracted from the URL by the reader API service.
        """
        # Prepare the POST data payload containing the target URL for the reader API
        data = {"url": url}
        # Create an aiohttp client session with the configured timeout settings
        async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=self.timeout)) as session:
            # Use the retry helper to POST the URL to the reader API and return the extracted content
            return await self._fetch_with_retry(session, 'post', self.reader_api, data=data)
    # Public method to perform web searches using different search engines
    async def search(self, query: str, engine: str = "google") -> str:
        """
        Asynchronously perform a web search for the given query using the specified search engine, with infinite retry.
        This method will keep trying until it successfully retrieves search results from the chosen search engine.

        Args:
            query (str): The search query string containing the terms to search for.
            engine (str, optional): The search engine to use. "google" and "baidu" are supported directly;
                any other value falls back to Bing through the SearXNG proxy. Defaults to "google".

        Returns:
            str: The raw HTML content of the search results page from the specified search engine.
        """
        # Check if the user wants to use the Baidu search engine for the query
        if engine == "baidu":
            # Construct the full URL by chaining the reader API in front of the Baidu search URL;
            # the query is percent-encoded so characters such as '&' or '#' cannot break the URL
            url = f"{self.reader_api}{self.baidu_url}?wd={quote(query)}"
            # Set HTTP headers specific to Baidu search results extraction
            headers = {
                # Target the main content container where Baidu displays search results
                "X-Target-Selector": "#content_left",
                "X-Forwarded-For": generate_ip()  # Random IP address to simulate different client origins
            }
        # Handle all other engines through the SearXNG proxy
        else:
            # Select the SearXNG bang prefix for the requested engine: "!go" for Google, "!bi" for Bing
            prefix = "!go" if engine == "google" else "!bi"
            # Construct the full URL by chaining the reader API in front of the SearXNG search URL,
            # percent-encoding the bang prefix and query so they form a valid query string
            url = f"{self.reader_api}{self.searxng_url}?q={quote(f'{prefix} {query}')}"
            # Set HTTP headers specific to SearXNG search results extraction
            headers = {
                # Target the URLs container where SearXNG displays search result links
                "X-Target-Selector": "#urls",
                "X-Forwarded-For": generate_ip()  # Random IP address to simulate different client origins
            }
        # Create an aiohttp client session with the configured timeout settings
        async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=self.timeout)) as session:
            # Use the retry helper to GET the search results and return the HTML content
            return await self._fetch_with_retry(session, 'get', url, headers=headers)
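
# A minimal usage sketch (not part of the original module): it assumes this file lives where
# `src.utils.ip_generator` is importable and that the external services above are reachable.
# Because the retry loop never gives up, a run may block indefinitely if the network is down.
if __name__ == "__main__":
    async def _demo():
        tools = SearchTools()
        # Perform a Google search through the SearXNG proxy and print a preview of the raw HTML
        results = await tools.search("python asyncio tutorial", engine="google")
        print(results[:500])
        # Read a single page through the reader API and print a preview of the extracted text
        content = await tools.read_url("https://example.com")
        print(content[:500])
    asyncio.run(_demo())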