#
# SPDX-FileCopyrightText: Hadad <[email protected]>
# SPDX-License-Identifier: Apache-2.0
#
import aiohttp # Import the aiohttp library to perform asynchronous HTTP requests
import asyncio # Import the asyncio library to handle asynchronous operations and implement delay mechanisms
from urllib.parse import quote # Import quote to percent-encode search queries before embedding them in URLs
from src.utils.ip_generator import generate_ip # Import function to generate random IP addresses for request headers
# Define the main SearchTools class that provides web searching and URL reading capabilities
class SearchTools:
"""
    A class providing tools to perform web searches and read content from URLs using various search engines
    and a reader API service. All operations are asynchronous and wrapped in an infinite retry loop, so that
    transient errors never terminate a request permanently.
Attributes:
searxng_url (str): Base URL for the SearXNG search proxy service that handles Google and other search engines.
baidu_url (str): Base URL for Baidu search engine for Chinese language searches.
timeout (int): Timeout duration in seconds for HTTP requests to prevent indefinite hanging.
reader_api (str): Base URL for the reader API service used to extract clean content from URLs.
Methods:
read_url(url): Asynchronously reads and returns the textual content of the specified URL using the reader API.
        search(query, engine): Asynchronously performs a web search with the given query on the specified search engine,
            returning the result text extracted by the reader API.
"""
# Constructor method to initialize the SearchTools instance with all necessary configuration values
def __init__(self):
"""
Initialize the SearchTools instance with predefined URLs and timeout settings.
This method sets up all the base URLs and configuration parameters needed for web searching and content reading.
"""
# Set the base URL for SearXNG search proxy service which provides access to multiple search engines
self.searxng_url = "https://paulgo.io/search"
# Set the base URL for Baidu search engine for handling Chinese language queries
self.baidu_url = "https://www.baidu.com/s"
# Set timeout duration to 30 seconds to balance between allowing slow responses and preventing infinite waits
self.timeout = 30
# Set the reader API endpoint that converts web pages into clean, readable text format
self.reader_api = "https://r.jina.ai/"
# Private helper method that implements the core retry logic for all HTTP requests
async def _fetch_with_retry(self, session, method, url, **kwargs):
"""
Helper method to perform HTTP requests with infinite retry until a valid response is obtained.
This method ensures that connections never fail permanently and will keep trying until success.
Args:
session (aiohttp.ClientSession): The aiohttp session object to use for making HTTP requests.
method (str): HTTP method to use for the request (e.g., 'get', 'post', 'put', 'delete').
url (str): The complete URL to send the request to.
**kwargs: Additional keyword arguments to pass to the aiohttp request method (headers, data, etc.).
Returns:
str: The response text content when a successful request is finally achieved.
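
        Example (illustrative sketch, assuming an already-open session; the URL is a placeholder):
            async with aiohttp.ClientSession() as session:
                text = await self._fetch_with_retry(session, 'get', 'https://example.com')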
"""
# Create an infinite loop that will only break when a successful response is received
while True:
# Use a try-except block to catch any type of exception that might occur during the request
try:
# Make the actual HTTP request using the provided session, method, URL and additional arguments
async with session.request(method, url, **kwargs) as response:
# Check if the response status indicates success, raise exception if it's an error status
response.raise_for_status()
# Return the text content of the successful response
return await response.text()
# Catch any exception that occurs during the request process
except Exception:
# Retry on any exception without stopping the loop or raising the error
                # Wait for 5 seconds before attempting the next retry to avoid overwhelming the server
await asyncio.sleep(5)
# Public method to read and extract content from any given URL
async def read_url(self, url: str) -> str:
"""
Asynchronously read and retrieve the textual content of a given URL using the reader API with infinite retry.
This method will keep trying until it successfully retrieves the content from the specified URL.
Args:
url (str): The complete URL of the webpage to read content from.
Returns:
str: The clean textual content extracted from the URL by the reader API service.
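
        Example (illustrative sketch; the URL is a placeholder, not part of the original module):
            tools = SearchTools()
            content = await tools.read_url("https://example.com/article")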
"""
# Prepare the POST data payload containing the target URL for the reader API
data = {"url": url}
# Create an aiohttp client session with the configured timeout settings
async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=self.timeout)) as session:
# Use the retry helper method to POST the URL to the reader API and get the content
return await self._fetch_with_retry(session, 'post', self.reader_api, data=data)
# Public method to perform web searches using different search engines
async def search(self, query: str, engine: str = "google") -> str:
"""
Asynchronously perform a web search for the given query using the specified search engine with infinite retry.
This method will keep trying until it successfully retrieves search results from the chosen search engine.
Args:
query (str): The search query string containing the terms to search for.
            engine (str, optional): The search engine to use. "baidu" queries Baidu directly; "google" searches
                Google through the SearXNG proxy; any other value falls back to Bing through SearXNG.
                Defaults to "google" if not specified.
        Returns:
            str: The search results text extracted from the results page by the reader API service.
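
        Example (illustrative sketch; the queries are placeholders):
            tools = SearchTools()
            results = await tools.search("python asyncio tutorial")
            chinese = await tools.search("你好世界", engine="baidu")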
"""
# Check if the user wants to use Baidu search engine for the query
if engine == "baidu":
            # Construct the full URL by combining reader API, Baidu URL and the percent-encoded search query
            url = f"{self.reader_api}{self.baidu_url}?wd={quote(query)}"
# Set HTTP headers specific to Baidu search results extraction
headers = {
# Target the main content container where Baidu displays search results
"X-Target-Selector": "#content_left",
"X-Forwarded-For": generate_ip() # Random IP address to simulate different client origins
}
# Handle all other search engines (Google, Bing, etc.) through SearXNG proxy
else:
# Determine the search prefix based on the requested engine (Google or Bing)
prefix = "!go" if engine == "google" else "!bi"
            # Construct the full URL by combining reader API, SearXNG URL, prefix and the percent-encoded query
            url = f"{self.reader_api}{self.searxng_url}?q={quote(prefix + ' ' + query)}"
# Set HTTP headers specific to SearXNG search results extraction
headers = {
# Target the URLs container where SearXNG displays search result links
"X-Target-Selector": "#urls",
"X-Forwarded-For": generate_ip() # Random IP address to simulate different client origins
}
# Create an aiohttp client session with the configured timeout settings
async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=self.timeout)) as session:
# Use the retry helper method to GET the search results and return the HTML content
            return await self._fetch_with_retry(session, 'get', url, headers=headers)
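
# Minimal usage sketch (an assumption, not part of the original module): run from the project root so that
# src.utils.ip_generator resolves; the query and URL below are placeholders for illustration only.
if __name__ == "__main__":
    async def _demo():
        tools = SearchTools()
        # Perform a Google search through the SearXNG proxy and print a preview of the results
        results = await tools.search("open source search engines", engine="google")
        print(results[:500])
        # Read a single page through the reader API and print a preview of its extracted text
        page = await tools.read_url("https://example.com")
        print(page[:500])
    # Run the demo coroutine on a fresh event loop
    asyncio.run(_demo())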