#
# SPDX-FileCopyrightText: Hadad <[email protected]>
# SPDX-License-Identifier: Apache-2.0
#
import aiohttp  # Import the aiohttp library to perform asynchronous HTTP requests
import asyncio  # Import asyncio to handle asynchronous operations and implement retry delays
from urllib.parse import quote  # Import quote to percent-encode search queries embedded in URLs
from src.utils.ip_generator import generate_ip  # Import function to generate random IP addresses for request headers

# Define the main SearchTools class that provides web searching and URL reading capabilities
class SearchTools:
""" | |
A comprehensive class providing tools to perform web searches and read content from URLs using various search engines | |
and a reader API service. This class implements full asynchronous operations with robust retry mechanisms to ensure | |
connections remain active even when encountering errors. | |
Attributes: | |
searxng_url (str): Base URL for the SearXNG search proxy service that handles Google and other search engines. | |
baidu_url (str): Base URL for Baidu search engine for Chinese language searches. | |
timeout (int): Timeout duration in seconds for HTTP requests to prevent indefinite hanging. | |
reader_api (str): Base URL for the reader API service used to extract clean content from URLs. | |
Methods: | |
read_url(url): Asynchronously reads and returns the textual content of the specified URL using the reader API. | |
search(query, engine): Asynchronously performs a web search with the given query on the specified search engine, | |
returning the raw HTML response text. | |
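
    Example (illustrative usage sketch; actual responses depend on network availability):
        tools = SearchTools()
        results = await tools.search("deep learning", engine="google")
        article = await tools.read_url("https://example.com/article")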
""" | |
    # Constructor method to initialize the SearchTools instance with all necessary configuration values
    def __init__(self):
        """
        Initialize the SearchTools instance with predefined URLs and timeout settings.
        This method sets up all the base URLs and configuration parameters needed for web searching and content reading.
        """
        # Set the base URL for the SearXNG search proxy service, which provides access to multiple search engines
        self.searxng_url = "https://paulgo.io/search"
        # Set the base URL for the Baidu search engine for handling Chinese-language queries
        self.baidu_url = "https://www.baidu.com/s"
        # Set the timeout to 30 seconds to balance allowing slow responses against preventing infinite waits
        self.timeout = 30
        # Set the reader API endpoint that converts web pages into clean, readable text
        self.reader_api = "https://r.jina.ai/"
    # Private helper method that implements the core retry logic for all HTTP requests
    async def _fetch_with_retry(self, session, method, url, **kwargs):
        """
        Perform an HTTP request, retrying indefinitely until a successful response is obtained.
        This method ensures that connections never fail permanently and will keep trying until success.

        Args:
            session (aiohttp.ClientSession): The aiohttp session object to use for making HTTP requests.
            method (str): HTTP method to use for the request (e.g., 'get', 'post', 'put', 'delete').
            url (str): The complete URL to send the request to.
            **kwargs: Additional keyword arguments to pass to the aiohttp request method (headers, data, etc.).

        Returns:
            str: The response text content once a successful request is achieved.
        """
        # Loop until a successful response is received
        while True:
            # Catch any exception that might occur during the request
            try:
                # Make the HTTP request using the provided session, method, URL, and additional arguments
                async with session.request(method, url, **kwargs) as response:
                    # Raise an exception if the response has an error status (4xx/5xx)
                    response.raise_for_status()
                    # Return the text content of the successful response
                    return await response.text()
            # Any failure (connection error, timeout, bad status) falls through to a retry
            except Exception:
                # Wait 5 seconds before the next attempt to avoid overwhelming the server
                await asyncio.sleep(5)
    # Public method to read and extract content from any given URL
    async def read_url(self, url: str) -> str:
        """
        Asynchronously read and retrieve the textual content of a given URL using the reader API, with infinite retry.
        This method will keep trying until it successfully retrieves the content from the specified URL.

        Args:
            url (str): The complete URL of the webpage to read content from.

        Returns:
            str: The clean textual content extracted from the URL by the reader API service.
        """
        # Prepare the POST data payload containing the target URL for the reader API
        data = {"url": url}
        # Create an aiohttp client session with the configured timeout settings
        async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=self.timeout)) as session:
            # Use the retry helper to POST the URL to the reader API and return the extracted content
            return await self._fetch_with_retry(session, 'post', self.reader_api, data=data)
    # Public method to perform web searches using different search engines
    async def search(self, query: str, engine: str = "google") -> str:
        """
        Asynchronously perform a web search for the given query using the specified search engine, with infinite retry.
        This method will keep trying until it successfully retrieves search results from the chosen search engine.

        Args:
            query (str): The search query string containing the terms to search for.
            engine (str, optional): The search engine to use. "google" and "baidu" are supported directly;
                any other value falls back to Bing through the SearXNG proxy. Defaults to "google".

        Returns:
            str: The raw HTML content of the search results page from the specified search engine.
        """
        # Check if the user wants to use the Baidu search engine for the query
        if engine == "baidu":
            # Construct the full URL by chaining the reader API in front of the Baidu search URL;
            # the query is percent-encoded so characters such as '&' or '#' cannot break the URL
            url = f"{self.reader_api}{self.baidu_url}?wd={quote(query)}"
            # Set HTTP headers specific to Baidu search results extraction
            headers = {
                # Target the main content container where Baidu displays search results
                "X-Target-Selector": "#content_left",
                "X-Forwarded-For": generate_ip()  # Random IP address to simulate different client origins
            }
        # Handle all other engines through the SearXNG proxy
        else:
            # Select the SearXNG bang prefix for the requested engine: "!go" for Google, "!bi" for Bing
            prefix = "!go" if engine == "google" else "!bi"
            # Construct the full URL by chaining the reader API in front of the SearXNG search URL,
            # percent-encoding the bang prefix and query so they form a valid query string
            url = f"{self.reader_api}{self.searxng_url}?q={quote(f'{prefix} {query}')}"
            # Set HTTP headers specific to SearXNG search results extraction
            headers = {
                # Target the URLs container where SearXNG displays search result links
                "X-Target-Selector": "#urls",
                "X-Forwarded-For": generate_ip()  # Random IP address to simulate different client origins
            }
        # Create an aiohttp client session with the configured timeout settings
        async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=self.timeout)) as session:
            # Use the retry helper to GET the search results and return the HTML content
            return await self._fetch_with_retry(session, 'get', url, headers=headers)
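
# A minimal usage sketch (not part of the original module): it assumes this file lives where
# `src.utils.ip_generator` is importable and that the external services above are reachable.
# Because the retry loop never gives up, a run may block indefinitely if the network is down.
if __name__ == "__main__":
    async def _demo():
        tools = SearchTools()
        # Perform a Google search through the SearXNG proxy and print a preview of the raw HTML
        results = await tools.search("python asyncio tutorial", engine="google")
        print(results[:500])
        # Read a single page through the reader API and print a preview of the extracted text
        content = await tools.read_url("https://example.com")
        print(content[:500])
    asyncio.run(_demo())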