Spaces:
Sleeping
Sleeping
import trafilatura | |
from tqdm.asyncio import tqdm_asyncio as tqdm_async | |
from graphgen.models import GoogleSearch | |
from graphgen.utils import logger | |
async def _process_single_entity(
    entity_name: str, google_search_client: GoogleSearch
) -> str | None:
    """
    Search Google for one entity and extract text from the top result's page.

    The search and the page download are called synchronously (they are not
    awaited), so this coroutine blocks the event loop while they run.

    :param entity_name: name of the entity to search for
    :param google_search_client: Google search client
    :return: stripped text extracted from the first result's page, or None
        when the search yields nothing, the page cannot be downloaded, or no
        text can be extracted from it
    """
    search_results = google_search_client.search(entity_name)
    if not search_results:
        return None

    # Get more details from the first search result
    first_result = search_results[0]
    url = first_result["link"]

    content = trafilatura.fetch_url(url)
    if not content:
        # fetch_url yields None when the download fails; nothing to extract.
        logger.info("Entity %s: could not fetch %s", entity_name, url)
        return None

    summary = trafilatura.extract(
        content, include_comments=False, include_links=False
    )
    if summary is None:
        # extract yields None when no main text is found; calling .strip()
        # on it would raise AttributeError instead of honoring `str | None`.
        logger.info("Entity %s: no text extracted from %s", entity_name, url)
        return None

    summary = summary.strip()
    logger.info(
        "Entity %s search result: %s",
        entity_name,
        summary,
    )
    return summary
async def search_google(
    google_search_client: GoogleSearch,
    entities: set[str],
) -> dict:
    """
    Look up each entity on Google and gather the summaries that come back.

    Entities are handled one at a time behind a progress bar. An entity whose
    lookup raises is logged and skipped; an entity with an empty or missing
    summary is left out of the result.

    :param google_search_client: Google search client
    :param entities: set of entities to search
    :return: dict mapping each entity to its non-empty summary
    """
    results = {}
    progress = tqdm_async(entities, desc="Searching Google", total=len(entities))
    async for name in progress:
        try:
            text = await _process_single_entity(name, google_search_client)
        except Exception as exc:  # pylint: disable=broad-except
            logger.error("Error processing entity %s: %s", name, str(exc))
            continue
        if text:
            results[name] = text
    return results