github-actions[bot]
Auto-sync from demo at Thu Aug 28 09:22:58 UTC 2025
fb9c306
raw
history blame
1.49 kB
import trafilatura
from tqdm.asyncio import tqdm_asyncio as tqdm_async
from graphgen.models import GoogleSearch
from graphgen.utils import logger
async def _process_single_entity(
    entity_name: str, google_search_client: GoogleSearch
) -> str | None:
    """
    Search for a single entity and return the text of the top search hit.

    Only the first search result is used: the page behind its ``link`` is
    downloaded and reduced to plain text with trafilatura (comments and
    links excluded).
    :param entity_name: name of the entity to search for
    :param google_search_client: Google search client
    :return: stripped text extracted from the first result's page, or None
        when the search returns nothing, the page cannot be downloaded, or
        no text can be extracted from it
    """
    search_results = google_search_client.search(entity_name)
    if not search_results:
        return None

    # Get more details from the first search result
    first_result = search_results[0]
    url = first_result["link"]

    content = trafilatura.fetch_url(url)
    if not content:
        # fetch_url gives back None when the download fails; there is
        # nothing to extract from, so report "no summary" explicitly.
        logger.warning("Entity %s: could not download %s", entity_name, url)
        return None

    extracted = trafilatura.extract(
        content, include_comments=False, include_links=False
    )
    # extract may give back None when no main text is found; guard before
    # .strip() so this does not surface as an AttributeError in the caller.
    summary = (extracted or "").strip()
    if not summary:
        logger.warning("Entity %s: no text extracted from %s", entity_name, url)
        return None

    logger.info(
        "Entity %s search result: %s",
        entity_name,
        summary,
    )
    return summary
async def search_google(
    google_search_client: GoogleSearch,
    entities: set[str],
) -> dict:
    """
    Look up every entity on Google and collect the resulting contexts.

    Entities are handled one at a time under a progress bar. An entity whose
    lookup raises is logged and skipped; an entity with an empty or missing
    summary is left out of the result.
    :param google_search_client: Google search client
    :param entities: set of entity names to search
    :return: dict mapping each entity name to its non-empty summary text
    """
    contexts = {}
    progress = tqdm_async(entities, desc="Searching Google", total=len(entities))
    async for name in progress:
        try:
            text = await _process_single_entity(name, google_search_client)
        except Exception as err:  # pylint: disable=broad-except
            # Best effort: one failing entity must not abort the whole batch.
            logger.error("Error processing entity %s: %s", name, str(err))
            continue
        if text:
            contexts[name] = text
    return contexts