Spaces:
Sleeping
Sleeping
File size: 1,485 Bytes
fb9c306 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
import trafilatura
from tqdm.asyncio import tqdm_asyncio as tqdm_async
from graphgen.models import GoogleSearch
from graphgen.utils import logger
async def _process_single_entity(
    entity_name: str, google_search_client: GoogleSearch
) -> str | None:
    """
    Search Google for a single entity and extract text from the top hit.

    :param entity_name: name of the entity to search for
    :param google_search_client: client whose ``search`` method is called
        with the entity name
    :return: stripped text extracted from the first search result's page, or
        None when the search yields nothing, the page cannot be downloaded,
        or no text can be extracted from it
    """
    search_results = google_search_client.search(entity_name)
    if not search_results:
        return None

    # Only the first search result is inspected for details.
    first_result = search_results[0]
    url = first_result["link"]

    # fetch_url returns None on a failed download; stop here instead of
    # handing None to the extractor.
    content = trafilatura.fetch_url(url)
    if content is None:
        logger.warning(
            "Entity %s: could not download %s",
            entity_name,
            url,
        )
        return None

    # extract returns None when no main text is found; guard before strip()
    # so the caller does not see a spurious AttributeError.
    extracted = trafilatura.extract(
        content, include_comments=False, include_links=False
    )
    if not extracted:
        logger.warning(
            "Entity %s: no text could be extracted from %s",
            entity_name,
            url,
        )
        return None

    summary = extracted.strip()
    logger.info(
        "Entity %s search result: %s",
        entity_name,
        summary,
    )
    return summary
async def search_google(
    google_search_client: GoogleSearch,
    entities: set[str],
) -> dict:
    """
    Look up every entity on Google and collect the summaries that were found.

    Entities are processed one after another under a progress bar. An entity
    whose processing raises is logged and skipped, so one failure does not
    abort the whole run.

    :param google_search_client: Google search client handed to
        ``_process_single_entity`` for each lookup
    :param entities: set of entity names to search for
    :return: dict mapping each entity name to its summary; entities with an
        empty/None summary or a processing error are left out
    """
    results = {}
    progress = tqdm_async(entities, desc="Searching Google", total=len(entities))
    async for name in progress:
        try:
            text = await _process_single_entity(name, google_search_client)
        except Exception as err:  # pylint: disable=broad-except
            # Best-effort: record the failure and move on to the next entity.
            logger.error("Error processing entity %s: %s", name, str(err))
            continue
        if text:
            results[name] = text
    return results
|