Spaces:

VelaTest
/

PDFExtractor

Sleeping

PDFExtractor / application /tools /web_search_tools.py

Vela

enhanced graph

75115cd 2 months ago

5.98 kB

	import os
	from dotenv import load_dotenv
	from openai import OpenAI
	from pydantic import BaseModel
	from typing import List
	from application.utils.logger import get_logger
	from typing import Literal
	from duckduckgo_search import DDGS
	from tavily import TavilyClient
	from langchain_core.tools import tool
	import ast

	# Module-level setup; everything below runs once, when this module is imported.
	logger = get_logger()
	# Load variables from a .env file into the environment before the os.getenv calls below.
	load_dotenv()
	# Ensure a "reports" directory exists (relative to the current working directory).
	os.makedirs("reports", exist_ok=True)

	# API keys come from the environment; os.getenv yields None when a variable is unset.
	OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
	# Shared client instances used by the tool functions defined in this module.
	tavily_client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
	client = OpenAI(api_key=OPENAI_API_KEY)

	class CompanyListResponse(BaseModel):
	    """Structured result holding a list of company names."""

	    # One string per company.
	    companies: list[str]

	# parsed_list = ['Puma', 'Gap', 'PVH Corp.', 'GUESS', 'Hugo Boss']

	def _strip_code_fence(text: str) -> str:
	    """Return *text* without a surrounding Markdown code fence, if one is present."""
	    body = text.strip()
	    if len(body) < 6 or not (body.startswith("```") and body.endswith("```")):
	        return body
	    inner = body[3:-3]
	    # The opening fence may carry a language tag ("```python"); drop that first line.
	    tag, newline, rest = inner.partition("\n")
	    if newline and "[" not in tag:
	        return rest.strip()
	    return inner.strip()


	@tool
	def get_top_companies_from_web(query: str):
	    """
	    Searches the web for a list of top companies based on a given query.

	    The query is forwarded to the model together with an instruction to answer with a
	    plain Python list of company names. How many companies come back is decided by the
	    model from the wording of the query; this function does not enforce a count.

	    Args:
	        query (str): The search query from the user.

	    Returns:
	        CompanyListResponse: A structured list of top company names.

	    Raises:
	        ValueError: If the web search request fails, or if the model's reply cannot be
	            parsed into a list of company names.
	    """
	    prompt = (
	        f"{query} "
	        "focusing on globally recognized companies known for size, influence, or sustainability efforts. "
	        "Respond with a Python list of company names only, no explanation. "
	        "Example: ['Company A', 'Company B', 'Company C']. "
	        "Please do not include any other text or formatting."
	    )
	    logger.info(f'User query : {query}')

	    # The request and the parsing are handled separately so that a failed request is
	    # reported as such instead of tripping over a not-yet-assigned `output`.
	    try:
	        response = client.responses.create(
	            model="gpt-4o-mini",
	            tools=[{"type": "web_search_preview"}],
	            input=prompt,
	        )
	        output = response.output_text
	    except Exception as e:
	        logger.error(f"Web search request failed: {e}")
	        raise ValueError(f"Failed to fetch company list: {e}") from e

	    try:
	        # literal_eval (not eval) on purpose: the text is untrusted model output.
	        # Models sometimes wrap the list in a code fence despite the prompt, so unwrap it first.
	        parsed_list = ast.literal_eval(_strip_code_fence(output))
	        logger.info(f"Parsed List: {parsed_list}")
	        return CompanyListResponse(companies=parsed_list)
	    except Exception as e:
	        logger.error(f"Error parsing response: {e}")
	        raise ValueError(f"Failed to parse company list: {output}") from e

	@tool
	def get_sustainability_report_pdf(
	    company_name: str,
	    year: int | None = None,
	    max_results: int = 1,
	    search_engine: Literal["tavily", "duckduckgo", "both"] = "duckduckgo",
	) -> str | None:
	    """
	    Finds and returns the direct PDF link for the sustainability report of a SPECIFIC, NAMED company.
	    Use this tool when the user provides the exact name of the company they want the report for.
	    Optionally, a specific 'year' can be provided.

	    Args:
	        company_name (str): The name of the company.
	        year (int, optional): The year of the sustainability report. Defaults to None.
	        max_results (int, optional): Maximum number of search results to request from
	            each search engine that is used. Defaults to 1.
	        search_engine (str, optional): Search engine to use.
	            - "tavily"               : only use Tavily search
	            - "duckduckgo" (default) : only use DuckDuckGo
	            - "both"                 : try Tavily first, fallback to DuckDuckGo if needed

	    Returns:
	        str or None: The URL of the sustainability report PDF if found, otherwise None.

	    Raises:
	        ValueError: If 'search_engine' is not one of the supported options.

	    Search Strategy:
	        - Both engines receive the same query, which includes a 'filetype:pdf' hint.
	        - Tavily: Searches with advanced search settings.
	        - Only URLs ending with '.pdf' are considered valid; the first match wins.

	    Notes:
	        - Any search failures are internally handled and logged.
	    """

	    def search_with_tavily(query: str, max_results: int) -> str | None:
	        """Return the first '.pdf' URL from a Tavily search, or None."""
	        try:
	            logger.info(f"Searching Tavily for: {query}")
	            result = tavily_client.search(query=query, search_depth="advanced", max_results=max_results)
	            for res in result.get("results", []):
	                # A missing or null "url" must not abort the scan of the remaining results.
	                url = res.get("url") or ""
	                if url.lower().endswith(".pdf"):
	                    logger.info(f"Found PDF via Tavily: {url}")
	                    return url
	            logger.info("No PDF found via Tavily.")
	        except Exception as e:
	            # Best effort: a failed search is logged and reported as "nothing found".
	            logger.error(f"Tavily search error: {e}")
	        return None

	    def search_with_duckduckgo(query: str, max_results: int) -> str | None:
	        """Return the first '.pdf' URL from a DuckDuckGo text search, or None."""
	        try:
	            logger.info(f"Searching DuckDuckGo for: {query}")
	            with DDGS() as ddgs:
	                search_results = ddgs.text(query.strip(), max_results=max_results)
	                # Iterate while the session is still open, in case results are produced lazily.
	                for result in search_results:
	                    # A null "href" must not abort the scan of the remaining results.
	                    pdf_url = result.get("href") or ""
	                    if pdf_url.lower().endswith(".pdf"):
	                        logger.info(f"Found PDF via DuckDuckGo: {pdf_url}")
	                        return pdf_url
	                    logger.info(f"Skipped non-PDF link: {pdf_url}")
	        except Exception as error:
	            # Best effort: a failed search is logged and reported as "nothing found".
	            logger.error(f"DuckDuckGo search error: {error}")
	        return None

	    # Compose search query
	    query = f"{company_name} sustainability report filetype:pdf"
	    if year:
	        query += f" {year}"

	    logger.info(f"Starting sustainability report search for '{company_name}', year={year}, using '{search_engine}' engine.")

	    # Perform search according to engine selection
	    if search_engine == "tavily":
	        return search_with_tavily(query, max_results=max_results)

	    if search_engine == "duckduckgo":
	        return search_with_duckduckgo(query, max_results=max_results)

	    if search_engine == "both":
	        # Tavily first; DuckDuckGo only when Tavily found nothing (or failed).
	        pdf_url = search_with_tavily(query, max_results=max_results)
	        if not pdf_url:
	            pdf_url = search_with_duckduckgo(query, max_results=max_results)
	        return pdf_url

	    logger.error(f"Invalid search engine option provided: {search_engine}")
	    raise ValueError(f"Invalid search engine '{search_engine}'. Choose from 'tavily', 'duckduckgo', or 'both'.")