PDFExtractor / application /tools /web_search_tools.py
Vela
enhanced graph
75115cd
import os
from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel
from typing import List
from application.utils.logger import get_logger
from typing import Literal
from duckduckgo_search import DDGS
from tavily import TavilyClient
from langchain_core.tools import tool
import ast
# Module-level logger obtained from the project's logging helper.
logger = get_logger()
# Load environment variables (python-dotenv) before the API keys below are read.
load_dotenv()
# Import-time side effect: make sure a "reports" directory exists in the current
# working directory; exist_ok=True keeps repeated imports from raising.
os.makedirs("reports", exist_ok=True)
# os.getenv returns None when the variable is unset; no validation happens here.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Shared clients used by the tool functions defined in this module.
tavily_client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
client = OpenAI(api_key=OPENAI_API_KEY)
# Structured result returned by get_top_companies_from_web: a validated list of
# company names.
class CompanyListResponse(BaseModel):
    # Company names, in the order they were parsed from the model output.
    companies: list[str]
# Example value of `parsed_list` in get_top_companies_from_web: ['Puma', 'Gap', 'PVH Corp.', 'GUESS', 'Hugo Boss']
@tool
def get_top_companies_from_web(query: str):
    """
    Searches the web for a list of top companies based on a given query.

    The query is forwarded to the model as-is, so a desired number of companies
    should be stated in the query itself; no count is extracted or enforced here.
    The model is asked to answer with a Python list literal of company names,
    which is then parsed and validated.

    Args:
        query (str): The search query from the user.

    Returns:
        CompanyListResponse: A structured list of top company names.

    Raises:
        ValueError: If the web-search request fails or the model output cannot
            be parsed into a list of company names.
    """
    # NOTE(review): with langchain's @tool the docstring above presumably doubles
    # as the tool description shown to the agent - confirm before rewording it.
    prompt = (
        f"{query} "
        "focusing on globally recognized companies known for size, influence, or sustainability efforts. "
        "Respond with a Python list of company names only, no explanation. "
        "Example: ['Company A', 'Company B', 'Company C']. "
        "Please do not include any other text or formatting."
    )
    logger.info(f'User query : {query}')

    # Bind `output` before the try block: the except handler formats it into the
    # error message, and if the API call itself raised, the name would otherwise
    # be unbound and the handler would die with UnboundLocalError instead of the
    # intended ValueError.
    output = None
    try:
        response = client.responses.create(
            model="gpt-4o-mini",
            tools=[{"type": "web_search_preview"}],
            input=prompt,
        )
        output = response.output_text
        # literal_eval only accepts Python literals, so model output is never
        # executed as code (unlike eval).
        parsed_list = ast.literal_eval(output.strip())
        logger.info(f"Parsed List: {parsed_list}")
        # Pydantic validation rejects anything that is not a list of strings;
        # that failure is also funnelled into the ValueError below.
        return CompanyListResponse(companies=parsed_list)
    except Exception as e:
        logger.error(f"Error parsing response: {e}")
        # Chain the original exception so the root cause stays in the traceback.
        raise ValueError(f"Failed to parse company list: {output}") from e
@tool
def get_sustainability_report_pdf(
    company_name: str,
    year: int | None = None,
    max_results: int = 1,
    search_engine: Literal["tavily", "duckduckgo", "both"] = "duckduckgo",
) -> str | None:
    """
    Finds and returns the direct PDF link for the sustainability report of a SPECIFIC, NAMED company.
    Use this tool when the user provides the exact name of the company they want the report for.
    Optionally, a specific 'year' can be provided.

    Args:
        company_name (str): The name of the company.
        year (int, optional): The year of the sustainability report. Defaults to None.
        max_results (int, optional): Maximum number of search results to request from
            each engine that is queried (Tavily and/or DuckDuckGo). Defaults to 1.
        search_engine (str, optional): Search engine to use. Defaults to "duckduckgo".
            - "tavily"     : only use Tavily search
            - "duckduckgo" : only use DuckDuckGo (default)
            - "both"       : try Tavily first, fallback to DuckDuckGo if needed

    Returns:
        str or None: The URL of the sustainability report PDF if found, otherwise None.

    Raises:
        ValueError: If 'search_engine' is not one of the supported options.

    Search Strategy:
        - Tavily: Searches with advanced search settings.
        - DuckDuckGo: Searches public web with 'filetype:pdf' filter.
        - Only URLs ending with '.pdf' are considered valid.

    Notes:
        - Any search failures are internally handled and logged.
    """

    def search_with_tavily(query: str) -> str | None:
        # Best-effort: any failure is logged and reported as "nothing found".
        try:
            logger.info(f"Searching Tavily for: {query}")
            result = tavily_client.search(query=query, search_depth="advanced", max_results=max_results)
            for res in result.get("results", []):
                # Tolerate entries without a usable "url" instead of letting one
                # malformed result abort the whole search.
                url = res.get("url") or ""
                if url.lower().endswith(".pdf"):
                    logger.info(f"Found PDF via Tavily: {url}")
                    return url
            logger.info("No PDF found via Tavily.")
        except Exception as e:
            logger.error(f"Tavily search error: {e}")
        return None

    def search_with_duckduckgo(query: str, max_results: int) -> str | None:
        # Best-effort: any failure is logged and reported as "nothing found".
        try:
            logger.info(f"Searching DuckDuckGo for: {query}")
            with DDGS() as ddgs:
                search_results = ddgs.text(query.strip(), max_results=max_results)
                # Iterate while the session is still open so this also works if
                # the results are produced lazily.
                for result in search_results:
                    pdf_url = result.get('href') or ''
                    if pdf_url.lower().endswith('.pdf'):
                        logger.info(f"Found PDF via DuckDuckGo: {pdf_url}")
                        return pdf_url
                    logger.info(f"Skipped non-PDF link: {pdf_url}")
        except Exception as error:
            logger.error(f"DuckDuckGo search error: {error}")
        return None

    # Compose search query
    query = f"{company_name} sustainability report filetype:pdf"
    if year:
        query += f" {year}"
    logger.info(f"Starting sustainability report search for '{company_name}', year={year}, using '{search_engine}' engine.")

    # Perform search according to engine selection
    if search_engine == "tavily":
        return search_with_tavily(query)
    if search_engine == "duckduckgo":
        return search_with_duckduckgo(query, max_results=max_results)
    if search_engine == "both":
        # Tavily first; DuckDuckGo only when Tavily yields nothing.
        pdf_url = search_with_tavily(query)
        if not pdf_url:
            pdf_url = search_with_duckduckgo(query, max_results=max_results)
        return pdf_url

    logger.error(f"Invalid search engine option provided: {search_engine}")
    raise ValueError(f"Invalid search engine '{search_engine}'. Choose from 'tavily', 'duckduckgo', or 'both'.")