import ast
import os
from typing import List, Literal
from urllib.parse import urlparse

from dotenv import load_dotenv
from duckduckgo_search import DDGS
from langchain_core.tools import tool
from openai import OpenAI
from pydantic import BaseModel
from tavily import TavilyClient

from application.utils.logger import get_logger

logger = get_logger()
load_dotenv()

# Output directory is created at import time; nothing in this module writes to it.
os.makedirs("reports", exist_ok=True)

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
tavily_client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
client = OpenAI(api_key=OPENAI_API_KEY)


class CompanyListResponse(BaseModel):
    """Structured result holding a list of company names."""

    # Example value: ['Puma', 'Gap', 'PVH Corp.', 'GUESS', 'Hugo Boss']
    companies: List[str]


def _is_pdf_url(url: str) -> bool:
    """Return True when *url* points at a ``.pdf`` resource.

    Accepts URLs whose full text ends with ``.pdf`` as well as URLs whose
    path component ends with ``.pdf`` but carry a query string or fragment
    (e.g. ``https://host/report.pdf?download=1``).
    """
    if not url:
        return False
    lowered = url.lower()
    if lowered.endswith(".pdf"):
        return True
    return urlparse(lowered).path.endswith(".pdf")


@tool
def get_top_companies_from_web(query: str) -> CompanyListResponse:
    """
    Searches the web for a list of top companies based on a given query.

    The query is sent, together with fixed formatting instructions, to the
    OpenAI model with web search enabled. The reply is expected to be a
    Python list literal of company names and is parsed as such. How many
    companies are returned is decided by the model from the query; this
    function does not enforce a count.

    Args:
        query (str): The search query from the user.

    Returns:
        CompanyListResponse: A structured list of top company names.

    Raises:
        ValueError: If the request fails or the reply cannot be parsed into
            a list of company names.
    """
    prompt = (
        f"{query} "
        "focusing on globally recognized companies known for size, influence, or sustainability efforts. "
        "Respond with a Python list of company names only, no explanation. "
        "Example: ['Company A', 'Company B', 'Company C']. "
        "Please do not include any other text or formatting."
    )
    logger.info(f'User query : {query}')

    # Bound before the try so the error path can always report it, even when
    # the API call itself fails before any output exists.
    output = None
    try:
        response = client.responses.create(
            model="gpt-4o-mini",
            tools=[{"type": "web_search_preview"}],
            input=prompt,
        )
        output = response.output_text
        # literal_eval only accepts Python literals, so model output is never
        # executed as code.
        parsed_list = ast.literal_eval(output.strip())
        logger.info(f"Parsed List: {parsed_list}")
        return CompanyListResponse(companies=parsed_list)
    except Exception as e:
        logger.error(f"Error parsing response: {e}")
        raise ValueError(f"Failed to parse company list: {output}") from e


@tool
def get_sustainability_report_pdf(
    company_name: str,
    year: int | None = None,
    max_results: int = 1,
    search_engine: Literal["tavily", "duckduckgo", "both"] = "duckduckgo",
) -> str | None:
    """
    Finds and returns the direct PDF link for the sustainability report of a SPECIFIC, NAMED company.
    Use this tool when the user provides the exact name of the company they want the report for.
    Optionally, a specific 'year' can be provided.

    Args:
        company_name (str): The name of the company.
        year (int, optional): The year of the sustainability report. Defaults to None.
        max_results (int, optional): Maximum number of search results to request
            from whichever search engine is used. Defaults to 1.
        search_engine (str, optional): Search engine to use.
            - "tavily"     : only use Tavily search
            - "duckduckgo" (default): only use DuckDuckGo
            - "both"       : try Tavily first, fallback to DuckDuckGo if needed

    Returns:
        str or None: The URL of the sustainability report PDF if found, otherwise None.

    Raises:
        ValueError: If 'search_engine' is not one of the supported options.

    Search Strategy:
        - Tavily: Searches with advanced search settings.
        - DuckDuckGo: Searches public web with 'filetype:pdf' filter.
        - Only URLs that point to a '.pdf' resource are considered valid.

    Notes:
        - Any search failures are internally handled and logged.
    """

    def search_with_tavily(query: str) -> str | None:
        """Return the first PDF URL from a Tavily search, or None."""
        try:
            logger.info(f"Searching Tavily for: {query}")
            result = tavily_client.search(
                query=query,
                search_depth="advanced",
                max_results=max_results,
            )
            # Tolerate results without a usable "url" instead of aborting
            # the whole search on the first malformed entry.
            candidates = (res.get("url") or "" for res in result.get("results", []))
            urls = [url for url in candidates if _is_pdf_url(url)]
            if urls:
                logger.info(f"Found PDF via Tavily: {urls[0]}")
                return urls[0]
            logger.info("No PDF found via Tavily.")
        except Exception as e:
            # Best-effort search: log and report "not found".
            logger.error(f"Tavily search error: {e}")
        return None

    def search_with_duckduckgo(query: str, max_results: int) -> str | None:
        """Return the first PDF URL from a DuckDuckGo search, or None."""
        try:
            logger.info(f"Searching DuckDuckGo for: {query}")
            with DDGS() as ddgs:
                search_results = ddgs.text(query.strip(), max_results=max_results)
                for result in search_results:
                    pdf_url = result.get('href') or ''
                    if _is_pdf_url(pdf_url):
                        logger.info(f"Found PDF via DuckDuckGo: {pdf_url}")
                        return pdf_url
                    logger.info(f"Skipped non-PDF link: {pdf_url}")
        except Exception as error:
            # Best-effort search: log and report "not found".
            logger.error(f"DuckDuckGo search error: {error}")
        return None

    # Compose search query
    query = f"{company_name} sustainability report filetype:pdf"
    if year:
        query += f" {year}"

    logger.info(f"Starting sustainability report search for '{company_name}', year={year}, using '{search_engine}' engine.")

    # Perform search according to engine selection
    if search_engine == "tavily":
        return search_with_tavily(query)
    if search_engine == "duckduckgo":
        return search_with_duckduckgo(query, max_results=max_results)
    if search_engine == "both":
        # Tavily first; DuckDuckGo only when Tavily finds nothing.
        return search_with_tavily(query) or search_with_duckduckgo(query, max_results=max_results)

    logger.error(f"Invalid search engine option provided: {search_engine}")
    raise ValueError(f"Invalid search engine '{search_engine}'. Choose from 'tavily', 'duckduckgo', or 'both'.")