# NOTE: page-scrape residue from the hosting site's header ("Spaces: Sleeping");
# not part of the program.
import os | |
from dotenv import load_dotenv | |
from openai import OpenAI | |
from pydantic import BaseModel | |
from typing import List | |
from application.utils.logger import get_logger | |
from typing import Literal | |
from duckduckgo_search import DDGS | |
from tavily import TavilyClient | |
from langchain_core.tools import tool | |
import ast | |
# Module-level logger shared by every function in this file.
logger = get_logger()

# Load environment variables from a .env file (if any) BEFORE the os.getenv()
# calls below, otherwise the API keys would be read as None.
load_dotenv()

# Ensure the "reports" directory exists at import time; exist_ok=True keeps
# repeated imports from failing. NOTE(review): nothing in the visible code
# writes to this directory - presumably used elsewhere; confirm.
os.makedirs("reports", exist_ok=True)

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# API clients are built once at import time and reused by the functions below.
tavily_client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
client = OpenAI(api_key=OPENAI_API_KEY)
class CompanyListResponse(BaseModel):
    """Structured response wrapping a list of company names."""

    # Company names as plain strings.
    companies: List[str]
# Example of a parsed company list: ['Puma', 'Gap', 'PVH Corp.', 'GUESS', 'Hugo Boss']
def get_top_companies_from_web(query: str) -> "CompanyListResponse":
    """
    Search the web for a list of top companies matching a user query.

    The query is embedded in a prompt that asks the model to answer with a
    Python list literal of company names and nothing else. The reply is parsed
    with ``ast.literal_eval`` and wrapped in a ``CompanyListResponse``.

    The number of companies is not enforced here: it is whatever the model
    returns for the given query.

    Args:
        query (str): The search query from the user.

    Returns:
        CompanyListResponse: A structured list of company names.

    Raises:
        ValueError: If the API call fails, or the model output cannot be
            parsed/validated as a list of company names. The original
            exception is chained as ``__cause__``.
    """
    prompt = (
        f"{query} "
        "focusing on globally recognized companies known for size, influence, or sustainability efforts. "
        "Respond with a Python list of company names only, no explanation. "
        "Example: ['Company A', 'Company B', 'Company C']. "
        "Please do not include any other text or formatting."
    )
    logger.info(f'User query : {query}')

    # Bound before the try block so the error path can always report it, even
    # when the API call itself fails and no output was ever received.
    output = None
    try:
        response = client.responses.create(
            model="gpt-4o-mini",
            tools=[{"type": "web_search_preview"}],
            input=prompt,
        )
        output = response.output_text
        # literal_eval (not eval): the model output is untrusted text, so only
        # plain Python literals may be accepted.
        parsed_list = ast.literal_eval(output.strip())
        logger.info(f"Parsed List: {parsed_list}")
        return CompanyListResponse(companies=parsed_list)
    except Exception as e:
        logger.error(f"Error parsing response: {e}")
        # Chain the cause so the underlying failure stays visible to callers.
        raise ValueError(f"Failed to parse company list: {output}") from e
def get_sustainability_report_pdf(
    company_name: str,
    year: int | None = None,
    max_results: int = 1,
    search_engine: Literal["tavily", "duckduckgo", "both"] = "duckduckgo",
) -> str | None:
    """
    Finds and returns the direct PDF link for the sustainability report of a SPECIFIC, NAMED company.
    Use this tool when the user provides the exact name of the company they want the report for.
    Optionally, a specific 'year' can be provided.

    Args:
        company_name (str): The name of the company.
        year (int, optional): The year of the sustainability report. Defaults to None.
        max_results (int, optional): Maximum number of search results requested
            from whichever engine is used (Tavily and/or DuckDuckGo). Defaults to 1.
        search_engine (str, optional): Search engine to use.
            - "tavily"               : only use Tavily search
            - "duckduckgo" (default) : only use DuckDuckGo
            - "both"                 : try Tavily first, fall back to DuckDuckGo if needed

    Returns:
        str or None: The URL of the sustainability report PDF if found, otherwise None.

    Raises:
        ValueError: If 'search_engine' is not one of the supported options.

    Search Strategy:
        - Tavily: Searches with search_depth="advanced".
        - DuckDuckGo: Text search; the query carries a 'filetype:pdf' filter.
        - Only URLs ending with '.pdf' (case-insensitive) are considered valid.

    Notes:
        - Search failures are handled internally: they are logged and treated
          as "no result" (None).
    """

    def search_with_tavily(query: str) -> str | None:
        # Return the first Tavily result whose URL ends in .pdf, else None.
        try:
            logger.info(f"Searching Tavily for: {query}")
            result = tavily_client.search(query=query, search_depth="advanced", max_results=max_results)
            for res in result.get("results", []):
                # A result without a usable URL is skipped instead of
                # aborting the whole search through the except below.
                url = res.get("url") or ""
                if url.lower().endswith(".pdf"):
                    logger.info(f"Found PDF via Tavily: {url}")
                    return url
            logger.info("No PDF found via Tavily.")
        except Exception as e:
            logger.error(f"Tavily search error: {e}")
        return None

    def search_with_duckduckgo(query: str, max_results: int) -> str | None:
        # Return the first DuckDuckGo result whose link ends in .pdf, else None.
        try:
            logger.info(f"Searching DuckDuckGo for: {query}")
            with DDGS() as ddgs:
                search_results = ddgs.text(query.strip(), max_results=max_results)
                for result in search_results:
                    # 'or ""' guards against an explicit None href, which
                    # would otherwise end the scan of the remaining results.
                    pdf_url = result.get('href') or ''
                    if pdf_url.lower().endswith('.pdf'):
                        logger.info(f"Found PDF via DuckDuckGo: {pdf_url}")
                        return pdf_url
                    logger.info(f"Skipped non-PDF link: {pdf_url}")
        except Exception as error:
            logger.error(f"DuckDuckGo search error: {error}")
        return None

    # Compose search query
    query = f"{company_name} sustainability report filetype:pdf"
    if year:
        query += f" {year}"
    logger.info(f"Starting sustainability report search for '{company_name}', year={year}, using '{search_engine}' engine.")

    # Perform search according to engine selection
    if search_engine == "tavily":
        return search_with_tavily(query)
    if search_engine == "duckduckgo":
        return search_with_duckduckgo(query, max_results=max_results)
    if search_engine == "both":
        # Tavily first; DuckDuckGo only when Tavily found nothing (or failed).
        pdf_url = search_with_tavily(query)
        if not pdf_url:
            pdf_url = search_with_duckduckgo(query, max_results=max_results)
        return pdf_url

    logger.error(f"Invalid search engine option provided: {search_engine}")
    raise ValueError(f"Invalid search engine '{search_engine}'. Choose from 'tavily', 'duckduckgo', or 'both'.")