Spaces:

VelaTest
/

PDFExtractor

Sleeping

File size: 5,984 Bytes

import os
from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel
from typing import List
from application.utils.logger import get_logger
from typing import Literal
from duckduckgo_search import DDGS
from tavily import TavilyClient
from langchain_core.tools import tool
import ast

# Module-level logger obtained from the project's logging helper.
logger = get_logger()
# Load variables from a .env file into the environment before the keys below are read.
load_dotenv()
# Make sure the "reports" directory exists; exist_ok avoids an error when it is already there.
os.makedirs("reports", exist_ok=True)

# Credentials come from the environment; os.getenv returns None when a variable is unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Shared service clients used by the tool functions in this module.
tavily_client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
client = OpenAI(api_key=OPENAI_API_KEY)

class CompanyListResponse(BaseModel):
    """Structured container for a list of company names."""

    # Names of the companies.
    companies: list[str]

# Example of a parsed result: ['Puma', 'Gap', 'PVH Corp.', 'GUESS', 'Hugo Boss']

@tool
def get_top_companies_from_web(query: str):
    """Search the web for top companies matching a query and return their names.

    The query is embedded in a prompt that asks the model (with its web-search
    tool enabled) to answer with a bare Python list literal of company names.
    How many names come back is decided by the model from the query text
    (e.g. "top 5 ..."); this function does not count or truncate the list.

    Args:
        query (str): The search query from the user.

    Returns:
        CompanyListResponse: A structured list of top company names.

    Raises:
        ValueError: If the request fails or the reply cannot be parsed into a
            list of company names. The original error is chained as the cause.
    """
    prompt = (
        f"{query} "
        "focusing on globally recognized companies known for size, influence, or sustainability efforts. "
        "Respond with a Python list of company names only, no explanation. "
        "Example: ['Company A', 'Company B', 'Company C']. "
        "Please do not include any other text or formatting."
    )
    logger.info(f'User query : {query}')

    # Bound before the try so the error path can always reference it, even
    # when the API call itself raises before any text has been received.
    output = None
    try:
        response = client.responses.create(
            model="gpt-4o-mini",
            tools=[{"type": "web_search_preview"}],
            input=prompt,
        )
        output = response.output_text
        candidate = output.strip()

        # literal_eval only accepts Python literals, so the model's reply is
        # parsed without ever being executed.
        try:
            parsed_list = ast.literal_eval(candidate)
        except (ValueError, SyntaxError):
            # The model sometimes wraps the list in a code fence or a short
            # sentence despite the instructions. Retry on the outermost
            # [...] span only after the plain parse failed, so every reply
            # that parsed before still parses the same way.
            start = candidate.find("[")
            end = candidate.rfind("]")
            if start == -1 or end <= start:
                raise
            parsed_list = ast.literal_eval(candidate[start:end + 1])

        logger.info(f"Parsed List: {parsed_list}")
        return CompanyListResponse(companies=parsed_list)
    except Exception as e:
        logger.error(f"Error parsing response: {e}")
        raise ValueError(f"Failed to parse company list: {output}") from e
    
@tool
def get_sustainability_report_pdf(
    company_name: str,
    year: int | None = None,
    max_results: int = 1,
    search_engine: Literal["tavily", "duckduckgo", "both"] = "duckduckgo",
) -> str | None:
    """
    Finds and returns the direct PDF link for the sustainability report of a SPECIFIC, NAMED company.
    Use this tool when the user provides the exact name of the company they want the report for.
    Optionally, a specific 'year' can be provided.

    Args:
        company_name (str): The name of the company.
        year (int, optional): The year of the sustainability report. Defaults to None.
        max_results (int, optional): Maximum number of search results to request
            from whichever engine is used. Defaults to 1.
        search_engine (str, optional): Search engine to use.
            - "tavily" : only use Tavily search
            - "duckduckgo" (default): only use DuckDuckGo
            - "both" : try Tavily first, fallback to DuckDuckGo if needed

    Returns:
        str or None: The URL of the sustainability report PDF if found, otherwise None.

    Raises:
        ValueError: If 'search_engine' is not one of the three supported values.

    Search Strategy:
        - Tavily: Searches with advanced search settings.
        - DuckDuckGo: Searches public web with 'filetype:pdf' filter.
        - A URL is considered valid when it ends with '.pdf', or when its path
          does (so a trailing query string or fragment does not hide a PDF).

    Notes:
        - Any search failures are internally handled and logged.
    """
    # Local import keeps this block self-contained; urllib.parse is stdlib.
    from urllib.parse import urlparse

    def is_pdf_url(url) -> bool:
        """Return True when 'url' is a string that points at a '.pdf' resource."""
        if not isinstance(url, str) or not url:
            return False
        lowered = url.lower()
        if lowered.endswith(".pdf"):
            return True
        try:
            # Also accept links such as ".../report.pdf?v=2" or ".../report.pdf#page=3".
            return urlparse(lowered).path.endswith(".pdf")
        except ValueError:
            # urlparse rejects some malformed URLs; treat those as non-PDF.
            return False

    def search_with_tavily(query: str) -> str | None:
        """Return the first PDF URL among Tavily's results, or None."""
        try:
            logger.info(f"Searching Tavily for: {query}")
            result = tavily_client.search(query=query, search_depth="advanced", max_results=max_results)
            # .get() so one result without a "url" key cannot abort the scan
            # and discard the remaining, possibly valid, results.
            urls = [res.get("url") for res in result.get("results", []) if is_pdf_url(res.get("url"))]
            if urls:
                logger.info(f"Found PDF via Tavily: {urls[0]}")
                return urls[0]
            logger.info("No PDF found via Tavily.")
        except Exception as e:
            # Best-effort search: failures are logged and reported as "not found".
            logger.error(f"Tavily search error: {e}")
        return None

    def search_with_duckduckgo(query: str, max_results: int) -> str | None:
        """Return the first PDF URL among DuckDuckGo's results, or None."""
        try:
            logger.info(f"Searching DuckDuckGo for: {query}")
            with DDGS() as ddgs:
                search_results = ddgs.text(query.strip(), max_results=max_results)
                for result in search_results:
                    # "or ''" guards against an explicit None href.
                    pdf_url = result.get('href') or ''
                    if is_pdf_url(pdf_url):
                        logger.info(f"Found PDF via DuckDuckGo: {pdf_url}")
                        return pdf_url
                    logger.info(f"Skipped non-PDF link: {pdf_url}")
        except Exception as error:
            # Best-effort search: failures are logged and reported as "not found".
            logger.error(f"DuckDuckGo search error: {error}")
        return None

    # Compose search query
    query = f"{company_name} sustainability report filetype:pdf"
    if year:
        query += f" {year}"

    logger.info(f"Starting sustainability report search for '{company_name}', year={year}, using '{search_engine}' engine.")

    # Perform search according to engine selection
    if search_engine == "tavily":
        return search_with_tavily(query)

    if search_engine == "duckduckgo":
        return search_with_duckduckgo(query, max_results=max_results)

    if search_engine == "both":
        # Tavily first; DuckDuckGo only when Tavily produced nothing.
        pdf_url = search_with_tavily(query)
        if not pdf_url:
            pdf_url = search_with_duckduckgo(query, max_results=max_results)
        return pdf_url

    logger.error(f"Invalid search engine option provided: {search_engine}")
    raise ValueError(f"Invalid search engine '{search_engine}'. Choose from 'tavily', 'duckduckgo', or 'both'.")