Vela committed
Commit 172e21d · 1 Parent(s): 22481bd

added agentic framework

application/agents/extractor_agent.py ADDED
@@ -0,0 +1,35 @@
+ from dotenv import load_dotenv
+ from langchain_openai import ChatOpenAI
+
+ from application.tools.emission_data_extractor import extract_emission_data_as_json
+ from application.services.langgraph_service import create_agent
+ from application.utils.logger import get_logger
+
+ load_dotenv()
+ logger = get_logger()
+
+ EXTRACTOR_SYSTEM_PROMPT = """
+ You are an intelligent assistant specialized in extracting emission-related ESG (Environmental, Social, and Governance) data from PDF documents.
+
+ You have access to the following tool:
+ - **extract_emission_data_as_json**: Use this tool to upload a PDF and extract structured emission-related information as a JSON response.
+
+ Instructions:
+ - Your task is to extract only emission-related ESG data, such as carbon emissions, Scope 1, Scope 2, Scope 3 emissions, and other relevant sustainability metrics.
+ - Always attempt to return structured JSON data if possible.
+ - If structured data cannot be extracted cleanly, ensure that the raw response from the document is returned under a "raw_response" field.
+ - Do not make assumptions or hallucinate missing values; extract only what is explicitly present in the document.
+ - Always prioritize extracting the latest, most clearly defined data from the PDF.
+ - Do not summarize, analyze, or interpret the document; your only role is **accurate data extraction**.
+
+ Goal:
+ - Accurately upload the PDF.
+ - Extract the requested emission-related ESG data in a clean JSON format.
+ - Handle edge cases gracefully (e.g., invalid PDFs, no emission data found).
+
+ Behave like a highly precise and reliable data extraction engine.
+ """
+
+ llm = ChatOpenAI(model='gpt-4o-mini', temperature=0)
+
+ extractor_agent = create_agent(llm, [extract_emission_data_as_json], EXTRACTOR_SYSTEM_PROMPT)
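For orientation, a minimal usage sketch (the file path is hypothetical; assumes the `application` package is importable and `OPENAI_API_KEY` is set). The executor returned by `create_agent` takes a `messages` list and returns a dict with an `output` key:

```python
# Minimal sketch: invoke the extractor agent on a local report (hypothetical path).
from langchain_core.messages import HumanMessage
from application.agents.extractor_agent import extractor_agent

result = extractor_agent.invoke(
    {"messages": [HumanMessage(content="Extract emission data from reports/acme-esg-2023.pdf")]}
)
print(result["output"])  # AgentExecutor returns a dict with an "output" key
```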
application/agents/scraper_agent.py ADDED
@@ -0,0 +1,214 @@
+ import os
+ import json
+ from typing import TypedDict, Annotated, List
+ from dotenv import load_dotenv
+ from langchain_core.messages import ToolMessage, HumanMessage
+ from langchain_openai import ChatOpenAI
+ from langgraph.graph import StateGraph, END
+ from langgraph.graph.message import add_messages
+
+ # Local Imports
+ from application.tools.web_search_tools import get_top_companies_from_web, get_sustainability_report_pdf
+ from application.tools.pdf_downloader_tool import download_pdf
+ from application.tools.emission_data_extractor import extract_emission_data_as_json
+ from application.services.langgraph_service import create_agent
+ from application.utils.logger import get_logger
+
+ # Set up environment and logger
+ load_dotenv()
+ logger = get_logger()
+
+ # LangSmith tracing
+ LANGSMITH_API_KEY = os.getenv('LANGSMITH_API_KEY')
+ os.environ['LANGSMITH_API_KEY'] = LANGSMITH_API_KEY
+ os.environ['LANGCHAIN_TRACING_V2'] = 'true'
+ os.environ["LANGCHAIN_PROJECT"] = "Sustainability_AI"
+
+ # OpenAI API key setup (assumes the key is already present in the environment)
+ os.environ['OPENAI_API_KEY'] = os.environ.get("OPENAI_API_KEY")
+
+
+ class AgentState(TypedDict):
+     messages: Annotated[List, add_messages]
+
+ model = ChatOpenAI(model='gpt-4o-mini', temperature=0)
+ tools = [get_top_companies_from_web, get_sustainability_report_pdf, download_pdf, extract_emission_data_as_json]
+ model_with_tools = model.bind_tools(tools)
+
+ def invoke_model(state: AgentState) -> dict:
+     """Invokes the LLM with the current conversation history."""
+     logger.info("--- Invoking Model ---")
+     # LangGraph automatically passes the entire state;
+     # model_with_tools expects a list of BaseMessages.
+     response = model_with_tools.invoke(state['messages'])
+     # logger.info(f"Model response: {response}")
+     # Return a dictionary keyed by the state field name; the response is already an AIMessage.
+     return {"messages": [response]}
+
+ def invoke_tools(state: AgentState) -> dict:
+     """Invokes the necessary tools based on the last AI message."""
+     logger.info("--- Invoking Tools ---")
+     # The state contains the history; the last message is the AI's request.
+     last_message = state['messages'][-1]
+
+     # Check if the last message is an AIMessage with tool_calls.
+     if not hasattr(last_message, 'tool_calls') or not last_message.tool_calls:
+         logger.info("No tool calls found in the last message.")
+         # This scenario might indicate the conversation should end or requires clarification.
+         # For now, return an empty dict, which won't update the state significantly.
+         return {}
+         # Alternative: return {"messages": [SystemMessage(content="No tool calls requested.")]}
+
+     tool_invocation_messages = []
+
+     # Find the tool object by name
+     tool_map = {tool.name: tool for tool in tools}
+
+     for tool_call in last_message.tool_calls:
+         tool_name = tool_call['name']
+         tool_args = tool_call['args']
+         tool_call_id = tool_call['id']  # Crucial for linking the result
+
+         logger.info(f"Executing tool: {tool_name} with args: {tool_args}")
+
+         if tool_name in tool_map:
+             selected_tool = tool_map[tool_name]
+             try:
+                 # Use the tool's invoke method, passing the arguments dictionary.
+                 result = selected_tool.invoke(tool_args)
+
+                 # IMPORTANT: Convert the result to a string or a JSON-serializable format
+                 # if it's a complex object; ToolMessage content should be simple.
+                 # Adjust this based on what your tools actually return.
+                 if isinstance(result, (list, dict)):
+                     result_content = json.dumps(result)  # Convert dict/list to JSON string
+                 elif hasattr(result, 'companies') and isinstance(result.companies, list):  # Handle CompanyListResponse
+                     result_content = f"Companies found: {', '.join(result.companies)}"
+                 elif result is None:
+                     result_content = "Tool executed successfully, but returned no specific data (None)."
+                 else:
+                     result_content = str(result)  # Default to string conversion
+
+                 logger.info(f"Tool {tool_name} result: {result_content}")
+                 tool_invocation_messages.append(
+                     ToolMessage(content=result_content, tool_call_id=tool_call_id)
+                 )
+             except Exception as e:
+                 logger.error(f"Error executing tool {tool_name}: {e}")
+                 # Return an error message in the ToolMessage.
+                 tool_invocation_messages.append(
+                     ToolMessage(content=f"Error executing tool {tool_name}: {str(e)}", tool_call_id=tool_call_id)
+                 )
+         else:
+             logger.warning(f"Tool '{tool_name}' not found.")
+             tool_invocation_messages.append(
+                 ToolMessage(content=f"Error: Tool '{tool_name}' not found.", tool_call_id=tool_call_id)
+             )
+
+     # Return the collected ToolMessages to be added to the state.
+     return {"messages": tool_invocation_messages}
+
+ # --- Graph Definition ---
+ graph_builder = StateGraph(AgentState)
+
+ # Add nodes
+ graph_builder.add_node("scraper_agent", invoke_model)
+ graph_builder.add_node("tools", invoke_tools)
+
+ # Define edges
+ graph_builder.set_entry_point("scraper_agent")
+
+ # Conditional edge: after the agent runs, decide whether to call tools or end.
+ def router(state: AgentState) -> str:
+     """Determines the next step based on the last message."""
+     last_message = state['messages'][-1]
+     if hasattr(last_message, 'tool_calls') and last_message.tool_calls:
+         # If the AI message has tool calls, invoke the tools node.
+         logger.info("--- Routing to Tools ---")
+         return "tools"
+     else:
+         # Otherwise, the conversation can end.
+         logger.info("--- Routing to End ---")
+         return END
+
+ graph_builder.add_conditional_edges(
+     "scraper_agent",
+     router,
+     {
+         "tools": "tools",  # If router returns "tools", go to the "tools" node
+         END: END,          # If router returns END, finish the graph execution
+     }
+ )
+
+ # After tools are invoked, their results (ToolMessages) go back to the agent.
+ graph_builder.add_edge("tools", "scraper_agent")
+
+ # Compile the graph
+ app = graph_builder.compile()
+
+ # # --- Running the Graph ---
+ # if __name__ == "__main__":
+ #     logger.info("Starting graph execution...")
+ #     # Use HumanMessage for the initial input
+ #     initial_input = {"messages": [HumanMessage(content="Please download this pdf https://www.infosys.com/sustainability/documents/infosys-esg-report-2023-24.pdf")]}
+
+ #     # Stream events to see the flow (optional, but helpful for debugging).
+ #     # Add a recursion limit to prevent infinite loops.
+ #     try:
+ #         final_state = None
+ #         for event in app.stream(initial_input, {"recursion_limit": 15}):
+ #             # event is a dictionary where keys are node names and values are outputs
+ #             logger.info(f"Event: {event}")
+ #             # Keep track of the latest state if needed, especially the messages
+ #             if "scraper_agent" in event:
+ #                 final_state = event["scraper_agent"]
+ #             elif "tools" in event:
+ #                 final_state = event["tools"]  # Though tool output doesn't directly give full state
+ #             logger.info("---")
+
+ #         logger.info("\n--- Final State Messages ---")
+ #         # To get the absolute final state after streaming, invoke might be simpler,
+ #         # or you need to properly aggregate the state from the stream events.
+ #         # A simpler way to get the final output:
+ #         final_output = app.invoke(initial_input, {"recursion_limit": 15})
+ #         logger.info(json.dumps(final_output['messages'][-1].dict(), indent=2))  # Print the last message
+
+ #     except Exception as e:
+ #         logger.error("\n--- An error occurred during graph execution ---")
+ #         import traceback
+ #         traceback.print_exc()
+
+
+ SCRAPER_SYSTEM_PROMPT = """
+ You are an intelligent assistant specialized in company research and sustainability report retrieval.
+
+ You have access to the following tools:
+ - **search_tool**: Use this tool when the user asks for a list of top companies related to an industry or category (e.g., "top 5 textile companies"). Always preserve any number mentioned (e.g., 'top 5', 'top 10') in the query.
+ - **pdf_finder_tool**: Use this tool when the user requests a sustainability report or any other specific PDF document about a company. Search specifically for the latest sustainability report if not otherwise specified.
+ - **pdf_downloader_tool**: Use this tool when the user provides a direct PDF link or asks you to download a PDF document from a URL.
+
+ Instructions:
+ - Carefully read the user's request and select the correct tool based on their intent.
+ - Always preserve important details like quantity (e.g., "top 5"), industry, or company name.
+ - If the user mentions multiple companies and asks for reports, find reports for **each** company individually.
+ - Do not add assumptions, opinions, or unrelated information.
+ - Always generate clean, direct, and minimal input for the tool — close to the user's original query.
+ - Prioritize the most recent information when searching for reports unless otherwise instructed.
+
+ Goal:
+ - Select the appropriate tool.
+ - Build a precise query that perfectly reflects the user's request.
+ - Return only what the user asks — no extra text or interpretation.
+ """
+
+ search_tool = get_top_companies_from_web
+ pdf_finder_tool = get_sustainability_report_pdf
+ pdf_downloader_tool = download_pdf
+
+ llm = ChatOpenAI(model='gpt-4o-mini', temperature=0)
+
+ scraper_agent = create_agent(llm, [search_tool, pdf_finder_tool, pdf_downloader_tool], SCRAPER_SYSTEM_PROMPT)
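A minimal invocation sketch for the compiled graph, mirroring the commented `__main__` block above (the query is illustrative):

```python
# Minimal sketch: drive the compiled LangGraph app with one user message.
from langchain_core.messages import HumanMessage
from application.agents.scraper_agent import app

state = app.invoke(
    {"messages": [HumanMessage(content="Find the latest sustainability report for Puma")]},
    {"recursion_limit": 15},  # guards against endless model <-> tools loops
)
print(state["messages"][-1].content)  # final AI reply after any tool round-trips
```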
application/services/gemini_model.py CHANGED
@@ -10,22 +10,57 @@ logger=logger.get_logger()
 
 client = genai.Client(api_key=os.getenv("gemini_api_key"))
 
- PROMPT = (
- """You are a PDF parsing agent. Your job is to extract GHG Protocol Parameters
- and ESG (Environmental, Social, Governance) Data from a company’s sustainability
- or ESG report in PDF format.
-
- You must extract the data based on a predefined response schema. It is critical
- that you return all keys specified in the schema, even if the value is not present
- or not found in the document. If a value is missing or unavailable, return a suitable
- placeholder according to the format used
- in the schema.
-
- Your output should strictly follow the structure of the schema, ensuring completeness
- and consistency for downstream processing.
-
- Be precise in extracting values and identifying relevant context from the PDF. Use
- surrounding text or tables to identify the most likely match for each field.
+ # PROMPT = (
+ # """You are a PDF parsing agent. Your job is to extract GHG Protocol Parameters
+ # and ESG (Environmental, Social, Governance) Data from a company’s sustainability
+ # or ESG report in PDF format.
+
+ # You must extract the data based on a predefined response schema. It is critical
+ # that you return all keys specified in the schema, even if the value is not present
+ # or not found in the document. If a value is missing or unavailable, return a suitable
+ # placeholder according to the format used
+ # in the schema.
+
+ # Your output should strictly follow the structure of the schema, ensuring completeness
+ # and consistency for downstream processing.
+
+ # Be precise in extracting values and identifying relevant context from the PDF. Use
+ # surrounding text or tables to identify the most likely match for each field.
+ # """
+ # )
+
+ PROMPT = (
+ """You are a PDF parsing agent specialized in extracting structured sustainability data from a company's Sustainability, ESG, or Corporate Responsibility Report in PDF format.
+ Your task is to extract Greenhouse Gas (GHG) Protocol, Environmental (CSRD), Materiality, Net Zero Interventions, and ESG (Environmental, Social, Governance) Data with high accuracy and consistency for downstream processing.
+
+ ### Instructions:
+ 1. **Schema Adherence**: Strictly follow the provided schema for output structure. Ensure every field in the schema is populated with either extracted data or a placeholder.
+ 2. **Data Sources**: Extract data from all relevant sections of the PDF, including:
+    - Narrative text
+    - Tables
+    - Infographics, charts, or visual elements (interpret labels, captions, or legends to extract numerical or textual data)
+    - Footnotes or appendices
+ 3. **Infographic Handling**: For infographics, prioritize:
+    - Text labels or annotations within the graphic
+    - Captions or descriptions near the infographic
+    - Legends or keys that clarify values
+    - If values are ambiguous, cross-reference with narrative text or tables discussing similar metrics.
+ 4. **Year and Scope**: Identify the reporting year and scope (e.g., global, regional) for each metric. If not explicitly stated, infer from the report's context (e.g., '2023 Sustainability Report' implies 2023 data).
+ 5. **Edge Cases**:
+    - If data is missing, use placeholders as specified in the schema.
+    - If multiple values exist for a field (e.g., emissions for different years), select the most recent year unless otherwise specified in the schema.
+
+ ### Output Requirements:
+ - Return a JSON object adhering to the schema.
+ - Ensure all fields are populated, using placeholders for missing data.
+ - Include a 'notes' field in the output for any assumptions, estimations, or conflicts encountered during extraction.
+
+ ### Task:
+ - Parse the PDF thoroughly to extract all relevant data.
+ - Ensure consistency in units, years, and terminology across the output.
+ - Handle infographics with care, prioritizing textual data and flagging estimates.
+ - Provide a complete, schema-compliant JSON output with notes for any ambiguities or assumptions.
 """
 )
@@ -176,6 +211,7 @@ def extract_emissions_data_as_json(
         config={
             'response_mime_type': 'application/json',
             'response_schema': response_schema,
+            'temperature': 0.0,
         },
     )
     if hasattr(response, 'usage_metadata'):
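The second hunk pins `temperature` to 0.0 so the schema-constrained extraction is deterministic: repeated runs over the same PDF yield stable JSON for downstream comparison and storage.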
application/services/langgraph_service.py ADDED
@@ -0,0 +1,22 @@
+ from typing import List
+ from langchain.agents import AgentExecutor, create_openai_tools_agent
+ from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
+ from langchain_core.tools import BaseTool
+ from langchain_openai import ChatOpenAI
+
+ def create_agent(
+     llm: ChatOpenAI,
+     tools: List[BaseTool],
+     system_prompt: str
+ ) -> AgentExecutor:
+     """Create an agent executor with given tools and a system prompt."""
+     prompt = ChatPromptTemplate.from_messages(
+         [
+             ("system", system_prompt),
+             MessagesPlaceholder(variable_name="messages"),
+             MessagesPlaceholder(variable_name="agent_scratchpad"),
+         ]
+     )
+     agent = create_openai_tools_agent(llm, tools, prompt)
+     executor = AgentExecutor(agent=agent, tools=tools)
+     return executor
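A minimal wiring sketch (the `echo_tool` below is hypothetical). Because the prompt uses a `messages` placeholder, the executor is invoked with a `messages` list rather than a plain `input` string:

```python
# Minimal sketch: build and run an executor with a hypothetical tool.
from langchain_core.messages import HumanMessage
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI

@tool
def echo_tool(text: str) -> str:
    """Echoes the input text back."""
    return text

llm = ChatOpenAI(model='gpt-4o-mini', temperature=0)
executor = create_agent(llm, [echo_tool], "You are a helpful assistant.")
result = executor.invoke({"messages": [HumanMessage(content="Say hello.")]})
print(result["output"])  # AgentExecutor returns a dict with an "output" key
```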
application/services/mongo_db_service.py ADDED
@@ -0,0 +1,87 @@
+ import os
+ from pymongo import MongoClient
+ from dotenv import load_dotenv
+ from typing import List, Dict, Optional
+ from application.utils.logger import get_logger
+
+ logger = get_logger()
+
+ load_dotenv()
+
+ DB_NAME = "sustainability_reports_db"
+
+ def get_mongo_client():
+     try:
+         client = MongoClient(os.getenv("MONGODB_URI"))
+         return client
+     except Exception as e:
+         logger.exception(f"An unexpected error occurred while connecting to MongoDB: {str(e)}")
+         return None
+
+ def store_document(collection_name: str, document: Dict) -> Optional[str]:
+     """
+     Stores a document in MongoDB if it doesn't already exist.
+
+     Args:
+         collection_name (str): Name of the MongoDB collection.
+         document (Dict): The document to be inserted.
+
+     Returns:
+         Optional[str]: Inserted document ID if successful, None otherwise.
+     """
+     try:
+         client = get_mongo_client()
+         if client is None:
+             logger.error("MongoDB client is not available.")
+             return None
+
+         db = client.get_database(DB_NAME)
+         collection = db[collection_name]
+
+         # Check if a similar document already exists
+         existing_document = collection.find_one(document)
+         if existing_document:
+             logger.info(f"Document already exists with ID: {existing_document['_id']}")
+             return str(existing_document['_id'])
+
+         # If no existing document, insert the new one
+         result = collection.insert_one(document)
+         logger.info(f"New document inserted with ID: {result.inserted_id}")
+         return str(result.inserted_id)
+
+     except Exception as e:
+         logger.exception(f"An unexpected error occurred: {str(e)}")
+         return None
+
+ def retrieve_documents(collection_name: str, query: Optional[Dict] = None) -> List[Dict]:
+     """
+     Retrieves documents from the specified MongoDB collection.
+
+     Args:
+         collection_name (str): Name of the MongoDB collection.
+         query (Optional[Dict]): A MongoDB query filter. Defaults to {} (fetch all documents).
+
+     Returns:
+         List[Dict]: A list of documents matching the query. Empty list if none found or error occurs.
+     """
+     try:
+         client = get_mongo_client()
+         if client is None:
+             logger.error("MongoDB client is not available.")
+             return []
+
+         db = client.get_database(DB_NAME)
+         collection = db[collection_name]
+
+         documents_cursor = collection.find(query or {})
+         documents = list(documents_cursor)
+
+         logger.info(f"Retrieved {len(documents)} documents from collection: {collection_name}")
+         return documents
+
+     except Exception as e:
+         logger.exception(f"An error occurred while retrieving documents: {str(e)}")
+         return []
+
+ # all_docs = retrieve_documents("Zalando")
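A minimal usage sketch (values hypothetical), following the commented `retrieve_documents("Zalando")` example above. Because `store_document` deduplicates via `find_one(document)`, re-inserting an identical document returns the existing `_id` instead of creating a duplicate:

```python
# Minimal sketch: one collection per company, hypothetical payload.
doc_id = store_document(
    "Zalando",
    {"Greenhouse Gas (GHG) Protocol Parameters": {"Scope 1 Emissions": "12,345 tCO2e"}},
)
docs = retrieve_documents("Zalando")
print(doc_id, len(docs))
```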
application/tools/emission_data_extractor.py ADDED
@@ -0,0 +1,115 @@
+ import os
+ import json
+ from typing import Optional, Dict, Union, BinaryIO
+ from google import genai
+ from langchain_core.tools import tool
+ from application.utils.logger import get_logger
+ from application.services.gemini_model import upload_file
+ from application.services.mongo_db_service import store_document
+ from application.schemas.response_schema import GEMINI_GHG_PARAMETERS
+
+ logger = get_logger()
+
+ client = genai.Client(api_key=os.getenv("gemini_api_key"))
+ MODEL = "gemini-2.0-flash"
+
+
+ PROMPT = (
+ """You are a PDF parsing agent specialized in extracting structured sustainability data from a company's Sustainability, ESG, or Corporate Responsibility Report in PDF format.
+ Your task is to extract Greenhouse Gas (GHG) Protocol, Environmental (CSRD), Materiality, Net Zero Interventions, and ESG (Environmental, Social, Governance) Data with high accuracy and consistency for downstream processing.
+
+ ### Instructions:
+ 1. **Schema Adherence**: Strictly follow the provided schema for output structure. Ensure every field in the schema is populated with either extracted data or a placeholder.
+ 2. **Data Sources**: Extract data from all relevant sections of the PDF, including:
+    - Narrative text
+    - Tables
+    - Infographics, charts, or visual elements (interpret labels, captions, or legends to extract numerical or textual data)
+    - Footnotes or appendices
+ 3. **Infographic Handling**: For infographics, prioritize:
+    - Text labels or annotations within the graphic
+    - Captions or descriptions near the infographic
+    - Legends or keys that clarify values
+    - If values are ambiguous, cross-reference with narrative text or tables discussing similar metrics.
+ 4. **Year and Scope**: Identify the reporting year and scope (e.g., global, regional) for each metric. If not explicitly stated, infer from the report's context (e.g., '2023 Sustainability Report' implies 2023 data).
+ 5. **Edge Cases**:
+    - If data is missing, use placeholders as specified in the schema.
+    - If multiple values exist for a field (e.g., emissions for different years), select the most recent year unless otherwise specified in the schema.
+
+ ### Output Requirements:
+ - Return a JSON object adhering to the schema.
+ - Ensure all fields are populated, using placeholders for missing data.
+ - Include a 'notes' field in the output for any assumptions, estimations, or conflicts encountered during extraction.
+
+ ### Task:
+ - Parse the PDF thoroughly to extract all relevant data.
+ - Ensure consistency in units, years, and terminology across the output.
+ - Handle infographics with care, prioritizing textual data and flagging estimates.
+ - Provide a complete, schema-compliant JSON output with notes for any ambiguities or assumptions.
+ """
+ )
+
+ @tool
+ def extract_emission_data_as_json(file_input: Union[BinaryIO, bytes, str]) -> Optional[Dict]:
+     """
+     Extracts emission-related ESG data from a PDF file using the Gemini API.
+
+     This function uploads the provided PDF (local file path, binary file, or byte stream) to Gemini,
+     sends a structured prompt to extract relevant emission data, and attempts to parse the response as JSON.
+
+     Args:
+         file_input (Union[BinaryIO, bytes, str]):
+             The input file to process. Can be a file object, byte stream, or local file path.
+
+     Returns:
+         Optional[Dict]:
+             A dictionary containing the extracted emission data if parsing succeeds,
+             or a dictionary with the raw text response if JSON parsing fails.
+             Returns None if the extraction process encounters an error.
+
+     Raises:
+         Exception:
+             Logs and handles any unexpected errors during file upload, Gemini API interaction, or response parsing.
+
+     Notes:
+         - The function automatically handles uploading if the file is not already present on Gemini.
+         - If the response is not valid JSON, the raw response text is returned under the key "raw_response".
+         - Token usage information (input, output, total tokens) is logged if available.
+     """
+     try:
+         uploaded_file = upload_file(file=file_input)
+
+         response = client.models.generate_content(
+             model=MODEL,
+             contents=[uploaded_file, PROMPT],
+             config={
+                 'response_mime_type': 'application/json',
+                 'response_schema': GEMINI_GHG_PARAMETERS,
+                 'temperature': 0.0,
+             },
+         )
+         if hasattr(response, 'usage_metadata'):
+             logger.info(f"Input tokens: {response.usage_metadata.prompt_token_count}")
+             logger.info(f"Output tokens: {response.usage_metadata.candidates_token_count}")
+             logger.info(f"Total tokens: {response.usage_metadata.total_token_count}")
+         else:
+             logger.info("Token usage metadata not available in response")
+
+         logger.info("[Gemini] Response received.")
+         try:
+             result = json.loads(response.text)
+             file_name = result.get('Company Name', 'Unknown Company')
+             document = {"Greenhouse Gas (GHG) Protocol Parameters": result.get('Greenhouse Gas (GHG) Protocol Parameters')}
+             store_document(file_name, document)
+             return result
+         except json.JSONDecodeError:
+             logger.warning("Failed to parse JSON, returning raw response.")
+             return {"raw_response": response.text}
+
+     except Exception:
+         logger.exception("Error during ESG data extraction.")
+         return None
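A minimal usage sketch (the path is hypothetical). Because `extract_emission_data_as_json` is a LangChain tool, it is called with `.invoke()` and an argument dict rather than as a plain function:

```python
# Minimal sketch: call the tool directly on a local PDF (hypothetical path).
result = extract_emission_data_as_json.invoke({"file_input": "reports/acme-esg-2023.pdf"})
if result is None:
    print("Extraction failed; see logs.")
elif "raw_response" in result:
    print("Gemini returned non-JSON output.")
else:
    print(result.get("Company Name", "Unknown Company"))
```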
application/tools/pdf_downloader_tool.py ADDED
@@ -0,0 +1,63 @@
+ import os
+ import requests
+ from application.utils.logger import get_logger
+
+ from langchain_core.tools import tool
+
+
+ logger = get_logger()
+
+ @tool
+ def download_pdf(filename: str, url: str, save_path: str = "reports", overwrite: bool = False):
+     """
+     Downloads a PDF file from a given URL ('url') and saves it locally
+     with the specified 'filename'. Returns the local path if successful, otherwise None.
+     Use this tool AFTER get_sustainability_report_pdf has returned a valid PDF link, or if the user provides the PDF link.
+
+     Args:
+         filename (str): The name to save the PDF as (should end with .pdf).
+         url (str): The direct URL to the PDF file.
+         save_path (str): The directory to save the PDF into (default: "reports").
+         overwrite (bool): Whether to overwrite the file if it already exists.
+
+     Returns:
+         str | None: The path to the saved file if successful, otherwise None.
+     """
+     try:
+         # parsed_url = urlparse(url)
+         # filename = os.path.basename(parsed_url.path)
+
+         if not filename.lower().endswith(".pdf"):
+             logger.warning(f"Filename does not end with .pdf: {filename}")
+             return None
+
+         os.makedirs(save_path, exist_ok=True)
+         full_path = os.path.join(save_path, filename)
+
+         if os.path.exists(full_path) and not overwrite:
+             logger.info(f"File already exists, skipping download: {full_path}")
+             return full_path
+
+         logger.info(f"Starting download from {url}")
+
+         response = requests.get(url, stream=True, timeout=20)
+         response.raise_for_status()
+
+         with open(full_path, "wb") as file:
+             for chunk in response.iter_content(chunk_size=8192):
+                 if chunk:
+                     file.write(chunk)
+
+         logger.info(f"Successfully downloaded to: {full_path}")
+         return full_path
+
+     except requests.exceptions.Timeout:
+         logger.error(f"Timeout while downloading {url}")
+     except requests.exceptions.HTTPError as http_err:
+         logger.error(f"HTTP error while downloading {url}: {http_err}")
+     except requests.exceptions.RequestException as req_err:
+         logger.error(f"Request error while downloading {url}: {req_err}")
+     except Exception as e:
+         logger.error(f"Unexpected error: {e}")
+
+     return None
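A minimal usage sketch (the URL is hypothetical):

```python
# Minimal sketch: invoked as a LangChain tool with a dict of arguments.
path = download_pdf.invoke({
    "filename": "acme-sustainability-2023.pdf",
    "url": "https://example.com/acme-sustainability-2023.pdf",  # hypothetical URL
})
print(path)  # e.g. "reports/acme-sustainability-2023.pdf", or None on failure
```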
application/tools/web_search_tools.py ADDED
@@ -0,0 +1,150 @@
+ import os
+ import ast
+ from dotenv import load_dotenv
+ from openai import OpenAI
+ from pydantic import BaseModel
+ from typing import List, Literal
+ from application.utils.logger import get_logger
+ from duckduckgo_search import DDGS
+ from tavily import TavilyClient
+ from langchain_core.tools import tool
+
+ logger = get_logger()
+ load_dotenv()
+ os.makedirs("reports", exist_ok=True)
+
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+ tavily_client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
+ client = OpenAI(api_key=OPENAI_API_KEY)
+
+ class CompanyListResponse(BaseModel):
+     companies: List[str]
+
+ # parsed_list = ['Puma', 'Gap', 'PVH Corp.', 'GUESS', 'Hugo Boss']
+
+ @tool
+ def get_top_companies_from_web(query: str):
+     """
+     Searches the web for a list of top companies based on a given query.
+
+     Extracts the number of companies from the query if specified; defaults to 5 otherwise.
+     Returns only the specified number of company names in a list format.
+
+     Args:
+         query (str): The search query from the user.
+
+     Returns:
+         CompanyListResponse: A structured list of top company names.
+     """
+     prompt = (
+         f"{query} "
+         "focusing on globally recognized companies known for size, influence, or sustainability efforts. "
+         "Respond with a Python list of company names only, no explanation. "
+         "Example: ['Company A', 'Company B', 'Company C']. "
+         "Please do not include any other text or formatting."
+     )
+     logger.info(f'User query : {query}')
+     output = ""
+     try:
+         response = client.responses.create(
+             model="gpt-4o-mini",
+             tools=[{"type": "web_search_preview"}],
+             input=prompt,
+         )
+
+         output = response.output_text
+         # logger.info(f"Raw Output: {output}")
+         # ast.literal_eval safely parses the Python-list literal (avoids eval on model output).
+         parsed_list = ast.literal_eval(output.strip())
+         logger.info(f"Parsed List: {parsed_list}")
+         result = CompanyListResponse(companies=parsed_list)
+         return result
+     except Exception as e:
+         logger.error(f"Error parsing response: {e}")
+         raise ValueError(f"Failed to parse company list: {output}")
+
+ @tool
+ def get_sustainability_report_pdf(
+     company_name: str,
+     year: int | None = None,
+     max_results: int = 1,
+     search_engine: Literal["tavily", "duckduckgo", "both"] = "duckduckgo",
+ ) -> str | None:
+     """
+     Finds and returns the direct PDF link for the sustainability report of a SPECIFIC, NAMED company.
+     Use this tool when the user provides the exact name of the company they want the report for.
+     Optionally, a specific 'year' can be provided.
+
+     Args:
+         company_name (str): The name of the company.
+         year (int, optional): The year of the sustainability report. Defaults to None.
+         max_results (int, optional): Maximum number of fallback search results to fetch if using DuckDuckGo. Defaults to 1.
+         search_engine (str, optional): Search engine to use. Defaults to "duckduckgo".
+             - "tavily"     : only use Tavily search
+             - "duckduckgo" : only use DuckDuckGo
+             - "both"       : try Tavily first, fall back to DuckDuckGo if needed
+
+     Returns:
+         str or None: The URL of the sustainability report PDF if found, otherwise None.
+
+     Search Strategy:
+         - Tavily: Searches with advanced search settings.
+         - DuckDuckGo: Searches the public web with a 'filetype:pdf' filter.
+         - Only URLs ending with '.pdf' are considered valid.
+
+     Notes:
+         - Any search failures are internally handled and logged.
+     """
+
+     def search_with_tavily(query: str) -> str | None:
+         try:
+             logger.info(f"Searching Tavily for: {query}")
+             result = tavily_client.search(query=query, search_depth="advanced", max_results=max_results)
+             urls = [res["url"] for res in result.get("results", []) if res["url"].lower().endswith(".pdf")]
+             if urls:
+                 logger.info(f"Found PDF via Tavily: {urls[0]}")
+                 return urls[0]
+             logger.info("No PDF found via Tavily.")
+         except Exception as e:
+             logger.error(f"Tavily search error: {e}")
+         return None
+
+     def search_with_duckduckgo(query: str, max_results: int) -> str | None:
+         try:
+             logger.info(f"Searching DuckDuckGo for: {query}")
+             with DDGS() as ddgs:
+                 search_results = ddgs.text(query.strip(), max_results=max_results)
+                 for result in search_results:
+                     pdf_url = result.get('href', '')
+                     if pdf_url.lower().endswith('.pdf'):
+                         logger.info(f"Found PDF via DuckDuckGo: {pdf_url}")
+                         return pdf_url
+                     else:
+                         logger.info(f"Skipped non-PDF link: {pdf_url}")
+         except Exception as error:
+             logger.error(f"DuckDuckGo search error: {error}")
+         return None
+
+     # Compose search query
+     query = f"{company_name} sustainability report filetype:pdf"
+     if year:
+         query += f" {year}"
+
+     logger.info(f"Starting sustainability report search for '{company_name}', year={year}, using '{search_engine}' engine.")
+
+     # Perform search according to engine selection
+     if search_engine == "tavily":
+         return search_with_tavily(query)
+
+     elif search_engine == "duckduckgo":
+         return search_with_duckduckgo(query, max_results=max_results)
+
+     elif search_engine == "both":
+         pdf_url = search_with_tavily(query)
+         if not pdf_url:
+             pdf_url = search_with_duckduckgo(query, max_results=max_results)
+         return pdf_url
+
+     else:
+         logger.error(f"Invalid search engine option provided: {search_engine}")
+         raise ValueError(f"Invalid search engine '{search_engine}'. Choose from 'tavily', 'duckduckgo', or 'both'.")
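A minimal usage sketch for both tools (the company and year are illustrative):

```python
# Minimal sketch: both tools are invoked with argument dicts.
companies = get_top_companies_from_web.invoke({"query": "top 3 textile companies"})
print(companies.companies)

pdf_url = get_sustainability_report_pdf.invoke({
    "company_name": "Puma",
    "year": 2023,
    "search_engine": "both",  # Tavily first, DuckDuckGo as fallback
})
print(pdf_url)
```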
pages/chatbot.py ADDED
@@ -0,0 +1,102 @@
+ import streamlit as st
+ from dotenv import load_dotenv
+
+ from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
+ # from application.agents.scraper_agent import app
+ # from application.utils.logger import get_logger
+
+ try:
+     from application.agents.scraper_agent import app
+     # from application.main import graph
+     from application.utils.logger import get_logger
+ except ImportError as e:
+     st.error(f"Import Error: Ensure backend modules are accessible. Details: {e}")
+     st.stop()
+
+ logger = get_logger()
+
+ st.set_page_config(page_title="Sustainability AI Assistant", layout="wide")
+ st.title("♻️ Sustainability Report AI Assistant")
+ st.caption(
+     "Ask about sustainability reports by company or industry! "
+     "(e.g., 'Get report for Apple', 'Download report for Microsoft 2023', "
+     "'Find reports for top 3 airline companies', 'Download this pdf <link>')"
+ )
+
+ load_dotenv()
+
+ def initialize_chat_history():
+     """Initialize session chat history."""
+     if "messages" not in st.session_state:
+         st.session_state.messages = []
+         logger.info("Initialized empty chat history in session state.")
+
+ def display_chat_history():
+     """Render previous chat messages."""
+     for message in st.session_state.messages:
+         if isinstance(message, SystemMessage):
+             # st.info(f"System: {message.content}")
+             pass
+         elif isinstance(message, HumanMessage):
+             with st.chat_message("user"):
+                 st.markdown(message.content)
+         elif isinstance(message, AIMessage):
+             with st.chat_message("assistant"):
+                 st.markdown(message.content)
+
+ def invoke_agent():
+     """Invoke the LangGraph agent and update session state."""
+     try:
+         graph_input = {"messages": st.session_state.messages}
+         logger.info("Invoking LangGraph agent...")
+
+         # final_output_state = graph.invoke(graph_input, {"recursion_limit": 15})
+
+         final_output_state = app.invoke(graph_input, {"recursion_limit": 15})
+
+         logger.info("Agent invocation completed successfully.")
+         return final_output_state
+
+     except Exception as e:
+         logger.error("Agent invocation failed.", exc_info=True)
+         st.error(f"An error occurred while processing your request: {e}")
+         return None
+
+ def display_last_ai_response():
+     """Display the latest AI message, if any."""
+     last_ai_message = next(
+         (msg for msg in reversed(st.session_state.messages) if isinstance(msg, AIMessage)),
+         None
+     )
+     if last_ai_message:
+         with st.chat_message("assistant"):
+             st.markdown(last_ai_message.content)
+         logger.info("Displayed latest AI response.")
+     else:
+         st.warning("Agent completed without a final AI message.")
+         logger.warning("No AI message found in the final output.")
+
+ initialize_chat_history()
+ display_chat_history()
+
+ if user_query := st.chat_input("Your question about sustainability reports..."):
+     logger.info(f"User input received: {user_query}")
+
+     st.session_state.messages.append(HumanMessage(content=user_query))
+
+     with st.chat_message("user"):
+         st.markdown(user_query)
+
+     with st.spinner("Processing your request... Please wait."):
+         final_output_state = invoke_agent()
+
+     if final_output_state:
+         st.session_state.messages = final_output_state['messages']
+         display_last_ai_response()
+
+ with st.sidebar:
+     st.markdown("---")
+     if st.button("Clear Chat History"):
+         st.session_state.messages = []
+         logger.info("Chat history cleared by user.")
+         st.rerun()
pages/multiple_pdf_extractor.py CHANGED
@@ -24,12 +24,12 @@ AVAILABLE_MODELS = [
 
 RESPONSE_SCHEMAS = {
     "Greenhouse Gas (GHG) Protocol Parameters": GEMINI_GHG_PARAMETERS,
-    "Environmental Parameters (CSRD)": GEMINI_ENVIRONMENTAL_PARAMETERS_CSRD,
-    "Environmental Parameters": GEMINI_ENVIRONMENT_PARAMETERS,
-    "Social Parameters": GEMINI_SOCIAL_PARAMETERS,
-    "Governance Parameters": GEMINI_GOVERNANCE_PARAMETERS,
-    "Materiality Parameters": GEMINI_MATERIALITY_PARAMETERS,
-    "Net Zero Intervention Parameters": GEMINI_NET_ZERO_INTERVENTION_PARAMETERS,
+    # "Environmental Parameters (CSRD)": GEMINI_ENVIRONMENTAL_PARAMETERS_CSRD,
+    # "Environmental Parameters": GEMINI_ENVIRONMENT_PARAMETERS,
+    # "Social Parameters": GEMINI_SOCIAL_PARAMETERS,
+    # "Governance Parameters": GEMINI_GOVERNANCE_PARAMETERS,
+    # "Materiality Parameters": GEMINI_MATERIALITY_PARAMETERS,
+    # "Net Zero Intervention Parameters": GEMINI_NET_ZERO_INTERVENTION_PARAMETERS,
 }
 
 selected_model = st.selectbox("Select Gemini Model", options=AVAILABLE_MODELS)
requirements.txt CHANGED
@@ -6,4 +6,11 @@ google.genai
 google-generativeai
 pandas
 supabase
- openpyxl
+ openpyxl
+ langchain
+ pymongo
+ langgraph
+ langsmith
+ tavily-python
+ duckduckgo-search
+ langchain_openai