Spaces:

VelaTest
/

PDFExtractor

Sleeping

File size: 5,762 Bytes

import os
import json
import re
from typing import Optional, Dict, Union, BinaryIO
import requests
from google import genai
from google.genai import types
from application.utils.logger import get_logger
from application.services.gemini_api_service import upload_file
from application.services.mongo_db_service import store_document
from application.schemas.response_schema import GEMINI_GHG_PARAMETERS
from langchain_core.tools import tool

# Gemini model identifier used for the generate_content request in this module.
MODEL = "gemini-2.0-flash"

# Module-level logger obtained from the application's logging helper.
logger = get_logger()

# Gemini client; the API key is read from the GEMINI_API_KEY environment
# variable once, when this module is imported.
client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY"))


# Instruction text passed to Gemini together with the uploaded PDF (see the
# `contents` argument in extract_emission_data_as_json). The "schema" the text
# refers to is supplied separately through the request's `response_schema`
# (GEMINI_GHG_PARAMETERS). The literal is model input: editing its wording or
# whitespace changes what is sent with every request.
PROMPT = (
    """You are a PDF parsing agent specialized in extracting structured sustainability data from a company's Sustainability, ESG, or Corporate Responsibility Report in PDF format. 

    Your task is to extract Greenhouse Gas (GHG) Protocol, Environmental (CSRD), Materiality, Net Zero Interventions, and ESG (Environmental, Social, Governance) Data with high accuracy and consistency for downstream processing.



    ### Instructions:

    1. **Schema Adherence**: Strictly follow the provided schema for output structure. Ensure every field in the schema is populated with either extracted data or a placeholder.

    2. **Data Sources**: Extract data from all relevant sections of the PDF, including:

       - Narrative text

       - Tables

       - Infographics, charts, or visual elements (interpret labels, captions, or legends to extract numerical or textual data)

       - Footnotes or appendices

    3. **Infographic Handling**: For infographics, prioritize:

       - Text labels or annotations within the graphic

       - Captions or descriptions near the infographic

       - Legends or keys that clarify values

       - If values are ambiguous, cross-reference with narrative text or tables discussing similar metrics.

    4. **Year and Scope**: Identify the reporting year and scope (e.g., global, regional) for each metric. If not explicitly stated, infer from the report's context (e.g., '2023 Sustainability Report' implies 2023 data).

    5. **Edge Cases**:

       - If data is missing, use placeholders as specified in the schema.

       - If multiple values exist for a field (e.g., emissions for different years), select the most recent year unless otherwise specified in the schema.



    ### Output Requirements:

    - Return a JSON object adhering to the schema.

    - Ensure all fields are populated, using placeholders for missing data.

    - Include a 'notes' field in the output for any assumptions, estimations, or conflicts encountered during extraction.





    ### Task:

    - Parse the PDF thoroughly to extract all relevant data.

    - Ensure consistency in units, years, and terminology across the output.

    - Handle infographics with care, prioritizing textual data and flagging estimates.

    - Provide a complete, schema-compliant JSON output with notes for any ambiguities or assumptions.

    """
)

@tool
def extract_emission_data_as_json(file_input: Union[BinaryIO, bytes, str]) -> Optional[Dict]:
    """Extract emission-related ESG data from a PDF file using the Gemini API.

    The input is handed to ``upload_file``; the returned file reference is sent
    to Gemini together with ``PROMPT`` and the ``GEMINI_GHG_PARAMETERS``
    response schema, and the reply text is parsed as JSON. On a successful
    parse, the "Greenhouse Gas (GHG) Protocol Parameters" section is persisted
    via ``store_document`` under the reported company name.

    Args:
        file_input: The input file to process. Can be a file object, byte
            stream, or local file path.

    Returns:
        The parsed JSON object when the response is valid JSON;
        ``{"raw_response": <response text>}`` when the response text is not
        valid JSON; ``None`` when any other error occurs (upload, API call,
        response handling, or storage). Errors are logged, never raised.

    Notes:
        - Token usage (input, output, total) is logged when the response
          carries usage metadata.
        - When the response has no 'Company Name' key, the document is
          stored under 'Unknown Company'.
    """
    try:
        uploaded_file = upload_file(file=file_input)

        response = client.models.generate_content(
            model=MODEL,
            contents=[uploaded_file, PROMPT],
            config={
                'response_mime_type': 'application/json',
                'response_schema': GEMINI_GHG_PARAMETERS,
                'temperature': 0.0,
            },
        )

        # The attribute can exist but be None; hasattr() alone would let the
        # token logging raise and throw away an otherwise valid response.
        usage = getattr(response, 'usage_metadata', None)
        if usage is not None:
            logger.info(f"Input tokens: {usage.prompt_token_count}")
            logger.info(f"Output tokens: {usage.candidates_token_count}")
            logger.info(f"Total tokens: {usage.total_token_count}")
        else:
            logger.info("Token usage metadata not available in response")

        logger.info("[Gemini] Response received.")

        # Keep the JSON-error handler scoped to the parse itself so failures
        # in the storage step are not misreported as a parsing problem.
        try:
            result = json.loads(response.text)
        except json.JSONDecodeError:
            logger.warning("Failed to parse JSON, returning raw response.")
            return {"raw_response": response.text}

        file_name = result.get('Company Name', 'Unknown Company')
        document = {
            "Greenhouse Gas (GHG) Protocol Parameters": result.get(
                'Greenhouse Gas (GHG) Protocol Parameters'
            ),
        }
        store_document(file_name, document)

        # Return the already-parsed object rather than parsing the text again.
        return result

    except Exception:
        # Top-level boundary for the tool: log the traceback and signal
        # failure with None instead of propagating to the caller.
        logger.exception("Error during ESG data extraction.")
        return None