"""Extract greenhouse-gas / ESG metrics from a PDF report via the Gemini API."""

import json
import os
from typing import BinaryIO, Optional, Union

from google import genai
from google.genai import types
from pydantic import BaseModel

from application.utils import logger
from application.schemas.response_schema import GEMINI_RESPONSE_FORMAT

# NOTE: this rebinds the imported ``logger`` module name to the logger instance
# it produces; from here on ``logger`` is the instance, not the module.
logger = logger.get_logger()

# The prompt itself lists no fields; the structure of the answer is driven by
# the ``response_schema`` passed in the request config below.
PROMPT = (
    """You are a PDF parsing agent. Your job is to extract from a company’s sustainability or ESG report in PDF format:
If the values are not found in the document, please return json null for that value.
"""
)


class Parameter(BaseModel):
    """
    A generic class to hold details for a sustainability metric.
    """

    synonym: str
    uom: str
    description: str
    value: str


class GreenhouseGasGHGProtocolParameters(BaseModel):
    """
    Collection of greenhouse-gas and energy metrics, one ``Parameter`` each.
    """

    Total_GHG_Emissions: Parameter
    Scope_1_Emissions: Parameter
    Scope_2_Emissions: Parameter
    Scope_3_Emissions: Parameter
    CO2_Emissions: Parameter
    CH4_Emissions: Parameter
    N2O_Emissions: Parameter
    HFC_Emissions: Parameter
    PFC_Emissions: Parameter
    SF6_Emissions: Parameter
    NF3_Emissions: Parameter
    Biogenic_CO2_Emissions: Parameter
    Emissions_Intensity_per_Revenue: Parameter
    Emissions_Intensity_per_Employee: Parameter
    Base_Year_Emissions: Parameter
    Emissions_Reduction_Target: Parameter
    Emissions_Reduction_Achieved: Parameter
    Energy_Consumption: Parameter
    Renewable_Energy_Consumption: Parameter
    Non_Renewable_Energy_Consumption: Parameter
    Energy_Intensity_per_Revenue: Parameter
    Energy_Intensity_per_Employee: Parameter
    Fuel_Consumption: Parameter
    Electricity_Consumption: Parameter
    Heat_Consumption: Parameter
    Steam_Consumption: Parameter
    Cooling_Consumption: Parameter
    Purchased_Goods_and_Services_Emissions: Parameter
    Capital_Goods_Emissions: Parameter
    Fuel_and_Energy_Related_Activities_Emissions: Parameter
    Upstream_Transportation_and_Distribution_Emissions: Parameter
    Waste_Generated_in_Operations_Emissions: Parameter
    Business_Travel_Emissions: Parameter
    Employee_Commuting_Emissions: Parameter
    Upstream_Leased_Assets_Emissions: Parameter
    # The fields below are disabled in the source; the reason is not recorded
    # here (NOTE(review): possibly a schema-size limit -- confirm before
    # re-enabling).
    # Downstream_Transportation_and_Distribution_Emissions: Parameter
    # Processing_of_Sold_Products_Emissions: Parameter
    # Use_of_Sold_Products_Emissions: Parameter
    # End_of_Life_Treatment_of_Sold_Products_Emissions: Parameter
    # Downstream_Leased_Assets_Emissions: Parameter
    # Franchises_Emissions: Parameter
    # Investments_Emissions: Parameter
    # Carbon_Offsets_Purchased: Parameter
    # Net_GHG_Emissions: Parameter
    # Carbon_Sequestration: Parameter


class EmissionData(BaseModel):
    """
    Top-level wrapper holding the GHG parameter group under a single key.
    """

    GreenhouseGasGHGProtocolParameters: GreenhouseGasGHGProtocolParameters


# print(json.dumps(EmissionData.model_json_schema(), indent=2))


def _read_pdf_bytes(file_input: Union[BinaryIO, bytes]) -> bytes:
    """Return the PDF payload as ``bytes`` for either accepted input form."""
    if isinstance(file_input, (bytes, bytearray, memoryview)):
        # Raw buffers have no ``.read()``; normalise them to immutable bytes.
        return bytes(file_input)
    return file_input.read()


def extract_emissions_data_as_json(
    api: str,
    model: str,
    file_input: Union[BinaryIO, bytes]
) -> Optional[dict]:
    """
    Extract ESG data from a PDF using the Gemini API.

    Args:
        api: Provider name. Accepted for interface compatibility but not
            consulted by this function; the Gemini client is always used.
        model: Model name forwarded to ``generate_content``.
        file_input: File-like object (anything with ``.read()``) or the raw
            bytes of the PDF.

    Returns:
        The response text parsed as JSON. If the text is not valid JSON, a
        dict ``{"raw_response": <text>}`` is returned instead. Any other
        failure is logged and ``None`` is returned.
    """
    try:
        client = genai.Client(api_key=os.getenv("gemini_api_key"))
        file_bytes = _read_pdf_bytes(file_input)

        logger.info("[Gemini] Sending content for generation...")
        response = client.models.generate_content(
            model=model,
            contents=[
                types.Part.from_bytes(data=file_bytes, mime_type="application/pdf"),
                PROMPT,
            ],
            config={
                'response_mime_type': 'application/json',
                'response_schema': GEMINI_RESPONSE_FORMAT,
            },
        )
        logger.info("[Gemini] Response received.")

        try:
            return json.loads(response.text)
        except json.JSONDecodeError:
            logger.warning("Failed to parse JSON, returning raw response.")
            return {"raw_response": response.text}
    except Exception as e:
        # Deliberate best-effort boundary: callers receive None on any failure
        # (client construction, reading the input, the request itself, or a
        # non-string response text).
        logger.exception(f"Error during ESG data extraction.{e}")
        return None


# ---------------------------------------------------------------------------
# Legacy (disabled) implementation, kept for reference.
# ---------------------------------------------------------------------------
# import os
# from google import genai
# from pydantic import BaseModel, Field, ValidationError
# from dotenv import load_dotenv
# from typing import Optional
# from google.genai import types
#
# load_dotenv()
#
# client = genai.Client(api_key=os.getenv("gemini_api_key"))
#
# schema= """{
#   "parameters": [
#     {
#       "parameter": "Total GHG Emissions",
#       "dataType": "Numeric",
#       "synonyms": ["Carbon Footprint"],
#       "uom": "Metric Tons CO₂e",
#       "description": "Total greenhouse gases emitted by the organization."
#     },
#     {
#       "parameter": "Scope 1 Emissions",
#       "dataType": "Numeric",
#       "synonyms": ["Direct Emissions"],
#       "uom": "Metric Tons CO₂e",
#       "description": "Direct GHG emissions from owned or controlled sources."
#     },
#     {
#       "parameter": "Scope 2 Emissions",
#       "dataType": "Numeric",
#       "synonyms": ["Indirect Energy Emissions"],
#       "uom": "Metric Tons CO₂e",
#       "description": "Indirect GHG emissions from the consumption of purchased electricity, steam, heating, and cooling."
#     },
#     {
#       "parameter": "Scope 3 Emissions",
#       "dataType": "Numeric",
#       "synonyms": ["Value Chain Emissions"],
#       "uom": "Metric Tons CO₂e",
#       "description": "Other indirect emissions occurring in the value chain, including both upstream and downstream emissions."
#     },
#     {
#       "parameter": "CO₂ Emissions",
#       "dataType": "Numeric",
#       "synonyms": ["Carbon Emissions"],
#       "uom": "Metric Tons CO₂",
#       "description": "Emissions of carbon dioxide."
#     },
#     {
#       "parameter": "CH₄ Emissions",
#       "dataType": "Numeric",
#       "synonyms": ["Methane Emissions"],
#       "uom": "Metric Tons CH₄",
#       "description": "Emissions of methane."
#     },
#     {
#       "parameter": "N₂O Emissions",
#       "dataType": "Numeric",
#       "synonyms": ["Nitrous Oxide Emissions"],
#       "uom": "Metric Tons N₂O",
#       "description": "Emissions of nitrous oxide."
#     },
#     {
#       "parameter": "HFC Emissions",
#       "dataType": "Numeric",
#       "synonyms": ["Hydrofluorocarbon Emissions"],
#       "uom": "Metric Tons HFCs",
#       "description": "Emissions of hydrofluorocarbons."
#     },
#     {
#       "parameter": "PFC Emissions",
#       "dataType": "Numeric",
#       "synonyms": ["Perfluorocarbon Emissions"],
#       "uom": "Metric Tons PFCs",
#       "description": "Emissions of perfluorocarbons."
#     },
#     {
#       "parameter": "SF₆ Emissions",
#       "dataType": "Numeric",
#       "synonyms": ["Sulfur Hexafluoride Emissions"],
#       "uom": "Metric Tons SF₆",
#       "description": "Emissions of sulfur hexafluoride."
#     },
#     {
#       "parameter": "NF₃ Emissions",
#       "dataType": "Numeric",
#       "synonyms": ["Nitrogen Trifluoride Emissions"],
#       "uom": "Metric Tons NF₃",
#       "description": "Emissions of nitrogen trifluoride."
#     },
#     {
#       "parameter": "Biogenic CO₂ Emissions",
#       "dataType": "Numeric",
#       "synonyms": ["Biogenic Carbon Emissions"],
#       "uom": "Metric Tons CO₂",
#       "description": "CO₂ emissions from biological sources."
#     },
#     {
#       "parameter": "Emissions Intensity per Revenue",
#       "dataType": "Numeric",
#       "synonyms": ["Carbon Intensity"],
#       "uom": "Metric Tons CO₂e / Revenue",
#       "description": "GHG emissions per unit of revenue."
#     },
#     {
#       "parameter": "Emissions Intensity per Employee",
#       "dataType": "Numeric",
#       "synonyms": ["Emissions per Employee"],
#       "uom": "Metric Tons CO₂e / Employee",
#       "description": "GHG emissions per employee."
#     },
#     {
#       "parameter": "Base Year Emissions",
#       "dataType": "Numeric",
#       "synonyms": ["Baseline Emissions"],
#       "uom": "Metric Tons CO₂e",
#       "description": "GHG emissions in the base year for comparison."
#     },
#     {
#       "parameter": "Emissions Reduction Target",
#       "dataType": "Numeric",
#       "synonyms": ["Emission Reduction Goal"],
#       "uom": "Percentage (%)",
#       "description": "Targeted percentage reduction in GHG emissions."
#     },
#     {
#       "parameter": "Emissions Reduction Achieved",
#       "dataType": "Numeric",
#       "synonyms": ["Emission Reduction Accomplished"],
#       "uom": "Percentage (%)",
#       "description": "Actual percentage reduction in GHG emissions achieved."
#     },
#     {
#       "parameter": "Energy Consumption",
#       "dataType": "Numeric",
#       "synonyms": ["Energy Use"],
#       "uom": "MWh or GJ",
#       "description": "Total energy consumed by the organization."
#     },
#     {
#       "parameter": "Renewable Energy Consumption",
#       "dataType": "Numeric",
#       "synonyms": ["Green Energy Use"],
#       "uom": "MWh or GJ",
#       "description": "Amount of energy consumed from renewable sources."
#     },
#     {
#       "parameter": "Non-Renewable Energy Consumption",
#       "dataType": "Numeric",
#       "synonyms": ["Fossil Energy Use"],
#       "uom": "MWh or GJ",
#       "description": "Amount of energy consumed from non-renewable sources."
#     },
#     {
#       "parameter": "Carbon Offsets Purchased",
#       "dataType": "Numeric",
#       "synonyms": ["Carbon Credits"],
#       "uom": "Metric Tons CO₂e",
#       "description": "Amount of carbon offsets purchased."
#     },
#     {
#       "parameter": "Net GHG Emissions",
#       "dataType": "Numeric",
#       "synonyms": ["Net Carbon Emissions"],
#       "uom": "Metric Tons CO₂e",
#       "description": "GHG emissions after accounting for offsets."
#     },
#     {
#       "parameter": "Carbon Sequestration",
#       "dataType": "Numeric",
#       "synonyms": ["Carbon Capture"],
#       "uom": "Metric Tons CO₂e",
#       "description": "Amount of CO₂ sequestered or captured."
#     }
#   ]
# }"""