Spaces:
Sleeping
Sleeping
import os | |
import json | |
from google import genai | |
from google.genai import types | |
from pydantic import BaseModel | |
from typing import Optional, Union, BinaryIO | |
from application.utils import logger | |
from application.schemas.response_schema import GEMINI_RESPONSE_FORMAT | |
# Rebind the imported `logger` module name to the object returned by
# get_logger(); from this line on, `logger` no longer refers to the module.
logger = logger.get_logger()

# Instruction text passed to the model alongside the PDF bytes in
# extract_emissions_data_as_json().
# NOTE(review): the sentence ending in "in PDF format:" is not followed by a
# list of what to extract — presumably the response schema carries that
# information; confirm this is intentional.
PROMPT = (
    """You are a PDF parsing agent.
Your job is to extract from a company’s sustainability or ESG report in PDF format:
If the values are not found in the document, please return json null for that value.
"""
)
class Parameter(BaseModel):
    """
    A generic class to hold details for a sustainability metric.
    """
    # All four fields are required strings (no defaults are declared).
    # Presumably an alternative name for the metric — TODO confirm.
    synonym: str
    # Presumably the unit of measure of the metric — TODO confirm.
    uom: str
    # Free-text description of the metric.
    description: str
    # NOTE(review): PROMPT asks the model to return JSON null for values that
    # are not found, but this field is a non-optional `str`; confirm whether
    # null values are ever validated against this model.
    value: str
# Pydantic model grouping the individual metrics; every active field below is
# a required `Parameter`. Field names are part of the serialized schema, so
# they must not be renamed casually.
class GreenhouseGasGHGProtocolParameters(BaseModel):
    # Overall and per-scope emission totals.
    Total_GHG_Emissions: Parameter
    Scope_1_Emissions: Parameter
    Scope_2_Emissions: Parameter
    Scope_3_Emissions: Parameter
    # Emissions broken down by individual gas.
    CO2_Emissions: Parameter
    CH4_Emissions: Parameter
    N2O_Emissions: Parameter
    HFC_Emissions: Parameter
    PFC_Emissions: Parameter
    SF6_Emissions: Parameter
    NF3_Emissions: Parameter
    Biogenic_CO2_Emissions: Parameter
    # Intensity, baseline and reduction figures.
    Emissions_Intensity_per_Revenue: Parameter
    Emissions_Intensity_per_Employee: Parameter
    Base_Year_Emissions: Parameter
    Emissions_Reduction_Target: Parameter
    Emissions_Reduction_Achieved: Parameter
    # Energy consumption figures.
    Energy_Consumption: Parameter
    Renewable_Energy_Consumption: Parameter
    Non_Renewable_Energy_Consumption: Parameter
    Energy_Intensity_per_Revenue: Parameter
    Energy_Intensity_per_Employee: Parameter
    Fuel_Consumption: Parameter
    Electricity_Consumption: Parameter
    Heat_Consumption: Parameter
    Steam_Consumption: Parameter
    Cooling_Consumption: Parameter
    # Emissions by activity category (presumably upstream Scope 3
    # categories — TODO confirm).
    Purchased_Goods_and_Services_Emissions: Parameter
    Capital_Goods_Emissions: Parameter
    Fuel_and_Energy_Related_Activities_Emissions: Parameter
    Upstream_Transportation_and_Distribution_Emissions: Parameter
    Waste_Generated_in_Operations_Emissions: Parameter
    Business_Travel_Emissions: Parameter
    Employee_Commuting_Emissions: Parameter
    Upstream_Leased_Assets_Emissions: Parameter
    # The fields below are commented out and therefore NOT part of the model.
    # NOTE(review): the reason they were disabled is not visible here.
    # Downstream_Transportation_and_Distribution_Emissions: Parameter
    # Processing_of_Sold_Products_Emissions: Parameter
    # Use_of_Sold_Products_Emissions: Parameter
    # End_of_Life_Treatment_of_Sold_Products_Emissions: Parameter
    # Downstream_Leased_Assets_Emissions: Parameter
    # Franchises_Emissions: Parameter
    # Investments_Emissions: Parameter
    # Carbon_Offsets_Purchased: Parameter
    # Net_GHG_Emissions: Parameter
    # Carbon_Sequestration: Parameter
# Top-level model that wraps the GHG parameter group under a single key.
class EmissionData(BaseModel):
    # The field deliberately shares its name with its type; the annotation is
    # evaluated against the module-level class of the same name.
    GreenhouseGasGHGProtocolParameters: GreenhouseGasGHGProtocolParameters
# print(json.dumps(EmissionData.model_json_schema(), indent=2)) | |
def extract_emissions_data_as_json(
    api: str,
    model: str,
    file_input: Union[BinaryIO, bytes]
) -> Optional[dict]:
    """
    Extract ESG data from a PDF report using the Gemini API.

    Args:
        api: Provider name. Kept for interface compatibility; it is currently
            not read, and the request is always sent to Gemini.
        model: Gemini model name passed to ``generate_content``.
        file_input: The PDF, either as raw bytes or as a binary file-like
            object exposing ``read()``.

    Returns:
        The parsed JSON response. If the response text is not valid JSON,
        ``{"raw_response": <text>}`` is returned instead. ``None`` is returned
        when the response has no text or when any other error occurs (the
        error is logged, not raised).
    """
    try:
        client = genai.Client(api_key=os.getenv("gemini_api_key"))

        # The signature accepts raw bytes as well as file-like objects; only
        # the latter have read(), so bytes must be used directly.
        if isinstance(file_input, (bytes, bytearray)):
            file_bytes = bytes(file_input)
        else:
            file_bytes = file_input.read()

        logger.info("[Gemini] Sending content for generation...")
        response = client.models.generate_content(
            model=model,
            contents=[
                types.Part.from_bytes(data=file_bytes, mime_type="application/pdf"),
                PROMPT
            ],
            config={
                'response_mime_type': 'application/json',
                'response_schema': GEMINI_RESPONSE_FORMAT,
            }
        )
        logger.info("[Gemini] Response received.")

        response_text = response.text
        if response_text is None:
            # No text part in the response; json.loads(None) would raise a
            # misleading TypeError, so report the condition explicitly.
            logger.warning("[Gemini] Response contained no text.")
            return None

        try:
            return json.loads(response_text)
        except json.JSONDecodeError:
            logger.warning("Failed to parse JSON, returning raw response.")
            return {"raw_response": response_text}
    except Exception as e:
        # Best-effort boundary: callers receive None rather than an exception.
        logger.exception(f"Error during ESG data extraction.{e}")
        return None
# import os | |
# from google import genai | |
# from pydantic import BaseModel, Field, ValidationError | |
# from dotenv import load_dotenv | |
# from typing import Optional | |
# from google.genai import types | |
# load_dotenv() | |
# client = genai.Client(api_key=os.getenv("gemini_api_key")) | |
# schema= """{ | |
# "parameters": [ | |
# { | |
# "parameter": "Total GHG Emissions", | |
# "dataType": "Numeric", | |
# "synonyms": ["Carbon Footprint"], | |
# "uom": "Metric Tons CO₂e", | |
# "description": "Total greenhouse gases emitted by the organization." | |
# }, | |
# { | |
# "parameter": "Scope 1 Emissions", | |
# "dataType": "Numeric", | |
# "synonyms": ["Direct Emissions"], | |
# "uom": "Metric Tons CO₂e", | |
# "description": "Direct GHG emissions from owned or controlled sources." | |
# }, | |
# { | |
# "parameter": "Scope 2 Emissions", | |
# "dataType": "Numeric", | |
# "synonyms": ["Indirect Energy Emissions"], | |
# "uom": "Metric Tons CO₂e", | |
# "description": "Indirect GHG emissions from the consumption of purchased electricity, steam, heating, and cooling." | |
# }, | |
# { | |
# "parameter": "Scope 3 Emissions", | |
# "dataType": "Numeric", | |
# "synonyms": ["Value Chain Emissions"], | |
# "uom": "Metric Tons CO₂e", | |
# "description": "Other indirect emissions occurring in the value chain, including both upstream and downstream emissions." | |
# }, | |
# { | |
# "parameter": "CO₂ Emissions", | |
# "dataType": "Numeric", | |
# "synonyms": ["Carbon Emissions"], | |
# "uom": "Metric Tons CO₂", | |
# "description": "Emissions of carbon dioxide." | |
# }, | |
# { | |
# "parameter": "CH₄ Emissions", | |
# "dataType": "Numeric", | |
# "synonyms": ["Methane Emissions"], | |
# "uom": "Metric Tons CH₄", | |
# "description": "Emissions of methane." | |
# }, | |
# { | |
# "parameter": "N₂O Emissions", | |
# "dataType": "Numeric", | |
# "synonyms": ["Nitrous Oxide Emissions"], | |
# "uom": "Metric Tons N₂O", | |
# "description": "Emissions of nitrous oxide." | |
# }, | |
# { | |
# "parameter": "HFC Emissions", | |
# "dataType": "Numeric", | |
# "synonyms": ["Hydrofluorocarbon Emissions"], | |
# "uom": "Metric Tons HFCs", | |
# "description": "Emissions of hydrofluorocarbons." | |
# }, | |
# { | |
# "parameter": "PFC Emissions", | |
# "dataType": "Numeric", | |
# "synonyms": ["Perfluorocarbon Emissions"], | |
# "uom": "Metric Tons PFCs", | |
# "description": "Emissions of perfluorocarbons." | |
# }, | |
# { | |
# "parameter": "SF₆ Emissions", | |
# "dataType": "Numeric", | |
# "synonyms": ["Sulfur Hexafluoride Emissions"], | |
# "uom": "Metric Tons SF₆", | |
# "description": "Emissions of sulfur hexafluoride." | |
# }, | |
# { | |
# "parameter": "NF₃ Emissions", | |
# "dataType": "Numeric", | |
# "synonyms": ["Nitrogen Trifluoride Emissions"], | |
# "uom": "Metric Tons NF₃", | |
# "description": "Emissions of nitrogen trifluoride." | |
# }, | |
# { | |
# "parameter": "Biogenic CO₂ Emissions", | |
# "dataType": "Numeric", | |
# "synonyms": ["Biogenic Carbon Emissions"], | |
# "uom": "Metric Tons CO₂", | |
# "description": "CO₂ emissions from biological sources." | |
# }, | |
# { | |
# "parameter": "Emissions Intensity per Revenue", | |
# "dataType": "Numeric", | |
# "synonyms": ["Carbon Intensity"], | |
# "uom": "Metric Tons CO₂e / Revenue", | |
# "description": "GHG emissions per unit of revenue." | |
# }, | |
# { | |
# "parameter": "Emissions Intensity per Employee", | |
# "dataType": "Numeric", | |
# "synonyms": ["Emissions per Employee"], | |
# "uom": "Metric Tons CO₂e / Employee", | |
# "description": "GHG emissions per employee." | |
# }, | |
# { | |
# "parameter": "Base Year Emissions", | |
# "dataType": "Numeric", | |
# "synonyms": ["Baseline Emissions"], | |
# "uom": "Metric Tons CO₂e", | |
# "description": "GHG emissions in the base year for comparison." | |
# }, | |
# { | |
# "parameter": "Emissions Reduction Target", | |
# "dataType": "Numeric", | |
# "synonyms": ["Emission Reduction Goal"], | |
# "uom": "Percentage (%)", | |
# "description": "Targeted percentage reduction in GHG emissions." | |
# }, | |
# { | |
# "parameter": "Emissions Reduction Achieved", | |
# "dataType": "Numeric", | |
# "synonyms": ["Emission Reduction Accomplished"], | |
# "uom": "Percentage (%)", | |
# "description": "Actual percentage reduction in GHG emissions achieved." | |
# }, | |
# { | |
# "parameter": "Energy Consumption", | |
# "dataType": "Numeric", | |
# "synonyms": ["Energy Use"], | |
# "uom": "MWh or GJ", | |
# "description": "Total energy consumed by the organization." | |
# }, | |
# { | |
# "parameter": "Renewable Energy Consumption", | |
# "dataType": "Numeric", | |
# "synonyms": ["Green Energy Use"], | |
# "uom": "MWh or GJ", | |
# "description": "Amount of energy consumed from renewable sources." | |
# }, | |
# { | |
# "parameter": "Non-Renewable Energy Consumption", | |
# "dataType": "Numeric", | |
# "synonyms": ["Fossil Energy Use"], | |
# "uom": "MWh or GJ", | |
# "description": "Amount of energy consumed from non-renewable sources." | |
# }, | |
# { | |
# "parameter": "Carbon Offsets Purchased", | |
# "dataType": "Numeric", | |
# "synonyms": ["Carbon Credits"], | |
# "uom": "Metric Tons CO₂e", | |
# "description": "Amount of carbon offsets purchased." | |
# }, | |
# { | |
# "parameter": "Net GHG Emissions", | |
# "dataType": "Numeric", | |
# "synonyms": ["Net Carbon Emissions"], | |
# "uom": "Metric Tons CO₂e", | |
# "description": "GHG emissions after accounting for offsets." | |
# }, | |
# { | |
# "parameter": "Carbon Sequestration", | |
# "dataType": "Numeric", | |
# "synonyms": ["Carbon Capture"], | |
# "uom": "Metric Tons CO₂e", | |
# "description": "Amount of CO₂ sequestered or captured." | |
# } | |
# ] | |
# }""" |