Spaces:
Sleeping
Sleeping
Vela
committed on
Commit
·
d1ca23a
1
Parent(s):
dab58f3
modified gemini service module add API call file handling
Browse files- app.py +52 -34
- application/schemas/response_schema.py +63 -1
- application/services/gemini_model.py +138 -258
- application/services/streamlit_function.py +50 -12
- application/services/supabase_service.py +20 -0
- requirements.txt +3 -3
app.py
CHANGED
@@ -1,60 +1,78 @@
|
|
1 |
-
from application.services import streamlit_function, llm_service
|
2 |
-
from application.services import gemini_model, openai_model
|
3 |
import streamlit as st
|
|
|
|
|
4 |
from google.genai.errors import ClientError
|
5 |
from application.utils import logger
|
6 |
-
import test
|
7 |
|
8 |
logger = logger.get_logger()
|
9 |
|
10 |
-
|
11 |
-
|
|
|
12 |
|
13 |
-
|
14 |
-
|
15 |
-
|
|
|
|
|
16 |
|
17 |
-
|
18 |
|
19 |
-
for key in ["
|
20 |
if key not in st.session_state:
|
21 |
st.session_state[key] = None
|
22 |
|
|
|
|
|
|
|
23 |
if st.session_state.pdf_file:
|
24 |
with st.container():
|
25 |
col1, col2, col3 = st.columns([5, 5, 5], gap="small")
|
|
|
|
|
26 |
|
27 |
with col1:
|
28 |
-
if st.button("Generate
|
29 |
-
with st.spinner("Calling
|
30 |
-
result =
|
31 |
-
|
32 |
-
st.session_state
|
33 |
-
if st.session_state
|
34 |
-
st.write("Extracted Metrics by
|
35 |
-
st.json(st.session_state
|
36 |
-
|
37 |
with col2:
|
38 |
-
if st.button("Generate
|
39 |
-
with st.spinner("Calling
|
40 |
-
result=
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
st.
|
|
|
45 |
|
46 |
with col3:
|
47 |
try:
|
48 |
-
if st.button("Generate
|
49 |
-
with st.spinner("Calling
|
50 |
-
result =
|
51 |
-
|
52 |
-
st.session_state
|
53 |
except ClientError as e:
|
54 |
st.error(f"Gemini API Error: {e}")
|
55 |
logger.error("Error Details:", e.message, e.response)
|
56 |
|
57 |
-
if st.session_state
|
58 |
-
st.write("Extracted Metrics by
|
59 |
-
st.json(st.session_state
|
|
|
|
|
60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
+
import os
|
3 |
+
from application.services import streamlit_function, gemini_model
|
4 |
from google.genai.errors import ClientError
|
5 |
from application.utils import logger
|
|
|
6 |
|
7 |
logger = logger.get_logger()
|
8 |
|
9 |
+
# Gemini models compared side by side; each one gets its own UI column.
MODEL_1 = "gemini-1.5-pro-latest"
MODEL_2 = "gemini-2.0-flash"
MODEL_3 = "gemini-1.5-flash"

# API provider paired with MODEL_1..MODEL_3 respectively.
API_1 = "gemini"
API_2 = "gemini"
API_3 = "gemini"

streamlit_function.config_homepage()

pdf_file = streamlit_function.upload_file("pdf", label="Upload Sustainability Report PDF")

# Seed every session key read below so a fresh session sees None instead of
# raising on a missing key.
for key in [f"{MODEL_1}_result", f"{MODEL_2}_result", f"{MODEL_3}_result", "pdf_file"]:
    if key not in st.session_state:
        st.session_state[key] = None

if "excel_file" not in st.session_state:
    st.session_state["excel_file"] = None


def _render_model_column(column, api, model, file_name):
    """Render one model's "generate" button and its stored result inside *column*.

    Args:
        column: Streamlit column container to draw into.
        api: API provider name passed to the extraction service.
        model: Model name; also used for the button label, the Excel sheet
            name and the ``"<model>_result"`` session-state key.
        file_name: Uploaded PDF name without the ``.pdf`` suffix; used as the
            Excel file name.
    """
    result_key = f"{model}_result"
    with column:
        try:
            if st.button(f"Generate {model} Response"):
                with st.spinner(f"Calling {model}..."):
                    result = gemini_model.extract_emissions_data_as_json(
                        api, model, st.session_state.pdf_file
                    )
                    # Called for its side effect: it writes/appends data/<file_name>.xlsx,
                    # which the download button below serves.
                    streamlit_function.export_results_to_excel(result, model, file_name)
                    st.session_state[result_key] = result
        except ClientError as e:
            st.error(f"Gemini API Error: {e}")
            # %s placeholders are required: logging treats the extra positional
            # arguments as format args, and a message without placeholders fails
            # to format, so the details would never reach the log.
            logger.error("Error Details: %s %s", e.message, e.response)

        # Results live in session state so they survive Streamlit reruns.
        if st.session_state[result_key]:
            st.write(f"Extracted Metrics by {model}_result")
            st.json(st.session_state[result_key])


if st.session_state.pdf_file:
    file_name = st.session_state.pdf_file.name.removesuffix(".pdf")

    with st.container():
        columns = st.columns([5, 5, 5], gap="small")
        model_setup = zip(columns, (API_1, API_2, API_3), (MODEL_1, MODEL_2, MODEL_3))
        for column, api, model in model_setup:
            _render_model_column(column, api, model, file_name)

    file_path = f"data/{file_name}.xlsx"

    # The workbook only exists once at least one model has been run for this PDF.
    if os.path.exists(file_path):
        with open(file_path, "rb") as file:
            st.download_button(
                label="Download Excel File",
                data=file,
                file_name=f"{file_name}.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )
|
application/schemas/response_schema.py
CHANGED
@@ -1,3 +1,5 @@
|
|
|
|
|
|
1 |
RESPONSE_FORMAT = {
|
2 |
"type": "json_schema",
|
3 |
"json_schema": {
|
@@ -449,4 +451,64 @@ GEMINI_RESPONSE_FORMAT = {
|
|
449 |
}
|
450 |
},
|
451 |
"propertyOrdering": ["Company Name", "Greenhouse Gas (GHG) Protocol Parameters"]
|
452 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pydantic import BaseModel
|
2 |
+
|
3 |
RESPONSE_FORMAT = {
|
4 |
"type": "json_schema",
|
5 |
"json_schema": {
|
|
|
451 |
}
|
452 |
},
|
453 |
"propertyOrdering": ["Company Name", "Greenhouse Gas (GHG) Protocol Parameters"]
|
454 |
+
}
|
455 |
+
|
456 |
+
|
457 |
+
class Parameter(BaseModel):
    """
    A generic class to hold details for a sustainability metric.

    All four fields are required and typed as plain strings.
    """
    # NOTE(review): presumably an alternative name for the metric — confirm against the prompt/schema.
    synonym: str
    # NOTE(review): presumably "unit of measure" (e.g. "Metric Tons CO₂e") — confirm.
    uom: str
    description: str
    # Declared as str, not a numeric type, so any reported figure is carried as text.
    value: str
|
465 |
+
|
466 |
+
class GreenhouseGasGHGProtocolParameters(BaseModel):
    """
    Group of GHG-emission and energy metrics, one required ``Parameter`` per metric.

    The commented-out entries at the end are metrics that are currently not
    part of the model.
    """
    Total_GHG_Emissions: Parameter
    Scope_1_Emissions: Parameter
    Scope_2_Emissions: Parameter
    Scope_3_Emissions: Parameter
    CO2_Emissions: Parameter
    CH4_Emissions: Parameter
    N2O_Emissions: Parameter
    HFC_Emissions: Parameter
    PFC_Emissions: Parameter
    SF6_Emissions: Parameter
    NF3_Emissions: Parameter
    Biogenic_CO2_Emissions: Parameter
    Emissions_Intensity_per_Revenue: Parameter
    Emissions_Intensity_per_Employee: Parameter
    Base_Year_Emissions: Parameter
    Emissions_Reduction_Target: Parameter
    Emissions_Reduction_Achieved: Parameter
    Energy_Consumption: Parameter
    Renewable_Energy_Consumption: Parameter
    Non_Renewable_Energy_Consumption: Parameter
    Energy_Intensity_per_Revenue: Parameter
    Energy_Intensity_per_Employee: Parameter
    Fuel_Consumption: Parameter
    Electricity_Consumption: Parameter
    Heat_Consumption: Parameter
    Steam_Consumption: Parameter
    Cooling_Consumption: Parameter
    Purchased_Goods_and_Services_Emissions: Parameter
    Capital_Goods_Emissions: Parameter
    Fuel_and_Energy_Related_Activities_Emissions: Parameter
    Upstream_Transportation_and_Distribution_Emissions: Parameter
    Waste_Generated_in_Operations_Emissions: Parameter
    Business_Travel_Emissions: Parameter
    Employee_Commuting_Emissions: Parameter
    Upstream_Leased_Assets_Emissions: Parameter
    # Disabled metrics (kept for reference; not part of the model):
    # Downstream_Transportation_and_Distribution_Emissions: Parameter
    # Processing_of_Sold_Products_Emissions: Parameter
    # Use_of_Sold_Products_Emissions: Parameter
    # End_of_Life_Treatment_of_Sold_Products_Emissions: Parameter
    # Downstream_Leased_Assets_Emissions: Parameter
    # Franchises_Emissions: Parameter
    # Investments_Emissions: Parameter
    # Carbon_Offsets_Purchased: Parameter
    # Net_GHG_Emissions: Parameter
    # Carbon_Sequestration: Parameter
|
512 |
+
|
513 |
+
class EmissionData(BaseModel):
    """Top-level model holding a single required group of GHG protocol parameters."""
    # The field deliberately shares its name with its type; the annotation still
    # resolves to the module-level class because the field has no default value.
    GreenhouseGasGHGProtocolParameters: GreenhouseGasGHGProtocolParameters
|
application/services/gemini_model.py
CHANGED
@@ -1,81 +1,138 @@
|
|
1 |
import os
|
2 |
import json
|
|
|
|
|
3 |
from google import genai
|
4 |
from google.genai import types
|
5 |
-
from pydantic import BaseModel
|
6 |
-
from typing import Optional, Union, BinaryIO
|
7 |
-
from application.utils import logger
|
8 |
from application.schemas.response_schema import GEMINI_RESPONSE_FORMAT
|
|
|
|
|
|
|
9 |
|
10 |
-
|
11 |
|
12 |
PROMPT = (
|
13 |
-
"""You are a PDF parsing agent.
|
14 |
-
|
15 |
-
|
16 |
-
"""
|
17 |
)
|
18 |
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
"""
|
21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
"""
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
Heat_Consumption: Parameter
|
54 |
-
Steam_Consumption: Parameter
|
55 |
-
Cooling_Consumption: Parameter
|
56 |
-
Purchased_Goods_and_Services_Emissions: Parameter
|
57 |
-
Capital_Goods_Emissions: Parameter
|
58 |
-
Fuel_and_Energy_Related_Activities_Emissions: Parameter
|
59 |
-
Upstream_Transportation_and_Distribution_Emissions: Parameter
|
60 |
-
Waste_Generated_in_Operations_Emissions: Parameter
|
61 |
-
Business_Travel_Emissions: Parameter
|
62 |
-
Employee_Commuting_Emissions: Parameter
|
63 |
-
Upstream_Leased_Assets_Emissions: Parameter
|
64 |
-
# Downstream_Transportation_and_Distribution_Emissions: Parameter
|
65 |
-
# Processing_of_Sold_Products_Emissions: Parameter
|
66 |
-
# Use_of_Sold_Products_Emissions: Parameter
|
67 |
-
# End_of_Life_Treatment_of_Sold_Products_Emissions: Parameter
|
68 |
-
# Downstream_Leased_Assets_Emissions: Parameter
|
69 |
-
# Franchises_Emissions: Parameter
|
70 |
-
# Investments_Emissions: Parameter
|
71 |
-
# Carbon_Offsets_Purchased: Parameter
|
72 |
-
# Net_GHG_Emissions: Parameter
|
73 |
-
# Carbon_Sequestration: Parameter
|
74 |
-
|
75 |
-
class EmissionData(BaseModel):
|
76 |
-
GreenhouseGasGHGProtocolParameters: GreenhouseGasGHGProtocolParameters
|
77 |
-
|
78 |
-
# print(json.dumps(EmissionData.model_json_schema(), indent=2))
|
79 |
|
80 |
def extract_emissions_data_as_json(
|
81 |
api: str,
|
@@ -83,34 +140,33 @@ def extract_emissions_data_as_json(
|
|
83 |
file_input: Union[BinaryIO, bytes]
|
84 |
) -> Optional[dict]:
|
85 |
"""
|
86 |
-
|
87 |
|
88 |
Args:
|
89 |
-
api:
|
90 |
-
model: Model name (e.g
|
91 |
-
file_input: File
|
92 |
|
93 |
Returns:
|
94 |
-
Parsed
|
95 |
"""
|
96 |
try:
|
|
|
|
|
|
|
97 |
|
98 |
-
|
99 |
-
|
100 |
-
file_bytes = file_input.read()
|
101 |
-
logger.info("[Gemini] Sending content for generation...")
|
102 |
|
103 |
response = client.models.generate_content(
|
104 |
model=model,
|
105 |
-
contents=[
|
106 |
-
types.Part.from_bytes(data=file_bytes, mime_type="application/pdf"),
|
107 |
-
PROMPT
|
108 |
-
],
|
109 |
config={
|
110 |
'response_mime_type': 'application/json',
|
111 |
-
'response_schema': GEMINI_RESPONSE_FORMAT
|
112 |
}
|
113 |
)
|
|
|
114 |
logger.info("[Gemini] Response received.")
|
115 |
try:
|
116 |
return json.loads(response.text)
|
@@ -119,181 +175,5 @@ def extract_emissions_data_as_json(
|
|
119 |
return {"raw_response": response.text}
|
120 |
|
121 |
except Exception as e:
|
122 |
-
logger.exception(
|
123 |
-
return None
|
124 |
-
|
125 |
-
# import os
|
126 |
-
# from google import genai
|
127 |
-
# from pydantic import BaseModel, Field, ValidationError
|
128 |
-
# from dotenv import load_dotenv
|
129 |
-
# from typing import Optional
|
130 |
-
# from google.genai import types
|
131 |
-
|
132 |
-
# load_dotenv()
|
133 |
-
# client = genai.Client(api_key=os.getenv("gemini_api_key"))
|
134 |
-
|
135 |
-
# schema= """{
|
136 |
-
# "parameters": [
|
137 |
-
# {
|
138 |
-
# "parameter": "Total GHG Emissions",
|
139 |
-
# "dataType": "Numeric",
|
140 |
-
# "synonyms": ["Carbon Footprint"],
|
141 |
-
# "uom": "Metric Tons CO₂e",
|
142 |
-
# "description": "Total greenhouse gases emitted by the organization."
|
143 |
-
# },
|
144 |
-
# {
|
145 |
-
# "parameter": "Scope 1 Emissions",
|
146 |
-
# "dataType": "Numeric",
|
147 |
-
# "synonyms": ["Direct Emissions"],
|
148 |
-
# "uom": "Metric Tons CO₂e",
|
149 |
-
# "description": "Direct GHG emissions from owned or controlled sources."
|
150 |
-
# },
|
151 |
-
# {
|
152 |
-
# "parameter": "Scope 2 Emissions",
|
153 |
-
# "dataType": "Numeric",
|
154 |
-
# "synonyms": ["Indirect Energy Emissions"],
|
155 |
-
# "uom": "Metric Tons CO₂e",
|
156 |
-
# "description": "Indirect GHG emissions from the consumption of purchased electricity, steam, heating, and cooling."
|
157 |
-
# },
|
158 |
-
# {
|
159 |
-
# "parameter": "Scope 3 Emissions",
|
160 |
-
# "dataType": "Numeric",
|
161 |
-
# "synonyms": ["Value Chain Emissions"],
|
162 |
-
# "uom": "Metric Tons CO₂e",
|
163 |
-
# "description": "Other indirect emissions occurring in the value chain, including both upstream and downstream emissions."
|
164 |
-
# },
|
165 |
-
# {
|
166 |
-
# "parameter": "CO₂ Emissions",
|
167 |
-
# "dataType": "Numeric",
|
168 |
-
# "synonyms": ["Carbon Emissions"],
|
169 |
-
# "uom": "Metric Tons CO₂",
|
170 |
-
# "description": "Emissions of carbon dioxide."
|
171 |
-
# },
|
172 |
-
# {
|
173 |
-
# "parameter": "CH₄ Emissions",
|
174 |
-
# "dataType": "Numeric",
|
175 |
-
# "synonyms": ["Methane Emissions"],
|
176 |
-
# "uom": "Metric Tons CH₄",
|
177 |
-
# "description": "Emissions of methane."
|
178 |
-
# },
|
179 |
-
# {
|
180 |
-
# "parameter": "N₂O Emissions",
|
181 |
-
# "dataType": "Numeric",
|
182 |
-
# "synonyms": ["Nitrous Oxide Emissions"],
|
183 |
-
# "uom": "Metric Tons N₂O",
|
184 |
-
# "description": "Emissions of nitrous oxide."
|
185 |
-
# },
|
186 |
-
# {
|
187 |
-
# "parameter": "HFC Emissions",
|
188 |
-
# "dataType": "Numeric",
|
189 |
-
# "synonyms": ["Hydrofluorocarbon Emissions"],
|
190 |
-
# "uom": "Metric Tons HFCs",
|
191 |
-
# "description": "Emissions of hydrofluorocarbons."
|
192 |
-
# },
|
193 |
-
# {
|
194 |
-
# "parameter": "PFC Emissions",
|
195 |
-
# "dataType": "Numeric",
|
196 |
-
# "synonyms": ["Perfluorocarbon Emissions"],
|
197 |
-
# "uom": "Metric Tons PFCs",
|
198 |
-
# "description": "Emissions of perfluorocarbons."
|
199 |
-
# },
|
200 |
-
# {
|
201 |
-
# "parameter": "SF₆ Emissions",
|
202 |
-
# "dataType": "Numeric",
|
203 |
-
# "synonyms": ["Sulfur Hexafluoride Emissions"],
|
204 |
-
# "uom": "Metric Tons SF₆",
|
205 |
-
# "description": "Emissions of sulfur hexafluoride."
|
206 |
-
# },
|
207 |
-
# {
|
208 |
-
# "parameter": "NF₃ Emissions",
|
209 |
-
# "dataType": "Numeric",
|
210 |
-
# "synonyms": ["Nitrogen Trifluoride Emissions"],
|
211 |
-
# "uom": "Metric Tons NF₃",
|
212 |
-
# "description": "Emissions of nitrogen trifluoride."
|
213 |
-
# },
|
214 |
-
# {
|
215 |
-
# "parameter": "Biogenic CO₂ Emissions",
|
216 |
-
# "dataType": "Numeric",
|
217 |
-
# "synonyms": ["Biogenic Carbon Emissions"],
|
218 |
-
# "uom": "Metric Tons CO₂",
|
219 |
-
# "description": "CO₂ emissions from biological sources."
|
220 |
-
# },
|
221 |
-
# {
|
222 |
-
# "parameter": "Emissions Intensity per Revenue",
|
223 |
-
# "dataType": "Numeric",
|
224 |
-
# "synonyms": ["Carbon Intensity"],
|
225 |
-
# "uom": "Metric Tons CO₂e / Revenue",
|
226 |
-
# "description": "GHG emissions per unit of revenue."
|
227 |
-
# },
|
228 |
-
# {
|
229 |
-
# "parameter": "Emissions Intensity per Employee",
|
230 |
-
# "dataType": "Numeric",
|
231 |
-
# "synonyms": ["Emissions per Employee"],
|
232 |
-
# "uom": "Metric Tons CO₂e / Employee",
|
233 |
-
# "description": "GHG emissions per employee."
|
234 |
-
# },
|
235 |
-
# {
|
236 |
-
# "parameter": "Base Year Emissions",
|
237 |
-
# "dataType": "Numeric",
|
238 |
-
# "synonyms": ["Baseline Emissions"],
|
239 |
-
# "uom": "Metric Tons CO₂e",
|
240 |
-
# "description": "GHG emissions in the base year for comparison."
|
241 |
-
# },
|
242 |
-
# {
|
243 |
-
# "parameter": "Emissions Reduction Target",
|
244 |
-
# "dataType": "Numeric",
|
245 |
-
# "synonyms": ["Emission Reduction Goal"],
|
246 |
-
# "uom": "Percentage (%)",
|
247 |
-
# "description": "Targeted percentage reduction in GHG emissions."
|
248 |
-
# },
|
249 |
-
# {
|
250 |
-
# "parameter": "Emissions Reduction Achieved",
|
251 |
-
# "dataType": "Numeric",
|
252 |
-
# "synonyms": ["Emission Reduction Accomplished"],
|
253 |
-
# "uom": "Percentage (%)",
|
254 |
-
# "description": "Actual percentage reduction in GHG emissions achieved."
|
255 |
-
# },
|
256 |
-
# {
|
257 |
-
# "parameter": "Energy Consumption",
|
258 |
-
# "dataType": "Numeric",
|
259 |
-
# "synonyms": ["Energy Use"],
|
260 |
-
# "uom": "MWh or GJ",
|
261 |
-
# "description": "Total energy consumed by the organization."
|
262 |
-
# },
|
263 |
-
# {
|
264 |
-
# "parameter": "Renewable Energy Consumption",
|
265 |
-
# "dataType": "Numeric",
|
266 |
-
# "synonyms": ["Green Energy Use"],
|
267 |
-
# "uom": "MWh or GJ",
|
268 |
-
# "description": "Amount of energy consumed from renewable sources."
|
269 |
-
# },
|
270 |
-
# {
|
271 |
-
# "parameter": "Non-Renewable Energy Consumption",
|
272 |
-
# "dataType": "Numeric",
|
273 |
-
# "synonyms": ["Fossil Energy Use"],
|
274 |
-
# "uom": "MWh or GJ",
|
275 |
-
# "description": "Amount of energy consumed from non-renewable sources."
|
276 |
-
# },
|
277 |
-
# {
|
278 |
-
# "parameter": "Carbon Offsets Purchased",
|
279 |
-
# "dataType": "Numeric",
|
280 |
-
# "synonyms": ["Carbon Credits"],
|
281 |
-
# "uom": "Metric Tons CO₂e",
|
282 |
-
# "description": "Amount of carbon offsets purchased."
|
283 |
-
# },
|
284 |
-
# {
|
285 |
-
# "parameter": "Net GHG Emissions",
|
286 |
-
# "dataType": "Numeric",
|
287 |
-
# "synonyms": ["Net Carbon Emissions"],
|
288 |
-
# "uom": "Metric Tons CO₂e",
|
289 |
-
# "description": "GHG emissions after accounting for offsets."
|
290 |
-
# },
|
291 |
-
# {
|
292 |
-
# "parameter": "Carbon Sequestration",
|
293 |
-
# "dataType": "Numeric",
|
294 |
-
# "synonyms": ["Carbon Capture"],
|
295 |
-
# "uom": "Metric Tons CO₂e",
|
296 |
-
# "description": "Amount of CO₂ sequestered or captured."
|
297 |
-
# }
|
298 |
-
# ]
|
299 |
-
# }"""
|
|
|
1 |
import os
|
2 |
import json
|
3 |
+
import re
|
4 |
+
from typing import Optional, Dict, Union, IO, List, BinaryIO
|
5 |
from google import genai
|
6 |
from google.genai import types
|
|
|
|
|
|
|
7 |
from application.schemas.response_schema import GEMINI_RESPONSE_FORMAT
|
8 |
+
from application.utils import logger
|
9 |
+
|
10 |
+
# Rebinds the imported `logger` module name to the logger instance it returns.
logger=logger.get_logger()

# Module-wide Gemini client; the API key is read from the `gemini_api_key`
# environment variable (os.getenv yields None when it is unset).
client = genai.Client(api_key=os.getenv("gemini_api_key"))
|
13 |
|
14 |
PROMPT = (
|
15 |
+
"""You are a PDF parsing agent. Your job is to extract GHG Protocol Parameters
|
16 |
+
and ESG (Environmental, Social, Governance) Data from a company’s sustainability
|
17 |
+
or ESG report in PDF format."""
|
|
|
18 |
)
|
19 |
|
20 |
+
def sanitize_file_name(name: str, max_length: int = 40) -> str:
    """
    Sanitizes a file name to comply with Gemini API naming rules:
    - Lowercase only
    - Alphanumeric characters and dashes (`-`) allowed
    - Cannot start or end with a dash
    - Max length: 40 characters

    Args:
        name (str): The original file name (without extension).
        max_length (int, optional): Maximum allowed characters (default: 40).
            Must be at least 1.

    Returns:
        str: Sanitized file name.

    Raises:
        ValueError: If `name` is not a non-empty string, if `max_length` is
            not positive, or if the sanitized name is empty after cleaning.
    """
    if not isinstance(name, str) or not name:
        raise ValueError("Invalid file name: must be a non-empty string.")

    # A negative limit would silently slice characters off the END of the name
    # (name[:-n]) and zero would surface as a misleading "empty name" error,
    # so reject both up front.
    if max_length < 1:
        raise ValueError("max_length must be a positive integer.")

    # Lowercase first, then collapse every run of disallowed characters into one dash.
    cleaned = re.sub(r'[^a-z0-9]+', '-', name.lower())

    # Trim edge dashes, truncate, then trim again: the cut itself can land on a dash.
    cleaned = cleaned.strip('-')[:max_length].rstrip('-')

    if not cleaned:
        raise ValueError("Sanitized file name is empty or invalid after cleanup.")

    return cleaned
|
51 |
+
|
52 |
+
def get_files() -> List[str]:
    """
    Lists the names of the files currently known to the Gemini client.

    Returns:
        List[str]: The `name` attribute of every entry returned by
        `client.files.list()`, in listing order.
    """
    names = []
    for remote_file in client.files.list():
        names.append(remote_file.name)
    return names
|
61 |
+
|
62 |
+
|
63 |
+
def delete_files(file_names: Union[str, List[str]]) -> None:
    """
    Removes the given files from Gemini, skipping names that are not listed.

    Args:
        file_names (Union[str, List[str]]): A single file name or a list of
            names. An empty/falsy value only logs a warning.
    """
    if not file_names:
        logger.warning("No file names provided for deletion.")
        return

    # Accept a bare string as a one-element list.
    targets = [file_names] if isinstance(file_names, str) else file_names

    # One listing call up front; every requested name is checked against it.
    known_files = get_files()

    for target in targets:
        logger.info(f"Attempting to delete file: {target}")
        if target not in known_files:
            logger.warning(f"File not found: {target}")
            continue
        client.files.delete(name=target)
        logger.info(f"Deleted file: {target}")
|
86 |
+
|
87 |
+
def upload_file(
    file: Union[str, IO[bytes]],
    file_name: Optional[str] = None,
    config: Optional[Dict[str, str]] = None
) -> Optional[types.File]:
    """
    Uploads a file to the Gemini API, handling both file paths and binary streams.

    The remote name is derived from `file_name` (extension dropped, then passed
    through `sanitize_file_name`). If a file with that name is already listed
    on Gemini, it is returned instead of uploading again.

    Args:
        file (Union[str, IO[bytes]]): File path or binary file object (e.g., from Streamlit).
        file_name (Optional[str]): Name for the file. If None, attempts to use file.name.
        config (Optional[Dict[str, str]]): Extra config like 'mime_type'. The dict is
            not modified. 'name' is always set to the sanitized name; 'mime_type'
            defaults to 'application/pdf' when not supplied.

    Returns:
        Optional[types.File]: The uploaded Gemini file object, or existing one if already uploaded.

    Raises:
        Exception: If upload fails (logged, then re-raised unchanged).
    """
    try:
        if not file_name:
            if isinstance(file, str):
                file_name = os.path.basename(file)
            elif hasattr(file, "name"):
                file_name = os.path.basename(file.name)
            else:
                raise ValueError("file_name must be provided if file has no 'name' attribute.")

        sanitized_name = sanitize_file_name(os.path.splitext(file_name)[0])

        # Work on a copy so the caller's dict is never mutated.
        upload_config = dict(config) if config else {}
        upload_config["name"] = sanitized_name
        # Honour a caller-supplied mime_type; PDF is only the fallback.
        upload_config.setdefault("mime_type", "application/pdf")

        gemini_file_key = f"files/{sanitized_name}"

        if gemini_file_key in get_files():
            logger.info(f"File already exists on Gemini: {gemini_file_key}")
            return client.files.get(name=gemini_file_key)

        logger.info(f"Uploading file to Gemini: {gemini_file_key}")

        if isinstance(file, str):
            with open(file, "rb") as f:
                return client.files.upload(file=f, config=upload_config)

        # A stream that was already read sits at EOF; rewind so the whole
        # file is uploaded rather than only what follows the current offset.
        if hasattr(file, "seekable") and file.seekable():
            file.seek(0)
        return client.files.upload(file=file, config=upload_config)

    except Exception as e:
        logger.error(f"Failed to upload file '{file_name}': {e}")
        raise
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
|
137 |
def extract_emissions_data_as_json(
|
138 |
api: str,
|
|
|
140 |
file_input: Union[BinaryIO, bytes]
|
141 |
) -> Optional[dict]:
|
142 |
"""
|
143 |
+
Extracts ESG data from a PDF using the Gemini API.
|
144 |
|
145 |
Args:
|
146 |
+
api (str): API provider (must be 'gemini').
|
147 |
+
model (str): Model name (e.g., 'gemini-pro').
|
148 |
+
file_input (Union[BinaryIO, bytes]): File object or byte stream.
|
149 |
|
150 |
Returns:
|
151 |
+
Optional[dict]: Parsed JSON response or raw text if parsing fails.
|
152 |
"""
|
153 |
try:
|
154 |
+
if api.lower() != "gemini":
|
155 |
+
logger.error(f"Unsupported API: {api}")
|
156 |
+
return None
|
157 |
|
158 |
+
file_name = file_input.name if hasattr(file_input, 'name') else "uploaded_file.pdf"
|
159 |
+
uploaded_file = upload_file(file=file_input, file_name=file_name)
|
|
|
|
|
160 |
|
161 |
response = client.models.generate_content(
|
162 |
model=model,
|
163 |
+
contents=[uploaded_file, PROMPT],
|
|
|
|
|
|
|
164 |
config={
|
165 |
'response_mime_type': 'application/json',
|
166 |
+
'response_schema': GEMINI_RESPONSE_FORMAT
|
167 |
}
|
168 |
)
|
169 |
+
|
170 |
logger.info("[Gemini] Response received.")
|
171 |
try:
|
172 |
return json.loads(response.text)
|
|
|
175 |
return {"raw_response": response.text}
|
176 |
|
177 |
except Exception as e:
|
178 |
+
logger.exception("Error during ESG data extraction.")
|
179 |
+
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
application/services/streamlit_function.py
CHANGED
@@ -1,5 +1,9 @@
|
|
1 |
import streamlit as st
|
2 |
from typing import Union, List
|
|
|
|
|
|
|
|
|
3 |
from application.utils import logger
|
4 |
|
5 |
logger = logger.get_logger()
|
@@ -75,15 +79,49 @@ def upload_file(
|
|
75 |
st.session_state.pdf_file = uploaded_files
|
76 |
return uploaded_files
|
77 |
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
from typing import Union, List
|
3 |
+
import pandas as pd
|
4 |
+
from io import BytesIO
|
5 |
+
import json
|
6 |
+
import os
|
7 |
from application.utils import logger
|
8 |
|
9 |
logger = logger.get_logger()
|
|
|
79 |
st.session_state.pdf_file = uploaded_files
|
80 |
return uploaded_files
|
81 |
|
82 |
+
def export_results_to_excel(results: dict, sheet_name: str, filename: str = "output.xlsx") -> BytesIO:
    """
    Flattens a result dictionary into a table and writes it out as Excel twice:
    once to `data/<filename>` on disk and once to an in-memory buffer.

    On disk, an existing workbook is opened in append mode: rows go below the
    last used row of `sheet_name` (without a header), or into a new sheet
    (with a header) when the sheet does not exist yet. If the results cannot
    be flattened, a single-row table with an "error" column is written instead.

    Args:
        results (dict): The data to export.
        sheet_name (str): The sheet name to write to.
        filename (str): The Excel file name (with or without '.xlsx').

    Returns:
        BytesIO: In-memory workbook containing only this call's rows,
        positioned at offset 0.
    """
    try:
        df = pd.json_normalize(results, sep='_')
        df.replace({None: "", "NULL": ""}, inplace=True)
    except Exception as e:
        df = pd.DataFrame([{"error": f"Could not parse result: {str(e)}"}])

    # Normalise the extension, then resolve the on-disk location.
    if not filename.endswith(".xlsx"):
        filename = f"{filename}.xlsx"
    full_path = os.path.join("data", filename)

    os.makedirs("data", exist_ok=True)

    if not os.path.exists(full_path):
        # First export for this file: create the workbook from scratch.
        df.to_excel(full_path, index=False, engine="openpyxl", sheet_name=sheet_name)
    else:
        with pd.ExcelWriter(full_path, engine="openpyxl", mode="a", if_sheet_exists="overlay") as writer:
            workbook = writer.book
            # Continue below the existing rows, or start at the top of a new sheet.
            start_row = workbook[sheet_name].max_row if sheet_name in workbook.sheetnames else 0
            df.to_excel(
                writer,
                sheet_name=sheet_name,
                index=False,
                header=(start_row == 0),
                startrow=start_row,
            )

    # Separate in-memory copy for the caller.
    output_stream = BytesIO()
    with pd.ExcelWriter(output_stream, engine="openpyxl") as writer:
        df.to_excel(writer, index=False, sheet_name=sheet_name)
    output_stream.seek(0)

    return output_stream
|
application/services/supabase_service.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
from datetime import datetime
|
4 |
+
from supabase import create_client, StorageException
|
5 |
+
from utils import logger
|
6 |
+
from dotenv import load_dotenv
|
7 |
+
|
8 |
+
# Logger Initialization
# Rebinds the imported `logger` module name to the logger instance it returns.
logger = logger.get_logger()

# Load Environment Variables
# load_dotenv() runs before the os.getenv calls so values from a .env file are visible to them.
load_dotenv()
SUPABASE_URL = os.getenv('SUPABASE_URL')
SUPABASE_KEY = os.getenv('SUPABASE_KEY')
SUPABASE_BUCKET = os.getenv('SUPABASE_BUCKET')
LLM_MODEL_NAME = os.getenv('LLM_MODEL_NAME')
BUCKET_FOLDER = "chat-history"

# Supabase Client Initialization
# Runs at import time; SUPABASE_URL / SUPABASE_KEY are None here when the variables are unset.
supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
|
requirements.txt
CHANGED
@@ -4,6 +4,6 @@ dotenv
|
|
4 |
google
|
5 |
google.genai
|
6 |
google-generativeai
|
7 |
-
|
8 |
-
|
9 |
-
|
|
|
4 |
google
|
5 |
google.genai
|
6 |
google-generativeai
|
7 |
+
pandas
|
8 |
+
supabase
|
9 |
+
openpyxl
|