Vela committed on
Commit
d1ca23a
·
1 Parent(s): dab58f3

Modified Gemini service module; added API call file handling

Browse files
app.py CHANGED
@@ -1,60 +1,78 @@
1
- from application.services import streamlit_function, llm_service
2
- from application.services import gemini_model, openai_model
3
  import streamlit as st
 
 
4
  from google.genai.errors import ClientError
5
  from application.utils import logger
6
- import test
7
 
8
  logger = logger.get_logger()
9
 
10
- streamlit_function.config_homepage()
11
- pdf_file = streamlit_function.upload_file("pdf", label="Upload Sustainability Report PDF")
 
12
 
13
- available_files = ["Select a pdf file"]
14
- for file in llm_service.get_files():
15
- available_files.append(file.filename)
 
 
16
 
17
- selected_file = st.selectbox("Select a existing file", available_files)
18
 
19
- for key in ["gpt4o_mini_result", "gpt4o_result", "gemini_result", "pdf_file"]:
20
  if key not in st.session_state:
21
  st.session_state[key] = None
22
 
 
 
 
23
  if st.session_state.pdf_file:
24
  with st.container():
25
  col1, col2, col3 = st.columns([5, 5, 5], gap="small")
 
 
26
 
27
  with col1:
28
- if st.button("Generate GPT-4o-min Response"):
29
- with st.spinner("Calling GPT-4o-mini..."):
30
- result = llm_service.extract_emissions_data_as_json("openai","gpt-4o-mini",pdf_file)
31
- # result= openai_model.extract_emissions_data_as_json("openai","gpt-4o-mini",pdf_file)
32
- st.session_state.gpt4o_mini_result = result
33
- if st.session_state.gpt4o_mini_result:
34
- st.write("Extracted Metrics by gpt-4o-mini")
35
- st.json(st.session_state.gpt4o_mini_result)
36
-
37
  with col2:
38
- if st.button("Generate GPT-4o Response"):
39
- with st.spinner("Calling gpt-4o..."):
40
- result= llm_service.extract_emissions_data_as_json("openai","gpt-4o",pdf_file)
41
- st.session_state.gpt4o_result = result
42
- if st.session_state.gpt4o_result:
43
- st.write("Extracted Metrics by gpt-4o")
44
- st.json(st.session_state.gpt4o_result)
 
45
 
46
  with col3:
47
  try:
48
- if st.button("Generate Gemini Response"):
49
- with st.spinner("Calling gemini-1.5-pro-latest..."):
50
- result = llm_service.extract_emissions_data_as_json("gemini","gemini-2.0-flash", st.session_state.pdf_file)
51
- # result = gemini_model.extract_emissions_data_as_json("gemini","gemini-2.0-flash", pdf_file)
52
- st.session_state.gemini_result = result
53
  except ClientError as e:
54
  st.error(f"Gemini API Error: {e}")
55
  logger.error("Error Details:", e.message, e.response)
56
 
57
- if st.session_state.gemini_result:
58
- st.write("Extracted Metrics by gemini-1.5-pro-latest")
59
- st.json(st.session_state.gemini_result)
 
 
60
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ import os
3
+ from application.services import streamlit_function, gemini_model
4
  from google.genai.errors import ClientError
5
  from application.utils import logger
 
6
 
7
  logger = logger.get_logger()
8
 
9
# Gemini models offered side by side, one per column.
MODEL_1 = "gemini-1.5-pro-latest"
MODEL_2 = "gemini-2.0-flash"
MODEL_3 = "gemini-1.5-flash"

# API provider handed to the extraction service for each model above.
API_1 = "gemini"
API_2 = "gemini"
API_3 = "gemini"

# (api, model) pairs in display order; drives both session-state setup and layout.
_MODEL_SPECS = ((API_1, MODEL_1), (API_2, MODEL_2), (API_3, MODEL_3))


def _render_model_column(column, api, model, pdf, file_name):
    """Render one model's "generate" button and its latest result.

    Args:
        column: Streamlit column (from ``st.columns``) to draw into.
        api: API provider name passed to the extraction service.
        model: Model name; also used for the button label, the Excel sheet
            name and the ``"<model>_result"`` session-state key.
        pdf: Uploaded PDF object taken from ``st.session_state.pdf_file``.
        file_name: Upload name without the ``.pdf`` suffix; names the Excel file.
    """
    result_key = f"{model}_result"

    with column:
        try:
            if st.button(f"Generate {model} Response"):
                with st.spinner(f"Calling {model}..."):
                    result = gemini_model.extract_emissions_data_as_json(api, model, pdf)
                    if result:
                        # Only export real results; a failed call yields None and
                        # would otherwise add a junk row to the workbook.
                        streamlit_function.export_results_to_excel(result, model, file_name)
                    else:
                        st.warning(f"No data returned by {model}.")
                    st.session_state[result_key] = result
        except ClientError as e:
            st.error(f"Gemini API Error: {e}")
            # %-style placeholders are required: extra positional args without
            # them make the logging call itself fail while formatting.
            logger.error("Error Details: %s %s", e.message, e.response)

        if st.session_state[result_key]:
            st.write(f"Extracted Metrics by {model}")
            st.json(st.session_state[result_key])


streamlit_function.config_homepage()

pdf_file = streamlit_function.upload_file("pdf", label="Upload Sustainability Report PDF")

# Make sure every key read below exists, so attribute/key access never fails
# on the first run of the script.
for key in [*(f"{model}_result" for _, model in _MODEL_SPECS), "pdf_file", "excel_file"]:
    if key not in st.session_state:
        st.session_state[key] = None

if st.session_state.pdf_file:
    file_name = st.session_state.pdf_file.name.removesuffix(".pdf")

    with st.container():
        columns = st.columns([5, 5, 5], gap="small")
        for column, (api, model) in zip(columns, _MODEL_SPECS):
            _render_model_column(column, api, model, st.session_state.pdf_file, file_name)

    # The export helper writes its workbook under data/; offer it once it exists.
    file_path = f"data/{file_name}.xlsx"

    if os.path.exists(file_path):
        with open(file_path, "rb") as file:
            st.download_button(
                label="Download Excel File",
                data=file,
                file_name=f"{file_name}.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )
application/schemas/response_schema.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  RESPONSE_FORMAT = {
2
  "type": "json_schema",
3
  "json_schema": {
@@ -449,4 +451,64 @@ GEMINI_RESPONSE_FORMAT = {
449
  }
450
  },
451
  "propertyOrdering": ["Company Name", "Greenhouse Gas (GHG) Protocol Parameters"]
452
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel
2
+
3
  RESPONSE_FORMAT = {
4
  "type": "json_schema",
5
  "json_schema": {
 
451
  }
452
  },
453
  "propertyOrdering": ["Company Name", "Greenhouse Gas (GHG) Protocol Parameters"]
454
+ }
455
+
456
+
457
+ class Parameter(BaseModel):
458
+ """
459
+ A generic class to hold details for a sustainability metric.
460
+ """
461
+ synonym: str
462
+ uom: str
463
+ description: str
464
+ value: str
465
+
466
+ class GreenhouseGasGHGProtocolParameters(BaseModel):
467
+ Total_GHG_Emissions: Parameter
468
+ Scope_1_Emissions: Parameter
469
+ Scope_2_Emissions: Parameter
470
+ Scope_3_Emissions: Parameter
471
+ CO2_Emissions: Parameter
472
+ CH4_Emissions: Parameter
473
+ N2O_Emissions: Parameter
474
+ HFC_Emissions: Parameter
475
+ PFC_Emissions: Parameter
476
+ SF6_Emissions: Parameter
477
+ NF3_Emissions: Parameter
478
+ Biogenic_CO2_Emissions: Parameter
479
+ Emissions_Intensity_per_Revenue: Parameter
480
+ Emissions_Intensity_per_Employee: Parameter
481
+ Base_Year_Emissions: Parameter
482
+ Emissions_Reduction_Target: Parameter
483
+ Emissions_Reduction_Achieved: Parameter
484
+ Energy_Consumption: Parameter
485
+ Renewable_Energy_Consumption: Parameter
486
+ Non_Renewable_Energy_Consumption: Parameter
487
+ Energy_Intensity_per_Revenue: Parameter
488
+ Energy_Intensity_per_Employee: Parameter
489
+ Fuel_Consumption: Parameter
490
+ Electricity_Consumption: Parameter
491
+ Heat_Consumption: Parameter
492
+ Steam_Consumption: Parameter
493
+ Cooling_Consumption: Parameter
494
+ Purchased_Goods_and_Services_Emissions: Parameter
495
+ Capital_Goods_Emissions: Parameter
496
+ Fuel_and_Energy_Related_Activities_Emissions: Parameter
497
+ Upstream_Transportation_and_Distribution_Emissions: Parameter
498
+ Waste_Generated_in_Operations_Emissions: Parameter
499
+ Business_Travel_Emissions: Parameter
500
+ Employee_Commuting_Emissions: Parameter
501
+ Upstream_Leased_Assets_Emissions: Parameter
502
+ # Downstream_Transportation_and_Distribution_Emissions: Parameter
503
+ # Processing_of_Sold_Products_Emissions: Parameter
504
+ # Use_of_Sold_Products_Emissions: Parameter
505
+ # End_of_Life_Treatment_of_Sold_Products_Emissions: Parameter
506
+ # Downstream_Leased_Assets_Emissions: Parameter
507
+ # Franchises_Emissions: Parameter
508
+ # Investments_Emissions: Parameter
509
+ # Carbon_Offsets_Purchased: Parameter
510
+ # Net_GHG_Emissions: Parameter
511
+ # Carbon_Sequestration: Parameter
512
+
513
+ class EmissionData(BaseModel):
514
+ GreenhouseGasGHGProtocolParameters: GreenhouseGasGHGProtocolParameters
application/services/gemini_model.py CHANGED
@@ -1,81 +1,138 @@
1
  import os
2
  import json
 
 
3
  from google import genai
4
  from google.genai import types
5
- from pydantic import BaseModel
6
- from typing import Optional, Union, BinaryIO
7
- from application.utils import logger
8
  from application.schemas.response_schema import GEMINI_RESPONSE_FORMAT
 
 
 
9
 
10
- logger = logger.get_logger()
11
 
12
  PROMPT = (
13
- """You are a PDF parsing agent.
14
- Your job is to extract from a company’s sustainability or ESG report in PDF format:
15
- If the values are not found in the document, please return json null for that value.
16
- """
17
  )
18
 
19
- class Parameter(BaseModel):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  """
21
- A generic class to hold details for a sustainability metric.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  """
23
- synonym: str
24
- uom: str
25
- description: str
26
- value: str
27
-
28
- class GreenhouseGasGHGProtocolParameters(BaseModel):
29
- Total_GHG_Emissions: Parameter
30
- Scope_1_Emissions: Parameter
31
- Scope_2_Emissions: Parameter
32
- Scope_3_Emissions: Parameter
33
- CO2_Emissions: Parameter
34
- CH4_Emissions: Parameter
35
- N2O_Emissions: Parameter
36
- HFC_Emissions: Parameter
37
- PFC_Emissions: Parameter
38
- SF6_Emissions: Parameter
39
- NF3_Emissions: Parameter
40
- Biogenic_CO2_Emissions: Parameter
41
- Emissions_Intensity_per_Revenue: Parameter
42
- Emissions_Intensity_per_Employee: Parameter
43
- Base_Year_Emissions: Parameter
44
- Emissions_Reduction_Target: Parameter
45
- Emissions_Reduction_Achieved: Parameter
46
- Energy_Consumption: Parameter
47
- Renewable_Energy_Consumption: Parameter
48
- Non_Renewable_Energy_Consumption: Parameter
49
- Energy_Intensity_per_Revenue: Parameter
50
- Energy_Intensity_per_Employee: Parameter
51
- Fuel_Consumption: Parameter
52
- Electricity_Consumption: Parameter
53
- Heat_Consumption: Parameter
54
- Steam_Consumption: Parameter
55
- Cooling_Consumption: Parameter
56
- Purchased_Goods_and_Services_Emissions: Parameter
57
- Capital_Goods_Emissions: Parameter
58
- Fuel_and_Energy_Related_Activities_Emissions: Parameter
59
- Upstream_Transportation_and_Distribution_Emissions: Parameter
60
- Waste_Generated_in_Operations_Emissions: Parameter
61
- Business_Travel_Emissions: Parameter
62
- Employee_Commuting_Emissions: Parameter
63
- Upstream_Leased_Assets_Emissions: Parameter
64
- # Downstream_Transportation_and_Distribution_Emissions: Parameter
65
- # Processing_of_Sold_Products_Emissions: Parameter
66
- # Use_of_Sold_Products_Emissions: Parameter
67
- # End_of_Life_Treatment_of_Sold_Products_Emissions: Parameter
68
- # Downstream_Leased_Assets_Emissions: Parameter
69
- # Franchises_Emissions: Parameter
70
- # Investments_Emissions: Parameter
71
- # Carbon_Offsets_Purchased: Parameter
72
- # Net_GHG_Emissions: Parameter
73
- # Carbon_Sequestration: Parameter
74
-
75
- class EmissionData(BaseModel):
76
- GreenhouseGasGHGProtocolParameters: GreenhouseGasGHGProtocolParameters
77
-
78
- # print(json.dumps(EmissionData.model_json_schema(), indent=2))
79
 
80
  def extract_emissions_data_as_json(
81
  api: str,
@@ -83,34 +140,33 @@ def extract_emissions_data_as_json(
83
  file_input: Union[BinaryIO, bytes]
84
  ) -> Optional[dict]:
85
  """
86
- Extract ESG data from PDF using OpenAI or Gemini APIs.
87
 
88
  Args:
89
- api: 'openai' or 'gemini'
90
- model: Model name (e.g. gpt-4o, gemini-pro)
91
- file_input: File-like object or bytes of the PDF.
92
 
93
  Returns:
94
- Parsed ESG data as dict or None if failed.
95
  """
96
  try:
 
 
 
97
 
98
- client = genai.Client(api_key=os.getenv("gemini_api_key"))
99
-
100
- file_bytes = file_input.read()
101
- logger.info("[Gemini] Sending content for generation...")
102
 
103
  response = client.models.generate_content(
104
  model=model,
105
- contents=[
106
- types.Part.from_bytes(data=file_bytes, mime_type="application/pdf"),
107
- PROMPT
108
- ],
109
  config={
110
  'response_mime_type': 'application/json',
111
- 'response_schema': GEMINI_RESPONSE_FORMAT,
112
  }
113
  )
 
114
  logger.info("[Gemini] Response received.")
115
  try:
116
  return json.loads(response.text)
@@ -119,181 +175,5 @@ def extract_emissions_data_as_json(
119
  return {"raw_response": response.text}
120
 
121
  except Exception as e:
122
- logger.exception(f"Error during ESG data extraction.{e}")
123
- return None
124
-
125
- # import os
126
- # from google import genai
127
- # from pydantic import BaseModel, Field, ValidationError
128
- # from dotenv import load_dotenv
129
- # from typing import Optional
130
- # from google.genai import types
131
-
132
- # load_dotenv()
133
- # client = genai.Client(api_key=os.getenv("gemini_api_key"))
134
-
135
- # schema= """{
136
- # "parameters": [
137
- # {
138
- # "parameter": "Total GHG Emissions",
139
- # "dataType": "Numeric",
140
- # "synonyms": ["Carbon Footprint"],
141
- # "uom": "Metric Tons CO₂e",
142
- # "description": "Total greenhouse gases emitted by the organization."
143
- # },
144
- # {
145
- # "parameter": "Scope 1 Emissions",
146
- # "dataType": "Numeric",
147
- # "synonyms": ["Direct Emissions"],
148
- # "uom": "Metric Tons CO₂e",
149
- # "description": "Direct GHG emissions from owned or controlled sources."
150
- # },
151
- # {
152
- # "parameter": "Scope 2 Emissions",
153
- # "dataType": "Numeric",
154
- # "synonyms": ["Indirect Energy Emissions"],
155
- # "uom": "Metric Tons CO₂e",
156
- # "description": "Indirect GHG emissions from the consumption of purchased electricity, steam, heating, and cooling."
157
- # },
158
- # {
159
- # "parameter": "Scope 3 Emissions",
160
- # "dataType": "Numeric",
161
- # "synonyms": ["Value Chain Emissions"],
162
- # "uom": "Metric Tons CO₂e",
163
- # "description": "Other indirect emissions occurring in the value chain, including both upstream and downstream emissions."
164
- # },
165
- # {
166
- # "parameter": "CO₂ Emissions",
167
- # "dataType": "Numeric",
168
- # "synonyms": ["Carbon Emissions"],
169
- # "uom": "Metric Tons CO₂",
170
- # "description": "Emissions of carbon dioxide."
171
- # },
172
- # {
173
- # "parameter": "CH₄ Emissions",
174
- # "dataType": "Numeric",
175
- # "synonyms": ["Methane Emissions"],
176
- # "uom": "Metric Tons CH₄",
177
- # "description": "Emissions of methane."
178
- # },
179
- # {
180
- # "parameter": "N₂O Emissions",
181
- # "dataType": "Numeric",
182
- # "synonyms": ["Nitrous Oxide Emissions"],
183
- # "uom": "Metric Tons N₂O",
184
- # "description": "Emissions of nitrous oxide."
185
- # },
186
- # {
187
- # "parameter": "HFC Emissions",
188
- # "dataType": "Numeric",
189
- # "synonyms": ["Hydrofluorocarbon Emissions"],
190
- # "uom": "Metric Tons HFCs",
191
- # "description": "Emissions of hydrofluorocarbons."
192
- # },
193
- # {
194
- # "parameter": "PFC Emissions",
195
- # "dataType": "Numeric",
196
- # "synonyms": ["Perfluorocarbon Emissions"],
197
- # "uom": "Metric Tons PFCs",
198
- # "description": "Emissions of perfluorocarbons."
199
- # },
200
- # {
201
- # "parameter": "SF₆ Emissions",
202
- # "dataType": "Numeric",
203
- # "synonyms": ["Sulfur Hexafluoride Emissions"],
204
- # "uom": "Metric Tons SF₆",
205
- # "description": "Emissions of sulfur hexafluoride."
206
- # },
207
- # {
208
- # "parameter": "NF₃ Emissions",
209
- # "dataType": "Numeric",
210
- # "synonyms": ["Nitrogen Trifluoride Emissions"],
211
- # "uom": "Metric Tons NF₃",
212
- # "description": "Emissions of nitrogen trifluoride."
213
- # },
214
- # {
215
- # "parameter": "Biogenic CO₂ Emissions",
216
- # "dataType": "Numeric",
217
- # "synonyms": ["Biogenic Carbon Emissions"],
218
- # "uom": "Metric Tons CO₂",
219
- # "description": "CO₂ emissions from biological sources."
220
- # },
221
- # {
222
- # "parameter": "Emissions Intensity per Revenue",
223
- # "dataType": "Numeric",
224
- # "synonyms": ["Carbon Intensity"],
225
- # "uom": "Metric Tons CO₂e / Revenue",
226
- # "description": "GHG emissions per unit of revenue."
227
- # },
228
- # {
229
- # "parameter": "Emissions Intensity per Employee",
230
- # "dataType": "Numeric",
231
- # "synonyms": ["Emissions per Employee"],
232
- # "uom": "Metric Tons CO₂e / Employee",
233
- # "description": "GHG emissions per employee."
234
- # },
235
- # {
236
- # "parameter": "Base Year Emissions",
237
- # "dataType": "Numeric",
238
- # "synonyms": ["Baseline Emissions"],
239
- # "uom": "Metric Tons CO₂e",
240
- # "description": "GHG emissions in the base year for comparison."
241
- # },
242
- # {
243
- # "parameter": "Emissions Reduction Target",
244
- # "dataType": "Numeric",
245
- # "synonyms": ["Emission Reduction Goal"],
246
- # "uom": "Percentage (%)",
247
- # "description": "Targeted percentage reduction in GHG emissions."
248
- # },
249
- # {
250
- # "parameter": "Emissions Reduction Achieved",
251
- # "dataType": "Numeric",
252
- # "synonyms": ["Emission Reduction Accomplished"],
253
- # "uom": "Percentage (%)",
254
- # "description": "Actual percentage reduction in GHG emissions achieved."
255
- # },
256
- # {
257
- # "parameter": "Energy Consumption",
258
- # "dataType": "Numeric",
259
- # "synonyms": ["Energy Use"],
260
- # "uom": "MWh or GJ",
261
- # "description": "Total energy consumed by the organization."
262
- # },
263
- # {
264
- # "parameter": "Renewable Energy Consumption",
265
- # "dataType": "Numeric",
266
- # "synonyms": ["Green Energy Use"],
267
- # "uom": "MWh or GJ",
268
- # "description": "Amount of energy consumed from renewable sources."
269
- # },
270
- # {
271
- # "parameter": "Non-Renewable Energy Consumption",
272
- # "dataType": "Numeric",
273
- # "synonyms": ["Fossil Energy Use"],
274
- # "uom": "MWh or GJ",
275
- # "description": "Amount of energy consumed from non-renewable sources."
276
- # },
277
- # {
278
- # "parameter": "Carbon Offsets Purchased",
279
- # "dataType": "Numeric",
280
- # "synonyms": ["Carbon Credits"],
281
- # "uom": "Metric Tons CO₂e",
282
- # "description": "Amount of carbon offsets purchased."
283
- # },
284
- # {
285
- # "parameter": "Net GHG Emissions",
286
- # "dataType": "Numeric",
287
- # "synonyms": ["Net Carbon Emissions"],
288
- # "uom": "Metric Tons CO₂e",
289
- # "description": "GHG emissions after accounting for offsets."
290
- # },
291
- # {
292
- # "parameter": "Carbon Sequestration",
293
- # "dataType": "Numeric",
294
- # "synonyms": ["Carbon Capture"],
295
- # "uom": "Metric Tons CO₂e",
296
- # "description": "Amount of CO₂ sequestered or captured."
297
- # }
298
- # ]
299
- # }"""
 
1
  import os
2
  import json
3
+ import re
4
+ from typing import Optional, Dict, Union, IO, List, BinaryIO
5
  from google import genai
6
  from google.genai import types
 
 
 
7
  from application.schemas.response_schema import GEMINI_RESPONSE_FORMAT
8
+ from application.utils import logger
9
+
10
+ logger=logger.get_logger()
11
 
12
+ client = genai.Client(api_key=os.getenv("gemini_api_key"))
13
 
14
  PROMPT = (
15
+ """You are a PDF parsing agent. Your job is to extract GHG Protocol Parameters
16
+ and ESG (Environmental, Social, Governance) Data from a company’s sustainability
17
+ or ESG report in PDF format."""
 
18
  )
19
 
20
def sanitize_file_name(name: str, max_length: int = 40) -> str:
    """
    Sanitizes a file name to comply with Gemini API naming rules:
    - Lowercase only
    - Alphanumeric characters and dashes (`-`) allowed
    - Cannot start or end with a dash
    - Max length: 40 characters

    Args:
        name (str): The original file name (without extension).
        max_length (int, optional): Maximum allowed characters (default: 40).
            Must be a positive integer.

    Returns:
        str: Sanitized file name.

    Raises:
        ValueError: If ``name`` is not a non-empty string, if ``max_length`` is
            not a positive integer, or if the sanitized name is empty after
            cleaning.
    """
    if not name or not isinstance(name, str):
        raise ValueError("Invalid file name: must be a non-empty string.")

    # A negative limit would make the slice below drop characters from the end
    # instead of capping the length, so reject it up front (bool is excluded
    # because it is an int subclass).
    if isinstance(max_length, bool) or not isinstance(max_length, int) or max_length < 1:
        raise ValueError("max_length must be a positive integer.")

    # Convert to lowercase and collapse every run of invalid characters into one dash
    name = re.sub(r'[^a-z0-9]+', '-', name.lower())

    # Remove leading/trailing dashes, truncate, then strip again because the
    # cut may have landed right after a dash
    name = name.strip('-')[:max_length].rstrip('-')

    if not name:
        raise ValueError("Sanitized file name is empty or invalid after cleanup.")

    return name
51
+
52
def get_files() -> List[str]:
    """
    Lists the names of the files currently stored on Gemini.

    Returns:
        List[str]: The ``name`` attribute of every entry returned by
        ``client.files.list()``.
    """
    return [entry.name for entry in client.files.list()]
61
+
62
+
63
def delete_files(file_names: Union[str, List[str]]) -> None:
    """
    Deletes specified files from Gemini.

    Names are matched against the names returned by ``get_files()``. Those
    carry a ``files/`` prefix (``upload_file`` looks files up that way), so a
    name given without the prefix is normalized before the lookup.

    Args:
        file_names (Union[str, List[str]]): File name or list of names to delete,
            with or without the ``files/`` prefix.
    """
    if not file_names:
        logger.warning("No file names provided for deletion.")
        return

    if isinstance(file_names, str):
        file_names = [file_names]

    # Fetch the listing once and use a set so each membership test is O(1).
    existing_files = set(get_files())

    for name in file_names:
        resource_name = name if name.startswith("files/") else f"files/{name}"
        logger.info(f"Attempting to delete file: {resource_name}")
        if resource_name in existing_files:
            client.files.delete(name=resource_name)
            logger.info(f"Deleted file: {resource_name}")
        else:
            logger.warning(f"File not found: {resource_name}")
86
+
87
def upload_file(
    file: Union[str, IO[bytes]],
    file_name: Optional[str] = None,
    config: Optional[Dict[str, str]] = None
) -> Optional[types.File]:
    """
    Uploads a file to the Gemini API, handling both file paths and binary streams.

    The remote name is derived from ``file_name`` (extension dropped, then
    passed through ``sanitize_file_name``). If a file with that name is already
    listed on Gemini, it is returned instead of uploading again.

    Args:
        file (Union[str, IO[bytes]]): File path or binary file object (e.g., from Streamlit).
        file_name (Optional[str]): Name for the file. If None, attempts to use file.name.
        config (Optional[Dict[str, str]]): Extra config like 'mime_type'. The dict is
            not modified; 'mime_type' defaults to 'application/pdf' when absent and
            'name' is always set to the sanitized name.

    Returns:
        Optional[types.File]: The uploaded Gemini file object, or existing one if already uploaded.

    Raises:
        Exception: If upload fails (the error is logged and re-raised).
    """
    try:
        if not file_name:
            if isinstance(file, str):
                file_name = os.path.basename(file)
            elif hasattr(file, "name"):
                file_name = os.path.basename(file.name)
            else:
                raise ValueError("file_name must be provided if file has no 'name' attribute.")

        sanitized_name = sanitize_file_name(os.path.splitext(file_name)[0])

        # Work on a copy so the caller's dict is never mutated, and keep a
        # caller-supplied mime_type instead of overwriting it.
        upload_config = dict(config or {})
        upload_config.setdefault("mime_type", "application/pdf")
        upload_config["name"] = sanitized_name
        gemini_file_key = f"files/{sanitized_name}"

        if gemini_file_key in get_files():
            logger.info(f"File already exists on Gemini: {gemini_file_key}")
            return client.files.get(name=gemini_file_key)

        logger.info(f"Uploading file to Gemini: {gemini_file_key}")

        if isinstance(file, str):
            with open(file, "rb") as f:
                return client.files.upload(file=f, config=upload_config)

        # A stream that was read earlier would upload only the remaining
        # bytes; rewind it when the stream supports seeking.
        seekable = getattr(file, "seekable", None)
        if callable(seekable) and seekable():
            file.seek(0)
        return client.files.upload(file=file, config=upload_config)

    except Exception as e:
        logger.error(f"Failed to upload file '{file_name}': {e}")
        raise
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
  def extract_emissions_data_as_json(
138
  api: str,
 
140
  file_input: Union[BinaryIO, bytes]
141
  ) -> Optional[dict]:
142
  """
143
+ Extracts ESG data from a PDF using the Gemini API.
144
 
145
  Args:
146
+ api (str): API provider (must be 'gemini').
147
+ model (str): Model name (e.g., 'gemini-pro').
148
+ file_input (Union[BinaryIO, bytes]): File object or byte stream.
149
 
150
  Returns:
151
+ Optional[dict]: Parsed JSON response or raw text if parsing fails.
152
  """
153
  try:
154
+ if api.lower() != "gemini":
155
+ logger.error(f"Unsupported API: {api}")
156
+ return None
157
 
158
+ file_name = file_input.name if hasattr(file_input, 'name') else "uploaded_file.pdf"
159
+ uploaded_file = upload_file(file=file_input, file_name=file_name)
 
 
160
 
161
  response = client.models.generate_content(
162
  model=model,
163
+ contents=[uploaded_file, PROMPT],
 
 
 
164
  config={
165
  'response_mime_type': 'application/json',
166
+ 'response_schema': GEMINI_RESPONSE_FORMAT
167
  }
168
  )
169
+
170
  logger.info("[Gemini] Response received.")
171
  try:
172
  return json.loads(response.text)
 
175
  return {"raw_response": response.text}
176
 
177
  except Exception as e:
178
+ logger.exception("Error during ESG data extraction.")
179
+ return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
application/services/streamlit_function.py CHANGED
@@ -1,5 +1,9 @@
1
  import streamlit as st
2
  from typing import Union, List
 
 
 
 
3
  from application.utils import logger
4
 
5
  logger = logger.get_logger()
@@ -75,15 +79,49 @@ def upload_file(
75
  st.session_state.pdf_file = uploaded_files
76
  return uploaded_files
77
 
78
- # def extract_text_from_pdf(file) -> str:
79
- # """
80
- # Extracts and returns the full text content from a PDF file.
81
-
82
- # :param file: PDF file object (BytesIO or UploadedFile from Streamlit)
83
- # :return: Extracted text as a string
84
- # """
85
- # text = ""
86
- # with fitz.open(stream=file.read(), filetype="pdf") as doc:
87
- # for page in doc:
88
- # text += page.get_text()
89
- # return text.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  from typing import Union, List
3
+ import pandas as pd
4
+ from io import BytesIO
5
+ import json
6
+ import os
7
  from application.utils import logger
8
 
9
  logger = logger.get_logger()
 
79
  st.session_state.pdf_file = uploaded_files
80
  return uploaded_files
81
 
82
# Characters openpyxl refuses in a worksheet title, mapped to "_".
_INVALID_SHEET_CHARS = str.maketrans({ch: "_" for ch in "[]:*?/\\"})
# Longest worksheet title Excel accepts.
_MAX_SHEET_NAME_LENGTH = 31


def _safe_sheet_name(name) -> str:
    """Return ``name`` as a valid worksheet title (bad chars replaced, length capped)."""
    cleaned = str(name).translate(_INVALID_SHEET_CHARS)[:_MAX_SHEET_NAME_LENGTH]
    return cleaned or "Sheet1"


def export_results_to_excel(results: dict, sheet_name: str, filename: str = "output.xlsx") -> BytesIO:
    """
    Converts a dictionary result into a formatted Excel file.
    Appends to a file in the 'data/' folder if it already exists,
    and returns an in-memory Excel file for download.

    Args:
        results (dict): The data to export. Nested keys are flattened with '_'.
        sheet_name (str): The sheet name to write to. Characters that are not
            allowed in worksheet titles are replaced with '_' and the name is
            capped at 31 characters.
        filename (str): The Excel file name (with or without '.xlsx'). Only the
            final path component is used, so the file always lands in 'data/'.

    Returns:
        BytesIO: In-memory Excel file for Streamlit download, containing only
        the rows produced by this call.
    """
    try:
        df = pd.json_normalize(results, sep='_')
        df.replace({None: "", "NULL": ""}, inplace=True)
    except Exception as e:
        # Best effort: keep the export working and record why parsing failed.
        df = pd.DataFrame([{"error": f"Could not parse result: {str(e)}"}])

    sheet_name = _safe_sheet_name(sheet_name)

    # The name originates from an uploaded file, so drop any directory part
    # (either separator style) to keep the write inside 'data/'.
    filename = os.path.basename(str(filename).replace("\\", "/")) or "output"

    # Ensure correct file extension and path
    if not filename.endswith(".xlsx"):
        filename = f"{filename}.xlsx"
    full_path = os.path.join("data", filename)

    os.makedirs("data", exist_ok=True)  # Ensure the folder exists

    # Save to physical file
    if os.path.exists(full_path):
        with pd.ExcelWriter(full_path, engine="openpyxl", mode="a", if_sheet_exists="overlay") as writer:
            book = writer.book
            if sheet_name in book.sheetnames:
                # Continue below the rows already present in this sheet.
                start_row = book[sheet_name].max_row
            else:
                start_row = 0
            # The header is written only when starting a fresh sheet.
            df.to_excel(writer, sheet_name=sheet_name, index=False, header=start_row == 0, startrow=start_row)
    else:
        df.to_excel(full_path, index=False, engine="openpyxl", sheet_name=sheet_name)

    # Prepare in-memory Excel for download
    output_stream = BytesIO()
    with pd.ExcelWriter(output_stream, engine="openpyxl") as writer:
        df.to_excel(writer, index=False, sheet_name=sheet_name)
    output_stream.seek(0)

    return output_stream
application/services/supabase_service.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from datetime import datetime
4
+ from supabase import create_client, StorageException
5
+ from utils import logger
6
+ from dotenv import load_dotenv
7
+
8
+ # Logger Initialization
9
+ logger = logger.get_logger()
10
+
11
+ # Load Environment Variables
12
+ load_dotenv()
13
+ SUPABASE_URL = os.getenv('SUPABASE_URL')
14
+ SUPABASE_KEY = os.getenv('SUPABASE_KEY')
15
+ SUPABASE_BUCKET = os.getenv('SUPABASE_BUCKET')
16
+ LLM_MODEL_NAME = os.getenv('LLM_MODEL_NAME')
17
+ BUCKET_FOLDER = "chat-history"
18
+
19
+ # Supabase Client Initialization
20
+ supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
requirements.txt CHANGED
@@ -4,6 +4,6 @@ dotenv
4
  google
5
  google.genai
6
  google-generativeai
7
- pymupdf
8
- openpyxl
9
- pandas
 
4
  google
5
  google.genai
6
  google-generativeai
7
+ pandas
8
+ supabase
9
+ openpyxl