Vela committed on
Commit
22481bd
·
1 Parent(s): d1ca23a

Added multiple file upload functionality

Browse files
.gitignore CHANGED
@@ -2,4 +2,5 @@
2
  .env
3
  data
4
  __pycache__/
5
- logs/
 
 
2
  .env
3
  data
4
  __pycache__/
5
+ logs/
6
+ test.py
app.py CHANGED
@@ -3,8 +3,20 @@ import os
3
  from application.services import streamlit_function, gemini_model
4
  from google.genai.errors import ClientError
5
  from application.utils import logger
 
 
 
 
 
 
 
6
 
7
  logger = logger.get_logger()
 
 
 
 
 
8
 
9
  MODEL_1 = "gemini-1.5-pro-latest"
10
  MODEL_2 = "gemini-2.0-flash"
@@ -14,8 +26,6 @@ API_1 = "gemini"
14
  API_2 = "gemini"
15
  API_3 = "gemini"
16
 
17
- streamlit_function.config_homepage()
18
-
19
  pdf_file = streamlit_function.upload_file("pdf", label="Upload Sustainability Report PDF")
20
 
21
  for key in [f"{MODEL_1}_result", f"{MODEL_2}_result", f"{MODEL_3}_result", "pdf_file"]:
@@ -28,13 +38,13 @@ if "excel_file" not in st.session_state:
28
  if st.session_state.pdf_file:
29
  with st.container():
30
  col1, col2, col3 = st.columns([5, 5, 5], gap="small")
31
- file_name = st.session_state.pdf_file.name.removesuffix(".pdf")
32
  excel_file=None
33
 
34
  with col1:
35
  if st.button(f"Generate {MODEL_1} Response"):
36
  with st.spinner(f"Calling {MODEL_1}..."):
37
- result = gemini_model.extract_emissions_data_as_json(API_1 , MODEL_1, st.session_state.pdf_file)
38
  excel_file = streamlit_function.export_results_to_excel(result, MODEL_1, file_name)
39
  st.session_state[f"{MODEL_1}_result"] = result
40
  if st.session_state[f"{MODEL_1}_result"]:
@@ -44,7 +54,7 @@ if st.session_state.pdf_file:
44
  with col2:
45
  if st.button(f"Generate {MODEL_2} Response"):
46
  with st.spinner(f"Calling {MODEL_2}..."):
47
- result = gemini_model.extract_emissions_data_as_json(API_2, MODEL_2, st.session_state.pdf_file)
48
  excel_file = streamlit_function.export_results_to_excel(result, MODEL_2, file_name)
49
  st.session_state[f"{MODEL_2}_result"] = result
50
  if st.session_state[f"{MODEL_2}_result"]:
@@ -55,7 +65,7 @@ if st.session_state.pdf_file:
55
  try:
56
  if st.button(f"Generate {MODEL_3} Response"):
57
  with st.spinner(f"Calling {MODEL_3}..."):
58
- result = gemini_model.extract_emissions_data_as_json(API_3, MODEL_3, st.session_state.pdf_file)
59
  excel_file = streamlit_function.export_results_to_excel(result, MODEL_3, file_name)
60
  st.session_state[f"{MODEL_3}_result"] = result
61
  except ClientError as e:
@@ -75,4 +85,4 @@ if st.session_state.pdf_file:
75
  data=file,
76
  file_name=f"{file_name}.xlsx",
77
  mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
78
- )
 
3
  from application.services import streamlit_function, gemini_model
4
  from google.genai.errors import ClientError
5
  from application.utils import logger
6
+ from application.schemas.response_schema import (
7
+ GEMINI_GHG_PARAMETERS, GEMINI_ENVIRONMENTAL_PARAMETERS_CSRD,
8
+ GEMINI_ENVIRONMENT_PARAMETERS, GEMINI_SOCIAL_PARAMETERS,
9
+ GEMINI_GOVERNANCE_PARAMETERS, GEMINI_MATERIALITY_PARAMETERS,
10
+ GEMINI_NET_ZERO_INTERVENTION_PARAMETERS, FULL_RESPONSE_SCHEMA
11
+ )
12
+ import test
13
 
14
  logger = logger.get_logger()
15
+ streamlit_function.config_homepage()
16
+ st.title("Sustainability Report Analyzer")
17
+ st.write("Upload your sustainability report PDF and generate insights using different models.")
18
+
19
+ MODEL = ["gemini-1.5-pro-latest", "gemini-2.0-flash", "gemini-1.5-flash", "gemini-2.5-exp"]
20
 
21
  MODEL_1 = "gemini-1.5-pro-latest"
22
  MODEL_2 = "gemini-2.0-flash"
 
26
  API_2 = "gemini"
27
  API_3 = "gemini"
28
 
 
 
29
  pdf_file = streamlit_function.upload_file("pdf", label="Upload Sustainability Report PDF")
30
 
31
  for key in [f"{MODEL_1}_result", f"{MODEL_2}_result", f"{MODEL_3}_result", "pdf_file"]:
 
38
  if st.session_state.pdf_file:
39
  with st.container():
40
  col1, col2, col3 = st.columns([5, 5, 5], gap="small")
41
+ file_name = st.session_state.pdf_file[0].name.removesuffix(".pdf")
42
  excel_file=None
43
 
44
  with col1:
45
  if st.button(f"Generate {MODEL_1} Response"):
46
  with st.spinner(f"Calling {MODEL_1}..."):
47
+ result = gemini_model.extract_emissions_data_as_json(API_1 , MODEL_1, st.session_state.pdf_file[0],FULL_RESPONSE_SCHEMA)
48
  excel_file = streamlit_function.export_results_to_excel(result, MODEL_1, file_name)
49
  st.session_state[f"{MODEL_1}_result"] = result
50
  if st.session_state[f"{MODEL_1}_result"]:
 
54
  with col2:
55
  if st.button(f"Generate {MODEL_2} Response"):
56
  with st.spinner(f"Calling {MODEL_2}..."):
57
+ result = gemini_model.extract_emissions_data_as_json(API_2, MODEL_2, st.session_state.pdf_file[0],FULL_RESPONSE_SCHEMA)
58
  excel_file = streamlit_function.export_results_to_excel(result, MODEL_2, file_name)
59
  st.session_state[f"{MODEL_2}_result"] = result
60
  if st.session_state[f"{MODEL_2}_result"]:
 
65
  try:
66
  if st.button(f"Generate {MODEL_3} Response"):
67
  with st.spinner(f"Calling {MODEL_3}..."):
68
+ result = gemini_model.extract_emissions_data_as_json(API_3, MODEL_3, st.session_state.pdf_file[0], FULL_RESPONSE_SCHEMA)
69
  excel_file = streamlit_function.export_results_to_excel(result, MODEL_3, file_name)
70
  st.session_state[f"{MODEL_3}_result"] = result
71
  except ClientError as e:
 
85
  data=file,
86
  file_name=f"{file_name}.xlsx",
87
  mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
88
+ )
application/schemas/response_schema.py CHANGED
The diff for this file is too large to render. See raw diff
 
application/services/gemini_model.py CHANGED
@@ -4,7 +4,6 @@ import re
4
  from typing import Optional, Dict, Union, IO, List, BinaryIO
5
  from google import genai
6
  from google.genai import types
7
- from application.schemas.response_schema import GEMINI_RESPONSE_FORMAT
8
  from application.utils import logger
9
 
10
  logger=logger.get_logger()
@@ -14,7 +13,20 @@ client = genai.Client(api_key=os.getenv("gemini_api_key"))
14
  PROMPT = (
15
  """You are a PDF parsing agent. Your job is to extract GHG Protocol Parameters
16
  and ESG (Environmental, Social, Governance) Data from a company’s sustainability
17
- or ESG report in PDF format."""
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  )
19
 
20
  def sanitize_file_name(name: str, max_length: int = 40) -> str:
@@ -59,7 +71,6 @@ def get_files() -> List[str]:
59
  files = client.files.list()
60
  return [file.name for file in files]
61
 
62
-
63
  def delete_files(file_names: Union[str, List[str]]) -> None:
64
  """
65
  Deletes specified files from Gemini.
@@ -137,7 +148,8 @@ def upload_file(
137
  def extract_emissions_data_as_json(
138
  api: str,
139
  model: str,
140
- file_input: Union[BinaryIO, bytes]
 
141
  ) -> Optional[dict]:
142
  """
143
  Extracts ESG data from a PDF using the Gemini API.
@@ -163,9 +175,15 @@ def extract_emissions_data_as_json(
163
  contents=[uploaded_file, PROMPT],
164
  config={
165
  'response_mime_type': 'application/json',
166
- 'response_schema': GEMINI_RESPONSE_FORMAT
167
- }
168
  )
 
 
 
 
 
 
169
 
170
  logger.info("[Gemini] Response received.")
171
  try:
 
4
  from typing import Optional, Dict, Union, IO, List, BinaryIO
5
  from google import genai
6
  from google.genai import types
 
7
  from application.utils import logger
8
 
9
  logger=logger.get_logger()
 
13
  PROMPT = (
14
  """You are a PDF parsing agent. Your job is to extract GHG Protocol Parameters
15
  and ESG (Environmental, Social, Governance) Data from a company’s sustainability
16
+ or ESG report in PDF format.
17
+
18
+ You must extract the data based on a predefined response schema. It is critical
19
+ that you return all keys specified in the schema, even if the value is not present
20
+ or not found in the document. If a value is missing or unavailable, return a suitable
21
+ placeholder according to the format used
22
+ in the schema.
23
+
24
+ Your output should strictly follow the structure of the schema, ensuring completeness
25
+ and consistency for downstream processing.
26
+
27
+ Be precise in extracting values and identifying relevant context from the PDF. Use
28
+ surrounding text or tables to identify the most likely match for each field.
29
+ """
30
  )
31
 
32
  def sanitize_file_name(name: str, max_length: int = 40) -> str:
 
71
  files = client.files.list()
72
  return [file.name for file in files]
73
 
 
74
  def delete_files(file_names: Union[str, List[str]]) -> None:
75
  """
76
  Deletes specified files from Gemini.
 
148
  def extract_emissions_data_as_json(
149
  api: str,
150
  model: str,
151
+ file_input: Union[BinaryIO, bytes],
152
+ response_schema
153
  ) -> Optional[dict]:
154
  """
155
  Extracts ESG data from a PDF using the Gemini API.
 
175
  contents=[uploaded_file, PROMPT],
176
  config={
177
  'response_mime_type': 'application/json',
178
+ 'response_schema': response_schema,
179
+ },
180
  )
181
+ if hasattr(response, 'usage_metadata'):
182
+ logger.info(f"Input tokens: {response.usage_metadata.prompt_token_count}")
183
+ logger.info(f"Output tokens: {response.usage_metadata.candidates_token_count}")
184
+ logger.info(f"Total tokens: {response.usage_metadata.total_token_count}")
185
+ else:
186
+ logger.info("Token usage metadata not available in response")
187
 
188
  logger.info("[Gemini] Response received.")
189
  try:
application/services/llm_service.py CHANGED
@@ -151,8 +151,6 @@ def extract_emissions_data_as_json(
151
  logger.exception("Error during ESG data extraction.")
152
  return None
153
 
154
- # --- Debug Helper ---
155
-
156
  def list_all_files():
157
  """Lists all files currently uploaded to OpenAI."""
158
  try:
@@ -160,189 +158,4 @@ def list_all_files():
160
  for file in files:
161
  logger.info(f"File ID: {file.id}, Name: {file.filename}, Size: {file.bytes} bytes")
162
  except Exception as e:
163
- logger.error(f"Failed to list files: {e}")
164
-
165
-
166
-
167
-
168
-
169
-
170
-
171
-
172
-
173
-
174
-
175
-
176
-
177
-
178
-
179
- # import os
180
- # import json
181
- # from google import genai
182
- # from google.genai import types
183
- # from openai import OpenAI
184
- # from dotenv import load_dotenv
185
- # from application.utils import logger
186
- # import pandas as pd
187
- # import openpyxl
188
-
189
- # load_dotenv()
190
- # logger = logger.get_logger()
191
-
192
-
193
-
194
- # def load_schema_from_excel(file_path) -> str:
195
- # df = pd.read_excel(file_path,engine='openpyxl')
196
-
197
- # schema_lines = ["Schema fields and expected format:\n"]
198
- # for _, row in df.iterrows():
199
- # field = row.get("Field", "")
200
- # description = row.get("Description", "")
201
- # example = row.get("Example", "")
202
- # schema_lines.append(f"- {field}: {description} (e.g., {example})")
203
-
204
- # return "\n".join(schema_lines)
205
-
206
- # schema_text = load_schema_from_excel("application/schemas/schema.xlsx")
207
-
208
- # # print(schema_text)
209
-
210
- # PROMPT = (f"""You are a PDF parsing agent. Your job is to extract GHG Protocol Parameters and ESG (Environmental, Social, Governance) Data from a company’s sustainability or ESG report in PDF format.
211
- # Please return the response as raw JSON without markdown formatting (no triple backticks or json tags) using the following fields:
212
- # Total GHG emissions (Metric Tons CO₂e)
213
- # Scope 1, 2, and 3 emissions
214
- # Emissions by gas (CO₂, CH₄, N₂O, HFCs, etc.)
215
- # Energy and fuel consumption (MWh, GJ, Liters)
216
- # Carbon offsets, intensity metrics, and reduction targets
217
- # ESG disclosures including:
218
- # Environmental Policies
219
- # Whether the company has an Environmental Management System (EMS)
220
- # Environmental certifications (if any)
221
- # Ensure values include their units, are extracted accurately, and the fields match the schema provided below and If the value is zero replace it with null:
222
-
223
- # {schema_text}
224
-
225
- # """)
226
-
227
- # def extract_emissions_data_as_json(api, model, file_input):
228
-
229
- # if api.lower()=="openai":
230
-
231
- # client = OpenAI()
232
-
233
- # file = client.files.create(
234
- # file=("uploaded.pdf", file_input),
235
- # purpose="assistants"
236
- # )
237
-
238
- # completion = client.chat.completions.create(
239
- # model=model,
240
- # messages=[
241
- # {
242
- # "role": "user",
243
- # "content": [
244
- # {
245
- # "type": "file",
246
- # "file": {
247
- # "file_id": file.id,
248
- # }
249
- # },
250
- # {
251
- # "type": "text",
252
- # "text":PROMPT,
253
- # },
254
- # ]
255
- # }
256
- # ]
257
- # )
258
-
259
- # try:
260
- # return json.loads(completion.choices[0].message.content)
261
- # except json.JSONDecodeError:
262
- # logger.error("Warning: Output was not valid JSON.")
263
- # return {"raw_response": completion.choices[0].message.content}
264
-
265
- # if api.lower()=="gemini":
266
-
267
- # client = genai.Client(api_key=os.getenv('gemini_api_key'))
268
-
269
- # file_bytes= file_input.read()
270
- # response = client.models.generate_content(
271
- # model=model,
272
- # contents=[
273
- # types.Part.from_bytes(
274
- # data=file_bytes,
275
- # mime_type='application/pdf',
276
- # ),
277
- # PROMPT])
278
-
279
- # try:
280
- # return json.loads(response.text)
281
- # except json.JSONDecodeError:
282
- # return {"raw_response": response.text}
283
-
284
-
285
-
286
- # # {
287
- # # "type": "object",
288
- # # "properties": {
289
- # # "GHG_Protocol_Parameters": {
290
- # # "type": "object",
291
- # # "properties": {
292
- # # "Total_GHG_Emissions": { "type": "number" },
293
- # # "Scope_1_Emissions": { "type": "number" },
294
- # # "Scope_2_Emissions": { "type": "number" },
295
- # # "Scope_3_Emissions": { "type": "number" },
296
- # # "CO2_Emissions": { "type": "number" },
297
- # # "CH4_Emissions": { "type": "number" },
298
- # # "N2O_Emissions": { "type": "number" },
299
- # # "HFC_Emissions": { "type": "number" },
300
- # # "PFC_Emissions": { "type": "number" },
301
- # # "SF6_Emissions": { "type": "number" },
302
- # # "NF3_Emissions": { "type": "number" },
303
- # # "Biogenic_CO2_Emissions": { "type": "number" },
304
- # # "Emissions_Intensity_per_Revenue": { "type": "number" },
305
- # # "Emissions_Intensity_per_Employee": { "type": "number" },
306
- # # "Base_Year_Emissions": { "type": "number" },
307
- # # "Emissions_Reduction_Target": { "type": "number" },
308
- # # "Emissions_Reduction_Achieved": { "type": "number" },
309
- # # "Energy_Consumption": { "type": "number" },
310
- # # "Renewable_Energy_Consumption": { "type": "number" },
311
- # # "Non_Renewable_Energy_Consumption": { "type": "number" },
312
- # # "Energy_Intensity_per_Revenue": { "type": "number" },
313
- # # "Energy_Intensity_per_Employee": { "type": "number" },
314
- # # "Fuel_Consumption": { "type": "number" },
315
- # # "Electricity_Consumption": { "type": "number" },
316
- # # "Heat_Consumption": { "type": "number" },
317
- # # "Steam_Consumption": { "type": "number" },
318
- # # "Cooling_Consumption": { "type": "number" },
319
- # # "Purchased_Goods_and_Services_Emissions": { "type": "number" },
320
- # # "Capital_Goods_Emissions": { "type": "number" },
321
- # # "Fuel_and_Energy_Related_Activities_Emissions": { "type": "number" },
322
- # # "Upstream_Transportation_and_Distribution_Emissions": { "type": "number" },
323
- # # "Waste_Generated_in_Operations_Emissions": { "type": "number" },
324
- # # "Business_Travel_Emissions": { "type": "number" },
325
- # # "Employee_Commuting_Emissions": { "type": "number" },
326
- # # "Upstream_Leased_Assets_Emissions": { "type": "number" },
327
- # # "Downstream_Transportation_and_Distribution_Emissions": { "type": "number" },
328
- # # "Processing_of_Sold_Products_Emissions": { "type": "number" },
329
- # # "Use_of_Sold_Products_Emissions": { "type": "number" },
330
- # # "End_of_Life_Treatment_of_Sold_Products_Emissions": { "type": "number" },
331
- # # "Downstream_Leased_Assets_Emissions": { "type": "number" },
332
- # # "Franchises_Emissions": { "type": "number" },
333
- # # "Investments_Emissions": { "type": "number" },
334
- # # "Carbon_Offsets_Purchased": { "type": "number" },
335
- # # "Net_GHG_Emissions": { "type": "number" },
336
- # # "Carbon_Sequestration": { "type": "number" }
337
- # # }
338
- # # },
339
- # # "ESG_Parameters_CSRS": {
340
- # # "type": "object",
341
- # # "properties": {
342
- # # "Environmental_Policies": { "type": "string" },
343
- # # "Environmental_Management_System": { "type": "boolean" },
344
- # # "Environmental_Certifications": { "type": "string" }
345
- # # }
346
- # # }
347
- # # },
348
- # # "required": ["GHG_Protocol_Parameters", "ESG_Parameters_CSRS"]}
 
151
  logger.exception("Error during ESG data extraction.")
152
  return None
153
 
 
 
154
  def list_all_files():
155
  """Lists all files currently uploaded to OpenAI."""
156
  try:
 
158
  for file in files:
159
  logger.info(f"File ID: {file.id}, Name: {file.filename}, Size: {file.bytes} bytes")
160
  except Exception as e:
161
+ logger.error(f"Failed to list files: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
application/services/streamlit_function.py CHANGED
@@ -4,6 +4,7 @@ import pandas as pd
4
  from io import BytesIO
5
  import json
6
  import os
 
7
  from application.utils import logger
8
 
9
  logger = logger.get_logger()
@@ -51,7 +52,7 @@ def upload_file(
51
  file_types: Union[str, List[str]] = "pdf",
52
  label: str = "📤 Upload a file",
53
  help_text: str = "Upload your file for processing.",
54
- allow_multiple: bool = False,
55
  ):
56
  """
57
  Streamlit file uploader widget with options.
@@ -78,8 +79,9 @@ def upload_file(
78
  if st.button("Submit"):
79
  st.session_state.pdf_file = uploaded_files
80
  return uploaded_files
 
 
81
 
82
- def export_results_to_excel(results: dict, sheet_name: str, filename: str = "output.xlsx") -> BytesIO:
83
  """
84
  Converts a dictionary result into a formatted Excel file.
85
  Appends to a file in the 'data/' folder if it already exists,
@@ -94,34 +96,53 @@ def export_results_to_excel(results: dict, sheet_name: str, filename: str = "out
94
  BytesIO: In-memory Excel file for Streamlit download.
95
  """
96
  try:
97
- df = pd.json_normalize(results, sep='_')
98
- df.replace({None: "", "NULL": ""}, inplace=True)
99
- except Exception as e:
100
- df = pd.DataFrame([{"error": f"Could not parse result: {str(e)}"}])
 
 
 
 
 
 
 
 
101
 
102
- # Ensure correct file extension and path
103
- filename = f"{filename}.xlsx" if not filename.endswith(".xlsx") else filename
104
- full_path = os.path.join("data", filename)
105
 
106
- os.makedirs("data", exist_ok=True) # Ensure the folder exists
 
107
 
108
- # Save to physical file
109
- if os.path.exists(full_path):
110
- with pd.ExcelWriter(full_path, engine="openpyxl", mode="a", if_sheet_exists="overlay") as writer:
111
- book = writer.book
 
 
112
  if sheet_name in book.sheetnames:
113
  sheet = book[sheet_name]
114
  start_row = sheet.max_row
 
115
  else:
116
  start_row = 0
117
- df.to_excel(writer, sheet_name=sheet_name, index=False, header=start_row == 0, startrow=start_row)
118
- else:
119
- df.to_excel(full_path, index=False, engine="openpyxl", sheet_name=sheet_name)
120
 
121
- # Prepare in-memory Excel for download
122
- output_stream = BytesIO()
123
- with pd.ExcelWriter(output_stream, engine="openpyxl") as writer:
124
- df.to_excel(writer, index=False, sheet_name=sheet_name)
125
- output_stream.seek(0)
 
 
 
 
 
126
 
127
- return output_stream
 
 
 
 
 
 
 
4
  from io import BytesIO
5
  import json
6
  import os
7
+ from openpyxl import load_workbook
8
  from application.utils import logger
9
 
10
  logger = logger.get_logger()
 
52
  file_types: Union[str, List[str]] = "pdf",
53
  label: str = "📤 Upload a file",
54
  help_text: str = "Upload your file for processing.",
55
+ allow_multiple: bool = True,
56
  ):
57
  """
58
  Streamlit file uploader widget with options.
 
79
  if st.button("Submit"):
80
  st.session_state.pdf_file = uploaded_files
81
  return uploaded_files
82
+
83
+ def export_results_to_excel(results: dict, sheet_name: str, filename: str = "output.xlsx", column: str = None) -> BytesIO:
84
 
 
85
  """
86
  Converts a dictionary result into a formatted Excel file.
87
  Appends to a file in the 'data/' folder if it already exists,
 
96
  BytesIO: In-memory Excel file for Streamlit download.
97
  """
98
  try:
99
+ if not results:
100
+ logger.error("Results object is None or empty.")
101
+ return None
102
+
103
+ filename = filename if filename.endswith(".xlsx") else f"{filename}.xlsx"
104
+ data = results.get(column, {})
105
+
106
+ logger.info(f"Exporting data for column '{column}' to {filename}")
107
+
108
+ if not isinstance(data, dict):
109
+ logger.error(f"Expected dictionary for column '{column}', but got {type(data)}")
110
+ return None
111
 
112
+ df = pd.DataFrame(data.items(), columns=[column, "Value"])
113
+ df.fillna(0, inplace=True)
 
114
 
115
+ os.makedirs("data", exist_ok=True)
116
+ physical_path = os.path.join("data", filename)
117
 
118
+ file_exists = os.path.exists(physical_path)
119
+ start_row = 0
120
+ start_column = 0
121
+
122
+ if file_exists:
123
+ book = load_workbook(physical_path)
124
  if sheet_name in book.sheetnames:
125
  sheet = book[sheet_name]
126
  start_row = sheet.max_row
127
+ start_column = sheet.max_column
128
  else:
129
  start_row = 0
 
 
 
130
 
131
+ if file_exists:
132
+ with pd.ExcelWriter(physical_path, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
133
+ df.to_excel(writer, sheet_name=sheet_name, index=False, header=True, startrow=0, startcol=start_column)
134
+ else:
135
+ with pd.ExcelWriter(physical_path, engine='openpyxl', mode='w') as writer:
136
+ df.to_excel(writer, sheet_name=sheet_name, index=False, header=True, startrow=0)
137
+
138
+ output_stream = BytesIO()
139
+ with pd.ExcelWriter(output_stream, engine='openpyxl') as writer:
140
+ df.to_excel(writer, sheet_name=sheet_name, index=False)
141
 
142
+ output_stream.seek(0)
143
+ logger.info(f"Data exported to {filename} successfully.")
144
+ return output_stream
145
+
146
+ except Exception as e:
147
+ logger.error(f"Error creating Excel export: {e}")
148
+ return None
pages/multiple_pdf_extractor.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ from application.schemas.response_schema import (
4
+ GEMINI_GHG_PARAMETERS, GEMINI_ENVIRONMENTAL_PARAMETERS_CSRD,
5
+ GEMINI_ENVIRONMENT_PARAMETERS, GEMINI_SOCIAL_PARAMETERS,
6
+ GEMINI_GOVERNANCE_PARAMETERS, GEMINI_MATERIALITY_PARAMETERS,
7
+ GEMINI_NET_ZERO_INTERVENTION_PARAMETERS
8
+ )
9
+ from application.services import streamlit_function, gemini_model
10
+ from application.utils import logger
11
+
12
+ logger = logger.get_logger()
13
+ streamlit_function.config_homepage()
14
+
15
+ st.title("Sustainability Report Analyzer")
16
+ st.write("Upload your sustainability report PDF and generate insights using Gemini models.")
17
+
18
+ AVAILABLE_MODELS = [
19
+ "gemini-1.5-pro-latest",
20
+ "gemini-2.0-flash",
21
+ "gemini-1.5-flash",
22
+ "gemini-2.5-pro-exp-03-25"
23
+ ]
24
+
25
+ RESPONSE_SCHEMAS = {
26
+ "Greenhouse Gas (GHG) Protocol Parameters": GEMINI_GHG_PARAMETERS,
27
+ "Environmental Parameters (CSRD)": GEMINI_ENVIRONMENTAL_PARAMETERS_CSRD,
28
+ "Environmental Parameters": GEMINI_ENVIRONMENT_PARAMETERS,
29
+ "Social Parameters": GEMINI_SOCIAL_PARAMETERS,
30
+ "Governance Parameters": GEMINI_GOVERNANCE_PARAMETERS,
31
+ "Materiality Parameters": GEMINI_MATERIALITY_PARAMETERS,
32
+ "Net Zero Intervention Parameters": GEMINI_NET_ZERO_INTERVENTION_PARAMETERS,
33
+ }
34
+
35
+ selected_model = st.selectbox("Select Gemini Model", options=AVAILABLE_MODELS)
36
+
37
+ uploaded_files = streamlit_function.upload_file("pdf", label="📤 Upload Sustainability Report PDF")
38
+ if uploaded_files:
39
+ st.session_state.uploaded_files = uploaded_files
40
+
41
+ if "uploaded_files" not in st.session_state:
42
+ st.session_state.uploaded_files = []
43
+
44
+ if st.session_state.uploaded_files:
45
+ columns = st.columns(3)
46
+
47
+ for i, pdf_file in enumerate(st.session_state.uploaded_files):
48
+ with columns[i % 3]:
49
+ file_name = pdf_file.name.removesuffix(".pdf")
50
+ st.write(f"📄 **File {i+1}:** `{pdf_file.name}`")
51
+
52
+ extract_btn = st.button(f"Extract Data from File {i+1}", key=f"extract_{i}")
53
+ result_key = f"{selected_model}_result_file_{i+1}"
54
+
55
+ if extract_btn:
56
+ with st.spinner(f"Extracting data from `{pdf_file.name}` using `{selected_model}`..."):
57
+ try:
58
+ all_results = {}
59
+
60
+ for label, schema in RESPONSE_SCHEMAS.items():
61
+ result = gemini_model.extract_emissions_data_as_json("gemini", selected_model, pdf_file, schema)
62
+ streamlit_function.export_results_to_excel(result, sheet_name=selected_model, filename=file_name, column=label)
63
+ all_results[label] = result
64
+ st.session_state[result_key] = all_results
65
+ st.success("Data extraction complete.")
66
+ except Exception as e:
67
+ logger.error(f"Extraction failed: {e}")
68
+ st.error("Failed to extract data.")
69
+
70
+ if st.session_state.get(result_key):
71
+ st.write(f"🧾 **Extracted Metrics for File {i+1}:**")
72
+ st.json(st.session_state[result_key])
73
+
74
+ file_path = f"data/{file_name}.xlsx"
75
+
76
+ if os.path.exists(file_path):
77
+ with open(file_path, "rb") as file:
78
+ st.download_button(
79
+ label="Download Excel File",
80
+ data=file,
81
+ file_name=f"{file_name}.xlsx",
82
+ mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
83
+ )
84
+
85
+
86
+
87
+
88
+
89
+
90
+
91
+
92
+
93
+
94
+
95
+
96
+
97
+
98
+
99
+
100
+
101
+
102
+
103
+
104
+
105
+
106
+
107
+
108
+
109
+
110
+ # import streamlit as st
111
+ # from application.schemas.response_schema import GEMINI_GHG_PARAMETERS, GEMINI_ENVIRONMENTAL_PARAMETERS_CSRD,GEMINI_ENVIRONMENT_PARAMETERS,GEMINI_SOCIAL_PARAMETERS, GEMINI_GOVERNANCE_PARAMETERS, GEMINI_MATERIALITY_PARAMETERS, GEMINI_NET_ZERO_INTERVENTION_PARAMETERS
112
+ # from application.services import streamlit_function, gemini_model
113
+ # from application.utils import logger
114
+ # import test
115
+
116
+ # logger = logger.get_logger()
117
+ # streamlit_function.config_homepage()
118
+ # st.title("Sustainability Report Analyzer")
119
+ # st.write("Upload your sustainability report PDF and generate insights using different models.")
120
+
121
+ # MODEL = ["gemini-1.5-pro-latest", "gemini-2.0-flash", "gemini-1.5-flash", "gemini-2.5-pro-exp-03-25"]
122
+
123
+ # MODEL_1 = "gemini-1.5-pro-latest"
124
+ # MODEL_2 = "gemini-2.0-flash"
125
+ # MODEL_3 = "gemini-1.5-flash"
126
+
127
+ # API_1 = "gemini"
128
+ # API_2 = "gemini"
129
+ # API_3 = "gemini"
130
+
131
+ # response_schema = [ GEMINI_GHG_PARAMETERS, GEMINI_ENVIRONMENTAL_PARAMETERS_CSRD,
132
+ # GEMINI_ENVIRONMENT_PARAMETERS,GEMINI_SOCIAL_PARAMETERS,
133
+ # GEMINI_GOVERNANCE_PARAMETERS, GEMINI_MATERIALITY_PARAMETERS,
134
+ # GEMINI_NET_ZERO_INTERVENTION_PARAMETERS]
135
+
136
+ # if "uploaded_files" not in st.session_state:
137
+ # st.session_state.uploaded_files = []
138
+
139
+ # MODEL = st.selectbox(
140
+ # "Select Model",
141
+ # options=MODEL,
142
+ # index=0,
143
+ # )
144
+
145
+ # uploaded_files = streamlit_function.upload_file("pdf", label="Upload Sustainability Report PDF")
146
+
147
+ # if uploaded_files:
148
+ # st.session_state.uploaded_files = uploaded_files
149
+
150
+ # if st.session_state.uploaded_files:
151
+ # columns = st.columns([5, 5, 5], gap="small")
152
+
153
+ # for i, col in enumerate(columns):
154
+ # if i < len(st.session_state.uploaded_files):
155
+ # pdf_file = st.session_state.uploaded_files[i]
156
+ # file_name = pdf_file.name.removesuffix(".pdf")
157
+ # result_key = f"{MODEL}_result_file_{i+1}"
158
+
159
+ # with col:
160
+ # st.write(f"**File {i+1}:** `{pdf_file.name}`")
161
+ # if st.button(f"Extract Data from File {i+1}", key=f"extract_btn_{i}"):
162
+ # with st.spinner(f"Extracting data from File {i+1} using {MODEL}..."):
163
+ # for schema in response_schema:
164
+ # result = gemini_model.extract_emissions_data_as_json(API_1, MODEL, pdf_file, schema)
165
+ # if schema == GEMINI_GHG_PARAMETERS:
166
+ # column = "Greenhouse Gas (GHG) Protocol Parameters"
167
+ # elif schema == GEMINI_ENVIRONMENTAL_PARAMETERS_CSRD:
168
+ # column = "Environmental Parameters (CSRD)"
169
+ # elif schema == GEMINI_ENVIRONMENT_PARAMETERS:
170
+ # column = "Environmental Parameters"
171
+ # elif schema == GEMINI_SOCIAL_PARAMETERS:
172
+ # column = "Social Parameters"
173
+ # elif schema == GEMINI_GOVERNANCE_PARAMETERS:
174
+ # column = "Governance Parameters"
175
+ # elif schema == GEMINI_MATERIALITY_PARAMETERS:
176
+ # column = "Materiality Parameters"
177
+ # elif schema == GEMINI_NET_ZERO_INTERVENTION_PARAMETERS:
178
+ # column = "Net Zero Intervention Parameters"
179
+ # else:
180
+ # column = None
181
+
182
+ # test.export_results_to_excel(result, sheet_name=MODEL, filename=file_name, column=column )
183
+ # st.session_state[result_key] = result
184
+
185
+ # if st.session_state.get(result_key):
186
+ # st.write(f"**Extracted Metrics for File {i+1}:**")
187
+ # st.json(st.session_state[result_key])
test.py CHANGED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ from io import BytesIO
4
+ from openpyxl import load_workbook
5
+ from application.utils import logger
6
+
7
+ logger = logger.get_logger()
8
+
9
+ def export_results_to_excel(results: dict, sheet_name: str, filename: str = "output.xlsx", column: str = None) -> BytesIO:
10
+ try:
11
+ if not results:
12
+ logger.error("Results object is None or empty.")
13
+ return None
14
+
15
+ filename = filename if filename.endswith(".xlsx") else f"{filename}.xlsx"
16
+ data = results.get(column, {})
17
+
18
+ logger.info(f"Exporting data for column '{column}' to {filename}")
19
+
20
+ if not isinstance(data, dict):
21
+ logger.error(f"Expected dictionary for column '{column}', but got {type(data)}")
22
+ return None
23
+
24
+ df = pd.DataFrame(data.items(), columns=[column, "Value"])
25
+ df.fillna(0, inplace=True)
26
+
27
+ os.makedirs("data", exist_ok=True)
28
+ physical_path = os.path.join("data", filename)
29
+
30
+ file_exists = os.path.exists(physical_path)
31
+ start_row = 0
32
+ start_column = 0
33
+
34
+ if file_exists:
35
+ book = load_workbook(physical_path)
36
+ if sheet_name in book.sheetnames:
37
+ sheet = book[sheet_name]
38
+ start_row = sheet.max_row
39
+ start_column = sheet.max_column
40
+ else:
41
+ start_row = 0
42
+
43
+ if file_exists:
44
+ with pd.ExcelWriter(physical_path, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
45
+ df.to_excel(writer, sheet_name=sheet_name, index=False, header=True, startrow=0, startcol=start_column)
46
+ else:
47
+ with pd.ExcelWriter(physical_path, engine='openpyxl', mode='w') as writer:
48
+ df.to_excel(writer, sheet_name=sheet_name, index=False, header=True, startrow=0)
49
+
50
+ output_stream = BytesIO()
51
+ with pd.ExcelWriter(output_stream, engine='openpyxl') as writer:
52
+ df.to_excel(writer, sheet_name=sheet_name, index=False)
53
+
54
+ output_stream.seek(0)
55
+ logger.info(f"Data exported to {filename} successfully.")
56
+ return output_stream
57
+
58
+ except Exception as e:
59
+ logger.error(f"Error creating Excel export: {e}")
60
+ return None
61
+
62
+ # export_results_to_excel(zalando_data, "Zalando Data","test", "Greenhouse Gas (GHG) Protocol Parameters")