Vela committed on
Commit
f7d4608
·
1 Parent(s): 16f68b6

Created a PdfExtraction application with basic functionality

Browse files
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ .venv
2
+ .env
3
+ data
4
+ __pycache__/
5
+ logs/
app.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from application.services import streamlit_function, llm_service
2
+ from application.services import gemini_model, openai_model
3
+ import streamlit as st
4
+ from google.genai.errors import ClientError
5
+ from application.utils import logger
6
+ import test
7
+
8
# Streamlit entry script: upload a sustainability report PDF and, on demand,
# run it through three LLM back ends, showing each JSON result in its own column.

# Rebind ``logger`` from the imported utility module to the logger object it returns.
logger = logger.get_logger()

# Model identifier passed to the Gemini back end. Reused in the UI text so the
# labels cannot drift from the model that is actually called.
GEMINI_MODEL = "gemini-2.0-flash"

streamlit_function.config_homepage()
pdf_file = streamlit_function.upload_file("pdf", label="Upload Sustainability Report PDF")

# Drop-down entries: a placeholder followed by the names reported by the service.
available_files = ["Select a pdf file"]
available_files.extend(stored.filename for stored in llm_service.get_files())

# NOTE(review): the selection is not consumed anywhere in this script yet.
selected_file = st.selectbox("Select an existing file", available_files)

# Make sure every key read below exists, so attribute access never raises on
# the first run of the script.
for key in ["gpt4o_mini_result", "gpt4o_result", "gemini_result", "pdf_file"]:
    if key not in st.session_state:
        st.session_state[key] = None

# NOTE(review): nothing in this script assigns ``st.session_state.pdf_file``.
# Presumably ``streamlit_function.upload_file`` stores the upload there; if it
# does not, this branch never runs. Confirm against that helper.
if st.session_state.pdf_file:
    with st.container():
        col1, col2, col3 = st.columns([5, 5, 5], gap="small")

        with col1:
            if st.button("Generate GPT-4o-mini Response"):
                with st.spinner("Calling GPT-4o-mini..."):
                    st.session_state.gpt4o_mini_result = (
                        llm_service.extract_emissions_data_as_json(
                            "openai", "gpt-4o-mini", pdf_file
                        )
                    )
            # Results live in session state so they survive Streamlit reruns.
            if st.session_state.gpt4o_mini_result:
                st.write("Extracted Metrics by gpt-4o-mini")
                st.json(st.session_state.gpt4o_mini_result)

        with col2:
            if st.button("Generate GPT-4o Response"):
                with st.spinner("Calling gpt-4o..."):
                    st.session_state.gpt4o_result = (
                        llm_service.extract_emissions_data_as_json(
                            "openai", "gpt-4o", pdf_file
                        )
                    )
            if st.session_state.gpt4o_result:
                st.write("Extracted Metrics by gpt-4o")
                st.json(st.session_state.gpt4o_result)

        with col3:
            try:
                if st.button("Generate Gemini Response"):
                    with st.spinner(f"Calling {GEMINI_MODEL}..."):
                        st.session_state.gemini_result = (
                            llm_service.extract_emissions_data_as_json(
                                "gemini", GEMINI_MODEL, st.session_state.pdf_file
                            )
                        )
            except ClientError as e:
                st.error(f"Gemini API Error: {e}")
                # Placeholders are required: extra positional arguments are
                # %-formatted into the message, and a message without any
                # placeholders makes the log record fail to format.
                logger.error("Error Details: %s %s", e.message, e.response)

            if st.session_state.gemini_result:
                st.write(f"Extracted Metrics by {GEMINI_MODEL}")
                st.json(st.session_state.gemini_result)
60
+
application/schemas/response_schema.py ADDED
@@ -0,0 +1,452 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def _metric_group(rows):
    """Build the array-of-objects schema for one group of metrics.

    Each ``(metric, json_type, blurb)`` row yields two properties, in this
    order: the nullable metric value, and a string property named
    ``"<metric> Description"`` carrying ``blurb`` as its schema description.
    Every generated property is listed as required and extras are forbidden.
    """
    properties = {}
    required = []
    for metric, json_type, blurb in rows:
        label = f"{metric} Description"
        properties[metric] = {"type": [json_type, "null"]}
        properties[label] = {"type": "string", "description": blurb}
        required.extend((metric, label))
    return {
        "type": "array",
        "items": {
            "type": "object",
            "properties": properties,
            "required": required,
            "additionalProperties": False,
        },
    }


# (metric, JSON type of the value, description) rows for each section.
_GHG_ROWS = [
    ("Total GHG Emissions", "integer", "Total greenhouse gases emitted by the organization."),
    ("Scope 1 Emissions", "integer", "Direct GHG emissions from owned or controlled sources."),
    ("Scope 2 Emissions", "integer", "Indirect emissions from the generation of purchased electricity."),
    ("Scope 3 Emissions", "integer", "All other indirect emissions that occur in a company’s value chain."),
    ("CO₂ Emissions", "integer", "Emissions of carbon dioxide."),
    ("CH₄ Emissions", "integer", "Emissions of methane."),
    ("N₂O Emissions", "integer", "Emissions of nitrous oxide."),
    ("HFC Emissions", "integer", "Emissions of hydrofluorocarbons."),
    ("PFC Emissions", "integer", "Emissions of perfluorocarbons."),
]

_NET_ZERO_ROWS = [
    ("Renewable Energy Adoption", "number", "Proportion of energy consumption derived from renewable sources."),
    ("Energy Efficiency Improvements", "number", "Reduction in energy consumption due to efficiency measures."),
    ("Electrification of Operations", "number", "Extent to which operations have shifted from fossil fuels to electric power."),
    ("Carbon Capture and Storage (CCS) Implementation", "number", "Amount of CO₂ captured and stored to prevent atmospheric release."),
    ("Reforestation and Afforestation Initiatives", "number", "Efforts to plant trees to absorb CO₂ from the atmosphere."),
    ("Sustainable Transportation Adoption", "number", "Proportion of transportation utilizing low-emission or electric vehicles."),
    ("Supply Chain Emissions Reduction", "number", "Decrease in emissions from upstream and downstream supply chain activities."),
    ("Waste-to-Energy Conversion", "number", "Energy produced from the processing of waste materials."),
    ("Carbon Offset Investments", "number", "Amount of emissions offset through investments in environmental projects."),
    ("Climate Risk Assessment", "string", "Evaluation of potential risks posed by climate change to the organization."),
    ("Climate Adaptation Strategies", "string", "Plans implemented to adapt operations to changing climate conditions."),
    ("Internal Carbon Pricing", "number", "Monetary value assigned to carbon emissions to incentivize reduction."),
    ("Net-Zero Target Year", "string", "Specific year by which the organization aims to achieve net-zero emissions."),
    ("Interim Emission Reduction Targets", "number", "Short-term targets set to progressively reduce emissions en route to net-zero."),
    ("Employee Engagement in Sustainability", "number", "Proportion of employees actively involved in sustainability programs."),
    ("Investment in Low-Carbon Technologies", "number", "Financial resources allocated to developing or adopting low-carbon technologies."),
    ("Public Disclosure of Net-Zero Progress", "string", "Regular public updates on progress toward net-zero commitments."),
    ("Third-Party Verification of Emission Data", "boolean", "Confirmation that emission data has been verified by an external party."),
    ("Participation in Carbon Markets", "boolean", "Involvement in systems where carbon credits are bought and sold."),
    ("Development of Climate-Resilient Infrastructure", "string", "Initiatives to build infrastructure resilient to climate impacts."),
    ("Reduction of Methane Emissions", "number", "Efforts to decrease methane emissions from operations."),
    ("Implementation of Circular Economy Practices", "string", "Adoption of processes that emphasize reuse and recycling to minimize waste."),
    ("Collaboration with Industry Peers on Climate Action", "string", "Joint initiatives with other organizations to address climate challenges."),
    ("Use of Science-Based Targets", "boolean", "Setting emission reduction targets in line with scientific recommendations."),
    ("Monitoring and Reporting Mechanisms", "string", "Systems established to track and report emissions data accurately."),
]

_MATERIALITY_ROWS = [
    ("Stakeholder Engagement Level", "string", "Degree to which stakeholders are involved in organizational activities or decisions."),
    ("Stakeholder Feedback Mechanisms", "string", "Systems in place for stakeholders to provide feedback to the organization."),
    ("Identification of Material Issues", "string", "Process of determining the most significant environmental, social, and governance issues relevant to the organization."),
    ("Prioritization of Material Issues", "string", "Ranking of identified material issues based on their significance to stakeholders and the organization."),
    ("Double Materiality Assessment", "string", "Evaluation considering both the organization's impact on sustainability matters and the impact of those matters on the organization."),
    ("Materiality Matrix Development", "string", "Creation of a visual matrix plotting material issues based on their importance to stakeholders and the organization."),
    ("Regular Review of Material Issues", "string", "Frequency and process for updating the assessment of material issues."),
    ("Integration of Material Issues into Strategy", "string", "How identified material issues are incorporated into the organization's strategic planning."),
    ("Disclosure of Material Issues", "string", "Public reporting on identified material issues and how they are managed."),
    ("Impact Assessment of Material Issues", "string", "Analysis of the potential or actual impact of material issues on the organization and its stakeholders."),
]

# Top-level sections of the response, in the order they appear in the schema.
_SECTIONS = {
    "company_name": {"type": "string"},
    "Greenhouse Gas (GHG) Protocol Parameters": _metric_group(_GHG_ROWS),
    "Net Zero Intervention Parameters": _metric_group(_NET_ZERO_ROWS),
    "Materiality Parameters": _metric_group(_MATERIALITY_ROWS),
}

# Strict JSON-schema response format named "esg_response": every section and
# every metric is required, and additional properties are rejected at each level.
RESPONSE_FORMAT = {
    "type": "json_schema",
    "json_schema": {
        "name": "esg_response",
        "strict": True,
        "schema": {
            "type": "object",
            "properties": _SECTIONS,
            "required": list(_SECTIONS),
            "additionalProperties": False,
        },
    },
}
345
+
346
# (metric, JSON type, description) rows for the GHG section, in output order.
_GEMINI_GHG_ROWS = [
    ("Total GHG Emissions", "integer", "Total greenhouse gases emitted by the organization. Units: Metric Tons CO₂e."),
    ("Scope 1 Emissions", "integer", "Direct GHG emissions from owned or controlled sources. Units: Metric Tons CO₂e."),
    ("Scope 2 Emissions", "integer", "Indirect GHG emissions from the consumption of purchased electricity, steam, heating, and cooling. Units: Metric Tons CO₂e."),
    ("Scope 3 Emissions", "integer", "Other indirect emissions occurring in the value chain, including both upstream and downstream emissions. Units: Metric Tons CO₂e."),
    ("CO₂ Emissions", "integer", "Emissions of carbon dioxide. Units: Metric Tons CO₂."),
    ("CH₄ Emissions", "integer", "Emissions of methane. Units: Metric Tons CH₄."),
    ("N₂O Emissions", "integer", "Emissions of nitrous oxide. Units: Metric Tons N₂O."),
    ("HFC Emissions", "integer", "Emissions of hydrofluorocarbons. Units: Metric Tons HFCs"),
    ("PFC Emissions", "integer", "Emissions of perfluorocarbons. Units: Metric Tons PFCs"),
    ("SF₆ Emissions", "integer", "Emissions of sulfur hexafluoride. Units: Metric Tons SF₆."),
    ("NF₃ Emissions", "integer", "Emissions of nitrogen trifluoride. Units: Metric Tons NF₃."),
    ("Biogenic CO₂ Emissions", "integer", "CO₂ emissions from biological sources. Units: Metric Tons CO₂."),
    ("Emissions Intensity per Revenue", "number", "GHG emissions per unit of revenue. Units: Metric Tons CO₂e / Revenue."),
    ("Emissions Intensity per Employee", "number", "GHG emissions per employee. Units: Metric Tons CO₂e / Employee."),
    ("Base Year Emissions", "integer", "GHG emissions in the base year for comparison. Units: Metric Tons CO₂e."),
    ("Emissions Reduction Target", "number", "Targeted percentage reduction in GHG emissions. Units: Percentage (%)."),
    ("Emissions Reduction Achieved", "number", "Actual percentage reduction in GHG emissions achieved. Units: Percentage (%)."),
    ("Energy Consumption", "number", "Total energy consumed by the organization. Units: MWh or GJ."),
    ("Renewable Energy Consumption", "number", "Amount of energy consumed from renewable sources. Units: MWh or GJ."),
    ("Non-Renewable Energy Consumption", "number", "Amount of energy consumed from non-renewable sources. Units: MWh or GJ."),
    ("Energy Intensity per Revenue", "number", "Energy consumption per unit of revenue. Units: MWh or GJ / Revenue."),
    ("Energy Intensity per Employee", "number", "Energy consumption per employee. Units: MWh or GJ / Employee."),
    ("Fuel Consumption", "number", "Total fuel consumed by the organization. Units: Liters or GJ."),
    ("Electricity Consumption", "number", "Total electricity consumed. Units: MWh."),
    ("Heat Consumption", "number", "Total heat energy consumed. Units: GJ."),
    ("Steam Consumption", "number", "Total steam energy consumed. Units: GJ."),
    ("Cooling Consumption", "number", "Total energy consumed for cooling. Units: GJ."),
    ("Purchased Goods and Services Emissions", "integer", "Emissions from purchased goods and services. Units: Metric Tons CO₂e."),
    ("Capital Goods Emissions", "integer", "Emissions from the production of capital goods. Units: Metric Tons CO₂e."),
    ("Fuel- and Energy-Related Activities Emissions", "integer", "Emissions related to fuel and energy production not included in Scope 1 or 2. Units: Metric Tons CO₂e."),
    ("Upstream Transportation and Distribution Emissions", "integer", "Emissions from transportation and distribution in the supply chain. Units: Metric Tons CO₂e."),
    ("Waste Generated in Operations Emissions", "integer", "Emissions from waste generated during operations. Units: Metric Tons CO₂e."),
    ("Business Travel Emissions", "integer", "Emissions from employee business travel. Units: Metric Tons CO₂e."),
    ("Employee Commuting Emissions", "integer", "Emissions from employees commuting to and from work. Units: Metric Tons CO₂e."),
    ("Upstream Leased Assets Emissions", "integer", "Emissions from leased assets upstream in the value chain. Units: Metric Tons CO₂e."),
    ("Downstream Transportation and Distribution Emissions", "integer", "Emissions from transportation and distribution of sold products. Units: Metric Tons CO₂e."),
    ("Processing of Sold Products Emissions", "integer", "Emissions from processing intermediate products sold by the organization. Units: Metric Tons CO₂e."),
    ("Use of Sold Products Emissions", "integer", "Emissions from the use of sold products by consumers. Units: Metric Tons CO₂e."),
    ("End-of-Life Treatment of Sold Products Emissions", "integer", "Emissions from the disposal of sold products at end of life. Units: Metric Tons CO₂e."),
    ("Downstream Leased Assets Emissions", "integer", "Emissions from leased assets downstream in the value chain. Units: Metric Tons CO₂e."),
    ("Franchises Emissions", "integer", "Emissions from franchise operations. Units: Metric Tons CO₂e."),
    ("Investments Emissions", "integer", "Emissions from investments. Units: Metric Tons CO₂e."),
    ("Carbon Offsets Purchased", "integer", "Amount of carbon offsets purchased. Units: Metric Tons CO₂e."),
    ("Net GHG Emissions", "integer", "GHG emissions after accounting for offsets. Units: Metric Tons CO₂e."),
    ("Carbon Sequestration", "integer", "Amount of CO₂ sequestered or captured. Units: Metric Tons CO₂e."),
]

# One nullable property per row; dict insertion order follows the table.
_GEMINI_GHG_PROPERTIES = {
    metric: {"type": json_type, "nullable": True, "description": blurb}
    for metric, json_type, blurb in _GEMINI_GHG_ROWS
}

# Response schema for the Gemini back end: the company name plus one object of
# nullable GHG metrics, with ``propertyOrdering`` listing the keys at each level.
GEMINI_RESPONSE_FORMAT = {
    "type": "object",
    "properties": {
        "Company Name": {
            "type": "string",
            "description": "Name of the company.",
        },
        "Greenhouse Gas (GHG) Protocol Parameters": {
            "type": "object",
            "properties": _GEMINI_GHG_PROPERTIES,
            # Same sequence as the property keys above.
            "propertyOrdering": list(_GEMINI_GHG_PROPERTIES),
        },
    },
    "propertyOrdering": ["Company Name", "Greenhouse Gas (GHG) Protocol Parameters"],
}
application/schemas/schema.xlsx ADDED
Binary file (55.5 kB). View file
 
application/services/gemini_model.py ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from google import genai
4
+ from google.genai import types
5
+ from pydantic import BaseModel
6
+ from typing import Optional, Union, BinaryIO
7
+ from application.utils import logger
8
+ from application.schemas.response_schema import GEMINI_RESPONSE_FORMAT
9
+
10
+ logger = logger.get_logger()
11
+
12
+ PROMPT = (
13
+ """You are a PDF parsing agent.
14
+ Your job is to extract from a company’s sustainability or ESG report in PDF format:
15
+ If the values are not found in the document, please return json null for that value.
16
+ """
17
+ )
18
+
19
class Parameter(BaseModel):
    """Details recorded for a single sustainability metric.

    All four fields are required strings.
    """

    synonym: str  # alternative name for the metric
    uom: str  # unit of measure
    description: str
    value: str
27
+
28
class GreenhouseGasGHGProtocolParameters(BaseModel):
    """One required ``Parameter`` entry per GHG Protocol metric.

    Field order is kept as declared, since it determines the order of the
    generated model schema.
    """

    # --- Headline and per-scope emissions ---
    Total_GHG_Emissions: Parameter
    Scope_1_Emissions: Parameter
    Scope_2_Emissions: Parameter
    Scope_3_Emissions: Parameter

    # --- Emissions by gas ---
    CO2_Emissions: Parameter
    CH4_Emissions: Parameter
    N2O_Emissions: Parameter
    HFC_Emissions: Parameter
    PFC_Emissions: Parameter
    SF6_Emissions: Parameter
    NF3_Emissions: Parameter
    Biogenic_CO2_Emissions: Parameter

    # --- Intensity, baseline and reduction metrics ---
    Emissions_Intensity_per_Revenue: Parameter
    Emissions_Intensity_per_Employee: Parameter
    Base_Year_Emissions: Parameter
    Emissions_Reduction_Target: Parameter
    Emissions_Reduction_Achieved: Parameter

    # --- Energy ---
    Energy_Consumption: Parameter
    Renewable_Energy_Consumption: Parameter
    Non_Renewable_Energy_Consumption: Parameter
    Energy_Intensity_per_Revenue: Parameter
    Energy_Intensity_per_Employee: Parameter
    Fuel_Consumption: Parameter
    Electricity_Consumption: Parameter
    Heat_Consumption: Parameter
    Steam_Consumption: Parameter
    Cooling_Consumption: Parameter

    # --- Upstream value-chain categories ---
    Purchased_Goods_and_Services_Emissions: Parameter
    Capital_Goods_Emissions: Parameter
    Fuel_and_Energy_Related_Activities_Emissions: Parameter
    Upstream_Transportation_and_Distribution_Emissions: Parameter
    Waste_Generated_in_Operations_Emissions: Parameter
    Business_Travel_Emissions: Parameter
    Employee_Commuting_Emissions: Parameter
    Upstream_Leased_Assets_Emissions: Parameter

    # --- Currently disabled fields (not part of the model) ---
    # Downstream_Transportation_and_Distribution_Emissions: Parameter
    # Processing_of_Sold_Products_Emissions: Parameter
    # Use_of_Sold_Products_Emissions: Parameter
    # End_of_Life_Treatment_of_Sold_Products_Emissions: Parameter
    # Downstream_Leased_Assets_Emissions: Parameter
    # Franchises_Emissions: Parameter
    # Investments_Emissions: Parameter
    # Carbon_Offsets_Purchased: Parameter
    # Net_GHG_Emissions: Parameter
    # Carbon_Sequestration: Parameter
74
+
75
class EmissionData(BaseModel):
    """Top-level model wrapping the GHG Protocol parameter group."""

    # The field name is identical to the name of its model type.
    GreenhouseGasGHGProtocolParameters: GreenhouseGasGHGProtocolParameters
77
+
78
+ # print(json.dumps(EmissionData.model_json_schema(), indent=2))
79
+
80
def _read_pdf_bytes(file_input: Union[BinaryIO, bytes]) -> bytes:
    """Return the PDF content of *file_input* as bytes.

    Accepts raw bytes as well as file-like objects. Seekable streams are
    rewound first so a stream that was already read does not yield ``b""``.
    """
    if isinstance(file_input, (bytes, bytearray)):
        return bytes(file_input)
    seekable = getattr(file_input, "seekable", None)
    if callable(seekable) and seekable():
        file_input.seek(0)
    return file_input.read()


def extract_emissions_data_as_json(
    api: str,
    model: str,
    file_input: Union[BinaryIO, bytes]
) -> Optional[dict]:
    """
    Extract ESG data from a PDF using the Gemini API.

    Args:
        api: Accepted for signature parity with the generic service
            function; this implementation does not inspect it and always
            calls Gemini.
        model: Gemini model name passed to ``generate_content``.
        file_input: File-like object or bytes of the PDF.

    Returns:
        The parsed JSON response, ``{"raw_response": <text>}`` when the
        response text is not valid JSON, or None if any step failed.
    """
    try:
        client = genai.Client(api_key=os.getenv("gemini_api_key"))

        file_bytes = _read_pdf_bytes(file_input)
        logger.info("[Gemini] Sending content for generation...")

        response = client.models.generate_content(
            model=model,
            contents=[
                types.Part.from_bytes(data=file_bytes, mime_type="application/pdf"),
                PROMPT,
            ],
            config={
                'response_mime_type': 'application/json',
                'response_schema': GEMINI_RESPONSE_FORMAT,
            },
        )
        logger.info("[Gemini] Response received.")
        try:
            return json.loads(response.text)
        except json.JSONDecodeError:
            logger.warning("Failed to parse JSON, returning raw response.")
            return {"raw_response": response.text}

    except Exception:
        # logger.exception already records the exception and traceback, so
        # the message does not need to interpolate the error text.
        logger.exception("Error during ESG data extraction.")
        return None
124
+
125
+ # import os
126
+ # from google import genai
127
+ # from pydantic import BaseModel, Field, ValidationError
128
+ # from dotenv import load_dotenv
129
+ # from typing import Optional
130
+ # from google.genai import types
131
+
132
+ # load_dotenv()
133
+ # client = genai.Client(api_key=os.getenv("gemini_api_key"))
134
+
135
+ # schema= """{
136
+ # "parameters": [
137
+ # {
138
+ # "parameter": "Total GHG Emissions",
139
+ # "dataType": "Numeric",
140
+ # "synonyms": ["Carbon Footprint"],
141
+ # "uom": "Metric Tons CO₂e",
142
+ # "description": "Total greenhouse gases emitted by the organization."
143
+ # },
144
+ # {
145
+ # "parameter": "Scope 1 Emissions",
146
+ # "dataType": "Numeric",
147
+ # "synonyms": ["Direct Emissions"],
148
+ # "uom": "Metric Tons CO₂e",
149
+ # "description": "Direct GHG emissions from owned or controlled sources."
150
+ # },
151
+ # {
152
+ # "parameter": "Scope 2 Emissions",
153
+ # "dataType": "Numeric",
154
+ # "synonyms": ["Indirect Energy Emissions"],
155
+ # "uom": "Metric Tons CO₂e",
156
+ # "description": "Indirect GHG emissions from the consumption of purchased electricity, steam, heating, and cooling."
157
+ # },
158
+ # {
159
+ # "parameter": "Scope 3 Emissions",
160
+ # "dataType": "Numeric",
161
+ # "synonyms": ["Value Chain Emissions"],
162
+ # "uom": "Metric Tons CO₂e",
163
+ # "description": "Other indirect emissions occurring in the value chain, including both upstream and downstream emissions."
164
+ # },
165
+ # {
166
+ # "parameter": "CO₂ Emissions",
167
+ # "dataType": "Numeric",
168
+ # "synonyms": ["Carbon Emissions"],
169
+ # "uom": "Metric Tons CO₂",
170
+ # "description": "Emissions of carbon dioxide."
171
+ # },
172
+ # {
173
+ # "parameter": "CH₄ Emissions",
174
+ # "dataType": "Numeric",
175
+ # "synonyms": ["Methane Emissions"],
176
+ # "uom": "Metric Tons CH₄",
177
+ # "description": "Emissions of methane."
178
+ # },
179
+ # {
180
+ # "parameter": "N₂O Emissions",
181
+ # "dataType": "Numeric",
182
+ # "synonyms": ["Nitrous Oxide Emissions"],
183
+ # "uom": "Metric Tons N₂O",
184
+ # "description": "Emissions of nitrous oxide."
185
+ # },
186
+ # {
187
+ # "parameter": "HFC Emissions",
188
+ # "dataType": "Numeric",
189
+ # "synonyms": ["Hydrofluorocarbon Emissions"],
190
+ # "uom": "Metric Tons HFCs",
191
+ # "description": "Emissions of hydrofluorocarbons."
192
+ # },
193
+ # {
194
+ # "parameter": "PFC Emissions",
195
+ # "dataType": "Numeric",
196
+ # "synonyms": ["Perfluorocarbon Emissions"],
197
+ # "uom": "Metric Tons PFCs",
198
+ # "description": "Emissions of perfluorocarbons."
199
+ # },
200
+ # {
201
+ # "parameter": "SF₆ Emissions",
202
+ # "dataType": "Numeric",
203
+ # "synonyms": ["Sulfur Hexafluoride Emissions"],
204
+ # "uom": "Metric Tons SF₆",
205
+ # "description": "Emissions of sulfur hexafluoride."
206
+ # },
207
+ # {
208
+ # "parameter": "NF₃ Emissions",
209
+ # "dataType": "Numeric",
210
+ # "synonyms": ["Nitrogen Trifluoride Emissions"],
211
+ # "uom": "Metric Tons NF₃",
212
+ # "description": "Emissions of nitrogen trifluoride."
213
+ # },
214
+ # {
215
+ # "parameter": "Biogenic CO₂ Emissions",
216
+ # "dataType": "Numeric",
217
+ # "synonyms": ["Biogenic Carbon Emissions"],
218
+ # "uom": "Metric Tons CO₂",
219
+ # "description": "CO₂ emissions from biological sources."
220
+ # },
221
+ # {
222
+ # "parameter": "Emissions Intensity per Revenue",
223
+ # "dataType": "Numeric",
224
+ # "synonyms": ["Carbon Intensity"],
225
+ # "uom": "Metric Tons CO₂e / Revenue",
226
+ # "description": "GHG emissions per unit of revenue."
227
+ # },
228
+ # {
229
+ # "parameter": "Emissions Intensity per Employee",
230
+ # "dataType": "Numeric",
231
+ # "synonyms": ["Emissions per Employee"],
232
+ # "uom": "Metric Tons CO₂e / Employee",
233
+ # "description": "GHG emissions per employee."
234
+ # },
235
+ # {
236
+ # "parameter": "Base Year Emissions",
237
+ # "dataType": "Numeric",
238
+ # "synonyms": ["Baseline Emissions"],
239
+ # "uom": "Metric Tons CO₂e",
240
+ # "description": "GHG emissions in the base year for comparison."
241
+ # },
242
+ # {
243
+ # "parameter": "Emissions Reduction Target",
244
+ # "dataType": "Numeric",
245
+ # "synonyms": ["Emission Reduction Goal"],
246
+ # "uom": "Percentage (%)",
247
+ # "description": "Targeted percentage reduction in GHG emissions."
248
+ # },
249
+ # {
250
+ # "parameter": "Emissions Reduction Achieved",
251
+ # "dataType": "Numeric",
252
+ # "synonyms": ["Emission Reduction Accomplished"],
253
+ # "uom": "Percentage (%)",
254
+ # "description": "Actual percentage reduction in GHG emissions achieved."
255
+ # },
256
+ # {
257
+ # "parameter": "Energy Consumption",
258
+ # "dataType": "Numeric",
259
+ # "synonyms": ["Energy Use"],
260
+ # "uom": "MWh or GJ",
261
+ # "description": "Total energy consumed by the organization."
262
+ # },
263
+ # {
264
+ # "parameter": "Renewable Energy Consumption",
265
+ # "dataType": "Numeric",
266
+ # "synonyms": ["Green Energy Use"],
267
+ # "uom": "MWh or GJ",
268
+ # "description": "Amount of energy consumed from renewable sources."
269
+ # },
270
+ # {
271
+ # "parameter": "Non-Renewable Energy Consumption",
272
+ # "dataType": "Numeric",
273
+ # "synonyms": ["Fossil Energy Use"],
274
+ # "uom": "MWh or GJ",
275
+ # "description": "Amount of energy consumed from non-renewable sources."
276
+ # },
277
+ # {
278
+ # "parameter": "Carbon Offsets Purchased",
279
+ # "dataType": "Numeric",
280
+ # "synonyms": ["Carbon Credits"],
281
+ # "uom": "Metric Tons CO₂e",
282
+ # "description": "Amount of carbon offsets purchased."
283
+ # },
284
+ # {
285
+ # "parameter": "Net GHG Emissions",
286
+ # "dataType": "Numeric",
287
+ # "synonyms": ["Net Carbon Emissions"],
288
+ # "uom": "Metric Tons CO₂e",
289
+ # "description": "GHG emissions after accounting for offsets."
290
+ # },
291
+ # {
292
+ # "parameter": "Carbon Sequestration",
293
+ # "dataType": "Numeric",
294
+ # "synonyms": ["Carbon Capture"],
295
+ # "uom": "Metric Tons CO₂e",
296
+ # "description": "Amount of CO₂ sequestered or captured."
297
+ # }
298
+ # ]
299
+ # }"""
application/services/llm_service.py ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from typing import Union, BinaryIO, Optional
4
+ from openai import OpenAI
5
+ from google import genai
6
+ from google.genai import types
7
+ from application.utils import logger
8
+ from application.schemas.response_schema import RESPONSE_FORMAT,GEMINI_RESPONSE_FORMAT
9
+
10
+ logger = logger.get_logger()
11
+ client = OpenAI()
12
+
13
+ # --- Constants ---
14
+
15
+ PROMPT = (
16
+ "You are a PDF parsing agent. "
17
+ "Your job is to extract GHG Protocol Parameters and ESG (Environmental, Social, Governance) Data "
18
+ "from a company’s sustainability or ESG report in PDF format."
19
+ )
20
+
21
+ # --- OpenAI Helpers ---
22
+
23
def get_files() -> list:
    """Return the file objects listed by the module-level OpenAI client.

    Raises:
        Exception: Any error from the listing call, re-raised after logging.
    """
    try:
        listing = client.files.list()
        logger.info(f"Retrieved {len(listing.data)} files.")
    except Exception as err:
        logger.error(f"Failed to retrieve files: {err}")
        raise
    else:
        return listing.data
32
+
33
def get_or_create_file(file_input: BinaryIO, client) -> object:
    """
    Retrieve a file from OpenAI by name or upload it if not present.

    Args:
        file_input: File-like object with `.name` attribute.
        client: OpenAI client instance used for listing, retrieving and
            uploading.

    Returns:
        File object of the existing or newly uploaded file.

    Raises:
        ValueError: If `file_input` has no usable `name` attribute.
        Exception: Any error raised by the client, re-raised after logging.
    """
    file_name = getattr(file_input, 'name', None)
    if not file_name:
        raise ValueError("File input must have a 'name' attribute.")

    try:
        # List through the client that was passed in, so the lookup and the
        # upload always use the same client.
        for remote in client.files.list().data:
            if remote.filename == file_name:
                logger.info(f"File '{file_name}' already exists with ID: {remote.id}")
                return client.files.retrieve(remote.id)

        # Rewind a stream that may already have been read; otherwise the
        # upload would contain no data.
        seekable = getattr(file_input, "seekable", None)
        if callable(seekable) and seekable():
            file_input.seek(0)

        logger.info(f"Uploading new file '{file_name}'...")
        new_file = client.files.create(file=(file_name, file_input), purpose="assistants")
        logger.info(f"File uploaded successfully with ID: {new_file.id}")
        return new_file

    except Exception as e:
        logger.error(f"Error during get_or_create_file: {e}")
        raise
62
+
63
def delete_file_by_size(size: int, client):
    """
    Delete every listed OpenAI file whose byte size equals `size`.

    Args:
        size: Byte size a file must have to be deleted.
        client: OpenAI client instance used for the delete calls.

    Raises:
        Exception: Any error from listing or deleting, re-raised after logging.
    """
    try:
        for remote in get_files():
            if remote.bytes != size:
                logger.info(f"File {remote.filename} skipped (size mismatch).")
                continue
            client.files.delete(remote.id)
            logger.info(f"File {remote.filename} deleted (size matched: {size} bytes).")
    except Exception as err:
        logger.error(f"Failed to delete files: {err}")
        raise
82
+
83
+ # --- Main Function ---
84
+
85
def _read_pdf_bytes(file_input: Union[BinaryIO, bytes]) -> bytes:
    """Return the PDF content of *file_input* as bytes.

    Accepts raw bytes as well as file-like objects. Seekable streams are
    rewound first so a stream that was already read does not yield ``b""``.
    """
    if isinstance(file_input, (bytes, bytearray)):
        return bytes(file_input)
    seekable = getattr(file_input, "seekable", None)
    if callable(seekable) and seekable():
        file_input.seek(0)
    return file_input.read()


def _parse_json_or_raw(text: str) -> dict:
    """Parse *text* as JSON, or wrap it as ``{"raw_response": text}``."""
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        logger.warning("Failed to parse JSON, returning raw response.")
        return {"raw_response": text}


def extract_emissions_data_as_json(
    api: str,
    model: str,
    file_input: Union[BinaryIO, bytes]
) -> Optional[dict]:
    """
    Extract ESG data from PDF using OpenAI or Gemini APIs.

    Both providers return the same shape: the response text parsed as JSON,
    or ``{"raw_response": <text>}`` when the text is not valid JSON.

    Args:
        api: 'openai' or 'gemini' (case-insensitive).
        model: Model name (e.g. gpt-4o, gemini-pro)
        file_input: File-like object or bytes of the PDF. The OpenAI path
            requires a file-like object with a `.name` attribute.

    Returns:
        Parsed ESG data, or None if the provider is unsupported or any
        step failed.
    """
    try:
        provider = api.lower()

        if provider == "openai":
            client = OpenAI()
            file = get_or_create_file(file_input, client)

            logger.info("[OpenAI] Sending content for generation...")

            response = client.chat.completions.create(
                model=model,
                messages=[{
                    "role": "user",
                    "content": [
                        {"type": "file", "file": {"file_id": file.id}},
                        {"type": "text", "text": PROMPT}
                    ]
                }],
                response_format=RESPONSE_FORMAT
            )

            content = response.choices[0].message.content
            logger.info("ESG data extraction successful.")
            # Parse the JSON text so this branch honours the declared return
            # type and matches the Gemini branch.
            return _parse_json_or_raw(content)

        if provider == "gemini":
            client = genai.Client(api_key=os.getenv("gemini_api_key"))

            file_bytes = _read_pdf_bytes(file_input)
            logger.info("[Gemini] Sending content for generation...")

            response = client.models.generate_content(
                model=model,
                contents=[
                    types.Part.from_bytes(data=file_bytes, mime_type="application/pdf"),
                    PROMPT
                ],
                config={
                    'response_mime_type': 'application/json',
                    'response_schema': GEMINI_RESPONSE_FORMAT,
                }
            )
            logger.info("[Gemini] Response received.")
            return _parse_json_or_raw(response.text)

        logger.error(f"Unsupported API: {api}")
        return None

    except Exception:
        logger.exception("Error during ESG data extraction.")
        return None
154
+
155
+ # --- Debug Helper ---
156
+
157
def list_all_files():
    """Log the ID, name and byte size of every file returned by get_files().

    Errors are logged and swallowed; nothing is returned.
    """
    try:
        for entry in get_files():
            logger.info(f"File ID: {entry.id}, Name: {entry.filename}, Size: {entry.bytes} bytes")
    except Exception as err:
        logger.error(f"Failed to list files: {err}")
165
+
166
+
167
+
168
+
169
+
170
+
171
+
172
+
173
+
174
+
175
+
176
+
177
+
178
+
179
+
180
+ # import os
181
+ # import json
182
+ # from google import genai
183
+ # from google.genai import types
184
+ # from openai import OpenAI
185
+ # from dotenv import load_dotenv
186
+ # from application.utils import logger
187
+ # import pandas as pd
188
+ # import openpyxl
189
+
190
+ # load_dotenv()
191
+ # logger = logger.get_logger()
192
+
193
+
194
+
195
+ # def load_schema_from_excel(file_path) -> str:
196
+ # df = pd.read_excel(file_path,engine='openpyxl')
197
+
198
+ # schema_lines = ["Schema fields and expected format:\n"]
199
+ # for _, row in df.iterrows():
200
+ # field = row.get("Field", "")
201
+ # description = row.get("Description", "")
202
+ # example = row.get("Example", "")
203
+ # schema_lines.append(f"- {field}: {description} (e.g., {example})")
204
+
205
+ # return "\n".join(schema_lines)
206
+
207
+ # schema_text = load_schema_from_excel("application/schemas/schema.xlsx")
208
+
209
+ # # print(schema_text)
210
+
211
+ # PROMPT = (f"""You are a PDF parsing agent. Your job is to extract GHG Protocol Parameters and ESG (Environmental, Social, Governance) Data from a company’s sustainability or ESG report in PDF format.
212
+ # Please return the response as raw JSON without markdown formatting (no triple backticks or json tags) using the following fields:
213
+ # Total GHG emissions (Metric Tons CO₂e)
214
+ # Scope 1, 2, and 3 emissions
215
+ # Emissions by gas (CO₂, CH₄, N₂O, HFCs, etc.)
216
+ # Energy and fuel consumption (MWh, GJ, Liters)
217
+ # Carbon offsets, intensity metrics, and reduction targets
218
+ # ESG disclosures including:
219
+ # Environmental Policies
220
+ # Whether the company has an Environmental Management System (EMS)
221
+ # Environmental certifications (if any)
222
+ # Ensure values include their units, are extracted accurately, and the fields match the schema provided below and If the value is zero replace it with null:
223
+
224
+ # {schema_text}
225
+
226
+ # """)
227
+
228
+ # def extract_emissions_data_as_json(api, model, file_input):
229
+
230
+ # if api.lower()=="openai":
231
+
232
+ # client = OpenAI()
233
+
234
+ # file = client.files.create(
235
+ # file=("uploaded.pdf", file_input),
236
+ # purpose="assistants"
237
+ # )
238
+
239
+ # completion = client.chat.completions.create(
240
+ # model=model,
241
+ # messages=[
242
+ # {
243
+ # "role": "user",
244
+ # "content": [
245
+ # {
246
+ # "type": "file",
247
+ # "file": {
248
+ # "file_id": file.id,
249
+ # }
250
+ # },
251
+ # {
252
+ # "type": "text",
253
+ # "text":PROMPT,
254
+ # },
255
+ # ]
256
+ # }
257
+ # ]
258
+ # )
259
+
260
+ # try:
261
+ # return json.loads(completion.choices[0].message.content)
262
+ # except json.JSONDecodeError:
263
+ # logger.error("Warning: Output was not valid JSON.")
264
+ # return {"raw_response": completion.choices[0].message.content}
265
+
266
+ # if api.lower()=="gemini":
267
+
268
+ # client = genai.Client(api_key=os.getenv('gemini_api_key'))
269
+
270
+ # file_bytes= file_input.read()
271
+ # response = client.models.generate_content(
272
+ # model=model,
273
+ # contents=[
274
+ # types.Part.from_bytes(
275
+ # data=file_bytes,
276
+ # mime_type='application/pdf',
277
+ # ),
278
+ # PROMPT])
279
+
280
+ # try:
281
+ # return json.loads(response.text)
282
+ # except json.JSONDecodeError:
283
+ # return {"raw_response": response.text}
284
+
285
+
286
+
287
+ # # {
288
+ # # "type": "object",
289
+ # # "properties": {
290
+ # # "GHG_Protocol_Parameters": {
291
+ # # "type": "object",
292
+ # # "properties": {
293
+ # # "Total_GHG_Emissions": { "type": "number" },
294
+ # # "Scope_1_Emissions": { "type": "number" },
295
+ # # "Scope_2_Emissions": { "type": "number" },
296
+ # # "Scope_3_Emissions": { "type": "number" },
297
+ # # "CO2_Emissions": { "type": "number" },
298
+ # # "CH4_Emissions": { "type": "number" },
299
+ # # "N2O_Emissions": { "type": "number" },
300
+ # # "HFC_Emissions": { "type": "number" },
301
+ # # "PFC_Emissions": { "type": "number" },
302
+ # # "SF6_Emissions": { "type": "number" },
303
+ # # "NF3_Emissions": { "type": "number" },
304
+ # # "Biogenic_CO2_Emissions": { "type": "number" },
305
+ # # "Emissions_Intensity_per_Revenue": { "type": "number" },
306
+ # # "Emissions_Intensity_per_Employee": { "type": "number" },
307
+ # # "Base_Year_Emissions": { "type": "number" },
308
+ # # "Emissions_Reduction_Target": { "type": "number" },
309
+ # # "Emissions_Reduction_Achieved": { "type": "number" },
310
+ # # "Energy_Consumption": { "type": "number" },
311
+ # # "Renewable_Energy_Consumption": { "type": "number" },
312
+ # # "Non_Renewable_Energy_Consumption": { "type": "number" },
313
+ # # "Energy_Intensity_per_Revenue": { "type": "number" },
314
+ # # "Energy_Intensity_per_Employee": { "type": "number" },
315
+ # # "Fuel_Consumption": { "type": "number" },
316
+ # # "Electricity_Consumption": { "type": "number" },
317
+ # # "Heat_Consumption": { "type": "number" },
318
+ # # "Steam_Consumption": { "type": "number" },
319
+ # # "Cooling_Consumption": { "type": "number" },
320
+ # # "Purchased_Goods_and_Services_Emissions": { "type": "number" },
321
+ # # "Capital_Goods_Emissions": { "type": "number" },
322
+ # # "Fuel_and_Energy_Related_Activities_Emissions": { "type": "number" },
323
+ # # "Upstream_Transportation_and_Distribution_Emissions": { "type": "number" },
324
+ # # "Waste_Generated_in_Operations_Emissions": { "type": "number" },
325
+ # # "Business_Travel_Emissions": { "type": "number" },
326
+ # # "Employee_Commuting_Emissions": { "type": "number" },
327
+ # # "Upstream_Leased_Assets_Emissions": { "type": "number" },
328
+ # # "Downstream_Transportation_and_Distribution_Emissions": { "type": "number" },
329
+ # # "Processing_of_Sold_Products_Emissions": { "type": "number" },
330
+ # # "Use_of_Sold_Products_Emissions": { "type": "number" },
331
+ # # "End_of_Life_Treatment_of_Sold_Products_Emissions": { "type": "number" },
332
+ # # "Downstream_Leased_Assets_Emissions": { "type": "number" },
333
+ # # "Franchises_Emissions": { "type": "number" },
334
+ # # "Investments_Emissions": { "type": "number" },
335
+ # # "Carbon_Offsets_Purchased": { "type": "number" },
336
+ # # "Net_GHG_Emissions": { "type": "number" },
337
+ # # "Carbon_Sequestration": { "type": "number" }
338
+ # # }
339
+ # # },
340
+ # # "ESG_Parameters_CSRS": {
341
+ # # "type": "object",
342
+ # # "properties": {
343
+ # # "Environmental_Policies": { "type": "string" },
344
+ # # "Environmental_Management_System": { "type": "boolean" },
345
+ # # "Environmental_Certifications": { "type": "string" }
346
+ # # }
347
+ # # }
348
+ # # },
349
+ # # "required": ["GHG_Protocol_Parameters", "ESG_Parameters_CSRS"]}
application/services/openai_model.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from pydantic import BaseModel
2
+ # from openai import OpenAI
3
+ # from typing import List, Dict, Optional, Union
4
+
5
+ # client = OpenAI()
6
+
7
+ # class GHGParameter(BaseModel):
8
+ # parameter: str
9
+ # data_type: str
10
+ # synonyms: Optional[List[str]] = None
11
+ # uom: Optional[str] = None
12
+ # description: Optional[str] = None
13
+ # value: Union[int, str, None]
14
+
15
+
16
+ # class GHGCategory(BaseModel):
17
+ # category: str
18
+ # parameters: List[GHGParameter]
19
+
20
+ # SCHEMA = """{
21
+ # "Gas (GHG)": {
22
+ # "Total GHG Emissions": {
23
+ # "data_type": "Numeric",
24
+ # "synonyms": ["Carbon Footprint"],
25
+ # "uom": "Metric Tons CO₂e",
26
+ # "description": "Total greenhouse gases emitted by the organization.",
27
+ # "value": null
28
+ # }"""
29
+
30
+ # PROMPT = (f"""You are a PDF parsing agent.
31
+ # Fetch the following data from pdf : {SCHEMA}"""
32
+ # )
33
+
34
+ # def extract_emissions_data_as_json(api, model, file_input):
35
+ # if api.lower() == "openai":
36
+ # file = client.files.create(
37
+ # file=("uploaded.pdf", file_input),
38
+ # purpose="assistants"
39
+ # )
40
+
41
+ # completion = client.beta.chat.completions.parse(
42
+ # model="gpt-4o-2024-08-06",
43
+ # messages=[
44
+ # {
45
+ # "role": "user",
46
+ # "content": [
47
+ # {
48
+ # "type": "file",
49
+ # "file": {
50
+ # "file_id": file.id,
51
+ # }
52
+ # },
53
+ # {
54
+ # "type": "text",
55
+ # "text":PROMPT,
56
+ # },
57
+ # ]
58
+ # }
59
+ # ],
60
+ # response_format=GHGCategory,
61
+ # )
62
+
63
+ # research_paper = completion.choices[0].message.parsed
64
+ # return research_paper
65
+
66
+ # from pydantic import BaseModel
67
+ # from openai import OpenAI
68
+
69
+ # client = OpenAI()
70
+
71
+ # class CalendarEvent(BaseModel):
72
+ # name: str
73
+ # date: str
74
+ # participants: list[str]
75
+
76
+ # def extract_emissions_data_as_json(api, model, file_input):
77
+ # if api.lower() == "openai":
78
+ # file = client.files.create(
79
+ # file=("uploaded.pdf", file_input),
80
+ # purpose="assistants"
81
+ # )
82
+
83
+ # completion = client.beta.chat.completions.parse(
84
+ # model="gpt-4o-2024-08-06",
85
+ # messages=[
86
+ # {
87
+ # "role": "user",
88
+ # "content": [
89
+ # {
90
+ # "type": "file",
91
+ # "file": {
92
+ # "file_id": file.id,
93
+ # }
94
+ # },
95
+ # {
96
+ # "type": "text",
97
+ # "text":PROMPT,
98
+ # },
99
+ # ]
100
+ # }
101
+ # ],
102
+ # response_format=GHGCategory,
103
+ # )
104
+
105
+ # event = completion.choices[0].message.parsed
106
+
107
+ # response = client.chat.completions.create(
108
+ # model="gpt-4o-2024-08-06",
109
+ # messages=[
110
+ # {"role": "system", "content": "You are a helpful math tutor. Guide the user through the solution step by step."},
111
+ # {"role": "user", "content": "how can I solve 8x + 7 = -23"}
112
+ # ],
113
+ # response_format={
114
+ # "type": "json_schema",
115
+ # "json_schema": {
116
+ # "name": "GHGCategory",
117
+ # "schema": {
118
+ # "type": "object",
119
+ # "properties": {
120
+ # "steps": {
121
+ # "type": "array",
122
+ # "items": {
123
+ # "type": "object",
124
+ # "properties": {
125
+ # "explanation": {"type": "string"},
126
+ # "output": {"type": "string"}
127
+ # },
128
+ # "required": ["explanation", "output"],
129
+ # "additionalProperties": False
130
+ # }
131
+ # },
132
+ # "final_answer": {"type": "string"}
133
+ # },
134
+ # "required": ["steps", "final_answer"],
135
+ # "additionalProperties": False
136
+ # },
137
+ # "strict": True
138
+ # }
139
+ # }
140
+ # )
141
+
142
+ # print(response.choices[0].message.content)
143
+
144
+
145
+ # response = await async_client.responses.create(
146
+ # model="gpt-4o",
147
+ # input=[
148
+ # {
149
+ # "role": "user",
150
+ # "content": [
151
+ # {
152
+ # "type": "input_file",
153
+ # "file_id": uploaded_file.id,
154
+ # },
155
+ # {
156
+ # "type": "input_text",
157
+ # "text": """
158
+ # You are an intelligent PDF data extractor designed to extract structured information from Brand Books. A Brand Book contains guidelines and details about a brand's identity, including its logo, colors, typography, messaging, and more.
159
+ # Ensure the extracted data follows this schema strictly.
160
+ # Return the extracted brand information in JSON format with no explanation.
161
+ # For brand_logo and favicon, always provide a direct URL to the image instead of just the image name or a placeholder. If no valid URLs are found, return an empty array. """
162
+ # }
163
+ # ]
164
+ # }
165
+ # ],
166
+ # text={
167
+ # "format": {
168
+ # "type": "json_schema",
169
+ # "name": "BrandBook",
170
+ # "strict": True,
171
+ # "schema": {
172
+ # "type": "object",
173
+ # "properties": {
174
+ # "brand_url": {
175
+ # "type": "string",
176
+ # "description": "The URL associated with the brand."
177
+ # },
178
+ # "brand_name": {
179
+ # "type": "string",
180
+ # "description": "The name of the brand."
181
+ # },
182
+ # "brand_category": {
183
+ # "type": "array",
184
+ # "description": "A list of categories that the brand belongs to.",
185
+ # "items": {
186
+ # "type": "string"
187
+ # }
188
+ # },
189
+ # "brand_description": {
190
+ # "type": "string",
191
+ # "description": "A brief description of the brand."
192
+ # },
193
+ # "brand_colors": {
194
+ # "type": "array",
195
+ # "description": "A list of colors associated with the brand.",
196
+ # "items": {
197
+ # "type": "string"
198
+ # }
199
+ # },
200
+ # "brand_fonts": {
201
+ # "type": "array",
202
+ # "description": "A list of fonts used by the brand.",
203
+ # "items": {
204
+ # "type": "string"
205
+ # }
206
+ # },
207
+ # "brand_logo": {
208
+ # "type": "array",
209
+ # "description": "A list of logo urls associated with the brand.",
210
+ # "items": {
211
+ # "type": "string"
212
+ # }
213
+ # },
214
+ # "target_audience": {
215
+ # "type": "string",
216
+ # "description": "The target audience for the brand."
217
+ # },
218
+ # "competitors": {
219
+ # "type": "string",
220
+ # "description": "The competitors of the brand."
221
+ # },
222
+ # "aspirational_brands": {
223
+ # "type": "string",
224
+ # "description": "Brands that the brand aspires to be like."
225
+ # },
226
+ # "favicon": {
227
+ # "type": "array",
228
+ # "description": "A list of favicon URLs associated with the brand.",
229
+ # "items": {
230
+ # "type": "string"
231
+ # }
232
+ # }
233
+ # },
234
+ # "required": [
235
+ # "brand_url",
236
+ # "brand_name",
237
+ # "brand_category",
238
+ # "brand_description",
239
+ # "brand_colors",
240
+ # "brand_fonts",
241
+ # "brand_logo",
242
+ # "target_audience",
243
+ # "competitors",
244
+ # "aspirational_brands",
245
+ # "favicon"
246
+ # ],
247
+ # "additionalProperties": False
248
+ # }
249
+ # }
250
+ # }
251
+ # )
application/services/streamlit_function.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from typing import Union, List
3
+ from application.utils import logger
4
+
5
+ logger = logger.get_logger()
6
+
7
+ PAGE_TITLE = "PDF Extractor"
8
+ PAGE_LAYOUT = "wide"
9
+ # PAGE_ICON = "src/frontend/images/page_icon.jpg"
10
+ # GITHUB_LINK = "https://github.com/Vela-Test1993/yuvabe-care-companion-ai"
11
+ # ABOUT_US = "An AI-powered assistant for personalized healthcare guidance."
12
+
13
+
14
def config_homepage(page_title=PAGE_TITLE):
    """
    Apply the Streamlit page configuration for the app.

    Sets the page title, the layout (PAGE_LAYOUT) and a collapsed initial
    sidebar. The call is skipped when the key "page_config_set" is present
    in the session state.

    Args:
        page_title (str): The title displayed on the browser tab (default is PAGE_TITLE).

    Example:
        >>> config_homepage("My Custom App")
    """
    if "page_config_set" in st.session_state:
        return

    # Page icon and custom menu items are currently not configured.
    st.set_page_config(
        page_title=page_title,
        layout=PAGE_LAYOUT,
        initial_sidebar_state="collapsed",
    )
    # NOTE: this function does not set the guard flag itself; the
    # assignment below is disabled.
    # st.session_state.page_config_set = True
45
+
46
def upload_file(
    file_types: Union[str, List[str]] = "pdf",
    label: str = "📤 Upload a file",
    help_text: str = "Upload your file for processing.",
    allow_multiple: bool = False,
):
    """
    Render a file uploader followed by a "Submit" button.

    When the button is pressed, the current selection is also stored in
    `st.session_state.pdf_file`.

    Args:
        file_types (str or list): Allowed file type(s), e.g., "pdf" or ["pdf", "docx"].
        label (str): Label displayed above the uploader.
        help_text (str): Tooltip help text.
        allow_multiple (bool): Allow multiple file uploads.

    Returns:
        Whatever `st.file_uploader` returned for this run: a single file
        object or a list of file objects.
    """
    allowed_types = [file_types] if isinstance(file_types, str) else file_types

    selection = st.file_uploader(
        label=label,
        type=allowed_types,
        help=help_text,
        accept_multiple_files=allow_multiple,
    )

    submitted = st.button("Submit")
    if submitted:
        st.session_state.pdf_file = selection

    return selection
77
+
78
+ # def extract_text_from_pdf(file) -> str:
79
+ # """
80
+ # Extracts and returns the full text content from a PDF file.
81
+
82
+ # :param file: PDF file object (BytesIO or UploadedFile from Streamlit)
83
+ # :return: Extracted text as a string
84
+ # """
85
+ # text = ""
86
+ # with fitz.open(stream=file.read(), filetype="pdf") as doc:
87
+ # for page in doc:
88
+ # text += page.get_text()
89
+ # return text.strip()
application/utils/logger.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from logging.handlers import RotatingFileHandler
3
+ import os
4
+
5
log_file = 'eco_scribe.log'
log_dir = 'logs/app'
log_level = logging.INFO


def get_logger():
    """Return this module's logger, configuring it on first use.

    On the first call the log directory is created and the logger receives
    a console handler and a rotating file handler (5 MiB per file, three
    backups) that share one format. Later calls return the same logger
    without adding handlers again.

    Returns:
        logging.Logger: The configured logger.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(log_dir, exist_ok=True)
    log_file_path = os.path.join(log_dir, log_file)

    logger = logging.getLogger(__name__)

    # Check this logger's own handler list. hasHandlers() also reports
    # handlers attached to ancestor loggers, so a configured root logger
    # would make this setup be skipped entirely.
    if not logger.handlers:
        logger.setLevel(log_level)

        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.DEBUG)

        file_handler = RotatingFileHandler(log_file_path, maxBytes=5*1024*1024, backupCount=3)
        file_handler.setLevel(logging.INFO)

        log_format = '%(asctime)s - %(levelname)s - %(message)s'
        formatter = logging.Formatter(log_format, datefmt='%Y-%m-%d %H:%M')
        console_handler.setFormatter(formatter)
        file_handler.setFormatter(formatter)

        logger.addHandler(console_handler)
        logger.addHandler(file_handler)

    return logger
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ openai
3
+ dotenv
4
+ google
5
+ google.genai
6
+ google-generativeai
7
+ pymupdf
8
+ openpyxl
9
+ pandas
test.py ADDED
File without changes