Spaces:

VelaTest
/

PDFExtractor

Sleeping

App Files Files Community

Vela committed on Apr 12

Commit

f7d4608

1 Parent(s): 16f68b6

Created a PdfExtraction application with basic functionality

Browse files

Files changed (11) hide show

.gitignore +5 -0
app.py +60 -0
application/schemas/response_schema.py +452 -0
application/schemas/schema.xlsx +0 -0
application/services/gemini_model.py +299 -0
application/services/llm_service.py +349 -0
application/services/openai_model.py +251 -0
application/services/streamlit_function.py +89 -0
application/utils/logger.py +35 -0
requirements.txt +9 -0
test.py +0 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,5 @@

+.venv
+.env
+data
+__pycache__/
+logs/

app.py ADDED Viewed

	@@ -0,0 +1,60 @@

+from application.services import streamlit_function, llm_service
+from application.services import gemini_model, openai_model
+import streamlit as st
+from google.genai.errors import ClientError
+from application.utils import logger
+import test
+logger = logger.get_logger()
+streamlit_function.config_homepage()
+pdf_file = streamlit_function.upload_file("pdf", label="Upload Sustainability Report PDF")
+available_files = ["Select a pdf file"]
+for file in llm_service.get_files():
+    available_files.append(file.filename)
+selected_file = st.selectbox("Select a existing file", available_files)
+for key in ["gpt4o_mini_result", "gpt4o_result", "gemini_result", "pdf_file"]:
+    if key not in st.session_state:
+        st.session_state[key] = None
+if st.session_state.pdf_file:
+    with st.container():
+        col1, col2, col3 = st.columns([5, 5, 5], gap="small")
+        with col1:
+            if st.button("Generate GPT-4o-min Response"):
+                with st.spinner("Calling GPT-4o-mini..."):
+                    result = llm_service.extract_emissions_data_as_json("openai","gpt-4o-mini",pdf_file)
+                    # result= openai_model.extract_emissions_data_as_json("openai","gpt-4o-mini",pdf_file)
+                    st.session_state.gpt4o_mini_result = result
+            if st.session_state.gpt4o_mini_result:
+                st.write("Extracted Metrics by gpt-4o-mini")
+                st.json(st.session_state.gpt4o_mini_result)
+        with col2:
+            if st.button("Generate GPT-4o Response"):
+                with st.spinner("Calling gpt-4o..."):
+                    result= llm_service.extract_emissions_data_as_json("openai","gpt-4o",pdf_file)
+                    st.session_state.gpt4o_result = result
+            if st.session_state.gpt4o_result:
+                st.write("Extracted Metrics by gpt-4o")
+                st.json(st.session_state.gpt4o_result)
+        with col3:
+            try:
+                if st.button("Generate Gemini Response"):
+                    with st.spinner("Calling gemini-1.5-pro-latest..."):
+                        result = llm_service.extract_emissions_data_as_json("gemini","gemini-2.0-flash", st.session_state.pdf_file)
+                        # result = gemini_model.extract_emissions_data_as_json("gemini","gemini-2.0-flash", pdf_file)
+                        st.session_state.gemini_result = result
+            except ClientError as e:
+                st.error(f"Gemini API Error: {e}")
+                logger.error("Error Details:", e.message, e.response)
+            if st.session_state.gemini_result:
+                st.write("Extracted Metrics by gemini-1.5-pro-latest")
+                st.json(st.session_state.gemini_result)

application/schemas/response_schema.py ADDED Viewed

	@@ -0,0 +1,452 @@

+RESPONSE_FORMAT = {
+    "type": "json_schema",
+    "json_schema": {
+        "name": "esg_response",
+        "strict": True,
+        "schema": {
+            "type": "object",
+            "properties": {
+                "company_name": {"type": "string"},
+                "Greenhouse Gas (GHG) Protocol Parameters": {
+                    "type": "array",
+                    "items": {
+                        "type": "object",
+                        "properties": {
+                            "Total GHG Emissions": {"type": ["integer", "null"]},
+                            "Total GHG Emissions Description": {
+                                "type": "string",
+                                "description": "Total greenhouse gases emitted by the organization."
+                            },
+                            "Scope 1 Emissions": {"type": ["integer", "null"]},
+                            "Scope 1 Emissions Description": {
+                                "type": "string",
+                                "description": "Direct GHG emissions from owned or controlled sources."
+                            },
+                            "Scope 2 Emissions": {"type": ["integer", "null"]},
+                            "Scope 2 Emissions Description": {
+                                "type": "string",
+                                "description": "Indirect emissions from the generation of purchased electricity."
+                            },
+                            "Scope 3 Emissions": {"type": ["integer", "null"]},
+                            "Scope 3 Emissions Description": {
+                                "type": "string",
+                                "description": "All other indirect emissions that occur in a company’s value chain."
+                            },
+                            "CO₂ Emissions": {"type": ["integer", "null"]},
+                            "CO₂ Emissions Description": {
+                                "type": "string",
+                                "description": "Emissions of carbon dioxide."
+                            },
+                            "CH₄ Emissions": {"type": ["integer", "null"]},
+                            "CH₄ Emissions Description": {
+                                "type": "string",
+                                "description": "Emissions of methane."
+                            },
+                            "N₂O Emissions": {"type": ["integer", "null"]},
+                            "N₂O Emissions Description": {
+                                "type": "string",
+                                "description": "Emissions of nitrous oxide."
+                            },
+                            "HFC Emissions": {"type": ["integer", "null"]},
+                            "HFC Emissions Description": {
+                                "type": "string",
+                                "description": "Emissions of hydrofluorocarbons."
+                            },
+                            "PFC Emissions": {"type": ["integer", "null"]},
+                            "PFC Emissions Description": {
+                                "type": "string",
+                                "description": "Emissions of perfluorocarbons."
+                            }
+                        },
+                        "required": [
+                            "Total GHG Emissions", "Total GHG Emissions Description",
+                            "Scope 1 Emissions", "Scope 1 Emissions Description",
+                            "Scope 2 Emissions", "Scope 2 Emissions Description",
+                            "Scope 3 Emissions", "Scope 3 Emissions Description",
+                            "CO₂ Emissions", "CO₂ Emissions Description",
+                            "CH₄ Emissions", "CH₄ Emissions Description",
+                            "N₂O Emissions", "N₂O Emissions Description",
+                            "HFC Emissions", "HFC Emissions Description",
+                            "PFC Emissions", "PFC Emissions Description"
+                        ],
+                        "additionalProperties": False
+                    }
+                },
+                "Net Zero Intervention Parameters": {
+                    "type": "array",
+                    "items": {
+                        "type": "object",
+                        "properties": {
+                            "Renewable Energy Adoption": {"type": ["number", "null"]},
+                            "Renewable Energy Adoption Description": {
+                                "type": "string",
+                                "description": "Proportion of energy consumption derived from renewable sources."
+                            },
+                            "Energy Efficiency Improvements": {"type": ["number", "null"]},
+                            "Energy Efficiency Improvements Description": {
+                                "type": "string",
+                                "description": "Reduction in energy consumption due to efficiency measures."
+                            },
+                            "Electrification of Operations": {"type": ["number", "null"]},
+                            "Electrification of Operations Description": {
+                                "type": "string",
+                                "description": "Extent to which operations have shifted from fossil fuels to electric power."
+                            },
+                            "Carbon Capture and Storage (CCS) Implementation": {"type": ["number", "null"]},
+                            "Carbon Capture and Storage (CCS) Implementation Description": {
+                                "type": "string",
+                                "description": "Amount of CO₂ captured and stored to prevent atmospheric release."
+                            },
+                            "Reforestation and Afforestation Initiatives": {"type": ["number", "null"]},
+                            "Reforestation and Afforestation Initiatives Description": {
+                                "type": "string",
+                                "description": "Efforts to plant trees to absorb CO₂ from the atmosphere."
+                            },
+                            "Sustainable Transportation Adoption": {"type": ["number", "null"]},
+                            "Sustainable Transportation Adoption Description": {
+                                "type": "string",
+                                "description": "Proportion of transportation utilizing low-emission or electric vehicles."
+                            },
+                            "Supply Chain Emissions Reduction": {"type": ["number", "null"]},
+                            "Supply Chain Emissions Reduction Description": {
+                                "type": "string",
+                                "description": "Decrease in emissions from upstream and downstream supply chain activities."
+                            },
+                            "Waste-to-Energy Conversion": {"type": ["number", "null"]},
+                            "Waste-to-Energy Conversion Description": {
+                                "type": "string",
+                                "description": "Energy produced from the processing of waste materials."
+                            },
+                            "Carbon Offset Investments": {"type": ["number", "null"]},
+                            "Carbon Offset Investments Description": {
+                                "type": "string",
+                                "description": "Amount of emissions offset through investments in environmental projects."
+                            },
+                            "Climate Risk Assessment": {"type": ["string", "null"]},
+                            "Climate Risk Assessment Description": {
+                                "type": "string",
+                                "description": "Evaluation of potential risks posed by climate change to the organization."
+                            },
+                            "Climate Adaptation Strategies": {"type": ["string", "null"]},
+                            "Climate Adaptation Strategies Description": {
+                                "type": "string",
+                                "description": "Plans implemented to adapt operations to changing climate conditions."
+                            },
+                            "Internal Carbon Pricing": {"type": ["number", "null"]},
+                            "Internal Carbon Pricing Description": {
+                                "type": "string",
+                                "description": "Monetary value assigned to carbon emissions to incentivize reduction."
+                            },
+                            "Net-Zero Target Year": {"type": ["string", "null"]},
+                            "Net-Zero Target Year Description": {
+                                "type": "string",
+                                "description": "Specific year by which the organization aims to achieve net-zero emissions."
+                            },
+                            "Interim Emission Reduction Targets": {"type": ["number", "null"]},
+                            "Interim Emission Reduction Targets Description": {
+                                "type": "string",
+                                "description": "Short-term targets set to progressively reduce emissions en route to net-zero."
+                            },
+                            "Employee Engagement in Sustainability": {"type": ["number", "null"]},
+                            "Employee Engagement in Sustainability Description": {
+                                "type": "string",
+                                "description": "Proportion of employees actively involved in sustainability programs."
+                            },
+                            "Investment in Low-Carbon Technologies": {"type": ["number", "null"]},
+                            "Investment in Low-Carbon Technologies Description": {
+                                "type": "string",
+                                "description": "Financial resources allocated to developing or adopting low-carbon technologies."
+                            },
+                            "Public Disclosure of Net-Zero Progress": {"type": ["string", "null"]},
+                            "Public Disclosure of Net-Zero Progress Description": {
+                                "type": "string",
+                                "description": "Regular public updates on progress toward net-zero commitments."
+                            },
+                            "Third-Party Verification of Emission Data": {"type": ["boolean", "null"]},
+                            "Third-Party Verification of Emission Data Description": {
+                                "type": "string",
+                                "description": "Confirmation that emission data has been verified by an external party."
+                            },
+                            "Participation in Carbon Markets": {"type": ["boolean", "null"]},
+                            "Participation in Carbon Markets Description": {
+                                "type": "string",
+                                "description": "Involvement in systems where carbon credits are bought and sold."
+                            },
+                            "Development of Climate-Resilient Infrastructure": {"type": ["string", "null"]},
+                            "Development of Climate-Resilient Infrastructure Description": {
+                                "type": "string",
+                                "description": "Initiatives to build infrastructure resilient to climate impacts."
+                            },
+                            "Reduction of Methane Emissions": {"type": ["number", "null"]},
+                            "Reduction of Methane Emissions Description": {
+                                "type": "string",
+                                "description": "Efforts to decrease methane emissions from operations."
+                            },
+                            "Implementation of Circular Economy Practices": {"type": ["string", "null"]},
+                            "Implementation of Circular Economy Practices Description": {
+                                "type": "string",
+                                "description": "Adoption of processes that emphasize reuse and recycling to minimize waste."
+                            },
+                            "Collaboration with Industry Peers on Climate Action": {"type": ["string", "null"]},
+                            "Collaboration with Industry Peers on Climate Action Description": {
+                                "type": "string",
+                                "description": "Joint initiatives with other organizations to address climate challenges."
+                            },
+                            "Use of Science-Based Targets": {"type": ["boolean", "null"]},
+                            "Use of Science-Based Targets Description": {
+                                "type": "string",
+                                "description": "Setting emission reduction targets in line with scientific recommendations."
+                            },
+                            "Monitoring and Reporting Mechanisms": {"type": ["string", "null"]},
+                            "Monitoring and Reporting Mechanisms Description": {
+                                "type": "string",
+                                "description": "Systems established to track and report emissions data accurately."
+                            }
+                        },
+                        "required": [
+                            "Renewable Energy Adoption", "Renewable Energy Adoption Description",
+                            "Energy Efficiency Improvements", "Energy Efficiency Improvements Description",
+                            "Electrification of Operations", "Electrification of Operations Description",
+                            "Carbon Capture and Storage (CCS) Implementation", "Carbon Capture and Storage (CCS) Implementation Description",
+                            "Reforestation and Afforestation Initiatives", "Reforestation and Afforestation Initiatives Description",
+                            "Sustainable Transportation Adoption", "Sustainable Transportation Adoption Description",
+                            "Supply Chain Emissions Reduction", "Supply Chain Emissions Reduction Description",
+                            "Waste-to-Energy Conversion", "Waste-to-Energy Conversion Description",
+                            "Carbon Offset Investments", "Carbon Offset Investments Description",
+                            "Climate Risk Assessment", "Climate Risk Assessment Description",
+                            "Climate Adaptation Strategies", "Climate Adaptation Strategies Description",
+                            "Internal Carbon Pricing", "Internal Carbon Pricing Description",
+                            "Net-Zero Target Year", "Net-Zero Target Year Description",
+                            "Interim Emission Reduction Targets", "Interim Emission Reduction Targets Description",
+                            "Employee Engagement in Sustainability", "Employee Engagement in Sustainability Description",
+                            "Investment in Low-Carbon Technologies", "Investment in Low-Carbon Technologies Description",
+                            "Public Disclosure of Net-Zero Progress", "Public Disclosure of Net-Zero Progress Description",
+                            "Third-Party Verification of Emission Data", "Third-Party Verification of Emission Data Description",
+                            "Participation in Carbon Markets", "Participation in Carbon Markets Description",
+                            "Development of Climate-Resilient Infrastructure", "Development of Climate-Resilient Infrastructure Description",
+                            "Reduction of Methane Emissions", "Reduction of Methane Emissions Description",
+                            "Implementation of Circular Economy Practices", "Implementation of Circular Economy Practices Description",
+                            "Collaboration with Industry Peers on Climate Action", "Collaboration with Industry Peers on Climate Action Description",
+                            "Use of Science-Based Targets", "Use of Science-Based Targets Description",
+                            "Monitoring and Reporting Mechanisms", "Monitoring and Reporting Mechanisms Description"
+                        ],
+                        "additionalProperties": False
+                    }
+                },
+                "Materiality Parameters": {
+                    "type": "array",
+                    "items": {
+                        "type": "object",
+                        "properties": {
+                            "Stakeholder Engagement Level": {
+                                "type": ["string", "null"]
+                            },
+                            "Stakeholder Engagement Level Description": {
+                                "type": "string",
+                                "description": "Degree to which stakeholders are involved in organizational activities or decisions."
+                            },
+                            "Stakeholder Feedback Mechanisms": {
+                                "type": ["string", "null"]
+                            },
+                            "Stakeholder Feedback Mechanisms Description": {
+                                "type": "string",
+                                "description": "Systems in place for stakeholders to provide feedback to the organization."
+                            },
+                            "Identification of Material Issues": {
+                                "type": ["string", "null"]
+                            },
+                            "Identification of Material Issues Description": {
+                                "type": "string",
+                                "description": "Process of determining the most significant environmental, social, and governance issues relevant to the organization."
+                            },
+                            "Prioritization of Material Issues": {
+                                "type": ["string", "null"]
+                            },
+                            "Prioritization of Material Issues Description": {
+                                "type": "string",
+                                "description": "Ranking of identified material issues based on their significance to stakeholders and the organization."
+                            },
+                            "Double Materiality Assessment": {
+                                "type": ["string", "null"]
+                            },
+                            "Double Materiality Assessment Description": {
+                                "type": "string",
+                                "description": "Evaluation considering both the organization's impact on sustainability matters and the impact of those matters on the organization."
+                            },
+                            "Materiality Matrix Development": {
+                                "type": ["string", "null"]
+                            },
+                            "Materiality Matrix Development Description": {
+                                "type": "string",
+                                "description": "Creation of a visual matrix plotting material issues based on their importance to stakeholders and the organization."
+                            },
+                            "Regular Review of Material Issues": {
+                                "type": ["string", "null"]
+                            },
+                            "Regular Review of Material Issues Description": {
+                                "type": "string",
+                                "description": "Frequency and process for updating the assessment of material issues."
+                            },
+                            "Integration of Material Issues into Strategy": {
+                                "type": ["string", "null"]
+                            },
+                            "Integration of Material Issues into Strategy Description": {
+                                "type": "string",
+                                "description": "How identified material issues are incorporated into the organization's strategic planning."
+                            },
+                            "Disclosure of Material Issues": {
+                                "type": ["string", "null"]
+                            },
+                            "Disclosure of Material Issues Description": {
+                                "type": "string",
+                                "description": "Public reporting on identified material issues and how they are managed."
+                            },
+                            "Impact Assessment of Material Issues": {
+                                "type": ["string", "null"]
+                            },
+                            "Impact Assessment of Material Issues Description": {
+                                "type": "string",
+                                "description": "Analysis of the potential or actual impact of material issues on the organization and its stakeholders."
+                            }
+                        },
+                        "required": [
+                            "Stakeholder Engagement Level",
+                            "Stakeholder Engagement Level Description",
+                            "Stakeholder Feedback Mechanisms",
+                            "Stakeholder Feedback Mechanisms Description",
+                            "Identification of Material Issues",
+                            "Identification of Material Issues Description",
+                            "Prioritization of Material Issues",
+                            "Prioritization of Material Issues Description",
+                            "Double Materiality Assessment",
+                            "Double Materiality Assessment Description",
+                            "Materiality Matrix Development",
+                            "Materiality Matrix Development Description",
+                            "Regular Review of Material Issues",
+                            "Regular Review of Material Issues Description",
+                            "Integration of Material Issues into Strategy",
+                            "Integration of Material Issues into Strategy Description",
+                            "Disclosure of Material Issues",
+                            "Disclosure of Material Issues Description",
+                            "Impact Assessment of Material Issues",
+                            "Impact Assessment of Material Issues Description"
+                        ],
+                        "additionalProperties": False
+                    }
+                }
+            },
+            "required": ["company_name", "Greenhouse Gas (GHG) Protocol Parameters", "Net Zero Intervention Parameters", "Materiality Parameters"],
+            "additionalProperties": False
+        }
+    }
+}
+GEMINI_RESPONSE_FORMAT = {
+    "type": "object",
+    "properties": {
+        "Company Name": {
+        "type": "string",
+        "description": "Name of the company."
+        },
+        "Greenhouse Gas (GHG) Protocol Parameters": {
+        "type": "object",
+        "properties": {
+            "Total GHG Emissions": { "type": "integer", "nullable": True, "description": "Total greenhouse gases emitted by the organization. Units: Metric Tons CO₂e." },
+            "Scope 1 Emissions": { "type": "integer", "nullable": True, "description": "Direct GHG emissions from owned or controlled sources. Units: Metric Tons CO₂e." },
+            "Scope 2 Emissions": { "type": "integer", "nullable": True, "description": "Indirect GHG emissions from the consumption of purchased electricity, steam, heating, and cooling. Units: Metric Tons CO₂e." },
+            "Scope 3 Emissions": { "type": "integer", "nullable": True, "description": "Other indirect emissions occurring in the value chain, including both upstream and downstream emissions. Units: Metric Tons CO₂e." },
+            "CO₂ Emissions": { "type": "integer", "nullable": True, "description": "Emissions of carbon dioxide. Units: Metric Tons CO₂." },
+            "CH₄ Emissions": { "type": "integer", "nullable": True, "description": "Emissions of methane. Units: Metric Tons CH₄." },
+            "N₂O Emissions": { "type": "integer", "nullable": True, "description": "Emissions of nitrous oxide. Units: Metric Tons N₂O." },
+            "HFC Emissions": { "type": "integer", "nullable": True, "description": "Emissions of hydrofluorocarbons. Units: Metric Tons HFCs" },
+            "PFC Emissions": { "type": "integer", "nullable": True, "description": "Emissions of perfluorocarbons. Units: Metric Tons PFCs" },
+            "SF₆ Emissions": { "type": "integer", "nullable": True, "description": "Emissions of sulfur hexafluoride. Units: Metric Tons SF₆." },
+            "NF₃ Emissions": { "type": "integer", "nullable": True, "description": "Emissions of nitrogen trifluoride. Units: Metric Tons NF₃." },
+            "Biogenic CO₂ Emissions": { "type": "integer", "nullable": True, "description": "CO₂ emissions from biological sources. Units: Metric Tons CO₂." },
+            "Emissions Intensity per Revenue": { "type": "number", "nullable": True, "description": "GHG emissions per unit of revenue. Units: Metric Tons CO₂e / Revenue." },
+            "Emissions Intensity per Employee": { "type": "number", "nullable": True, "description": "GHG emissions per employee. Units: Metric Tons CO₂e / Employee." },
+            "Base Year Emissions": { "type": "integer", "nullable": True, "description": "GHG emissions in the base year for comparison. Units: Metric Tons CO₂e." },
+            "Emissions Reduction Target": { "type": "number", "nullable": True, "description": "Targeted percentage reduction in GHG emissions. Units: Percentage (%)." },
+            "Emissions Reduction Achieved": { "type": "number", "nullable": True, "description": "Actual percentage reduction in GHG emissions achieved. Units: Percentage (%)." },
+            "Energy Consumption": { "type": "number", "nullable": True, "description": "Total energy consumed by the organization. Units: MWh or GJ." },
+            "Renewable Energy Consumption": { "type": "number", "nullable": True, "description": "Amount of energy consumed from renewable sources. Units: MWh or GJ." },
+            "Non-Renewable Energy Consumption": { "type": "number", "nullable": True, "description": "Amount of energy consumed from non-renewable sources. Units: MWh or GJ." },
+            "Energy Intensity per Revenue": { "type": "number", "nullable": True, "description": "Energy consumption per unit of revenue. Units: MWh or GJ / Revenue." },
+            "Energy Intensity per Employee": { "type": "number", "nullable": True, "description": "Energy consumption per employee. Units: MWh or GJ / Employee." },
+            "Fuel Consumption": { "type": "number", "nullable": True, "description": "Total fuel consumed by the organization. Units: Liters or GJ." },
+            "Electricity Consumption": { "type": "number", "nullable": True, "description": "Total electricity consumed. Units: MWh." },
+            "Heat Consumption": { "type": "number", "nullable": True, "description": "Total heat energy consumed. Units: GJ." },
+            "Steam Consumption": { "type": "number", "nullable": True, "description": "Total steam energy consumed. Units: GJ." },
+            "Cooling Consumption": { "type": "number", "nullable": True, "description": "Total energy consumed for cooling. Units: GJ." },
+            "Purchased Goods and Services Emissions": { "type": "integer", "nullable": True, "description": "Emissions from purchased goods and services. Units: Metric Tons CO₂e." },
+            "Capital Goods Emissions": { "type": "integer", "nullable": True, "description": "Emissions from the production of capital goods. Units: Metric Tons CO₂e." },
+            "Fuel- and Energy-Related Activities Emissions": { "type": "integer", "nullable": True, "description": "Emissions related to fuel and energy production not included in Scope 1 or 2. Units: Metric Tons CO₂e." },
+            "Upstream Transportation and Distribution Emissions": { "type": "integer", "nullable": True, "description": "Emissions from transportation and distribution in the supply chain. Units: Metric Tons CO₂e." },
+            "Waste Generated in Operations Emissions": { "type": "integer", "nullable": True, "description": "Emissions from waste generated during operations. Units: Metric Tons CO₂e." },
+            "Business Travel Emissions": { "type": "integer", "nullable": True, "description": "Emissions from employee business travel. Units: Metric Tons CO₂e." },
+            "Employee Commuting Emissions": { "type": "integer", "nullable": True, "description": "Emissions from employees commuting to and from work. Units: Metric Tons CO₂e." },
+            "Upstream Leased Assets Emissions": { "type": "integer", "nullable": True, "description": "Emissions from leased assets upstream in the value chain. Units: Metric Tons CO₂e." },
+            "Downstream Transportation and Distribution Emissions": { "type": "integer", "nullable": True, "description": "Emissions from transportation and distribution of sold products. Units: Metric Tons CO₂e." },
+            "Processing of Sold Products Emissions": { "type": "integer", "nullable": True, "description": "Emissions from processing intermediate products sold by the organization. Units: Metric Tons CO₂e." },
+            "Use of Sold Products Emissions": { "type": "integer", "nullable": True, "description": "Emissions from the use of sold products by consumers. Units: Metric Tons CO₂e." },
+            "End-of-Life Treatment of Sold Products Emissions": { "type": "integer", "nullable": True, "description": "Emissions from the disposal of sold products at end of life. Units: Metric Tons CO₂e." },
+            "Downstream Leased Assets Emissions": { "type": "integer", "nullable": True, "description": "Emissions from leased assets downstream in the value chain. Units: Metric Tons CO₂e." },
+            "Franchises Emissions": { "type": "integer", "nullable": True, "description": "Emissions from franchise operations. Units: Metric Tons CO₂e." },
+            "Investments Emissions": { "type": "integer", "nullable": True, "description": "Emissions from investments. Units: Metric Tons CO₂e." },
+            "Carbon Offsets Purchased": { "type": "integer", "nullable": True, "description": "Amount of carbon offsets purchased. Units: Metric Tons CO₂e." },
+            "Net GHG Emissions": { "type": "integer", "nullable": True, "description": "GHG emissions after accounting for offsets. Units: Metric Tons CO₂e." },
+            "Carbon Sequestration": { "type": "integer", "nullable": True, "description": "Amount of CO₂ sequestered or captured. Units: Metric Tons CO₂e." }
+        },
+        "propertyOrdering": [
+            "Total GHG Emissions",
+            "Scope 1 Emissions",
+            "Scope 2 Emissions",
+            "Scope 3 Emissions",
+            "CO₂ Emissions",
+            "CH₄ Emissions",
+            "N₂O Emissions",
+            "HFC Emissions",
+            "PFC Emissions",
+            "SF₆ Emissions",
+            "NF₃ Emissions",
+            "Biogenic CO₂ Emissions",
+            "Emissions Intensity per Revenue",
+            "Emissions Intensity per Employee",
+            "Base Year Emissions",
+            "Emissions Reduction Target",
+            "Emissions Reduction Achieved",
+            "Energy Consumption",
+            "Renewable Energy Consumption",
+            "Non-Renewable Energy Consumption",
+            "Energy Intensity per Revenue",
+            "Energy Intensity per Employee",
+            "Fuel Consumption",
+            "Electricity Consumption",
+            "Heat Consumption",
+            "Steam Consumption",
+            "Cooling Consumption",
+            "Purchased Goods and Services Emissions",
+            "Capital Goods Emissions",
+            "Fuel- and Energy-Related Activities Emissions",
+            "Upstream Transportation and Distribution Emissions",
+            "Waste Generated in Operations Emissions",
+            "Business Travel Emissions",
+            "Employee Commuting Emissions",
+            "Upstream Leased Assets Emissions",
+            "Downstream Transportation and Distribution Emissions",
+            "Processing of Sold Products Emissions",
+            "Use of Sold Products Emissions",
+            "End-of-Life Treatment of Sold Products Emissions",
+            "Downstream Leased Assets Emissions",
+            "Franchises Emissions",
+            "Investments Emissions",
+            "Carbon Offsets Purchased",
+            "Net GHG Emissions",
+            "Carbon Sequestration"
+        ]
+        }
+    },
+    "propertyOrdering": ["Company Name", "Greenhouse Gas (GHG) Protocol Parameters"]
+}

application/schemas/schema.xlsx ADDED Viewed

Binary file (55.5 kB). View file

application/services/gemini_model.py ADDED Viewed

	@@ -0,0 +1,299 @@

+import os
+import json
+from google import genai
+from google.genai import types
+from pydantic import BaseModel
+from typing import Optional, Union, BinaryIO
+from application.utils import logger
+from application.schemas.response_schema import GEMINI_RESPONSE_FORMAT
+# Rebinds the name `logger` from the imported `application.utils.logger`
+# module to the object returned by its get_logger(); after this line the
+# module itself is no longer reachable under this name.
+logger = logger.get_logger()
+# Instruction text sent to Gemini together with the PDF bytes by
+# extract_emissions_data_as_json in this module.
+# NOTE(review): the second line says "extract from a company's ... report"
+# without naming what to extract; presumably the response schema is relied on
+# to convey the fields - confirm this is intentional.
+PROMPT = (
+    """You are a PDF parsing agent.
+    Your job is to extract from a company’s sustainability or ESG report in PDF format:
+    If the values are not found in the document, please return json null for that value.
+    """
+)
+# Pydantic model describing one metric. All four fields are declared as
+# required strings (no defaults, no Optional).
+# NOTE(review): `value` is typed `str`, not a number or None - confirm that is
+# intended for metrics that are missing from a report.
+class Parameter(BaseModel):
+    """
+    A generic class to hold details for a sustainability metric.
+    """
+    # presumably an alternative name for the metric - TODO confirm
+    synonym: str
+    # presumably "unit of measure" - TODO confirm
+    uom: str
+    description: str
+    value: str
+# Pydantic model with one required `Parameter` field per metric.
+# NOTE(review): within the visible part of this module the class is referenced
+# only by EmissionData; the Gemini call in extract_emissions_data_as_json
+# passes GEMINI_RESPONSE_FORMAT as the response schema instead.
+class GreenhouseGasGHGProtocolParameters(BaseModel):
+    Total_GHG_Emissions: Parameter
+    Scope_1_Emissions: Parameter
+    Scope_2_Emissions: Parameter
+    Scope_3_Emissions: Parameter
+    CO2_Emissions: Parameter
+    CH4_Emissions: Parameter
+    N2O_Emissions: Parameter
+    HFC_Emissions: Parameter
+    PFC_Emissions: Parameter
+    SF6_Emissions: Parameter
+    NF3_Emissions: Parameter
+    Biogenic_CO2_Emissions: Parameter
+    Emissions_Intensity_per_Revenue: Parameter
+    Emissions_Intensity_per_Employee: Parameter
+    Base_Year_Emissions: Parameter
+    Emissions_Reduction_Target: Parameter
+    Emissions_Reduction_Achieved: Parameter
+    Energy_Consumption: Parameter
+    Renewable_Energy_Consumption: Parameter
+    Non_Renewable_Energy_Consumption: Parameter
+    Energy_Intensity_per_Revenue: Parameter
+    Energy_Intensity_per_Employee: Parameter
+    Fuel_Consumption: Parameter
+    Electricity_Consumption: Parameter
+    Heat_Consumption: Parameter
+    Steam_Consumption: Parameter
+    Cooling_Consumption: Parameter
+    Purchased_Goods_and_Services_Emissions: Parameter
+    Capital_Goods_Emissions: Parameter
+    Fuel_and_Energy_Related_Activities_Emissions: Parameter
+    Upstream_Transportation_and_Distribution_Emissions: Parameter
+    Waste_Generated_in_Operations_Emissions: Parameter
+    Business_Travel_Emissions: Parameter
+    Employee_Commuting_Emissions: Parameter
+    Upstream_Leased_Assets_Emissions: Parameter
+    # The ten fields below are commented out, so they are not part of this
+    # model. The reason is not recorded here.
+    # Downstream_Transportation_and_Distribution_Emissions: Parameter
+    # Processing_of_Sold_Products_Emissions: Parameter
+    # Use_of_Sold_Products_Emissions: Parameter
+    # End_of_Life_Treatment_of_Sold_Products_Emissions: Parameter
+    # Downstream_Leased_Assets_Emissions: Parameter
+    # Franchises_Emissions: Parameter
+    # Investments_Emissions: Parameter
+    # Carbon_Offsets_Purchased: Parameter
+    # Net_GHG_Emissions: Parameter
+    # Carbon_Sequestration: Parameter
+# Wrapper model with a single required field holding the GHG parameter model.
+# The field name is spelled exactly like the class used as its annotation.
+class EmissionData(BaseModel):
+    GreenhouseGasGHGProtocolParameters: GreenhouseGasGHGProtocolParameters
+# print(json.dumps(EmissionData.model_json_schema(), indent=2))
+def extract_emissions_data_as_json(
+    api: str,
+    model: str,
+    file_input: Union[BinaryIO, bytes]
+) -> Optional[dict]:
+    """
+    Extract ESG data from PDF using OpenAI or Gemini APIs.
+    Args:
+        api: 'openai' or 'gemini'
+        model: Model name (e.g. gpt-4o, gemini-pro)
+        file_input: File-like object or bytes of the PDF.
+    Returns:
+        Parsed ESG data as dict or None if failed.
+    """
+    try:
+        client = genai.Client(api_key=os.getenv("gemini_api_key"))
+        file_bytes = file_input.read()
+        logger.info("[Gemini] Sending content for generation...")
+        response = client.models.generate_content(
+            model=model,
+            contents=[
+                types.Part.from_bytes(data=file_bytes, mime_type="application/pdf"),
+                PROMPT
+            ],
+            config={
+                'response_mime_type': 'application/json',
+                'response_schema': GEMINI_RESPONSE_FORMAT,
+            }
+        )
+        logger.info("[Gemini] Response received.")
+        try:
+            return json.loads(response.text)
+        except json.JSONDecodeError:
+            logger.warning("Failed to parse JSON, returning raw response.")
+            return {"raw_response": response.text}
+    except Exception as e:
+        logger.exception(f"Error during ESG data extraction.{e}")
+        return None
+# import os
+# from google import genai
+# from pydantic import BaseModel, Field, ValidationError
+# from dotenv import load_dotenv
+# from typing import Optional
+# from google.genai import types
+# load_dotenv()
+# client = genai.Client(api_key=os.getenv("gemini_api_key"))
+# schema= """{
+#   "parameters": [
+#     {
+#       "parameter": "Total GHG Emissions",
+#       "dataType": "Numeric",
+#       "synonyms": ["Carbon Footprint"],
+#       "uom": "Metric Tons CO₂e",
+#       "description": "Total greenhouse gases emitted by the organization."
+#     },
+#     {
+#       "parameter": "Scope 1 Emissions",
+#       "dataType": "Numeric",
+#       "synonyms": ["Direct Emissions"],
+#       "uom": "Metric Tons CO₂e",
+#       "description": "Direct GHG emissions from owned or controlled sources."
+#     },
+#     {
+#       "parameter": "Scope 2 Emissions",
+#       "dataType": "Numeric",
+#       "synonyms": ["Indirect Energy Emissions"],
+#       "uom": "Metric Tons CO₂e",
+#       "description": "Indirect GHG emissions from the consumption of purchased electricity, steam, heating, and cooling."
+#     },
+#     {
+#       "parameter": "Scope 3 Emissions",
+#       "dataType": "Numeric",
+#       "synonyms": ["Value Chain Emissions"],
+#       "uom": "Metric Tons CO₂e",
+#       "description": "Other indirect emissions occurring in the value chain, including both upstream and downstream emissions."
+#     },
+#     {
+#       "parameter": "CO₂ Emissions",
+#       "dataType": "Numeric",
+#       "synonyms": ["Carbon Emissions"],
+#       "uom": "Metric Tons CO₂",
+#       "description": "Emissions of carbon dioxide."
+#     },
+#     {
+#       "parameter": "CH₄ Emissions",
+#       "dataType": "Numeric",
+#       "synonyms": ["Methane Emissions"],
+#       "uom": "Metric Tons CH₄",
+#       "description": "Emissions of methane."
+#     },
+#     {
+#       "parameter": "N₂O Emissions",
+#       "dataType": "Numeric",
+#       "synonyms": ["Nitrous Oxide Emissions"],
+#       "uom": "Metric Tons N₂O",
+#       "description": "Emissions of nitrous oxide."
+#     },
+#     {
+#       "parameter": "HFC Emissions",
+#       "dataType": "Numeric",
+#       "synonyms": ["Hydrofluorocarbon Emissions"],
+#       "uom": "Metric Tons HFCs",
+#       "description": "Emissions of hydrofluorocarbons."
+#     },
+#     {
+#       "parameter": "PFC Emissions",
+#       "dataType": "Numeric",
+#       "synonyms": ["Perfluorocarbon Emissions"],
+#       "uom": "Metric Tons PFCs",
+#       "description": "Emissions of perfluorocarbons."
+#     },
+#     {
+#       "parameter": "SF₆ Emissions",
+#       "dataType": "Numeric",
+#       "synonyms": ["Sulfur Hexafluoride Emissions"],
+#       "uom": "Metric Tons SF₆",
+#       "description": "Emissions of sulfur hexafluoride."
+#     },
+#     {
+#       "parameter": "NF₃ Emissions",
+#       "dataType": "Numeric",
+#       "synonyms": ["Nitrogen Trifluoride Emissions"],
+#       "uom": "Metric Tons NF₃",
+#       "description": "Emissions of nitrogen trifluoride."
+#     },
+#     {
+#       "parameter": "Biogenic CO₂ Emissions",
+#       "dataType": "Numeric",
+#       "synonyms": ["Biogenic Carbon Emissions"],
+#       "uom": "Metric Tons CO₂",
+#       "description": "CO₂ emissions from biological sources."
+#     },
+#     {
+#       "parameter": "Emissions Intensity per Revenue",
+#       "dataType": "Numeric",
+#       "synonyms": ["Carbon Intensity"],
+#       "uom": "Metric Tons CO₂e / Revenue",
+#       "description": "GHG emissions per unit of revenue."
+#     },
+#     {
+#       "parameter": "Emissions Intensity per Employee",
+#       "dataType": "Numeric",
+#       "synonyms": ["Emissions per Employee"],
+#       "uom": "Metric Tons CO₂e / Employee",
+#       "description": "GHG emissions per employee."
+#     },
+#     {
+#       "parameter": "Base Year Emissions",
+#       "dataType": "Numeric",
+#       "synonyms": ["Baseline Emissions"],
+#       "uom": "Metric Tons CO₂e",
+#       "description": "GHG emissions in the base year for comparison."
+#     },
+#     {
+#       "parameter": "Emissions Reduction Target",
+#       "dataType": "Numeric",
+#       "synonyms": ["Emission Reduction Goal"],
+#       "uom": "Percentage (%)",
+#       "description": "Targeted percentage reduction in GHG emissions."
+#     },
+#     {
+#       "parameter": "Emissions Reduction Achieved",
+#       "dataType": "Numeric",
+#       "synonyms": ["Emission Reduction Accomplished"],
+#       "uom": "Percentage (%)",
+#       "description": "Actual percentage reduction in GHG emissions achieved."
+#     },
+#     {
+#       "parameter": "Energy Consumption",
+#       "dataType": "Numeric",
+#       "synonyms": ["Energy Use"],
+#       "uom": "MWh or GJ",
+#       "description": "Total energy consumed by the organization."
+#     },
+#     {
+#       "parameter": "Renewable Energy Consumption",
+#       "dataType": "Numeric",
+#       "synonyms": ["Green Energy Use"],
+#       "uom": "MWh or GJ",
+#       "description": "Amount of energy consumed from renewable sources."
+#     },
+#     {
+#       "parameter": "Non-Renewable Energy Consumption",
+#       "dataType": "Numeric",
+#       "synonyms": ["Fossil Energy Use"],
+#       "uom": "MWh or GJ",
+#       "description": "Amount of energy consumed from non-renewable sources."
+#     },
+#     {
+#       "parameter": "Carbon Offsets Purchased",
+#       "dataType": "Numeric",
+#       "synonyms": ["Carbon Credits"],
+#       "uom": "Metric Tons CO₂e",
+#       "description": "Amount of carbon offsets purchased."
+#     },
+#     {
+#       "parameter": "Net GHG Emissions",
+#       "dataType": "Numeric",
+#       "synonyms": ["Net Carbon Emissions"],
+#       "uom": "Metric Tons CO₂e",
+#       "description": "GHG emissions after accounting for offsets."
+#     },
+#     {
+#       "parameter": "Carbon Sequestration",
+#       "dataType": "Numeric",
+#       "synonyms": ["Carbon Capture"],
+#       "uom": "Metric Tons CO₂e",
+#       "description": "Amount of CO₂ sequestered or captured."
+#     }
+#   ]
+# }"""

application/services/llm_service.py ADDED Viewed

	@@ -0,0 +1,349 @@

+import os
+import json
+from typing import Union, BinaryIO, Optional
+from openai import OpenAI
+from google import genai
+from google.genai import types
+from application.utils import logger
+from application.schemas.response_schema import RESPONSE_FORMAT,GEMINI_RESPONSE_FORMAT
+# Rebinds the name `logger` from the imported `application.utils.logger`
+# module to the object returned by its get_logger().
+logger = logger.get_logger()
+# Module-level OpenAI client, constructed when this module is imported; it is
+# the client used by get_files().
+client = OpenAI()
+# --- Constants ---
+# Instruction text sent along with the PDF in both the OpenAI and the Gemini
+# branch of extract_emissions_data_as_json.
+PROMPT = (
+    "You are a PDF parsing agent. "
+    "Your job is to extract GHG Protocol Parameters and ESG (Environmental, Social, Governance) Data "
+    "from a company’s sustainability or ESG report in PDF format."
+)
+# --- OpenAI Helpers ---
+def get_files() -> list:
+    """Retrieve all files from OpenAI client."""
+    try:
+        files = client.files.list()
+        logger.info(f"Retrieved {len(files.data)} files.")
+        return files.data
+    except Exception as e:
+        logger.error(f"Failed to retrieve files: {e}")
+        raise
+def get_or_create_file(file_input: BinaryIO, client) -> object:
+    """
+    Retrieve a file from OpenAI by name or upload it if not present.
+    Args:
+        file_input: File-like object with `.name` attribute.
+        client: OpenAI client instance.
+    Returns:
+        File object.
+    """
+    file_name = getattr(file_input, 'name', None)
+    if not file_name:
+        raise ValueError("File input must have a 'name' attribute.")
+    try:
+        for file in get_files():
+            if file.filename == file_name:
+                logger.info(f"File '{file_name}' already exists with ID: {file.id}")
+                return client.files.retrieve(file.id)
+        logger.info(f"Uploading new file '{file_name}'...")
+        new_file = client.files.create(file=(file_name, file_input), purpose="assistants")
+        logger.info(f"File uploaded successfully with ID: {new_file.id}")
+        return new_file
+    except Exception as e:
+        logger.error(f"Error during get_or_create_file: {e}")
+        raise
+def delete_file_by_size(size: int, client):
+    """
+    Deletes files from OpenAI that match a given byte size.
+    Args:
+        size: File size in bytes to match for deletion.
+        client: OpenAI client instance.
+    """
+    try:
+        files = get_files()
+        for file in files:
+            if file.bytes == size:
+                client.files.delete(file.id)
+                logger.info(f"File {file.filename} deleted (size matched: {size} bytes).")
+            else:
+                logger.info(f"File {file.filename} skipped (size mismatch).")
+    except Exception as e:
+        logger.error(f"Failed to delete files: {e}")
+        raise
+# --- Main Function ---
+def extract_emissions_data_as_json(
+    api: str,
+    model: str,
+    file_input: Union[BinaryIO, bytes]
+) -> Optional[dict]:
+    """
+    Extract ESG data from PDF using OpenAI or Gemini APIs.
+    Args:
+        api: 'openai' or 'gemini'
+        model: Model name (e.g. gpt-4o, gemini-pro)
+        file_input: File-like object or bytes of the PDF.
+    Returns:
+        Parsed ESG data as dict or None if failed.
+    """
+    try:
+        if api.lower() == "openai":
+            client = OpenAI()
+            file = get_or_create_file(file_input, client)
+            logger.info("[OpenAI] Sending content for generation...")
+            response = client.chat.completions.create(
+                model=model,
+                messages=[{
+                    "role": "user",
+                    "content": [
+                        {"type": "file", "file": {"file_id": file.id}},
+                        {"type": "text", "text": PROMPT}
+                    ]
+                }],
+                response_format=RESPONSE_FORMAT
+            )
+            result = response.choices[0].message.content
+            logger.info("ESG data extraction successful.")
+            return result
+        elif api.lower() == "gemini":
+            client = genai.Client(api_key=os.getenv("gemini_api_key"))
+            file_bytes = file_input.read()
+            logger.info("[Gemini] Sending content for generation...")
+            response = client.models.generate_content(
+                model=model,
+                contents=[
+                    types.Part.from_bytes(data=file_bytes, mime_type="application/pdf"),
+                    PROMPT
+                ],
+                config={
+                    'response_mime_type': 'application/json',
+                    'response_schema': GEMINI_RESPONSE_FORMAT,
+                }
+            )
+            logger.info("[Gemini] Response received.")
+            try:
+                return json.loads(response.text)
+            except json.JSONDecodeError:
+                logger.warning("Failed to parse JSON, returning raw response.")
+                return {"raw_response": response.text}
+        else:
+            logger.error(f"Unsupported API: {api}")
+            return None
+    except Exception as e:
+        logger.exception("Error during ESG data extraction.")
+        return None
+# --- Debug Helper ---
+def list_all_files():
+    """Lists all files currently uploaded to OpenAI."""
+    try:
+        files = get_files()
+        for file in files:
+            logger.info(f"File ID: {file.id}, Name: {file.filename}, Size: {file.bytes} bytes")
+    except Exception as e:
+        logger.error(f"Failed to list files: {e}")
+# import os
+# import json
+# from google import genai
+# from google.genai import types
+# from openai import OpenAI
+# from dotenv import load_dotenv
+# from application.utils import logger
+# import pandas as pd
+# import openpyxl
+# load_dotenv()
+# logger = logger.get_logger()
+# def load_schema_from_excel(file_path) -> str:
+#     df = pd.read_excel(file_path,engine='openpyxl')
+#     schema_lines = ["Schema fields and expected format:\n"]
+#     for _, row in df.iterrows():
+#         field = row.get("Field", "")
+#         description = row.get("Description", "")
+#         example = row.get("Example", "")
+#         schema_lines.append(f"- {field}: {description} (e.g., {example})")
+#     return "\n".join(schema_lines)
+# schema_text = load_schema_from_excel("application/schemas/schema.xlsx")
+# # print(schema_text)
+# PROMPT = (f"""You are a PDF parsing agent. Your job is to extract GHG Protocol Parameters and ESG (Environmental, Social, Governance) Data from a company’s sustainability or ESG report in PDF format.
+#             Please return the response as raw JSON without markdown formatting (no triple backticks or json tags) using the following fields:
+#             Total GHG emissions (Metric Tons CO₂e)
+#             Scope 1, 2, and 3 emissions
+#             Emissions by gas (CO₂, CH₄, N₂O, HFCs, etc.)
+#             Energy and fuel consumption (MWh, GJ, Liters)
+#             Carbon offsets, intensity metrics, and reduction targets
+#             ESG disclosures including:
+#             Environmental Policies
+#             Whether the company has an Environmental Management System (EMS)
+#             Environmental certifications (if any)
+#             Ensure values include their units, are extracted accurately, and the fields match the schema provided below and If the value is zero replace it with null:
+#             {schema_text}
+#          """)
+# def extract_emissions_data_as_json(api, model, file_input):
+#     if api.lower()=="openai":
+#         client = OpenAI()
+#         file = client.files.create(
+#             file=("uploaded.pdf", file_input),
+#             purpose="assistants"
+#         )
+#         completion = client.chat.completions.create(
+#             model=model,
+#             messages=[
+#                 {
+#                     "role": "user",
+#                     "content": [
+#                         {
+#                             "type": "file",
+#                             "file": {
+#                                 "file_id": file.id,
+#                             }
+#                         },
+#                         {
+#                             "type": "text",
+#                             "text":PROMPT,
+#                         },
+#                     ]
+#                 }
+#             ]
+#         )
+#         try:
+#             return json.loads(completion.choices[0].message.content)
+#         except json.JSONDecodeError:
+#             logger.error("Warning: Output was not valid JSON.")
+#             return {"raw_response": completion.choices[0].message.content}
+#     if api.lower()=="gemini":
+#         client = genai.Client(api_key=os.getenv('gemini_api_key'))
+#         file_bytes= file_input.read()
+#         response = client.models.generate_content(
+#         model=model,
+#         contents=[
+#             types.Part.from_bytes(
+#                 data=file_bytes,
+#                 mime_type='application/pdf',
+#             ),
+#             PROMPT])
+#         try:
+#             return json.loads(response.text)
+#         except json.JSONDecodeError:
+#             return {"raw_response": response.text}
+#                 # {
+#             # "type": "object",
+#             # "properties": {
+#             # "GHG_Protocol_Parameters": {
+#             #     "type": "object",
+#             #     "properties": {
+#             #     "Total_GHG_Emissions": { "type": "number" },
+#             #     "Scope_1_Emissions": { "type": "number" },
+#             #     "Scope_2_Emissions": { "type": "number" },
+#             #     "Scope_3_Emissions": { "type": "number" },
+#             #     "CO2_Emissions": { "type": "number" },
+#             #     "CH4_Emissions": { "type": "number" },
+#             #     "N2O_Emissions": { "type": "number" },
+#             #     "HFC_Emissions": { "type": "number" },
+#             #     "PFC_Emissions": { "type": "number" },
+#             #     "SF6_Emissions": { "type": "number" },
+#             #     "NF3_Emissions": { "type": "number" },
+#             #     "Biogenic_CO2_Emissions": { "type": "number" },
+#             #     "Emissions_Intensity_per_Revenue": { "type": "number" },
+#             #     "Emissions_Intensity_per_Employee": { "type": "number" },
+#             #     "Base_Year_Emissions": { "type": "number" },
+#             #     "Emissions_Reduction_Target": { "type": "number" },
+#             #     "Emissions_Reduction_Achieved": { "type": "number" },
+#             #     "Energy_Consumption": { "type": "number" },
+#             #     "Renewable_Energy_Consumption": { "type": "number" },
+#             #     "Non_Renewable_Energy_Consumption": { "type": "number" },
+#             #     "Energy_Intensity_per_Revenue": { "type": "number" },
+#             #     "Energy_Intensity_per_Employee": { "type": "number" },
+#             #     "Fuel_Consumption": { "type": "number" },
+#             #     "Electricity_Consumption": { "type": "number" },
+#             #     "Heat_Consumption": { "type": "number" },
+#             #     "Steam_Consumption": { "type": "number" },
+#             #     "Cooling_Consumption": { "type": "number" },
+#             #     "Purchased_Goods_and_Services_Emissions": { "type": "number" },
+#             #     "Capital_Goods_Emissions": { "type": "number" },
+#             #     "Fuel_and_Energy_Related_Activities_Emissions": { "type": "number" },
+#             #     "Upstream_Transportation_and_Distribution_Emissions": { "type": "number" },
+#             #     "Waste_Generated_in_Operations_Emissions": { "type": "number" },
+#             #     "Business_Travel_Emissions": { "type": "number" },
+#             #     "Employee_Commuting_Emissions": { "type": "number" },
+#             #     "Upstream_Leased_Assets_Emissions": { "type": "number" },
+#             #     "Downstream_Transportation_and_Distribution_Emissions": { "type": "number" },
+#             #     "Processing_of_Sold_Products_Emissions": { "type": "number" },
+#             #     "Use_of_Sold_Products_Emissions": { "type": "number" },
+#             #     "End_of_Life_Treatment_of_Sold_Products_Emissions": { "type": "number" },
+#             #     "Downstream_Leased_Assets_Emissions": { "type": "number" },
+#             #     "Franchises_Emissions": { "type": "number" },
+#             #     "Investments_Emissions": { "type": "number" },
+#             #     "Carbon_Offsets_Purchased": { "type": "number" },
+#             #     "Net_GHG_Emissions": { "type": "number" },
+#             #     "Carbon_Sequestration": { "type": "number" }
+#             #     }
+#             # },
+#             # "ESG_Parameters_CSRS": {
+#             #     "type": "object",
+#             #     "properties": {
+#             #     "Environmental_Policies": { "type": "string" },
+#             #     "Environmental_Management_System": { "type": "boolean" },
+#             #     "Environmental_Certifications": { "type": "string" }
+#             #     }
+#             # }
+#             # },
+#             # "required": ["GHG_Protocol_Parameters", "ESG_Parameters_CSRS"]}

application/services/openai_model.py ADDED Viewed

	@@ -0,0 +1,251 @@

+# from pydantic import BaseModel
+# from openai import OpenAI
+# from typing import List, Dict, Optional, Union
+# client = OpenAI()
+# class GHGParameter(BaseModel):
+#     parameter: str
+#     data_type: str
+#     synonyms: Optional[List[str]] = None
+#     uom: Optional[str] = None
+#     description: Optional[str] = None
+#     value: Union[int, str, None]
+# class GHGCategory(BaseModel):
+#     category: str
+#     parameters: List[GHGParameter]
+# SCHEMA = """{
+#   "Gas (GHG)": {
+#     "Total GHG Emissions": {
+#       "data_type": "Numeric",
+#       "synonyms": ["Carbon Footprint"],
+#       "uom": "Metric Tons CO₂e",
+#       "description": "Total greenhouse gases emitted by the organization.",
+#       "value": null
+#     }"""
+# PROMPT = (f"""You are a PDF parsing agent.
+#             Fetch the following data from pdf : {SCHEMA}"""
+#             )
+# def extract_emissions_data_as_json(api, model, file_input):
+#     if api.lower() == "openai":
+#         file = client.files.create(
+#             file=("uploaded.pdf", file_input),
+#             purpose="assistants"
+#         )
+#         completion = client.beta.chat.completions.parse(
+#             model="gpt-4o-2024-08-06",
+#             messages=[
+#                 {
+#                     "role": "user",
+#                     "content": [
+#                         {
+#                             "type": "file",
+#                             "file": {
+#                                 "file_id": file.id,
+#                             }
+#                         },
+#                         {
+#                             "type": "text",
+#                             "text":PROMPT,
+#                         },
+#                     ]
+#                 }
+#             ],
+#             response_format=GHGCategory,
+#         )
+#         research_paper = completion.choices[0].message.parsed
+#         return research_paper
+# from pydantic import BaseModel
+# from openai import OpenAI
+# client = OpenAI()
+# class CalendarEvent(BaseModel):
+#     name: str
+#     date: str
+#     participants: list[str]
+# def extract_emissions_data_as_json(api, model, file_input):
+#     if api.lower() == "openai":
+#             file = client.files.create(
+#                 file=("uploaded.pdf", file_input),
+#                 purpose="assistants"
+#             )
+#     completion = client.beta.chat.completions.parse(
+#         model="gpt-4o-2024-08-06",
+#         messages=[
+#                     {
+#                         "role": "user",
+#                         "content": [
+#                             {
+#                                 "type": "file",
+#                                 "file": {
+#                                     "file_id": file.id,
+#                                 }
+#                             },
+#                             {
+#                                 "type": "text",
+#                                 "text":PROMPT,
+#                             },
+#                         ]
+#                     }
+#                 ],
+#         response_format=GHGCategory,
+#     )
+#     event = completion.choices[0].message.parsed
+# response = client.chat.completions.create(
+#     model="gpt-4o-2024-08-06",
+#     messages=[
+#         {"role": "system", "content": "You are a helpful math tutor. Guide the user through the solution step by step."},
+#         {"role": "user", "content": "how can I solve 8x + 7 = -23"}
+#     ],
+#     response_format={
+#         "type": "json_schema",
+#         "json_schema": {
+#             "name": "GHGCategory",
+#             "schema": {
+#                 "type": "object",
+#                 "properties": {
+#                     "steps": {
+#                         "type": "array",
+#                         "items": {
+#                             "type": "object",
+#                             "properties": {
+#                                 "explanation": {"type": "string"},
+#                                 "output": {"type": "string"}
+#                             },
+#                             "required": ["explanation", "output"],
+#                             "additionalProperties": False
+#                         }
+#                     },
+#                     "final_answer": {"type": "string"}
+#                 },
+#                 "required": ["steps", "final_answer"],
+#                 "additionalProperties": False
+#             },
+#             "strict": True
+#         }
+#     }
+# )
+# print(response.choices[0].message.content)
+# response = await async_client.responses.create(
+#             model="gpt-4o",
+#             input=[
+#                 {
+#                     "role": "user",
+#                     "content": [
+#                         {
+#                             "type": "input_file",
+#                             "file_id": uploaded_file.id,
+#                         },
+#                         {
+#                             "type": "input_text",
+#                             "text": """
+#                             You are an intelligent PDF data extractor designed to extract structured information from Brand Books. A Brand Book contains guidelines and details about a brand's identity, including its logo, colors, typography, messaging, and more.
+#                             Ensure the extracted data follows this schema strictly.
+#                             Return the extracted brand information in JSON format with no explaination.
+#                             For brand_logo and favicon, always provide a direct URL to the image instead of just the image name or a placeholder. If no valid URLs are found, return an empty array.                        """
+#                         }
+#                     ]
+#                 }
+#             ],
+#             text={
+#                 "format": {
+#                     "type": "json_schema",
+#                     "name": "BrandBook",
+#                     "strict": True,
+#                     "schema": {
+#                         "type": "object",
+#                         "properties": {
+#                             "brand_url": {
+#                                 "type": "string",
+#                                 "description": "The URL associated with the brand."
+#                             },
+#                             "brand_name": {
+#                                 "type": "string",
+#                                 "description": "The name of the brand."
+#                             },
+#                             "brand_category": {
+#                                 "type": "array",
+#                                 "description": "A list of categories that the brand belongs to.",
+#                                 "items": {
+#                                     "type": "string"
+#                                 }
+#                             },
+#                             "brand_description": {
+#                                 "type": "string",
+#                                 "description": "A brief description of the brand."
+#                             },
+#                             "brand_colors": {
+#                                 "type": "array",
+#                                 "description": "A list of colors associated with the brand.",
+#                                 "items": {
+#                                     "type": "string"
+#                                 }
+#                             },
+#                             "brand_fonts": {
+#                                 "type": "array",
+#                                 "description": "A list of fonts used by the brand.",
+#                                 "items": {
+#                                     "type": "string"
+#                                 }
+#                             },
+#                             "brand_logo": {
+#                                 "type": "array",
+#                                 "description": "A list of logo urls associated with the brand.",
+#                                 "items": {
+#                                     "type": "string"
+#                                 }
+#                             },
+#                             "target_audience": {
+#                                 "type": "string",
+#                                 "description": "The target audience for the brand."
+#                             },
+#                             "competitors": {
+#                                 "type": "string",
+#                                 "description": "The competitors of the brand."
+#                             },
+#                             "aspirational_brands": {
+#                                 "type": "string",
+#                                 "description": "Brands that the brand aspires to be like."
+#                             },
+#                             "favicon": {
+#                                 "type": "array",
+#                                 "description": "A list of favicon URLs associated with the brand.",
+#                                 "items": {
+#                                     "type": "string"
+#                                 }
+#                             }
+#                         },
+#                         "required": [
+#                             "brand_url",
+#                             "brand_name",
+#                             "brand_category",
+#                             "brand_description",
+#                             "brand_colors",
+#                             "brand_fonts",
+#                             "brand_logo",
+#                             "target_audience",
+#                             "competitors",
+#                             "aspirational_brands",
+#                             "favicon"
+#                         ],
+#                         "additionalProperties": False
+#                     }
+#                 }
+#             }
+#         )

application/services/streamlit_function.py ADDED Viewed

	@@ -0,0 +1,89 @@

+import streamlit as st
+from typing import Union, List
+from application.utils import logger
+# Rebinds the name `logger` from the imported module to the object returned by
+# its get_logger() function; the module itself is not reachable by that name afterwards.
+logger = logger.get_logger()
+# Default browser-tab title used by config_homepage().
+PAGE_TITLE = "PDF Extractor"
+# Layout value passed to st.set_page_config().
+PAGE_LAYOUT = "wide"
+# Disabled settings: they are only referenced by the commented-out
+# page_icon / menu_items arguments in config_homepage().
+# PAGE_ICON = "src/frontend/images/page_icon.jpg"
+# GITHUB_LINK = "https://github.com/Vela-Test1993/yuvabe-care-companion-ai"
+# ABOUT_US = "An AI-powered assistant for personalized healthcare guidance."
+def config_homepage(page_title=PAGE_TITLE):
+    """
+    Configures the Streamlit homepage with essential settings.
+    This function sets up the page title, icon, layout, and sidebar state.
+    It also defines custom menu items for better navigation.
+    Args:
+        page_title (str): The title displayed on the browser tab (default is PAGE_TITLE).
+    Key Features:
+    - Ensures `st.set_page_config()` is called only once to avoid errors.
+    - Uses constants for improved maintainability and consistency.
+    - Provides links for help, bug reporting, and an 'About' section.
+    Example:
+        >>> config_homepage("My Custom App")
+    """
+    if "page_config_set" not in st.session_state:
+        st.set_page_config(
+            page_title=page_title,
+            # page_icon=PAGE_ICON,
+            layout=PAGE_LAYOUT,
+            initial_sidebar_state="collapsed",
+            # menu_items={
+            #     "Get help": GITHUB_LINK,
+            #     "Report a bug": GITHUB_LINK,
+            #     "About": ABOUT_US
+            # }
+        )
+        # st.session_state.page_config_set = True
+def upload_file(
+    file_types: Union[str, List[str]] = "pdf",
+    label: str = "📤 Upload a file",
+    help_text: str = "Upload your file for processing.",
+    allow_multiple: bool = False,
+):
+    """
+    Streamlit file uploader widget with options.
+    Args:
+        file_types (str or list): Allowed file type(s), e.g., "pdf" or ["pdf", "docx"].
+        label (str): Label displayed above the uploader.
+        help_text (str): Tooltip help text.
+        allow_multiple (bool): Allow multiple file uploads.
+    Returns:
+        Uploaded file(s): A single file object or a list of file objects.
+    """
+    if isinstance(file_types, str):
+        file_types = [file_types]
+    uploaded_files = st.file_uploader(
+        label=label,
+        type=file_types,
+        help=help_text,
+        accept_multiple_files=allow_multiple
+    )
+    if st.button("Submit"):
+        st.session_state.pdf_file = uploaded_files
+        return uploaded_files
+# def extract_text_from_pdf(file) -> str:
+#     """
+#     Extracts and returns the full text content from a PDF file.
+#     :param file: PDF file object (BytesIO or UploadedFile from Streamlit)
+#     :return: Extracted text as a string
+#     """
+#     text = ""
+#     with fitz.open(stream=file.read(), filetype="pdf") as doc:
+#         for page in doc:
+#             text += page.get_text()
+#     return text.strip()

application/utils/logger.py ADDED Viewed

	@@ -0,0 +1,35 @@

+import logging
+from logging.handlers import RotatingFileHandler
+import os
+# File name of the rotating log file written by get_logger().
+log_file = 'eco_scribe.log'
+# Directory (relative to the working directory) in which the log file is created.
+log_dir = 'logs/app'
+# Level applied to the logger itself when its handlers are first attached.
+log_level=logging.INFO
+def get_logger( ):
+    if not os.path.exists(log_dir):
+        os.makedirs(log_dir)
+    log_file_path = os.path.join(log_dir, log_file)
+    logger = logging.getLogger(__name__)
+    if not logger.hasHandlers():
+        logger.setLevel(log_level)
+        console_handler = logging.StreamHandler()
+        console_handler.setLevel(logging.DEBUG)
+        file_handler = RotatingFileHandler(log_file_path, maxBytes=5*1024*1024, backupCount=3)
+        file_handler.setLevel(logging.INFO)
+        log_format = '%(asctime)s - %(levelname)s - %(message)s'
+        formatter = logging.Formatter(log_format, datefmt='%Y-%m-%d %H:%M')
+        console_handler.setFormatter(formatter)
+        file_handler.setFormatter(formatter)
+        logger.addHandler(console_handler)
+        logger.addHandler(file_handler)
+    return logger

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+streamlit
+openai
+dotenv
+google
+google.genai
+google-generativeai
+pymupdf
+openpyxl
+pandas

test.py ADDED Viewed

File without changes