Spaces:

VelaTest
/

Sustainability_Report_Extractor

Sleeping

App Files Files Community

Vela committed on Jun 7

Commit

5d4ad83

0 Parent(s):

Created a frontend dashboard

Browse files

Files changed (15) hide show

.gitignore +3 -0
README.md +12 -0
app.py +125 -0
pages/database.py +92 -0
requirements.txt +5 -0
src/services/__pycache__/mongo_db_service.cpython-313.pyc +0 -0
src/services/mongo_db_service.py +139 -0
src/utils/__pycache__/common_functions.cpython-313.pyc +0 -0
src/utils/__pycache__/gics_schema.cpython-313.pyc +0 -0
src/utils/__pycache__/logger.cpython-313.pyc +0 -0
src/utils/__pycache__/streamlit_function.cpython-313.pyc +0 -0
src/utils/__pycache__/system_prompts.cpython-313.pyc +0 -0
src/utils/common_functions.py +61 -0
src/utils/logger.py +34 -0
src/utils/streamlit_function.py +67 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+.venv
+logs
+.env

README.md ADDED Viewed

	@@ -0,0 +1,12 @@

+---
+title: PDFExtractor
+emoji: 🌍
+colorFrom: gray
+colorTo: pink
+sdk: streamlit
+sdk_version: 1.44.1
+app_file: app.py
+pinned: false
+short_description: An AI-powered tool that extracts sustainability data
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,125 @@

+import streamlit as st
+import pandas as pd
+import os
+from src.utils import streamlit_function
+from src.utils import logger
+# Page-level logger; note this rebinds the name `logger` from the imported module to the logger instance.
+logger = logger.get_logger()
+streamlit_function.config_homepage()
+st.title("Sustainability Report Analyzer")
+st.write("Upload your sustainability report PDF and generate insights using Gemini models.")
+# upload_file only returns the files on the run where its "Submit" button is clicked; otherwise it returns None.
+uploaded_files = streamlit_function.upload_file("pdf", label="📤 Upload Sustainability Report PDF")
+# Keep the submitted files in session state so they survive subsequent Streamlit reruns.
+if uploaded_files:
+    st.session_state.uploaded_files = uploaded_files
+if "uploaded_files" not in st.session_state:
+    st.session_state.uploaded_files = []
+if st.session_state.uploaded_files:
+    # NOTE(review): `columns` is not used by any active code after this line; the
+    # extraction UI that consumed it appears to be commented out — confirm before removing.
+    columns = st.columns(1)
+# # import streamlit as st
+# # from application.schemas.response_schema import GEMINI_GHG_PARAMETERS, GEMINI_ENVIRONMENTAL_PARAMETERS_CSRD,GEMINI_ENVIRONMENT_PARAMETERS,GEMINI_SOCIAL_PARAMETERS, GEMINI_GOVERNANCE_PARAMETERS, GEMINI_MATERIALITY_PARAMETERS, GEMINI_NET_ZERO_INTERVENTION_PARAMETERS
+# # from application.services import streamlit_function, gemini_model
+# # from application.utils import logger
+# # import test
+# # logger = logger.get_logger()
+# # streamlit_function.config_homepage()
+# # st.title("Sustainability Report Analyzer")
+# # st.write("Upload your sustainability report PDF and generate insights using different models.")
+# # MODEL = ["gemini-1.5-pro-latest", "gemini-2.0-flash", "gemini-1.5-flash", "gemini-2.5-pro-exp-03-25"]
+# # MODEL_1 = "gemini-1.5-pro-latest"
+# # MODEL_2 = "gemini-2.0-flash"
+# # MODEL_3 = "gemini-1.5-flash"
+# # API_1 = "gemini"
+# # API_2 = "gemini"
+# # API_3 = "gemini"
+# # response_schema = [ GEMINI_GHG_PARAMETERS, GEMINI_ENVIRONMENTAL_PARAMETERS_CSRD,
+# #                     GEMINI_ENVIRONMENT_PARAMETERS,GEMINI_SOCIAL_PARAMETERS,
+# #                     GEMINI_GOVERNANCE_PARAMETERS, GEMINI_MATERIALITY_PARAMETERS,
+# #                     GEMINI_NET_ZERO_INTERVENTION_PARAMETERS]
+# # if "uploaded_files" not in st.session_state:
+# #     st.session_state.uploaded_files = []
+# # MODEL = st.selectbox(
+# #     "Select Model",
+# #     options=MODEL,
+# #     index=0,
+# # )
+# # uploaded_files = streamlit_function.upload_file("pdf", label="Upload Sustainability Report PDF")
+# # if uploaded_files:
+# #     st.session_state.uploaded_files = uploaded_files
+# # if st.session_state.uploaded_files:
+# #     columns = st.columns([5, 5, 5], gap="small")
+# #     for i, col in enumerate(columns):
+# #         if i < len(st.session_state.uploaded_files):
+# #             pdf_file = st.session_state.uploaded_files[i]
+# #             file_name = pdf_file.name.removesuffix(".pdf")
+# #             result_key = f"{MODEL}_result_file_{i+1}"
+# #             with col:
+# #                 st.write(f"**File {i+1}:** `{pdf_file.name}`")
+# #                 if st.button(f"Extract Data from File {i+1}", key=f"extract_btn_{i}"):
+# #                     with st.spinner(f"Extracting data from File {i+1} using {MODEL}..."):
+# #                         for schema in response_schema:
+# #                             result = gemini_model.extract_emissions_data_as_json(API_1, MODEL, pdf_file, schema)
+# #                             if schema == GEMINI_GHG_PARAMETERS:
+# #                                 column = "Greenhouse Gas (GHG) Protocol Parameters"
+# #                             elif schema == GEMINI_ENVIRONMENTAL_PARAMETERS_CSRD:
+# #                                 column = "Environmental Parameters (CSRD)"
+# #                             elif schema == GEMINI_ENVIRONMENT_PARAMETERS:
+# #                                 column = "Environmental Parameters"
+# #                             elif schema == GEMINI_SOCIAL_PARAMETERS:
+# #                                 column = "Social Parameters"
+# #                             elif schema == GEMINI_GOVERNANCE_PARAMETERS:
+# #                                 column = "Governance Parameters"
+# #                             elif schema == GEMINI_MATERIALITY_PARAMETERS:
+# #                                 column = "Materiality Parameters"
+# #                             elif schema == GEMINI_NET_ZERO_INTERVENTION_PARAMETERS:
+# #                                 column = "Net Zero Intervention Parameters"
+# #                             else:
+# #                                 column = None
+# #                             test.export_results_to_excel(result, sheet_name=MODEL, filename=file_name, column=column )
+# #                             st.session_state[result_key] = result
+# #                 if st.session_state.get(result_key):
+# #                     st.write(f"**Extracted Metrics for File {i+1}:**")
+# #                     st.json(st.session_state[result_key])

pages/database.py ADDED Viewed

	@@ -0,0 +1,92 @@

+import streamlit as st
+import pandas as pd
+from src.utils import streamlit_function
+from src.utils.logger import get_logger
+from src.services.mongo_db_service import retrieve_documents
+from src.utils.common_functions import prepare_comparison_df
+logger = get_logger()
+streamlit_function.config_homepage()
+st.title("📊 ESG Report Comparison Dashboard")
+# Maps each dashboard tab (category label) to the metric keys rendered inside it.
+# Each metric key is passed unchanged to prepare_comparison_df, which looks it up
+# in the selected year's report.
+METRIC_OPTIONS = {
+    "Report Metadata": ["report_metadata"],
+    "Environmental Parameters": [
+        "Emissions", "Energy Consumption", "Water Withdrawal", "Water Discharge",
+        "Waste Generation", "Waste Disposal", "Waste Recovery"
+    ],
+    "Social Parameters": [
+        "Human Rights Training Coverage", "LTIFR", "Other Safety Incidents",
+        "Health & Safety Training Coverage", "Grievances Reported",
+        "Third-party Assessment Coverage", "CSR Beneficiaries", "Female Wage Share",
+        "Wages by Location", "Well-being Cost", "Worker Well-being Coverage",
+        "Employee Well-being Coverage", "Turnover Count", "Workforce Gender Diversity"
+    ],
+    "Governance Parameters": [
+        "Non-compliance Instances", "Disciplinary Actions", "Consumer Complaints",
+        "Customer Data Breaches", "Governance Diversity", "Purchase Concentration",
+        "Sales Concentration", "Related Party Transactions"
+    ],
+    "Materiality": ["material_topics"]
+}
+ESG_EXTRACTOR_COLLECTION = "esg_report_extracts"
+# Loads every document in the collection on each script run (no query filter is passed).
+company_docs = retrieve_documents(collection_name=ESG_EXTRACTOR_COLLECTION)
+# The document _id values are what the user picks from in the multiselect below.
+# NOTE(review): presumably _id holds a company identifier/name — verify against the stored data.
+available_company_data = [doc["_id"] for doc in company_docs if "_id" in doc]
+selected_companies = st.multiselect(
+    "Select up to 3 companies",
+    options=available_company_data,
+    max_selections=3
+)
+def get_all_years(docs) -> list:
+    """
+    Collects every report year that appears in the given company documents.
+    Args:
+        docs: Iterable of document dicts; only documents whose "esg_reports" value is a dict contribute.
+    Returns:
+        list: The unique "esg_reports" keys, sorted in descending order.
+    """
+    report_maps = (
+        doc["esg_reports"]
+        for doc in docs
+        if "esg_reports" in doc and isinstance(doc["esg_reports"], dict)
+    )
+    unique_years = {year for reports in report_maps for year in reports}
+    return sorted(unique_years, reverse=True)
+def highlight_missing_values(df):
+    """
+    Builds a styled view of `df` that colours each cell by whether it holds a value.
+    Cells that are NaN, or whose stripped string form is one of the placeholder
+    markers below, get a light red background; every other cell gets light green.
+    Args:
+        df (pd.DataFrame): The comparison table to style.
+    Returns:
+        The pandas Styler produced by `df.style.map(...)`.
+    """
+    missing_markers = ["", "nan", "None", "Not Available","N/A"]
+    def cell_style(v):
+        if pd.isna(v) or str(v).strip() in missing_markers:
+            return "background-color: #ffe6e6"
+        return "background-color: #e6ffe6"
+    return df.style.map(cell_style)
+def extract_company_name_from_doc(doc, default_name):
+    """
+    Reads the company's legal name from a document's report metadata.
+    Args:
+        doc (dict): Company document; the name is read from doc["report_metadata"]["company_legal_name"].
+        default_name: Value returned when either key is absent.
+    Returns:
+        The stored legal name, or `default_name` if it is not present.
+    """
+    metadata = doc.get("report_metadata", {})
+    return metadata.get("company_legal_name", default_name)
+# Comparison UI: requires at least one company and a concrete year before rendering any tables.
+if not selected_companies:
+    st.info("Please select at least one company to continue.")
+else:
+    # Year options are gathered from all loaded documents, not only the selected companies.
+    all_years = get_all_years(company_docs)
+    selected_year = st.selectbox(
+        "Select a report year (applies to all selected companies)",
+        options=["-- Select Year --"] + all_years,
+        key="common_year"
+    )
+    if selected_year == "-- Select Year --":
+        st.info("Please select a year to view report comparisons.")
+    else:
+        # One tab per metric category, in METRIC_OPTIONS order.
+        metric_categories = list(METRIC_OPTIONS.keys())
+        tabs = st.tabs(metric_categories)
+        for tab, category in zip(tabs, metric_categories):
+            with tab:
+                st.subheader(category)
+                for metric in METRIC_OPTIONS[category]:
+                    st.markdown(f"### {metric}")
+                    comparison_df = prepare_comparison_df(
+                        selected_companies,
+                        selected_year,
+                        metric,
+                        company_docs
+                    )
+                    # prepare_comparison_df returns None when no company has data for this metric.
+                    if comparison_df is None:
+                        st.warning(f"No data found for **{metric}** in {selected_year}")
+                    else:
+                        st.dataframe(highlight_missing_values(comparison_df), use_container_width=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+streamlit
+pymongo
+openpyxl
+dotenv
+unidecode

src/services/__pycache__/mongo_db_service.cpython-313.pyc ADDED Viewed

Binary file (5.83 kB). View file

src/services/mongo_db_service.py ADDED Viewed

	@@ -0,0 +1,139 @@

+import os
+from typing import List, Dict, Optional, Union
+from dotenv import load_dotenv
+from pymongo.errors import ConnectionFailure
+from pymongo import MongoClient, errors
+from bson import ObjectId
+from src.utils.logger import get_logger
+logger = get_logger()
+# Load environment variables from a .env file, if one is present, before reading the settings below.
+load_dotenv()
+MONGODB_URI = os.getenv("MONGODB_URI")
+MONGODB_DB_NAME = os.getenv("MONGODB_DB_NAME")
+# Module-level client and database handle, created at import time.
+# `db` is what retrieve_document_by_id uses; retrieve_documents obtains its client via get_mongo_client().
+# NOTE(review): os.getenv returns None when MONGODB_DB_NAME is unset, in which case
+# client[MONGODB_DB_NAME] presumably fails at import — confirm and consider validating.
+client = MongoClient(MONGODB_URI)
+db = client[MONGODB_DB_NAME]
+ESG_REPORT_EXTRACTS_COLLECTION = "esg_report_extracts"
+# Lazily created client reused by every get_mongo_client() call in this process.
+_mongo_client: Optional[MongoClient] = None
+def get_mongo_client() -> Optional[MongoClient]:
+    """
+    Returns a verified MongoDB client built from the MONGODB_URI environment variable.
+    The client is created on first use, checked with a `ping` command, and then
+    reused on later calls instead of opening a new client (and connection pool)
+    for every query.
+    Returns:
+        Optional[MongoClient]: The client, or None if the connection could not be
+        established; the failure is logged and a later call will retry.
+    """
+    global _mongo_client
+    if _mongo_client is not None:
+        return _mongo_client
+    candidate = None
+    try:
+        candidate = MongoClient(os.getenv("MONGODB_URI"))
+        # MongoClient() does not connect eagerly, so force a round trip here;
+        # without it ConnectionFailure can never be raised inside this try block.
+        candidate.admin.command("ping")
+    except ConnectionFailure:
+        logger.error("MongoDB connection failed. Please check MONGODB_URI.")
+    except Exception as e:
+        logger.exception(f"Unexpected error while connecting to MongoDB: {str(e)}")
+    else:
+        _mongo_client = candidate
+        return candidate
+    # Connection could not be verified: release the unusable client before reporting failure.
+    if candidate is not None:
+        candidate.close()
+    return None
+def retrieve_documents(
+    collection_name: str,
+    query: Optional[Dict] = None,
+    only_ids: bool = False,
+    single: bool = False,
+    company_legal_name: Optional[str] = None,
+    reporting_year: Optional[int] = None
+) -> Union[List[Dict], Dict, None]:
+    """
+    Retrieves documents from a specified MongoDB collection with optional filtering.
+    Args:
+        collection_name (str): MongoDB collection name.
+        query (Optional[Dict]): MongoDB query filter. It is copied, never modified in place.
+        only_ids (bool): If True, return only _id field for all documents.
+        single (bool): If True, return only a single matching document.
+        company_legal_name (Optional[str]): Adds a filter on "report_metadata.company_legal_name".
+        reporting_year (Optional[int]): Adds a filter on "esg_report.year".
+    Returns:
+        Union[List[Dict], Dict, None]: A list of documents (empty on error), or, when
+        `single` is True, one document or None (also None on error).
+    """
+    try:
+        client = get_mongo_client()
+        if client is None:
+            logger.error("MongoDB client is not available.")
+            return [] if not single else None
+        db = client[MONGODB_DB_NAME]
+        collection = db[collection_name]
+        # Work on a shallow copy so the extra filters added below never leak
+        # into the dict the caller passed in.
+        mongo_query = dict(query) if query else {}
+        if company_legal_name:
+            mongo_query["report_metadata.company_legal_name"] = company_legal_name
+        if reporting_year is not None:
+            # NOTE(review): other code in this project reads reports from an
+            # "esg_reports" mapping keyed by year — confirm "esg_report.year" matches the stored schema.
+            mongo_query["esg_report.year"] = reporting_year
+        projection = {"_id": 1} if only_ids else None
+        if single:
+            result = collection.find_one(mongo_query, projection)
+            logger.info(f"Retrieved single document from {collection_name} for query: {mongo_query}")
+            return result
+        documents = list(collection.find(mongo_query, projection))
+        logger.info(f"Retrieved {len(documents)} documents from collection: {collection_name}")
+        return documents
+    except Exception as e:
+        # Best-effort read: log the failure and hand back an empty result rather than raising.
+        logger.exception(f"An error occurred while retrieving documents: {str(e)}")
+        return [] if not single else None
+def retrieve_document_by_id(collection_name: str, document_id, convert_to_object_id: bool = False):
+    """
+    Fetches one document from a MongoDB collection by its _id.
+    Args:
+        collection_name (str): Name of the collection to search; must be a non-empty string.
+        document_id (str or ObjectId): The _id value to look up; must not be None.
+        convert_to_object_id (bool): When True, `document_id` is converted to an ObjectId before the lookup.
+    Returns:
+        dict or None: The matching document, or None when nothing matches.
+    Raises:
+        ValueError: If the collection name or document_id is invalid, or the ObjectId conversion fails.
+        Exception: Database and other unexpected errors are logged and re-raised.
+    """
+    if not (collection_name and isinstance(collection_name, str)):
+        raise ValueError("Invalid collection name.")
+    if document_id is None:
+        raise ValueError("document_id must not be None.")
+    try:
+        target_collection = db[collection_name]
+        lookup_id = document_id
+        if convert_to_object_id:
+            try:
+                lookup_id = ObjectId(document_id)
+            except Exception as e:
+                raise ValueError(f"Invalid ObjectId format: {document_id}") from e
+        found = target_collection.find_one({"_id": lookup_id})
+        if not found:
+            logger.error(f"No document found with _id: {lookup_id}")
+            return None
+        logger.info(f"Document found with _id: {lookup_id}")
+        return found
+    except errors.PyMongoError as e:
+        logger.error(f"Database error while retrieving document: {e}")
+        raise
+    except Exception as ex:
+        # Also reached by the ValueError raised for a bad ObjectId above; it is logged, then re-raised.
+        logger.error(f"Unexpected error: {ex}")
+        raise
+# all_docs = retrieve_documents(collection_name=ESG_REPORT_EXTRACTS_COLLECTION)
+# print(all_docs[0]["_id"])
+# collection = list_collections()
+# print(collection)

src/utils/__pycache__/common_functions.cpython-313.pyc ADDED Viewed

Binary file (3.92 kB). View file

src/utils/__pycache__/gics_schema.cpython-313.pyc ADDED Viewed

Binary file (14.7 kB). View file

src/utils/__pycache__/logger.cpython-313.pyc ADDED Viewed

Binary file (1.85 kB). View file

src/utils/__pycache__/streamlit_function.cpython-313.pyc ADDED Viewed

Binary file (8.48 kB). View file

src/utils/__pycache__/system_prompts.cpython-313.pyc ADDED Viewed

Binary file (20.5 kB). View file

src/utils/common_functions.py ADDED Viewed

	@@ -0,0 +1,61 @@

+import re
+import pandas as pd
+def _extract_final_value(val):
+    """Returns "<numeric_value> <measurement_unit>" for a value dict, or None if it has no numeric value."""
+    if not isinstance(val, dict):
+        return None
+    numeric = val.get("numeric_value")
+    if numeric is None:
+        return None
+    unit = val.get("measurement_unit")
+    return f"{numeric} {unit}".strip() if unit else str(numeric)
+def _flatten_metric(data, parent_key=""):
+    """Flattens nested metric data into {"Parent - Child": display string} pairs."""
+    flat = {}
+    if isinstance(data, dict):
+        for key, val in data.items():
+            label = key.replace('_', ' ').title()
+            full_key = f"{parent_key} - {label}" if parent_key else label
+            if isinstance(val, dict):
+                extracted = _extract_final_value(val)
+                if extracted is not None:
+                    flat[full_key] = extracted
+                else:
+                    # No numeric value at this level: descend and prefix child keys with this one.
+                    flat.update(_flatten_metric(val, full_key))
+            else:
+                flat[full_key] = str(val) if val is not None else "Not Available"
+    elif parent_key:
+        flat[parent_key] = str(data) if data is not None else "Not Available"
+    return flat
+def prepare_comparison_df(selected_companies, selected_year, metric_key, company_docs):
+    """
+    Prepares a wide-format comparison DataFrame for the selected companies and metric.
+    Args:
+        selected_companies: Document _id values of the companies to compare.
+        selected_year: Key of the report inside each document's "esg_reports" mapping.
+        metric_key: Key of the metric inside the selected year's report.
+        company_docs: Company documents (dicts) to search.
+    Returns:
+        A DataFrame indexed by flattened metric name ("Metric") with one column per
+        company that has data, missing cells filled with "Not Available"; or None
+        when no selected company has data for the metric.
+    """
+    rows = {}
+    for company_id in selected_companies:
+        # .get() so a document without an "_id" is skipped instead of raising KeyError.
+        doc = next((d for d in company_docs if d.get("_id") == company_id), None)
+        if not doc or not isinstance(doc.get("esg_reports"), dict):
+            continue
+        report = doc["esg_reports"].get(selected_year, {})
+        if not isinstance(report, dict):
+            continue
+        # NOTE(review): metric data that is not a dict at the top level (e.g. a list)
+        # flattens to nothing and the metric is reported as missing — confirm this is intended.
+        metric_data = report.get(metric_key, {})
+        for key, val in _flatten_metric(metric_data).items():
+            rows.setdefault(key, {})[company_id] = val
+    if not rows:
+        return None
+    # rows is {metric: {company: value}}; transpose so metrics become the index.
+    df = pd.DataFrame(rows).T
+    df.index.name = "Metric"
+    df = df.fillna("Not Available")
+    return df

src/utils/logger.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import os
+import logging
+from logging.handlers import RotatingFileHandler
+# Log file name, directory and level used by get_logger().
+log_file = 'sustainability_report_extractor.log'
+log_dir = 'logs/app'
+log_level=logging.INFO
+def get_logger():
+    """
+    Returns the application logger, attaching its handlers on first use.
+    The logger writes to the console and to a rotating file at
+    `log_dir/log_file` (5 MB per file, 3 backups). Repeated calls return the
+    same logger without adding duplicate handlers.
+    Returns:
+        logging.Logger: The configured logger.
+    """
+    # exist_ok avoids a failure if the directory is created between a check and the call.
+    os.makedirs(log_dir, exist_ok=True)
+    log_file_path = os.path.join(log_dir, log_file)
+    logger = logging.getLogger(__name__)
+    # Check this logger's own handlers only: hasHandlers() also looks at parent
+    # loggers, so a handler on the root logger would stop this one from ever being configured.
+    if not logger.handlers:
+        logger.setLevel(log_level)
+        console_handler = logging.StreamHandler()
+        console_handler.setLevel(logging.DEBUG)
+        file_handler = RotatingFileHandler(log_file_path, maxBytes=5*1024*1024, backupCount=3)
+        file_handler.setLevel(logging.INFO)
+        log_format = '%(asctime)s - %(levelname)s - %(message)s'
+        formatter = logging.Formatter(log_format, datefmt='%Y-%m-%d %H:%M')
+        console_handler.setFormatter(formatter)
+        file_handler.setFormatter(formatter)
+        logger.addHandler(console_handler)
+        logger.addHandler(file_handler)
+    return logger

src/utils/streamlit_function.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import streamlit as st
+from typing import Union, List
+from src.utils import logger
+logger = logger.get_logger()
+PAGE_TITLE = "PDF Extractor"
+PAGE_LAYOUT = "wide"
+def config_homepage(page_title=PAGE_TITLE):
+    """
+    Applies the Streamlit page configuration for the app.
+    Sets the browser-tab title, the page layout (PAGE_LAYOUT) and a collapsed
+    initial sidebar state. No page icon or custom menu items are configured here.
+    Args:
+        page_title (str): The title displayed on the browser tab (default is PAGE_TITLE).
+    Notes:
+    - The `page_config_set` check below is currently always true, because the line
+      that would store the flag in session state is commented out; as written,
+      `st.set_page_config()` is invoked on every call.
+    Example:
+        >>> config_homepage("My Custom App")
+    """
+    if "page_config_set" not in st.session_state:
+        st.set_page_config(
+            page_title=page_title,
+            layout=PAGE_LAYOUT,
+            initial_sidebar_state="collapsed",
+        )
+        # Disabled: storing the flag would skip set_page_config on later calls in the same session.
+        # NOTE(review): confirm whether the guard is still wanted; if not, the `if` can be dropped.
+        # st.session_state.page_config_set = True
+def upload_file(
+    file_types: Union[str, List[str]] = "pdf",
+    label: str = "📤 Upload a file",
+    help_text: str = "Upload your file for processing.",
+    allow_multiple: bool = True,
+):
+    """
+    Renders a file uploader followed by a "Submit" button.
+    Args:
+        file_types (str or list): Allowed file type(s), e.g., "pdf" or ["pdf", "docx"].
+        label (str): Label displayed above the uploader.
+        help_text (str): Tooltip help text.
+        allow_multiple (bool): Allow multiple file uploads.
+    Returns:
+        The uploader's current value on the run in which "Submit" is clicked
+        (also stored in `st.session_state.pdf_file`); None on every other run.
+    """
+    accepted_types = [file_types] if isinstance(file_types, str) else file_types
+    selection = st.file_uploader(
+        label=label,
+        type=accepted_types,
+        help=help_text,
+        accept_multiple_files=allow_multiple
+    )
+    # Nothing is handed back until the user confirms with the button.
+    if not st.button("Submit"):
+        return None
+    st.session_state.pdf_file = selection
+    return selection