Vela committed on
Commit
2692728
·
1 Parent(s): 5d4ad83

removed extraction tool

Browse files
.gitignore CHANGED
@@ -1,3 +1,4 @@
1
  .venv
2
  logs
3
- .env
 
 
1
  .venv
2
  logs
3
+ .env
4
+ src/utils/__pycache__/
app.py CHANGED
@@ -1,125 +1,92 @@
1
  import streamlit as st
2
  import pandas as pd
3
- import os
4
  from src.utils import streamlit_function
5
- from src.utils import logger
 
 
6
 
7
- logger = logger.get_logger()
8
  streamlit_function.config_homepage()
9
 
10
- st.title("Sustainability Report Analyzer")
11
- st.write("Upload your sustainability report PDF and generate insights using Gemini models.")
12
-
13
- uploaded_files = streamlit_function.upload_file("pdf", label="📤 Upload Sustainability Report PDF")
14
- if uploaded_files:
15
- st.session_state.uploaded_files = uploaded_files
16
-
17
- if "uploaded_files" not in st.session_state:
18
- st.session_state.uploaded_files = []
19
-
20
- if st.session_state.uploaded_files:
21
- columns = st.columns(1)
22
-
23
-
24
-
25
-
26
-
27
-
28
-
29
-
30
-
31
-
32
-
33
-
34
-
35
-
36
-
37
-
38
-
39
-
40
-
41
-
42
-
43
-
44
-
45
-
46
-
47
-
48
- # # import streamlit as st
49
- # # from application.schemas.response_schema import GEMINI_GHG_PARAMETERS, GEMINI_ENVIRONMENTAL_PARAMETERS_CSRD,GEMINI_ENVIRONMENT_PARAMETERS,GEMINI_SOCIAL_PARAMETERS, GEMINI_GOVERNANCE_PARAMETERS, GEMINI_MATERIALITY_PARAMETERS, GEMINI_NET_ZERO_INTERVENTION_PARAMETERS
50
- # # from application.services import streamlit_function, gemini_model
51
- # # from application.utils import logger
52
- # # import test
53
-
54
- # # logger = logger.get_logger()
55
- # # streamlit_function.config_homepage()
56
- # # st.title("Sustainability Report Analyzer")
57
- # # st.write("Upload your sustainability report PDF and generate insights using different models.")
58
-
59
- # # MODEL = ["gemini-1.5-pro-latest", "gemini-2.0-flash", "gemini-1.5-flash", "gemini-2.5-pro-exp-03-25"]
60
-
61
- # # MODEL_1 = "gemini-1.5-pro-latest"
62
- # # MODEL_2 = "gemini-2.0-flash"
63
- # # MODEL_3 = "gemini-1.5-flash"
64
-
65
- # # API_1 = "gemini"
66
- # # API_2 = "gemini"
67
- # # API_3 = "gemini"
68
-
69
- # # response_schema = [ GEMINI_GHG_PARAMETERS, GEMINI_ENVIRONMENTAL_PARAMETERS_CSRD,
70
- # # GEMINI_ENVIRONMENT_PARAMETERS,GEMINI_SOCIAL_PARAMETERS,
71
- # # GEMINI_GOVERNANCE_PARAMETERS, GEMINI_MATERIALITY_PARAMETERS,
72
- # # GEMINI_NET_ZERO_INTERVENTION_PARAMETERS]
73
-
74
- # # if "uploaded_files" not in st.session_state:
75
- # # st.session_state.uploaded_files = []
76
-
77
- # # MODEL = st.selectbox(
78
- # # "Select Model",
79
- # # options=MODEL,
80
- # # index=0,
81
- # # )
82
-
83
- # # uploaded_files = streamlit_function.upload_file("pdf", label="Upload Sustainability Report PDF")
84
-
85
- # # if uploaded_files:
86
- # # st.session_state.uploaded_files = uploaded_files
87
-
88
- # # if st.session_state.uploaded_files:
89
- # # columns = st.columns([5, 5, 5], gap="small")
90
-
91
- # # for i, col in enumerate(columns):
92
- # # if i < len(st.session_state.uploaded_files):
93
- # # pdf_file = st.session_state.uploaded_files[i]
94
- # # file_name = pdf_file.name.removesuffix(".pdf")
95
- # # result_key = f"{MODEL}_result_file_{i+1}"
96
-
97
- # # with col:
98
- # # st.write(f"**File {i+1}:** `{pdf_file.name}`")
99
- # # if st.button(f"Extract Data from File {i+1}", key=f"extract_btn_{i}"):
100
- # # with st.spinner(f"Extracting data from File {i+1} using {MODEL}..."):
101
- # # for schema in response_schema:
102
- # # result = gemini_model.extract_emissions_data_as_json(API_1, MODEL, pdf_file, schema)
103
- # # if schema == GEMINI_GHG_PARAMETERS:
104
- # # column = "Greenhouse Gas (GHG) Protocol Parameters"
105
- # # elif schema == GEMINI_ENVIRONMENTAL_PARAMETERS_CSRD:
106
- # # column = "Environmental Parameters (CSRD)"
107
- # # elif schema == GEMINI_ENVIRONMENT_PARAMETERS:
108
- # # column = "Environmental Parameters"
109
- # # elif schema == GEMINI_SOCIAL_PARAMETERS:
110
- # # column = "Social Parameters"
111
- # # elif schema == GEMINI_GOVERNANCE_PARAMETERS:
112
- # # column = "Governance Parameters"
113
- # # elif schema == GEMINI_MATERIALITY_PARAMETERS:
114
- # # column = "Materiality Parameters"
115
- # # elif schema == GEMINI_NET_ZERO_INTERVENTION_PARAMETERS:
116
- # # column = "Net Zero Intervention Parameters"
117
- # # else:
118
- # # column = None
119
-
120
- # # test.export_results_to_excel(result, sheet_name=MODEL, filename=file_name, column=column )
121
- # # st.session_state[result_key] = result
122
-
123
- # # if st.session_state.get(result_key):
124
- # # st.write(f"**Extracted Metrics for File {i+1}:**")
125
- # # st.json(st.session_state[result_key])
 
1
  import streamlit as st
2
  import pandas as pd
3
+
4
  from src.utils import streamlit_function
5
+ from src.utils.logger import get_logger
6
+ from src.services.mongo_db_service import retrieve_documents
7
+ from src.utils.common_functions import prepare_comparison_df
8
 
9
+ logger = get_logger()
10
  streamlit_function.config_homepage()
11
 
12
+ st.title("📊 ESG Report Comparison Dashboard")
13
+
14
+ METRIC_OPTIONS = {
15
+ "Report Metadata": ["report_metadata"],
16
+ "Environmental Parameters": [
17
+ "Emissions", "Energy Consumption", "Water Withdrawal", "Water Discharge",
18
+ "Waste Generation", "Waste Disposal", "Waste Recovery"
19
+ ],
20
+ "Social Parameters": [
21
+ "Human Rights Training Coverage", "LTIFR", "Other Safety Incidents",
22
+ "Health & Safety Training Coverage", "Grievances Reported",
23
+ "Third-party Assessment Coverage", "CSR Beneficiaries", "Female Wage Share",
24
+ "Wages by Location", "Well-being Cost", "Worker Well-being Coverage",
25
+ "Employee Well-being Coverage", "Turnover Count", "Workforce Gender Diversity"
26
+ ],
27
+ "Governance Parameters": [
28
+ "Non-compliance Instances", "Disciplinary Actions", "Consumer Complaints",
29
+ "Customer Data Breaches", "Governance Diversity", "Purchase Concentration",
30
+ "Sales Concentration", "Related Party Transactions"
31
+ ],
32
+ "Materiality": ["material_topics"]
33
+ }
34
+
35
+ ESG_EXTRACTOR_COLLECTION = "esg_report_extracts"
36
+
37
+ company_docs = retrieve_documents(collection_name=ESG_EXTRACTOR_COLLECTION)
38
+ available_company_data = [doc["_id"] for doc in company_docs if "_id" in doc]
39
+
40
+ selected_companies = st.multiselect(
41
+ "Select up to 3 companies",
42
+ options=available_company_data,
43
+ max_selections=3
44
+ )
45
+
46
+ def get_all_years(docs) -> list:
47
+ years = set()
48
+ for doc in docs:
49
+ if "esg_reports" in doc and isinstance(doc["esg_reports"], dict):
50
+ years.update(doc["esg_reports"].keys())
51
+ return sorted(years, reverse=True)
52
+
53
+ def highlight_missing_values(df):
54
+ return df.style.map(lambda v: "background-color: #ffe6e6" if pd.isna(v) or str(v).strip() in ["", "nan", "None", "Not Available","N/A"] else "background-color: #e6ffe6")
55
+
56
+ def extract_company_name_from_doc(doc, default_name):
57
+ return doc.get("report_metadata", {}).get("company_legal_name", default_name)
58
+
59
+ if selected_companies:
60
+ all_years = get_all_years(company_docs)
61
+
62
+ selected_year = st.selectbox(
63
+ "Select a report year (applies to all selected companies)",
64
+ options=["-- Select Year --"] + all_years,
65
+ key="common_year"
66
+ )
67
+
68
+ if selected_year != "-- Select Year --":
69
+ tabs = st.tabs(list(METRIC_OPTIONS.keys()))
70
+ metric_categories = list(METRIC_OPTIONS.keys())
71
+ for i, tab in enumerate(tabs):
72
+ with tab:
73
+ st.subheader(metric_categories[i])
74
+ metric_keys = METRIC_OPTIONS[metric_categories[i]]
75
+ for metric in metric_keys:
76
+ st.markdown(f"### {metric}")
77
+
78
+ comparison_df = prepare_comparison_df(
79
+ selected_companies,
80
+ selected_year,
81
+ metric,
82
+ company_docs
83
+ )
84
+
85
+ if comparison_df is not None:
86
+ st.dataframe(highlight_missing_values(comparison_df), use_container_width=True)
87
+ else:
88
+ st.warning(f"No data found for **{metric}** in {selected_year}")
89
+ else:
90
+ st.info("Please select a year to view report comparisons.")
91
+ else:
92
+ st.info("Please select at least one company to continue.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pages/database.py DELETED
@@ -1,92 +0,0 @@
1
- import streamlit as st
2
- import pandas as pd
3
-
4
- from src.utils import streamlit_function
5
- from src.utils.logger import get_logger
6
- from src.services.mongo_db_service import retrieve_documents
7
- from src.utils.common_functions import prepare_comparison_df
8
-
9
- logger = get_logger()
10
- streamlit_function.config_homepage()
11
-
12
- st.title("📊 ESG Report Comparison Dashboard")
13
-
14
- METRIC_OPTIONS = {
15
- "Report Metadata": ["report_metadata"],
16
- "Environmental Parameters": [
17
- "Emissions", "Energy Consumption", "Water Withdrawal", "Water Discharge",
18
- "Waste Generation", "Waste Disposal", "Waste Recovery"
19
- ],
20
- "Social Parameters": [
21
- "Human Rights Training Coverage", "LTIFR", "Other Safety Incidents",
22
- "Health & Safety Training Coverage", "Grievances Reported",
23
- "Third-party Assessment Coverage", "CSR Beneficiaries", "Female Wage Share",
24
- "Wages by Location", "Well-being Cost", "Worker Well-being Coverage",
25
- "Employee Well-being Coverage", "Turnover Count", "Workforce Gender Diversity"
26
- ],
27
- "Governance Parameters": [
28
- "Non-compliance Instances", "Disciplinary Actions", "Consumer Complaints",
29
- "Customer Data Breaches", "Governance Diversity", "Purchase Concentration",
30
- "Sales Concentration", "Related Party Transactions"
31
- ],
32
- "Materiality": ["material_topics"]
33
- }
34
-
35
- ESG_EXTRACTOR_COLLECTION = "esg_report_extracts"
36
-
37
- company_docs = retrieve_documents(collection_name=ESG_EXTRACTOR_COLLECTION)
38
- available_company_data = [doc["_id"] for doc in company_docs if "_id" in doc]
39
-
40
- selected_companies = st.multiselect(
41
- "Select up to 3 companies",
42
- options=available_company_data,
43
- max_selections=3
44
- )
45
-
46
- def get_all_years(docs) -> list:
47
- years = set()
48
- for doc in docs:
49
- if "esg_reports" in doc and isinstance(doc["esg_reports"], dict):
50
- years.update(doc["esg_reports"].keys())
51
- return sorted(years, reverse=True)
52
-
53
- def highlight_missing_values(df):
54
- return df.style.map(lambda v: "background-color: #ffe6e6" if pd.isna(v) or str(v).strip() in ["", "nan", "None", "Not Available","N/A"] else "background-color: #e6ffe6")
55
-
56
- def extract_company_name_from_doc(doc, default_name):
57
- return doc.get("report_metadata", {}).get("company_legal_name", default_name)
58
-
59
- if selected_companies:
60
- all_years = get_all_years(company_docs)
61
-
62
- selected_year = st.selectbox(
63
- "Select a report year (applies to all selected companies)",
64
- options=["-- Select Year --"] + all_years,
65
- key="common_year"
66
- )
67
-
68
- if selected_year != "-- Select Year --":
69
- tabs = st.tabs(list(METRIC_OPTIONS.keys()))
70
- metric_categories = list(METRIC_OPTIONS.keys())
71
- for i, tab in enumerate(tabs):
72
- with tab:
73
- st.subheader(metric_categories[i])
74
- metric_keys = METRIC_OPTIONS[metric_categories[i]]
75
- for metric in metric_keys:
76
- st.markdown(f"### {metric}")
77
-
78
- comparison_df = prepare_comparison_df(
79
- selected_companies,
80
- selected_year,
81
- metric,
82
- company_docs
83
- )
84
-
85
- if comparison_df is not None:
86
- st.dataframe(highlight_missing_values(comparison_df), use_container_width=True)
87
- else:
88
- st.warning(f"No data found for **{metric}** in {selected_year}")
89
- else:
90
- st.info("Please select a year to view report comparisons.")
91
- else:
92
- st.info("Please select at least one company to continue.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/utils/__pycache__/common_functions.cpython-313.pyc CHANGED
Binary files a/src/utils/__pycache__/common_functions.cpython-313.pyc and b/src/utils/__pycache__/common_functions.cpython-313.pyc differ
 
src/utils/__pycache__/streamlit_function.cpython-313.pyc CHANGED
Binary files a/src/utils/__pycache__/streamlit_function.cpython-313.pyc and b/src/utils/__pycache__/streamlit_function.cpython-313.pyc differ