Spaces:

VelaTest
/

PDFExtractor

Sleeping

App Files Files Community

Vela committed on Apr 13

Commit

d1ca23a

1 Parent(s): dab58f3

Modify Gemini service module; add file handling for API calls

Browse files

Files changed (6) hide show

app.py +52 -34
application/schemas/response_schema.py +63 -1
application/services/gemini_model.py +138 -258
application/services/streamlit_function.py +50 -12
application/services/supabase_service.py +20 -0
requirements.txt +3 -3

app.py CHANGED Viewed

@@ -1,60 +1,78 @@
-from application.services import streamlit_function, llm_service
-from application.services import gemini_model, openai_model
 import streamlit as st
 from google.genai.errors import ClientError
 from application.utils import logger
-import test
 logger = logger.get_logger()
-streamlit_function.config_homepage()
-pdf_file = streamlit_function.upload_file("pdf", label="Upload Sustainability Report PDF")
-available_files = ["Select a pdf file"]
-for file in llm_service.get_files():
-    available_files.append(file.filename)
-selected_file = st.selectbox("Select a existing file", available_files)
-for key in ["gpt4o_mini_result", "gpt4o_result", "gemini_result", "pdf_file"]:
     if key not in st.session_state:
         st.session_state[key] = None
 if st.session_state.pdf_file:
     with st.container():
         col1, col2, col3 = st.columns([5, 5, 5], gap="small")
         with col1:
-            if st.button("Generate GPT-4o-min Response"):
-                with st.spinner("Calling GPT-4o-mini..."):
-                    result = llm_service.extract_emissions_data_as_json("openai","gpt-4o-mini",pdf_file)
-                    # result= openai_model.extract_emissions_data_as_json("openai","gpt-4o-mini",pdf_file)
-                    st.session_state.gpt4o_mini_result = result
-            if st.session_state.gpt4o_mini_result:
-                st.write("Extracted Metrics by gpt-4o-mini")
-                st.json(st.session_state.gpt4o_mini_result)
         with col2:
-            if st.button("Generate GPT-4o Response"):
-                with st.spinner("Calling gpt-4o..."):
-                    result= llm_service.extract_emissions_data_as_json("openai","gpt-4o",pdf_file)
-                    st.session_state.gpt4o_result = result
-            if st.session_state.gpt4o_result:
-                st.write("Extracted Metrics by gpt-4o")
-                st.json(st.session_state.gpt4o_result)
         with col3:
             try:
-                if st.button("Generate Gemini Response"):
-                    with st.spinner("Calling gemini-1.5-pro-latest..."):
-                        result = llm_service.extract_emissions_data_as_json("gemini","gemini-2.0-flash", st.session_state.pdf_file)
-                        # result = gemini_model.extract_emissions_data_as_json("gemini","gemini-2.0-flash", pdf_file)
-                        st.session_state.gemini_result = result
             except ClientError as e:
                 st.error(f"Gemini API Error: {e}")
                 logger.error("Error Details:", e.message, e.response)
-            if st.session_state.gemini_result:
-                st.write("Extracted Metrics by gemini-1.5-pro-latest")
-                st.json(st.session_state.gemini_result)

 import streamlit as st
+import os
+from application.services import streamlit_function, gemini_model
 from google.genai.errors import ClientError
 from application.utils import logger
 logger = logger.get_logger()
+# Models offered side by side, one per column. Each entry pairs the API
+# provider string with the model name handed to the extraction service.
+MODEL_1 = "gemini-1.5-pro-latest"
+MODEL_2 = "gemini-2.0-flash"
+MODEL_3 = "gemini-1.5-flash"
+API_1 = "gemini"
+API_2 = "gemini"
+API_3 = "gemini"
+MODEL_CHOICES = [(API_1, MODEL_1), (API_2, MODEL_2), (API_3, MODEL_3)]
+streamlit_function.config_homepage()
+pdf_file = streamlit_function.upload_file("pdf", label="Upload Sustainability Report PDF")
+# Session state survives Streamlit reruns; seed every key that is read below.
+for key in [f"{MODEL_1}_result", f"{MODEL_2}_result", f"{MODEL_3}_result", "pdf_file"]:
     if key not in st.session_state:
         st.session_state[key] = None
+if "excel_file" not in st.session_state:
+    st.session_state["excel_file"] = None
 if st.session_state.pdf_file:
     with st.container():
+        columns = st.columns([5, 5, 5], gap="small")
+        file_name = st.session_state.pdf_file.name.removesuffix(".pdf")
+        # One column per model: a button that triggers extraction, followed by
+        # the most recent result stored for that model.
+        for column, (api, model) in zip(columns, MODEL_CHOICES):
+            result_key = f"{model}_result"
+            with column:
+                # Every column gets the same ClientError handling, so an API
+                # failure is reported in place instead of aborting the page.
+                try:
+                    if st.button(f"Generate {model} Response"):
+                        with st.spinner(f"Calling {model}..."):
+                            result = gemini_model.extract_emissions_data_as_json(api, model, st.session_state.pdf_file)
+                            # Persist the result to data/<file_name>.xlsx so the
+                            # download button below can serve it.
+                            streamlit_function.export_results_to_excel(result, model, file_name)
+                            st.session_state[result_key] = result
+                except ClientError as e:
+                    st.error(f"Gemini API Error: {e}")
+                    # Placeholders are required: extra positional args without
+                    # them make the logging call fail to format the record.
+                    logger.error("Error Details: %s %s", e.message, e.response)
+                if st.session_state[result_key]:
+                    # Show the model name itself, not the session-state key.
+                    st.write(f"Extracted Metrics by {model}")
+                    st.json(st.session_state[result_key])
+    file_path = f"data/{file_name}.xlsx"
+    if os.path.exists(file_path):
+        with open(file_path, "rb") as file:
+            st.download_button(
+                label="Download Excel File",
+                data=file,
+                file_name=f"{file_name}.xlsx",
+                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+            )

application/schemas/response_schema.py CHANGED Viewed

@@ -1,3 +1,5 @@
 RESPONSE_FORMAT = {
     "type": "json_schema",
     "json_schema": {
@@ -449,4 +451,64 @@ GEMINI_RESPONSE_FORMAT = {
         }
     },
     "propertyOrdering": ["Company Name", "Greenhouse Gas (GHG) Protocol Parameters"]
-}

+from pydantic import BaseModel
 RESPONSE_FORMAT = {
     "type": "json_schema",
     "json_schema": {
         }
     },
     "propertyOrdering": ["Company Name", "Greenhouse Gas (GHG) Protocol Parameters"]
+}
+# Pydantic model for one extracted metric; all four fields are declared as
+# required strings (no defaults are given).
+class Parameter(BaseModel):
+    """
+    A generic class to hold details for a sustainability metric.
+    """
+    synonym: str
+    uom: str  # presumably "unit of measure" — TODO confirm
+    description: str
+    value: str  # declared as str, not a numeric type
+# Fixed catalogue of named metrics; every active field is a required
+# Parameter. The commented-out entries at the end are disabled and are
+# therefore not part of the model.
+class GreenhouseGasGHGProtocolParameters(BaseModel):
+    Total_GHG_Emissions: Parameter
+    Scope_1_Emissions: Parameter
+    Scope_2_Emissions: Parameter
+    Scope_3_Emissions: Parameter
+    CO2_Emissions: Parameter
+    CH4_Emissions: Parameter
+    N2O_Emissions: Parameter
+    HFC_Emissions: Parameter
+    PFC_Emissions: Parameter
+    SF6_Emissions: Parameter
+    NF3_Emissions: Parameter
+    Biogenic_CO2_Emissions: Parameter
+    Emissions_Intensity_per_Revenue: Parameter
+    Emissions_Intensity_per_Employee: Parameter
+    Base_Year_Emissions: Parameter
+    Emissions_Reduction_Target: Parameter
+    Emissions_Reduction_Achieved: Parameter
+    Energy_Consumption: Parameter
+    Renewable_Energy_Consumption: Parameter
+    Non_Renewable_Energy_Consumption: Parameter
+    Energy_Intensity_per_Revenue: Parameter
+    Energy_Intensity_per_Employee: Parameter
+    Fuel_Consumption: Parameter
+    Electricity_Consumption: Parameter
+    Heat_Consumption: Parameter
+    Steam_Consumption: Parameter
+    Cooling_Consumption: Parameter
+    Purchased_Goods_and_Services_Emissions: Parameter
+    Capital_Goods_Emissions: Parameter
+    Fuel_and_Energy_Related_Activities_Emissions: Parameter
+    Upstream_Transportation_and_Distribution_Emissions: Parameter
+    Waste_Generated_in_Operations_Emissions: Parameter
+    Business_Travel_Emissions: Parameter
+    Employee_Commuting_Emissions: Parameter
+    Upstream_Leased_Assets_Emissions: Parameter
+    # Disabled fields below — NOTE(review): reason for disabling is not
+    # visible here; confirm before re-enabling.
+    # Downstream_Transportation_and_Distribution_Emissions: Parameter
+    # Processing_of_Sold_Products_Emissions: Parameter
+    # Use_of_Sold_Products_Emissions: Parameter
+    # End_of_Life_Treatment_of_Sold_Products_Emissions: Parameter
+    # Downstream_Leased_Assets_Emissions: Parameter
+    # Franchises_Emissions: Parameter
+    # Investments_Emissions: Parameter
+    # Carbon_Offsets_Purchased: Parameter
+    # Net_GHG_Emissions: Parameter
+    # Carbon_Sequestration: Parameter
+# Top-level model: wraps the metric group in a single required field whose
+# name is identical to its type's name.
+class EmissionData(BaseModel):
+    GreenhouseGasGHGProtocolParameters: GreenhouseGasGHGProtocolParameters

application/services/gemini_model.py CHANGED Viewed

@@ -1,81 +1,138 @@
 import os
 import json
 from google import genai
 from google.genai import types
-from pydantic import BaseModel
-from typing import Optional, Union, BinaryIO
-from application.utils import logger
 from application.schemas.response_schema import GEMINI_RESPONSE_FORMAT
-logger = logger.get_logger()
 PROMPT = (
-    """You are a PDF parsing agent.
-    Your job is to extract from a company’s sustainability or ESG report in PDF format:
-    If the values are not found in the document, please return json null for that value.
-    """
 )
-class Parameter(BaseModel):
     """
-    A generic class to hold details for a sustainability metric.
     """
-    synonym: str
-    uom: str
-    description: str
-    value: str
-class GreenhouseGasGHGProtocolParameters(BaseModel):
-    Total_GHG_Emissions: Parameter
-    Scope_1_Emissions: Parameter
-    Scope_2_Emissions: Parameter
-    Scope_3_Emissions: Parameter
-    CO2_Emissions: Parameter
-    CH4_Emissions: Parameter
-    N2O_Emissions: Parameter
-    HFC_Emissions: Parameter
-    PFC_Emissions: Parameter
-    SF6_Emissions: Parameter
-    NF3_Emissions: Parameter
-    Biogenic_CO2_Emissions: Parameter
-    Emissions_Intensity_per_Revenue: Parameter
-    Emissions_Intensity_per_Employee: Parameter
-    Base_Year_Emissions: Parameter
-    Emissions_Reduction_Target: Parameter
-    Emissions_Reduction_Achieved: Parameter
-    Energy_Consumption: Parameter
-    Renewable_Energy_Consumption: Parameter
-    Non_Renewable_Energy_Consumption: Parameter
-    Energy_Intensity_per_Revenue: Parameter
-    Energy_Intensity_per_Employee: Parameter
-    Fuel_Consumption: Parameter
-    Electricity_Consumption: Parameter
-    Heat_Consumption: Parameter
-    Steam_Consumption: Parameter
-    Cooling_Consumption: Parameter
-    Purchased_Goods_and_Services_Emissions: Parameter
-    Capital_Goods_Emissions: Parameter
-    Fuel_and_Energy_Related_Activities_Emissions: Parameter
-    Upstream_Transportation_and_Distribution_Emissions: Parameter
-    Waste_Generated_in_Operations_Emissions: Parameter
-    Business_Travel_Emissions: Parameter
-    Employee_Commuting_Emissions: Parameter
-    Upstream_Leased_Assets_Emissions: Parameter
-    # Downstream_Transportation_and_Distribution_Emissions: Parameter
-    # Processing_of_Sold_Products_Emissions: Parameter
-    # Use_of_Sold_Products_Emissions: Parameter
-    # End_of_Life_Treatment_of_Sold_Products_Emissions: Parameter
-    # Downstream_Leased_Assets_Emissions: Parameter
-    # Franchises_Emissions: Parameter
-    # Investments_Emissions: Parameter
-    # Carbon_Offsets_Purchased: Parameter
-    # Net_GHG_Emissions: Parameter
-    # Carbon_Sequestration: Parameter
-class EmissionData(BaseModel):
-    GreenhouseGasGHGProtocolParameters: GreenhouseGasGHGProtocolParameters
-# print(json.dumps(EmissionData.model_json_schema(), indent=2))
 def extract_emissions_data_as_json(
     api: str,
@@ -83,34 +140,33 @@ def extract_emissions_data_as_json(
     file_input: Union[BinaryIO, bytes]
 ) -> Optional[dict]:
     """
-    Extract ESG data from PDF using OpenAI or Gemini APIs.
     Args:
-        api: 'openai' or 'gemini'
-        model: Model name (e.g. gpt-4o, gemini-pro)
-        file_input: File-like object or bytes of the PDF.
     Returns:
-        Parsed ESG data as dict or None if failed.
     """
     try:
-        client = genai.Client(api_key=os.getenv("gemini_api_key"))
-        file_bytes = file_input.read()
-        logger.info("[Gemini] Sending content for generation...")
         response = client.models.generate_content(
             model=model,
-            contents=[
-                types.Part.from_bytes(data=file_bytes, mime_type="application/pdf"),
-                PROMPT
-            ],
             config={
                 'response_mime_type': 'application/json',
-                'response_schema': GEMINI_RESPONSE_FORMAT,
             }
         )
         logger.info("[Gemini] Response received.")
         try:
             return json.loads(response.text)
@@ -119,181 +175,5 @@ def extract_emissions_data_as_json(
             return {"raw_response": response.text}
     except Exception as e:
-        logger.exception(f"Error during ESG data extraction.{e}")
-        return None
-# import os
-# from google import genai
-# from pydantic import BaseModel, Field, ValidationError
-# from dotenv import load_dotenv
-# from typing import Optional
-# from google.genai import types
-# load_dotenv()
-# client = genai.Client(api_key=os.getenv("gemini_api_key"))
-# schema= """{
-#   "parameters": [
-#     {
-#       "parameter": "Total GHG Emissions",
-#       "dataType": "Numeric",
-#       "synonyms": ["Carbon Footprint"],
-#       "uom": "Metric Tons CO₂e",
-#       "description": "Total greenhouse gases emitted by the organization."
-#     },
-#     {
-#       "parameter": "Scope 1 Emissions",
-#       "dataType": "Numeric",
-#       "synonyms": ["Direct Emissions"],
-#       "uom": "Metric Tons CO₂e",
-#       "description": "Direct GHG emissions from owned or controlled sources."
-#     },
-#     {
-#       "parameter": "Scope 2 Emissions",
-#       "dataType": "Numeric",
-#       "synonyms": ["Indirect Energy Emissions"],
-#       "uom": "Metric Tons CO₂e",
-#       "description": "Indirect GHG emissions from the consumption of purchased electricity, steam, heating, and cooling."
-#     },
-#     {
-#       "parameter": "Scope 3 Emissions",
-#       "dataType": "Numeric",
-#       "synonyms": ["Value Chain Emissions"],
-#       "uom": "Metric Tons CO₂e",
-#       "description": "Other indirect emissions occurring in the value chain, including both upstream and downstream emissions."
-#     },
-#     {
-#       "parameter": "CO₂ Emissions",
-#       "dataType": "Numeric",
-#       "synonyms": ["Carbon Emissions"],
-#       "uom": "Metric Tons CO₂",
-#       "description": "Emissions of carbon dioxide."
-#     },
-#     {
-#       "parameter": "CH₄ Emissions",
-#       "dataType": "Numeric",
-#       "synonyms": ["Methane Emissions"],
-#       "uom": "Metric Tons CH₄",
-#       "description": "Emissions of methane."
-#     },
-#     {
-#       "parameter": "N₂O Emissions",
-#       "dataType": "Numeric",
-#       "synonyms": ["Nitrous Oxide Emissions"],
-#       "uom": "Metric Tons N₂O",
-#       "description": "Emissions of nitrous oxide."
-#     },
-#     {
-#       "parameter": "HFC Emissions",
-#       "dataType": "Numeric",
-#       "synonyms": ["Hydrofluorocarbon Emissions"],
-#       "uom": "Metric Tons HFCs",
-#       "description": "Emissions of hydrofluorocarbons."
-#     },
-#     {
-#       "parameter": "PFC Emissions",
-#       "dataType": "Numeric",
-#       "synonyms": ["Perfluorocarbon Emissions"],
-#       "uom": "Metric Tons PFCs",
-#       "description": "Emissions of perfluorocarbons."
-#     },
-#     {
-#       "parameter": "SF₆ Emissions",
-#       "dataType": "Numeric",
-#       "synonyms": ["Sulfur Hexafluoride Emissions"],
-#       "uom": "Metric Tons SF₆",
-#       "description": "Emissions of sulfur hexafluoride."
-#     },
-#     {
-#       "parameter": "NF₃ Emissions",
-#       "dataType": "Numeric",
-#       "synonyms": ["Nitrogen Trifluoride Emissions"],
-#       "uom": "Metric Tons NF₃",
-#       "description": "Emissions of nitrogen trifluoride."
-#     },
-#     {
-#       "parameter": "Biogenic CO₂ Emissions",
-#       "dataType": "Numeric",
-#       "synonyms": ["Biogenic Carbon Emissions"],
-#       "uom": "Metric Tons CO₂",
-#       "description": "CO₂ emissions from biological sources."
-#     },
-#     {
-#       "parameter": "Emissions Intensity per Revenue",
-#       "dataType": "Numeric",
-#       "synonyms": ["Carbon Intensity"],
-#       "uom": "Metric Tons CO₂e / Revenue",
-#       "description": "GHG emissions per unit of revenue."
-#     },
-#     {
-#       "parameter": "Emissions Intensity per Employee",
-#       "dataType": "Numeric",
-#       "synonyms": ["Emissions per Employee"],
-#       "uom": "Metric Tons CO₂e / Employee",
-#       "description": "GHG emissions per employee."
-#     },
-#     {
-#       "parameter": "Base Year Emissions",
-#       "dataType": "Numeric",
-#       "synonyms": ["Baseline Emissions"],
-#       "uom": "Metric Tons CO₂e",
-#       "description": "GHG emissions in the base year for comparison."
-#     },
-#     {
-#       "parameter": "Emissions Reduction Target",
-#       "dataType": "Numeric",
-#       "synonyms": ["Emission Reduction Goal"],
-#       "uom": "Percentage (%)",
-#       "description": "Targeted percentage reduction in GHG emissions."
-#     },
-#     {
-#       "parameter": "Emissions Reduction Achieved",
-#       "dataType": "Numeric",
-#       "synonyms": ["Emission Reduction Accomplished"],
-#       "uom": "Percentage (%)",
-#       "description": "Actual percentage reduction in GHG emissions achieved."
-#     },
-#     {
-#       "parameter": "Energy Consumption",
-#       "dataType": "Numeric",
-#       "synonyms": ["Energy Use"],
-#       "uom": "MWh or GJ",
-#       "description": "Total energy consumed by the organization."
-#     },
-#     {
-#       "parameter": "Renewable Energy Consumption",
-#       "dataType": "Numeric",
-#       "synonyms": ["Green Energy Use"],
-#       "uom": "MWh or GJ",
-#       "description": "Amount of energy consumed from renewable sources."
-#     },
-#     {
-#       "parameter": "Non-Renewable Energy Consumption",
-#       "dataType": "Numeric",
-#       "synonyms": ["Fossil Energy Use"],
-#       "uom": "MWh or GJ",
-#       "description": "Amount of energy consumed from non-renewable sources."
-#     },
-#     {
-#       "parameter": "Carbon Offsets Purchased",
-#       "dataType": "Numeric",
-#       "synonyms": ["Carbon Credits"],
-#       "uom": "Metric Tons CO₂e",
-#       "description": "Amount of carbon offsets purchased."
-#     },
-#     {
-#       "parameter": "Net GHG Emissions",
-#       "dataType": "Numeric",
-#       "synonyms": ["Net Carbon Emissions"],
-#       "uom": "Metric Tons CO₂e",
-#       "description": "GHG emissions after accounting for offsets."
-#     },
-#     {
-#       "parameter": "Carbon Sequestration",
-#       "dataType": "Numeric",
-#       "synonyms": ["Carbon Capture"],
-#       "uom": "Metric Tons CO₂e",
-#       "description": "Amount of CO₂ sequestered or captured."
-#     }
-#   ]
-# }"""

 import os
 import json
+import re
+from typing import Optional, Dict, Union, IO, List, BinaryIO
 from google import genai
 from google.genai import types
 from application.schemas.response_schema import GEMINI_RESPONSE_FORMAT
+from application.utils import logger
+# Rebinds the imported `logger` module name to the logger instance it returns.
+logger=logger.get_logger()
+# Module-level Gemini client shared by every function below; the API key is
+# read from the "gemini_api_key" environment variable when the module loads.
+client = genai.Client(api_key=os.getenv("gemini_api_key"))
+# Instruction text sent alongside the uploaded PDF in generate_content calls.
 PROMPT = (
+    """You are a PDF parsing agent. Your job is to extract GHG Protocol Parameters
+    and ESG (Environmental, Social, Governance) Data from a company’s sustainability
+    or ESG report in PDF format."""
 )
+def sanitize_file_name(name: str, max_length: int = 40) -> str:
+    """
+    Turns an arbitrary file name into a slug accepted as a Gemini file name.
+    The result is lowercase, contains only `a-z`, `0-9` and `-`, never starts
+    or ends with a dash, and is cut to at most `max_length` characters.
+    Args:
+        name (str): The original file name (without extension).
+        max_length (int, optional): Maximum allowed characters (default: 40).
+    Returns:
+        str: The sanitized slug.
+    Raises:
+        ValueError: If `name` is empty or not a string, or if nothing is
+            left after sanitizing.
+    """
+    if not (name and isinstance(name, str)):
+        raise ValueError("Invalid file name: must be a non-empty string.")
+    # Every run of characters outside a-z / 0-9 collapses into one dash.
+    slug = re.sub(r'[^a-z0-9]+', '-', name.lower())
+    slug = slug.strip('-')
+    slug = slug[:max_length]
+    # Truncation may leave a dash at the cut point; drop it again.
+    slug = slug.rstrip('-')
+    if slug:
+        return slug
+    raise ValueError("Sanitized file name is empty or invalid after cleanup.")
+def get_files() -> List[str]:
+    """
+    Lists the names of the files currently known to the Gemini client.
+    Returns:
+        List[str]: The `name` attribute of each entry from `client.files.list()`.
     """
+    return [remote.name for remote in client.files.list()]
+def delete_files(file_names: Union[str, List[str]]) -> None:
+    """
+    Removes the given files from Gemini, skipping names that are not present.
+    Args:
+        file_names (Union[str, List[str]]): One file name or a list of names.
+            An empty value only logs a warning.
+    """
+    if not file_names:
+        logger.warning("No file names provided for deletion.")
+        return
+    # Accept a bare string as shorthand for a one-element list.
+    targets = [file_names] if isinstance(file_names, str) else file_names
+    # Snapshot of remote names taken once, before any deletion happens.
+    known = get_files()
+    for target in targets:
+        logger.info(f"Attempting to delete file: {target}")
+        if target not in known:
+            logger.warning(f"File not found: {target}")
+            continue
+        client.files.delete(name=target)
+        logger.info(f"Deleted file: {target}")
+def upload_file(
+    file: Union[str, IO[bytes]],
+    file_name: Optional[str] = None,
+    config: Optional[Dict[str, str]] = None
+) -> Optional[types.File]:
+    """
+    Uploads a file to the Gemini API, reusing an existing upload with the same name.
+    The remote name is derived from `file_name` (extension removed, then
+    passed through `sanitize_file_name`). If `files/<sanitized name>` is
+    already listed by `get_files()`, that file is fetched and returned
+    instead of uploading again.
+    Args:
+        file (Union[str, IO[bytes]]): File path or binary file object (e.g., from Streamlit).
+        file_name (Optional[str]): Name for the file. If None, the base name of
+            the path or of `file.name` is used.
+        config (Optional[Dict[str, str]]): Extra upload config. It is copied, never
+            modified. A caller-supplied 'mime_type' is honoured (default
+            'application/pdf'); 'name' is always set to the sanitized name.
+    Returns:
+        Optional[types.File]: The uploaded Gemini file object, or the existing one.
+    Raises:
+        ValueError: If no file name can be determined or it sanitizes to nothing.
+        Exception: Any error from the Gemini client is logged and re-raised.
     """
+    try:
+        if not file_name:
+            if isinstance(file, str):
+                file_name = os.path.basename(file)
+            elif hasattr(file, "name"):
+                file_name = os.path.basename(file.name)
+            else:
+                raise ValueError("file_name must be provided if file has no 'name' attribute.")
+        sanitized_name = sanitize_file_name(os.path.splitext(file_name)[0])
+        # Build a fresh dict so the caller's `config` is left untouched. Key
+        # order matters: the default mime type comes first so caller values
+        # override it, and the sanitized name comes last so it always wins.
+        upload_config = {
+            "mime_type": "application/pdf",
+            **(config or {}),
+            "name": sanitized_name,
+        }
+        gemini_file_key = f"files/{sanitized_name}"
+        if gemini_file_key in get_files():
+            logger.info(f"File already exists on Gemini: {gemini_file_key}")
+            return client.files.get(name=gemini_file_key)
+        logger.info(f"Uploading file to Gemini: {gemini_file_key}")
+        if isinstance(file, str):
+            with open(file, "rb") as f:
+                return client.files.upload(file=f, config=upload_config)
+        return client.files.upload(file=file, config=upload_config)
+    except Exception as e:
+        logger.error(f"Failed to upload file '{file_name}': {e}")
+        raise
 def extract_emissions_data_as_json(
     api: str,
     file_input: Union[BinaryIO, bytes]
 ) -> Optional[dict]:
     """
+    Extracts ESG data from a PDF using the Gemini API.
     Args:
+        api (str): API provider (must be 'gemini').
+        model (str): Model name (e.g., 'gemini-pro').
+        file_input (Union[BinaryIO, bytes]): File object or byte stream.
     Returns:
+        Optional[dict]: Parsed JSON response or raw text if parsing fails.
     """
     try:
+        if api.lower() != "gemini":
+            logger.error(f"Unsupported API: {api}")
+            return None
+        file_name = file_input.name if hasattr(file_input, 'name') else "uploaded_file.pdf"
+        uploaded_file = upload_file(file=file_input, file_name=file_name)
         response = client.models.generate_content(
             model=model,
+            contents=[uploaded_file, PROMPT],
             config={
                 'response_mime_type': 'application/json',
+                'response_schema': GEMINI_RESPONSE_FORMAT
             }
         )
         logger.info("[Gemini] Response received.")
         try:
             return json.loads(response.text)
             return {"raw_response": response.text}
     except Exception as e:
+        logger.exception("Error during ESG data extraction.")
+        return None

application/services/streamlit_function.py CHANGED Viewed

@@ -1,5 +1,9 @@
 import streamlit as st
 from typing import Union, List
 from application.utils import logger
 logger = logger.get_logger()
@@ -75,15 +79,49 @@ def upload_file(
         st.session_state.pdf_file = uploaded_files
         return uploaded_files
-# def extract_text_from_pdf(file) -> str:
-#     """
-#     Extracts and returns the full text content from a PDF file.
-#     :param file: PDF file object (BytesIO or UploadedFile from Streamlit)
-#     :return: Extracted text as a string
-#     """
-#     text = ""
-#     with fitz.open(stream=file.read(), filetype="pdf") as doc:
-#         for page in doc:
-#             text += page.get_text()
-#     return text.strip()

 import streamlit as st
 from typing import Union, List
+import pandas as pd
+from io import BytesIO
+import json
+import os
 from application.utils import logger
 logger = logger.get_logger()
         st.session_state.pdf_file = uploaded_files
         return uploaded_files
+def export_results_to_excel(results: dict, sheet_name: str, filename: str = "output.xlsx") -> BytesIO:
+    """
+    Flattens a result dictionary into one table and writes it out as Excel.
+    The table is written twice: to `data/<filename>` on disk (appended below
+    any existing rows of the same sheet when the workbook already exists) and
+    to an in-memory workbook that holds only the current table.
+    Args:
+        results (dict): The data to export.
+        sheet_name (str): The sheet name to write to.
+        filename (str): The Excel file name (with or without '.xlsx').
+    Returns:
+        BytesIO: In-memory Excel file, rewound to the start.
+    """
+    try:
+        frame = pd.json_normalize(results, sep='_')
+        frame.replace({None: "", "NULL": ""}, inplace=True)
+    except Exception as e:
+        # Unparseable input is exported as a one-cell error sheet.
+        frame = pd.DataFrame([{"error": f"Could not parse result: {str(e)}"}])
+
+    if not filename.endswith(".xlsx"):
+        filename = f"{filename}.xlsx"
+    os.makedirs("data", exist_ok=True)  # Ensure the folder exists
+    target = os.path.join("data", filename)
+
+    if not os.path.exists(target):
+        # First export for this workbook: plain write with a header row.
+        frame.to_excel(target, index=False, engine="openpyxl", sheet_name=sheet_name)
+    else:
+        with pd.ExcelWriter(target, engine="openpyxl", mode="a", if_sheet_exists="overlay") as writer:
+            workbook = writer.book
+            # Continue after the last used row of an existing sheet; a new
+            # sheet starts at the top and gets a header.
+            first_free_row = workbook[sheet_name].max_row if sheet_name in workbook.sheetnames else 0
+            frame.to_excel(
+                writer,
+                sheet_name=sheet_name,
+                index=False,
+                header=first_free_row == 0,
+                startrow=first_free_row,
+            )
+
+    # Separate in-memory workbook for the caller.
+    buffer = BytesIO()
+    with pd.ExcelWriter(buffer, engine="openpyxl") as writer:
+        frame.to_excel(writer, index=False, sheet_name=sheet_name)
+    buffer.seek(0)
+    return buffer

application/services/supabase_service.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import json
+import os
+from datetime import datetime
+from supabase import create_client, StorageException
+# Import through the `application` package, as the other service modules do;
+# a bare `utils` import only resolves if a top-level `utils` package exists.
+from application.utils import logger
+from dotenv import load_dotenv
+# Logger Initialization
+logger = logger.get_logger()
+# Load Environment Variables
+load_dotenv()
+# Connection settings are read from the environment (after load_dotenv()).
+SUPABASE_URL = os.getenv('SUPABASE_URL')
+SUPABASE_KEY = os.getenv('SUPABASE_KEY')
+SUPABASE_BUCKET = os.getenv('SUPABASE_BUCKET')
+LLM_MODEL_NAME = os.getenv('LLM_MODEL_NAME')
+BUCKET_FOLDER = "chat-history"
+# Supabase Client Initialization
+# NOTE(review): runs at import time; behaviour when SUPABASE_URL / SUPABASE_KEY
+# are unset depends on create_client — confirm.
+supabase = create_client(SUPABASE_URL, SUPABASE_KEY)

requirements.txt CHANGED Viewed

@@ -4,6 +4,6 @@ dotenv
 google
 google.genai
 google-generativeai
-pymupdf
-openpyxl
-pandas

 google
 google.genai
 google-generativeai
+pandas
+supabase
+openpyxl