Spaces:
Sleeping
Sleeping
Vela
committed on
Commit
·
172e21d
1
Parent(s):
22481bd
added agentic framework
Browse files- application/agents/extractor_agent.py +35 -0
- application/agents/scraper_agent.py +214 -0
- application/services/gemini_model.py +49 -13
- application/services/langgraph_service.py +22 -0
- application/services/mongo_db_service.py +87 -0
- application/tools/emission_data_extractor.py +115 -0
- application/tools/pdf_downloader_tool.py +63 -0
- application/tools/web_search_tools.py +150 -0
- pages/chatbot.py +102 -0
- pages/multiple_pdf_extractor.py +6 -6
- requirements.txt +8 -1
application/agents/extractor_agent.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

from application.tools.emission_data_extractor import extract_emission_data_as_json
from application.services.langgraph_service import create_agent
from application.utils.logger import get_logger

# Load environment variables (e.g. OPENAI_API_KEY) before any client is constructed.
load_dotenv()
logger = get_logger()

# System prompt constraining the agent to pure emission-data extraction —
# no summarization or interpretation, JSON output preferred.
EXTRACTOR_SYSTEM_PROMPT = """
You are an intelligent assistant specialized in extracting emission-related ESG (Environmental, Social, and Governance) data from PDF documents.

You have access to the following tool:
- **extract_emission_data_as_json**: Use this tool to upload a PDF and extract structured emission-related information as a JSON response.

Instructions:
- Your task is to extract only emission-related ESG data, such as carbon emissions, Scope 1, Scope 2, Scope 3 emissions, and other relevant sustainability metrics.
- Always attempt to return structured JSON data if possible.
- If structured data cannot be extracted cleanly, ensure that the raw response from the document is returned under a "raw_response" field.
- Do not make assumptions or hallucinate missing values — extract only what is explicitly present in the document.
- Always prioritize extracting the latest, most clearly defined data from the PDF.
- Do not summarize, analyze, or interpret the document — your only role is **accurate data extraction**.

Goal:
- Accurately upload the PDF.
- Extract the requested emission-related ESG data in a clean JSON format.
- Handle edge cases gracefully (e.g., invalid PDFs, no emission data found).

Behave like a highly precise and reliable data extraction engine.
"""

# temperature=0 so extraction runs are reproducible.
llm = ChatOpenAI(model= 'gpt-4o-mini', temperature=0)

# AgentExecutor wired with the single extraction tool.
extractor_agent = create_agent(llm, [extract_emission_data_as_json], EXTRACTOR_SYSTEM_PROMPT)
|
application/agents/scraper_agent.py
ADDED
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
import json
from typing import TypedDict, Annotated, List

from dotenv import load_dotenv
from langchain_core.messages import ToolMessage, HumanMessage
from langchain_openai import ChatOpenAI
from langgraph.graph import StateGraph, END
from langgraph.graph.message import add_messages

# Local Imports
from application.tools.web_search_tools import get_top_companies_from_web, get_sustainability_report_pdf
from application.tools.pdf_downloader_tool import download_pdf
from application.tools.emission_data_extractor import extract_emission_data_as_json
from application.services.langgraph_service import create_agent
from application.utils.logger import get_logger

# setting up environment and logger
load_dotenv()
logger = get_logger()

# LangSmith tracing: only enable it when the API key is actually present.
# The original unconditional `os.environ['LANGSMITH_API_KEY'] = os.getenv(...)`
# raised TypeError at import time whenever the variable was unset, because
# os.environ values must be strings, never None.
LANGSMITH_API_KEY = os.getenv('LANGSMITH_API_KEY')
if LANGSMITH_API_KEY:
    os.environ['LANGSMITH_API_KEY'] = LANGSMITH_API_KEY
    os.environ['LANGCHAIN_TRACING_V2'] = 'true'
    os.environ["LANGCHAIN_PROJECT"] = "Sustainability_AI"
else:
    logger.warning("LANGSMITH_API_KEY not set; LangSmith tracing is disabled.")

# OpenAI api key set up: the original self-assignment crashed when the key was
# missing; warn instead so the failure surfaces at the first API call with context.
if not os.environ.get("OPENAI_API_KEY"):
    logger.warning("OPENAI_API_KEY not set; OpenAI calls will fail.")
31 |
+
|
32 |
+
|
33 |
+
class AgentState(TypedDict):
    """Shared LangGraph state: the running message history of the workflow."""
    # `add_messages` makes LangGraph append updates to the list instead of
    # replacing it, so each node only returns the new messages it produced.
    messages: Annotated[List, add_messages]
|
35 |
# NOTE(review): this `graph` is never used — the workflow below is built on
# `graph_builder` instead. Kept to avoid breaking any external import;
# consider removing.
graph = StateGraph(AgentState)

# Deterministic chat model (temperature=0) shared by the scraper workflow.
model = ChatOpenAI(model= 'gpt-4o-mini', temperature=0)
# Tools the model may request; binding lets the LLM emit tool_calls for them.
tools = [get_top_companies_from_web, get_sustainability_report_pdf, download_pdf, extract_emission_data_as_json]
model_with_tools = model.bind_tools(tools)
|
41 |
+
|
42 |
+
def invoke_model(state: AgentState) -> dict:
    """Run the tool-bound LLM over the accumulated conversation history.

    Returns a partial state update: the AIMessage produced by the model,
    wrapped in a list so `add_messages` appends it to the history.
    """
    logger.info("--- Invoking Model ---")
    history = state['messages']
    ai_message = model_with_tools.invoke(history)
    return {"messages": [ai_message]}
|
51 |
+
|
52 |
+
def _serialize_tool_result(result) -> str:
    """Coerce an arbitrary tool return value into ToolMessage-safe text."""
    if isinstance(result, (list, dict)):
        try:
            return json.dumps(result)  # JSON keeps structure readable downstream
        except TypeError:
            # Container holds non-JSON-serializable values; fall back to repr.
            return str(result)
    # Handle CompanyListResponse-style objects exposing a `.companies` list.
    if hasattr(result, 'companies') and isinstance(result.companies, list):
        return f"Companies found: {', '.join(result.companies)}"
    if result is None:
        return "Tool executed successfully, but returned no specific data (None)."
    return str(result)


def invoke_tools(state: AgentState) -> dict:
    """Execute every tool call requested by the last AI message.

    Args:
        state: Current graph state; the last message is expected to be the
            AIMessage carrying `tool_calls`.

    Returns:
        dict: {"messages": [ToolMessage, ...]} to append to the state, or an
        empty dict when the last message carries no tool calls (no-op update).
    """
    logger.info("--- Invoking Tools ---")
    last_message = state['messages'][-1]

    # Nothing to do unless the model actually requested tools.
    if not hasattr(last_message, 'tool_calls') or not last_message.tool_calls:
        logger.info("No tool calls found in the last message.")
        return {}

    # Resolve tool objects by name once, outside the loop.
    tool_map = {tool.name: tool for tool in tools}
    tool_invocation_messages = []

    for tool_call in last_message.tool_calls:
        tool_name = tool_call['name']
        tool_args = tool_call['args']
        tool_call_id = tool_call['id']  # crucial for linking the result back

        logger.info(f"Executing tool: {tool_name} with args: {tool_args}")

        selected_tool = tool_map.get(tool_name)
        if selected_tool is None:
            logger.warning(f"Tool '{tool_name}' not found.")
            content = f"Error: Tool '{tool_name}' not found."
        else:
            try:
                result = selected_tool.invoke(tool_args)
                content = _serialize_tool_result(result)
                logger.info(f"Tool {tool_name} result: {content}")
            except Exception as e:
                # Surface the failure to the model instead of crashing the graph.
                logger.error(f"Error executing tool {tool_name}: {e}")
                content = f"Error executing tool {tool_name}: {str(e)}"

        tool_invocation_messages.append(
            ToolMessage(content=content, tool_call_id=tool_call_id)
        )

    # Return the collected ToolMessages to be appended to the state.
    return {"messages": tool_invocation_messages}
|
115 |
+
|
116 |
+
# --- Graph Definition ---
graph_builder = StateGraph(AgentState)

# Add nodes: one LLM step and one tool-execution step.
graph_builder.add_node("scraper_agent", invoke_model)
graph_builder.add_node("tools", invoke_tools) # Renamed for clarity

# Define edges: every run starts with the LLM deciding what to do.
graph_builder.set_entry_point("scraper_agent")
|
125 |
+
|
126 |
+
# Conditional edge: After the agent runs, decide whether to call tools or end.
def router(state: AgentState) -> str:
    """Route to the tools node when the last AI message requested tools, else END."""
    newest = state['messages'][-1]
    wants_tools = bool(getattr(newest, 'tool_calls', None))
    if not wants_tools:
        # No pending tool requests: the conversation is complete.
        logger.info("--- Routing to End ---")
        return END
    logger.info("--- Routing to Tools ---")
    return "tools"
|
138 |
+
|
139 |
+
# Branch after the agent node: run requested tools or finish the graph.
graph_builder.add_conditional_edges(
    "scraper_agent",
    router,
    {
        "tools": "tools", # If router returns "tools", go to the "tools" node
        END: END, # If router returns END, finish the graph execution
    }
)

# After tools are invoked, their results (ToolMessages) should go back to the agent
graph_builder.add_edge("tools", "scraper_agent")

# Compile the graph
app = graph_builder.compile()
|
153 |
+
|
154 |
+
# # --- Running the Graph ---
|
155 |
+
# if __name__ == "__main__":
|
156 |
+
# logger.info("Starting graph execution...")
|
157 |
+
# # Use HumanMessage for the initial input
|
158 |
+
# initial_input = {"messages": [HumanMessage(content="Please download this pdf https://www.infosys.com/sustainability/documents/infosys-esg-report-2023-24.pdf")]}
|
159 |
+
|
160 |
+
# # Stream events to see the flow (optional, but helpful for debugging)
|
161 |
+
# # Add recursion limit to prevent infinite loops
|
162 |
+
# try:
|
163 |
+
# final_state = None
|
164 |
+
# for event in app.stream(initial_input, {"recursion_limit": 15}):
|
165 |
+
# # event is a dictionary where keys are node names and values are outputs
|
166 |
+
# logger.info(f"Event: {event}")
|
167 |
+
# # Keep track of the latest state if needed, especially the messages
|
168 |
+
# if "scraper_agent" in event:
|
169 |
+
# final_state = event["scraper_agent"]
|
170 |
+
# elif "tools" in event:
|
171 |
+
# final_state = event["tools"] # Though tool output doesn't directly give full state
|
172 |
+
# logger.info("---")
|
173 |
+
|
174 |
+
# logger.info("\n--- Final State Messages ---")
|
175 |
+
# # To get the absolute final state after streaming, invoke might be simpler,
|
176 |
+
# # or you need to properly aggregate the state from the stream events.
|
177 |
+
# # A simpler way to get final output:
|
178 |
+
# final_output = app.invoke(initial_input, {"recursion_limit": 15})
|
179 |
+
# logger.info(json.dumps(final_output['messages'][-1].dict(), indent=2)) # Print the last message
|
180 |
+
|
181 |
+
# except Exception as e:
|
182 |
+
# logger.error(f"\n--- An error occurred during graph execution ---")
|
183 |
+
# import traceback
|
184 |
+
# traceback.print_exc()
|
185 |
+
|
186 |
+
|
187 |
+
# System prompt steering the scraper agent's tool selection and query building.
SCRAPER_SYSTEM_PROMPT = """
You are an intelligent assistant specialized in company research and sustainability report retrieval.

You have access to the following tools:
- **search_tool**: Use this tool when the user asks for a list of top companies related to an industry or category (e.g., "top 5 textile companies"). Always preserve any number mentioned (e.g., 'top 5', 'top 10') in the query.
- **pdf_finder_tool**: Use this tool when the user requests a sustainability report or any other specific PDF document about a company. Search specifically for the latest sustainability report if not otherwise specified.
- **pdf_downloader_tool**: Use this tool when the user provides a direct PDF link or asks you to download a PDF document from a URL.

Instructions:
- Carefully read the user's request and select the correct tool based on their intent.
- Always preserve important details like quantity (e.g., "top 5"), industry, or company name.
- If the user mentions multiple companies and asks for reports, find reports for **each** company individually.
- Do not add assumptions, opinions, or unrelated information.
- Always generate clean, direct, and minimal input for the tool — close to the user's original query.
- Prioritize the most recent information when searching for reports unless otherwise instructed.

Goal:
- Select the appropriate tool.
- Build a precise query that perfectly reflects the user's request.
- Return only what the user asks — no extra text or interpretation.
"""
# Aliases matching the tool names referenced in the prompt above.
search_tool = get_top_companies_from_web
pdf_finder_tool = get_sustainability_report_pdf
pdf_downloader_tool = download_pdf

# temperature=0 keeps tool selection deterministic.
llm = ChatOpenAI(model= 'gpt-4o-mini', temperature=0)

# AgentExecutor wired with the three research/retrieval tools.
scraper_agent = create_agent(llm, [search_tool, pdf_finder_tool, pdf_downloader_tool], SCRAPER_SYSTEM_PROMPT)
|
application/services/gemini_model.py
CHANGED
@@ -10,22 +10,57 @@ logger=logger.get_logger()
|
|
10 |
|
11 |
client = genai.Client(api_key=os.getenv("gemini_api_key"))
|
12 |
|
13 |
-
PROMPT = (
|
14 |
-
|
15 |
-
|
16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
-
|
19 |
-
|
20 |
-
or not found in the document. If a value is missing or unavailable, return a suitable
|
21 |
-
placeholder according to the format used
|
22 |
-
in the schema.
|
23 |
|
24 |
-
|
25 |
-
|
|
|
|
|
26 |
|
27 |
-
|
28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
"""
|
30 |
)
|
31 |
|
@@ -176,6 +211,7 @@ def extract_emissions_data_as_json(
|
|
176 |
config={
|
177 |
'response_mime_type': 'application/json',
|
178 |
'response_schema': response_schema,
|
|
|
179 |
},
|
180 |
)
|
181 |
if hasattr(response, 'usage_metadata'):
|
|
|
10 |
|
11 |
client = genai.Client(api_key=os.getenv("gemini_api_key"))
|
12 |
|
13 |
+
# PROMPT = (
|
14 |
+
# """You are a PDF parsing agent. Your job is to extract GHG Protocol Parameters
|
15 |
+
# and ESG (Environmental, Social, Governance) Data from a company’s sustainability
|
16 |
+
# or ESG report in PDF format.
|
17 |
+
|
18 |
+
# You must extract the data based on a predefined response schema. It is critical
|
19 |
+
# that you return all keys specified in the schema, even if the value is not present
|
20 |
+
# or not found in the document. If a value is missing or unavailable, return a suitable
|
21 |
+
# placeholder according to the format used
|
22 |
+
# in the schema.
|
23 |
|
24 |
+
# Your output should strictly follow the structure of the schema, ensuring completeness
|
25 |
+
# and consistency for downstream processing.
|
|
|
|
|
|
|
26 |
|
27 |
+
# Be precise in extracting values and identifying relevant context from the PDF. Use
|
28 |
+
# surrounding text or tables to identify the most likely match for each field.
|
29 |
+
# """
|
30 |
+
# )
|
31 |
|
32 |
+
PROMPT = (
|
33 |
+
"""You are a PDF parsing agent specialized in extracting structured sustainability data from a company's Sustainability, ESG, or Corporate Responsibility Report in PDF format.
|
34 |
+
Your task is to extract Greenhouse Gas (GHG) Protocol, Environmental (CSRD), Materiality, Net Zero Interventions, and ESG (Environmental, Social, Governance) Data with high accuracy and consistency for downstream processing.
|
35 |
+
|
36 |
+
### Instructions:
|
37 |
+
1. **Schema Adherence**: Strictly follow the provided schema for output structure. Ensure every field in the schema is populated with either extracted data or a placeholder.
|
38 |
+
2. **Data Sources**: Extract data from all relevant sections of the PDF, including:
|
39 |
+
- Narrative text
|
40 |
+
- Tables
|
41 |
+
- Infographics, charts, or visual elements (interpret labels, captions, or legends to extract numerical or textual data)
|
42 |
+
- Footnotes or appendices
|
43 |
+
3. **Infographic Handling**: For infographics, prioritize:
|
44 |
+
- Text labels or annotations within the graphic
|
45 |
+
- Captions or descriptions near the infographic
|
46 |
+
- Legends or keys that clarify values
|
47 |
+
- If values are ambiguous, cross-reference with narrative text or tables discussing similar metrics.
|
48 |
+
4. **Year and Scope**: Identify the reporting year and scope (e.g., global, regional) for each metric. If not explicitly stated, infer from the report's context (e.g., '2023 Sustainability Report' implies 2023 data).
|
49 |
+
5. **Edge Cases**:
|
50 |
+
- If data is missing, use placeholders as specified in the schema.
|
51 |
+
- If multiple values exist for a field (e.g., emissions for different years), select the most recent year unless otherwise specified in the schema.
|
52 |
+
|
53 |
+
### Output Requirements:
|
54 |
+
- Return a JSON object adhering to the schema.
|
55 |
+
- Ensure all fields are populated, using placeholders for missing data.
|
56 |
+
- Include a 'notes' field in the output for any assumptions, estimations, or conflicts encountered during extraction.
|
57 |
+
|
58 |
+
|
59 |
+
### Task:
|
60 |
+
- Parse the PDF thoroughly to extract all relevant data.
|
61 |
+
- Ensure consistency in units, years, and terminology across the output.
|
62 |
+
- Handle infographics with care, prioritizing textual data and flagging estimates.
|
63 |
+
- Provide a complete, schema-compliant JSON output with notes for any ambiguities or assumptions.
|
64 |
"""
|
65 |
)
|
66 |
|
|
|
211 |
config={
|
212 |
'response_mime_type': 'application/json',
|
213 |
'response_schema': response_schema,
|
214 |
+
'temperature': 0.0,
|
215 |
},
|
216 |
)
|
217 |
if hasattr(response, 'usage_metadata'):
|
application/services/langgraph_service.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List
|
2 |
+
from langchain.agents import AgentExecutor, create_openai_tools_agent
|
3 |
+
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
|
4 |
+
from langchain_core.tools import BaseTool
|
5 |
+
from langchain_openai import ChatOpenAI
|
6 |
+
|
7 |
+
def create_agent(
    llm: ChatOpenAI,
    tools: List[BaseTool],
    system_prompt: str
) -> AgentExecutor:
    """Build an AgentExecutor around an OpenAI-tools agent.

    Args:
        llm: Chat model the agent reasons with.
        tools: Tools the agent is allowed to call.
        system_prompt: System instructions placed first in the prompt.

    Returns:
        AgentExecutor: ready-to-invoke executor wrapping the agent and tools.
    """
    chat_template = ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        # Conversation history supplied by the caller.
        MessagesPlaceholder(variable_name="messages"),
        # Intermediate tool-call steps accumulated by the executor.
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ])
    openai_tools_agent = create_openai_tools_agent(llm, tools, chat_template)
    return AgentExecutor(agent=openai_tools_agent, tools=tools)
|
application/services/mongo_db_service.py
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from pymongo import MongoClient
|
3 |
+
from dotenv import load_dotenv
|
4 |
+
from typing import List, Dict, Optional
|
5 |
+
from application.utils.logger import get_logger
|
6 |
+
|
7 |
+
logger = get_logger()
|
8 |
+
|
9 |
+
load_dotenv()
|
10 |
+
|
11 |
+
DB_NAME = "sustainability_reports_db"
|
12 |
+
|
13 |
+
def get_mongo_client():
    """Create a MongoClient from the MONGODB_URI environment variable.

    Returns:
        MongoClient | None: a client instance, or None when the URI is
        missing or the client cannot be constructed.
    """
    uri = os.getenv("MONGODB_URI")
    # Guard: MongoClient(None) silently falls back to localhost:27017,
    # which hides a missing configuration. Fail loudly instead.
    if not uri:
        logger.error("MONGODB_URI environment variable is not set.")
        return None
    try:
        return MongoClient(uri)
    except Exception as e:
        logger.exception(f"An unexpected error occurred while connecting to MongoDB: {str(e)}")
        return None
|
20 |
+
|
21 |
+
def store_document(collection_name: str, document: Dict) -> Optional[str]:
    """
    Stores a document in MongoDB if it doesn't already exist.

    Args:
        collection_name (str): Name of the MongoDB collection.
        document (Dict): The document to be inserted.

    Returns:
        Optional[str]: Inserted document ID if successful, None otherwise.
    """
    client = get_mongo_client()
    if client is None:
        logger.error("MongoDB client is not available.")
        return None
    try:
        db = client.get_database(DB_NAME)
        collection = db[collection_name]

        # Exact-match dedupe: skip insertion only when an identical document exists.
        existing_document = collection.find_one(document)
        if existing_document:
            logger.info(f"Document already exists with ID: {existing_document['_id']}")
            return str(existing_document['_id'])

        # If no existing document, insert the new one.
        result = collection.insert_one(document)
        logger.info(f"New document inserted with ID: {result.inserted_id}")
        return str(result.inserted_id)

    except Exception as e:
        logger.exception(f"An unexpected error occurred: {str(e)}")
        return None
    finally:
        # Fix: the original leaked one connection per call; always close.
        client.close()
|
56 |
+
|
57 |
+
def retrieve_documents(collection_name: str, query: Optional[Dict] = None) -> List[Dict]:
    """
    Retrieves documents from the specified MongoDB collection.

    Args:
        collection_name (str): Name of the MongoDB collection.
        query (Optional[Dict]): A MongoDB query filter. Defaults to {} (fetch all documents).

    Returns:
        List[Dict]: A list of documents matching the query. Empty list if none found or error occurs.
    """
    client = get_mongo_client()
    if client is None:
        logger.error("MongoDB client is not available.")
        return []
    try:
        db = client.get_database(DB_NAME)
        collection = db[collection_name]

        # Materialize the cursor before the connection is closed below.
        documents = list(collection.find(query or {}))

        logger.info(f"Retrieved {len(documents)} documents from collection: {collection_name}")
        return documents

    except Exception as e:
        logger.exception(f"An error occurred while retrieving documents: {str(e)}")
        return []
    finally:
        # Fix: the original leaked one connection per call; always close.
        client.close()
|
86 |
+
|
87 |
+
# all_docs = retrieve_documents("Zalando")
|
application/tools/emission_data_extractor.py
ADDED
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
import json
# NOTE(review): `re`, `requests` and `types` appear unused in this module —
# confirm before removing, callers may rely on re-export.
import re
from typing import Optional, Dict, Union, BinaryIO
import requests
from google import genai
from google.genai import types
from application.utils.logger import get_logger
from application.services.gemini_model import upload_file
from application.services.mongo_db_service import store_document
from application.schemas.response_schema import GEMINI_GHG_PARAMETERS
from langchain_core.tools import tool

logger = get_logger()

# Gemini client; key read from the environment (loaded by the app entry point).
client = genai.Client(api_key=os.getenv("gemini_api_key"))
MODEL = "gemini-2.0-flash"


# Extraction prompt sent together with the uploaded PDF; output is further
# constrained by the GEMINI_GHG_PARAMETERS response schema at call time.
PROMPT = (
    """You are a PDF parsing agent specialized in extracting structured sustainability data from a company's Sustainability, ESG, or Corporate Responsibility Report in PDF format.
Your task is to extract Greenhouse Gas (GHG) Protocol, Environmental (CSRD), Materiality, Net Zero Interventions, and ESG (Environmental, Social, Governance) Data with high accuracy and consistency for downstream processing.

### Instructions:
1. **Schema Adherence**: Strictly follow the provided schema for output structure. Ensure every field in the schema is populated with either extracted data or a placeholder.
2. **Data Sources**: Extract data from all relevant sections of the PDF, including:
- Narrative text
- Tables
- Infographics, charts, or visual elements (interpret labels, captions, or legends to extract numerical or textual data)
- Footnotes or appendices
3. **Infographic Handling**: For infographics, prioritize:
- Text labels or annotations within the graphic
- Captions or descriptions near the infographic
- Legends or keys that clarify values
- If values are ambiguous, cross-reference with narrative text or tables discussing similar metrics.
4. **Year and Scope**: Identify the reporting year and scope (e.g., global, regional) for each metric. If not explicitly stated, infer from the report's context (e.g., '2023 Sustainability Report' implies 2023 data).
5. **Edge Cases**:
- If data is missing, use placeholders as specified in the schema.
- If multiple values exist for a field (e.g., emissions for different years), select the most recent year unless otherwise specified in the schema.

### Output Requirements:
- Return a JSON object adhering to the schema.
- Ensure all fields are populated, using placeholders for missing data.
- Include a 'notes' field in the output for any assumptions, estimations, or conflicts encountered during extraction.


### Task:
- Parse the PDF thoroughly to extract all relevant data.
- Ensure consistency in units, years, and terminology across the output.
- Handle infographics with care, prioritizing textual data and flagging estimates.
- Provide a complete, schema-compliant JSON output with notes for any ambiguities or assumptions.
"""
)
|
54 |
+
|
55 |
+
@tool
def extract_emission_data_as_json(file_input: Union[BinaryIO, bytes, str]) -> Optional[Dict]:
    """
    Extracts emission-related ESG data from a PDF file using the Gemini API.

    This function uploads the provided PDF (local file path, binary file, or byte stream) to Gemini,
    sends a structured prompt to extract relevant emission data, and attempts to parse the response as JSON.

    Args:
        file_input (Union[BinaryIO, bytes, str]):
            The input file to process. Can be a file object, byte stream, or local file path.

    Returns:
        Optional[Dict]:
            A dictionary containing the extracted emission data if parsing succeeds,
            or a dictionary with the raw text response if JSON parsing fails.
            Returns None if the extraction process encounters an error.

    Notes:
        - The function automatically handles uploading if the file is not already present on Gemini.
        - If the response is not valid JSON, the raw response text is returned under the key "raw_response".
        - Token usage information (input, output, total tokens) is logged if available.
    """
    try:
        uploaded_file = upload_file(file=file_input)

        response = client.models.generate_content(
            model=MODEL,
            contents=[uploaded_file, PROMPT],
            config={
                'response_mime_type': 'application/json',
                'response_schema': GEMINI_GHG_PARAMETERS,
                'temperature': 0.0,  # deterministic extraction
            },
        )
        if hasattr(response, 'usage_metadata'):
            logger.info(f"Input tokens: {response.usage_metadata.prompt_token_count}")
            logger.info(f"Output tokens: {response.usage_metadata.candidates_token_count}")
            logger.info(f"Total tokens: {response.usage_metadata.total_token_count}")
        else:
            logger.info("Token usage metadata not available in response")

        logger.info("[Gemini] Response received.")
        try:
            # Fix: parse once and reuse (the original called json.loads twice).
            result = json.loads(response.text)
            # Fix: .get() would crash if the model returned a JSON array;
            # only persist to MongoDB when we actually got an object.
            if isinstance(result, dict):
                file_name = result.get('Company Name', 'Unknown Company')
                document = {"Greenhouse Gas (GHG) Protocol Parameters": result.get('Greenhouse Gas (GHG) Protocol Parameters')}
                store_document(file_name, document)
            return result
        except json.JSONDecodeError:
            logger.warning("Failed to parse JSON, returning raw response.")
            return {"raw_response": response.text}

    except Exception as e:
        logger.exception("Error during ESG data extraction.")
        return None
|
application/tools/pdf_downloader_tool.py
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import requests
|
3 |
+
from application.utils.logger import get_logger
|
4 |
+
|
5 |
+
from langchain_core.tools import tool
|
6 |
+
|
7 |
+
|
8 |
+
logger = get_logger()
|
9 |
+
|
10 |
+
@tool
def download_pdf(filename: str, url: str, save_path: str = "reports", overwrite: bool = False):
    """
    Downloads a PDF file from a given URL ('pdf_link') and saves it locally
    with the specified 'filename'. Returns the local path if successful, otherwise None.
    Use this tool AFTER get_sustainability_report_pdf has returned a valid PDF link or if user provides the PDF link.

    Args:
        filename (str): The name to save the PDF as (must end with .pdf).
        url (str): The direct URL to the PDF file.
        save_path (str): The directory to save the PDF into (default: "reports").
        overwrite (bool): Whether to overwrite the file if it already exists.

    Returns:
        str | None: The path to the saved file if successful, otherwise None.
    """
    try:
        # Validate the *filename* (it is what gets written to disk); the
        # original log message blamed the URL, which was misleading.
        if not filename.lower().endswith(".pdf"):
            logger.warning(f"Filename does not end with .pdf: {filename}")
            return None

        os.makedirs(save_path, exist_ok=True)
        full_path = os.path.join(save_path, filename)

        # Idempotent by default: reuse an existing download unless overwrite=True.
        if os.path.exists(full_path) and not overwrite:
            logger.info(f"File already exists, skipping download: {full_path}")
            return full_path

        logger.info(f"Starting download from {url}")

        # Use the response as a context manager so the underlying connection is
        # released even if writing fails; stream in chunks to bound memory use.
        with requests.get(url, stream=True, timeout=20) as response:
            response.raise_for_status()
            with open(full_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        file.write(chunk)

        logger.info(f"Successfully downloaded to: {full_path}")
        return full_path

    except requests.exceptions.Timeout:
        logger.error(f"Timeout while downloading {url}")
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error while downloading {url}: {http_err}")
    except requests.exceptions.RequestException as req_err:
        logger.error(f"Request error while downloading {url}: {req_err}")
    except Exception as e:
        logger.error(f"Unexpected error: {e}")

    return None
|
application/tools/web_search_tools.py
ADDED
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from dotenv import load_dotenv
|
3 |
+
from openai import OpenAI
|
4 |
+
from pydantic import BaseModel
|
5 |
+
from typing import List
|
6 |
+
from application.utils.logger import get_logger
|
7 |
+
from typing import Literal
|
8 |
+
from duckduckgo_search import DDGS
|
9 |
+
from tavily import TavilyClient
|
10 |
+
from langchain_core.tools import tool
|
11 |
+
|
12 |
+
logger = get_logger()
|
13 |
+
load_dotenv()
|
14 |
+
os.makedirs("reports", exist_ok=True)
|
15 |
+
|
16 |
+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
17 |
+
tavily_client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
|
18 |
+
client = OpenAI(api_key=OPENAI_API_KEY)
|
19 |
+
|
20 |
+
class CompanyListResponse(BaseModel):
    """Structured result of a "top companies" web search.

    Returned by get_top_companies_from_web so downstream agents receive a
    validated list rather than raw model text.
    """

    # Ordered list of company names parsed from the model's output.
    companies: List[str]
|
22 |
+
|
23 |
+
# parsed_list = ['Puma', 'Gap', 'PVH Corp.', 'GUESS', 'Hugo Boss']
|
24 |
+
|
25 |
+
@tool
def get_top_companies_from_web(query: str):
    """
    Searches the web for a list of top companies based on a given query.

    Extracts the number of companies from the query if specified; defaults to 5 otherwise.
    Returns only the specified number of company names in a list format.

    Args:
        query (str): The search query from the user.

    Returns:
        CompanyListResponse: A structured list of top company names.

    Raises:
        ValueError: If the web search fails or the model output cannot be
            parsed into a list of company names.
    """
    import ast  # local import: only needed here to parse the model's list literal

    prompt = (
        f"{query} "
        "focusing on globally recognized companies known for size, influence, or sustainability efforts. "
        "Respond with a Python list of company names only, no explanation. "
        "Example: ['Company A', 'Company B', 'Company C']. "
        "Please do not include any other text or formatting."
    )
    logger.info(f'User query : {query}')
    # Pre-bind so the except clause can safely reference it even when the
    # API call itself raises (previously `output` could be unbound there).
    output = None
    try:
        response = client.responses.create(
            model="gpt-4o-mini",
            tools=[{"type": "web_search_preview"}],
            input=prompt,
        )

        output = response.output_text
        # SECURITY: model output is untrusted text. ast.literal_eval only
        # accepts Python literals, unlike eval() which would execute
        # arbitrary code embedded in the response.
        parsed_list = ast.literal_eval(output.strip())
        if not isinstance(parsed_list, list):
            raise ValueError("Model output is not a list literal.")
        logger.info(f"Parsed List: {parsed_list}")
        result = CompanyListResponse(companies=[str(name) for name in parsed_list])
        return result
    except Exception as e:
        logger.error(f"Error parsing response: {e}")
        raise ValueError(f"Failed to parse company list: {output}")
|
64 |
+
|
65 |
+
@tool
def get_sustainability_report_pdf(
    company_name: str,
    year: int | None = None,
    max_results: int = 1,
    search_engine: Literal["tavily", "duckduckgo", "both"] = "duckduckgo",
) -> str | None:

    """
    Finds and returns the direct PDF link for the sustainability report of a SPECIFIC, NAMED company.
    Use this tool when the user provides the exact name of the company they want the report for.
    Optionally, a specific 'year' can be provided.

    Args:
        company_name (str): The name of the company.
        year (int, optional): The year of the sustainability report. Defaults to None.
        max_results (int, optional): Maximum number of fallback search results to fetch if using DuckDuckGo. Defaults to 1.
        search_engine (str, optional): Search engine to use. Defaults to "duckduckgo".
            - "tavily" : only use Tavily search
            - "duckduckgo" (default) : only use DuckDuckGo
            - "both" : try Tavily first, fallback to DuckDuckGo if needed

    Returns:
        str or None: The URL of the sustainability report PDF if found, otherwise None.

    Search Strategy:
        - Tavily: Searches with advanced search settings.
        - DuckDuckGo: Searches public web with 'filetype:pdf' filter.
        - Only URLs ending with '.pdf' are considered valid.

    Notes:
        - Any search failures are internally handled and logged.
    """

    def search_with_tavily(query: str) -> str | None:
        # Returns the first result URL that ends with ".pdf", or None.
        # Exceptions are swallowed and logged so the caller can fall back.
        try:
            logger.info(f"Searching Tavily for: {query}")
            result = tavily_client.search(query=query, search_depth="advanced",max_results=max_results)
            urls = [res["url"] for res in result.get("results", []) if res["url"].lower().endswith(".pdf")]
            if urls:
                logger.info(f"Found PDF via Tavily: {urls[0]}")
                return urls[0]
            logger.info("No PDF found via Tavily.")
        except Exception as e:
            logger.error(f"Tavily search error: {e}")
        return None

    def search_with_duckduckgo(query: str, max_results: int) -> str | None:
        # Scans up to max_results hits and returns the first ".pdf" link.
        # Non-PDF hits are logged and skipped; errors are logged, not raised.
        try:
            logger.info(f"Searching DuckDuckGo for: {query}")
            with DDGS() as ddgs:
                search_results = ddgs.text(query.strip(), max_results=max_results)
                for result in search_results:
                    pdf_url = result.get('href', '')
                    if pdf_url.lower().endswith('.pdf'):
                        logger.info(f"Found PDF via DuckDuckGo: {pdf_url}")
                        return pdf_url
                    else:
                        logger.info(f"Skipped non-PDF link: {pdf_url}")
        except Exception as error:
            logger.error(f"DuckDuckGo search error: {error}")
        return None

    # Compose search query; "filetype:pdf" biases results toward direct PDF links.
    query = f"{company_name} sustainability report filetype:pdf"
    if year:
        query += f" {year}"

    logger.info(f"Starting sustainability report search for '{company_name}', year={year}, using '{search_engine}' engine.")

    # Perform search according to engine selection
    if search_engine == "tavily":
        return search_with_tavily(query)

    elif search_engine == "duckduckgo":
        return search_with_duckduckgo(query, max_results=max_results)

    elif search_engine == "both":
        # Tavily first; DuckDuckGo only if Tavily found nothing.
        pdf_url = search_with_tavily(query)
        if not pdf_url:
            pdf_url = search_with_duckduckgo(query, max_results=max_results)
        return pdf_url

    else:
        logger.error(f"Invalid search engine option provided: {search_engine}")
        raise ValueError(f"Invalid search engine '{search_engine}'. Choose from 'tavily', 'duckduckgo', or 'both'.")
|
pages/chatbot.py
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from dotenv import load_dotenv
|
3 |
+
|
4 |
+
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
|
5 |
+
# from application.agents.scraper_agent import app
|
6 |
+
# from application.utils.logger import get_logger
|
7 |
+
|
8 |
+
try:
|
9 |
+
from application.agents.scraper_agent import app
|
10 |
+
# from application.main import graph
|
11 |
+
from application.utils.logger import get_logger
|
12 |
+
except ImportError as e:
|
13 |
+
st.error(f"Import Error: Ensure backend modules are accessible. Details: {e}")
|
14 |
+
st.stop()
|
15 |
+
|
16 |
+
logger = get_logger()
|
17 |
+
|
18 |
+
st.set_page_config(page_title="Sustainability AI Assistant", layout="wide")
|
19 |
+
st.title("♻️ Sustainability Report AI Assistant")
|
20 |
+
st.caption(
|
21 |
+
"Ask about sustainability reports by company or industry! "
|
22 |
+
"(e.g., 'Get report for Apple', 'Download report for Microsoft 2023', "
|
23 |
+
"'Find reports for top 3 airline companies', 'Download this pdf <link>')"
|
24 |
+
)
|
25 |
+
|
26 |
+
load_dotenv()
|
27 |
+
|
28 |
+
def initialize_chat_history():
    """Create the chat-history list in session state on first page load."""
    if "messages" in st.session_state:
        return
    st.session_state.messages = []
    logger.info("Initialized empty chat history in session state.")
|
33 |
+
|
34 |
+
def display_chat_history():
    """Render stored user/assistant messages; system messages stay hidden."""
    chat_roles = {HumanMessage: "user", AIMessage: "assistant"}
    for message in st.session_state.messages:
        if isinstance(message, SystemMessage):
            # System prompts are kept in history but never shown to the user.
            continue
        for message_type, role in chat_roles.items():
            if isinstance(message, message_type):
                with st.chat_message(role):
                    st.markdown(message.content)
                break
|
46 |
+
|
47 |
+
def invoke_agent():
    """Run the LangGraph agent on the session messages; return its final state or None."""
    try:
        payload = {"messages": st.session_state.messages}
        logger.info("Invoking LangGraph agent...")
        final_state = app.invoke(payload, {"recursion_limit": 15})
    except Exception as e:
        logger.error("Agent invocation failed.", exc_info=True)
        st.error(f"An error occurred while processing your request: {e}")
        return None
    else:
        logger.info("Agent invocation completed successfully.")
        return final_state
|
64 |
+
|
65 |
+
def display_last_ai_response():
    """Show the most recent AIMessage in history, warning if none exists."""
    for message in reversed(st.session_state.messages):
        if isinstance(message, AIMessage):
            with st.chat_message("assistant"):
                st.markdown(message.content)
            logger.info("Displayed latest AI response.")
            return
    st.warning("Agent completed without a final AI message.")
    logger.warning("No AI message found in the final output.")
|
78 |
+
|
79 |
+
initialize_chat_history()
|
80 |
+
display_chat_history()
|
81 |
+
|
82 |
+
if user_query := st.chat_input("Your question about sustainability reports..."):
|
83 |
+
logger.info(f"User input received: {user_query}")
|
84 |
+
|
85 |
+
st.session_state.messages.append(HumanMessage(content=user_query))
|
86 |
+
|
87 |
+
with st.chat_message("user"):
|
88 |
+
st.markdown(user_query)
|
89 |
+
|
90 |
+
with st.spinner("Processing your request... Please wait."):
|
91 |
+
final_output_state = invoke_agent()
|
92 |
+
|
93 |
+
if final_output_state:
|
94 |
+
st.session_state.messages = final_output_state['messages']
|
95 |
+
display_last_ai_response()
|
96 |
+
|
97 |
+
with st.sidebar:
|
98 |
+
st.markdown("---")
|
99 |
+
if st.button("Clear Chat History"):
|
100 |
+
st.session_state.messages = []
|
101 |
+
logger.info("Chat history cleared by user.")
|
102 |
+
st.rerun()
|
pages/multiple_pdf_extractor.py
CHANGED
@@ -24,12 +24,12 @@ AVAILABLE_MODELS = [
|
|
24 |
|
25 |
RESPONSE_SCHEMAS = {
|
26 |
"Greenhouse Gas (GHG) Protocol Parameters": GEMINI_GHG_PARAMETERS,
|
27 |
-
"Environmental Parameters (CSRD)": GEMINI_ENVIRONMENTAL_PARAMETERS_CSRD,
|
28 |
-
"Environmental Parameters": GEMINI_ENVIRONMENT_PARAMETERS,
|
29 |
-
"Social Parameters": GEMINI_SOCIAL_PARAMETERS,
|
30 |
-
"Governance Parameters": GEMINI_GOVERNANCE_PARAMETERS,
|
31 |
-
"Materiality Parameters": GEMINI_MATERIALITY_PARAMETERS,
|
32 |
-
"Net Zero Intervention Parameters": GEMINI_NET_ZERO_INTERVENTION_PARAMETERS,
|
33 |
}
|
34 |
|
35 |
selected_model = st.selectbox("Select Gemini Model", options=AVAILABLE_MODELS)
|
|
|
24 |
|
25 |
RESPONSE_SCHEMAS = {
|
26 |
"Greenhouse Gas (GHG) Protocol Parameters": GEMINI_GHG_PARAMETERS,
|
27 |
+
# "Environmental Parameters (CSRD)": GEMINI_ENVIRONMENTAL_PARAMETERS_CSRD,
|
28 |
+
# "Environmental Parameters": GEMINI_ENVIRONMENT_PARAMETERS,
|
29 |
+
# "Social Parameters": GEMINI_SOCIAL_PARAMETERS,
|
30 |
+
# "Governance Parameters": GEMINI_GOVERNANCE_PARAMETERS,
|
31 |
+
# "Materiality Parameters": GEMINI_MATERIALITY_PARAMETERS,
|
32 |
+
# "Net Zero Intervention Parameters": GEMINI_NET_ZERO_INTERVENTION_PARAMETERS,
|
33 |
}
|
34 |
|
35 |
selected_model = st.selectbox("Select Gemini Model", options=AVAILABLE_MODELS)
|
requirements.txt
CHANGED
@@ -6,4 +6,11 @@ google.genai
|
|
6 |
google-generativeai
|
7 |
pandas
|
8 |
supabase
|
9 |
-
openpyxl
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
google-generativeai
|
7 |
pandas
|
8 |
supabase
|
9 |
+
openpyxl
|
10 |
+
langchain
|
11 |
+
pymongo
|
12 |
+
langgraph
|
13 |
+
langsmith
|
14 |
+
tavily-python
|
15 |
+
duckduckgo-search
|
16 |
+
langchain_openai
|