Vela committed
Commit 172e21d · 1 Parent(s): 22481bd

added agentic framework

application/agents/extractor_agent.py ADDED
@@ -0,0 +1,35 @@
+ from dotenv import load_dotenv
+ from langchain_openai import ChatOpenAI
+
+ from application.tools.emission_data_extractor import extract_emission_data_as_json
+ from application.services.langgraph_service import create_agent
+ from application.utils.logger import get_logger
+
+ load_dotenv()
+ logger = get_logger()
+
+ EXTRACTOR_SYSTEM_PROMPT = """
+ You are an intelligent assistant specialized in extracting emission-related ESG (Environmental, Social, and Governance) data from PDF documents.
+
+ You have access to the following tool:
+ - **extract_emission_data_as_json**: Use this tool to upload a PDF and extract structured emission-related information as a JSON response.
+
+ Instructions:
+ - Your task is to extract only emission-related ESG data, such as carbon emissions, Scope 1, Scope 2, Scope 3 emissions, and other relevant sustainability metrics.
+ - Always attempt to return structured JSON data if possible.
+ - If structured data cannot be extracted cleanly, ensure that the raw response from the document is returned under a "raw_response" field.
+ - Do not make assumptions or hallucinate missing values; extract only what is explicitly present in the document.
+ - Always prioritize extracting the latest, most clearly defined data from the PDF.
+ - Do not summarize, analyze, or interpret the document; your only role is **accurate data extraction**.
+
+ Goal:
+ - Accurately upload the PDF.
+ - Extract the requested emission-related ESG data in a clean JSON format.
+ - Handle edge cases gracefully (e.g., invalid PDFs, no emission data found).
+
+ Behave like a highly precise and reliable data extraction engine.
+ """
+
+ llm = ChatOpenAI(model='gpt-4o-mini', temperature=0)
+
+ extractor_agent = create_agent(llm, [extract_emission_data_as_json], EXTRACTOR_SYSTEM_PROMPT)
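For orientation, a minimal usage sketch (the file path is hypothetical; assumes the `application` package is importable and `OPENAI_API_KEY` is set). The executor returned by `create_agent` takes a `messages` list and returns a dict with an `output` key:

```python
# Minimal sketch: invoke the extractor agent on a local report (hypothetical path).
from langchain_core.messages import HumanMessage
from application.agents.extractor_agent import extractor_agent

result = extractor_agent.invoke(
    {"messages": [HumanMessage(content="Extract emission data from reports/acme-esg-2023.pdf")]}
)
print(result["output"])  # AgentExecutor returns a dict with an "output" key
```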
application/agents/scraper_agent.py ADDED
@@ -0,0 +1,214 @@
+ import os
+ import json
+ from typing import TypedDict, Annotated, List
+ from dotenv import load_dotenv
+ from langchain_core.messages import ToolMessage, HumanMessage
+ from langchain_openai import ChatOpenAI
+ from langgraph.graph import StateGraph, END
+ from langgraph.graph.message import add_messages
+
+ # Local Imports
+ from application.tools.web_search_tools import get_top_companies_from_web, get_sustainability_report_pdf
+ from application.tools.pdf_downloader_tool import download_pdf
+ from application.tools.emission_data_extractor import extract_emission_data_as_json
+ from application.services.langgraph_service import create_agent
+ from application.utils.logger import get_logger
+
+ # Set up environment and logger
+ load_dotenv()
+ logger = get_logger()
+
+ # LangSmith tracing
+ LANGSMITH_API_KEY = os.getenv('LANGSMITH_API_KEY')
+ os.environ['LANGSMITH_API_KEY'] = LANGSMITH_API_KEY
+ os.environ['LANGCHAIN_TRACING_V2'] = 'true'
+ os.environ["LANGCHAIN_PROJECT"] = "Sustainability_AI"
+
+ # OpenAI API key setup (assumes the key is already present in the environment)
+ os.environ['OPENAI_API_KEY'] = os.environ.get("OPENAI_API_KEY")
+
+
+ class AgentState(TypedDict):
+     messages: Annotated[List, add_messages]
+
+ model = ChatOpenAI(model='gpt-4o-mini', temperature=0)
+ tools = [get_top_companies_from_web, get_sustainability_report_pdf, download_pdf, extract_emission_data_as_json]
+ model_with_tools = model.bind_tools(tools)
+
+ def invoke_model(state: AgentState) -> dict:
+     """Invokes the LLM with the current conversation history."""
+     logger.info("--- Invoking Model ---")
+     # LangGraph automatically passes the entire state;
+     # model_with_tools expects a list of BaseMessages.
+     response = model_with_tools.invoke(state['messages'])
+     # logger.info(f"Model response: {response}")
+     # Return a dictionary keyed by the state field name; the response is already an AIMessage.
+     return {"messages": [response]}
+
+ def invoke_tools(state: AgentState) -> dict:
+     """Invokes the necessary tools based on the last AI message."""
+     logger.info("--- Invoking Tools ---")
+     # The state contains the history; the last message is the AI's request.
+     last_message = state['messages'][-1]
+
+     # Check if the last message is an AIMessage with tool_calls.
+     if not hasattr(last_message, 'tool_calls') or not last_message.tool_calls:
+         logger.info("No tool calls found in the last message.")
+         # This scenario might indicate the conversation should end or requires clarification.
+         # For now, return an empty dict, which won't update the state significantly.
+         return {}
+         # Alternative: return {"messages": [SystemMessage(content="No tool calls requested.")]}
+
+     tool_invocation_messages = []
+
+     # Find the tool object by name
+     tool_map = {tool.name: tool for tool in tools}
+
+     for tool_call in last_message.tool_calls:
+         tool_name = tool_call['name']
+         tool_args = tool_call['args']
+         tool_call_id = tool_call['id']  # Crucial for linking the result
+
+         logger.info(f"Executing tool: {tool_name} with args: {tool_args}")
+
+         if tool_name in tool_map:
+             selected_tool = tool_map[tool_name]
+             try:
+                 # Use the tool's invoke method, passing the arguments dictionary.
+                 result = selected_tool.invoke(tool_args)
+
+                 # IMPORTANT: Convert the result to a string or a JSON-serializable format
+                 # if it's a complex object; ToolMessage content should be simple.
+                 # Adjust this based on what your tools actually return.
+                 if isinstance(result, (list, dict)):
+                     result_content = json.dumps(result)  # Convert dict/list to JSON string
+                 elif hasattr(result, 'companies') and isinstance(result.companies, list):  # Handle CompanyListResponse
+                     result_content = f"Companies found: {', '.join(result.companies)}"
+                 elif result is None:
+                     result_content = "Tool executed successfully, but returned no specific data (None)."
+                 else:
+                     result_content = str(result)  # Default to string conversion
+
+                 logger.info(f"Tool {tool_name} result: {result_content}")
+                 tool_invocation_messages.append(
+                     ToolMessage(content=result_content, tool_call_id=tool_call_id)
+                 )
+             except Exception as e:
+                 logger.error(f"Error executing tool {tool_name}: {e}")
+                 # Return an error message in the ToolMessage.
+                 tool_invocation_messages.append(
+                     ToolMessage(content=f"Error executing tool {tool_name}: {str(e)}", tool_call_id=tool_call_id)
+                 )
+         else:
+             logger.warning(f"Tool '{tool_name}' not found.")
+             tool_invocation_messages.append(
+                 ToolMessage(content=f"Error: Tool '{tool_name}' not found.", tool_call_id=tool_call_id)
+             )
+
+     # Return the collected ToolMessages to be added to the state.
+     return {"messages": tool_invocation_messages}
+
+ # --- Graph Definition ---
+ graph_builder = StateGraph(AgentState)
+
+ # Add nodes
+ graph_builder.add_node("scraper_agent", invoke_model)
+ graph_builder.add_node("tools", invoke_tools)
+
+ # Define edges
+ graph_builder.set_entry_point("scraper_agent")
+
+ # Conditional edge: after the agent runs, decide whether to call tools or end.
+ def router(state: AgentState) -> str:
+     """Determines the next step based on the last message."""
+     last_message = state['messages'][-1]
+     if hasattr(last_message, 'tool_calls') and last_message.tool_calls:
+         # If the AI message has tool calls, invoke the tools node.
+         logger.info("--- Routing to Tools ---")
+         return "tools"
+     else:
+         # Otherwise, the conversation can end.
+         logger.info("--- Routing to End ---")
+         return END
+
+ graph_builder.add_conditional_edges(
+     "scraper_agent",
+     router,
+     {
+         "tools": "tools",  # If router returns "tools", go to the "tools" node
+         END: END,          # If router returns END, finish the graph execution
+     }
+ )
+
+ # After tools are invoked, their results (ToolMessages) go back to the agent.
+ graph_builder.add_edge("tools", "scraper_agent")
+
+ # Compile the graph
+ app = graph_builder.compile()
+
+ # # --- Running the Graph ---
+ # if __name__ == "__main__":
+ #     logger.info("Starting graph execution...")
+ #     # Use HumanMessage for the initial input
+ #     initial_input = {"messages": [HumanMessage(content="Please download this pdf https://www.infosys.com/sustainability/documents/infosys-esg-report-2023-24.pdf")]}
+
+ #     # Stream events to see the flow (optional, but helpful for debugging).
+ #     # Add a recursion limit to prevent infinite loops.
+ #     try:
+ #         final_state = None
+ #         for event in app.stream(initial_input, {"recursion_limit": 15}):
+ #             # event is a dictionary where keys are node names and values are outputs
+ #             logger.info(f"Event: {event}")
+ #             # Keep track of the latest state if needed, especially the messages
+ #             if "scraper_agent" in event:
+ #                 final_state = event["scraper_agent"]
+ #             elif "tools" in event:
+ #                 final_state = event["tools"]  # Though tool output doesn't directly give full state
+ #             logger.info("---")
+
+ #         logger.info("\n--- Final State Messages ---")
+ #         # To get the absolute final state after streaming, invoke might be simpler,
+ #         # or you need to properly aggregate the state from the stream events.
+ #         # A simpler way to get the final output:
+ #         final_output = app.invoke(initial_input, {"recursion_limit": 15})
+ #         logger.info(json.dumps(final_output['messages'][-1].dict(), indent=2))  # Print the last message
+
+ #     except Exception as e:
+ #         logger.error("\n--- An error occurred during graph execution ---")
+ #         import traceback
+ #         traceback.print_exc()
+
+
+ SCRAPER_SYSTEM_PROMPT = """
+ You are an intelligent assistant specialized in company research and sustainability report retrieval.
+
+ You have access to the following tools:
+ - **search_tool**: Use this tool when the user asks for a list of top companies related to an industry or category (e.g., "top 5 textile companies"). Always preserve any number mentioned (e.g., 'top 5', 'top 10') in the query.
+ - **pdf_finder_tool**: Use this tool when the user requests a sustainability report or any other specific PDF document about a company. Search specifically for the latest sustainability report if not otherwise specified.
+ - **pdf_downloader_tool**: Use this tool when the user provides a direct PDF link or asks you to download a PDF document from a URL.
+
+ Instructions:
+ - Carefully read the user's request and select the correct tool based on their intent.
+ - Always preserve important details like quantity (e.g., "top 5"), industry, or company name.
+ - If the user mentions multiple companies and asks for reports, find reports for **each** company individually.
+ - Do not add assumptions, opinions, or unrelated information.
+ - Always generate clean, direct, and minimal input for the tool — close to the user's original query.
+ - Prioritize the most recent information when searching for reports unless otherwise instructed.
+
+ Goal:
+ - Select the appropriate tool.
+ - Build a precise query that perfectly reflects the user's request.
+ - Return only what the user asks — no extra text or interpretation.
+ """
+
+ search_tool = get_top_companies_from_web
+ pdf_finder_tool = get_sustainability_report_pdf
+ pdf_downloader_tool = download_pdf
+
+ llm = ChatOpenAI(model='gpt-4o-mini', temperature=0)
+
+ scraper_agent = create_agent(llm, [search_tool, pdf_finder_tool, pdf_downloader_tool], SCRAPER_SYSTEM_PROMPT)
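A minimal invocation sketch for the compiled graph, mirroring the commented `__main__` block above (the query is illustrative):

```python
# Minimal sketch: drive the compiled LangGraph app with one user message.
from langchain_core.messages import HumanMessage
from application.agents.scraper_agent import app

state = app.invoke(
    {"messages": [HumanMessage(content="Find the latest sustainability report for Puma")]},
    {"recursion_limit": 15},  # guards against endless model <-> tools loops
)
print(state["messages"][-1].content)  # final AI reply after any tool round-trips
```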
application/services/gemini_model.py CHANGED
@@ -10,22 +10,57 @@ logger=logger.get_logger()
 
 client = genai.Client(api_key=os.getenv("gemini_api_key"))
 
- PROMPT = (
- """You are a PDF parsing agent. Your job is to extract GHG Protocol Parameters
- and ESG (Environmental, Social, Governance) Data from a company’s sustainability
- or ESG report in PDF format.
-
- You must extract the data based on a predefined response schema. It is critical
- that you return all keys specified in the schema, even if the value is not present
- or not found in the document. If a value is missing or unavailable, return a suitable
- placeholder according to the format used
- in the schema.
-
- Your output should strictly follow the structure of the schema, ensuring completeness
- and consistency for downstream processing.
-
- Be precise in extracting values and identifying relevant context from the PDF. Use
- surrounding text or tables to identify the most likely match for each field.
+ # PROMPT = (
+ # """You are a PDF parsing agent. Your job is to extract GHG Protocol Parameters
+ # and ESG (Environmental, Social, Governance) Data from a company’s sustainability
+ # or ESG report in PDF format.
+
+ # You must extract the data based on a predefined response schema. It is critical
+ # that you return all keys specified in the schema, even if the value is not present
+ # or not found in the document. If a value is missing or unavailable, return a suitable
+ # placeholder according to the format used
+ # in the schema.
+
+ # Your output should strictly follow the structure of the schema, ensuring completeness
+ # and consistency for downstream processing.
+
+ # Be precise in extracting values and identifying relevant context from the PDF. Use
+ # surrounding text or tables to identify the most likely match for each field.
+ # """
+ # )
+
+ PROMPT = (
+ """You are a PDF parsing agent specialized in extracting structured sustainability data from a company's Sustainability, ESG, or Corporate Responsibility Report in PDF format.
+ Your task is to extract Greenhouse Gas (GHG) Protocol, Environmental (CSRD), Materiality, Net Zero Interventions, and ESG (Environmental, Social, Governance) Data with high accuracy and consistency for downstream processing.
+
+ ### Instructions:
+ 1. **Schema Adherence**: Strictly follow the provided schema for output structure. Ensure every field in the schema is populated with either extracted data or a placeholder.
+ 2. **Data Sources**: Extract data from all relevant sections of the PDF, including:
+    - Narrative text
+    - Tables
+    - Infographics, charts, or visual elements (interpret labels, captions, or legends to extract numerical or textual data)
+    - Footnotes or appendices
+ 3. **Infographic Handling**: For infographics, prioritize:
+    - Text labels or annotations within the graphic
+    - Captions or descriptions near the infographic
+    - Legends or keys that clarify values
+    - If values are ambiguous, cross-reference with narrative text or tables discussing similar metrics.
+ 4. **Year and Scope**: Identify the reporting year and scope (e.g., global, regional) for each metric. If not explicitly stated, infer from the report's context (e.g., '2023 Sustainability Report' implies 2023 data).
+ 5. **Edge Cases**:
+    - If data is missing, use placeholders as specified in the schema.
+    - If multiple values exist for a field (e.g., emissions for different years), select the most recent year unless otherwise specified in the schema.
+
+ ### Output Requirements:
+ - Return a JSON object adhering to the schema.
+ - Ensure all fields are populated, using placeholders for missing data.
+ - Include a 'notes' field in the output for any assumptions, estimations, or conflicts encountered during extraction.
+
+ ### Task:
+ - Parse the PDF thoroughly to extract all relevant data.
+ - Ensure consistency in units, years, and terminology across the output.
+ - Handle infographics with care, prioritizing textual data and flagging estimates.
+ - Provide a complete, schema-compliant JSON output with notes for any ambiguities or assumptions.
 """
 )
@@ -176,6 +211,7 @@ def extract_emissions_data_as_json(
         config={
             'response_mime_type': 'application/json',
             'response_schema': response_schema,
+            'temperature': 0.0,
         },
     )
     if hasattr(response, 'usage_metadata'):
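The second hunk pins `temperature` to 0.0 so the schema-constrained extraction is deterministic: repeated runs over the same PDF yield stable JSON for downstream comparison and storage.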
application/services/langgraph_service.py ADDED
@@ -0,0 +1,22 @@
+ from typing import List
+ from langchain.agents import AgentExecutor, create_openai_tools_agent
+ from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
+ from langchain_core.tools import BaseTool
+ from langchain_openai import ChatOpenAI
+
+ def create_agent(
+     llm: ChatOpenAI,
+     tools: List[BaseTool],
+     system_prompt: str
+ ) -> AgentExecutor:
+     """Create an agent executor with given tools and a system prompt."""
+     prompt = ChatPromptTemplate.from_messages(
+         [
+             ("system", system_prompt),
+             MessagesPlaceholder(variable_name="messages"),
+             MessagesPlaceholder(variable_name="agent_scratchpad"),
+         ]
+     )
+     agent = create_openai_tools_agent(llm, tools, prompt)
+     executor = AgentExecutor(agent=agent, tools=tools)
+     return executor
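A minimal wiring sketch (the `echo_tool` below is hypothetical). Because the prompt uses a `messages` placeholder, the executor is invoked with a `messages` list rather than a plain `input` string:

```python
# Minimal sketch: build and run an executor with a hypothetical tool.
from langchain_core.messages import HumanMessage
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI

@tool
def echo_tool(text: str) -> str:
    """Echoes the input text back."""
    return text

llm = ChatOpenAI(model='gpt-4o-mini', temperature=0)
executor = create_agent(llm, [echo_tool], "You are a helpful assistant.")
result = executor.invoke({"messages": [HumanMessage(content="Say hello.")]})
print(result["output"])  # AgentExecutor returns a dict with an "output" key
```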
application/services/mongo_db_service.py ADDED
@@ -0,0 +1,87 @@
+ import os
+ from pymongo import MongoClient
+ from dotenv import load_dotenv
+ from typing import List, Dict, Optional
+ from application.utils.logger import get_logger
+
+ logger = get_logger()
+
+ load_dotenv()
+
+ DB_NAME = "sustainability_reports_db"
+
+ def get_mongo_client():
+     try:
+         client = MongoClient(os.getenv("MONGODB_URI"))
+         return client
+     except Exception as e:
+         logger.exception(f"An unexpected error occurred while connecting to MongoDB: {str(e)}")
+         return None
+
+ def store_document(collection_name: str, document: Dict) -> Optional[str]:
+     """
+     Stores a document in MongoDB if it doesn't already exist.
+
+     Args:
+         collection_name (str): Name of the MongoDB collection.
+         document (Dict): The document to be inserted.
+
+     Returns:
+         Optional[str]: Inserted document ID if successful, None otherwise.
+     """
+     try:
+         client = get_mongo_client()
+         if client is None:
+             logger.error("MongoDB client is not available.")
+             return None
+
+         db = client.get_database(DB_NAME)
+         collection = db[collection_name]
+
+         # Check if a similar document already exists
+         existing_document = collection.find_one(document)
+         if existing_document:
+             logger.info(f"Document already exists with ID: {existing_document['_id']}")
+             return str(existing_document['_id'])
+
+         # If no existing document, insert the new one
+         result = collection.insert_one(document)
+         logger.info(f"New document inserted with ID: {result.inserted_id}")
+         return str(result.inserted_id)
+
+     except Exception as e:
+         logger.exception(f"An unexpected error occurred: {str(e)}")
+         return None
+
+ def retrieve_documents(collection_name: str, query: Optional[Dict] = None) -> List[Dict]:
+     """
+     Retrieves documents from the specified MongoDB collection.
+
+     Args:
+         collection_name (str): Name of the MongoDB collection.
+         query (Optional[Dict]): A MongoDB query filter. Defaults to {} (fetch all documents).
+
+     Returns:
+         List[Dict]: A list of documents matching the query. Empty list if none found or error occurs.
+     """
+     try:
+         client = get_mongo_client()
+         if client is None:
+             logger.error("MongoDB client is not available.")
+             return []
+
+         db = client.get_database(DB_NAME)
+         collection = db[collection_name]
+
+         documents_cursor = collection.find(query or {})
+         documents = list(documents_cursor)
+
+         logger.info(f"Retrieved {len(documents)} documents from collection: {collection_name}")
+         return documents
+
+     except Exception as e:
+         logger.exception(f"An error occurred while retrieving documents: {str(e)}")
+         return []
+
+ # all_docs = retrieve_documents("Zalando")
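A minimal usage sketch (values hypothetical), following the commented `retrieve_documents("Zalando")` example above. Because `store_document` deduplicates via `find_one(document)`, re-inserting an identical document returns the existing `_id` instead of creating a duplicate:

```python
# Minimal sketch: one collection per company, hypothetical payload.
doc_id = store_document(
    "Zalando",
    {"Greenhouse Gas (GHG) Protocol Parameters": {"Scope 1 Emissions": "12,345 tCO2e"}},
)
docs = retrieve_documents("Zalando")
print(doc_id, len(docs))
```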
application/tools/emission_data_extractor.py ADDED
@@ -0,0 +1,115 @@
+ import os
+ import json
+ from typing import Optional, Dict, Union, BinaryIO
+ from google import genai
+ from langchain_core.tools import tool
+ from application.utils.logger import get_logger
+ from application.services.gemini_model import upload_file
+ from application.services.mongo_db_service import store_document
+ from application.schemas.response_schema import GEMINI_GHG_PARAMETERS
+
+ logger = get_logger()
+
+ client = genai.Client(api_key=os.getenv("gemini_api_key"))
+ MODEL = "gemini-2.0-flash"
+
+
+ PROMPT = (
+ """You are a PDF parsing agent specialized in extracting structured sustainability data from a company's Sustainability, ESG, or Corporate Responsibility Report in PDF format.
+ Your task is to extract Greenhouse Gas (GHG) Protocol, Environmental (CSRD), Materiality, Net Zero Interventions, and ESG (Environmental, Social, Governance) Data with high accuracy and consistency for downstream processing.
+
+ ### Instructions:
+ 1. **Schema Adherence**: Strictly follow the provided schema for output structure. Ensure every field in the schema is populated with either extracted data or a placeholder.
+ 2. **Data Sources**: Extract data from all relevant sections of the PDF, including:
+    - Narrative text
+    - Tables
+    - Infographics, charts, or visual elements (interpret labels, captions, or legends to extract numerical or textual data)
+    - Footnotes or appendices
+ 3. **Infographic Handling**: For infographics, prioritize:
+    - Text labels or annotations within the graphic
+    - Captions or descriptions near the infographic
+    - Legends or keys that clarify values
+    - If values are ambiguous, cross-reference with narrative text or tables discussing similar metrics.
+ 4. **Year and Scope**: Identify the reporting year and scope (e.g., global, regional) for each metric. If not explicitly stated, infer from the report's context (e.g., '2023 Sustainability Report' implies 2023 data).
+ 5. **Edge Cases**:
+    - If data is missing, use placeholders as specified in the schema.
+    - If multiple values exist for a field (e.g., emissions for different years), select the most recent year unless otherwise specified in the schema.
+
+ ### Output Requirements:
+ - Return a JSON object adhering to the schema.
+ - Ensure all fields are populated, using placeholders for missing data.
+ - Include a 'notes' field in the output for any assumptions, estimations, or conflicts encountered during extraction.
+
+ ### Task:
+ - Parse the PDF thoroughly to extract all relevant data.
+ - Ensure consistency in units, years, and terminology across the output.
+ - Handle infographics with care, prioritizing textual data and flagging estimates.
+ - Provide a complete, schema-compliant JSON output with notes for any ambiguities or assumptions.
+ """
+ )
+
+ @tool
+ def extract_emission_data_as_json(file_input: Union[BinaryIO, bytes, str]) -> Optional[Dict]:
+     """
+     Extracts emission-related ESG data from a PDF file using the Gemini API.
+
+     This function uploads the provided PDF (local file path, binary file, or byte stream) to Gemini,
+     sends a structured prompt to extract relevant emission data, and attempts to parse the response as JSON.
+
+     Args:
+         file_input (Union[BinaryIO, bytes, str]):
+             The input file to process. Can be a file object, byte stream, or local file path.
+
+     Returns:
+         Optional[Dict]:
+             A dictionary containing the extracted emission data if parsing succeeds,
+             or a dictionary with the raw text response if JSON parsing fails.
+             Returns None if the extraction process encounters an error.
+
+     Raises:
+         Exception:
+             Logs and handles any unexpected errors during file upload, Gemini API interaction, or response parsing.
+
+     Notes:
+         - The function automatically handles uploading if the file is not already present on Gemini.
+         - If the response is not valid JSON, the raw response text is returned under the key "raw_response".
+         - Token usage information (input, output, total tokens) is logged if available.
+     """
+     try:
+         uploaded_file = upload_file(file=file_input)
+
+         response = client.models.generate_content(
+             model=MODEL,
+             contents=[uploaded_file, PROMPT],
+             config={
+                 'response_mime_type': 'application/json',
+                 'response_schema': GEMINI_GHG_PARAMETERS,
+                 'temperature': 0.0,
+             },
+         )
+         if hasattr(response, 'usage_metadata'):
+             logger.info(f"Input tokens: {response.usage_metadata.prompt_token_count}")
+             logger.info(f"Output tokens: {response.usage_metadata.candidates_token_count}")
+             logger.info(f"Total tokens: {response.usage_metadata.total_token_count}")
+         else:
+             logger.info("Token usage metadata not available in response")
+
+         logger.info("[Gemini] Response received.")
+         try:
+             result = json.loads(response.text)
+             file_name = result.get('Company Name', 'Unknown Company')
+             document = {"Greenhouse Gas (GHG) Protocol Parameters": result.get('Greenhouse Gas (GHG) Protocol Parameters')}
+             store_document(file_name, document)
+             return result
+         except json.JSONDecodeError:
+             logger.warning("Failed to parse JSON, returning raw response.")
+             return {"raw_response": response.text}
+
+     except Exception:
+         logger.exception("Error during ESG data extraction.")
+         return None
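A minimal usage sketch (the path is hypothetical). Because `extract_emission_data_as_json` is a LangChain tool, it is called with `.invoke()` and an argument dict rather than as a plain function:

```python
# Minimal sketch: call the tool directly on a local PDF (hypothetical path).
result = extract_emission_data_as_json.invoke({"file_input": "reports/acme-esg-2023.pdf"})
if result is None:
    print("Extraction failed; see logs.")
elif "raw_response" in result:
    print("Gemini returned non-JSON output.")
else:
    print(result.get("Company Name", "Unknown Company"))
```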
application/tools/pdf_downloader_tool.py ADDED
@@ -0,0 +1,63 @@
+ import os
+ import requests
+ from application.utils.logger import get_logger
+
+ from langchain_core.tools import tool
+
+
+ logger = get_logger()
+
+ @tool
+ def download_pdf(filename: str, url: str, save_path: str = "reports", overwrite: bool = False):
+     """
+     Downloads a PDF file from a given URL ('url') and saves it locally
+     with the specified 'filename'. Returns the local path if successful, otherwise None.
+     Use this tool AFTER get_sustainability_report_pdf has returned a valid PDF link, or if the user provides the PDF link.
+
+     Args:
+         filename (str): The name to save the PDF as (should end with .pdf).
+         url (str): The direct URL to the PDF file.
+         save_path (str): The directory to save the PDF into (default: "reports").
+         overwrite (bool): Whether to overwrite the file if it already exists.
+
+     Returns:
+         str | None: The path to the saved file if successful, otherwise None.
+     """
+     try:
+         # parsed_url = urlparse(url)
+         # filename = os.path.basename(parsed_url.path)
+
+         if not filename.lower().endswith(".pdf"):
+             logger.warning(f"Filename does not end with .pdf: {filename}")
+             return None
+
+         os.makedirs(save_path, exist_ok=True)
+         full_path = os.path.join(save_path, filename)
+
+         if os.path.exists(full_path) and not overwrite:
+             logger.info(f"File already exists, skipping download: {full_path}")
+             return full_path
+
+         logger.info(f"Starting download from {url}")
+
+         response = requests.get(url, stream=True, timeout=20)
+         response.raise_for_status()
+
+         with open(full_path, "wb") as file:
+             for chunk in response.iter_content(chunk_size=8192):
+                 if chunk:
+                     file.write(chunk)
+
+         logger.info(f"Successfully downloaded to: {full_path}")
+         return full_path
+
+     except requests.exceptions.Timeout:
+         logger.error(f"Timeout while downloading {url}")
+     except requests.exceptions.HTTPError as http_err:
+         logger.error(f"HTTP error while downloading {url}: {http_err}")
+     except requests.exceptions.RequestException as req_err:
+         logger.error(f"Request error while downloading {url}: {req_err}")
+     except Exception as e:
+         logger.error(f"Unexpected error: {e}")
+
+     return None
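A minimal usage sketch (the URL is hypothetical):

```python
# Minimal sketch: invoked as a LangChain tool with a dict of arguments.
path = download_pdf.invoke({
    "filename": "acme-sustainability-2023.pdf",
    "url": "https://example.com/acme-sustainability-2023.pdf",  # hypothetical URL
})
print(path)  # e.g. "reports/acme-sustainability-2023.pdf", or None on failure
```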
application/tools/web_search_tools.py ADDED
@@ -0,0 +1,150 @@
+ import os
+ import ast
+ from dotenv import load_dotenv
+ from openai import OpenAI
+ from pydantic import BaseModel
+ from typing import List, Literal
+ from application.utils.logger import get_logger
+ from duckduckgo_search import DDGS
+ from tavily import TavilyClient
+ from langchain_core.tools import tool
+
+ logger = get_logger()
+ load_dotenv()
+ os.makedirs("reports", exist_ok=True)
+
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+ tavily_client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
+ client = OpenAI(api_key=OPENAI_API_KEY)
+
+ class CompanyListResponse(BaseModel):
+     companies: List[str]
+
+ # parsed_list = ['Puma', 'Gap', 'PVH Corp.', 'GUESS', 'Hugo Boss']
+
+ @tool
+ def get_top_companies_from_web(query: str):
+     """
+     Searches the web for a list of top companies based on a given query.
+
+     Extracts the number of companies from the query if specified; defaults to 5 otherwise.
+     Returns only the specified number of company names in a list format.
+
+     Args:
+         query (str): The search query from the user.
+
+     Returns:
+         CompanyListResponse: A structured list of top company names.
+     """
+     prompt = (
+         f"{query} "
+         "focusing on globally recognized companies known for size, influence, or sustainability efforts. "
+         "Respond with a Python list of company names only, no explanation. "
+         "Example: ['Company A', 'Company B', 'Company C']. "
+         "Please do not include any other text or formatting."
+     )
+     logger.info(f'User query : {query}')
+     output = ""
+     try:
+         response = client.responses.create(
+             model="gpt-4o-mini",
+             tools=[{"type": "web_search_preview"}],
+             input=prompt,
+         )
+
+         output = response.output_text
+         # logger.info(f"Raw Output: {output}")
+         # ast.literal_eval safely parses the Python-list literal (avoids eval on model output).
+         parsed_list = ast.literal_eval(output.strip())
+         logger.info(f"Parsed List: {parsed_list}")
+         result = CompanyListResponse(companies=parsed_list)
+         return result
+     except Exception as e:
+         logger.error(f"Error parsing response: {e}")
+         raise ValueError(f"Failed to parse company list: {output}")
+
+ @tool
+ def get_sustainability_report_pdf(
+     company_name: str,
+     year: int | None = None,
+     max_results: int = 1,
+     search_engine: Literal["tavily", "duckduckgo", "both"] = "duckduckgo",
+ ) -> str | None:
+     """
+     Finds and returns the direct PDF link for the sustainability report of a SPECIFIC, NAMED company.
+     Use this tool when the user provides the exact name of the company they want the report for.
+     Optionally, a specific 'year' can be provided.
+
+     Args:
+         company_name (str): The name of the company.
+         year (int, optional): The year of the sustainability report. Defaults to None.
+         max_results (int, optional): Maximum number of fallback search results to fetch if using DuckDuckGo. Defaults to 1.
+         search_engine (str, optional): Search engine to use. Defaults to "duckduckgo".
+             - "tavily"     : only use Tavily search
+             - "duckduckgo" : only use DuckDuckGo
+             - "both"       : try Tavily first, fall back to DuckDuckGo if needed
+
+     Returns:
+         str or None: The URL of the sustainability report PDF if found, otherwise None.
+
+     Search Strategy:
+         - Tavily: Searches with advanced search settings.
+         - DuckDuckGo: Searches the public web with a 'filetype:pdf' filter.
+         - Only URLs ending with '.pdf' are considered valid.
+
+     Notes:
+         - Any search failures are internally handled and logged.
+     """
+
+     def search_with_tavily(query: str) -> str | None:
+         try:
+             logger.info(f"Searching Tavily for: {query}")
+             result = tavily_client.search(query=query, search_depth="advanced", max_results=max_results)
+             urls = [res["url"] for res in result.get("results", []) if res["url"].lower().endswith(".pdf")]
+             if urls:
+                 logger.info(f"Found PDF via Tavily: {urls[0]}")
+                 return urls[0]
+             logger.info("No PDF found via Tavily.")
+         except Exception as e:
+             logger.error(f"Tavily search error: {e}")
+         return None
+
+     def search_with_duckduckgo(query: str, max_results: int) -> str | None:
+         try:
+             logger.info(f"Searching DuckDuckGo for: {query}")
+             with DDGS() as ddgs:
+                 search_results = ddgs.text(query.strip(), max_results=max_results)
+                 for result in search_results:
+                     pdf_url = result.get('href', '')
+                     if pdf_url.lower().endswith('.pdf'):
+                         logger.info(f"Found PDF via DuckDuckGo: {pdf_url}")
+                         return pdf_url
+                     else:
+                         logger.info(f"Skipped non-PDF link: {pdf_url}")
+         except Exception as error:
+             logger.error(f"DuckDuckGo search error: {error}")
+         return None
+
+     # Compose search query
+     query = f"{company_name} sustainability report filetype:pdf"
+     if year:
+         query += f" {year}"
+
+     logger.info(f"Starting sustainability report search for '{company_name}', year={year}, using '{search_engine}' engine.")
+
+     # Perform search according to engine selection
+     if search_engine == "tavily":
+         return search_with_tavily(query)
+
+     elif search_engine == "duckduckgo":
+         return search_with_duckduckgo(query, max_results=max_results)
+
+     elif search_engine == "both":
+         pdf_url = search_with_tavily(query)
+         if not pdf_url:
+             pdf_url = search_with_duckduckgo(query, max_results=max_results)
+         return pdf_url
+
+     else:
+         logger.error(f"Invalid search engine option provided: {search_engine}")
+         raise ValueError(f"Invalid search engine '{search_engine}'. Choose from 'tavily', 'duckduckgo', or 'both'.")
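A minimal usage sketch for both tools (the company and year are illustrative):

```python
# Minimal sketch: both tools are invoked with argument dicts.
companies = get_top_companies_from_web.invoke({"query": "top 3 textile companies"})
print(companies.companies)

pdf_url = get_sustainability_report_pdf.invoke({
    "company_name": "Puma",
    "year": 2023,
    "search_engine": "both",  # Tavily first, DuckDuckGo as fallback
})
print(pdf_url)
```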
pages/chatbot.py ADDED
@@ -0,0 +1,102 @@
+ import streamlit as st
+ from dotenv import load_dotenv
+
+ from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
+ # from application.agents.scraper_agent import app
+ # from application.utils.logger import get_logger
+
+ try:
+     from application.agents.scraper_agent import app
+     # from application.main import graph
+     from application.utils.logger import get_logger
+ except ImportError as e:
+     st.error(f"Import Error: Ensure backend modules are accessible. Details: {e}")
+     st.stop()
+
+ logger = get_logger()
+
+ st.set_page_config(page_title="Sustainability AI Assistant", layout="wide")
+ st.title("♻️ Sustainability Report AI Assistant")
+ st.caption(
+     "Ask about sustainability reports by company or industry! "
+     "(e.g., 'Get report for Apple', 'Download report for Microsoft 2023', "
+     "'Find reports for top 3 airline companies', 'Download this pdf <link>')"
+ )
+
+ load_dotenv()
+
+ def initialize_chat_history():
+     """Initialize session chat history."""
+     if "messages" not in st.session_state:
+         st.session_state.messages = []
+         logger.info("Initialized empty chat history in session state.")
+
+ def display_chat_history():
+     """Render previous chat messages."""
+     for message in st.session_state.messages:
+         if isinstance(message, SystemMessage):
+             # st.info(f"System: {message.content}")
+             pass
+         elif isinstance(message, HumanMessage):
+             with st.chat_message("user"):
+                 st.markdown(message.content)
+         elif isinstance(message, AIMessage):
+             with st.chat_message("assistant"):
+                 st.markdown(message.content)
+
+ def invoke_agent():
+     """Invoke the LangGraph agent and update session state."""
+     try:
+         graph_input = {"messages": st.session_state.messages}
+         logger.info("Invoking LangGraph agent...")
+
+         # final_output_state = graph.invoke(graph_input, {"recursion_limit": 15})
+
+         final_output_state = app.invoke(graph_input, {"recursion_limit": 15})
+
+         logger.info("Agent invocation completed successfully.")
+         return final_output_state
+
+     except Exception as e:
+         logger.error("Agent invocation failed.", exc_info=True)
+         st.error(f"An error occurred while processing your request: {e}")
+         return None
+
+ def display_last_ai_response():
+     """Display the latest AI message, if any."""
+     last_ai_message = next(
+         (msg for msg in reversed(st.session_state.messages) if isinstance(msg, AIMessage)),
+         None
+     )
+     if last_ai_message:
+         with st.chat_message("assistant"):
+             st.markdown(last_ai_message.content)
+         logger.info("Displayed latest AI response.")
+     else:
+         st.warning("Agent completed without a final AI message.")
+         logger.warning("No AI message found in the final output.")
+
+ initialize_chat_history()
+ display_chat_history()
+
+ if user_query := st.chat_input("Your question about sustainability reports..."):
+     logger.info(f"User input received: {user_query}")
+
+     st.session_state.messages.append(HumanMessage(content=user_query))
+
+     with st.chat_message("user"):
+         st.markdown(user_query)
+
+     with st.spinner("Processing your request... Please wait."):
+         final_output_state = invoke_agent()
+
+     if final_output_state:
+         st.session_state.messages = final_output_state['messages']
+         display_last_ai_response()
+
+ with st.sidebar:
+     st.markdown("---")
+     if st.button("Clear Chat History"):
+         st.session_state.messages = []
+         logger.info("Chat history cleared by user.")
+         st.rerun()
pages/multiple_pdf_extractor.py CHANGED
@@ -24,12 +24,12 @@ AVAILABLE_MODELS = [
 
 RESPONSE_SCHEMAS = {
     "Greenhouse Gas (GHG) Protocol Parameters": GEMINI_GHG_PARAMETERS,
-    "Environmental Parameters (CSRD)": GEMINI_ENVIRONMENTAL_PARAMETERS_CSRD,
-    "Environmental Parameters": GEMINI_ENVIRONMENT_PARAMETERS,
-    "Social Parameters": GEMINI_SOCIAL_PARAMETERS,
-    "Governance Parameters": GEMINI_GOVERNANCE_PARAMETERS,
-    "Materiality Parameters": GEMINI_MATERIALITY_PARAMETERS,
-    "Net Zero Intervention Parameters": GEMINI_NET_ZERO_INTERVENTION_PARAMETERS,
+    # "Environmental Parameters (CSRD)": GEMINI_ENVIRONMENTAL_PARAMETERS_CSRD,
+    # "Environmental Parameters": GEMINI_ENVIRONMENT_PARAMETERS,
+    # "Social Parameters": GEMINI_SOCIAL_PARAMETERS,
+    # "Governance Parameters": GEMINI_GOVERNANCE_PARAMETERS,
+    # "Materiality Parameters": GEMINI_MATERIALITY_PARAMETERS,
+    # "Net Zero Intervention Parameters": GEMINI_NET_ZERO_INTERVENTION_PARAMETERS,
 }
 
 selected_model = st.selectbox("Select Gemini Model", options=AVAILABLE_MODELS)
requirements.txt CHANGED
@@ -6,4 +6,11 @@ google.genai
 google-generativeai
 pandas
 supabase
- openpyxl
+ openpyxl
+ langchain
+ pymongo
+ langgraph
+ langsmith
+ tavily-python
+ duckduckgo-search
+ langchain_openai