Spaces:

VelaTest
/

PDFExtractor

Sleeping

PDFExtractor / application /agents /extractor_agent.py

Vela

added agentic framework

172e21d 2 months ago

1.79 kB

	from dotenv import load_dotenv
	from langchain_openai import ChatOpenAI

	from application.tools.emission_data_extractor import extract_emission_data_as_json
	from application.services.langgraph_service import create_agent
	from application.utils.logger import get_logger

	# Load settings from a local .env file into the process environment before
	# any of the module-level objects below are constructed.
	# NOTE(review): presumably this is how the OpenAI API key reaches ChatOpenAI — confirm.
	load_dotenv()
	# Module-level logger obtained from the project's logging helper; nothing in
	# the visible part of this module calls it.
	logger = get_logger()

	# System prompt for the extractor agent; it is passed as the third argument
	# to create_agent when the agent is built at the bottom of this module.
	# The text restricts the model to emission-related ESG extraction through the
	# extract_emission_data_as_json tool, asks for JSON output with a
	# "raw_response" fallback, and forbids summarising or inventing values.
	# NOTE: this string is runtime data — rewording it changes agent behaviour.
	EXTRACTOR_SYSTEM_PROMPT = """
	You are an intelligent assistant specialized in extracting emission-related ESG (Environmental, Social, and Governance) data from PDF documents.

	You have access to the following tool:
	- extract_emission_data_as_json: Use this tool to upload a PDF and extract structured emission-related information as a JSON response.

	Instructions:
	- Your task is to extract only emission-related ESG data, such as carbon emissions, Scope 1, Scope 2, Scope 3 emissions, and other relevant sustainability metrics.
	- Always attempt to return structured JSON data if possible.
	- If structured data cannot be extracted cleanly, ensure that the raw response from the document is returned under a "raw_response" field.
	- Do not make assumptions or hallucinate missing values — extract only what is explicitly present in the document.
	- Always prioritize extracting the latest, most clearly defined data from the PDF.
	- Do not summarize, analyze, or interpret the document — your only role is accurate data extraction.

	Goal:
	- Accurately upload the PDF.
	- Extract the requested emission-related ESG data in a clean JSON format.
	- Handle edge cases gracefully (e.g., invalid PDFs, no emission data found).

	Behave like a highly precise and reliable data extraction engine.
	"""

	# Chat model backing the agent; temperature is pinned to 0.
	llm = ChatOpenAI(
	    model="gpt-4o-mini",
	    temperature=0,
	)

	# Agent exposed by this module: the model, the single emission-extraction
	# tool, and the extractor system prompt, in that positional order.
	extractor_agent = create_agent(
	    llm,
	    [extract_emission_data_as_json],
	    EXTRACTOR_SYSTEM_PROMPT,
	)