Upload 4 files
- main.py +178 -0
- processing_utility.py +166 -0
- rag_utils.py +273 -0
- requirements.txt +144 -0
main.py
ADDED
@@ -0,0 +1,178 @@
import os
import json
import tempfile
import requests
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List, Dict, Union
from dotenv import load_dotenv

from processing_utility import download_and_parse_document, extract_schema_from_file

# Import functions and constants from the rag_utils.py file.
# Make sure rag_utils.py is in the same directory or accessible via PYTHONPATH.
from rag_utils import (
    process_markdown_with_manual_sections,
    perform_vector_search,
    generate_answer_with_groq,
    CHUNK_SIZE,
    CHUNK_OVERLAP,
    TOP_K_CHUNKS,
    GROQ_MODEL_NAME
)

load_dotenv()


# --- FastAPI App Initialization ---
app = FastAPI(
    title="HackRX RAG API",
    description="API for Retrieval-Augmented Generation from PDF documents.",
    version="1.0.0",
)

# --- Groq API Key Setup ---
# It's highly recommended to set this as an environment variable in production.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "NOT_FOUND")
if GROQ_API_KEY == "NOT_FOUND":
    print("WARNING: GROQ_API_KEY is not set. Set the GROQ_API_KEY environment variable before deploying to production.")

# --- Pydantic Models for Request and Response ---
class RunRequest(BaseModel):
    documents: str  # URL to the PDF document
    questions: List[str]

class Answer(BaseModel):
    answer: str

class RunResponse(BaseModel):
    answers: List[Answer]

# --- Pseudo-functions (replace with actual implementations if needed) ---
# The endpoint below uses download_and_parse_document and extract_schema_from_file
# from processing_utility; these two helpers are kept as simple fallbacks.

def convert_to_markdown(pdf_url: str) -> str:
    """
    PSEUDO-FUNCTION: Downloads the PDF from the URL and returns its local path.
    In a real scenario, this might involve converting the PDF to Markdown,
    but for manual sectioning we only need the local PDF path.
    """
    print(f"Downloading PDF from: {pdf_url}")
    try:
        response = requests.get(pdf_url, stream=True)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)

        # Create a temporary file to store the PDF
        temp_pdf_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
        for chunk in response.iter_content(chunk_size=8192):
            temp_pdf_file.write(chunk)
        temp_pdf_file.close()
        print(f"PDF downloaded to temporary path: {temp_pdf_file.name}")
        return temp_pdf_file.name
    except requests.exceptions.RequestException as e:
        raise HTTPException(status_code=500, detail=f"Failed to download PDF from URL: {e}")
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"An unexpected error occurred during PDF download: {e}")


def fetch_headings_json(pdf_url: str) -> Dict:
    """
    PSEUDO-FUNCTION: Fetches section headings for the PDF.
    In a real scenario, this would involve a more sophisticated service
    or logic to extract headings from the PDF.
    For this example, we return a hardcoded dummy JSON.
    """
    print(f"Fetching headings for PDF URL (pseudo-function): {pdf_url}")
    # This dummy JSON matches the schema expected for manual sectioning:
    # {"data": {"headings": ["Your Heading"]}}
    dummy_headings = {
        "run_id": "dummy-run-id",
        "extraction_agent_id": "dummy-agent-id",
        "data": {
            "headings": [
                "Policy Wordings",
                "SECTION A) PREAMBLE",
                "SECTION B) DEFINITIONS - STANDARD DEFINITIONS",
                "SECTION B) DEFINITIONS - SPECIFIC DEFINITIONS",
                "SECTION C) BENEFITS COVERED UNDER THE POLICY",
                "PART A- COVERAGE- Domestic (Within India Only, for Imperial and Imperial Plus Plans)",
                "PART B- COVERAGE- International",
                "SECTION D) EXCLUSIONS- STANDARD EXCLUSIONS APPLICABLE TO PART A- DOMESTIC COVER UNDER SECTION C) BENEFITS COVERED UNDER THE POLICY",
                "SECTION D) EXCLUSIONS– SPECIFIC EXCLUSIONS APPLICABLE TO PART A- DOMESTIC COVER UNDER SECTION C) BENEFITS COVERED UNDER THE POLICY",
                "SECTION D) EXCLUSIONS- STANDARD EXCLUSIONS APPLICABLE TO PART B- INTERNATIONAL COVER UNDER SECTION C) BENEFITS COVERED UNDER THE POLICY",
                "SECTION D) EXCLUSIONS– SPECIFIC EXCLUSIONS APPLICABLE TO INTERNATIONAL COVER UNDER SECTION C) BENEFITS COVERED UNDER THE POLICY",
                "SECTION E) GENERAL TERMS AND CONDITIONS - STANDARD GENERAL TERMS AND CONDITIONS",
                "SECTION E) GENERAL TERMS AND CONDITIONS - SPECIFIC TERMS AND CONDITIONS",
                "SECTION E) GENERAL TERMS AND CLAUSES - STANDARD GENERAL TERMS AND CLAUSES"
            ]
        },
        "extraction_metadata": {
            "field_metadata": {},
            "usage": {
                "num_pages_extracted": 49,
                "num_document_tokens": 48701,
                "num_output_tokens": 1229
            }
        }
    }
    return dummy_headings

# --- API Endpoint ---
@app.post("/hackrx/run", response_model=RunResponse)
async def run_rag_pipeline(request: RunRequest):
    """
    Runs the RAG pipeline for a given PDF document and a list of questions.
    """
    pdf_url = request.documents
    questions = request.questions

    local_pdf_path = None
    try:
        # Step 1: Download the document and parse it into a local Markdown file.
        local_markdown_path = await download_and_parse_document(pdf_url)

        # Step 2: Extract the section headings from the parsed Markdown.
        headings_json = extract_schema_from_file(local_markdown_path)
        with open("output.json", 'w', encoding='utf-8') as f:
            json.dump(headings_json, f, indent=4, ensure_ascii=False)
        if not headings_json or not headings_json.get("headings"):
            raise HTTPException(status_code=400, detail="Could not retrieve valid headings from the provided PDF URL.")

        # Step 3: Process the Markdown with manual sections to get chunks with metadata.
        print("Processing document into chunks with manual sections...")
        processed_documents = process_markdown_with_manual_sections(
            local_markdown_path,
            headings_json,
            CHUNK_SIZE,
            CHUNK_OVERLAP
        )
        if not processed_documents:
            raise HTTPException(status_code=500, detail="Failed to process PDF into document chunks.")

        all_answers = []
        # Step 4: Iterate through questions, perform search, and generate answers.
        for i, question in enumerate(questions):
            print(f"Processing question {i+1}/{len(questions)}: '{question}'")
            # Perform vector search
            retrieved_results = perform_vector_search(processed_documents, question, TOP_K_CHUNKS)

            if retrieved_results:
                # Generate answer using Groq
                answer_text = generate_answer_with_groq(question, retrieved_results, GROQ_API_KEY)
            else:
                answer_text = "No relevant information found in the document to answer this question."

            all_answers.append(Answer(answer=answer_text))

        return RunResponse(answers=all_answers)

    except HTTPException as e:
        raise e
    except Exception as e:
        print(f"An unhandled error occurred: {e}")
        raise HTTPException(status_code=500, detail=f"An internal server error occurred: {e}")
    finally:
        # Clean up the temporary PDF file, if one was created via convert_to_markdown.
        # (download_and_parse_document keeps its files under data/ by design.)
        if local_pdf_path and os.path.exists(local_pdf_path):
            os.unlink(local_pdf_path)
            print(f"Cleaned up temporary PDF file: {local_pdf_path}")
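For local testing, a request against the /hackrx/run endpoint might look like the minimal client sketch below. It assumes the app is served locally (e.g. `uvicorn main:app --port 8000`); the document URL and questions are placeholders.

# Minimal client sketch for the /hackrx/run endpoint (assumptions: local server on
# port 8000; the document URL and questions below are placeholders).
import requests

payload = {
    "documents": "https://example.com/sample-policy.pdf",  # placeholder PDF URL
    "questions": [
        "What is the waiting period for pre-existing diseases?",
        "Which exclusions apply to the international cover?",
    ],
}

resp = requests.post("http://localhost:8000/hackrx/run", json=payload, timeout=600)
resp.raise_for_status()
for item in resp.json()["answers"]:
    print(item["answer"])

The response shape mirrors RunResponse: a JSON object with an "answers" list, one {"answer": ...} entry per question, in the order the questions were submitted.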
processing_utility.py
ADDED
@@ -0,0 +1,166 @@
import httpx  # An asynchronous HTTP client.
import os  # To handle file paths and create directories.
import asyncio  # To run synchronous libraries in an async environment.
from urllib.parse import unquote, urlparse  # To get the filename from the URL.
import uuid  # To generate unique filenames if needed.

from pydantic import HttpUrl
from langchain_pymupdf4llm import PyMuPDF4LLMLoader
import json
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

import argparse
from typing import Optional

# Ensure required libraries are installed.
# You can install them using:
# pip install llama_cloud_services pydantic python-dotenv

from llama_cloud_services import LlamaExtract
from pydantic import BaseModel, Field
from dotenv import load_dotenv


class Insurance(BaseModel):
    """
    A Pydantic model to define the data schema for extraction.
    The description helps guide the AI model.
    """
    headings: list[str] = Field(description="An array of headings")  # the pipeline expects a list of heading strings


def extract_schema_from_file(file_path: str) -> Optional[dict]:
    """
    Initializes the LlamaExtract client, retrieves the extraction agent, and extracts
    data from the provided file path based on the Insurance schema.

    Args:
        file_path (str): The path to the local markdown file.

    Returns:
        A dict with the extracted data (matching the Insurance schema),
        or None if the extraction fails or the file doesn't exist.
    """
    if not os.path.exists(file_path):
        print(f"❌ Error: The file '{file_path}' was not found.")
        return None

    print(f"🚀 Initializing extractor and sending '{file_path}' to LlamaCloud...")

    try:
        # Initialize the LlamaExtract client.
        # It will automatically use the LLAMA_CLOUD_API_KEY from the environment.
        extractor = LlamaExtract()

        # Configuration used when the agent was originally created
        # (kept for reference; the existing agent is fetched below rather than re-created).
        agent_config = {
            "system_prompt": "Identify and extract the primary section/segment headings within this legal or policy document. Focus on headings that establish the overarching theme or context for the entire block of text they introduce. Examples include 'Introduction', 'Definitions', 'Scope', 'Liabilities', or 'Terms and Conditions'. Do not extract subheadings or any headings that merely denote a list item."
        }

        # One-time creation: extractor.create_agent(name="insurance-parser", data_schema=Insurance)
        agent = extractor.get_agent(name="insurance-parser")

        # Call the agent to extract data from the specified document.
        print("🤖 Agent ready. Starting extraction...")
        result = agent.extract(file_path)

        if result and result.data:
            print("✅ Extraction successful!")
            # The function returns the structured data.
            return result.data
        else:
            print("⚠️ Extraction did not return any data.")
            return None

    except Exception as e:
        print(f"\n❌ An error occurred during the API call: {e}")
        print("Please check your API key, network connection, and file format.")
        return None


async def download_and_parse_document(doc_url: HttpUrl) -> str:
    """
    Asynchronously downloads a document, saves it to a local directory,
    and then parses it using LangChain's PyMuPDF4LLMLoader.

    Args:
        doc_url: The Pydantic-validated URL of the document to process.

    Returns:
        The path to a local Markdown file containing the document's content
        as structured Markdown.
    """
    print(f"Initiating download from: {doc_url}")
    try:
        # Create the local storage directory if it doesn't exist.
        LOCAL_STORAGE_DIR = "data/"
        os.makedirs(LOCAL_STORAGE_DIR, exist_ok=True)

        async with httpx.AsyncClient() as client:
            response = await client.get(str(doc_url), timeout=30.0, follow_redirects=True)
            response.raise_for_status()

            doc_bytes = response.content
            print("Download successful.")

        # --- Logic to determine the local filename ---
        # Parse the URL to extract the path.
        parsed_path = urlparse(str(doc_url)).path
        # Get the last part of the path and decode URL-encoded characters (like %20 for space).
        filename = unquote(os.path.basename(parsed_path))

        # If the filename is empty, create a unique one.
        if not filename:
            filename = f"{uuid.uuid4()}.pdf"

        # Construct the full path where the file will be saved.
        local_file_path = os.path.join(LOCAL_STORAGE_DIR, filename)

        # Save the downloaded document to the local file.
        with open(local_file_path, "wb") as f:
            f.write(doc_bytes)

        print(f"Document saved locally at: {local_file_path}")
        print("Parsing document with LangChain's PyMuPDF4LLMLoader...")

        # The loader's 'load' method is synchronous. Run it in a separate thread.
        def load_document():
            loader = PyMuPDF4LLMLoader(local_file_path)
            documents = loader.load()
            return documents

        documents = await asyncio.to_thread(load_document)

        if documents:
            parsed_markdown = "\n\n".join([doc.page_content for doc in documents])
            print(f"Parsing complete. Extracted {len(parsed_markdown)} characters as Markdown.")
            # The downloaded file is NOT deleted, as requested.
            # with open("sample_schema.json", 'r') as file:
            #     # Load the JSON data from the file into a Python variable (dictionary or list)
            #     data_variable = json.load(file)
            # await process_markdown_with_manual_sections(parsed_markdown, data_variable, chunk_size=1000, chunk_overlap=200)

            filename = "hello.md"
            with open(filename, "w", encoding="utf-8") as f:
                f.write(parsed_markdown)
            print(f"Markdown successfully saved to {filename}")
            # return parsed_markdown
            return filename
        else:
            raise ValueError("PyMuPDF4LLMLoader did not return any content.")

    except httpx.HTTPStatusError as e:
        print(f"Error downloading document: {e}")
        raise
    except Exception as e:
        print(f"Error during processing: {e}")
        raise
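These two utilities can also be exercised outside the API with a small async driver, roughly as sketched below. It assumes LLAMA_CLOUD_API_KEY is set in the environment and that the "insurance-parser" agent already exists in LlamaCloud (get_agent only fetches it); the document URL is a placeholder.

# Standalone driver sketch for processing_utility.py (assumptions: LLAMA_CLOUD_API_KEY
# is set, the "insurance-parser" agent exists, and the URL below is a placeholder).
import asyncio
from dotenv import load_dotenv
from processing_utility import download_and_parse_document, extract_schema_from_file

async def main():
    load_dotenv()
    md_path = await download_and_parse_document("https://example.com/sample-policy.pdf")
    headings = extract_schema_from_file(md_path)
    print(headings)

if __name__ == "__main__":
    asyncio.run(main())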
rag_utils.py
ADDED
@@ -0,0 +1,273 @@
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from ragatouille import RAGPretrainedModel
from groq import Groq
import json
from pypdf import PdfReader  # for raw PDF text extraction (pypdf is pinned in requirements.txt)
import re

# --- Configuration (can be overridden by the calling app) ---
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
TOP_K_CHUNKS = 7
GROQ_MODEL_NAME = "llama3-8b-8192"

# --- Helper Functions ---

def extract_raw_text_from_pdf(pdf_path: str) -> str:
    """
    Extracts raw text from a PDF file using pypdf.
    This is a simpler text extraction compared to LLMSherpa, suitable for manual sectioning.
    """
    try:
        reader = PdfReader(pdf_path)
        full_text = ""
        for page in reader.pages:
            full_text += page.extract_text() + "\n"  # Add newline between pages
        print(f"Extracted raw text from PDF: {len(full_text)} characters.")
        return full_text
    except Exception as e:
        print(f"Error extracting raw text from PDF: {e}")
        return ""

def process_markdown_with_manual_sections(
    md_file_path: str,
    headings_json: dict,
    chunk_size: int,
    chunk_overlap: int
):
    """
    Processes a markdown document from a file path by segmenting it based on
    provided section headings, and then recursively chunking each segment.
    Each chunk receives the corresponding section heading as metadata.

    Args:
        md_file_path (str): The path to the input markdown (.md) file.
        headings_json (dict): A JSON object with schema: {"headings": ["Your Heading 1", "Your Heading 2"]}.
            This contains the major section headings to split by.
        chunk_size (int): The maximum size of each text chunk.
        chunk_overlap (int): The number of characters to overlap between consecutive chunks.

    Returns:
        list[Document]: A list of LangChain Document objects, each containing
            a text chunk and its associated metadata.
    """
    all_chunks_with_metadata = []
    full_text = ""

    # Check if the file exists and read its content
    if not os.path.exists(md_file_path):
        print(f"Error: File not found at '{md_file_path}'")
        return []
    if not os.path.isfile(md_file_path):
        print(f"Error: Path '{md_file_path}' is not a file.")
        return []
    if not md_file_path.lower().endswith(".md"):
        print(f"Warning: File '{md_file_path}' does not have a .md extension.")

    try:
        with open(md_file_path, 'r', encoding='utf-8') as f:
            full_text = f.read()
    except Exception as e:
        print(f"Error reading file '{md_file_path}': {e}")
        return []

    if not full_text:
        print("Input markdown file is empty.")
        return []

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )

    # Extract heading texts from the 'headings' key
    heading_texts = headings_json.get("headings", [])
    print(f"Identified headings for segmentation: {heading_texts}")

    # Find start indices of all headings in the full text using regex
    heading_positions = []
    for heading in heading_texts:
        # Build a pattern that matches the heading while tolerating extra whitespace:
        # re.escape() escapes special characters in each word,
        # r'\s*' allows arbitrary whitespace between words,
        # and re.IGNORECASE makes the match case-insensitive.
        pattern = re.compile(r'\s*'.join(re.escape(word) for word in heading.split()), re.IGNORECASE)

        match = pattern.search(full_text)
        if match:
            heading_positions.append({"heading_text": heading, "start_index": match.start()})
        else:
            print(f"Warning: Heading '{heading}' not found in the markdown text using regex. This section might be missed.")

    # Sort heading positions by their start index
    heading_positions.sort(key=lambda x: x["start_index"])

    # Segment the text based on heading positions
    segments_with_headings = []

    # Handle preface (text before the first heading)
    if heading_positions and heading_positions[0]["start_index"] > 0:
        preface_text = full_text[:heading_positions[0]["start_index"]].strip()
        if preface_text:
            segments_with_headings.append({
                "section_heading": "Document Start/Preface",
                "section_text": preface_text
            })

    # Iterate through heading positions to define sections
    for i, current_heading_info in enumerate(heading_positions):
        start_index = current_heading_info["start_index"]
        heading_text = current_heading_info["heading_text"]

        # Determine the end index for the current section
        end_index = len(full_text)
        if i + 1 < len(heading_positions):
            end_index = heading_positions[i + 1]["start_index"]

        # Extract section content (from current heading's start to next heading's start).
        # The heading text itself is included in the section_text.
        section_content = full_text[start_index:end_index].strip()

        if section_content:
            segments_with_headings.append({
                "section_heading": heading_text,
                "section_text": section_content
            })

    print(f"Created {len(segments_with_headings)} segments based on provided headings.")

    # Chunk each segment and attach metadata
    for segment in segments_with_headings:
        section_heading = segment["section_heading"]
        section_text = segment["section_text"]

        if section_text:
            chunks = text_splitter.split_text(section_text)
            for chunk in chunks:
                metadata = {
                    "document_part": "Section",  # all segments are treated as 'Section'
                    "section_heading": section_heading,
                }
                all_chunks_with_metadata.append(Document(page_content=chunk, metadata=metadata))

    print(f"Created {len(all_chunks_with_metadata)} chunks with metadata from segmented sections.")

    # Dump the segmented sections so they can be inspected or used for evaluation.
    with open("output.json", 'w', encoding='utf-8') as f:
        json.dump(segments_with_headings, f, indent=4, ensure_ascii=False)
    return all_chunks_with_metadata

def perform_vector_search(documents: list[Document], query: str, top_k: int, rag_model_instance=None) -> list[dict]:
    """
    Performs vector search using Ragatouille's ColBERT implementation
    to retrieve the top k relevant chunks, preserving metadata.

    Args:
        documents (list[Document]): The list of LangChain Document objects to index and search.
        query (str): The search query.
        top_k (int): The number of top relevant chunks to retrieve.
        rag_model_instance: An optional pre-loaded Ragatouille model instance.
            If None, a new one will be loaded.

    Returns:
        list[dict]: A list of dictionaries, each containing 'content' and 'document_metadata'
            from the Ragatouille search results.
    """
    if rag_model_instance is None:
        print("Initializing Ragatouille ColBERT model...")
        rag = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
    else:
        rag = rag_model_instance

    # Separate content and metadata for indexing
    collection_texts = [doc.page_content for doc in documents]
    collection_metadatas = [doc.metadata for doc in documents]

    index_name = "custom_chunks_index"
    print("Indexing chunks with Ragatouille (this may take a while for large datasets)...")
    rag.index(
        collection=collection_texts,
        document_metadatas=collection_metadatas,
        index_name=index_name,
        overwrite_index=True
    )
    print("Indexing complete.")

    print(f"Performing vector search for query: '{query}' (top_k={top_k})...")
    results = rag.search(query=query, k=top_k)

    print(f"Retrieved {len(results)} top chunks.")
    return results

def generate_answer_with_groq(query: str, retrieved_results: list[dict], groq_api_key: str) -> str:
    """
    Generates an answer using the Groq API based on the query and retrieved chunks' content.
    Includes metadata in the prompt for better context.

    Args:
        query (str): The original user query.
        retrieved_results (list[dict]): A list of dictionaries from Ragatouille search,
            each with 'content' and 'document_metadata'.
        groq_api_key (str): The Groq API key.

    Returns:
        str: The generated answer.
    """
    if not groq_api_key:
        return "Error: Groq API key is not set. Cannot generate answer."

    print("Generating answer with Groq API...")
    client = Groq(api_key=groq_api_key)

    context_parts = []
    for i, res in enumerate(retrieved_results):
        content = res.get("content", "")
        metadata = res.get("document_metadata", {})
        section_heading = metadata.get("section_heading", "N/A")
        document_part = metadata.get("document_part", "N/A")

        context_parts.append(
            f"--- Context Chunk {i+1} ---\n"
            f"Document Part: {document_part}\n"
            f"Section Heading: {section_heading}\n"
            f"Content: {content}\n"
            f"-------------------------"
        )
    context = "\n\n".join(context_parts)

    prompt = (
        f"You are a specialized document analyzer assistant. Your task is to answer the user's question "
        f"solely based on the provided context. Pay close attention to the section heading and document part "
        f"for each context chunk. Ensure your answer incorporates all relevant details, including any legal nuances "
        f"and conditions found in the context, and is concise, limited to one or two sentences. "
        f"Do not explicitly mention the retrieved chunks. If the answer cannot be found in the provided context, "
        f"clearly state that you do not have enough information.\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {query}\n\n"
        f"Answer:"
    )

    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            model=GROQ_MODEL_NAME,
            temperature=0.7,
            max_tokens=500,
        )
        answer = chat_completion.choices[0].message.content
        print("Answer generated successfully.")
        return answer
    except Exception as e:
        print(f"An error occurred during Groq API call: {e}")
        return "Could not generate an answer due to an API error."
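Because perform_vector_search loads colbert-ir/colbertv2.0 whenever no instance is passed, callers that answer many questions can pass a single pre-loaded model via rag_model_instance, roughly as sketched below. The Markdown path and headings dict are placeholders; the index itself is still rebuilt on each call, only the checkpoint load is shared.

# Sketch: reuse one ColBERT model across several queries via rag_model_instance
# (the "hello.md" path and the headings dict are placeholders).
from ragatouille import RAGPretrainedModel
from rag_utils import (
    process_markdown_with_manual_sections,
    perform_vector_search,
    CHUNK_SIZE,
    CHUNK_OVERLAP,
    TOP_K_CHUNKS,
)

# Load the ColBERT checkpoint once instead of once per question.
rag = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")

chunks = process_markdown_with_manual_sections(
    "hello.md",
    {"headings": ["SECTION A) PREAMBLE"]},
    CHUNK_SIZE,
    CHUNK_OVERLAP,
)

for query in ["What does the preamble state?", "Which exclusions apply?"]:
    results = perform_vector_search(chunks, query, TOP_K_CHUNKS, rag_model_instance=rag)
    print(query, "->", len(results), "chunks retrieved")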
requirements.txt
ADDED
@@ -0,0 +1,144 @@
aiohappyeyeballs==2.6.1
aiohttp==3.12.15
aiosignal==1.4.0
aiosqlite==0.21.0
annotated-types==0.7.0
anyio==4.9.0
asyncio==3.4.3
attrs==25.3.0
banks==2.2.0
beautifulsoup4==4.13.4
bitarray==3.6.0
blinker==1.9.0
catalogue==2.0.10
certifi==2025.7.14
charset-normalizer==3.4.2
click==8.2.1
colbert-ai==0.2.21
colorama==0.4.6
dataclasses-json==0.6.7
datasets==4.0.0
defusedxml==0.7.1
Deprecated==1.2.18
dill==0.3.8
dirtyjson==1.0.8
distro==1.9.0
dotenv==0.9.9
faiss-cpu==1.11.0.post1
fast_pytorch_kmeans==0.2.2
fastapi==0.116.1
filelock==3.18.0
filetype==1.2.0
Flask==3.1.1
frozenlist==1.7.0
fsspec==2025.3.0
git-python==1.0.3
gitdb==4.0.12
GitPython==3.1.45
greenlet==3.2.3
griffe==1.9.0
groq==0.30.0
h11==0.16.0
hf-xet==1.1.5
httpcore==1.0.9
httpx==0.28.1
huggingface-hub==0.34.3
idna==3.10
itsdangerous==2.2.0
Jinja2==3.1.6
jiter==0.10.0
joblib==1.5.1
jsonpatch==1.33
jsonpointer==3.0.0
langchain==0.3.27
langchain-core==0.3.72
langchain-pymupdf4llm==0.4.1
langchain-text-splitters==0.3.9
langsmith==0.4.8
llama-cloud==0.1.35
llama-cloud-services==0.6.53
llama-index==0.12.52
llama-index-agent-openai==0.4.12
llama-index-cli==0.4.4
llama-index-core==0.12.52.post1
llama-index-embeddings-openai==0.3.1
llama-index-indices-managed-llama-cloud==0.8.0
llama-index-instrumentation==0.3.1
llama-index-llms-openai==0.4.7
llama-index-multi-modal-llms-openai==0.5.3
llama-index-program-openai==0.3.2
llama-index-question-gen-openai==0.3.1
llama-index-readers-file==0.4.11
llama-index-readers-llama-parse==0.4.0
llama-index-workflows==1.2.0
llama-parse==0.6.53
MarkupSafe==3.0.2
marshmallow==3.26.1
mpmath==1.3.0
multidict==6.6.3
multiprocess==0.70.16
mypy_extensions==1.1.0
nest-asyncio==1.6.0
networkx==3.5
ninja==1.11.1.4
nltk==3.9.1
numpy==2.3.2
onnx==1.18.0
openai==1.98.0
orjson==3.11.1
packaging==25.0
pandas==2.2.3
pillow==11.3.0
platformdirs==4.3.8
propcache==0.3.2
protobuf==6.31.1
psutil==7.0.0
pyarrow==21.0.0
pydantic==2.11.7
pydantic_core==2.33.2
PyMuPDF==1.26.3
pymupdf4llm==0.0.27
pypdf==5.9.0
python-dateutil==2.9.0.post0
python-dotenv==1.1.1
pytz==2025.2
PyYAML==6.0.2
RAGatouille==0.0.9.post2
regex==2025.7.33
requests==2.32.4
requests-toolbelt==1.0.0
safetensors==0.5.3
scikit-learn==1.7.1
scipy==1.16.1
sentence-transformers==5.0.0
setuptools==80.9.0
six==1.17.0
smmap==5.0.2
sniffio==1.3.1
soupsieve==2.7
SQLAlchemy==2.0.42
srsly==2.5.1
starlette==0.47.2
striprtf==0.0.26
sympy==1.14.0
tenacity==9.1.2
threadpoolctl==3.6.0
tiktoken==0.9.0
tokenizers==0.21.4
torch==2.7.1
tqdm==4.67.1
transformers==4.49.0
typing==3.7.4.3
typing-inspect==0.9.0
typing-inspection==0.4.1
typing_extensions==4.14.1
tzdata==2025.2
ujson==5.10.0
urllib3==2.5.0
uvicorn==0.35.0
voyager==2.1.0
Werkzeug==3.1.3
wrapt==1.17.2
xxhash==3.5.0
yarl==1.20.1
zstandard==0.23.0