shreyanshknayak committed on
Commit 437d8b7 · verified · 1 Parent(s): 9ce7616

Upload 4 files

Files changed (4)
  1. main.py +178 -0
  2. processing_utility.py +166 -0
  3. rag_utils.py +273 -0
  4. requirements.txt +144 -0
main.py ADDED
@@ -0,0 +1,178 @@
+ import os
+ import json
+ import tempfile
+ import requests
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ from typing import List, Dict, Union
+ from dotenv import load_dotenv
+
+ from processing_utility import download_and_parse_document, extract_schema_from_file
+
+ # Import functions and constants from rag_utils.py.
+ # Make sure rag_utils.py is in the same directory or accessible via PYTHONPATH.
+ from rag_utils import (
+     process_markdown_with_manual_sections,
+     perform_vector_search,
+     generate_answer_with_groq,
+     CHUNK_SIZE,
+     CHUNK_OVERLAP,
+     TOP_K_CHUNKS,
+     GROQ_MODEL_NAME
+ )
+
+ load_dotenv()
+
+
+ # --- FastAPI App Initialization ---
+ app = FastAPI(
+     title="HackRX RAG API",
+     description="API for Retrieval-Augmented Generation from PDF documents.",
+     version="1.0.0",
+ )
+
+ # --- Groq API Key Setup ---
+ # It's highly recommended to set this as an environment variable in production.
+ GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "NOT_FOUND")
+ if GROQ_API_KEY == "NOT_FOUND":
+     print("WARNING: GROQ_API_KEY is not set. Set the GROQ_API_KEY environment variable; answer generation will fail without it.")
+
+ # --- Pydantic Models for Request and Response ---
+ class RunRequest(BaseModel):
+     documents: str  # URL to the PDF document
+     questions: List[str]
+
+ class Answer(BaseModel):
+     answer: str
+
+ class RunResponse(BaseModel):
+     answers: List[Answer]
+
+ # --- Pseudo-functions (Replace with actual implementations if needed) ---
+
+ def convert_to_markdown(pdf_url: str) -> str:
+     """
+     PSEUDO-FUNCTION: Downloads the PDF from the URL and returns its local path.
+     In a real scenario, this might involve converting PDF to Markdown,
+     but for process_pdf_with_manual_sections, we just need the local PDF path.
+     """
+     print(f"Downloading PDF from: {pdf_url}")
+     try:
+         response = requests.get(pdf_url, stream=True)
+         response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
+
+         # Create a temporary file to store the PDF
+         temp_pdf_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
+         for chunk in response.iter_content(chunk_size=8192):
+             temp_pdf_file.write(chunk)
+         temp_pdf_file.close()
+         print(f"PDF downloaded to temporary path: {temp_pdf_file.name}")
+         return temp_pdf_file.name
+     except requests.exceptions.RequestException as e:
+         raise HTTPException(status_code=500, detail=f"Failed to download PDF from URL: {e}")
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"An unexpected error occurred during PDF download: {e}")
+
+
+ def fetch_headings_json(pdf_url: str) -> Dict:
+     """
+     PSEUDO-FUNCTION: Fetches section headings for the PDF.
+     In a real scenario, this would involve a more sophisticated service
+     or logic to extract headings from the PDF.
+     For this example, we return a hardcoded dummy JSON.
+     """
+     print(f"Fetching headings for PDF URL (pseudo-function): {pdf_url}")
+     # This dummy JSON matches the expected schema:
+     # {"data": {"headings": ["Your Heading"]}}
+     dummy_headings = {
+         "run_id": "dummy-run-id",
+         "extraction_agent_id": "dummy-agent-id",
+         "data": {
+             "headings": [
+                 "Policy Wordings",
+                 "SECTION A) PREAMBLE",
+                 "SECTION B) DEFINITIONS - STANDARD DEFINITIONS",
+                 "SECTION B) DEFINITIONS - SPECIFIC DEFINITIONS",
+                 "SECTION C) BENEFITS COVERED UNDER THE POLICY",
+                 "PART A- COVERAGE- Domestic (Within India Only, for Imperial and Imperial Plus Plans)",
+                 "PART B- COVERAGE- International",
+                 "SECTION D) EXCLUSIONS- STANDARD EXCLUSIONS APPLICABLE TO PART A- DOMESTIC COVER UNDER SECTION C) BENEFITS COVERED UNDER THE POLICY",
+                 "SECTION D) EXCLUSIONS– SPECIFIC EXCLUSIONS APPLICABLE TO PART A- DOMESTIC COVER UNDER SECTION C) BENEFITS COVERED UNDER THE POLICY",
+                 "SECTION D) EXCLUSIONS- STANDARD EXCLUSIONS APPLICABLE TO PART B- INTERNATIONAL COVER UNDER SECTION C) BENEFITS COVERED UNDER THE POLICY",
+                 "SECTION D) EXCLUSIONS– SPECIFIC EXCLUSIONS APPLICABLE TO INTERNATIONAL COVER UNDER SECTION C) BENEFITS COVERED UNDER THE POLICY",
+                 "SECTION E) GENERAL TERMS AND CONDITIONS - STANDARD GENERAL TERMS AND CONDITIONS",
+                 "SECTION E) GENERAL TERMS AND CONDITIONS - SPECIFIC TERMS AND CONDITIONS",
+                 "SECTION E) GENERAL TERMS AND CLAUSES - STANDARD GENERAL TERMS AND CLAUSES"
+             ]
+         },
+         "extraction_metadata": {
+             "field_metadata": {},
+             "usage": {
+                 "num_pages_extracted": 49,
+                 "num_document_tokens": 48701,
+                 "num_output_tokens": 1229
+             }
+         }
+     }
+     return dummy_headings
+
+ # --- API Endpoint ---
+ @app.post("/hackrx/run", response_model=RunResponse)
+ async def run_rag_pipeline(request: RunRequest):
+     """
+     Runs the RAG pipeline for a given PDF document and a list of questions.
+     """
+     pdf_url = request.documents
+     questions = request.questions
+
+     local_markdown_path = None
+     try:
+         # Step 1: Download the document and parse it into a local Markdown file
+         local_markdown_path = await download_and_parse_document(pdf_url)
+
+         # Step 2: Extract section headings from the parsed Markdown
+         headings_json = extract_schema_from_file(local_markdown_path)
+         with open("output.json", 'w', encoding='utf-8') as f:
+             json.dump(headings_json, f, indent=4, ensure_ascii=False)
+         if not headings_json or not headings_json.get("headings"):
+             raise HTTPException(status_code=400, detail="Could not retrieve valid headings from the provided PDF URL.")
+
+         # Step 3: Segment the Markdown by headings and split it into chunks with metadata
+         print("Processing Markdown into chunks with manual sections...")
+         processed_documents = process_markdown_with_manual_sections(
+             local_markdown_path,
+             headings_json,
+             CHUNK_SIZE,
+             CHUNK_OVERLAP
+         )
+         if not processed_documents:
+             raise HTTPException(status_code=500, detail="Failed to process the document into chunks.")
+
+         all_answers = []
+         # Step 4: Iterate through questions, perform search, and generate answers
+         for i, question in enumerate(questions):
+             print(f"Processing question {i+1}/{len(questions)}: '{question}'")
+             # Perform vector search
+             retrieved_results = perform_vector_search(processed_documents, question, TOP_K_CHUNKS)
+
+             if retrieved_results:
+                 # Generate answer using Groq
+                 answer_text = generate_answer_with_groq(question, retrieved_results, GROQ_API_KEY)
+             else:
+                 answer_text = "No relevant information found in the document to answer this question."
+
+             all_answers.append(Answer(answer=answer_text))
+
+         return RunResponse(answers=all_answers)
+
+     except HTTPException as e:
+         raise e
+     except Exception as e:
+         print(f"An unhandled error occurred: {e}")
+         raise HTTPException(status_code=500, detail=f"An internal server error occurred: {e}")
+     finally:
+         # Clean up the intermediate Markdown file
+         if local_markdown_path and os.path.exists(local_markdown_path):
+             os.unlink(local_markdown_path)
+             print(f"Cleaned up temporary Markdown file: {local_markdown_path}")
+
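The endpoint above accepts a document URL plus a list of questions and returns one answer per question. As a quick sanity check, here is a minimal client sketch, assuming the app is served locally with `uvicorn main:app --port 8000`; the base URL and document URL are placeholders, not part of the commit:

```python
import requests

BASE_URL = "http://localhost:8000"  # assumed local uvicorn instance

payload = {
    "documents": "https://example.com/sample-policy.pdf",  # placeholder PDF URL
    "questions": [
        "What is the waiting period for pre-existing diseases?",
        "Does the policy cover hospitalization outside India?",
    ],
}

# The response body mirrors the RunResponse model: {"answers": [{"answer": "..."}]}
resp = requests.post(f"{BASE_URL}/hackrx/run", json=payload, timeout=600)
resp.raise_for_status()
for item in resp.json()["answers"]:
    print(item["answer"])
```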
processing_utility.py ADDED
@@ -0,0 +1,166 @@
+ import httpx  # An asynchronous HTTP client.
+ import os  # To handle file paths and create directories.
+ import asyncio  # To run synchronous libraries in an async environment.
+ from urllib.parse import unquote, urlparse  # To get the filename from the URL.
+ import uuid  # To generate unique filenames if needed.
+
+ from pydantic import HttpUrl
+ from langchain_pymupdf4llm import PyMuPDF4LLMLoader
+ import json
+ import re
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.schema import Document
+
+
+ import argparse
+ from typing import List, Optional
+
+ # Ensure required libraries are installed.
+ # You can install them using:
+ # pip install llama_cloud_services pydantic python-dotenv
+
+ from llama_cloud_services import LlamaExtract
+ from pydantic import BaseModel, Field
+ from dotenv import load_dotenv
+
+ class Insurance(BaseModel):
+     """
+     A Pydantic model to define the data schema for extraction.
+     The description helps guide the AI model.
+     """
+     headings: List[str] = Field(description="An array of headings")
+
+
+
+ def extract_schema_from_file(file_path: str) -> Optional[dict]:
+     """
+     Initializes the LlamaExtract client, retrieves the extraction agent, and
+     extracts data from the provided file path based on the Insurance schema.
+
+     Args:
+         file_path (str): The path to the local markdown file.
+
+     Returns:
+         The extracted data matching the Insurance schema (e.g. {"headings": [...]}),
+         or None if the extraction fails or the file doesn't exist.
+     """
+     if not os.path.exists(file_path):
+         print(f"❌ Error: The file '{file_path}' was not found.")
+         return None
+
+     print(f"🚀 Initializing extractor and sending '{file_path}' to LlamaCloud...")
+
+     try:
+         # Initialize the LlamaExtract client.
+         # It will automatically use the LLAMA_CLOUD_API_KEY from the environment.
+         extractor = LlamaExtract()
+
+         # Configuration for the extraction agent; the system prompt guides heading extraction.
+         agent_config = {
+             "system_prompt": "Identify and extract the primary section/segment headings within this legal or policy document. Focus on headings that establish the overarching theme or context for the entire block of text they introduce. Examples include 'Introduction', 'Definitions', 'Scope', 'Liabilities', or 'Terms and Conditions'. Do not extract subheadings or any headings that merely denote a list item."
+         }
+
+         # Reuse the existing agent; create it once with:
+         # agent = extractor.create_agent(name="insurance-parser", data_schema=Insurance)
+         agent = extractor.get_agent(name="insurance-parser")
+
+
+         # Call the agent to extract data from the specified document.
+         print("🤖 Agent ready. Starting extraction...")
+         result = agent.extract(file_path)
+
+         if result and result.data:
+             print("✅ Extraction successful!")
+             # The function returns the structured data.
+             return result.data
+         else:
+             print("⚠️ Extraction did not return any data.")
+             return None
+
+     except Exception as e:
+         print(f"\n❌ An error occurred during the API call: {e}")
+         print("Please check your API key, network connection, and file format.")
+         return None
+
+
+
+ async def download_and_parse_document(doc_url: HttpUrl) -> str:
+     """
+     Asynchronously downloads a document, saves it to a local directory,
+     parses it with LangChain's PyMuPDF4LLMLoader, and writes the resulting
+     Markdown to a local file.
+
+     Args:
+         doc_url: The Pydantic-validated URL of the document to process.
+
+     Returns:
+         The path to the local Markdown file containing the parsed content.
+     """
+     print(f"Initiating download from: {doc_url}")
+     try:
+         # Create the local storage directory if it doesn't exist.
+         LOCAL_STORAGE_DIR = "data/"
+         os.makedirs(LOCAL_STORAGE_DIR, exist_ok=True)
+
+         async with httpx.AsyncClient() as client:
+             response = await client.get(str(doc_url), timeout=30.0, follow_redirects=True)
+             response.raise_for_status()
+
+             doc_bytes = response.content
+             print("Download successful.")
+
+         # --- Logic to determine the local filename ---
+         # Parse the URL to extract the path.
+         parsed_path = urlparse(str(doc_url)).path
+         # Get the last part of the path and decode URL-encoded characters (like %20 for space).
+         filename = unquote(os.path.basename(parsed_path))
+
+         # If the filename is empty, create a unique one.
+         if not filename:
+             filename = f"{uuid.uuid4()}.pdf"
+
+         # Construct the full path where the file will be saved.
+         local_file_path = os.path.join(LOCAL_STORAGE_DIR, filename)
+
+         # Save the downloaded document to the local file.
+         with open(local_file_path, "wb") as f:
+             f.write(doc_bytes)
+
+         print(f"Document saved locally at: {local_file_path}")
+         print("Parsing document with LangChain's PyMuPDF4LLMLoader...")
+
+         # The loader's 'load' method is synchronous. Run it in a separate thread.
+         def load_document():
+             loader = PyMuPDF4LLMLoader(local_file_path)
+             documents = loader.load()
+             return documents
+
+         documents = await asyncio.to_thread(load_document)
+
+         if documents:
+             parsed_markdown = "\n\n".join([doc.page_content for doc in documents])
+             print(f"Parsing complete. Extracted {len(parsed_markdown)} characters as Markdown.")
+             # The downloaded file is NOT deleted, as requested.
+             '''with open("sample_schema.json", 'r') as file:
+                 # Load the JSON data from the file into a Python variable (dictionary or list)
+                 data_variable = json.load(file)'''
+
+             # await process_markdown_with_manual_sections(parsed_markdown, data_variable, chunk_size=1000, chunk_overlap=200)
+             filename = "hello.md"
+             with open(filename, "w", encoding="utf-8") as f:
+                 f.write(parsed_markdown)
+             print(f"Markdown successfully saved to {filename}")
+             # return parsed_markdown
+             return filename
+         else:
+             raise ValueError("PyMuPDF4LLMLoader did not return any content.")
+
+     except httpx.HTTPStatusError as e:
+         print(f"Error downloading document: {e}")
+         raise
+     except Exception as e:
+         print(f"Error during processing: {e}")
+         raise
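For reference, a minimal sketch of driving these two utilities outside FastAPI; the document URL is a placeholder, and `LLAMA_CLOUD_API_KEY` is assumed to be set in the environment:

```python
import asyncio

from processing_utility import download_and_parse_document, extract_schema_from_file

async def main():
    # Download the PDF, convert it to Markdown, and get the path of the saved .md file
    md_path = await download_and_parse_document("https://example.com/sample-policy.pdf")
    # Ask the LlamaExtract agent for the top-level section headings of that file
    headings = extract_schema_from_file(md_path)
    print(headings)

if __name__ == "__main__":
    asyncio.run(main())
```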
rag_utils.py ADDED
@@ -0,0 +1,273 @@
+ import os
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_core.documents import Document
+ from ragatouille import RAGPretrainedModel
+ from groq import Groq
+ import json
+ from pypdf import PdfReader  # Raw PDF text extraction (pypdf, pinned in requirements.txt)
+ import re
+
+ # --- Configuration (can be overridden by the calling app) ---
+ CHUNK_SIZE = 1000
+ CHUNK_OVERLAP = 200
+ TOP_K_CHUNKS = 7
+ GROQ_MODEL_NAME = "llama3-8b-8192"
+
+ # --- Helper Functions ---
+
+ def extract_raw_text_from_pdf(pdf_path: str) -> str:
+     """
+     Extracts raw text from a PDF file using pypdf.
+     This is a simpler text extraction compared to LLMSherpa, suitable for manual sectioning.
+     """
+     try:
+         reader = PdfReader(pdf_path)
+         full_text = ""
+         for page in reader.pages:
+             full_text += page.extract_text() + "\n"  # Add newline between pages
+         print(f"Extracted raw text from PDF: {len(full_text)} characters.")
+         return full_text
+     except Exception as e:
+         print(f"Error extracting raw text from PDF: {e}")
+         return ""
+
+ def process_markdown_with_manual_sections(
+     md_file_path: str,
+     headings_json: dict,
+     chunk_size: int,
+     chunk_overlap: int
+ ):
+     """
+     Processes a markdown document from a file path by segmenting it based on
+     provided section headings, and then recursively chunking each segment.
+     Each chunk receives the corresponding section heading as metadata.
+
+     Args:
+         md_file_path (str): The path to the input markdown (.md) file.
+         headings_json (dict): A JSON object with schema: {"headings": ["Your Heading 1", "Your Heading 2"]}
+             This contains the major section headings to split by.
+         chunk_size (int): The maximum size of each text chunk.
+         chunk_overlap (int): The number of characters to overlap between consecutive chunks.
+
+     Returns:
+         list[Document]: A list of LangChain Document objects, each containing a text chunk
+             and its associated metadata. The segmented sections
+             ({"section_heading", "section_text"}) are also written to output.json for evaluation.
+     """
+     all_chunks_with_metadata = []
+     full_text = ""
+
+     # Check if the file exists and read its content
+     if not os.path.exists(md_file_path):
+         print(f"Error: File not found at '{md_file_path}'")
+         return []
+     if not os.path.isfile(md_file_path):
+         print(f"Error: Path '{md_file_path}' is not a file.")
+         return []
+     if not md_file_path.lower().endswith(".md"):
+         print(f"Warning: File '{md_file_path}' does not have a .md extension.")
+
+     try:
+         with open(md_file_path, 'r', encoding='utf-8') as f:
+             full_text = f.read()
+     except Exception as e:
+         print(f"Error reading file '{md_file_path}': {e}")
+         return []
+
+     if not full_text:
+         print("Input markdown file is empty.")
+         return []
+
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=chunk_size,
+         chunk_overlap=chunk_overlap,
+         length_function=len,
+         is_separator_regex=False,
+     )
+
+     # Extract heading texts from the 'headings' key
+     heading_texts = headings_json.get("headings", [])
+     print(f"Identified headings for segmentation: {heading_texts}")
+
+     # Find start indices of all headings in the full text using regex
+     heading_positions = []
+     for heading in heading_texts:
+         # Create a regex pattern that matches the heading while tolerating extra whitespace
+         # and ignoring case:
+         #   re.escape() escapes special characters in each word of the heading;
+         #   \s* matches zero or more whitespace characters between words;
+         #   re.IGNORECASE makes the match case-insensitive.
+         pattern = re.compile(r'\s*'.join(re.escape(word) for word in heading.split()), re.IGNORECASE)
+
+         match = pattern.search(full_text)
+         if match:
+             heading_positions.append({"heading_text": heading, "start_index": match.start()})
+         else:
+             print(f"Warning: Heading '{heading}' not found in the markdown text using regex. This section might be missed.")
+
+     # Sort heading positions by their start index
+     heading_positions.sort(key=lambda x: x["start_index"])
+
+     # Segment the text based on heading positions
+     segments_with_headings = []
+
+     # Handle preface (text before the first heading)
+     if heading_positions and heading_positions[0]["start_index"] > 0:
+         preface_text = full_text[:heading_positions[0]["start_index"]].strip()
+         if preface_text:
+             segments_with_headings.append({
+                 "section_heading": "Document Start/Preface",
+                 "section_text": preface_text
+             })
+
+     # Iterate through heading positions to define sections
+     for i, current_heading_info in enumerate(heading_positions):
+         start_index = current_heading_info["start_index"]
+         heading_text = current_heading_info["heading_text"]
+
+         # Determine the end index for the current section
+         end_index = len(full_text)
+         if i + 1 < len(heading_positions):
+             end_index = heading_positions[i+1]["start_index"]
+
+         # Extract section content (from current heading's start to next heading's start).
+         # We include the heading text itself in the section_text.
+         section_content = full_text[start_index:end_index].strip()
+
+         if section_content:
+             segments_with_headings.append({
+                 "section_heading": heading_text,
+                 "section_text": section_content
+             })
+
+     print(f"Created {len(segments_with_headings)} segments based on provided headings.")
+
+     # Chunk each segment and attach metadata
+     for segment in segments_with_headings:
+         section_heading = segment["section_heading"]
+         section_text = segment["section_text"]
+
+         if section_text:
+             chunks = text_splitter.split_text(section_text)
+             for chunk in chunks:
+                 metadata = {
+                     "document_part": "Section",  # All these are now considered 'Section'
+                     "section_heading": section_heading,
+                 }
+                 all_chunks_with_metadata.append(Document(page_content=chunk, metadata=metadata))
+
+     print(f"Created {len(all_chunks_with_metadata)} chunks with metadata from segmented sections.")
+
+     with open("output.json", 'w', encoding='utf-8') as f:
+         json.dump(segments_with_headings, f, indent=4, ensure_ascii=False)
+     return all_chunks_with_metadata
+
+ def perform_vector_search(documents: list[Document], query: str, top_k: int, rag_model_instance=None) -> list[dict]:
+     """
+     Performs vector search using RAGatouille's ColBERT implementation
+     to retrieve the top k relevant chunks, preserving metadata.
+
+     Args:
+         documents (list[Document]): The list of LangChain Document objects to index and search.
+         query (str): The search query.
+         top_k (int): The number of top relevant chunks to retrieve.
+         rag_model_instance: An optional pre-loaded RAGatouille model instance.
+             If None, a new one will be loaded.
+
+     Returns:
+         list[dict]: A list of dictionaries, each containing 'content' and 'document_metadata'
+             from the RAGatouille search results.
+     """
+     if rag_model_instance is None:
+         print("Initializing RAGatouille ColBERT model...")
+         rag = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
+     else:
+         rag = rag_model_instance
+
+     # Separate content and metadata for indexing
+     collection_texts = [doc.page_content for doc in documents]
+     collection_metadatas = [doc.metadata for doc in documents]
+
+     index_name = "custom_chunks_index"
+     print("Indexing chunks with RAGatouille (this may take a while for large datasets)...")
+     rag.index(
+         collection=collection_texts,
+         document_metadatas=collection_metadatas,
+         index_name=index_name,
+         overwrite_index=True
+     )
+     print("Indexing complete.")
+
+     print(f"Performing vector search for query: '{query}' (top_k={top_k})...")
+     results = rag.search(query=query, k=top_k)
+
+     print(f"Retrieved {len(results)} top chunks.")
+     return results
+
+ def generate_answer_with_groq(query: str, retrieved_results: list[dict], groq_api_key: str) -> str:
+     """
+     Generates an answer using the Groq API based on the query and retrieved chunks' content.
+     Includes metadata in the prompt for better context.
+
+     Args:
+         query (str): The original user query.
+         retrieved_results (list[dict]): A list of dictionaries from RAGatouille search,
+             each with 'content' and 'document_metadata'.
+         groq_api_key (str): The Groq API key.
+
+     Returns:
+         str: The generated answer.
+     """
+     if not groq_api_key:
+         return "Error: Groq API key is not set. Cannot generate answer."
+
+     print("Generating answer with Groq API...")
+     client = Groq(api_key=groq_api_key)
+
+     context_parts = []
+     for i, res in enumerate(retrieved_results):
+         content = res.get("content", "")
+         metadata = res.get("document_metadata", {})
+         section_heading = metadata.get("section_heading", "N/A")
+         document_part = metadata.get("document_part", "N/A")
+
+         context_parts.append(
+             f"--- Context Chunk {i+1} ---\n"
+             f"Document Part: {document_part}\n"
+             f"Section Heading: {section_heading}\n"
+             f"Content: {content}\n"
+             f"-------------------------"
+         )
+     context = "\n\n".join(context_parts)
+
+     prompt = (
+         f"You are a specialized document analyzer assistant. Your task is to answer the user's question "
+         f"solely based on the provided context. Pay close attention to the section heading and document part "
+         f"for each context chunk. Ensure your answer incorporates all relevant details, including any legal nuances "
+         f"and conditions found in the context, and is concise, limited to one or two sentences. "
+         f"Do not explicitly mention the retrieved chunks. If the answer cannot be found in the provided context, "
+         f"clearly state that you do not have enough information.\n\n"
+         f"Context:\n{context}\n\n"
+         f"Question: {query}\n\n"
+         f"Answer:"
+     )
+
+     try:
+         chat_completion = client.chat.completions.create(
+             messages=[
+                 {
+                     "role": "user",
+                     "content": prompt,
+                 }
+             ],
+             model=GROQ_MODEL_NAME,
+             temperature=0.7,
+             max_tokens=500,
+         )
+         answer = chat_completion.choices[0].message.content
+         print("Answer generated successfully.")
+         return answer
+     except Exception as e:
+         print(f"An error occurred during Groq API call: {e}")
+         return "Could not generate an answer due to an API error."
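A minimal sketch of wiring these helpers together for a single question, assuming a parsed `hello.md` and a hand-written headings dict (both stand-ins for the artifacts produced by `processing_utility.py`), with `GROQ_API_KEY` taken from the environment:

```python
import os

from rag_utils import (
    CHUNK_OVERLAP,
    CHUNK_SIZE,
    TOP_K_CHUNKS,
    generate_answer_with_groq,
    perform_vector_search,
    process_markdown_with_manual_sections,
)

# Stand-in headings; in the full pipeline these come from extract_schema_from_file
headings_json = {"headings": ["SECTION A) PREAMBLE", "SECTION B) DEFINITIONS - STANDARD DEFINITIONS"]}

# Segment and chunk the parsed Markdown, then index it and search for one question
chunks = process_markdown_with_manual_sections("hello.md", headings_json, CHUNK_SIZE, CHUNK_OVERLAP)
question = "What does the preamble say about the scope of coverage?"
results = perform_vector_search(chunks, question, TOP_K_CHUNKS)

# Generate the final answer with Groq (requires GROQ_API_KEY in the environment)
answer = generate_answer_with_groq(question, results, os.environ.get("GROQ_API_KEY", ""))
print(answer)
```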
requirements.txt ADDED
@@ -0,0 +1,144 @@
+ aiohappyeyeballs==2.6.1
+ aiohttp==3.12.15
+ aiosignal==1.4.0
+ aiosqlite==0.21.0
+ annotated-types==0.7.0
+ anyio==4.9.0
+ asyncio==3.4.3
+ attrs==25.3.0
+ banks==2.2.0
+ beautifulsoup4==4.13.4
+ bitarray==3.6.0
+ blinker==1.9.0
+ catalogue==2.0.10
+ certifi==2025.7.14
+ charset-normalizer==3.4.2
+ click==8.2.1
+ colbert-ai==0.2.21
+ colorama==0.4.6
+ dataclasses-json==0.6.7
+ datasets==4.0.0
+ defusedxml==0.7.1
+ Deprecated==1.2.18
+ dill==0.3.8
+ dirtyjson==1.0.8
+ distro==1.9.0
+ dotenv==0.9.9
+ faiss-cpu==1.11.0.post1
+ fast_pytorch_kmeans==0.2.2
+ fastapi==0.116.1
+ filelock==3.18.0
+ filetype==1.2.0
+ Flask==3.1.1
+ frozenlist==1.7.0
+ fsspec==2025.3.0
+ git-python==1.0.3
+ gitdb==4.0.12
+ GitPython==3.1.45
+ greenlet==3.2.3
+ griffe==1.9.0
+ groq==0.30.0
+ h11==0.16.0
+ hf-xet==1.1.5
+ httpcore==1.0.9
+ httpx==0.28.1
+ huggingface-hub==0.34.3
+ idna==3.10
+ itsdangerous==2.2.0
+ Jinja2==3.1.6
+ jiter==0.10.0
+ joblib==1.5.1
+ jsonpatch==1.33
+ jsonpointer==3.0.0
+ langchain==0.3.27
+ langchain-core==0.3.72
+ langchain-pymupdf4llm==0.4.1
+ langchain-text-splitters==0.3.9
+ langsmith==0.4.8
+ llama-cloud==0.1.35
+ llama-cloud-services==0.6.53
+ llama-index==0.12.52
+ llama-index-agent-openai==0.4.12
+ llama-index-cli==0.4.4
+ llama-index-core==0.12.52.post1
+ llama-index-embeddings-openai==0.3.1
+ llama-index-indices-managed-llama-cloud==0.8.0
+ llama-index-instrumentation==0.3.1
+ llama-index-llms-openai==0.4.7
+ llama-index-multi-modal-llms-openai==0.5.3
+ llama-index-program-openai==0.3.2
+ llama-index-question-gen-openai==0.3.1
+ llama-index-readers-file==0.4.11
+ llama-index-readers-llama-parse==0.4.0
+ llama-index-workflows==1.2.0
+ llama-parse==0.6.53
+ MarkupSafe==3.0.2
+ marshmallow==3.26.1
+ mpmath==1.3.0
+ multidict==6.6.3
+ multiprocess==0.70.16
+ mypy_extensions==1.1.0
+ nest-asyncio==1.6.0
+ networkx==3.5
+ ninja==1.11.1.4
+ nltk==3.9.1
+ numpy==2.3.2
+ onnx==1.18.0
+ openai==1.98.0
+ orjson==3.11.1
+ packaging==25.0
+ pandas==2.2.3
+ pillow==11.3.0
+ platformdirs==4.3.8
+ propcache==0.3.2
+ protobuf==6.31.1
+ psutil==7.0.0
+ pyarrow==21.0.0
+ pydantic==2.11.7
+ pydantic_core==2.33.2
+ PyMuPDF==1.26.3
+ pymupdf4llm==0.0.27
+ pypdf==5.9.0
+ python-dateutil==2.9.0.post0
+ python-dotenv==1.1.1
+ pytz==2025.2
+ PyYAML==6.0.2
+ RAGatouille==0.0.9.post2
+ regex==2025.7.33
+ requests==2.32.4
+ requests-toolbelt==1.0.0
+ safetensors==0.5.3
+ scikit-learn==1.7.1
+ scipy==1.16.1
+ sentence-transformers==5.0.0
+ setuptools==80.9.0
+ six==1.17.0
+ smmap==5.0.2
+ sniffio==1.3.1
+ soupsieve==2.7
+ SQLAlchemy==2.0.42
+ srsly==2.5.1
+ starlette==0.47.2
+ striprtf==0.0.26
+ sympy==1.14.0
+ tenacity==9.1.2
+ threadpoolctl==3.6.0
+ tiktoken==0.9.0
+ tokenizers==0.21.4
+ torch==2.7.1
+ tqdm==4.67.1
+ transformers==4.49.0
+ typing==3.7.4.3
+ typing-inspect==0.9.0
+ typing-inspection==0.4.1
+ typing_extensions==4.14.1
+ tzdata==2025.2
+ ujson==5.10.0
+ urllib3==2.5.0
+ uvicorn==0.35.0
+ voyager==2.1.0
+ Werkzeug==3.1.3
+ wrapt==1.17.2
+ xxhash==3.5.0
+ yarl==1.20.1
+ zstandard==0.23.0