chaaim123 committed on
Commit
f1a2317
·
verified ·
1 Parent(s): 4c0e6c8

Create retriever/document_manager.py

Browse files
Files changed (1) hide show
  1. retriever/document_manager.py +121 -0
retriever/document_manager.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ from typing import Any, Dict, List
4
+ import uuid
5
+ from data.document_loader import DocumentLoader
6
+ from data.pdf_reader import PDFReader
7
+ from retriever.chunk_documents import chunk_documents
8
+ from retriever.vector_store_manager import VectorStoreManager
9
+
10
class DocumentManager:
    """Track uploaded PDF documents and query their chunks through the vector store.

    Every registry on this class is keyed by the uploaded file's basename, so
    two uploads with the same basename share one entry (the later upload wins).
    """

    def __init__(self):
        """Create the loader, PDF reader and vector store, and the empty registries."""
        self.doc_loader = DocumentLoader()
        self.pdf_reader = PDFReader()
        self.vector_manager = VectorStoreManager()
        self.uploaded_documents = {}  # filename -> path returned by the loader
        self.chunked_documents = {}  # filename -> chunks returned by chunk_documents
        self.document_ids = {}  # filename -> generated UUID string
        logging.info("DocumentManager initialized")

    def process_document(self, file, *, chunk_size: int = 2000, chunk_overlap: int = 300) -> tuple:
        """Load an uploaded PDF, chunk its pages, and index the chunks.

        Args:
            file: Upload handle passed unchanged to ``DocumentLoader.load_file``;
                ``None`` means nothing was uploaded.
            chunk_size: Forwarded to ``chunk_documents`` (default 2000).
            chunk_overlap: Forwarded to ``chunk_documents`` (default 300).

        Returns:
            A ``(status_message, filename, doc_id)`` 3-tuple on every path.
            ``filename`` and ``doc_id`` are ``None`` when nothing was uploaded
            or when processing failed; this method does not raise for
            processing errors, it reports them in ``status_message``.
        """
        if file is None:
            return "No file uploaded", None, None

        try:
            logging.info(f"Processing file: {file}")

            # Load and validate the file, then read its pages.
            file_path = self.doc_loader.load_file(file)
            filename = os.path.basename(file_path)
            page_list = self.pdf_reader.read_pdf(file_path)

            # A fresh ID per upload ties the chunks to this document in the store.
            doc_id = str(uuid.uuid4())
            chunks = chunk_documents(
                page_list, doc_id, chunk_size=chunk_size, chunk_overlap=chunk_overlap
            )
            self.vector_manager.add_documents(chunks)
            message = f"Successfully loaded {filename} with {len(page_list)} pages"

            # Register the document only after indexing succeeded, so a failed
            # upload never shows up as a selectable document without chunks.
            # NOTE(review): re-uploading the same basename replaces doc_id here,
            # but chunks stored under the previous ID are not removed from the
            # vector store -- confirm whether VectorStoreManager offers a delete.
            self.uploaded_documents[filename] = file_path
            self.document_ids[filename] = doc_id
            self.chunked_documents[filename] = chunks

            return message, filename, doc_id

        except Exception as e:
            # Boundary handler: report the failure to the caller as a status
            # string, but keep the traceback in the log.
            logging.exception(f"Error processing document: {str(e)}")
            return f"Error: {str(e)}", None, None

    def get_uploaded_documents(self):
        """Return the filenames of all successfully processed documents."""
        return list(self.uploaded_documents)

    def get_chunks(self, filename):
        """Return the chunks stored for ``filename``, or ``[]`` if it is unknown."""
        return self.chunked_documents.get(filename, [])

    def get_document_id(self, filename):
        """Return the document ID for ``filename``, or ``None`` if it is unknown."""
        return self.document_ids.get(filename)

    def retrieve_top_k(self, query: str, selected_docs: List[str], k: int = 5) -> List[Dict[str, Any]]:
        """Return the best ``k`` chunks for ``query`` across the selected documents.

        Each selected document is searched separately for up to ``k`` chunks;
        the per-document results are merged, sorted by ``'score'`` (highest
        first) and truncated to ``k``.

        Args:
            query: The user's query.
            selected_docs: Filenames to search. Duplicates are searched once;
                filenames without a known document ID are skipped with a warning.
            k: Maximum number of results to return (default 5). A value of 0
                or less yields an empty list.

        Returns:
            Up to ``k`` result dicts as returned by the vector store's
            ``search``; each is read here through its ``'score'``, ``'text'``
            and ``'metadata'`` keys.
        """
        if not selected_docs:
            logging.warning("No documents selected for retrieval")
            return []
        if k <= 0:
            logging.warning(f"Non-positive k requested for retrieval: {k}")
            return []

        all_results = []
        # dict.fromkeys de-duplicates while keeping the caller's order, so a
        # filename selected twice does not contribute the same chunks twice.
        for filename in dict.fromkeys(selected_docs):
            doc_id = self.get_document_id(filename)
            if not doc_id:
                logging.warning(f"No document ID found for filename: {filename}")
                continue

            # Search for relevant chunks within this document only.
            all_results.extend(self.vector_manager.search(query, doc_id, k=k))

        # Assumes a larger score means a better match -- TODO confirm against
        # the metric used by VectorStoreManager.search.
        all_results.sort(key=lambda x: x['score'], reverse=True)
        top_k_results = all_results[:k]

        # Build the doc_id -> filename map once instead of scanning per result.
        filename_by_id = {d_id: name for name, d_id in self.document_ids.items()}
        logging.info(f"Retrieved top {k} documents:")
        for i, result in enumerate(top_k_results, 1):
            doc_id = result['metadata'].get('doc_id', 'Unknown')
            filename = filename_by_id.get(doc_id, 'Unknown')
            logging.info(f"{i}. Filename: {filename}, Doc ID: {doc_id}, Score: {result['score']:.4f}, Text: {result['text'][:200]}...")

        return top_k_results

    def retrieve_summary_chunks(self, query: str, doc_id: str, k: int = 10):
        """Return up to ``k`` chunks of one document for building a summary.

        Args:
            query: Query text passed to the vector store search.
            doc_id: ID of the document to search within.
            k: Maximum number of chunks to return (default 10).

        Returns:
            The first ``k`` results of ``VectorStoreManager.search`` for this
            document, in the order the search returned them.
        """
        logging.info(f"Retrieving {k} chunks for summary: {query}, Document Id: {doc_id}")
        results = self.vector_manager.search(query, doc_id, k=k)
        # Slice defensively in case the store returns more than requested.
        top_k_results = results[:k]
        logging.info(f"Retrieved {len(top_k_results)} chunks for summary")

        return top_k_results