Update main.py
main.py (CHANGED)
```diff
@@ -1,4 +1,4 @@
-# main.py (HF Space FastAPI)
+# main.py (HF Space FastAPI) - UPDATED with doc_id alignment
 from contextlib import asynccontextmanager
 from fastapi import FastAPI
 from document_processor import DocumentProcessor
@@ -35,15 +35,20 @@ app = FastAPI(
 
 @app.post("/analyze_document")
 async def analyze_document(data: AnalyzeDocumentInput):
-    """Unified endpoint for complete document analysis WITH
+    """Unified endpoint for complete document analysis WITH doc_id alignment"""
     try:
         start_time = time.time()
 
         if not data.document_text:
             return {"error": "No document text provided"}
 
-        #
-
+        # ✅ Use forced doc_id if provided (from Flask), otherwise generate from text
+        if data.force_doc_id:
+            doc_id = data.force_doc_id
+            print(f"🔧 Using Flask-provided doc_id: {doc_id}")
+        else:
+            doc_id = hashlib.sha256(data.document_text.encode()).hexdigest()[:16]
+            print(f"🔧 Generated new doc_id: {doc_id}")
 
         # Process document completely with pre-computed embeddings
         result, chunk_data = await processor.process_document(data.document_text, doc_id)
```
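For context, a minimal sketch of the caller's side of this alignment: the Flask app can compute the same 16-hex-char SHA-256 prefix the Space uses as its fallback and pass it as `force_doc_id`, so both services refer to the document by one id. The base URL and the `analyze` helper below are assumptions for illustration, not code from this repo; the field names come from the diff.

```python
# Hypothetical Flask-side caller (not part of this commit)
import hashlib
import requests

SPACE_URL = "http://localhost:7860"  # assumed local HF Space address

def analyze(document_text: str) -> dict:
    # Same recipe as the Space's fallback: sha256 of the text, first 16 hex chars
    doc_id = hashlib.sha256(document_text.encode()).hexdigest()[:16]
    resp = requests.post(
        f"{SPACE_URL}/analyze_document",
        json={"document_text": document_text, "force_doc_id": doc_id},
        timeout=300,
    )
    result = resp.json()
    # The updated endpoint echoes the aligned id back (see the next hunk)
    assert result.get("doc_id") == doc_id
    return result
```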
```diff
@@ -52,7 +57,7 @@ async def analyze_document(data: AnalyzeDocumentInput):
         try:
             success = vector_store.save_document_embeddings_optimized(
                 chunk_data=chunk_data,
-                document_id=doc_id,
+                document_id=doc_id,  # Use the aligned doc_id
                 analysis_results=result
             )
             if success:
@@ -69,6 +74,7 @@ async def analyze_document(data: AnalyzeDocumentInput):
 
         processing_time = time.time() - start_time
         result["total_processing_time"] = f"{processing_time:.2f}s"
+        result["doc_id"] = doc_id  # ✅ Ensure doc_id is returned
 
         return result
 
```
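A quick round-trip check of the new response contract, building on the hypothetical `analyze` helper sketched above; `doc_id` and `total_processing_time` are the fields this diff returns:

```python
# Round-trip sanity check (hypothetical)
result = analyze("This agreement may be terminated by either party...")
print(result["doc_id"], result["total_processing_time"])
```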
```diff
@@ -107,6 +113,9 @@ async def chat_with_document(data: ChatInput):
         if not data.message or not data.document_id:
             return {"error": "Message and document_id are required"}
 
+        print(f"🔍 Processing chat for doc_id: {data.document_id}")
+        print(f"🔍 User question: {data.message}")
+
         # Get retriever for specific document
         retriever = vector_store.get_retriever(
             clause_tagger=processor.clause_tagger,
@@ -123,9 +132,12 @@ async def chat_with_document(data: ChatInput):
             return {
                 "response": "I couldn't find relevant information in the document to answer your question.",
                 "sources": [],
-                "document_id": data.document_id
+                "document_id": data.document_id,
+                "chunks_used": 0
             }
 
+        print(f"🔍 Found {len(relevant_chunks)} relevant chunks")
+
         # Prepare context from relevant chunks
         context = "\n\n".join([doc.page_content for doc in relevant_chunks])
 
@@ -153,5 +165,6 @@ async def chat_with_document(data: ChatInput):
         }
 
     except Exception as e:
+        print(f"❌ Chat error: {e}")
         return {"error": f"Chat failed: {str(e)}"}
 
```
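A hedged example of driving the chat endpoint: `message` and `document_id` are the `ChatInput` fields shown in this diff, while the route path and base URL are assumptions.

```python
# Hypothetical chat call; reuse the doc_id returned by /analyze_document
import requests

resp = requests.post(
    "http://localhost:7860/chat_with_document",  # route path assumed
    json={"message": "What are the termination terms?",
          "document_id": result["doc_id"]},  # from the analyze() sketch above
)
print(resp.json().get("response"))
```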
```diff
@@ -158,3 +171,69 @@ async def chat_with_document(data: ChatInput):
+@app.get("/debug_pinecone/{document_id}")
+async def debug_pinecone_storage(document_id: str):
+    """Debug what's actually stored in Pinecone for a document"""
+    try:
+        # Initialize Pinecone
+        vector_store._initialize_pinecone()
+        index = vector_store.pc.Index(vector_store.index_name)
+
+        # Query Pinecone directly for this document
+        query_response = index.query(
+            vector=[0.0] * 768,  # Dummy query vector
+            filter={"document_id": document_id},
+            top_k=10,
+            include_metadata=True
+        )
+
+        return {
+            "document_id": document_id,
+            "pinecone_index": vector_store.index_name,
+            "vectors_found": len(query_response.matches),
+            "index_stats": index.describe_index_stats(),
+            "matches": [
+                {
+                    "id": match.id,
+                    "score": match.score,
+                    "metadata": match.metadata
+                }
+                for match in query_response.matches[:3]
+            ]
+        }
+
+    except Exception as e:
+        return {"error": f"Pinecone debug failed: {str(e)}"}
+
+@app.post("/debug_retrieval")
+async def debug_retrieval(data: ChatInput):
+    """Debug endpoint to see what chunks are available for a document"""
+    try:
+        retriever = vector_store.get_retriever(
+            clause_tagger=processor.clause_tagger,
+            document_id=data.document_id
+        )
+
+        if not retriever:
+            return {"error": "Failed to create retriever"}
+
+        # Get all chunks for this document (no similarity filtering)
+        all_chunks = retriever.get_relevant_documents(data.message)
+
+        return {
+            "document_id": data.document_id,
+            "query": data.message,
+            "total_chunks_found": len(all_chunks),
+            "chunks": [
+                {
+                    "chunk_index": doc.metadata.get("chunk_index", 0),
+                    "text_preview": doc.page_content[:200] + "..." if len(doc.page_content) > 200 else doc.page_content,
+                    "metadata": doc.metadata
+                }
+                for doc in all_chunks[:5]  # Show first 5 chunks
+            ]
+        }
+
+    except Exception as e:
+        return {"error": f"Debug failed: {str(e)}"}
+
 # Keep backward compatibility endpoints
 @app.post("/chunk")
 def chunk_text(data: ChunkInput):
```
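A note on the `vector=[0.0] * 768` query: Pinecone requires a query vector, so the route sends a zero vector matching InLegalBERT's 768 dimensions and lets the `document_id` metadata filter do the real work; the similarity scores in the matches are not meaningful, only the ids and metadata are. A minimal sketch of exercising both new debug routes (base URL assumed):

```python
# Hypothetical smoke test for the two new debug routes
import requests

BASE = "http://localhost:7860"  # assumed
doc_id = "..."  # an id previously returned by /analyze_document

# What does Pinecone actually hold for this document?
stored = requests.get(f"{BASE}/debug_pinecone/{doc_id}").json()
print(stored["vectors_found"], "vectors in", stored["pinecone_index"])

# What would the retriever hand back for a given question?
chunks = requests.post(
    f"{BASE}/debug_retrieval",
    json={"message": "termination clause", "document_id": doc_id},
).json()
print(chunks["total_chunks_found"], "chunks retrievable")
```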
```diff
@@ -172,7 +251,9 @@ def health_check():
             "document_processor": "active",
             "vector_store": "active",
             "gemini_llm": "active"
-        }
+        },
+        "pinecone_index": vector_store.index_name,
+        "embedding_model": "InLegalBERT"
     }
 
 @app.get("/cache_stats")
@@ -182,4 +263,3 @@ def get_cache_stats():
 if __name__ == "__main__":
     import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=7860)
-
```
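For completeness, a hedged sketch of reading the expanded health payload; the `/health` path is inferred from the `health_check` handler name and is an assumption:

```python
# Hypothetical health probe; "pinecone_index" and "embedding_model"
# are the fields this commit adds to the response
import requests

health = requests.get("http://localhost:7860/health").json()
print(health["pinecone_index"], health["embedding_model"])  # e.g. "... InLegalBERT"
```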