sagar008 committed on
Commit 113cf8e · verified · 1 Parent(s): 2051f5b

Update main.py

Files changed (1)
  1. main.py +88 -8
main.py CHANGED
@@ -1,4 +1,4 @@
-# main.py (HF Space FastAPI)
+# main.py (HF Space FastAPI) - UPDATED with doc_id alignment
 from contextlib import asynccontextmanager
 from fastapi import FastAPI
 from document_processor import DocumentProcessor
@@ -35,15 +35,20 @@ app = FastAPI(
 
 @app.post("/analyze_document")
 async def analyze_document(data: AnalyzeDocumentInput):
-    """Unified endpoint for complete document analysis WITH optimized vector storage"""
+    """Unified endpoint for complete document analysis WITH doc_id alignment"""
     try:
         start_time = time.time()
 
         if not data.document_text:
             return {"error": "No document text provided"}
 
-        # Generate document ID
-        doc_id = hashlib.sha256(data.document_text.encode()).hexdigest()[:16]
+        # ⭐ Use forced doc_id if provided (from Flask), otherwise generate from text
+        if data.force_doc_id:
+            doc_id = data.force_doc_id
+            print(f"🔧 Using Flask-provided doc_id: {doc_id}")
+        else:
+            doc_id = hashlib.sha256(data.document_text.encode()).hexdigest()[:16]
+            print(f"🔧 Generated new doc_id: {doc_id}")
 
         # Process document completely with pre-computed embeddings
         result, chunk_data = await processor.process_document(data.document_text, doc_id)
@@ -52,7 +57,7 @@ async def analyze_document(data: AnalyzeDocumentInput):
         try:
             success = vector_store.save_document_embeddings_optimized(
                 chunk_data=chunk_data,
-                document_id=doc_id,
+                document_id=doc_id,  # Use the aligned doc_id
                 analysis_results=result
             )
             if success:
@@ -69,6 +74,7 @@ async def analyze_document(data: AnalyzeDocumentInput):
 
         processing_time = time.time() - start_time
         result["total_processing_time"] = f"{processing_time:.2f}s"
+        result["doc_id"] = doc_id  # ⭐ Ensure doc_id is returned
 
         return result
 
@@ -107,6 +113,9 @@ async def chat_with_document(data: ChatInput):
         if not data.message or not data.document_id:
             return {"error": "Message and document_id are required"}
 
+        print(f"🔍 Processing chat for doc_id: {data.document_id}")
+        print(f"📝 User question: {data.message}")
+
         # Get retriever for specific document
         retriever = vector_store.get_retriever(
             clause_tagger=processor.clause_tagger,
@@ -123,9 +132,12 @@ async def chat_with_document(data: ChatInput):
             return {
                 "response": "I couldn't find relevant information in the document to answer your question.",
                 "sources": [],
-                "document_id": data.document_id
+                "document_id": data.document_id,
+                "chunks_used": 0
             }
 
+        print(f"📊 Found {len(relevant_chunks)} relevant chunks")
+
         # Prepare context from relevant chunks
         context = "\n\n".join([doc.page_content for doc in relevant_chunks])
 
@@ -153,8 +165,75 @@ async def chat_with_document(data: ChatInput):
         }
 
     except Exception as e:
+        print(f"❌ Chat error: {e}")
         return {"error": f"Chat failed: {str(e)}"}
 
+@app.get("/debug_pinecone/{document_id}")
+async def debug_pinecone_storage(document_id: str):
+    """Debug what's actually stored in Pinecone for a document"""
+    try:
+        # Initialize Pinecone
+        vector_store._initialize_pinecone()
+        index = vector_store.pc.Index(vector_store.index_name)
+
+        # Query Pinecone directly for this document
+        query_response = index.query(
+            vector=[0.0] * 768,  # Dummy query vector
+            filter={"document_id": document_id},
+            top_k=10,
+            include_metadata=True
+        )
+
+        return {
+            "document_id": document_id,
+            "pinecone_index": vector_store.index_name,
+            "vectors_found": len(query_response.matches),
+            "index_stats": index.describe_index_stats(),
+            "matches": [
+                {
+                    "id": match.id,
+                    "score": match.score,
+                    "metadata": match.metadata
+                }
+                for match in query_response.matches[:3]
+            ]
+        }
+
+    except Exception as e:
+        return {"error": f"Pinecone debug failed: {str(e)}"}
+
+@app.post("/debug_retrieval")
+async def debug_retrieval(data: ChatInput):
+    """Debug endpoint to see what chunks are available for a document"""
+    try:
+        retriever = vector_store.get_retriever(
+            clause_tagger=processor.clause_tagger,
+            document_id=data.document_id
+        )
+
+        if not retriever:
+            return {"error": "Failed to create retriever"}
+
+        # Get all chunks for this document (no similarity filtering)
+        all_chunks = retriever.get_relevant_documents(data.message)
+
+        return {
+            "document_id": data.document_id,
+            "query": data.message,
+            "total_chunks_found": len(all_chunks),
+            "chunks": [
+                {
+                    "chunk_index": doc.metadata.get("chunk_index", 0),
+                    "text_preview": doc.page_content[:200] + "..." if len(doc.page_content) > 200 else doc.page_content,
+                    "metadata": doc.metadata
+                }
+                for doc in all_chunks[:5]  # Show first 5 chunks
+            ]
+        }
+
+    except Exception as e:
+        return {"error": f"Debug failed: {str(e)}"}
+
 # Keep backward compatibility endpoints
 @app.post("/chunk")
 def chunk_text(data: ChunkInput):
@@ -172,7 +251,9 @@ def health_check():
         "document_processor": "active",
         "vector_store": "active",
         "gemini_llm": "active"
-        }
+        },
+        "pinecone_index": vector_store.index_name,
+        "embedding_model": "InLegalBERT"
     }
 
 @app.get("/cache_stats")
@@ -182,4 +263,3 @@ def get_cache_stats():
 if __name__ == "__main__":
     import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=7860)
-
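
For this change to run, AnalyzeDocumentInput needs an optional force_doc_id field; the request models themselves are not part of this diff. A minimal sketch of what they would need to look like, inferred from the attribute accesses in the endpoints (field shapes here are assumptions, not the committed code):

# Sketch of the request models this diff assumes (not part of the commit).
# Field names mirror the attribute accesses above: data.document_text,
# data.force_doc_id, data.message, data.document_id.
from typing import Optional
from pydantic import BaseModel

class AnalyzeDocumentInput(BaseModel):
    document_text: str
    force_doc_id: Optional[str] = None  # set by the Flask frontend to align IDs

class ChatInput(BaseModel):
    message: str
    document_id: str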
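The point of the alignment is that the caller and the Space agree on one document key. A sketch of the intended client flow, assuming the chat route is /chat_with_document and using a placeholder Space URL:

# Illustrative client flow for the doc_id alignment (hypothetical URL;
# the route name for chat is assumed, /analyze_document is from the diff).
import hashlib
import requests

BASE = "https://example-space.hf.space"  # placeholder for the actual Space URL

text = open("contract.txt").read()
doc_id = hashlib.sha256(text.encode()).hexdigest()[:16]  # same scheme as the server fallback

# Analyze with a forced doc_id so the caller and the Space share the key
analysis = requests.post(
    f"{BASE}/analyze_document",
    json={"document_text": text, "force_doc_id": doc_id},
).json()
assert analysis.get("doc_id") == doc_id  # the endpoint now echoes the id back

# Chat against the same document using the aligned id
reply = requests.post(
    f"{BASE}/chat_with_document",
    json={"message": "What is the termination clause?", "document_id": doc_id},
).json()
print(reply.get("response"))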
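The new /debug_pinecone endpoint relies on a zero-vector query: with a metadata filter applied, Pinecone returns up to top_k of the document's vectors regardless of similarity score, which makes it a cheap existence check for stored embeddings. A standalone sketch of the same trick with placeholder credentials and index name:

# Standalone version of the zero-vector metadata query (placeholders
# for the API key, index name, and document id; 768 matches the
# InLegalBERT embedding dimension used by the Space).
from pinecone import Pinecone

pc = Pinecone(api_key="YOUR_API_KEY")   # placeholder credential
index = pc.Index("legal-documents")     # placeholder index name

resp = index.query(
    vector=[0.0] * 768,                 # dummy vector; filter does the work
    filter={"document_id": "abc123def4567890"},
    top_k=10,
    include_metadata=True,
)
for match in resp.matches:
    print(match.id, match.metadata.get("chunk_index"))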