Update main.py
main.py (CHANGED)
```diff
@@ -1,4 +1,4 @@
-# main.py (HF Space FastAPI)
+# main.py (HF Space FastAPI) - UPDATED with doc_id alignment
 from contextlib import asynccontextmanager
 from fastapi import FastAPI
 from document_processor import DocumentProcessor
@@ -35,15 +35,20 @@ app = FastAPI(
 
 @app.post("/analyze_document")
 async def analyze_document(data: AnalyzeDocumentInput):
-    """Unified endpoint for complete document analysis WITH
+    """Unified endpoint for complete document analysis WITH doc_id alignment"""
     try:
         start_time = time.time()
 
         if not data.document_text:
             return {"error": "No document text provided"}
 
-        #
-
+        # ✅ Use forced doc_id if provided (from Flask), otherwise generate from text
+        if data.force_doc_id:
+            doc_id = data.force_doc_id
+            print(f"🔧 Using Flask-provided doc_id: {doc_id}")
+        else:
+            doc_id = hashlib.sha256(data.document_text.encode()).hexdigest()[:16]
+            print(f"🔧 Generated new doc_id: {doc_id}")
 
         # Process document completely with pre-computed embeddings
         result, chunk_data = await processor.process_document(data.document_text, doc_id)
```
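For context, a minimal sketch of the caller's side of this alignment: the Flask app can compute the same 16-hex-char SHA-256 prefix the Space uses as its fallback and pass it as `force_doc_id`, so both services refer to the document by one id. The base URL and the `analyze` helper below are assumptions for illustration, not code from this repo; the field names come from the diff.

```python
# Hypothetical Flask-side caller (not part of this commit)
import hashlib
import requests

SPACE_URL = "http://localhost:7860"  # assumed local HF Space address

def analyze(document_text: str) -> dict:
    # Same recipe as the Space's fallback: sha256 of the text, first 16 hex chars
    doc_id = hashlib.sha256(document_text.encode()).hexdigest()[:16]
    resp = requests.post(
        f"{SPACE_URL}/analyze_document",
        json={"document_text": document_text, "force_doc_id": doc_id},
        timeout=300,
    )
    result = resp.json()
    # The updated endpoint echoes the aligned id back (see the next hunk)
    assert result.get("doc_id") == doc_id
    return result
```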
```diff
@@ -52,7 +57,7 @@ async def analyze_document(data: AnalyzeDocumentInput):
         try:
             success = vector_store.save_document_embeddings_optimized(
                 chunk_data=chunk_data,
-                document_id=doc_id,
+                document_id=doc_id,  # Use the aligned doc_id
                 analysis_results=result
             )
             if success:
@@ -69,6 +74,7 @@ async def analyze_document(data: AnalyzeDocumentInput):
 
         processing_time = time.time() - start_time
         result["total_processing_time"] = f"{processing_time:.2f}s"
+        result["doc_id"] = doc_id  # ✅ Ensure doc_id is returned
 
         return result
 
```
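A quick round-trip check of the new response contract, building on the hypothetical `analyze` helper sketched above; `doc_id` and `total_processing_time` are the fields this diff returns:

```python
# Round-trip sanity check (hypothetical)
result = analyze("This agreement may be terminated by either party...")
print(result["doc_id"], result["total_processing_time"])
```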
```diff
@@ -107,6 +113,9 @@ async def chat_with_document(data: ChatInput):
         if not data.message or not data.document_id:
             return {"error": "Message and document_id are required"}
 
+        print(f"🔍 Processing chat for doc_id: {data.document_id}")
+        print(f"🔍 User question: {data.message}")
+
         # Get retriever for specific document
         retriever = vector_store.get_retriever(
             clause_tagger=processor.clause_tagger,
@@ -123,9 +132,12 @@ async def chat_with_document(data: ChatInput):
             return {
                 "response": "I couldn't find relevant information in the document to answer your question.",
                 "sources": [],
-                "document_id": data.document_id
+                "document_id": data.document_id,
+                "chunks_used": 0
             }
 
+        print(f"🔍 Found {len(relevant_chunks)} relevant chunks")
+
         # Prepare context from relevant chunks
         context = "\n\n".join([doc.page_content for doc in relevant_chunks])
 
@@ -153,5 +165,6 @@ async def chat_with_document(data: ChatInput):
         }
 
     except Exception as e:
+        print(f"❌ Chat error: {e}")
         return {"error": f"Chat failed: {str(e)}"}
 
```
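A hedged example of driving the chat endpoint: `message` and `document_id` are the `ChatInput` fields shown in this diff, while the route path and base URL are assumptions.

```python
# Hypothetical chat call; reuse the doc_id returned by /analyze_document
import requests

resp = requests.post(
    "http://localhost:7860/chat_with_document",  # route path assumed
    json={"message": "What are the termination terms?",
          "document_id": result["doc_id"]},  # from the analyze() sketch above
)
print(resp.json().get("response"))
```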
```diff
@@ -158,3 +171,69 @@ async def chat_with_document(data: ChatInput):
+@app.get("/debug_pinecone/{document_id}")
+async def debug_pinecone_storage(document_id: str):
+    """Debug what's actually stored in Pinecone for a document"""
+    try:
+        # Initialize Pinecone
+        vector_store._initialize_pinecone()
+        index = vector_store.pc.Index(vector_store.index_name)
+
+        # Query Pinecone directly for this document
+        query_response = index.query(
+            vector=[0.0] * 768,  # Dummy query vector
+            filter={"document_id": document_id},
+            top_k=10,
+            include_metadata=True
+        )
+
+        return {
+            "document_id": document_id,
+            "pinecone_index": vector_store.index_name,
+            "vectors_found": len(query_response.matches),
+            "index_stats": index.describe_index_stats(),
+            "matches": [
+                {
+                    "id": match.id,
+                    "score": match.score,
+                    "metadata": match.metadata
+                }
+                for match in query_response.matches[:3]
+            ]
+        }
+
+    except Exception as e:
+        return {"error": f"Pinecone debug failed: {str(e)}"}
+
+@app.post("/debug_retrieval")
+async def debug_retrieval(data: ChatInput):
+    """Debug endpoint to see what chunks are available for a document"""
+    try:
+        retriever = vector_store.get_retriever(
+            clause_tagger=processor.clause_tagger,
+            document_id=data.document_id
+        )
+
+        if not retriever:
+            return {"error": "Failed to create retriever"}
+
+        # Get all chunks for this document (no similarity filtering)
+        all_chunks = retriever.get_relevant_documents(data.message)
+
+        return {
+            "document_id": data.document_id,
+            "query": data.message,
+            "total_chunks_found": len(all_chunks),
+            "chunks": [
+                {
+                    "chunk_index": doc.metadata.get("chunk_index", 0),
+                    "text_preview": doc.page_content[:200] + "..." if len(doc.page_content) > 200 else doc.page_content,
+                    "metadata": doc.metadata
+                }
+                for doc in all_chunks[:5]  # Show first 5 chunks
+            ]
+        }
+
+    except Exception as e:
+        return {"error": f"Debug failed: {str(e)}"}
+
 # Keep backward compatibility endpoints
 @app.post("/chunk")
 def chunk_text(data: ChunkInput):
```
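A note on the `vector=[0.0] * 768` query: Pinecone requires a query vector, so the route sends a zero vector matching InLegalBERT's 768 dimensions and lets the `document_id` metadata filter do the real work; the similarity scores in the matches are not meaningful, only the ids and metadata are. A minimal sketch of exercising both new debug routes (base URL assumed):

```python
# Hypothetical smoke test for the two new debug routes
import requests

BASE = "http://localhost:7860"  # assumed
doc_id = "..."  # an id previously returned by /analyze_document

# What does Pinecone actually hold for this document?
stored = requests.get(f"{BASE}/debug_pinecone/{doc_id}").json()
print(stored["vectors_found"], "vectors in", stored["pinecone_index"])

# What would the retriever hand back for a given question?
chunks = requests.post(
    f"{BASE}/debug_retrieval",
    json={"message": "termination clause", "document_id": doc_id},
).json()
print(chunks["total_chunks_found"], "chunks retrievable")
```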
```diff
@@ -172,7 +251,9 @@ def health_check():
             "document_processor": "active",
             "vector_store": "active",
             "gemini_llm": "active"
-        }
+        },
+        "pinecone_index": vector_store.index_name,
+        "embedding_model": "InLegalBERT"
     }
 
 @app.get("/cache_stats")
@@ -182,4 +263,3 @@ def get_cache_stats():
 if __name__ == "__main__":
     import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=7860)
-
```
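For completeness, a hedged sketch of reading the expanded health payload; the `/health` path is inferred from the `health_check` handler name and is an assumption:

```python
# Hypothetical health probe; "pinecone_index" and "embedding_model"
# are the fields this commit adds to the response
import requests

health = requests.get("http://localhost:7860/health").json()
print(health["pinecone_index"], health["embedding_model"])  # e.g. "... InLegalBERT"
```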