Update document_processor.py
document_processor.py  +170 -36
CHANGED
@@ -1,4 +1,7 @@
+# document_processor.py
 import time
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
 from typing import List, Dict, Any
 from chunker import DocumentChunker
 from summarizer import DocumentSummarizer
@@ -13,6 +16,7 @@ class DocumentProcessor:
         self.risk_detector = None
         self.clause_tagger = None
         self.cache = {}  # Simple in-memory cache
+        self.executor = ThreadPoolExecutor(max_workers=3)  # For CPU-bound parallel tasks
 
     async def initialize(self):
         """Initialize all components"""
@@ -23,14 +27,18 @@ class DocumentProcessor:
         self.risk_detector = RiskDetector()
         self.clause_tagger = ClauseTagger()
 
-        # Initialize models
-
-
+        # Initialize models in parallel for faster startup
+        init_tasks = [
+            self.summarizer.initialize(),
+            self.clause_tagger.initialize()
+        ]
+
+        await asyncio.gather(*init_tasks)
 
         print("✅ Document Processor initialized")
 
-    async def process_document(self, text: str, doc_id: str) -> Dict[str, Any]:
-        """Process document
+    async def process_document(self, text: str, doc_id: str) -> tuple[Dict[str, Any], List[Dict]]:
+        """Process document with optimized single embedding generation"""
 
         # Check cache first
         if doc_id in self.cache:
@@ -38,42 +46,168 @@
             return self.cache[doc_id]
 
         print(f"📄 Processing new document: {doc_id}")
+        start_time = time.time()
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        try:
+            # Step 1: Chunk the document (fast, do this first)
+            chunks = self.chunker.chunk_by_tokens(text, max_tokens=1600, stride=50)
+            print(f"📦 Created {len(chunks)} chunks in {time.time() - start_time:.2f}s")
+
+            # Step 2: Generate embeddings ONCE for all chunks
+            print(f"🧠 Generating embeddings for {len(chunks)} chunks...")
+            embedding_start = time.time()
+
+            # Generate all embeddings in one batch
+            if self.clause_tagger.embedding_model:
+                chunk_embeddings = self.clause_tagger.embedding_model.encode(chunks)
+                embedding_time = time.time() - embedding_start
+                print(f"✅ Generated embeddings in {embedding_time:.2f}s")
+
+                # Store embeddings for reuse
+                chunk_data = [
+                    {"text": chunk, "embedding": embedding}
+                    for chunk, embedding in zip(chunks, chunk_embeddings)
+                ]
+            else:
+                chunk_data = [{"text": chunk, "embedding": None} for chunk in chunks]
+                embedding_time = 0
+                print("⚠️ No embedding model available")
+
+            # Step 3: Run analysis tasks in parallel using pre-computed embeddings
+            tasks = []
+
+            # Parallel task 1: Batch summarization (async)
+            summary_task = asyncio.create_task(
+                self.summarizer.batch_summarize(chunks)
+            )
+            tasks.append(('summary', summary_task))
+
+            # Parallel task 2: Risk detection (CPU-bound, run in thread pool)
+            risk_task = asyncio.get_event_loop().run_in_executor(
+                self.executor,
+                self.risk_detector.detect_risks,
+                chunks
+            )
+            tasks.append(('risks', risk_task))
+
+            # Parallel task 3: Clause tagging using pre-computed embeddings
+            if self.clause_tagger.embedding_model and chunk_data[0]["embedding"] is not None:
+                clause_task = asyncio.create_task(
+                    self.clause_tagger.tag_clauses_with_embeddings(chunk_data)
+                )
+                tasks.append(('clauses', clause_task))
+
+            print(f"🚀 Starting {len(tasks)} parallel analysis tasks...")
+
+            # Wait for all tasks to complete with progress tracking
+            results = {}
+            for task_name, task in tasks:
+                try:
+                    print(f"⏳ Waiting for {task_name} analysis...")
+                    results[task_name] = await task
+                    print(f"✅ {task_name} completed")
+                except Exception as e:
+                    print(f"⚠️ {task_name} analysis failed: {e}")
+                    # Provide fallback results
+                    if task_name == 'summary':
+                        results[task_name] = "Summary generation failed"
+                    elif task_name == 'risks':
+                        results[task_name] = []
+                    elif task_name == 'clauses':
+                        results[task_name] = []
+
+            # Combine results
+            processing_time = time.time() - start_time
+            result = {
+                "summary": results.get('summary', 'Summary not available'),
+                "risky_terms": results.get('risks', []),
+                "key_clauses": results.get('clauses', []),
+                "chunk_count": len(chunks),
+                "processing_time": f"{processing_time:.2f}s",
+                "embedding_time": f"{embedding_time:.2f}s",
+                "embeddings_generated": len(chunk_embeddings) if 'chunk_embeddings' in locals() else 0,
+                "doc_id": doc_id,
+                "parallel_tasks_completed": len([r for r in results.values() if r])
+            }
+
+            # Cache the result
+            cached_data = (result, chunk_data)
+            self.cache[doc_id] = cached_data
+            print(f"🎉 Document processing completed in {processing_time:.2f}s")
+
+            return result, chunk_data
+
+        except Exception as e:
+            error_time = time.time() - start_time
+            print(f"❌ Document processing failed after {error_time:.2f}s: {e}")
+
+            # Return error result
+            error_result = {
+                "error": str(e),
+                "summary": "Processing failed",
+                "risky_terms": [],
+                "key_clauses": [],
+                "chunk_count": 0,
+                "processing_time": f"{error_time:.2f}s",
+                "doc_id": doc_id
+            }
+
+            return error_result, []
 
     def chunk_text(self, data: ChunkInput) -> Dict[str, Any]:
         """Standalone chunking endpoint"""
         start = time.time()
-
-
-
-
-
-
+        try:
+            chunks = self.chunker.chunk_by_tokens(data.text, data.max_tokens, data.stride)
+            return {
+                "chunks": chunks,
+                "chunk_count": len(chunks),
+                "time_taken": f"{time.time() - start:.2f}s",
+                "status": "success"
+            }
+        except Exception as e:
+            return {
+                "error": str(e),
+                "chunks": [],
+                "chunk_count": 0,
+                "time_taken": f"{time.time() - start:.2f}s",
+                "status": "failed"
+            }
 
     def summarize_batch(self, data: SummarizeBatchInput) -> Dict[str, Any]:
         """Standalone batch summarization endpoint"""
-
+        start = time.time()
+        try:
+            result = self.summarizer.summarize_texts_sync(data.texts, data.max_length, data.min_length)
+            result["time_taken"] = f"{time.time() - start:.2f}s"
+            result["status"] = "success"
+            return result
+        except Exception as e:
+            return {
+                "error": str(e),
+                "summary": "Summarization failed",
+                "time_taken": f"{time.time() - start:.2f}s",
+                "status": "failed"
+            }
+
+    def get_cache_stats(self) -> Dict[str, Any]:
+        """Get cache statistics for monitoring"""
+        return {
+            "cached_documents": len(self.cache),
+            "cache_keys": list(self.cache.keys())
+        }
+
+    def clear_cache(self) -> Dict[str, str]:
+        """Clear the document cache"""
+        cleared_count = len(self.cache)
+        self.cache.clear()
+        return {
+            "message": f"Cleared {cleared_count} cached documents",
+            "status": "success"
+        }
+
+    def __del__(self):
+        """Cleanup thread pool on destruction"""
+        if hasattr(self, 'executor'):
+            self.executor.shutdown(wait=True)
+
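Below is a minimal usage sketch of the updated async flow, assuming DocumentProcessor takes no constructor arguments and that the repo's chunker, summarizer, risk_detector, and clause_tagger modules are importable; the driver script name, sample text, and doc_id are illustrative and not part of this commit.

# usage_sketch.py — hypothetical driver for the updated DocumentProcessor (not part of this commit)
import asyncio

from document_processor import DocumentProcessor

async def main() -> None:
    processor = DocumentProcessor()      # assumes a no-argument constructor
    await processor.initialize()         # summarizer and clause tagger now initialize in parallel

    text = "This Agreement may be terminated by either party upon thirty (30) days written notice..."

    # process_document now returns a (result, chunk_data) tuple; chunk_data carries
    # the pre-computed embeddings so later stages can reuse them.
    result, chunk_data = await processor.process_document(text, doc_id="contract-001")

    print(result["summary"])
    print(result["processing_time"], result["embedding_time"])
    print(processor.get_cache_stats())   # e.g. {"cached_documents": 1, "cache_keys": ["contract-001"]}

if __name__ == "__main__":
    asyncio.run(main())

A second call with the same doc_id would hit the in-memory cache and return the stored (result, chunk_data) tuple without re-running the pipeline.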