Spaces:

0edon
/

test

Paused

App Files Community

christopher committed on May 6

Commit

e113735

1 Parent(s): 95f7578

query threads

Browse files

Files changed (1) hide show

database/query_processor.py +37 -14

database/query_processor.py CHANGED Viewed

@@ -4,6 +4,8 @@ import numpy as np
 from models.LexRank import degree_centrality_scores
 import logging
 from datetime import datetime as dt
 logger = logging.getLogger(__name__)
@@ -13,6 +15,7 @@ class QueryProcessor:
         self.summarization_model = summarization_model
         self.nlp_model = nlp_model
         self.db_service = db_service
         logger.info("QueryProcessor initialized")
     async def process(
@@ -23,33 +26,33 @@ class QueryProcessor:
         end_date: Optional[str] = None
     ) -> Dict[str, Any]:
         try:
-            # Date handling
             start_dt = self._parse_date(start_date) if start_date else None
             end_dt = self._parse_date(end_date) if end_date else None
-            # Query processing
-            query_embedding = self.embedding_model.encode(query).tolist()
             logger.debug(f"Generated embedding for query: {query[:50]}...")
-            # Entity extraction
-            entities = self.nlp_model.extract_entities(query)
             logger.debug(f"Extracted entities: {entities}")
-            # Database search
             articles = await self._execute_semantic_search(
                 query_embedding,
                 start_dt,
                 end_dt,
                 topic,
-                [ent[0] for ent in entities]  # Just the entity texts
             )
             if not articles:
                 logger.info("No articles found matching criteria")
                 return {"message": "No articles found", "articles": []}
-            # Summary generation
-            summary_data = self._generate_summary(articles)
             return {
                 "summary": summary_data["summary"],
@@ -91,14 +94,35 @@ class QueryProcessor:
             logger.error(f"Semantic search failed: {str(e)}")
             raise
-    def _generate_summary(self, articles: List[Dict[str, Any]]) -> Dict[str, Any]:
-        """Generate summary from articles with fallback handling"""
         try:
             # Extract and process content
             sentences = []
             for article in articles:
                 if content := article.get("content"):
-                    sentences.extend(self.nlp_model.tokenize_sentences(content))
             if not sentences:
                 logger.warning("No sentences available for summarization")
@@ -107,12 +131,11 @@ class QueryProcessor:
                     "key_sentences": []
                 }
-            # Generate summary
             embeddings = self.embedding_model.encode(sentences)
             similarity_matrix = np.inner(embeddings, embeddings)
             centrality_scores = degree_centrality_scores(similarity_matrix, threshold=None)
-            # Get top 10 most central sentences
             top_indices = np.argsort(-centrality_scores)[:10]
             key_sentences = [sentences[idx].strip() for idx in top_indices]

 from models.LexRank import degree_centrality_scores
 import logging
 from datetime import datetime as dt
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
 logger = logging.getLogger(__name__)
         self.summarization_model = summarization_model
         self.nlp_model = nlp_model
         self.db_service = db_service
+        self.executor = ThreadPoolExecutor(max_workers=4)  # For CPU-bound tasks
         logger.info("QueryProcessor initialized")
     async def process(
         end_date: Optional[str] = None
     ) -> Dict[str, Any]:
         try:
+            # Date handling (sync but fast)
             start_dt = self._parse_date(start_date) if start_date else None
             end_dt = self._parse_date(end_date) if end_date else None
+            # Async query processing
+            query_embedding = await self._async_encode(query)
             logger.debug(f"Generated embedding for query: {query[:50]}...")
+            # Entity extraction (sync but fast)
+            entities = await asyncio.to_thread(self.nlp_model.extract_entities, query)
             logger.debug(f"Extracted entities: {entities}")
+            # Async database search
             articles = await self._execute_semantic_search(
                 query_embedding,
                 start_dt,
                 end_dt,
                 topic,
+                [ent[0] for ent in entities]
             )
             if not articles:
                 logger.info("No articles found matching criteria")
                 return {"message": "No articles found", "articles": []}
+            # Async summary generation
+            summary_data = await self._async_generate_summary(articles)
             return {
                 "summary": summary_data["summary"],
             logger.error(f"Semantic search failed: {str(e)}")
             raise
+    async def _async_encode(self, text: str) -> List[float]:
+        """Run embedding in thread pool"""
+        loop = asyncio.get_running_loop()
+        return await loop.run_in_executor(
+            self.executor,
+            lambda: self.embedding_model.encode(text).tolist()
+        )
+    async def _async_generate_summary(self, articles: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Run summary generation in thread pool"""
+        loop = asyncio.get_running_loop()
+        return await loop.run_in_executor(
+            self.executor,
+            lambda: self._sync_generate_summary(articles)
+        )
+    def _sync_generate_summary(self, articles: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Synchronous version for thread pool execution"""
         try:
             # Extract and process content
             sentences = []
             for article in articles:
                 if content := article.get("content"):
+                    sentences.extend(
+                        asyncio.run_coroutine_threadsafe(
+                            asyncio.to_thread(self.nlp_model.tokenize_sentences, content),
+                            loop=asyncio.get_event_loop()
+                        ).result()
+                    )
             if not sentences:
                 logger.warning("No sentences available for summarization")
                     "key_sentences": []
                 }
+            # CPU-intensive operations
             embeddings = self.embedding_model.encode(sentences)
             similarity_matrix = np.inner(embeddings, embeddings)
             centrality_scores = degree_centrality_scores(similarity_matrix, threshold=None)
             top_indices = np.argsort(-centrality_scores)[:10]
             key_sentences = [sentences[idx].strip() for idx in top_indices]