Spaces:
Sleeping
Sleeping
| import json | |
| from typing import List, Tuple | |
| import numpy as np | |
| # from fastapi.responses import JSONResponse | |
| # from sentence_transformers import SentenceTransformer | |
| # from transformers import pipeline | |
| from app.db_local_storage.vector_files_db import vector_files_db | |
| from app.db_local_storage.files_db import VECTOR_FILES_DIRECTORY | |
| from app.db_local_storage.in_memory_db import query_response_storage | |
class QuerySearchFeature:
    """Answer user queries against locally stored document embeddings.

    Combines a lexical (substring) search and a semantic (embedding
    dot-product) search over the vector DB, then feeds the retrieved
    context to an extractive QA pipeline.
    """

    def __init__(self, model, qa_pipeline):
        # model: sentence-embedding model exposing .encode(list[str]) -> np.ndarray
        # qa_pipeline: extractive QA callable(question=..., context=...) -> {"answer": ...}
        self.model = model
        self.qa_pipeline = qa_pipeline

    async def query_search(self, query: str) -> dict:
        """Run the full retrieve-and-answer flow for one user query.

        Records the query and the generated answer in
        ``query_response_storage`` as a side effect.

        Returns a dict with the answer (``message``) and the context used.
        """
        user_query = {
            "text": query,
            "isSender": True,
        }
        query_response_storage.append(user_query)

        # NOTE(review): reads the in-memory DB directly; load_data() below
        # would read the on-disk copy instead.
        database = vector_files_db
        text_data, embeddings = await QuerySearchFeature.split_dataBase(database)

        lexical_results = await QuerySearchFeature.lexical_search(query, text_data)
        semantic_results = await QuerySearchFeature.semantic_search(
            query, text_data, embeddings, self.model
        )
        # Deduplicate while preserving retrieval order: dict keys keep
        # insertion order, whereas a plain set() made the context order
        # nondeterministic between runs.
        combined_results = list(dict.fromkeys(lexical_results + semantic_results))

        context = await QuerySearchFeature.get_context(combined_results)
        response = self.qa_pipeline(question=query, context=context)

        response_query = {
            "text": response["answer"],
            "isSender": False,
        }
        query_response_storage.append(response_query)

        return {
            "message": response["answer"],
            "context_used": context,
            "chunks": context,
        }

    @staticmethod
    async def semantic_search(
        query: str, chunks: List[str], embeddings: np.ndarray, model
    ) -> List[str]:
        """Return the top-3 chunks ranked by dot-product similarity to *query*.

        Dot product equals cosine similarity only if the stored embeddings
        are normalized — presumably they are; verify against the indexer.
        """
        query_embedding = model.encode([query])
        similarities = np.dot(embeddings, query_embedding.T).flatten()
        # argsort of the negated scores -> indices in descending similarity.
        top_indices = np.argsort(-similarities)[:3]
        return [chunks[i] for i in top_indices]

    @staticmethod
    async def lexical_search(query: str, chunks: List[str]) -> List[str]:
        """Return every chunk containing *query* as a case-insensitive substring."""
        needle = query.lower()  # hoisted: lower the query once, not per chunk
        return [chunk for chunk in chunks if needle in chunk.lower()]

    @staticmethod
    async def load_data():
        """Load the vector DB JSON from disk (currently unused by query_search)."""
        with open(VECTOR_FILES_DIRECTORY, "r", encoding="utf-8") as file:
            return json.load(file)

    @staticmethod
    async def split_dataBase(db) -> Tuple[List[str], np.ndarray]:
        """Flatten *db* into parallel lists: page texts and an embedding matrix.

        Expects ``db`` shaped like ``{name: {"data": [{"metadata":
        {"original_text": str}, "embedding": [float, ...]}, ...]}}``.
        """
        text_data = []
        embeddings = []
        for document in db.values():
            for page in document["data"]:
                text_data.append(page["metadata"]["original_text"])
                embeddings.append(page["embedding"])
        # Convert to ndarray to honor the annotation and so np.dot in
        # semantic_search operates on a real matrix, not a list of lists.
        return text_data, np.asarray(embeddings)

    @staticmethod
    async def get_context(chunks: List[str]) -> str:
        """Join the retrieved chunks into a single space-separated context string."""
        return " ".join(chunks)