Spaces:

Y-Mangoes
/

Semantic-Search

Running

File size: 4,458 Bytes

be6ede2
 
ac90524
be6ede2
 
 
 
 
ac90524
 
 
 
be6ede2
ac90524
 
 
 
be6ede2
 
 
 
 
 
 
 
 
 
ac90524
 
 
 
be6ede2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cee4fc8
be6ede2
 
cee4fc8
be6ede2
 
 
 
cee4fc8
be6ede2
 
cee4fc8
be6ede2
 
 
 
 
 
 
cee4fc8
 
 
16bd142
 
 
be6ede2
 
cee4fc8
be6ede2
16bd142
 
 
be6ede2
6325e03
f27158e
16bd142
 
 
 
 
 
 
 
 
cee4fc8
16bd142
cee4fc8
 
 
 
be6ede2
cee4fc8
be6ede2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cee4fc8
be6ede2
 
 
 
 
 
 
 
 
 
 
cee4fc8
be6ede2

import uuid
import chromadb
import torch
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
import gradio as gr

# Set device to GPU if available, else CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Initialize embedding model
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": device}
)

# Initialize ChromaDB client and collection
chroma_client = chromadb.PersistentClient(path="./chroma_db")
vectorstore = Chroma(
    client=chroma_client,
    collection_name="text_collection",
    embedding_function=embedding_model,
)

# Initialize reranker
reranker = HuggingFaceCrossEncoder(
    model_name="BAAI/bge-reranker-base",
    model_kwargs={"device": device}
)

def add_text_to_db(text):
    """
    Add a piece of text to the vector database.
    
    Args:
        text (str): The text to add.
        
    Returns:
        str: Confirmation message.
    """
    if not text or not text.strip():
        return "Error: Text cannot be empty."
    
    # Generate unique ID
    doc_id = str(uuid.uuid4())
    
    # Add text to vectorstore
    vectorstore.add_texts(
        texts=[text],
        metadatas=[{"text": text}],
        ids=[doc_id]
    )
    
    return f"Text added successfully with ID: {doc_id}"

def search_similar_texts(query, k, threshold):
    """
    Search for the top k similar texts in the vector database and rerank them.
    Only return results with similarity scores above the threshold.
    
    Args:
        query (str): The search query.
        k (int): Number of results to return.
        threshold (float): Minimum similarity score (0 to 1).
        
    Returns:
        str: Formatted search results with similarity scores or "No such record".
    """
    if not query or not query.strip():
        return "Error: Query cannot be empty."
    
    if not isinstance(k, int) or k < 1:
        return "Error: k must be a positive integer."
    
    if not isinstance(threshold, (int, float)) or threshold < 0 or threshold > 1:
        return "Error: Threshold must be a number between 0 and 1."
    
    # Retrieve initial documents
    retriever = vectorstore.as_retriever(search_kwargs={"k": max(k * 2, 10)})
    docs = retriever.get_relevant_documents(query)
    
    if not docs:
        return "No such record."
    
    # Compute reranker scores
    scored_docs = []
    for doc in docs:
        text = doc.metadata.get("text", "No text available")
        # Compute score using reranker
        score = reranker.score([query, text])
        doc.metadata["score"] = float(score)
        scored_docs.append((doc, score))
    
    # Sort by score in descending order
    scored_docs.sort(key=lambda x: x[1], reverse=True)
    
    # Filter by threshold and limit to k
    results = []
    for i, (doc, score) in enumerate(scored_docs[:k]):
        if score >= threshold:
            text = doc.metadata.get("text", "No text available")
            results.append(f"Result {i+1}:\nText: {text}\nScore: {score:.4f}\n")
    
    if not results:
        return "No such record."
    
    return "\n".join(results)

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Semantic Search Pipeline")
    
    with gr.Row():
        with gr.Column():
            gr.Markdown("## Add Text to Database")
            text_input = gr.Textbox(label="Enter text to add")
            add_button = gr.Button("Add Text")
            add_output = gr.Textbox(label="Result")
            
        with gr.Column():
            gr.Markdown("## Search Similar Texts")
            query_input = gr.Textbox(label="Enter search query")
            k_input = gr.Number(label="Number of results (k)", value=5, precision=0)
            threshold_input = gr.Number(label="Similarity threshold (0 to 1)", value=0.5, minimum=0, maximum=1)
            search_button = gr.Button("Search")
            search_output = gr.Textbox(label="Search Results")
    
    # Button actions
    add_button.click(
        fn=add_text_to_db,
        inputs=text_input,
        outputs=add_output
    )
    search_button.click(
        fn=search_similar_texts,
        inputs=[query_input, k_input, threshold_input],
        outputs=search_output
    )

# Launch Gradio app
if __name__ == "__main__":
    demo.launch()