Y-Mangoes committed on
Commit
be6ede2
·
verified ·
1 Parent(s): 696be49

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +120 -0
app.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import uuid
2
+ import chromadb
3
+ from langchain.vectorstores import Chroma
4
+ from langchain.embeddings import HuggingFaceEmbeddings
5
+ from langchain.retrievers import ContextualCompressionRetriever
6
+ from langchain.retrievers.document_compressors import CrossEncoderReranker
7
+ from langchain_community.cross_encoders import HuggingFaceCrossEncoder
8
+ import gradio as gr
9
+
10
# Sentence-transformer model that produces the embeddings stored in the vector store
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# On-disk ChromaDB client (rooted at ./chroma_db) exposed as a LangChain vector store
chroma_client = chromadb.PersistentClient(path="./chroma_db")
vectorstore = Chroma(
    embedding_function=embedding_model,
    collection_name="text_collection",
    client=chroma_client,
)

# Two-stage retrieval: the base retriever fetches 10 candidates and the
# cross-encoder compressor keeps the best 5 of them (i.e. 2k candidates for k=5)
reranker = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-base")
compressor = CrossEncoderReranker(model=reranker, top_n=5)
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=compressor,
)
28
+
29
def add_text_to_db(text):
    """
    Store a single text in the vector database under a newly generated UUID.

    Args:
        text (str): The text to store. Empty or whitespace-only input is rejected.

    Returns:
        str: A success message containing the new document ID, or an
            "Error: ..." message when the text is empty.
    """
    # Guard clause: nothing to store for None, "" or whitespace-only input
    if not (text and text.strip()):
        return "Error: Text cannot be empty."

    new_id = str(uuid.uuid4())

    # The raw text is also copied into the metadata under the "text" key
    vectorstore.add_texts(
        texts=[text],
        metadatas=[{"text": text}],
        ids=[new_id],
    )

    return "Text added successfully with ID: " + new_id
53
+
54
def search_similar_texts(query, k):
    """
    Search the vector database for texts similar to a query and rerank them.

    Candidates are fetched from the vector store by embedding similarity
    (2k of them, but at least 10) and then scored against the query with the
    cross-encoder reranker. The k highest-scoring texts are returned together
    with their reranker scores.

    Args:
        query (str): The search query. Empty or whitespace-only input is rejected.
        k (int): Number of results to return. Integral floats such as 5.0 are
            accepted (number widgets may deliver them); booleans are rejected.

    Returns:
        str: Formatted search results with reranker scores, "No results found."
            when the store yields no candidates, or an "Error: ..." message
            for invalid input.
    """
    if not query or not query.strip():
        return "Error: Query cannot be empty."

    # bool is a subclass of int, so True/False must be rejected explicitly
    if isinstance(k, bool):
        return "Error: k must be a positive integer."
    if isinstance(k, float) and k.is_integer():
        k = int(k)
    if not isinstance(k, int) or k < 1:
        return "Error: k must be a positive integer."

    # Over-fetch so the reranker has a pool to choose from: 2k or at least 10.
    # Passing k per call avoids mutating the shared retriever/compressor
    # settings, which concurrent requests would otherwise overwrite.
    candidates = vectorstore.similarity_search(query, k=max(k * 2, 10))
    if not candidates:
        return "No results found."

    # Score the (query, text) pairs here rather than going through the
    # compression retriever: CrossEncoderReranker returns bare documents and
    # does not record the score in their metadata, so reading
    # metadata["score"] always displayed 0.0.
    pairs = [(query, doc.page_content) for doc in candidates]
    scores = [float(score) for score in reranker.score(pairs)]
    ranked = sorted(
        zip(candidates, scores),
        key=lambda item: item[1],
        reverse=True,
    )[:k]

    # Format results; fall back to the document body if the "text" metadata
    # key is missing (e.g. documents added by another writer)
    results = []
    for position, (doc, score) in enumerate(ranked, start=1):
        text = (
            (doc.metadata or {}).get("text")
            or doc.page_content
            or "No text available"
        )
        results.append(f"Result {position}:\nText: {text}\nScore: {score:.4f}\n")

    return "\n".join(results)
87
+
88
# Gradio UI: one row with two columns, the first for adding text to the
# database and the second for searching it
with gr.Blocks() as demo:
    gr.Markdown("# Semantic Search Pipeline")

    with gr.Row():
        # First column: add a text to the database
        with gr.Column():
            gr.Markdown("## Add Text to Database")
            text_input = gr.Textbox(label="Enter text to add")
            add_button = gr.Button("Add Text")
            add_output = gr.Textbox(label="Result")

        # Second column: query the database
        with gr.Column():
            gr.Markdown("## Search Similar Texts")
            query_input = gr.Textbox(label="Enter search query")
            k_input = gr.Number(
                label="Number of results (k)",
                value=5,
                precision=0,
            )
            search_button = gr.Button("Search")
            search_output = gr.Textbox(label="Search Results")

    # Wire each button to its handler
    add_button.click(add_text_to_db, inputs=text_input, outputs=add_output)
    search_button.click(
        search_similar_texts,
        inputs=[query_input, k_input],
        outputs=search_output,
    )

# Start the Gradio server only when this file is run as a script
if __name__ == "__main__":
    demo.launch()