Nevidu committed on
Commit
098e1df
·
verified ·
1 Parent(s): 3a99ad1

Upload App.py

Browse files
Files changed (1) hide show
  1. App.py +157 -0
App.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ from sklearn.metrics.pairwise import cosine_similarity
4
+ # from sklearn.decomposition import PCA
5
+ from langchain_community.llms import Ollama
6
+ from langchain_chroma import Chroma
7
+ import langchain
8
+ from langchain_community.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader
9
+
10
+ from langchain_experimental.text_splitter import SemanticChunker
11
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
12
+ from langchain_community.embeddings.ollama import OllamaEmbeddings
13
+
14
+ from typing import List, Dict
15
+ from langchain.docstore.document import Document
16
+
17
+ from transformers import T5Tokenizer, T5ForConditionalGeneration
18
+
19
# Keyword-extraction model: the tokenizer and the T5 weights are both loaded
# from the "Voicelab/vlt5-base-keywords" checkpoint.
tokenizer = T5Tokenizer.from_pretrained("Voicelab/vlt5-base-keywords")
model = T5ForConditionalGeneration.from_pretrained("Voicelab/vlt5-base-keywords")

# Chroma store persisted in the local "chroma_db" directory; texts are
# embedded through the Ollama "gemma:2b" model.
vectorstore = Chroma(
    persist_directory="chroma_db",
    embedding_function=OllamaEmbeddings(model="gemma:2b"),
)

# Print the scored hits for a sample query when the module is loaded.
print(vectorstore.similarity_search_with_score("Course Leader"))

# Ollama-served LLM used to generate the final answer.
llm = Ollama(model="llama3.2:3b")
33
+
34
def retrieve_relevant_chunks(
    vector_store: Chroma,
    query: str,
    n_docs: int = 2,
    chunks_per_doc: int = 5,
    fetch_k: int = 50,
) -> Dict[str, List[Document]]:
    """Return the best chunks of the best-matching source documents.

    The store is queried once for ``fetch_k`` scored chunks. Chunks are
    grouped by their ``"source"`` metadata value, the groups are ranked by
    their lowest chunk score, and the lowest-scoring chunks of the top
    groups are returned. Scores are sorted ascending throughout, i.e. a
    lower score is treated as a better match.

    Args:
        vector_store: Store exposing ``similarity_search_with_score(query, k=...)``
            that yields ``(document, score)`` pairs.
        query: Text to search for.
        n_docs: Maximum number of distinct source documents to return.
        chunks_per_doc: Maximum number of chunks to return per document.
        fetch_k: Number of chunks requested from the store before grouping.
            It should comfortably exceed ``n_docs * chunks_per_doc`` so that
            enough distinct documents are seen.

    Returns:
        A dict mapping each selected source identifier to its chunks, both
        ordered best match first. Chunks whose metadata has no (or an empty)
        ``"source"`` entry are ignored.
    """
    # A negative limit would make the slices below drop items from the end
    # rather than return nothing, so clamp both limits to zero.
    n_docs = max(n_docs, 0)
    chunks_per_doc = max(chunks_per_doc, 0)

    hits = vector_store.similarity_search_with_score(query, k=fetch_k)

    # Group the scored chunks by the document they came from.
    grouped: Dict[str, List[tuple]] = {}
    for doc, score in hits:
        source = doc.metadata.get("source", "")
        if source:
            grouped.setdefault(source, []).append((doc, score))

    # Sort every group once; afterwards its first entry carries the group's
    # best score, which is what the documents are ranked by.
    for scored_chunks in grouped.values():
        scored_chunks.sort(key=lambda pair: pair[1])
    ranked = sorted(grouped.items(), key=lambda item: item[1][0][1])

    return {
        source: [doc for doc, _ in scored_chunks[:chunks_per_doc]]
        for source, scored_chunks in ranked[:n_docs]
    }
74
+
75
def display_results(results: Dict[str, List[object]]) -> str:
    """Build the context text for the LLM prompt from retrieved chunks.

    Despite its name, this function prints nothing: it returns a string.
    The string starts with a single space; for every document it then holds
    a line of 50 dashes immediately followed by ``str(chunk)`` plus a
    newline for each of that document's chunks.

    Args:
        results: Mapping of document ID to that document's chunks. Chunks
            may be any objects; they are rendered with ``str()``.

    Returns:
        The concatenated context string (``" "`` when ``results`` is empty).
    """
    separator = "-" * 50
    sections = [
        separator + "".join(str(chunk) + "\n" for chunk in chunks)
        for chunks in results.values()
    ]
    return " " + "".join(sections)
91
+
92
def main(query):
    """Answer a student-handbook question with retrieval-augmented generation.

    The question is condensed into keywords by the keyword model, the
    keywords are used to retrieve chunks from the vector store, and the
    chunks plus the original question are sent to the LLM.

    Args:
        query: The question text entered by the user.

    Returns:
        The LLM's answer, or a short message when the question is empty or
        the context could not be retrieved.
    """
    if not query or not query.strip():
        # Nothing to work with; skip the model and retrieval calls entirely.
        return "Please enter a question."

    # Condense the question into comma-separated keywords.
    input_ids = tokenizer.encode("Keyword: " + query, return_tensors="pt")
    outputs = model.generate(input_ids)
    output_sequence = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # De-duplicate while keeping the generated order (a set would reorder the
    # keywords unpredictably between runs) and drop empty items.
    keywords = dict.fromkeys(
        item.strip() for item in output_sequence.split(",") if item.strip()
    )
    output_string = ", ".join(keywords)
    print(output_string)

    try:
        results = retrieve_relevant_chunks(
            vector_store=vectorstore,
            # Fall back to the raw question if no keywords were produced.
            query=output_string or query,
            n_docs=2,
            chunks_per_doc=5,
        )
        prompt = display_results(results)
    except Exception as e:
        # Without context there is nothing to ground the answer on. Report
        # the failure instead of continuing with an unassigned prompt.
        print(f"Error: {str(e)}")
        return "Sorry, the handbook context could not be retrieved. Please try again."

    formatted_prompt = f"""
    You are an AI assistant. Your goal is to answer questions regarding student handbooks based on the following context provided. Make sure all the answers are within the given context:
    {prompt}

    Based on the above, answer the following question:
    {query}
    Give the answer in a clear and concise manner
    """

    # invoke() is the supported entry point; predict() is deprecated.
    response = llm.invoke(formatted_prompt)

    return response
135
+
136
# Gradio page: a title banner, a question box, an answer box, and a button
# that passes the question to main() and shows its return value.
with gr.Blocks() as demo:
    gr.Markdown(
        """
    <center>
    <span style='font-size: 50px; font-weight: Bold; font-family: "Graduate", serif'>
    IIT RAG Student Handbooks
    </span>
    </center>
    """
    )

    with gr.Group():
        query = gr.Textbox(label="Question")
        answer = gr.Textbox(label="Answer")

    with gr.Row():
        login_btn = gr.Button(value="Generate")

    # Clicking the button runs main(query) and writes the result to `answer`.
    login_btn.click(fn=main, inputs=[query], outputs=answer)

# Start the app and request a public share link.
demo.launch(share=True)
157
+