TKM03 committed on
Commit
c3d4aea
·
verified ·
1 Parent(s): eac23b1

Create the py

Browse files
Files changed (1) hide show
  1. app.py +161 -0
app.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from sentence_transformers import SentenceTransformer
3
+ import faiss
4
+ import re
5
+ import gradio as gr
6
+
7
def preprocess_text(text):
    """
    Split raw document text into cleaned, question-delimited chunks.

    A new chunk starts at every line that begins with ``Question``; the
    non-blank lines that follow are joined onto it with single spaces. Any
    non-blank lines seen before the first ``Question`` line form a chunk of
    their own.

    Each chunk is then cleaned: a trailing run of digits is removed and the
    ``TRAPS:``, ``BEST ANSWER:`` and ``PASSABLE ANSWER:`` markers are replaced
    by a space. Chunks that are empty after cleaning are dropped so callers
    never embed or index an empty string.

    Args:
        text: Raw text with lines separated by ``\\n``.

    Returns:
        A list of non-empty chunk strings, in document order.
    """
    sections = []
    current_section = []

    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        # A "Question" line closes whatever chunk was being accumulated.
        if line.startswith('Question') and current_section:
            sections.append(' '.join(current_section))
            current_section = []
        current_section.append(line)

    if current_section:
        sections.append(' '.join(current_section))

    structured_sections = []
    for section in sections:
        # Strip trailing digits (presumably page numbers picked up from the PDF).
        cleaned = re.sub(r'\d+\s*$', '', section)
        cleaned = re.sub(r'TRAPS:|BEST ANSWER:|PASSABLE ANSWER:', ' ', cleaned)
        cleaned = cleaned.strip()
        # A chunk that was only a page number or marker is empty now; skip it.
        if cleaned:
            structured_sections.append(cleaned)

    return structured_sections
36
+
37
def query_qa_system(question, model, index, text_chunks, similarity_threshold=0.4):
    """
    Find the stored chunk that best matches ``question``.

    The question is encoded with ``model``, L2-normalized in place, and the
    single nearest entry is requested from ``index``. The returned score is
    reported as the confidence (it is a cosine similarity provided ``index``
    is an inner-product index over normalized vectors).

    Args:
        question: The user's question text.
        model: Object exposing ``encode(list_of_str)``.
        index: Object exposing ``search(vectors, k)``.
        text_chunks: Chunks in the same order they were added to ``index``.
        similarity_threshold: Minimum score required to accept the match.

    Returns:
        A dict with keys ``question``, ``full_text``, ``confidence`` and
        ``found_answer``. When the score is below the threshold,
        ``question`` is ``None`` and ``full_text`` holds a fallback message.
    """
    # Embed the query; normalize_L2 modifies the array in place.
    query_vector = model.encode([question])
    faiss.normalize_L2(query_vector)

    # Only the top hit is needed.
    scores, positions = index.search(query_vector, 1)
    top_position = positions[0][0]
    top_score = scores[0][0]

    # Guard clause: reject matches that do not clear the threshold.
    if not (top_score >= similarity_threshold):
        return {
            'question': None,
            'full_text': "I couldn't find a sufficiently relevant answer to your question in the provided document.",
            'confidence': float(top_score),
            'found_answer': False
        }

    matched_text = text_chunks[top_position]
    # Use the "Question N:" prefix as a short label when the chunk has one.
    label_match = re.search(r'Question \d+:', matched_text)
    if label_match:
        label = label_match.group(0)
    else:
        label = "Matching section"

    return {
        'question': label,
        'full_text': matched_text,
        'confidence': float(top_score),
        'found_answer': True
    }
71
+
72
+ # Function to handle PDF file upload and initialization
73
def initialize_qa_system(pdf_file):
    """
    Build the QA state (model, index, chunks) from an uploaded PDF.

    Text is extracted page by page, split into chunks with
    ``preprocess_text``, embedded with the ``all-MiniLM-L6-v2`` sentence
    transformer, L2-normalized, and stored in an inner-product FAISS index.

    Args:
        pdf_file: The uploaded file. Either an object exposing the file path
            as ``.name`` or a plain path string. ``None`` (nothing uploaded)
            is reported as an error rather than raising.

    Returns:
        A dict with keys ``model``, ``index``, ``text_chunks`` and ``status``.
        On any failure the first three are ``None`` and ``status`` starts
        with ``"Error: "``; this function does not raise.
    """
    def _failure(message):
        # Uniform error payload so callers can always read the same keys.
        return {
            'model': None,
            'index': None,
            'text_chunks': None,
            'status': f"Error: {message}"
        }

    # Nothing was uploaded: say so instead of surfacing an AttributeError.
    if pdf_file is None:
        return _failure("no PDF file was provided. Please upload a PDF first.")

    try:
        from PyPDF2 import PdfReader

        # Accept both upload objects (path in ``.name``) and plain paths.
        pdf_path = getattr(pdf_file, 'name', pdf_file)
        pdf_reader = PdfReader(pdf_path)

        # Pages with no extractable text yield an empty value; skip those.
        page_texts = (page.extract_text() for page in pdf_reader.pages)
        pdf_text = "".join(f"{page_text}\n" for page_text in page_texts if page_text)

        # Process text and create embeddings
        text_chunks = preprocess_text(pdf_text)
        if not text_chunks:
            # Without this check the failure shows up later as an opaque
            # "tuple index out of range" from embeddings.shape[1].
            return _failure("no extractable text was found in the PDF.")

        model = SentenceTransformer("all-MiniLM-L6-v2")
        embeddings = model.encode(text_chunks)

        # Normalized vectors + inner product give cosine-similarity scores.
        dimension = embeddings.shape[1]
        faiss.normalize_L2(embeddings)
        index = faiss.IndexFlatIP(dimension)
        index.add(embeddings)

        return {
            'model': model,
            'index': index,
            'text_chunks': text_chunks,
            'status': f"System initialized with {len(text_chunks)} text chunks from your PDF!"
        }
    except Exception as e:
        # UI boundary: report the problem in the status field instead of
        # letting the exception propagate.
        return _failure(str(e))
108
+
109
# Module-level QA state shared by the UI callbacks: upload_file rebinds it to
# the result of initialize_qa_system, and answer_question reads it. All three
# entries are None until a PDF has been indexed successfully.
qa_system = {'model': None, 'index': None, 'text_chunks': None}
111
+
112
+ # Function to handle file upload
113
def upload_file(pdf_file):
    """
    Index the uploaded PDF and publish the result as the active QA state.

    Args:
        pdf_file: The uploaded file, forwarded to ``initialize_qa_system``.

    Returns:
        The ``status`` message produced by ``initialize_qa_system``.
    """
    global qa_system
    # The module-level state is replaced wholesale, on success or failure.
    qa_system = initialize_qa_system(pdf_file)
    return qa_system['status']
118
+
119
+ # Function to handle questions
120
def answer_question(question):
    """
    Answer a user question from the currently loaded QA state.

    Args:
        question: The question text typed by the user.

    Returns:
        A display string: a prompt to upload a PDF or enter a question when
        the preconditions are not met, the matched text with its confidence
        when a match is found, or the fallback message with the best
        confidence otherwise.
    """
    if not qa_system['model'] or not qa_system['index'] or not qa_system['text_chunks']:
        return "Please upload a PDF file first."

    # Do not run a search for an empty or whitespace-only question.
    if not question or not question.strip():
        return "Please enter a question."

    result = query_qa_system(question, qa_system['model'], qa_system['index'], qa_system['text_chunks'])
    confidence = result['confidence']

    if not result['found_answer']:
        # full_text is the fallback message here; show it untouched.
        return f"{result['full_text']}\nBest match confidence: {confidence:.2f}"

    # Show only what follows an "Answer:" marker when one exists. When it is
    # absent, keep the whole chunk: str.find() would return -1 there, and
    # adding the marker length to it silently chops the first six characters.
    full_text = result['full_text']
    _, marker, after_marker = full_text.partition('Answer:')
    answer = after_marker.strip() if marker else full_text.strip()

    return f"Match (confidence: {confidence:.2f}):\n\n{answer}"
135
+
136
+ # Create the Gradio interface
137
with gr.Blocks(title="Interview Q&A Assistant") as demo:
    # Page heading and a one-line usage hint.
    gr.Markdown("# Interview Q&A Assistant")
    gr.Markdown("Upload your interview questions PDF and ask questions to get the most relevant sections.")

    # Upload area: file picker, initialize button and status readout.
    with gr.Row(), gr.Column():
        pdf_upload = gr.File(label="Upload PDF File")
        upload_button = gr.Button("Initialize Q&A System")
        status_text = gr.Textbox(label="Status", value="Upload a PDF to begin")

    # Question area: free-text input and submit button.
    with gr.Row(), gr.Column():
        question_input = gr.Textbox(label="Ask a question about interview preparation")
        submit_button = gr.Button("Get Answer")

    # Output area.
    with gr.Row():
        answer_output = gr.Textbox(label="Answer", lines=10)

    # Wire each button to its callback.
    upload_button.click(fn=upload_file, inputs=pdf_upload, outputs=status_text)
    submit_button.click(fn=answer_question, inputs=question_input, outputs=answer_output)
158
+
159
+ # Launch the app
160
if __name__ == "__main__":
    # Start the UI only when this file is executed as a script.
    # NOTE(review): share=True presumably requests a public Gradio share
    # link — confirm that exposing the app publicly is intended.
    demo.launch(share=True)