Spaces:
Sleeping
Sleeping
done
Browse files
app.py
CHANGED
@@ -4,7 +4,114 @@ import faiss
|
|
4 |
import re
|
5 |
import gradio as gr
|
6 |
|
7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
# Custom CSS for professional styling
|
10 |
custom_css = """
|
|
|
4 |
import re
|
5 |
import gradio as gr
|
6 |
|
7 |
+
import numpy as np
|
8 |
+
from sentence_transformers import SentenceTransformer
|
9 |
+
import faiss
|
10 |
+
import re
|
11 |
+
|
12 |
+
def preprocess_text(text):
    """Split raw document text into cleaned, question-anchored sections.

    A new section starts at every line that begins with ``Question``; the
    non-blank lines that follow are joined onto it with single spaces.
    Non-blank lines that appear before the first such line form a leading
    section of their own.

    Each section is then cleaned: a trailing run of digits (treated as a
    page number) is removed, the labels ``TRAPS:``, ``BEST ANSWER:`` and
    ``PASSABLE ANSWER:`` are blanked out, and runs of whitespace are
    collapsed to a single space.

    Args:
        text: Raw, newline-separated document text.

    Returns:
        A list of cleaned section strings in document order. Sections that
        are empty after cleaning are omitted.
    """
    # Group lines into sections, opening a new one at each "Question" line.
    sections = []
    current_section = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if line.startswith('Question'):
            if current_section:
                sections.append(' '.join(current_section))
            current_section = [line]
        elif line:
            current_section.append(line)
    if current_section:
        sections.append(' '.join(current_section))

    structured_sections = []
    for section in sections:
        # Remove page numbers and other irrelevant text
        section = re.sub(r'\d+\s*$', '', section)
        section = re.sub(r'TRAPS:|BEST ANSWER:|PASSABLE ANSWER:', ' ', section)
        # Blanking the labels leaves runs of spaces behind; collapse them so
        # the chunk text is clean.
        section = ' '.join(section.split())
        # A section that was only a page number is now empty; skip it so no
        # empty string ends up in the returned chunk list.
        if section:
            structured_sections.append(section)

    return structured_sections
|
41 |
+
|
42 |
+
def create_qa_system(pdf_text, model_name="all-MiniLM-L6-v2"):
    """Build the embedding model, similarity index and chunk list for a document.

    The text is split into chunks with ``preprocess_text``, each chunk is
    embedded with a ``SentenceTransformer``, the embeddings are
    L2-normalised, and they are added to a FAISS inner-product index, so
    that inner-product scores on the normalised vectors are cosine
    similarities.

    Args:
        pdf_text: Raw document text to index.
        model_name: Name of the sentence-transformers model to load.

    Returns:
        A ``(model, index, text_chunks)`` tuple: the loaded model, the
        populated FAISS index, and the list of chunk strings whose positions
        correspond to the vectors added to the index.

    Raises:
        ValueError: If preprocessing yields no text chunks to index.
    """
    # Process text into structured sections
    text_chunks = preprocess_text(pdf_text)
    if not text_chunks:
        # Fail early with a clear message, before loading the model: with
        # nothing to embed there is no embedding dimension to size the
        # index with.
        raise ValueError("No text sections could be extracted from the document.")

    # Create embeddings
    model = SentenceTransformer(model_name)
    embeddings = model.encode(text_chunks)

    # Normalize vectors (in place) so inner product equals cosine similarity
    faiss.normalize_L2(embeddings)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings)

    return model, index, text_chunks
|
62 |
+
|
63 |
+
def query_qa_system(question, model, index, text_chunks, similarity_threshold=0.3):
    """Look up the single best-matching chunk for a question.

    The question is embedded with ``model``, L2-normalised, and searched
    against ``index`` for its top hit. The hit is accepted only when its
    score is at least ``similarity_threshold``.

    Args:
        question: The question text to search for.
        model: Object whose ``encode`` method embeds a list of strings.
        index: Index whose ``search`` method returns ``(scores, positions)``.
        text_chunks: Chunk strings, indexed by the positions ``index`` returns.
        similarity_threshold: Minimum score for a hit to count as an answer.

    Returns:
        A dict with keys ``question``, ``full_text``, ``confidence`` and
        ``found_answer``. On a hit, ``question`` is the ``Question N:`` label
        found in the chunk (or ``"Matching section"``) and ``full_text`` is
        the chunk. Otherwise ``question`` is ``None`` and ``full_text`` is a
        fixed not-found message.
    """
    # Embed the question and normalise it the same way the index vectors were.
    query_vector = model.encode([question])
    faiss.normalize_L2(query_vector)

    # Only the top hit is requested.
    scores, positions = index.search(query_vector, 1)
    top_position = positions[0][0]
    top_score = scores[0][0]

    meets_threshold = top_score >= similarity_threshold
    if not meets_threshold:
        return {
            'question': None,
            'full_text': "I couldn't find a sufficiently relevant answer to your question in the provided document.",
            'confidence': float(top_score),
            'found_answer': False
        }

    matched_text = text_chunks[top_position]
    # Use the chunk's own "Question N:" label as a reference when present.
    label_match = re.search(r'Question \d+:', matched_text)
    if label_match:
        label = label_match.group(0)
    else:
        label = "Matching section"

    return {
        'question': label,
        'full_text': matched_text,
        'confidence': float(top_score),
        'found_answer': True
    }
|
97 |
+
|
98 |
+
def ask_question(question, model, index, text_chunks):
    """Run a query, print a human-readable summary, and return the result.

    Delegates to ``query_qa_system`` with its default threshold, echoes the
    question and a separator line, then prints either the matched section
    with its confidence or the not-found message with the best confidence.

    Args:
        question: The question text to ask.
        model: Embedding model passed through to ``query_qa_system``.
        index: Similarity index passed through to ``query_qa_system``.
        text_chunks: Chunk strings passed through to ``query_qa_system``.

    Returns:
        The result dict produced by ``query_qa_system``, unchanged.
    """
    result = query_qa_system(question, model, index, text_chunks)

    print("\nQ:", question)
    print("-" * 50)

    if not result['found_answer']:
        # No chunk cleared the threshold: show the message and best score.
        print(result['full_text'])
        print(f"Best match confidence: {result['confidence']:.2f}")
    else:
        print(f"Found matching section (confidence: {result['confidence']:.2f}):")
        print(f"\n{result['full_text']}\n")

    return result
|
113 |
+
# Initialize the system: build the embedding model, the FAISS index and the
# chunk list once at import time so they can be reused as module-level globals.
# NOTE(review): `pdf_text` is not defined in the visible part of this file —
# confirm it is assigned before this line runs.
model, index, text_chunks = create_qa_system(pdf_text)
|
115 |
|
116 |
# Custom CSS for professional styling
|
117 |
custom_css = """
|