TKM03 committed on
Commit
a22d089
·
verified ·
1 Parent(s): 3c45742
Files changed (1) hide show
  1. app.py +108 -1
app.py CHANGED
@@ -4,7 +4,114 @@ import faiss
4
  import re
5
  import gradio as gr
6
 
7
- # [Previous functions remain exactly the same - preprocess_text, query_qa_system, initialize_qa_system, etc.]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  # Custom CSS for professional styling
10
  custom_css = """
 
4
  import re
5
  import gradio as gr
6
 
7
+ import numpy as np
8
+ from sentence_transformers import SentenceTransformer
9
+ import faiss
10
+ import re
11
+
12
# Compiled once: trailing digits at the end of a section (treated as page numbers)
# and the answer-guide labels that are replaced by a single space.
_TRAILING_DIGITS_RE = re.compile(r'\d+\s*$')
_SECTION_LABELS_RE = re.compile(r'TRAPS:|BEST ANSWER:|PASSABLE ANSWER:')


def preprocess_text(text):
    """
    Split raw document text into cleaned, per-question sections.

    A new section starts at every line that begins with 'Question'; all
    following non-blank lines are joined onto it with single spaces. Any
    text before the first such line forms its own leading section.

    Args:
        text: The raw document text. ``None`` or an empty string yields
            an empty list.

    Returns:
        A list of cleaned section strings, in document order. Sections
        that are empty after cleanup are omitted so that blank chunks
        are never handed to the embedding step.
    """
    if not text:
        return []

    # Group lines into sections, opening a new one at each 'Question' line
    sections = []
    current_section = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if line.startswith('Question'):
            if current_section:
                sections.append(' '.join(current_section))
            current_section = [line]
        elif line:
            current_section.append(line)
    if current_section:
        sections.append(' '.join(current_section))

    structured_sections = []
    for section in sections:
        # Remove trailing digits (page numbers) first, then the labels
        cleaned = _TRAILING_DIGITS_RE.sub('', section)
        cleaned = _SECTION_LABELS_RE.sub(' ', cleaned).strip()
        # A section made only of a page number or labels is empty here; skip it
        if cleaned:
            structured_sections.append(cleaned)

    return structured_sections
41
+
42
def create_qa_system(pdf_text, model_name="all-MiniLM-L6-v2"):
    """
    Build the embedding model, FAISS index and text chunks for a document.

    Args:
        pdf_text: Raw document text; it is split into sections by
            ``preprocess_text``.
        model_name: Name passed to ``SentenceTransformer`` to load the
            embedding model.

    Returns:
        A ``(model, index, text_chunks)`` tuple: the loaded model, an
        inner-product FAISS index over the L2-normalized chunk embeddings,
        and the list of chunk strings in index order.

    Raises:
        ValueError: If no text sections could be extracted, since an index
            cannot be sized without at least one embedding.
    """
    # Process text into structured sections
    text_chunks = preprocess_text(pdf_text)
    if not text_chunks:
        # Fail with a clear message instead of an IndexError on embeddings.shape[1]
        raise ValueError("No text sections could be extracted from the provided document.")

    # Create embeddings; FAISS operates on contiguous float32 matrices
    model = SentenceTransformer(model_name)
    embeddings = np.ascontiguousarray(model.encode(text_chunks), dtype=np.float32)

    # Normalizing in place makes the inner product equal to cosine similarity
    faiss.normalize_L2(embeddings)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings)

    return model, index, text_chunks
62
+
63
def query_qa_system(question, model, index, text_chunks, similarity_threshold=0.3):
    """
    Look up the single best-matching chunk for a question.

    Args:
        question: The user's question text.
        model: Embedding model exposing ``encode``.
        index: FAISS index exposing ``search``. The score is a cosine
            similarity only if the index holds L2-normalized vectors under
            an inner-product metric.
        text_chunks: Chunk strings, in the same order as the index rows.
        similarity_threshold: Minimum score for a match to be accepted.

    Returns:
        A dict with keys ``'question'``, ``'full_text'``, ``'confidence'``
        and ``'found_answer'``. On a match, ``'question'`` is the
        'Question N:' label found in the chunk (or "Matching section") and
        ``'full_text'`` is the chunk; otherwise ``'question'`` is ``None``
        and ``'full_text'`` is a not-found message.
    """
    # Encode and normalize the question (FAISS needs contiguous float32)
    question_embedding = np.ascontiguousarray(model.encode([question]), dtype=np.float32)
    faiss.normalize_L2(question_embedding)

    # Search for the single most similar chunk
    k = 1
    similarities, indices = index.search(question_embedding, k)

    best_idx = int(indices[0][0])
    similarity_score = float(similarities[0][0])

    # FAISS reports a label of -1 when it has no result (e.g. empty index);
    # without this guard text_chunks[-1] would silently return the last chunk.
    has_match = 0 <= best_idx < len(text_chunks)

    if has_match and similarity_score >= similarity_threshold:
        matched_text = text_chunks[best_idx]
        # Extract just the question number for reference
        label_match = re.search(r'Question \d+:', matched_text)
        question_num = label_match.group(0) if label_match else "Matching section"

        return {
            'question': question_num,
            'full_text': matched_text,
            'confidence': similarity_score,
            'found_answer': True
        }

    return {
        'question': None,
        'full_text': "I couldn't find a sufficiently relevant answer to your question in the provided document.",
        'confidence': similarity_score,
        'found_answer': False
    }
97
+
98
def ask_question(question, model, index, text_chunks):
    """
    Run a query and print a readable summary of the outcome.

    Delegates to ``query_qa_system`` with its default threshold, prints the
    question followed by either the matched section or the not-found
    message with the best score, and returns the result dict unchanged.
    """
    result = query_qa_system(question, model, index, text_chunks)
    confidence = result['confidence']

    print("\nQ:", question)
    print("-" * 50)

    if not result['found_answer']:
        # No chunk cleared the threshold: show the message and the best score
        print(result['full_text'])
        print(f"Best match confidence: {confidence:.2f}")
    else:
        print(f"Found matching section (confidence: {confidence:.2f}):")
        print(f"\n{result['full_text']}\n")

    return result
113
# Build the QA components (embedding model, FAISS index, text chunks) once,
# at import time. NOTE(review): pdf_text is not defined in the visible part
# of this file - confirm it is assigned before this line runs.
(model, index, text_chunks) = create_qa_system(pdf_text)
115
 
116
  # Custom CSS for professional styling
117
  custom_css = """