gaur3009 committed (verified)
Commit: 03a8095 · 1 Parent(s): 75d88e1

Update src/streamlit_app.py

Files changed (1)
  1. src/streamlit_app.py (+23 -7)
src/streamlit_app.py CHANGED
@@ -3,10 +3,12 @@ import re
 import random
 import PyPDF2
 import numpy as np
-from collections import defaultdict, deque
-from sentence_transformers import SentenceTransformer
+from collections import defaultdict
 from sklearn.metrics.pairwise import cosine_similarity
 
+import torch
+from transformers import AutoTokenizer, AutoModel
+
 # ---------------------
 # Tokenization
 # ---------------------
@@ -20,13 +22,15 @@ class PDFQASystem:
     def __init__(self):
         self.text_chunks = []
         self.embeddings = None
-        self.model = SentenceTransformer('all-MiniLM-L6-v2')
+        self.tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
+        self.model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
+        self.model.eval()
         self.active_document = None
-
+
     def process_pdf_stream(self, uploaded_file):
         text = self._extract_pdf_text(uploaded_file)
         self.text_chunks = self._chunk_text(text)
-        self.embeddings = self.model.encode(self.text_chunks)
+        self.embeddings = self._embed(self.text_chunks)
         self.active_document = uploaded_file.name
 
     def _extract_pdf_text(self, uploaded_file):
@@ -42,11 +46,23 @@ class PDFQASystem:
         words = text.split()
         return [' '.join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]
 
+    def _mean_pooling(self, model_output, attention_mask):
+        token_embeddings = model_output.last_hidden_state
+        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+        return torch.sum(token_embeddings * input_mask_expanded, dim=1) / torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
+
+    def _embed(self, texts):
+        inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
+        with torch.no_grad():
+            model_output = self.model(**inputs)
+        embeddings = self._mean_pooling(model_output, inputs['attention_mask'])
+        return torch.nn.functional.normalize(embeddings, p=2, dim=1).cpu().numpy()
+
     def answer_question(self, question):
         if not self.active_document:
             return "No document loaded. Please upload a PDF first."
-
-        question_embedding = self.model.encode(question)
+
+        question_embedding = self._embed([question])[0]
         similarities = cosine_similarity([question_embedding], self.embeddings)[0]
         best_match_idx = np.argmax(similarities)
         return self.text_chunks[best_match_idx]
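For context on what this change does: it drops the sentence-transformers dependency and re-creates its encoding step directly on top of transformers, using the mean-pooling-plus-L2-normalization recipe documented for sentence-transformers/all-MiniLM-L6-v2. The sketch below is not part of the commit; it is a hypothetical sanity check, assuming both libraries are installed in a scratch environment, and is expected to show the two paths producing matching embeddings up to floating-point tolerance.

# Hypothetical sanity check (not part of this commit): compare the new
# transformers-based embedding path against the removed sentence-transformers one.
# Assumes both libraries are installed in a scratch environment.
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer

MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
texts = ["PDF question answering with Streamlit", "cosine similarity retrieval"]

# New path (as added in the diff): raw transformers + mean pooling + L2 normalization.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModel.from_pretrained(MODEL_ID).eval()
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    out = model(**inputs)
mask = inputs["attention_mask"].unsqueeze(-1).expand(out.last_hidden_state.size()).float()
pooled = (out.last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
new_emb = torch.nn.functional.normalize(pooled, p=2, dim=1).numpy()

# Old path (removed by the diff), with normalization enabled so outputs are comparable.
old_emb = SentenceTransformer(MODEL_ID).encode(texts, normalize_embeddings=True)

print(np.abs(new_emb - old_emb).max())  # expected to be ~0 (small float tolerance)

Two small observations on the design: the removed self.model.encode(question) did not normalize its output while the new _embed does, but since cosine_similarity is scale-invariant the argmax over similarities in answer_question is unaffected; and self.model.eval() disables dropout so the embeddings are deterministic at inference time.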