gaur3009 committed
Commit 1b89b73 · verified · 1 Parent(s): 2af71a4

Update src/streamlit_app.py

Files changed (1):
  1. src/streamlit_app.py +73 -131
src/streamlit_app.py CHANGED
@@ -1,141 +1,83 @@
  import streamlit as st
- import re
- import random
  import PyPDF2
- import numpy as np
- from collections import defaultdict
  from sklearn.metrics.pairwise import cosine_similarity

- import torch
- from transformers import AutoTokenizer, AutoModel
-
- # ---------------------
- # Tokenization
- # ---------------------
- def tokenize(text):
-     return re.findall(r"\w+", text.lower())
-
- # ---------------------
- # PDF QA System
- # ---------------------
- class PDFQASystem:
-     def __init__(self):
-         self.text_chunks = []
-         self.embeddings = None
-         self.tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
-         self.model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
-         self.model.eval()
-         self.active_document = None
-
-     def process_pdf_stream(self, uploaded_file):
-         text = self._extract_pdf_text(uploaded_file)
-         self.text_chunks = self._chunk_text(text)
-         self.embeddings = self._embed(self.text_chunks)
-         self.active_document = uploaded_file.name
-
-     def _extract_pdf_text(self, uploaded_file):
-         text = ""
-         reader = PyPDF2.PdfReader(uploaded_file)
-         for page in reader.pages:
-             page_text = page.extract_text()
-             if page_text:
-                 text += page_text
-         return text
-
-     def _chunk_text(self, text, chunk_size=500):
-         words = text.split()
-         return [' '.join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]
-
-     def _mean_pooling(self, model_output, attention_mask):
-         token_embeddings = model_output.last_hidden_state
-         input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-         return torch.sum(token_embeddings * input_mask_expanded, dim=1) / torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
-
-     def _embed(self, texts):
-         inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
-         with torch.no_grad():
-             model_output = self.model(**inputs)
-         embeddings = self._mean_pooling(model_output, inputs['attention_mask'])
-         return torch.nn.functional.normalize(embeddings, p=2, dim=1).cpu().numpy()
-
-     def answer_question(self, question):
-         if not self.active_document:
-             return "No document loaded. Please upload a PDF first."
-
-         question_embedding = self._embed([question])[0]
-         similarities = cosine_similarity([question_embedding], self.embeddings)[0]
-         best_match_idx = np.argmax(similarities)
-         return self.text_chunks[best_match_idx]
-
- # ---------------------
- # Intent Classifier
- # ---------------------
- class IntentClassifier:
-     def __init__(self):
-         self.intents = {
-             "greet": ["hello", "hi", "hey"],
-             "bye": ["bye", "goodbye", "exit"],
-             "qa": ["what", "when", "how", "explain", "tell", "who", "why"],
-             "help": ["help", "support", "assist"]
-         }
-
-     def predict(self, tokens):
-         scores = defaultdict(int)
-         for token in tokens:
-             for intent, keywords in self.intents.items():
-                 if token in keywords:
-                     scores[intent] += 1
-         return max(scores, key=scores.get) if scores else "qa"
-
- # ---------------------
- # AI Agent Core
- # ---------------------
- class DocumentAI:
-     def __init__(self):
-         self.intent_recognizer = IntentClassifier()
-         self.qa_system = PDFQASystem()
-         self.responses = {
-             "greet": ["👋 Hello! I'm your document assistant.", "Hi there! Ready to answer your document questions."],
-             "bye": ["Goodbye!", "See you later!", "Thanks for using the assistant!"],
-             "help": "Upload a PDF and ask questions. I'll answer from its content!",
-             "no_doc": "Please upload a PDF document first."
-         }
-
-     def handle_query(self, text):
-         tokens = tokenize(text)
-         intent = self.intent_recognizer.predict(tokens)
-
-         if intent == "greet":
-             return random.choice(self.responses["greet"])
-         elif intent == "bye":
-             return random.choice(self.responses["bye"])
-         elif intent == "help":
-             return self.responses["help"]
-         elif intent == "qa":
-             if self.qa_system.active_document:
-                 return self.qa_system.answer_question(text)
-             else:
-                 return self.responses["no_doc"]
-         else:
-             return "🤖 I'm not sure how to respond. Try saying 'help'."
-
- # ---------------------
  # Streamlit UI
- # ---------------------
- st.set_page_config(page_title="Document AI Assistant", page_icon="📄")
- st.title("📄 AI PDF Assistant")
- st.markdown("Ask questions from uploaded PDF files!")
-
- ai = DocumentAI()
-
- uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
-
- if uploaded_file:
-     ai.qa_system.process_pdf_stream(uploaded_file)
-     st.success(f"✅ PDF '{uploaded_file.name}' processed successfully!")
-
- query = st.text_input("Ask a question from the document:")
-
- if query:
-     answer = ai.handle_query(query)
-     st.markdown(f"**🧠 Answer:** {answer}")
1
  import streamlit as st
 
 
2
  import PyPDF2
3
+ import torch
4
+ from transformers import AutoTokenizer, AutoModel, pipeline
5
  from sklearn.metrics.pairwise import cosine_similarity
6
+ import numpy as np
7
+ import tempfile
8
+
9
+ # Load local models once
10
+ @st.cache_resource
11
+ def load_models():
12
+ tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
13
+ model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
14
+ qa_pipeline_model = pipeline("text2text-generation", model="google/flan-t5-base")
15
+ return tokenizer, model, qa_pipeline_model
16
+
17
+ embedding_tokenizer, embedding_model, qa_pipeline_model = load_models()
18
+
19
+ # PDF loader
20
+ def load_pdf(file):
21
+ reader = PyPDF2.PdfReader(file)
22
+ text = ''
23
+ for page in reader.pages:
24
+ text += page.extract_text() or ''
25
+ return text
26
+
27
+ # Embed text
28
+ def get_embedding(text):
29
+ inputs = embedding_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
30
+ with torch.no_grad():
31
+ model_output = embedding_model(**inputs)
32
+ return model_output.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
33
+
34
+ # Store vectors in-memory
35
+ vector_store = []
36
+
37
+ def upload_document_chunks(chunks):
38
+ vector_store.clear()
39
+ for chunk in chunks:
40
+ embedding = get_embedding(chunk)
41
+ vector_store.append((chunk, embedding))
42
+
43
+ def query_answer(query):
44
+ query_vec = get_embedding(query)
45
+ similarities = [cosine_similarity([query_vec], [vec])[0][0] for _, vec in vector_store]
46
+ top_indices = np.argsort(similarities)[-3:][::-1]
47
+ return [vector_store[i][0] for i in top_indices]
48
+
49
+ def generate_response(context, query):
50
+ prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
51
+ response = qa_pipeline_model(prompt, max_new_tokens=100, do_sample=True)
52
+ return response[0]['generated_text'].strip()
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  # Streamlit UI
55
+ st.set_page_config(page_title="Offline PDF QA Bot", layout="centered")
56
+ st.title("πŸ“„ Offline PDF QA Bot πŸ”")
57
+ st.markdown(
58
+ "Upload a PDF document, ask a question, and get an answer using **only local models** β€” no external APIs involved."
59
+ )
60
+
61
+ uploaded_file = st.file_uploader("πŸ“ Upload PDF", type="pdf")
62
+ user_query = st.text_input("❓ Ask a question based on the document")
63
+
64
+ if uploaded_file and user_query:
65
+ with st.spinner("Processing..."):
66
+ with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
67
+ tmp_file.write(uploaded_file.read())
68
+ document_text = load_pdf(tmp_file.name)
69
 
70
+ document_chunks = [document_text[i:i + 500] for i in range(0, len(document_text), 500)]
71
 
72
+ upload_document_chunks(document_chunks)
73
+ top_chunks = query_answer(user_query)
74
+ context = " ".join(top_chunks)
75
 
76
+ answer = generate_response(context, user_query)
 
 
77
 
78
+ st.subheader("πŸ“œ Retrieved Document Segments")
79
+ for i, chunk in enumerate(top_chunks, 1):
80
+ st.markdown(f"**Chunk {i}:** {chunk}")
81
 
82
+ st.subheader("πŸ’¬ Answer")
83
+ st.success(answer)