shallou committed on
Commit
131ff8a
·
verified ·
1 Parent(s): 297e092

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -21
app.py CHANGED
@@ -2,15 +2,48 @@ from dotenv import load_dotenv
2
  import streamlit as st
3
  import pickle
4
  from PyPDF2 import PdfReader
5
- from langchain.text_splitter import RecursiveCharacterTextSplitter
6
- from langchain.embeddings import HuggingFaceEmbeddings
7
- from langchain.vectorstores import FAISS
8
- from transformers import pipeline
9
  import os
 
 
10
 
11
  # Load environment variables from .env file
12
  load_dotenv()
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  def main():
15
  st.header("LLM-powered PDF Chatbot 💬")
16
 
@@ -19,43 +52,41 @@ def main():
19
 
20
  if pdf is not None:
21
  pdf_reader = PdfReader(pdf)
22
-
23
  text = ""
24
  for page in pdf_reader.pages:
25
  text += page.extract_text()
26
 
27
- text_splitter = RecursiveCharacterTextSplitter(
28
- chunk_size=1000,
29
- chunk_overlap=200,
30
- length_function=len
31
- )
32
- chunks = text_splitter.split_text(text=text)
33
 
34
- # Process and store embeddings
35
  store_name = pdf.name[:-4]
36
  st.write(f'{store_name}')
37
 
38
  if os.path.exists(f"{store_name}.pkl"):
39
  with open(f"{store_name}.pkl", "rb") as f:
40
- VectorStore = pickle.load(f)
41
  st.write('Embeddings Loaded from the Disk')
42
  else:
43
- embeddings = HuggingFaceEmbeddings()
44
- VectorStore = FAISS.from_texts(chunks, embedding=embeddings)
45
  with open(f"{store_name}.pkl", "wb") as f:
46
- pickle.dump(VectorStore, f)
47
 
48
  # Accept user questions/query
49
  query = st.text_input("Ask questions about your PDF file:")
50
 
51
  if query:
52
- docs = VectorStore.similarity_search(query=query, k=3)
 
 
 
 
 
53
 
54
  # Use Hugging Face pipeline for question answering
55
- model_name = "distilbert-base-uncased-distilled-squad" # Example model
56
- qa_pipeline = pipeline("question-answering", model=model_name)
57
- context = " ".join([doc.page_content for doc in docs])
58
- result = qa_pipeline(question=query, context=context)
59
  st.write(result['answer'])
60
 
61
  if __name__ == '__main__':
 
2
  import streamlit as st
3
  import pickle
4
  from PyPDF2 import PdfReader
5
+ from transformers import pipeline, AutoTokenizer, AutoModel
 
 
 
6
  import os
7
+ import torch
8
+ import numpy as np
9
 
10
  # Load environment variables from .env file
11
  load_dotenv()
12
 
13
+ # Define a function to manually chunk text
14
def chunk_text(text, chunk_size=1000, chunk_overlap=200):
    """Split *text* into fixed-size, overlapping character chunks.

    Consecutive chunks start ``chunk_size - chunk_overlap`` characters apart,
    so each chunk repeats the last ``chunk_overlap`` characters of the one
    before it. The final chunk may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        chunk_overlap: Number of characters shared between neighbouring
            chunks; must be smaller than ``chunk_size``.

    Returns:
        A list of substrings of ``text`` in document order (empty when
        ``text`` is empty).

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if
            ``chunk_overlap`` is greater than or equal to ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    step = chunk_size - chunk_overlap
    # A non-positive step would never advance past the end of the text and
    # the loop would spin forever, so reject it up front.
    if step <= 0:
        raise ValueError("chunk_overlap must be smaller than chunk_size")

    return [text[start:start + chunk_size] for start in range(0, len(text), step)]
22
+
23
+ # Function to generate embeddings using transformers
24
def generate_embeddings(text_chunks, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Embed every chunk with a Hugging Face model using mean pooling.

    The tokenizer and model named by ``model_name`` are loaded on each call.
    Every chunk is tokenized on its own (with truncation enabled), run
    through the model without gradient tracking, and reduced to a single
    vector by averaging the last hidden state over the token dimension.

    Args:
        text_chunks: Iterable of strings to embed.
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained``
            and ``AutoModel.from_pretrained``.

    Returns:
        A list with one NumPy array per input chunk, in input order.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    def embed(chunk):
        # One chunk per forward pass, so the token axis holds only this text.
        encoded = tokenizer(chunk, return_tensors='pt', truncation=True, padding=True)
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
        # Average over the token dimension, drop the batch axis, hand back NumPy.
        return hidden_states.mean(dim=1).squeeze().numpy()

    return [embed(chunk) for chunk in text_chunks]
37
+
38
+ # Function to find the most relevant chunk based on the cosine similarity
39
def find_best_chunk(query_embedding, text_embeddings):
    """Return the index and score of the embedding most similar to the query.

    Similarity is the cosine of the angle between ``query_embedding`` and
    each row of ``text_embeddings``. A vector with zero norm has no defined
    direction; its similarity is taken to be 0.0 so that it cannot produce a
    NaN, which ``np.argmax`` would otherwise select as the maximum.

    Args:
        query_embedding: 1-D vector for the query.
        text_embeddings: Non-empty sequence (or 2-D array) of vectors with
            the same length as ``query_embedding``.

    Returns:
        A ``(best_index, similarity)`` tuple: the position of the best
        matching row and its cosine similarity to the query.

    Raises:
        ValueError: If ``text_embeddings`` is empty or not two-dimensional.
    """
    matrix = np.asarray(text_embeddings, dtype=float)
    query = np.asarray(query_embedding, dtype=float)
    if matrix.ndim != 2 or matrix.shape[0] == 0:
        raise ValueError("text_embeddings must be a non-empty sequence of vectors")

    dot_products = np.dot(matrix, query)
    denominators = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)

    # Divide only where the denominator is non-zero; the remaining entries
    # keep the 0.0 they were initialised with instead of becoming NaN.
    cosine_similarities = np.divide(
        dot_products,
        denominators,
        out=np.zeros_like(dot_products),
        where=denominators != 0,
    )

    best_index = np.argmax(cosine_similarities)
    return best_index, cosine_similarities[best_index]
45
+
46
+ # Main Streamlit app function
47
  def main():
48
  st.header("LLM-powered PDF Chatbot 💬")
49
 
 
52
 
53
  if pdf is not None:
54
  pdf_reader = PdfReader(pdf)
55
+
56
  text = ""
57
  for page in pdf_reader.pages:
58
  text += page.extract_text()
59
 
60
+ # Split text into chunks
61
+ chunks = chunk_text(text)
 
 
 
 
62
 
63
+ # Generate embeddings for the chunks
64
  store_name = pdf.name[:-4]
65
  st.write(f'{store_name}')
66
 
67
  if os.path.exists(f"{store_name}.pkl"):
68
  with open(f"{store_name}.pkl", "rb") as f:
69
+ text_embeddings = pickle.load(f)
70
  st.write('Embeddings Loaded from the Disk')
71
  else:
72
+ text_embeddings = generate_embeddings(chunks)
 
73
  with open(f"{store_name}.pkl", "wb") as f:
74
+ pickle.dump(text_embeddings, f)
75
 
76
  # Accept user questions/query
77
  query = st.text_input("Ask questions about your PDF file:")
78
 
79
  if query:
80
+ # Generate embeddings for the query
81
+ query_embedding = generate_embeddings([query])[0]
82
+
83
+ # Find the best chunk for the query
84
+ best_index, similarity = find_best_chunk(query_embedding, text_embeddings)
85
+ best_chunk = chunks[best_index]
86
 
87
  # Use Hugging Face pipeline for question answering
88
+ qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
89
+ result = qa_pipeline(question=query, context=best_chunk)
 
 
90
  st.write(result['answer'])
91
 
92
  if __name__ == '__main__':