ZeeAI1 committed on
Commit
1a235fe
Β·
verified Β·
1 Parent(s): fe031dc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -77
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import os
2
  import streamlit as st
3
  import pdfplumber
4
- from concurrent.futures import ThreadPoolExecutor
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
6
  from langchain.embeddings import HuggingFaceEmbeddings
7
  from langchain.vectorstores import FAISS
@@ -18,76 +17,30 @@ def load_summarization_pipeline():
18
 
19
  summarizer = load_summarization_pipeline()
20
 
21
- # Split text into manageable chunks
22
- @st.cache_data
23
- def get_text_chunks(text):
24
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
25
- chunks = text_splitter.split_text(text)
26
- return chunks
27
-
28
- # Initialize embedding function
29
- embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
30
 
31
- # Create a FAISS vector store with embeddings, checking for empty chunks
32
- @st.cache_resource
33
- def load_or_create_vector_store(text_chunks):
34
- if not text_chunks:
35
- st.error("No valid text chunks found to create a vector store. Please check your PDF files.")
36
- return None
37
- vector_store = FAISS.from_texts(text_chunks, embedding=embedding_function)
38
- return vector_store
39
-
40
- # Helper function to process a single PDF
41
- def process_single_pdf(file_path):
42
- text = ""
43
- try:
44
  with pdfplumber.open(file_path) as pdf:
45
  for page in pdf.pages:
46
  page_text = page.extract_text()
47
  if page_text:
48
- text += page_text
49
- except Exception as e:
50
- st.error(f"Failed to read PDF: {file_path} - {e}")
51
- return text
52
-
53
- # Function to load PDFs with progress display
54
- def load_pdfs_with_progress(folder_path):
55
- all_text = ""
56
- pdf_files = [os.path.join(folder_path, filename) for filename in os.listdir(folder_path) if filename.endswith('.pdf')]
57
- num_files = len(pdf_files)
58
-
59
- if num_files == 0:
60
- st.error("No PDF files found in the specified folder.")
61
- st.session_state['vector_store'] = None
62
- st.session_state['loading'] = False
63
- return
64
-
65
- # Title for the progress bar
66
- st.markdown("### Loading data...")
67
- progress_bar = st.progress(0)
68
- status_text = st.empty()
69
-
70
- processed_count = 0
71
-
72
- for file_path in pdf_files:
73
- result = process_single_pdf(file_path)
74
- all_text += result
75
- processed_count += 1
76
- progress_percentage = int((processed_count / num_files) * 100)
77
- progress_bar.progress(processed_count / num_files)
78
- status_text.text(f"Loading documents: {progress_percentage}% completed")
79
-
80
- progress_bar.empty() # Remove the progress bar when done
81
- status_text.text("Document loading completed!") # Show completion message
82
 
83
  if all_text:
84
- text_chunks = get_text_chunks(all_text)
85
- vector_store = load_or_create_vector_store(text_chunks)
86
- st.session_state['vector_store'] = vector_store
87
- else:
88
- st.session_state['vector_store'] = None
89
-
90
- st.session_state['loading'] = False # Mark loading as complete
 
 
 
 
91
 
92
  # Generate summary based on the retrieved text
93
  def generate_summary_with_huggingface(query, retrieved_text):
@@ -98,10 +51,7 @@ def generate_summary_with_huggingface(query, retrieved_text):
98
  return summary[0]["summary_text"]
99
 
100
  # Generate response for user query
101
- def user_input(user_question):
102
- vector_store = st.session_state.get('vector_store')
103
- if vector_store is None:
104
- return "The app is still loading documents or no documents were successfully loaded."
105
  docs = vector_store.similarity_search(user_question)
106
  context_text = " ".join([doc.page_content for doc in docs])
107
  return generate_summary_with_huggingface(user_question, context_text)
@@ -109,25 +59,25 @@ def user_input(user_question):
109
  # Main function to run the Streamlit app
110
  def main():
111
  st.title("πŸ“„ Gen AI Lawyers Guide")
 
 
112
 
113
- # Start loading documents if not already loaded
114
- if 'loading' not in st.session_state or st.session_state['loading']:
115
- st.session_state['loading'] = True
116
- load_pdfs_with_progress('documents1')
117
 
118
- user_question = st.text_input("Ask a Question:", placeholder="Type your question here...")
 
119
 
120
- if st.session_state.get('loading', True):
121
- st.info("The app is loading documents in the background. You can type your question now and submit once loading is complete.")
122
 
123
  if st.button("Get Response"):
124
  if not user_question:
125
  st.warning("Please enter a question before submitting.")
126
  else:
127
  with st.spinner("Generating response..."):
128
- answer = user_input(user_question)
129
  st.markdown(f"**πŸ€– AI:** {answer}")
130
 
131
  if __name__ == "__main__":
132
  main()
133
-
 
1
  import os
2
  import streamlit as st
3
  import pdfplumber
 
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
5
  from langchain.embeddings import HuggingFaceEmbeddings
6
  from langchain.vectorstores import FAISS
 
17
 
18
  summarizer = load_summarization_pipeline()
19
 
20
# Function to preprocess PDFs and store embeddings
def preprocess_pdfs(folder_path, save_vectorstore_path):
    """Extract text from every PDF in *folder_path*, chunk and embed it,
    and save a FAISS vector store to *save_vectorstore_path*.

    Parameters:
        folder_path: directory scanned (non-recursively) for PDF files.
        save_vectorstore_path: directory the FAISS index is written to.

    Reports progress/problems via Streamlit messages; returns None.
    """
    text_parts = []
    pdf_files = [
        os.path.join(folder_path, filename)
        for filename in os.listdir(folder_path)
        if filename.lower().endswith(".pdf")  # also match .PDF etc.
    ]

    if not pdf_files:
        st.error("No PDF files found in the specified folder.")
        return

    for file_path in pdf_files:
        try:
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text_parts.append(page_text)
        except Exception as e:
            # One unreadable PDF should not abort the whole preprocessing run.
            st.error(f"Failed to read PDF: {file_path} - {e}")

    # join once instead of repeated += (avoids quadratic string building)
    all_text = "".join(text_parts)

    if all_text:
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
        text_chunks = text_splitter.split_text(all_text)
        embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        vector_store = FAISS.from_texts(text_chunks, embedding=embedding_function)
        vector_store.save_local(save_vectorstore_path)
        st.success("Data preprocessing and vector store creation completed!")
    else:
        st.error("No text could be extracted from the PDF files.")
39
+
40
# Load pre-trained FAISS vector store
@st.cache_resource
def load_vector_store(save_vectorstore_path):
    """Load the FAISS vector store previously saved at *save_vectorstore_path*.

    Cached with st.cache_resource so the index and embedding model are
    constructed only once per Streamlit session.
    """
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return FAISS.load_local(save_vectorstore_path, embedding_function=embeddings)
44
 
45
  # Generate summary based on the retrieved text
46
  def generate_summary_with_huggingface(query, retrieved_text):
 
51
  return summary[0]["summary_text"]
52
 
53
# Generate response for user query
def user_input(user_question, vector_store):
    """Answer *user_question* by retrieving similar chunks from
    *vector_store* and summarizing them with the HuggingFace pipeline."""
    matching_docs = vector_store.similarity_search(user_question)
    combined_context = " ".join(doc.page_content for doc in matching_docs)
    return generate_summary_with_huggingface(user_question, combined_context)
 
59
# Main function to run the Streamlit app
def main():
    """Streamlit entry point: ensure the vector store exists, load it,
    and answer user questions about the indexed documents."""
    st.title("📄 Gen AI Lawyers Guide")
    data_folder = 'documents1'  # Folder where your PDFs are located
    vectorstore_path = 'vector_store_data/faiss_vectorstore'  # Folder to save the vector store

    # Build the vector store automatically on first run instead of the
    # fragile "uncomment this line once, then re-comment it" manual step.
    if not os.path.exists(vectorstore_path):
        preprocess_pdfs(data_folder, vectorstore_path)

    # Load the pre-trained vector store (cached across reruns)
    vector_store = load_vector_store(vectorstore_path)

    user_question = st.text_input("Ask a Question:", placeholder="Type your question here...")

    if st.button("Get Response"):
        if not user_question:
            st.warning("Please enter a question before submitting.")
        else:
            with st.spinner("Generating response..."):
                answer = user_input(user_question, vector_store)
                st.markdown(f"**🤖 AI:** {answer}")

if __name__ == "__main__":
    main()