ZeeAI1 committed
Commit 710b05f · verified · 1 Parent(s): 003effb

Create app.py

Files changed (1):
  1. app.py +176 -0
app.py ADDED
@@ -0,0 +1,176 @@
import os
import streamlit as st
import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import pipeline

# Set up the page configuration
st.set_page_config(page_title="RAG-based PDF Chat", layout="centered", page_icon="📄")

# Load the summarization pipeline model (cached across reruns)
@st.cache_resource
def load_summarization_pipeline():
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    return summarizer

summarizer = load_summarization_pipeline()

# Split text into manageable, overlapping chunks
@st.cache_data
def get_text_chunks(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    chunks = text_splitter.split_text(text)
    return chunks

# Initialize the embedding function
embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Create a FAISS vector store from the chunks, guarding against empty input
@st.cache_resource
def load_or_create_vector_store(text_chunks):
    if not text_chunks:
        st.error("No valid text chunks found to create a vector store. Please check your PDF files.")
        return None
    vector_store = FAISS.from_texts(text_chunks, embedding=embedding_function)
    return vector_store

# Extract the text of a single PDF
def process_single_pdf(file_path):
    text = ""
    try:
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"  # newline keeps words at page boundaries from fusing
    except Exception as e:
        st.error(f"Failed to read PDF: {file_path} - {e}")
    return text

# Load every PDF in the folder, updating a progress bar as each file finishes
def load_pdfs_with_progress(folder_path):
    all_text = ""
    pdf_files = [os.path.join(folder_path, filename) for filename in os.listdir(folder_path) if filename.endswith('.pdf')]
    num_files = len(pdf_files)

    if num_files == 0:
        st.error("No PDF files found in the specified folder.")
        st.session_state['vector_store'] = None
        st.session_state['loading'] = False
        return

    # Title for the progress bar
    st.markdown("### Loading data...")
    progress_bar = st.progress(0)
    status_text = st.empty()

    processed_count = 0

    for file_path in pdf_files:
        all_text += process_single_pdf(file_path)
        processed_count += 1
        progress_percentage = int((processed_count / num_files) * 100)
        progress_bar.progress(processed_count / num_files)
        status_text.text(f"Loading documents: {progress_percentage}% completed")

    progress_bar.empty()  # Remove the progress bar when done
    status_text.text("Document loading completed!")  # Show completion message

    if all_text:
        text_chunks = get_text_chunks(all_text)
        st.session_state['vector_store'] = load_or_create_vector_store(text_chunks)
    else:
        st.session_state['vector_store'] = None

    st.session_state['loading'] = False  # Mark loading as complete

# Generate a summary from the retrieved context
def generate_summary_with_huggingface(query, retrieved_text):
    summarization_input = f"{query} Related information: {retrieved_text}"
    # Rough character-level cap; BART's actual limit is 1024 *tokens*, and the
    # pipeline truncates anything beyond that on its own.
    max_input_length = 1024
    summarization_input = summarization_input[:max_input_length]
    summary = summarizer(summarization_input, max_length=500, min_length=50, do_sample=False)
    return summary[0]["summary_text"]

# Answer a user query: retrieve similar chunks, then summarize them
def user_input(user_question):
    vector_store = st.session_state.get('vector_store')
    if vector_store is None:
        return "The app is still loading documents, or no documents were successfully loaded."
    docs = vector_store.similarity_search(user_question)
    context_text = " ".join(doc.page_content for doc in docs)
    return generate_summary_with_huggingface(user_question, context_text)

# Main function to run the Streamlit app
def main():
    st.title("📄 Gen AI Lawyers Guide")

    # Load documents on the first run (or if a previous load never finished)
    if 'loading' not in st.session_state or st.session_state['loading']:
        st.session_state['loading'] = True
        load_pdfs_with_progress('documents1')

    user_question = st.text_input("Ask a Question:", placeholder="Type your question here...")

    if st.session_state.get('loading', True):
        st.info("The app is still loading documents. You can type your question now and submit once loading is complete.")

    if st.button("Get Response"):
        if not user_question:
            st.warning("Please enter a question before submitting.")
        else:
            with st.spinner("Generating response..."):
                answer = user_input(user_question)
                st.markdown(f"**🤖 AI:** {answer}")

if __name__ == "__main__":
    main()
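
For anyone skimming the commit, the retrieval-plus-summarization path the app takes on each question can be exercised headlessly. Below is a minimal sketch under the same library versions the imports above assume; the sample clauses and the question are made up for illustration, not taken from the Space's documents:

# Minimal headless sketch of app.py's RAG flow (sample texts are illustrative).
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import pipeline

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
store = FAISS.from_texts(
    ["Clause 7 caps liability at direct damages only.",
     "Either party may terminate with 30 days' written notice."],
    embedding=embeddings,
)

question = "What does the contract say about liability?"
context = " ".join(d.page_content for d in store.similarity_search(question))

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
print(summarizer(f"{question} Related information: {context}"[:1024],
                 max_length=60, min_length=10, do_sample=False)[0]["summary_text"])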