ZeeAI1 committed on
Commit
aca722a
·
verified ·
1 Parent(s): c3ce139

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +111 -0
app.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ import streamlit as st
4
+ from io import BytesIO
5
+ from PyPDF2 import PdfReader
6
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+ from langchain.embeddings import HuggingFaceEmbeddings
8
+ from langchain.vectorstores import FAISS
9
+ from transformers import pipeline
10
+ import torch
11
+
12
+ # Set up the page configuration
13
+ st.set_page_config(page_title="RAG-based PDF Chat", layout="centered", page_icon="📄")
14
+
15
+ # Load the summarization pipeline model
16
@st.cache_resource
def load_summarization_pipeline():
    """Build the Hugging Face summarization pipeline used to phrase answers.

    The function is decorated with ``st.cache_resource`` so the pipeline
    object is created once and then reused instead of being rebuilt on
    every call.

    Returns:
        The ``transformers`` pipeline for the ``"summarization"`` task,
        backed by the ``facebook/bart-large-cnn`` checkpoint.
    """
    return pipeline("summarization", model="facebook/bart-large-cnn")
20
+
21
+ summarizer = load_summarization_pipeline()
22
+
23
+ # Dictionary of Hugging Face PDF URLs grouped by folders
24
+ PDF_FOLDERS = {
25
+ # Map each folder name to a list of Hugging Face PDF URLs, e.g. "Contracts": ["https://huggingface.co/<user>/<repo>/blob/main/<file>.pdf"]
26
+ }
27
+
28
+ # Helper function to convert Hugging Face blob URLs to direct download URLs
29
def get_huggingface_raw_url(url):
    """Convert a Hugging Face ``/blob/`` page URL into a direct-download URL.

    Args:
        url: Any URL string. Only URLs that mention ``huggingface.co`` and
            contain a ``/blob/`` segment are rewritten.

    Returns:
        The URL with its first ``/blob/`` segment replaced by ``/resolve/``,
        or ``url`` unchanged when it is not a Hugging Face blob URL.
    """
    if "huggingface.co" in url and "/blob/" in url:
        # Replace only the first "/blob/": it is the route marker. Any later
        # occurrence is part of the file's own path and must be left intact.
        return url.replace("/blob/", "/resolve/", 1)
    return url
33
+
34
+ # Fetch and extract text from all PDFs in specified folders
35
def fetch_pdf_text_from_folders(pdf_folders, timeout=30):
    """Download every listed PDF and return all extracted text as one string.

    Each folder contributes a ``[Folder: <name>]`` header line followed by the
    text of every page that yielded any text, in the order the URLs are
    listed. Failures are reported with ``st.error`` and the remaining URLs are
    still processed, so one bad document does not abort the whole load.

    Args:
        pdf_folders: Mapping of folder name to an iterable of PDF URLs.
        timeout: Seconds to wait on each HTTP request before giving up, so a
            stalled server cannot hang the app indefinitely.

    Returns:
        The concatenated text of all folders; an empty string when
        ``pdf_folders`` is empty.
    """
    # Collect fragments and join once instead of growing a string with "+=".
    parts = []
    for folder_name, urls in pdf_folders.items():
        parts.append(f"\n[Folder: {folder_name}]\n")
        for url in urls:
            raw_url = get_huggingface_raw_url(url)
            try:
                response = requests.get(raw_url, timeout=timeout)
                response.raise_for_status()
                pdf_reader = PdfReader(BytesIO(response.content))
                for page in pdf_reader.pages:
                    page_text = page.extract_text()
                    # Pages with no extractable text are skipped.
                    if page_text:
                        parts.append(page_text)
            except requests.RequestException as e:
                st.error(f"Failed to fetch PDF from URL: {url} - {e}")
            except Exception as e:
                # Deliberately broad: any parsing failure on one PDF is
                # reported and the loop moves on to the next URL.
                st.error(f"Failed to read PDF from URL {url}: {e}")
    return "".join(parts)
56
+
57
+ # Split text into manageable chunks
58
@st.cache_data
def get_text_chunks(text):
    """Split ``text`` into overlapping chunks for embedding.

    The result is cached with ``st.cache_data``.

    Args:
        text: The full corpus text to split.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_text``
        with ``chunk_size=10000`` and ``chunk_overlap=1000``.
    """
    # NOTE(review): chunks this large may exceed what the embedding model
    # actually encodes - confirm against the model's input limit.
    splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    return splitter.split_text(text)
63
+
64
+ # Initialize embedding function
65
@st.cache_resource
def _load_embedding_function():
    """Create the sentence-embedding model once and reuse it.

    Cached with ``st.cache_resource``, the same way the summarization
    pipeline is, so the model is not re-instantiated each time this module's
    top level executes.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for
        ``sentence-transformers/all-MiniLM-L6-v2``.
    """
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


# Module-level handle kept under its original name for existing callers.
embedding_function = _load_embedding_function()
66
+
67
+ # Create a FAISS vector store with embeddings, checking for empty chunks
68
@st.cache_resource
def load_or_create_vector_store(text_chunks):
    """Build a FAISS vector store from ``text_chunks``.

    The result is cached with ``st.cache_resource``.

    Args:
        text_chunks: Text chunks to embed with the module-level
            ``embedding_function``.

    Returns:
        The FAISS store built from the chunks, or ``None`` (after showing an
        error in the UI) when ``text_chunks`` is empty.
    """
    if text_chunks:
        return FAISS.from_texts(text_chunks, embedding=embedding_function)

    # Nothing to index: report it and let the caller handle the missing store.
    st.error("No valid text chunks found to create a vector store. Please check your PDF URLs or file content.")
    return None
75
+
76
+ # Generate summary based on the retrieved text
77
def generate_summary_with_huggingface(query, retrieved_text):
    """Summarize the retrieved context together with the user's query.

    Args:
        query: The user's question; placed at the start of the model input.
        retrieved_text: Context text retrieved for the question.

    Returns:
        The ``summary_text`` of the first result returned by the
        module-level ``summarizer``.
    """
    # Only the first 1024 characters of the combined prompt are sent on.
    char_limit = 1024
    prompt = f"{query}\n\nRelated information:\n{retrieved_text}"[:char_limit]
    results = summarizer(prompt, max_length=500, min_length=50, do_sample=False)
    first_result = results[0]
    return first_result["summary_text"]
83
+
84
+ # Generate response for user query
85
def user_input(user_question, vector_store):
    """Answer ``user_question`` from the documents in ``vector_store``.

    Args:
        user_question: The question to answer.
        vector_store: A store exposing ``similarity_search``, or ``None``
            when no store could be built.

    Returns:
        The generated summary of the matching documents, or a fixed
        explanatory message when ``vector_store`` is ``None``.
    """
    if vector_store is not None:
        matches = vector_store.similarity_search(user_question)
        # Join the matched passages into a single context string.
        context_text = " ".join(match.page_content for match in matches)
        return generate_summary_with_huggingface(user_question, context_text)

    return "Vector store is empty due to failed PDF loading or empty documents."
91
+
92
+ # Main function to run the Streamlit app
93
@st.cache_data
def _load_corpus_text(pdf_folders):
    """Fetch and extract the PDF corpus, cached so it is not re-downloaded."""
    # NOTE(review): a failed download is cached too; clear the Streamlit
    # cache or restart the app to retry.
    return fetch_pdf_text_from_folders(pdf_folders)


def main():
    """Render the Streamlit UI and answer the user's question.

    Builds the corpus text, its chunks and the vector store, then shows a
    text box and a button. On click, either warns that the question is empty
    or displays the generated answer.
    """
    st.title("📄 Gen AI Lawyers Guide")

    # Go through the cached helper so a call to main() does not trigger a
    # fresh download and parse of every PDF.
    raw_text = _load_corpus_text(PDF_FOLDERS)
    text_chunks = get_text_chunks(raw_text)
    vector_store = load_or_create_vector_store(text_chunks)

    user_question = st.text_input("Ask a Question:", placeholder="Type your question here...")

    if st.button("Get Response"):
        # Treat whitespace-only input as empty so it is never sent to the model.
        if not user_question or not user_question.strip():
            st.warning("Please enter a question before submitting.")
        else:
            with st.spinner("Generating response..."):
                answer = user_input(user_question, vector_store)
            st.markdown(f"**🤖 AI:** {answer}")
108
+
109
+ if __name__ == "__main__":
110
+ main()
111
+