Spaces:

ZeeAI1
/

LawFi3

Sleeping

File size: 3,536 Bytes

9e3a84c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a235fe
 
 
 
9e3a84c
1a235fe
9e3a84c
 
 
 
1a235fe
9e3a84c
 
1a235fe
 
 
 
 
 
 
 
 
 
58ab5fb
ea1cfc6
 
58ab5fb
9e3a84c
 
 
 
 
 
 
 
 
 
1a235fe
9e3a84c
 
 
 
 
 
 
1a235fe
 
9e3a84c
1a235fe
 
9e3a84c
1a235fe
 
9e3a84c
1a235fe
9e3a84c
 
 
 
 
 
1a235fe
9e3a84c
 
 
 
1a235fe
58ab5fb

import os
import streamlit as st
import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import pipeline

# Configure the Streamlit page: browser-tab title, tab icon, and centered layout.
st.set_page_config(
    page_title="RAG-based PDF Chat",
    page_icon="📄",
    layout="centered",
)

# Build the summarization model (cached by Streamlit as a shared resource)
@st.cache_resource
def load_summarization_pipeline():
    """Return a Hugging Face ``summarization`` pipeline for ``facebook/bart-large-cnn``.

    The function is decorated with ``st.cache_resource`` so the pipeline object
    is created once and reused instead of being rebuilt on each call.
    """
    return pipeline("summarization", model="facebook/bart-large-cnn")

# Module-level handle to the cached summarization pipeline; it is read as a
# global by generate_summary_with_huggingface below.
summarizer = load_summarization_pipeline()

# Function to preprocess PDFs and store embeddings
def preprocess_pdfs(folder_path, save_vectorstore_path, *, chunk_size=10000, chunk_overlap=1000):
    """Extract text from every PDF in a folder and save a FAISS index of it.

    Args:
        folder_path: Directory scanned (non-recursively) for ``.pdf`` files.
            The extension match is case-insensitive.
        save_vectorstore_path: Destination passed to ``FAISS.save_local``.
        chunk_size: Maximum characters per chunk handed to the text splitter.
        chunk_overlap: Characters shared between consecutive chunks.

    Returns:
        None. On success the vector store is written to disk and a Streamlit
        success message is shown; when no text could be extracted a Streamlit
        warning is shown and nothing is written.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``folder_path`` cannot be
            listed (for example, it does not exist).
    """
    # Sort so the index is built in a reproducible order; os.listdir makes no
    # ordering guarantee.
    pdf_files = sorted(
        os.path.join(folder_path, filename)
        for filename in os.listdir(folder_path)
        if filename.lower().endswith(".pdf")
    )

    page_texts = []
    for file_path in pdf_files:
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                # Skip pages for which no text was returned.
                if page_text:
                    page_texts.append(page_text)

    if not page_texts:
        st.warning("No extractable text was found in the PDF folder; the vector store was not created.")
        return

    # Join with a newline so the last word of one page does not fuse with the
    # first word of the next, and so the splitter has a boundary to break on.
    all_text = "\n".join(page_texts)

    # NOTE(review): the default 10000-character chunks are probably much longer
    # than the embedding model's input window, in which case only the start of
    # each chunk influences its vector -- confirm and consider a smaller
    # chunk_size before rebuilding the index.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    text_chunks = text_splitter.split_text(all_text)
    embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vector_store = FAISS.from_texts(text_chunks, embedding=embedding_function)
    vector_store.save_local(save_vectorstore_path)
    st.success("Data preprocessing and vector store creation completed!")

# Load the previously saved FAISS vector store
@st.cache_resource
def load_vector_store(save_vectorstore_path):
    """Load the FAISS vector store saved at ``save_vectorstore_path``.

    The same ``sentence-transformers/all-MiniLM-L6-v2`` embedding model that
    ``preprocess_pdfs`` uses to build the index is supplied for query
    embedding. The function is decorated with ``st.cache_resource`` so the
    loaded store is reused for a given path.
    """
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    # NOTE(review): allow_dangerous_deserialization=True is an explicit opt-in
    # to loading serialized (pickle-based, per the LangChain docs) data. Only
    # point this at index files produced by this application; never at
    # user-supplied or downloaded files.
    store = FAISS.load_local(
        save_vectorstore_path,
        embeddings,
        allow_dangerous_deserialization=True,
    )
    return store



# Generate summary based on the retrieved text
def generate_summary_with_huggingface(query, retrieved_text, max_input_chars=1024):
    """Summarize the retrieved context, prefixed by the user's query.

    Args:
        query: The user's question; placed at the start of the model input.
        retrieved_text: Context text to summarize.
        max_input_chars: Character cap applied to the combined input before it
            is sent to the model. Defaults to 1024, the original hard-coded
            value.

    Returns:
        The ``summary_text`` string of the first result returned by the
        module-level ``summarizer`` pipeline.
    """
    summarization_input = f"{query} Related information:{retrieved_text}"[:max_input_chars]
    summary = summarizer(
        summarization_input,
        max_length=500,
        min_length=50,
        do_sample=False,
        # The cap above counts characters, but the model's limit is in tokens.
        # Text that tokenizes to more than one token per character (e.g.
        # non-ASCII content) could still overflow, so let the tokenizer
        # truncate as a safety net instead of failing inside the model.
        truncation=True,
    )
    return summary[0]["summary_text"]

# Answer a user query from the indexed documents
def user_input(user_question, vector_store):
    """Retrieve documents similar to the question and summarize them.

    Args:
        user_question: The question text used for the similarity search and
            forwarded to the summarizer.
        vector_store: Object exposing ``similarity_search(query)`` that returns
            items with a ``page_content`` attribute.

    Returns:
        The string produced by ``generate_summary_with_huggingface`` for the
        question and the space-joined contents of the retrieved documents.
    """
    matches = vector_store.similarity_search(user_question)
    retrieved = " ".join(match.page_content for match in matches)
    return generate_summary_with_huggingface(user_question, retrieved)

# Main function to run the Streamlit app
def main():
    """Render the chat page: ensure the index exists, then answer questions.

    If the vector store directory is missing, it is built from the PDFs in the
    data folder before loading. If it cannot be built, an error is shown and
    the page stops rendering instead of raising from ``load_vector_store``.
    """
    st.title("📄 Gen AI Lawyers Guide")
    data_folder = 'documents1'  # Folder where your PDFs are located
    vectorstore_path = 'vector_store_data/faiss_vectorstore'  # Folder to save the vector store

    # Build the index automatically on first run rather than requiring a
    # developer to edit the source to enable preprocessing.
    if not os.path.isdir(vectorstore_path):
        if not os.path.isdir(data_folder):
            st.error(
                f"Vector store '{vectorstore_path}' was not found and the PDF folder "
                f"'{data_folder}' does not exist, so it cannot be built."
            )
            return
        with st.spinner("Building vector store from PDFs..."):
            preprocess_pdfs(data_folder, vectorstore_path)
        # preprocess_pdfs writes nothing when no text could be extracted.
        if not os.path.isdir(vectorstore_path):
            st.error(f"Could not build a vector store from the PDFs in '{data_folder}'.")
            return

    # Load the pre-trained vector store
    vector_store = load_vector_store(vectorstore_path)

    user_question = st.text_input("Ask a Question:", placeholder="Type your question here...")

    if st.button("Get Response"):
        # Treat whitespace-only input the same as an empty question.
        question = user_question.strip()
        if not question:
            st.warning("Please enter a question before submitting.")
        else:
            with st.spinner("Generating response..."):
                answer = user_input(question, vector_store)
                st.markdown(f"**🤖 AI:** {answer}")

# Start the app only when this file is executed as the main script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()