import os
import requests
import streamlit as st
from io import BytesIO
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings  # langchain_community.embeddings in newer LangChain releases
from langchain.vectorstores import FAISS  # langchain_community.vectorstores in newer LangChain releases
from transformers import pipeline
import torch

# Set up the page configuration as the first Streamlit command
st.set_page_config(page_title="RAG-based PDF Chat", layout="centered", page_icon="πŸ“„")

# Load the summarization pipeline model
@st.cache_resource
def load_summarization_pipeline():
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    return summarizer

summarizer = load_summarization_pipeline()
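
# Optional (sketch): transformers pipelines accept a `device` index, so with the
# `torch` import above the model can be pinned to a GPU when one is available:
#     pipeline("summarization", model="facebook/bart-large-cnn",
#              device=0 if torch.cuda.is_available() else -1)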

# Dictionary of Hugging Face PDF URLs grouped by folders.
# Note: these entries are "/tree/" folder-listing pages, which serve HTML rather
# than raw PDF bytes; fetch_pdf_text_from_folders below expects direct file URLs
# ("/blob/" or "/resolve/" paths). See the enumeration sketch after
# get_huggingface_raw_url.
PDF_FOLDERS = {
    "PPC and Administration": [
        "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/tree/main/PPC%20and%20Administration"
    ],
    "IHC": [
        "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/tree/main/IHC"
    ],
    "LHC": [
        "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/tree/main/LHC"
    ],
    "Lahore High Court Rules and Orders": [
        "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/tree/main/Lahore%20High%20Court%20Rules%20and%20Orders"
    ],
    "PHC": [
        "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/tree/main/PHC"
    ],
    "SC": [
        "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/tree/main/SC"
    ]
}

# Helper function to convert Hugging Face blob URLs to direct download URLs
def get_huggingface_raw_url(url):
    if "huggingface.co" in url and "/blob/" in url:
        return url.replace("/blob/", "/resolve/")
    return url
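
# --- Sketch (assumption): enumerating the real PDF files in a folder. ---
# Because the "/tree/" URLs above cannot yield PDF bytes, one option is to list
# the Space's files with huggingface_hub and build "/resolve/" URLs from them.
# Illustrative only; assumes the `huggingface_hub` package is installed.
def list_folder_pdf_urls(folder, repo_id="tahirsher/GenAI_Lawyers_Guide"):
    from urllib.parse import quote
    from huggingface_hub import list_repo_files
    files = list_repo_files(repo_id, repo_type="space")
    return [
        f"https://huggingface.co/spaces/{repo_id}/resolve/main/{quote(path)}"
        for path in files
        if path.startswith(folder + "/") and path.lower().endswith(".pdf")
    ]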

# Fetch and extract text from all PDFs in specified folders; cached so the
# PDFs are not re-downloaded on every Streamlit rerun
@st.cache_data
def fetch_pdf_text_from_folders(pdf_folders):
    all_text = ""
    for folder_name, urls in pdf_folders.items():
        folder_text = f"\n[Folder: {folder_name}]\n"
        for url in urls:
            raw_url = get_huggingface_raw_url(url)
            response = requests.get(raw_url, timeout=30)
            if response.status_code == 200:
                pdf_file = BytesIO(response.content)
                try:
                    pdf_reader = PdfReader(pdf_file)
                    for page in pdf_reader.pages:
                        page_text = page.extract_text()
                        if page_text:
                            folder_text += page_text + "\n"
                except Exception as e:
                    st.error(f"Failed to read PDF from URL {url}: {e}")
            else:
                st.error(f"Failed to fetch PDF from URL: {url}")
        all_text += folder_text
    return all_text

# Split text into manageable chunks
@st.cache_data
def get_text_chunks(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    chunks = text_splitter.split_text(text)
    return chunks
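
# Illustration (hypothetical input): with chunk_size=10000 and chunk_overlap=1000,
# consecutive chunks share roughly 1,000 characters, so text that straddles a
# boundary remains retrievable from either chunk:
#     get_text_chunks("x" * 25000)  # -> about 3 chunks of at most 10,000 chars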

# Initialize embedding function (all-MiniLM-L6-v2 yields 384-dimensional sentence embeddings)
embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Create a FAISS vector store with embeddings
@st.cache_resource
def load_or_create_vector_store(text_chunks):
    vector_store = FAISS.from_texts(text_chunks, embedding=embedding_function)
    return vector_store
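
# Sketch (assumption): the name "load_or_create" hints at persistence, but the
# cached function above only builds the index in memory. LangChain's FAISS
# wrapper can also save and reload an index from disk, e.g.:
def load_or_create_vector_store_persistent(text_chunks, path="faiss_index"):
    if os.path.isdir(path):
        # Newer LangChain releases also require allow_dangerous_deserialization=True.
        return FAISS.load_local(path, embedding_function)
    vector_store = FAISS.from_texts(text_chunks, embedding=embedding_function)
    vector_store.save_local(path)
    return vector_store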

# Generate summary based on the retrieved text
def generate_summary_with_huggingface(query, retrieved_text):
    summarization_input = f"{query}\n\nRelated information:\n{retrieved_text}"
    # Truncate by characters as a crude guard; note this keeps only ~1,000
    # characters (roughly 250 tokens), well under BART's 1,024-token window,
    # so most of the retrieved context is discarded
    max_input_length = 1024
    summarization_input = summarization_input[:max_input_length]
    summary = summarizer(summarization_input, max_length=500, min_length=50, do_sample=False)
    return summary[0]["summary_text"]
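
# Sketch (assumption): a token-accurate cut that uses the model's full
# 1,024-token window instead of 1,024 characters, via the pipeline's tokenizer:
def truncate_to_model_limit(text, max_tokens=1024):
    token_ids = summarizer.tokenizer(text, truncation=True, max_length=max_tokens)["input_ids"]
    return summarizer.tokenizer.decode(token_ids, skip_special_tokens=True)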

# Generate response for user query
def user_input(user_question, vector_store):
    docs = vector_store.similarity_search(user_question)
    context_text = " ".join([doc.page_content for doc in docs])
    return generate_summary_with_huggingface(user_question, context_text)
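
# Note: LangChain's similarity_search defaults to k=4 nearest chunks; pass k
# explicitly to widen or narrow the retrieved context, e.g.
#     docs = vector_store.similarity_search(user_question, k=6)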

# Main function to run the Streamlit app
def main():
    st.title("πŸ“„ Gen AI Lawyers Guide")
    raw_text = fetch_pdf_text_from_folders(PDF_FOLDERS)
    text_chunks = get_text_chunks(raw_text)
    vector_store = load_or_create_vector_store(text_chunks)

    user_question = st.text_input("Ask a Question:", placeholder="Type your question here...")

    if st.button("Get Response"):
        if not user_question:
            st.warning("Please enter a question before submitting.")
        else:
            with st.spinner("Generating response..."):
                answer = user_input(user_question, vector_store)
                st.markdown(f"**πŸ€– AI:** {answer}")

if __name__ == "__main__":
    main()