Spaces:

shallou
/

pdfchatbot

Sleeping

File size: 5,297 Bytes

4d4e63a
0128aff
131ff8a
0128aff
 
 
 
 
b35ee0f
 
4d4e63a
b35ee0f
131ff8a
 
 
 
 
 
 
 
 
0128aff
 
 
 
131ff8a
 
 
 
 
 
 
 
 
 
 
4d4e63a
 
b35ee0f
 
4d4e63a
b35ee0f
4d4e63a
 
 
 
 
b35ee0f
131ff8a
 
b35ee0f
131ff8a
4d4e63a
 
b35ee0f
4d4e63a
 
131ff8a
b35ee0f
4d4e63a
131ff8a
4d4e63a
131ff8a
b35ee0f
4d4e63a
 
b35ee0f
4d4e63a
131ff8a
 
 
 
 
 
b35ee0f
b4dece8
131ff8a
 
b4dece8
b35ee0f
0128aff
 
b35ee0f
4d4e63a
 
 
 
 
 
297e092
4d4e63a
297e092
4d4e63a
 
 
 
 
297e092
4d4e63a
 
 
 
 
 
 
b35ee0f
4d4e63a
0128aff

import os
import pickle
import numpy as np
from PyPDF2 import PdfReader
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv
import streamlit as st

# Load environment variables from .env file
# NOTE(review): no environment variable is read in the visible part of this
# file after this call -- confirm whether it is still required.
load_dotenv()

# Define a function to manually chunk text
def chunk_text(text, chunk_size=1000, chunk_overlap=200):
    """Split *text* into fixed-size, overlapping character windows.

    Consecutive chunks start ``chunk_size - chunk_overlap`` characters apart,
    so each chunk repeats the last ``chunk_overlap`` characters of the
    previous one. The final chunk may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        chunk_overlap: Number of characters shared by neighbouring chunks;
            must satisfy ``0 <= chunk_overlap < chunk_size``.

    Returns:
        A list of string chunks in document order (empty for empty *text*).

    Raises:
        ValueError: If the size/overlap combination would not advance through
            the text (which previously caused an endless loop) or would skip
            characters between chunks.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size")

    # The stride is guaranteed positive here, so range() always terminates.
    step = chunk_size - chunk_overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

# Function to generate embeddings using sentence-transformers
# Models already instantiated by generate_embeddings, keyed by model name.
_EMBEDDING_MODELS = {}


def generate_embeddings(text_chunks, model_name='all-MiniLM-L6-v2'):
    """Encode *text_chunks* with a sentence-transformers model.

    The model object is created once per ``model_name`` and reused on later
    calls, so embedding the document and then the query does not instantiate
    the same model twice.

    Args:
        text_chunks: The texts to embed, passed straight to ``model.encode``.
        model_name: Name handed to ``SentenceTransformer``.

    Returns:
        Whatever ``model.encode(..., convert_to_tensor=False)`` returns for
        the given input.
    """
    # NOTE(review): this cache lives in the module namespace; if the hosting
    # framework re-executes the script for every interaction, a framework
    # level resource cache would be needed to persist it -- confirm.
    model = _EMBEDDING_MODELS.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _EMBEDDING_MODELS[model_name] = model
    return model.encode(text_chunks, convert_to_tensor=False)

# Function to find the most relevant chunk based on the cosine similarity
def find_best_chunk(query_embedding, text_embeddings):
    """Return the row of *text_embeddings* most similar to *query_embedding*.

    Similarity is the cosine of the angle between the query vector and each
    row. A row (or query) with zero norm has no defined cosine; it is scored
    as 0.0 instead of NaN so that it cannot be picked by ``argmax`` merely
    because NaN compares as the maximum.

    Args:
        query_embedding: 1-D vector for the query.
        text_embeddings: 2-D array with one embedding per row.

    Returns:
        A ``(best_index, best_similarity)`` tuple.

    Raises:
        ValueError: If *text_embeddings* is empty or not two-dimensional.
    """
    matrix = np.asarray(text_embeddings, dtype=float)
    query = np.asarray(query_embedding, dtype=float)
    if matrix.ndim != 2 or matrix.shape[0] == 0:
        raise ValueError("text_embeddings must be a non-empty 2-D array")

    dot_products = matrix @ query
    norm_products = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)

    # Divide only where the denominator is non-zero; other entries keep 0.0.
    cosine_similarities = np.divide(
        dot_products,
        norm_products,
        out=np.zeros_like(dot_products),
        where=norm_products > 0,
    )
    best_index = np.argmax(cosine_similarities)
    return best_index, cosine_similarities[best_index]

# Main Streamlit app function
def _load_or_create_embeddings(store_name, chunks):
    """Return embeddings for *chunks*, reusing ``<store_name>.pkl`` when valid.

    The cache file is keyed only by name, so a different PDF uploaded under
    the same name would otherwise load embeddings that do not belong to
    *chunks*. A cached object whose length differs from ``len(chunks)`` is
    therefore discarded and regenerated.
    """
    cache_path = f"{store_name}.pkl"
    if os.path.exists(cache_path):
        # SECURITY NOTE: pickle.load can execute arbitrary code; this file
        # must only ever be written by this application.
        with open(cache_path, "rb") as f:
            cached_embeddings = pickle.load(f)
        if len(cached_embeddings) == len(chunks):
            st.write('Embeddings Loaded from the Disk')
            return cached_embeddings

    text_embeddings = generate_embeddings(chunks)
    with open(cache_path, "wb") as f:
        pickle.dump(text_embeddings, f)
    return text_embeddings


def main():
    """Run the Streamlit PDF question-answering page.

    Lets the user upload a PDF, extracts and chunks its text, embeds the
    chunks (cached on disk per file name), and answers a typed question by
    running an extractive QA pipeline on the chunk most similar to the
    question. The page background/footer is applied at the end of every run.
    """
    st.header("LLM-powered PDF Chatbot 💬")

    # Upload a PDF file
    pdf = st.file_uploader("Upload your PDF", type='pdf')

    if pdf is not None:
        pdf_reader = PdfReader(pdf)
        # extract_text() may yield nothing for a page (e.g. scanned images);
        # treat that as an empty string and join once instead of using +=.
        text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

        # Split text into chunks
        chunks = chunk_text(text)

        if not chunks:
            # Without text there is nothing to embed or search.
            st.warning("No extractable text was found in this PDF.")
        else:
            # Strip directory parts and the extension from the uploaded name
            # so the cache file always lands next to the app.
            store_name = os.path.splitext(os.path.basename(pdf.name))[0]
            st.write(f'{store_name}')

            text_embeddings = _load_or_create_embeddings(store_name, chunks)

            # Accept user questions/query
            query = st.text_input("Ask questions about your PDF file:")

            if query:
                # Generate embeddings for the query
                query_embedding = generate_embeddings([query])[0]

                # Find the best chunk for the query
                best_index, _ = find_best_chunk(query_embedding, text_embeddings)
                best_chunk = chunks[best_index]

                # Use Hugging Face pipeline for question answering
                qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
                result = qa_pipeline(question=query, context=best_chunk)
                st.write(result['answer'])

    # Set background image from URL
    set_bg_from_url("https://www.1access.com/wp-content/uploads/2019/10/GettyImages-1180389186.jpg", opacity=0.5)

def set_bg_from_url(url, opacity=1):
    footer = """
    <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-gH2yIJqKdNHPEq0n4Mqa/HGKIhSkIHeL5AyhkYV8i59U5AR6csBvApHHNl/vI1Bx" crossorigin="anonymous">
    <footer>
        <div style='visibility: visible;margin-top:7rem;justify-content:center;display:flex;'>
            <p style="font-size:1.1rem;">
                Made by Asmae El-ghezzaz
                &nbsp;
                <a href="https://www.linkedin.com/in/asmae-el-ghezzaz/">
                    <svg xmlns="http://www.w3.org/2000/svg" width="23" height="23" fill="white" class="bi bi-linkedin" viewBox="0 0 16 16">
                        <path d="M0 1.146C0 .513.526 0 1.175 0h13.65C15.474 0 16 .513 16 1.146v13.708c0 .633-.526 1.146-1.175 1.146H1.175C.526 16 0 15.487 0 14.854V1.146zm4.943 12.248V6.169H2.542v7.225h2.401zm-1.2-8.212c.837 0 1.358-.554 1.358-1.248-.015-.709-.52-1.248-1.342-1.248-.822 0-1.359.54-1.359 1.248 0 .694.521 1.248 1.327 1.248h.016zm4.908 8.212V9.359c0-.216.016-.432.08-.586.173-.431.568-.878 1.232-.878.869 0 1.216.662 1.216 1.634v3.865h2.401V9.25c0-2.22-1.184-3.252-2.764-3.252-1.274 0-1.845.7-2.165 1.193v.025h-.016a5.54 5.54 0 0 1 .016-.025V6.169h-2.4c.03.678 0 7.225 0 7.225h2.4z"/>
                    </svg>          
                </a>
                &nbsp;
                <a href="https://github.com/aelghezzaz">
                    <svg xmlns="http://www.w3.org/2000/svg" width="23" height="23" fill="white" class="bi bi-github" viewBox="0 0 16 16">
                        <path d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.012 8.012 0 0 0 16 8c0-4.42-3.58-8-8-8z"/>
                    </svg>
                </a>
            </p>
        </div>
    </footer>
    """
    st.markdown(footer, unsafe_allow_html=True)
    
    # Set background image using