Spaces:
Sleeping
Sleeping
File size: 5,297 Bytes
4d4e63a 0128aff 131ff8a 0128aff b35ee0f 4d4e63a b35ee0f 131ff8a 0128aff 131ff8a 4d4e63a b35ee0f 4d4e63a b35ee0f 4d4e63a b35ee0f 131ff8a b35ee0f 131ff8a 4d4e63a b35ee0f 4d4e63a 131ff8a b35ee0f 4d4e63a 131ff8a 4d4e63a 131ff8a b35ee0f 4d4e63a b35ee0f 4d4e63a 131ff8a b35ee0f b4dece8 131ff8a b4dece8 b35ee0f 0128aff b35ee0f 4d4e63a 297e092 4d4e63a 297e092 4d4e63a 297e092 4d4e63a b35ee0f 4d4e63a 0128aff |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 |
import os
import pickle
import numpy as np
from PyPDF2 import PdfReader
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv
import streamlit as st
# Load environment variables from .env file
load_dotenv()
# Define a function to manually chunk text
def chunk_text(text, chunk_size=1000, chunk_overlap=200):
    """Split *text* into fixed-size character windows that overlap.

    Consecutive chunks start ``chunk_size - chunk_overlap`` characters
    apart, so each chunk repeats the last ``chunk_overlap`` characters of
    the previous one. The final chunk may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be > 0.
        chunk_overlap: Number of characters shared between neighbouring
            chunks; must be smaller than ``chunk_size``.

    Returns:
        A list of substrings of *text* (empty when *text* is empty).

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
            is not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if chunk_overlap >= chunk_size:
        # A zero or negative step would never advance through the text.
        raise ValueError("chunk_overlap must be smaller than chunk_size")

    step = chunk_size - chunk_overlap
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), step)
    ]
# Function to generate embeddings using sentence-transformers
def generate_embeddings(text_chunks, model_name='all-MiniLM-L6-v2'):
    """Encode *text_chunks* with a sentence-transformers model.

    Args:
        text_chunks: The texts to embed, passed straight to the model's
            ``encode`` method.
        model_name: Name of the sentence-transformers model to load.

    Returns:
        Whatever ``encode`` returns when called with
        ``convert_to_tensor=False``.
    """
    # A new model instance is constructed on every call.
    encoder = SentenceTransformer(model_name)
    return encoder.encode(text_chunks, convert_to_tensor=False)
# Function to find the most relevant chunk based on the cosine similarity
def find_best_chunk(query_embedding, text_embeddings):
    """Return the row of *text_embeddings* most similar to the query.

    Similarity is the cosine of the angle between the query vector and
    each row. Rows (or a query) with zero norm get a similarity of 0
    instead of NaN, so they can never win the argmax by accident.

    Args:
        query_embedding: 1-D array-like holding the query vector.
        text_embeddings: 2-D array-like with one embedding per row.

    Returns:
        A ``(best_index, similarity)`` tuple: the index of the best row
        and its cosine similarity to the query.

    Raises:
        ValueError: If *text_embeddings* is not a non-empty 2-D array.
    """
    text_matrix = np.asarray(text_embeddings, dtype=float)
    query_vector = np.asarray(query_embedding, dtype=float)
    if text_matrix.ndim != 2 or text_matrix.shape[0] == 0:
        raise ValueError("text_embeddings must be a non-empty 2-D array")

    dot_products = text_matrix @ query_vector
    denominators = (
        np.linalg.norm(text_matrix, axis=1) * np.linalg.norm(query_vector)
    )

    # Divide only where the denominator is non-zero; other entries stay 0.
    cosine_similarities = np.zeros_like(dot_products)
    np.divide(
        dot_products,
        denominators,
        out=cosine_similarities,
        where=denominators > 0,
    )

    best_index = np.argmax(cosine_similarities)
    return best_index, cosine_similarities[best_index]
# Main Streamlit app function
def _extract_pdf_text(pdf_reader):
    """Return the concatenated text of every page in *pdf_reader*."""
    # Guard against pages that yield no text (None or an empty string).
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)


def _load_or_create_embeddings(store_name, chunks):
    """Return embeddings for *chunks*, cached on disk as ``<store_name>.pkl``.

    The cache is reused only when it holds exactly one embedding per chunk;
    otherwise (missing, unreadable, or stale file) the embeddings are
    regenerated and the cache file is rewritten.
    """
    cache_path = f"{store_name}.pkl"
    cached = None
    if os.path.exists(cache_path):
        # NOTE(security): pickle.load can execute arbitrary code. This is
        # only acceptable while the .pkl files are written by this app.
        try:
            with open(cache_path, "rb") as cache_file:
                cached = pickle.load(cache_file)
        except (pickle.UnpicklingError, EOFError):
            cached = None  # unreadable cache: rebuild below

    # The cache is keyed by file name only, so check it matches this text.
    if cached is not None and len(cached) == len(chunks):
        st.write('Embeddings Loaded from the Disk')
        return cached

    embeddings = generate_embeddings(chunks)
    with open(cache_path, "wb") as cache_file:
        pickle.dump(embeddings, cache_file)
    return embeddings


# Main Streamlit app function
def main():
    """Render the PDF chatbot page.

    Lets the user upload a PDF, splits its text into chunks, embeds the
    chunks (using an on-disk cache), and answers a typed question by
    running an extractive question-answering pipeline on the chunk most
    similar to the question.
    """
    st.header("LLM-powered PDF Chatbot 💬")

    # Upload a PDF file
    pdf = st.file_uploader("Upload your PDF", type='pdf')

    if pdf is not None:
        text = _extract_pdf_text(PdfReader(pdf))

        # Split text into chunks
        chunks = chunk_text(text)

        # Cache name is the uploaded file name without its extension.
        store_name = os.path.splitext(os.path.basename(pdf.name))[0]
        st.write(f'{store_name}')

        if not chunks:
            # Nothing to embed or search; skip the question flow.
            st.warning("No extractable text was found in this PDF.")
        else:
            text_embeddings = _load_or_create_embeddings(store_name, chunks)

            # Accept user questions/query
            query = st.text_input("Ask questions about your PDF file:")

            if query:
                # Generate embeddings for the query
                query_embedding = generate_embeddings([query])[0]

                # Find the best chunk for the query
                best_index, similarity = find_best_chunk(
                    query_embedding, text_embeddings
                )
                best_chunk = chunks[best_index]

                # Use Hugging Face pipeline for question answering
                qa_pipeline = pipeline(
                    "question-answering",
                    model="distilbert-base-uncased-distilled-squad",
                )
                result = qa_pipeline(question=query, context=best_chunk)
                st.write(result['answer'])

    # Set background image from URL
    # NOTE(review): the original indentation was lost; this call is assumed
    # to run at function level on every render - confirm.
    set_bg_from_url("https://www.1access.com/wp-content/uploads/2019/10/GettyImages-1180389186.jpg", opacity=0.5)
def set_bg_from_url(url, opacity=1):
footer = """
<link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-gH2yIJqKdNHPEq0n4Mqa/HGKIhSkIHeL5AyhkYV8i59U5AR6csBvApHHNl/vI1Bx" crossorigin="anonymous">
<footer>
<div style='visibility: visible;margin-top:7rem;justify-content:center;display:flex;'>
<p style="font-size:1.1rem;">
Made by Asmae El-ghezzaz
<a href="https://www.linkedin.com/in/asmae-el-ghezzaz/">
<svg xmlns="http://www.w3.org/2000/svg" width="23" height="23" fill="white" class="bi bi-linkedin" viewBox="0 0 16 16">
<path d="M0 1.146C0 .513.526 0 1.175 0h13.65C15.474 0 16 .513 16 1.146v13.708c0 .633-.526 1.146-1.175 1.146H1.175C.526 16 0 15.487 0 14.854V1.146zm4.943 12.248V6.169H2.542v7.225h2.401zm-1.2-8.212c.837 0 1.358-.554 1.358-1.248-.015-.709-.52-1.248-1.342-1.248-.822 0-1.359.54-1.359 1.248 0 .694.521 1.248 1.327 1.248h.016zm4.908 8.212V9.359c0-.216.016-.432.08-.586.173-.431.568-.878 1.232-.878.869 0 1.216.662 1.216 1.634v3.865h2.401V9.25c0-2.22-1.184-3.252-2.764-3.252-1.274 0-1.845.7-2.165 1.193v.025h-.016a5.54 5.54 0 0 1 .016-.025V6.169h-2.4c.03.678 0 7.225 0 7.225h2.4z"/>
</svg>
</a>
<a href="https://github.com/aelghezzaz">
<svg xmlns="http://www.w3.org/2000/svg" width="23" height="23" fill="white" class="bi bi-github" viewBox="0 0 16 16">
<path d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.012 8.012 0 0 0 16 8c0-4.42-3.58-8-8-8z"/>
</svg>
</a>
</p>
</div>
</footer>
"""
st.markdown(footer, unsafe_allow_html=True)
# Set background image using
|