import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain_community.embeddings import HuggingFaceEmbeddings # Using Hugging Face embeddings
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
import re
# Load environment variables (expects GROQ_API_KEY in .env or the environment)
load_dotenv()

def get_pdf_text(pdf_docs):
    """Extracts text from uploaded PDF files."""
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            text += page.extract_text() or ""  # extract_text() can return None for image-only pages
    return text

def get_text_chunks(text):
    """Splits extracted text into manageable chunks."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    chunks = text_splitter.split_text(text)
    return chunks

def get_vector_store(text_chunks):
    """Creates and saves a FAISS vector store from text chunks."""
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")  # Using Hugging Face embeddings
    vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
    vector_store.save_local("faiss_index")

def get_conversational_chain():
    """Sets up a conversational chain using the Groq LLM."""
    prompt_template = """
    Answer the question as detailed as possible from the provided context. If the answer is not in
    the provided context, just say, "answer is not available in the context." Do not provide incorrect answers.

    Context:
    {context}

    Question:
    {question}

    Answer:
    """
    model = ChatGroq(
        temperature=0.3,
        model_name="deepseek-r1-distill-llama-70b",  # DeepSeek R1 distilled Llama 70B served through Groq
        groq_api_key=os.getenv("GROQ_API_KEY")
    )
    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
    chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)
    return chain

def user_input(user_question):
    """Handles user queries by retrieving answers from the vector store."""
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")  # Must match the embeddings used to build the index
    new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
    docs = new_db.similarity_search(user_question)

    chain = get_conversational_chain()
    response = chain(
        {"input_documents": docs, "question": user_question},
        return_only_outputs=True
    )
    output_text = response['output_text']

    # Debugging: print the original response
    print("Original Response:", output_text)

    # Extract the model's <think>...</think> reasoning block, if present
    thought_process = ""
    thought_process_match = re.search(r"<think>(.*?)</think>", output_text, re.DOTALL)
    if thought_process_match:
        thought_process = thought_process_match.group(1).strip()

    # Remove the thought-process block from the main response
    clean_response = re.sub(r"<think>.*?</think>", "", output_text, flags=re.DOTALL).strip()

    # Debugging: print the cleaned response
    print("Cleaned Response:", clean_response)

    # Display the model's thought process in an expander
    with st.expander("Model Thought Process"):
        st.write(thought_process)

    st.markdown(f"### Reply:\n{clean_response}")

def main():
    """Main function to run the Streamlit app."""
    st.set_page_config(page_title="Chat PDF", page_icon=":books:", layout="wide")
    st.title("Chat with PDF using DeepSeek AI")

    st.sidebar.header("Upload & Process PDF Files")
    st.sidebar.markdown(
        "Using the DeepSeek R1 model for advanced conversational capabilities.")

    with st.sidebar:
        pdf_docs = st.file_uploader(
            "Upload your PDF files:",
            accept_multiple_files=True,
            type=["pdf"]
        )
        if st.button("Submit & Process"):
            if not pdf_docs:
                st.warning("Please upload at least one PDF file before processing.")
            else:
                with st.spinner("Processing your files..."):
                    raw_text = get_pdf_text(pdf_docs)
                    text_chunks = get_text_chunks(raw_text)
                    get_vector_store(text_chunks)
                    st.success("PDFs processed and indexed successfully!")

    st.markdown(
        "### Ask Questions from Your PDF Files :mag:\n"
        "Once you upload and process your PDFs, type your questions below."
    )
    user_question = st.text_input("Enter your question:", placeholder="What do you want to know?")

    if user_question:
        with st.spinner("Fetching your answer..."):
            user_input(user_question)

    st.sidebar.info(
        "**Note:** This app uses the DeepSeek R1 model to answer questions accurately."
    )

if __name__ == "__main__":
    main()
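
# A minimal way to run this app locally, assuming the script is saved as app.py (the filename
# is an assumption) and a .env file containing GROQ_API_KEY sits next to it:
#
#   pip install streamlit PyPDF2 langchain langchain-community langchain-groq \
#       faiss-cpu sentence-transformers python-dotenv
#   streamlit run app.py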