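"""Gradio app for comparing vector databases in a PDF question-answering (RAG) pipeline.

Uploaded PDFs are chunked, embedded with Google Generative AI embeddings, and
stored in four vector stores (Chroma, FAISS, LanceDB, Pinecone). The same
question is then run against each store and the answers are shown side by side.
"""
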
import os
import gradio as gr
from pinecone import Pinecone
from langchain_chroma import Chroma
from langchain_core.prompts import PromptTemplate
from langchain_pinecone import PineconeVectorStore
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores import LanceDB
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_google_genai import GoogleGenerativeAIEmbeddings, GoogleGenerativeAI
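
# One embedding model feeds all four stores; greedy decoding (temperature=0,
# top_k=1, top_p=0) keeps Gemini's answers deterministic, so differences
# between stores reflect retrieval rather than sampling noise.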
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
gemini = GoogleGenerativeAI(model="models/gemini-2.0-flash", temperature=0.0, top_k=1, top_p=0.0)

prompt_template = """
Context:
{context}

Question:
{question}

Answer:
"""
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
chain = prompt | gemini
index_name = "langchain-test-index"
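
# Load every uploaded PDF into LangChain Documents (PyPDFLoader yields one Document per page).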
def extract_text_from_pdf(pdf_paths):
    raw_documents = []
    for path in pdf_paths:
        raw_documents.extend(PyPDFLoader(path).load())
    return raw_documents

def chunk_text(raw_documents, chunk_size, chunk_overlap):
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    documents = text_splitter.split_documents(raw_documents)
    return documents
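
# Pinecone is a hosted index (host URL hardcoded to this project), so stale
# vectors must be cleared before re-ingesting; the local stores are simply
# rebuilt on disk.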
def delete_pinecone():
    pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
    index = pc.Index(host="https://langchain-test-index-la2n80y.svc.aped-4627-b74a.pinecone.io")
    if index.describe_index_stats()["total_vector_count"] > 0:
        index.delete(delete_all=True)
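
# --- Build and persist one vector store of each type from the same chunks ---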
def store_chroma_db(documents):
    Chroma.from_documents(documents, embeddings, persist_directory="./chroma_db")

def store_faiss_db(documents):
    faiss_db = FAISS.from_documents(documents, embeddings)
    faiss_db.save_local("./faiss_db")

def store_lance_db(documents):
    LanceDB.from_documents(documents, embeddings, uri="./lance_db")

def store_pinecone_db(documents):
    PineconeVectorStore.from_documents(documents, index_name=index_name, embedding=embeddings)
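
# --- Reopen the persisted stores (Pinecone is remote, so it is connected rather than loaded) ---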
def load_chroma_db():
    chroma_db = Chroma(persist_directory="./chroma_db", embedding_function=embeddings)
    return chroma_db

def load_faiss_db():
    faiss_db = FAISS.load_local("./faiss_db", embeddings, allow_dangerous_deserialization=True)
    return faiss_db

def load_lance_db():
    lance_db = LanceDB(embedding=embeddings, uri="./lance_db")
    return lance_db

def connect_pinecone_db():
    pinecone_db = PineconeVectorStore(index_name=index_name, embedding=embeddings)
    return pinecone_db
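
# Retrieve the most similar chunks from the given store and pass them to Gemini as context.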
def invoke_chain(db, query):
    docs = db.similarity_search(query)
    answer = chain.invoke({"context": docs, "question": query})
    return answer
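
# Full ingestion pipeline: load the PDFs, chunk them, then index the chunks in every store.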
def store_embeddings(pdf_paths, chunk_size, chunk_overlap):
    raw_documents = extract_text_from_pdf(pdf_paths)
    documents = chunk_text(raw_documents, chunk_size, chunk_overlap)
    delete_pinecone()
    store_chroma_db(documents)
    store_faiss_db(documents)
    store_lance_db(documents)
    store_pinecone_db(documents)
    return "All embeddings are stored in the vector databases"

title = "PDF Chat"
description = "A simple Gradio interface to query PDFs and compare vector databases"
examples = [[["data/amazon-10-k-2024.pdf"], 1000, 100],
            [["data/goog-10-k-2023.pdf"], 1000, 100]]
def inference(query):
    chroma_db = load_chroma_db()
    chroma_answer = invoke_chain(chroma_db, query)
    faiss_db = load_faiss_db()
    faiss_answer = invoke_chain(faiss_db, query)
    lance_db = load_lance_db()
    lance_answer = invoke_chain(lance_db, query)
    pinecone_db = connect_pinecone_db()
    pinecone_answer = invoke_chain(pinecone_db, query)
    return chroma_answer, faiss_answer, lance_answer, pinecone_answer
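
# --- Gradio UI: ingestion controls on the left, per-store answers on the right ---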
with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
    gr.Markdown(f"# {title}\n{description}")
    with gr.Row():
        with gr.Column():
            pdf = gr.File(label="Input PDFs", file_count="multiple", file_types=[".pdf"])
            chunk_size = gr.Slider(0, 2000, value=1000, step=100, label="Chunk Size")
            chunk_overlap = gr.Slider(0, 1000, value=100, step=100, label="Chunk Overlap")
            with gr.Row():
                clear_btn = gr.ClearButton(components=[pdf, chunk_size, chunk_overlap])
                submit_btn = gr.Button("Store Embeddings", variant="primary")
        with gr.Column():
            message = gr.Textbox(label="Status", type="text")
            chroma_out = gr.Textbox(label="ChromaDB Response", type="text")
            faiss_out = gr.Textbox(label="FAISS Response", type="text")
            lance_out = gr.Textbox(label="LanceDB Response", type="text")
            pinecone_out = gr.Textbox(label="Pinecone Response", type="text")
    submit_btn.click(store_embeddings, inputs=[pdf, chunk_size, chunk_overlap], outputs=message)
    with gr.Row():
        with gr.Column():
            text = gr.Textbox(label="Question", type="text")
            with gr.Row():
                chat_clear_btn = gr.ClearButton(components=[text])
                chat_submit_btn = gr.Button("Submit", variant="primary")
    chat_submit_btn.click(inference, inputs=[text],
                          outputs=[chroma_out, faiss_out, lance_out, pinecone_out])
    gr.Examples(examples=examples, inputs=[pdf, chunk_size, chunk_overlap])

demo.launch()