# pdf_qa/app.py
import streamlit as st
import csv
import chromadb
from chromadb.utils import embedding_functions
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain.llms import HuggingFacePipeline
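# Assumed dependencies (package names only; pin versions as needed):
#   pip install streamlit chromadb sentence-transformers transformers langchain torch
# Note: on recent LangChain releases, HuggingFacePipeline is imported from
# langchain_community.llms rather than langchain.llms.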
# Load sample data (items from a restaurant menu)
# with open('./data.csv') as file:
#     lines = csv.reader(file)
#     # Store the names of the menu items in this array. In Chroma, a "document" is a string, i.e. a name, sentence, paragraph, etc.
#     documents = []
#     # Store the corresponding menu item IDs in this array.
#     metadatas = []
#     # Each "document" needs a unique ID. This is like the primary key of a relational database. We'll start at 1 and increment from there.
#     ids = []
#     id = 1
#     # Loop through each line and populate the 3 arrays.
#     for i, line in enumerate(lines):
#         if i == 0:
#             # Skip the first row (the column headers)
#             continue
#         documents.append(line[0])
#         metadatas.append({"item_id": line[1]})
#         ids.append(str(id))
#         id += 1
# Instantiate a chromadb client. Data is stored on disk (a folder named 'vector_db' will be created in the same folder as this file).
chroma_client = chromadb.PersistentClient(path="vector_db")
# Select the embedding model to use.
# List of model names can be found here https://www.sbert.net/docs/pretrained_models.html
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")
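# Optional sanity check (a sketch; assumes the sentence-transformers package is
# installed). The embedding function maps a list of strings to a list of
# vectors; all-mpnet-base-v2 produces 768-dimensional embeddings.
# vectors = sentence_transformer_ef(["grilled salmon"])
# print(len(vectors[0]))  # 768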
# Use this to delete the database
# chroma_client.delete_collection(name="my_collection")
# Create the collection, aka vector database. Or, if the database already exists, use it. Specify the model that we want to use to do the embedding.
collection = chroma_client.get_or_create_collection(name="my_collection", embedding_function=sentence_transformer_ef)
# Add the data to the collection
# collection.add(
#     documents=documents,
#     metadatas=metadatas,
#     ids=ids
# )
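# A minimal ingest guard (sketch): only run the add step when the collection is
# empty, so re-running the app does not insert duplicate IDs.
# if collection.count() == 0:
#     collection.add(documents=documents, metadatas=metadatas, ids=ids)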
# Streamlit app layout
st.title("ChromaDB and HuggingFace Pipeline Integration")
query = st.text_input("Enter your query:", value="director")
if st.button("Search"):
results = collection.query(
query_texts=[query],
n_results=1,
include=['documents', 'distances', 'metadatas']
)
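    # Chroma returns one result list per query text, so each key in `results`
    # holds a list of lists, e.g. {'documents': [[...]], 'metadatas': [[...]], 'distances': [[...]]}.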
st.write("Query Results:")
st.write(results['metadatas'])
if results['documents']:
context = results['documents']
st.write("Context:")
st.write(context)
        # Load the LLM used to answer the question from the retrieved context.
        tokenizer = AutoTokenizer.from_pretrained("MBZUAI/LaMini-T5-738M")
        model = AutoModelForSeq2SeqLM.from_pretrained("MBZUAI/LaMini-T5-738M")
        pipe = pipeline(
            "text2text-generation",
            model=model,
            tokenizer=tokenizer,
            max_length=512
        )
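        # For a "text2text-generation" pipeline, calling pipe(...) yields
        # [{'generated_text': '...'}]; HuggingFacePipeline unwraps this and
        # returns the generated text as a plain string.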
        local_llm = HuggingFacePipeline(pipeline=pipe)
        prompt = f"""
Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know; don't try to make up an answer.
{context}
Question: {query}
Helpful Answer:
"""
        answer = local_llm(prompt)
        st.write("Answer:")
        st.write(answer)
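# To launch the app locally: streamlit run app.py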