# pdf_qa / app.py
# import csv
#
# # Load the sample data from data.csv. In Chroma, a "document" is simply a
# # string: a name, a sentence, a paragraph, etc.
# documents = []
# # Store the corresponding item IDs (used as metadata) in this list.
# metadatas = []
# # Each document needs a unique ID, like the primary key of a relational
# # database. Start at 1 and increment from there (row_id avoids shadowing
# # the builtin `id`).
# ids = []
# row_id = 1
# # Loop through each row and populate the three lists. The loop must stay
# # inside the `with` block, because csv.reader reads lazily from the open file.
# with open('./data.csv') as file:
#     lines = csv.reader(file)
#     for i, line in enumerate(lines):
#         if i == 0:
#             # Skip the header row.
#             continue
#         documents.append(line[0])
#         metadatas.append({"item_id": line[1]})
#         ids.append(str(row_id))
#         row_id += 1
import chromadb
from chromadb.utils import embedding_functions
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
import torch
# Instantiate a chromadb client. The in-memory client keeps data only for the
# lifetime of the process:
# chroma_client = chromadb.Client()
# The persistent client stores data on disk (a folder named 'vector_db' is
# created next to this file).
chroma_client = chromadb.PersistentClient(path="vector_db")
# Select the embedding model to use.
# List of model names can be found here https://www.sbert.net/docs/pretrained_models.html
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")
# Uncomment to delete the collection and start fresh:
# chroma_client.delete_collection(name="my_collection")
# Create the collection (i.e. the vector database), or reuse it if it already exists. Specify the model to use for the embeddings.
collection = chroma_client.get_or_create_collection(name="my_collection", embedding_function=sentence_transformer_ef)
# collection.add(
#     documents=documents,
#     metadatas=metadatas,
#     ids=ids
# )
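# A hedged alternative to the one-shot add above (a sketch, not part of the
# original flow): guard on collection.count() so re-running the script does
# not raise duplicate-ID errors. Assumes the documents/metadatas/ids lists
# built from data.csv in the commented block above.
# if collection.count() == 0:
#     collection.add(documents=documents, metadatas=metadatas, ids=ids)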
results = collection.query(
    query_texts=["director"],
    n_results=1,
    include=['documents', 'distances', 'metadatas']
)
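# Chroma returns one inner list per query text, so for this single query
# results['metadatas'] has the shape [[{'item_id': ...}]] and
# results['documents'] the shape [['...']].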
print(results['metadatas'])
tokenizer = AutoTokenizer.from_pretrained("MBZUAI/LaMini-T5-738M")
model = AutoModelForSeq2SeqLM.from_pretrained("MBZUAI/LaMini-T5-738M")
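# `torch` above is only needed for explicit device placement. A hedged sketch
# (not in the original flow): move the model to the GPU when one is available;
# otherwise it stays on the CPU.
# if torch.cuda.is_available():
#     model = model.to("cuda")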
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512
)
local_llm = HuggingFacePipeline(pipeline=pipe)
context = results['documents'][0][0]
question = "director job"
prompt = f"""
Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know; don't try to make up an answer.
{context}
Question: {question}
Helpful Answer:
"""
print(local_llm(prompt))
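# A possible refactor (a sketch, not part of the original script): bundle
# retrieval and generation into one helper so further questions reuse the same
# collection and pipeline. The prompt template mirrors the one above.
def answer(user_question: str, n_results: int = 1) -> str:
    hits = collection.query(query_texts=[user_question], n_results=n_results)
    retrieved = " ".join(hits['documents'][0])
    qa_prompt = f"""
Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know; don't try to make up an answer.
{retrieved}
Question: {user_question}
Helpful Answer:
"""
    return local_llm(qa_prompt)

# Example usage:
# print(answer("who is the director?"))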