# import csv
# # Load sample data (a restaurant menu of items)
# with open('./data.csv') as file:
#     lines = csv.reader(file)
#     # Store the names of the menu items in this list. In Chroma, a "document" is a string, i.e. a name, sentence, paragraph, etc.
#     documents = []
#     # Store the corresponding menu item IDs in this list.
#     metadatas = []
#     # Each "document" needs a unique ID. This is like the primary key of a relational database. We'll start at 1 and increment from there.
#     ids = []
#     id = 1
#     # Loop through each line and populate the three lists.
#     for i, line in enumerate(lines):
#         if i == 0:
#             # Skip the first row (the column headers).
#             continue
#         documents.append(line[0])
#         metadatas.append({"item_id": line[1]})
#         ids.append(str(id))
#         id += 1
import chromadb
from chromadb.utils import embedding_functions
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain.llms import HuggingFacePipeline
import torch
# Instantiate a chromadb instance. Data is stored in memory only.
# chroma_client = chromadb.Client()
# Instantiate a chromadb instance. Data is persisted to disk (a folder named 'vector_db' will be created in the same folder as this file).
chroma_client = chromadb.PersistentClient(path="vector_db")
# Select the embedding model to use.
# A list of model names can be found here: https://www.sbert.net/docs/pretrained_models.html
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")
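# Quick sanity check (a minimal sketch, not part of the original pipeline): Chroma
# embedding functions are callable with a list of strings, so you can confirm the
# vector size before populating the database. Uncomment to try:
# sample_embeddings = sentence_transformer_ef(["sample text"])
# print(len(sample_embeddings[0]))  # all-mpnet-base-v2 produces 768-dimensional vectors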
# Use this to delete the collection.
# chroma_client.delete_collection(name="my_collection")
# Create the collection, aka the vector database, or use it if it already exists. Specify the model we want to use for the embeddings.
collection = chroma_client.get_or_create_collection(name="my_collection", embedding_function=sentence_transformer_ef)
# Add all the data to the vector database. Run this once to populate the collection;
# Chroma embeds and indexes the documents automatically.
# collection.add(
#     documents=documents,
#     metadatas=metadatas,
#     ids=ids
# )
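# Report how many documents the collection currently holds (collection.count() is
# part of the Chroma collection API); handy for confirming the add step ran.
print(f"The collection contains {collection.count()} documents.")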
# Query the collection for the single document most similar to the text "director".
results = collection.query(
    query_texts=["director"],
    n_results=1,
    include=['documents', 'distances', 'metadatas']
)
print(results['metadatas'])
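# The query result is a dict of parallel lists, with one inner list per query text.
# Inspect the best match and its distance (lower means more similar under the
# default metric); this mirrors the lookup used for the LLM context below.
print(results['documents'][0][0], results['distances'][0][0])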
# Load the LaMini-T5 tokenizer and model for answer generation.
tokenizer = AutoTokenizer.from_pretrained("MBZUAI/LaMini-T5-738M")
model = AutoModelForSeq2SeqLM.from_pretrained("MBZUAI/LaMini-T5-738M")
# Build a text2text-generation pipeline around the model.
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512
)
# Wrap the pipeline so it can be used as a LangChain LLM.
local_llm = HuggingFacePipeline(pipeline=pipe)
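# Optional smoke test (a sketch; uncomment to verify the model loads and generates
# text before wiring it to the retrieval step):
# print(local_llm("What is a vector database?"))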
# Use the best-matching document from the vector database as context for the LLM.
context = results['documents'][0][0]
question = "director job"
prompt = f"""
Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
{context}
Question: {question}
Helpful Answer:
"""
print(local_llm(prompt))
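# A small retrieval-augmented helper (a sketch, not part of the original script)
# that combines the query and generation steps above for an arbitrary question.
def answer(question: str) -> str:
    # Retrieve the single most similar document to use as context.
    hits = collection.query(query_texts=[question], n_results=1, include=['documents'])
    context = hits['documents'][0][0]
    prompt = f"""
Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
{context}
Question: {question}
Helpful Answer:
"""
    return local_llm(prompt)

# Example usage (uncomment to try):
# print(answer("What does a director do?"))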