# pdf_qa / app.py
# import csv
#
# # Load the sample data from data.csv. In Chroma, a "document" is simply a
# # string: a name, a sentence, a paragraph, etc.
# documents = []
# # Store the corresponding item IDs (used as metadata) in this list.
# metadatas = []
# # Each document needs a unique ID, like the primary key of a relational
# # database. Start at 1 and increment from there (row_id avoids shadowing
# # the builtin `id`).
# ids = []
# row_id = 1
# # Loop through each row and populate the three lists. The loop must stay
# # inside the `with` block, because csv.reader reads lazily from the open file.
# with open('./data.csv') as file:
#     lines = csv.reader(file)
#     for i, line in enumerate(lines):
#         if i == 0:
#             # Skip the header row.
#             continue
#         documents.append(line[0])
#         metadatas.append({"item_id": line[1]})
#         ids.append(str(row_id))
#         row_id += 1
import chromadb
from chromadb.utils import embedding_functions
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
import torch
# Instantiate a chromadb client. The in-memory client keeps data only for the
# lifetime of the process:
# chroma_client = chromadb.Client()
# The persistent client stores data on disk (a folder named 'vector_db' is
# created next to this file).
chroma_client = chromadb.PersistentClient(path="vector_db")
# Select the embedding model to use.
# List of model names can be found here https://www.sbert.net/docs/pretrained_models.html
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")
# Uncomment to delete the collection and start fresh:
# chroma_client.delete_collection(name="my_collection")
# Create the collection (i.e. the vector database), or reuse it if it already exists. Specify the model to use for the embeddings.
collection = chroma_client.get_or_create_collection(name="my_collection", embedding_function=sentence_transformer_ef)
# collection.add(
#     documents=documents,
#     metadatas=metadatas,
#     ids=ids
# )
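# A hedged alternative to the one-shot add above (a sketch, not part of the
# original flow): guard on collection.count() so re-running the script does
# not raise duplicate-ID errors. Assumes the documents/metadatas/ids lists
# built from data.csv in the commented block above.
# if collection.count() == 0:
#     collection.add(documents=documents, metadatas=metadatas, ids=ids)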
results = collection.query(
    query_texts=["director"],
    n_results=1,
    include=['documents', 'distances', 'metadatas']
)
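# Chroma returns one inner list per query text, so for this single query
# results['metadatas'] has the shape [[{'item_id': ...}]] and
# results['documents'] the shape [['...']].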
print(results['metadatas'])
tokenizer = AutoTokenizer.from_pretrained("MBZUAI/LaMini-T5-738M")
model = AutoModelForSeq2SeqLM.from_pretrained("MBZUAI/LaMini-T5-738M")
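# `torch` above is only needed for explicit device placement. A hedged sketch
# (not in the original flow): move the model to the GPU when one is available;
# otherwise it stays on the CPU.
# if torch.cuda.is_available():
#     model = model.to("cuda")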
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512
)
local_llm = HuggingFacePipeline(pipeline=pipe)
context = results['documents'][0][0]
question = "director job"
prompt = f"""
Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know; don't try to make up an answer.
{context}
Question: {question}
Helpful Answer:
"""
print(local_llm(prompt))
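# A possible refactor (a sketch, not part of the original script): bundle
# retrieval and generation into one helper so further questions reuse the same
# collection and pipeline. The prompt template mirrors the one above.
def answer(user_question: str, n_results: int = 1) -> str:
    hits = collection.query(query_texts=[user_question], n_results=n_results)
    retrieved = " ".join(hits['documents'][0])
    qa_prompt = f"""
Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know; don't try to make up an answer.
{retrieved}
Question: {user_question}
Helpful Answer:
"""
    return local_llm(qa_prompt)

# Example usage:
# print(answer("who is the director?"))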