# import csv

# # Load the sample data (a restaurant menu of items).
# with open('./data.csv') as file:
#     lines = csv.reader(file)

#     # Store the names of the menu items in this list. In Chroma, a "document"
#     # is simply a string: a name, a sentence, a paragraph, etc.
#     documents = []

#     # Store the corresponding menu item IDs (as metadata) in this list.
#     metadatas = []

#     # Each document needs a unique ID, like the primary key of a relational
#     # database. IDs start at 1 and increment from there.
#     ids = []

#     # Loop through each row and populate the three lists.
#     for i, line in enumerate(lines):
#         if i == 0:
#             # Skip the first row (the column headers).
#             continue

#         documents.append(line[0])
#         metadatas.append({"item_id": line[1]})
#         ids.append(str(i))  # the first data row has i == 1, so IDs start at 1

import chromadb
from chromadb.utils import embedding_functions
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain.llms import HuggingFacePipeline
# Instantiate a chromadb instance. Data is stored in memory only.
# chroma_client = chromadb.Client()

# Instantiate a chromadb instance. Data is stored on disk (a folder named
# 'vector_db' will be created in the same folder as this file).
chroma_client = chromadb.PersistentClient(path="vector_db")

# Select the embedding model to use.
# List of model names can be found here https://www.sbert.net/docs/pretrained_models.html
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")
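
# Any other SentenceTransformer checkpoint could be swapped in here, e.g. the
# smaller and faster all-MiniLM-L6-v2 (a sketch; note that the collection must
# be rebuilt if the embedding model changes, since the stored vectors would no
# longer match new query embeddings):
# sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")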

# Use this to delete the collection.
# chroma_client.delete_collection(name="my_collection")

# Create the collection (the vector store), or load it if it already exists.
# Specify the embedding function the collection should use.
collection = chroma_client.get_or_create_collection(name="my_collection", embedding_function=sentence_transformer_ef)
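
# Optional sanity check: print how many documents the collection holds.
# print(collection.count())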


# Add the loaded documents to the collection (uncomment together with the CSV
# loader above on the first run; the data persists on disk after that).
# collection.add(
#     documents=documents,
#     metadatas=metadatas,
#     ids=ids
# )



# Query the collection for the single document closest to the query text.
results = collection.query(
    query_texts=["director"],
    n_results=1,
    include=['documents', 'distances', 'metadatas']
)
print(results['metadatas'])
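
# Chroma returns a dict of lists of lists: one inner list per query text. For
# this single query, results['documents'][0][0] is the best-matching document
# and results['distances'][0][0] is its distance (smaller means more similar).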



# Load a small, locally runnable instruction-tuned seq2seq model (LaMini-T5)
# to generate the final answer.
tokenizer = AutoTokenizer.from_pretrained("MBZUAI/LaMini-T5-738M")
model = AutoModelForSeq2SeqLM.from_pretrained("MBZUAI/LaMini-T5-738M")

# Wrap the model and tokenizer in a transformers text2text-generation pipeline.
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512
)

# Expose the pipeline as a LangChain LLM so it can be called with a prompt string.
local_llm = HuggingFacePipeline(pipeline=pipe)
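
# Optional smoke test of the local model on its own, without any retrieved
# context (commented out to avoid an extra generation on every run):
# print(local_llm("What is a vector database?"))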


# Use the best-matching document as context and ask the model about it.
context = results['documents'][0][0]
question = "director job"

# Build a simple RAG prompt: the retrieved context followed by the question.
prompt = f"""Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know; don't try to make up an answer.

{context}

Question: {question}

Helpful Answer:"""

print(local_llm(prompt))
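
# Note: calling the LLM object directly is the older LangChain API; newer
# releases deprecate it in favor of local_llm.invoke(prompt).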