import logging
import pickle
import traceback

import numpy as np
import openai
import pandas as pd

from buster.docparser import EMBEDDING_MODEL
from openai.embeddings_utils import cosine_similarity, get_embedding

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
# rank the documents by embedding similarity to the query
def rank_documents(df: pd.DataFrame, query: str, top_k: int = 3) -> pd.DataFrame:
    query_embedding = get_embedding(
        query,
        engine=EMBEDDING_MODEL,
    )
    df["similarity"] = df.embedding.apply(lambda x: cosine_similarity(x, query_embedding))

    if top_k == -1:
        # return all results
        top_k = len(df)

    results = df.sort_values("similarity", ascending=False).head(top_k)
    return results
def engineer_prompt(question: str, documents: list[str]) -> str:
    # concatenate the retrieved documents and append the user's question
    return " ".join(documents) + "\nNow answer the following question:\n" + question
def get_gpt_response(question: str, df: pd.DataFrame) -> str:
    # rank the documents, keep the highest-scoring doc and generate the prompt
    candidates = rank_documents(df, query=question, top_k=1)
    documents = candidates.text.to_list()
    prompt = engineer_prompt(question, documents)

    logger.info("querying GPT...")
    logger.info(f"User Question:\n{question}")

    # call the API to generate a response
    try:
        response = openai.Completion.create(
            engine="text-davinci-003",
            prompt=prompt,
            max_tokens=200,
            # temperature=0,
            # top_p=0,
            frequency_penalty=1,
            presence_penalty=1,
        )

        # get the response text
        response_text = response["choices"][0]["text"]
        logger.info(f"GPT Response:\n{response_text}")
        return response_text

    except Exception:
        logger.error(traceback.format_exc())
        return "Oops, something went wrong. Try again later!"
def load_embeddings(path: str) -> pd.DataFrame:
    logger.info(f"loading embeddings from {path}...")
    df = pd.read_csv(path)
    # embeddings are stored as stringified lists in the CSV; convert them back to arrays
    df["embedding"] = df.embedding.apply(eval).apply(np.array)
    logger.info("embeddings loaded.")
    return df
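# Illustrative only (not part of the original script): a minimal sketch of how a
# compatible embeddings CSV could be built. It assumes EMBEDDING_MODEL is an OpenAI
# embedding engine name and that each row needs a "text" column plus an "embedding"
# column stored as a stringified list, which is why load_embeddings() calls eval().
def build_embeddings_csv(texts: list[str], path: str) -> None:
    df = pd.DataFrame({"text": texts})
    df["embedding"] = df.text.apply(lambda t: get_embedding(t, engine=EMBEDDING_MODEL))
    df.to_csv(path, index=False)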
if __name__ == "__main__":
    # the embeddings are generated beforehand using docparser.py
    df = load_embeddings("data/document_embeddings.csv")

    question = "Where should I put my datasets when I am running a job?"
    response = get_gpt_response(question, df)