File size: 2,477 Bytes
e112463
 
 
 
 
 
05dabf4
e112463
 
05dabf4
e112463
 
 
05dabf4
e112463
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
05dabf4
e112463
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import ast
import logging
import pickle

import numpy as np
import openai
import pandas as pd
from buster.docparser import EMBEDDING_MODEL
from openai.embeddings_utils import cosine_similarity, get_embedding


logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


# search through the reviews for a specific product
def rank_documents(df: pd.DataFrame, query: str, top_k: int = 3) -> pd.DataFrame:
    product_embedding = get_embedding(
        query,
        engine=EMBEDDING_MODEL,
    )
    df["similarity"] = df.embedding.apply(lambda x: cosine_similarity(x, product_embedding))

    if top_k == -1:
        # return all results
        n = len(df)

    results = df.sort_values("similarity", ascending=False).head(top_k)
    return results


def engineer_prompt(question: str, documents: list[str]) -> str:
    return " ".join(documents) + "\nNow answer the following question:\n" + question


def get_gpt_response(question: str, df) -> str:
    # rank the documents, get the highest scoring doc and generate the prompt
    candidates = rank_documents(df, query=question, top_k=1)
    documents = candidates.text.to_list()
    prompt = engineer_prompt(question, documents)

    logger.info(f"querying GPT...")
    logger.info(f"User Question:\n{question}")
    # Call the API to generate a response
    try:
        response = openai.Completion.create(
            engine="text-davinci-003",
            prompt=prompt,
            max_tokens=200,
            #  temperature=0,
            #  top_p=0,
            frequency_penalty=1,
            presence_penalty=1,
        )

        # Get the response text
        response_text = response["choices"][0]["text"]
        logger.info(
            f"""
        GPT Response:\n{response_text}
        """
        )
        return response_text
    except Exception as e:
        import traceback

        logging.error(traceback.format_exc())
        return "Oops, something went wrong. Try again later!"


def load_embeddings(path: str) -> pd.DataFrame:
    logger.info(f"loading embeddings from {path}...")
    df = pd.read_csv(path)
    df["embedding"] = df.embedding.apply(eval).apply(np.array)
    logger.info(f"embeddings loaded.")
    return df


if __name__ == "__main__":
    # we generate the embeddings using docparser.py
    df = load_embeddings("data/document_embeddings.csv")

    question = "Where should I put my datasets when I am running a job?"
    response = get_gpt_response(question, df)