File size: 3,597 Bytes
e112463
 
 
 
 
 
 
eec81fa
05dabf4
e112463
 
 
05dabf4
e112463
f97aa81
e112463
 
 
 
 
 
f97aa81
 
 
e112463
 
 
 
 
 
 
 
 
f97aa81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e112463
f97aa81
 
 
 
 
 
 
 
e112463
f97aa81
 
e112463
f97aa81
 
 
 
 
 
 
05dabf4
f97aa81
e112463
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f97aa81
 
e112463
 
 
 
f97aa81
 
e112463
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71eff3d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import logging

import numpy as np
import openai
import pandas as pd
from openai.embeddings_utils import cosine_similarity, get_embedding

from buster.docparser import EMBEDDING_MODEL

# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)
# NOTE(review): basicConfig() configures the ROOT logger as a side effect of
# merely importing this module — consider moving it under `__main__`.
logging.basicConfig(level=logging.INFO)


def rank_documents(df: pd.DataFrame, query: str, top_k: int = 1, thresh: float = None) -> pd.DataFrame:
    """Rank the rows of `df` by embedding similarity to `query`.

    Args:
        df: DataFrame with an `embedding` column of vectors.
        query: free-text query; embedded with the same model as the documents.
        top_k: number of best matches to return; -1 returns all matches.
        thresh: if given, keep only rows with similarity strictly above it.

    Returns:
        Up to `top_k` rows sorted by descending similarity.
        Side effect: writes a `similarity` column into the caller's DataFrame.
    """
    product_embedding = get_embedding(
        query,
        engine=EMBEDDING_MODEL,
    )
    df["similarity"] = df.embedding.apply(lambda x: cosine_similarity(x, product_embedding))

    # `is not None` so an explicit threshold of 0.0 is honoured
    # (the previous truthiness check silently skipped it)
    if thresh is not None:
        df = df[df.similarity > thresh]

    if top_k == -1:
        # Return all results. Bug fix: the original assigned an unused `n`,
        # so head(-1) dropped the last row instead of keeping everything.
        top_k = len(df)

    results = df.sort_values("similarity", ascending=False).head(top_k)
    return results


def engineer_prompt(question: str, documents: list[str], max_chars: int = 3000) -> str:
    """Build a completion prompt: context documents followed by the question.

    Args:
        question: the user's question, appended after the context.
        documents: context snippets; joined with single spaces.
        max_chars: truncate the joined documents to this many characters so the
            prompt fits the model's context window. Default preserves the
            previously hard-coded 3000-character limit.

    Returns:
        The prompt string to send to the completion model.
    """
    documents_str = " ".join(documents)
    if len(documents_str) > max_chars:
        logger.info("truncating documents to fit...")
        documents_str = documents_str[:max_chars]
    return documents_str + "\nNow answer the following question:\n" + question


def format_response(response_text, sources_url=None):
    """Assemble the bot's reply: answer text, optional source links, footer.

    Args:
        response_text: the generated (or fallback) answer.
        sources_url: optional iterable of source URLs to cite.

    Returns:
        The formatted reply string (markdown with <br> line breaks).
    """
    parts = [f"{response_text}\n"]

    if sources_url:
        parts.append(f"<br><br>Here are the sources I used to answer your question:\n")
        parts.extend(f"<br>[{url}]({url})\n" for url in sources_url)

    parts.append("<br><br>")
    parts.append(
        """
    ```
    I'm a bot 🤖 and not always perfect.
    For more info, view the full documentation here (https://docs.mila.quebec/) or contact [email protected]
    ```
    """
    )
    return "".join(parts)


def answer_question(question: str, df, top_k: int = 1, thresh: float = None) -> str:
    """Answer `question` from the most relevant documents in `df`.

    Ranks documents by embedding similarity, builds a prompt from the top
    matches, and queries the completion API. Returns a formatted fallback
    message when no documents match or when the API call fails.

    Args:
        question: the user's question.
        df: DataFrame of documents with `embedding`, `text` and `url` columns.
        top_k: number of documents to use as context (-1 for all).
        thresh: optional similarity threshold passed to rank_documents.

    Returns:
        A formatted response string (see format_response).
    """
    # rank the documents, get the highest scoring doc and generate the prompt
    candidates = rank_documents(df, query=question, top_k=top_k, thresh=thresh)

    logger.info(f"candidate responses: {candidates}")

    if len(candidates) == 0:
        return format_response("I did not find any relevant documentation related to your question.")

    documents = candidates.text.to_list()
    sources_url = candidates.url.to_list()
    prompt = engineer_prompt(question, documents)

    logger.info("querying GPT...")
    logger.info(f"User Question:\n{question}")
    # Call the API to generate a response; the broad except is deliberate —
    # any failure (network, quota, malformed response) degrades to a
    # user-friendly message instead of crashing the bot.
    try:
        response = openai.Completion.create(
            engine="text-davinci-003",
            prompt=prompt,
            max_tokens=200,
            #  temperature=0,
            #  top_p=0,
            frequency_penalty=1,
            presence_penalty=1,
        )
        # Get the response text
        response_text = response["choices"][0]["text"]
    except Exception:
        # logger.exception records the traceback; use the module logger
        # (the original called logging.error on the root logger).
        logger.exception("GPT request failed")
        return format_response("Oops, something went wrong. Try again later!")

    logger.info(f"GPT Response:\n{response_text}")
    return format_response(response_text, sources_url)


def load_embeddings(path: str) -> pd.DataFrame:
    """Load the document-embedding CSV produced by docparser.py.

    The `embedding` column is stored as stringified Python lists; each cell is
    parsed back into a numpy array.

    Args:
        path: path to the embeddings CSV file.

    Returns:
        DataFrame whose `embedding` column holds numpy arrays.
    """
    logger.info(f"loading embeddings from {path}...")
    frame = pd.read_csv(path)
    # NOTE(review): eval() executes arbitrary code from the CSV — acceptable
    # only because we generated this file ourselves; unsafe on untrusted input.
    frame["embedding"] = frame["embedding"].map(lambda cell: np.array(eval(cell)))
    logger.info("embeddings loaded.")
    return frame


if __name__ == "__main__":
    # Smoke-test entry point: load precomputed embeddings and answer a single
    # hard-coded question. The reply is only logged, never printed.
    # we generate the embeddings using docparser.py
    df = load_embeddings("data/document_embeddings.csv")

    question = "Where should I put my datasets when I am running a job?"
    response = answer_question(question, df)