jerpint committed
Commit fa9ac7e · unverified · 1 Parent(s): 8756061

Add slackbot support (#12)

* fix relative import

* add embeddings requirement

* update openai embeddings requirements...

* format responses appropriately

* add markdown response

* Fix newline formatting

* add threshold and top_k

* update response

* fix merge conflict

* Add slackbot

* refactor with a nice config interface

* add TODO

* isort

* add dataclass for chatbot config

* black

* Add support for orion bot

* format text

* Update docs

* use default_factory for dataclass

* Update app home tab

* update unk tokens

* move init to function

* Add logging

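For reference, the new config-driven interface introduced here is used roughly as follows. This is a minimal sketch based on app.py and the ChatbotConfig defaults further down; it assumes OPENAI_API_KEY is set and that the embeddings CSV exists, and the question string is only a placeholder.

    from buster.chatbot import Chatbot, ChatbotConfig

    # Any field left out falls back to the ChatbotConfig defaults.
    cfg = ChatbotConfig(
        documents_csv="buster/data/document_embeddings.csv",
        top_k=3,
        thresh=0.7,
        link_format="slack",
    )

    # Loads the document embeddings and the "unknown answer" embedding up front.
    chatbot = Chatbot(cfg)

    # Ranks documents, builds the prompt, calls the completion API, and formats the reply.
    answer = chatbot.process_input("Where should I put my datasets when I am running a job?")
    print(answer)
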
Files changed (2)
  1. app.py +144 -0
  2. buster/chatbot.py +189 -85
app.py ADDED
@@ -0,0 +1,144 @@
+import os
+
+from slack_bolt import App
+
+from buster.chatbot import Chatbot, ChatbotConfig
+
+MILA_CLUSTER_CHANNEL = "C04LR4H9KQA"
+ORION_CHANNEL = "C04LYHGUYB0"
+
+buster_cfg = ChatbotConfig(
+    documents_csv="buster/data/document_embeddings.csv",
+    unknown_prompt="This doesn't seem to be related to cluster usage. I am not sure how to answer.",
+    embedding_model="text-embedding-ada-002",
+    top_k=3,
+    thresh=0.7,
+    max_chars=3000,
+    completion_kwargs={
+        "engine": "text-davinci-003",
+        "max_tokens": 200,
+    },
+    separator="\n",
+    link_format="slack",
+    text_after_response="""I'm a bot 🤖 and not always perfect.
+    For more info, view the full documentation here (https://docs.mila.quebec/) or contact [email protected]
+    """,
+    text_before_prompt="""
+    You are a slack chatbot assistant answering technical questions about a cluster.
+    Make sure to format your answers in Markdown format, including code blocks and snippets.
+    Do not include any links to urls or hyperlinks in your answers.
+
+    If you do not know the answer to a question, or if it is completely irrelevant to cluster usage, simply reply with:
+
+    'This doesn't seem to be related to cluster usage.'
+
+    For example:
+
+    What is the meaning of life on the cluster?
+
+    This doesn't seem to be related to cluster usage.
+
+    Now answer the following question:
+    """,
+)
+buster_chatbot = Chatbot(buster_cfg)
+
+orion_cfg = ChatbotConfig(
+    documents_csv="buster/data/document_embeddings_orion.csv",
+    unknown_prompt="This doesn't seem to be related to the orion library. I am not sure how to answer.",
+    embedding_model="text-embedding-ada-002",
+    top_k=3,
+    thresh=0.7,
+    max_chars=3000,
+    completion_kwargs={
+        "engine": "text-davinci-003",
+        "max_tokens": 200,
+    },
+    separator="\n",
+    link_format="slack",
+    text_after_response="I'm a bot 🤖 and not always perfect.",
+    text_before_prompt="""You are a slack chatbot assistant answering technical questions about orion, a hyperparameter optimization library written in python.
+    Make sure to format your answers in Markdown format, including code blocks and snippets.
+    Do not include any links to urls or hyperlinks in your answers.
+
+    If you do not know the answer to a question, or if it is completely irrelevant to the library usage, simply reply with:
+
+    'This doesn't seem to be related to the orion library.'
+
+    For example:
+
+    What is the meaning of life for orion?
+
+    This doesn't seem to be related to the orion library.
+
+    Now answer the following question:
+    """,
+)
+orion_chatbot = Chatbot(orion_cfg)
+
+app = App(token=os.environ.get("SLACK_BOT_TOKEN"), signing_secret=os.environ.get("SLACK_SIGNING_SECRET"))
+
+
+@app.event("app_mention")
+def respond_to_question(event, say):
+    print(event)
+
+    # user's text
+    text = event["text"]
+    channel = event["channel"]
+
+    if channel == MILA_CLUSTER_CHANNEL:
+        print("*******using BUSTER********")
+        answer = buster_chatbot.process_input(text)
+    elif channel == ORION_CHANNEL:
+        print("*******using ORION********")
+        answer = orion_chatbot.process_input(text)
+
+    # responds to the message in the thread
+    thread_ts = event["event_ts"]
+
+    say(text=answer, thread_ts=thread_ts)
+
+
+@app.event("app_home_opened")
+def update_home_tab(client, event, logger):
+    try:
+        # views.publish is the method that your app uses to push a view to the Home tab
+        client.views_publish(
+            # the user that opened your app's app home
+            user_id=event["user"],
+            # the view object that appears in the app home
+            view={
+                "type": "home",
+                "callback_id": "home_view",
+                # body of the view
+                "blocks": [
+                    {"type": "section", "text": {"type": "mrkdwn", "text": "*Hello, I'm _BusterBot_* :tada:"}},
+                    {"type": "divider"},
+                    {
+                        "type": "section",
+                        "text": {
+                            "type": "mrkdwn",
+                            "text": (
+                                "I am a chatbot 🤖 designed to answer questions related to technical documentation.\n\n"
+                                "I use OpenAI's GPT models to find the most relevant sections of the documentation and respond based on them.\n"
+                                "I am open-source, and my code is available on GitHub: https://github.com/jerpint/buster\n\n"
+                                "For more information, contact either Jeremy or Hadrien from the AMLRT team.\n"
+                            ),
+                        },
+                    },
+                    # {
+                    #     "type": "actions",
+                    #     "elements": [{"type": "button", "text": {"type": "plain_text", "text": "Click me!"}}],
+                    # },
+                ],
+            },
+        )
+
+    except Exception as e:
+        logger.error(f"Error publishing home tab: {e}")
+
+
+# Start your app
+if __name__ == "__main__":
+    app.start(port=int(os.environ.get("PORT", 3000)))
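
Note that respond_to_question only assigns answer for the two hard-coded channel IDs; a mention from any other channel reaches the say() call with answer undefined and raises a NameError. A small guard along these lines (a sketch, not part of this commit) would make the handler ignore unconfigured channels:

    elif channel == ORION_CHANNEL:
        print("*******using ORION********")
        answer = orion_chatbot.process_input(text)
    else:
        # Channel not configured for any chatbot: ignore the mention.
        return
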
buster/chatbot.py CHANGED
@@ -1,8 +1,10 @@
 import logging
+from dataclasses import dataclass, field
 
 import numpy as np
 import openai
 import pandas as pd
+from omegaconf import OmegaConf
 from openai.embeddings_utils import cosine_similarity, get_embedding
 
 from buster.docparser import EMBEDDING_MODEL
@@ -11,107 +13,209 @@ logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
 
 
-# search through the reviews for a specific product
-def rank_documents(df: pd.DataFrame, query: str, top_k: int = 1, thresh: float = None) -> pd.DataFrame:
-    product_embedding = get_embedding(
-        query,
-        engine=EMBEDDING_MODEL,
-    )
-    df["similarity"] = df.embedding.apply(lambda x: cosine_similarity(x, product_embedding))
-
-    if thresh:
-        df = df[df.similarity > thresh]
-
-    if top_k == -1:
-        # return all results
-        n = len(df)
-
-    results = df.sort_values("similarity", ascending=False).head(top_k)
-    return results
-
-
-def engineer_prompt(question: str, documents: list[str]) -> str:
-    documents_str = " ".join(documents)
-    if len(documents_str) > 3000:
-        logger.info("truncating documents to fit...")
-        documents_str = documents_str[0:3000]
-    return documents_str + "\nNow answer the following question:\n" + question
-
-
-def format_response(response_text, sources_url=None):
-
-    response = f"{response_text}\n"
-
-    if sources_url:
-        response += f"<br><br>Here are the sources I used to answer your question:\n"
-        for url in sources_url:
-            response += f"<br>[{url}]({url})\n"
-
-    response += "<br><br>"
-    response += """
-    ```
-    I'm a bot 🤖 and not always perfect.
-    For more info, view the full documentation here (https://docs.mila.quebec/) or contact [email protected]
-    ```
-    """
-    return response
-
-
-def answer_question(question: str, df, top_k: int = 1, thresh: float = None) -> str:
-    # rank the documents, get the highest scoring doc and generate the prompt
-    candidates = rank_documents(df, query=question, top_k=top_k, thresh=thresh)
-
-    logger.info(f"candidate responses: {candidates}")
-
-    if len(candidates) == 0:
-        return format_response("I did not find any relevant documentation related to your question.")
-
-    documents = candidates.text.to_list()
-    sources_url = candidates.url.to_list()
-    prompt = engineer_prompt(question, documents)
-
-    logger.info(f"querying GPT...")
-    logger.info(f"User Question:\n{question}")
-    # Call the API to generate a response
-    try:
-        response = openai.Completion.create(
-            engine="text-davinci-003",
-            prompt=prompt,
-            max_tokens=200,
-            # temperature=0,
-            # top_p=0,
-            frequency_penalty=1,
-            presence_penalty=1,
-        )
-
-        # Get the response text
-        response_text = response["choices"][0]["text"]
-        logger.info(
-            f"""
-            GPT Response:\n{response_text}
-            """
-        )
-        return format_response(response_text, sources_url)
-
-    except Exception as e:
-        import traceback
-
-        logging.error(traceback.format_exc())
-        response = "Oops, something went wrong. Try again later!"
-        return format_response(response)
-
-
-def load_embeddings(path: str) -> pd.DataFrame:
-    logger.info(f"loading embeddings from {path}...")
-    df = pd.read_csv(path)
-    df["embedding"] = df.embedding.apply(eval).apply(np.array)
-    logger.info(f"embeddings loaded.")
-    return df
-
-
-if __name__ == "__main__":
-    # we generate the embeddings using docparser.py
-    df = load_embeddings("data/document_embeddings.csv")
-
-    question = "Where should I put my datasets when I am running a job?"
-    response = answer_question(question, df)
+def load_documents(path: str) -> pd.DataFrame:
+    logger.info(f"loading embeddings from {path}...")
+    df = pd.read_csv(path)
+    df["embedding"] = df.embedding.apply(eval).apply(np.array)
+    logger.info(f"embeddings loaded.")
+    return df
+
+
+class Chatbot:
+    def __init__(self, cfg: OmegaConf):
+        # TODO: right now, the cfg is being passed as an omegaconf, is this what we want?
+        self.cfg = cfg
+        self._init_documents()
+        self._init_unk_embedding()
+
+    def _init_documents(self):
+        self.documents = load_documents(self.cfg.documents_csv)
+
+    def _init_unk_embedding(self):
+        logger.info("Generating UNK token...")
+        unknown_prompt = self.cfg.unknown_prompt
+        engine = self.cfg.embedding_model
+        self.unk_embedding = get_embedding(
+            unknown_prompt,
+            engine=engine,
+        )
+
+    def rank_documents(
+        self,
+        documents: pd.DataFrame,
+        query: str,
+    ) -> pd.DataFrame:
+        """
+        Compare the question to the series of documents and return the best matching documents.
+        """
+        top_k = self.cfg.top_k
+        thresh = self.cfg.thresh
+        engine = self.cfg.embedding_model  # EMBEDDING_MODEL
+
+        query_embedding = get_embedding(
+            query,
+            engine=engine,
+        )
+        documents["similarity"] = documents.embedding.apply(lambda x: cosine_similarity(x, query_embedding))
+
+        # sort the matched_documents by score
+        matched_documents = documents.sort_values("similarity", ascending=False)
+
+        # limit search to top_k matched_documents.
+        top_k = len(matched_documents) if top_k == -1 else top_k
+        matched_documents = matched_documents.head(top_k)
+
+        # log matched_documents to the console
+        logger.info(f"matched documents before thresh: {matched_documents}")
+
+        # filter out matched_documents using a threshold
+        if thresh:
+            matched_documents = matched_documents[matched_documents.similarity > thresh]
+            logger.info(f"matched documents after thresh: {matched_documents}")
+
+        return matched_documents
+
+    def prepare_prompt(self, question: str, candidates: pd.DataFrame) -> str:
+        """
+        Prepare the prompt with prompt engineering.
+        """
+        max_chars = self.cfg.max_chars
+        text_before_prompt = self.cfg.text_before_prompt
+
+        documents_list = candidates.text.to_list()
+        documents_str = " ".join(documents_list)
+        if len(documents_str) > max_chars:
+            logger.info("truncating documents to fit...")
+            documents_str = documents_str[0:max_chars]
+
+        return documents_str + text_before_prompt + question
+
+    def generate_response(self, prompt: str, matched_documents: pd.DataFrame) -> str:
+        """
+        Generate a response based on the retrieved documents.
+        """
+        if len(matched_documents) == 0:
+            # No matching documents were retrieved, return
+            response_text = "I did not find any relevant documentation related to your question."
+            return response_text
+
+        logger.info(f"querying GPT...")
+        # Call the API to generate a response
+        try:
+            completion_kwargs = self.cfg.completion_kwargs
+            completion_kwargs["prompt"] = prompt
+            response = openai.Completion.create(**completion_kwargs)
+
+            # Get the response text
+            response_text = response["choices"][0]["text"]
+            logger.info(f"GPT Response:\n{response_text}")
+            return response_text
+
+        except Exception as e:
+            # log the error and return a generic response instead.
+            import traceback
+
+            logging.error(traceback.format_exc())
+            response_text = "Oops, something went wrong. Try again later!"
+            return response_text
+
+    def add_sources(self, response: str, matched_documents: pd.DataFrame):
+        """
+        Add sources from the matched documents to the response.
+        """
+        sep = self.cfg.separator  # \n
+        format = self.cfg.link_format
+
+        urls = matched_documents.url.to_list()
+        names = matched_documents.name.to_list()
+        similarities = matched_documents.similarity.to_list()
+
+        response += f"{sep}{sep}Here are the sources I used to answer your question:\n"
+        for url, name, similarity in zip(urls, names, similarities):
+            if format == "markdown":
+                response += f"{sep}[{name}]({url}){sep}"
+            elif format == "slack":
+                response += f"• <{url}|{name}>, score: {similarity:2.3f}{sep}"
+            else:
+                raise ValueError(f"{format} is not a valid URL format.")
+
+        return response
+
+    def format_response(self, response: str, matched_documents: pd.DataFrame) -> str:
+        """
+        Format the response by adding the sources if necessary, and a disclaimer prompt.
+        """
+        sep = self.cfg.separator
+        text_after_response = self.cfg.text_after_response
+
+        if len(matched_documents) > 0:
+            # we have matched documents, now we check to see if the answer is meaningful
+            response_embedding = get_embedding(
+                response,
+                engine=EMBEDDING_MODEL,
+            )
+            score = cosine_similarity(response_embedding, self.unk_embedding)
+            logger.info(f"UNK score: {score}")
+            if score < 0.9:
+                # Likely that the answer is meaningful, add the top sources
+                response = self.add_sources(response, matched_documents=matched_documents)
+
+        response += f"{sep}{sep}{sep}{text_after_response}{sep}"
+
+        return response
+
+    def process_input(self, question: str) -> str:
+        """
+        Main function to process the input question and generate a formatted output.
+        """
+        logger.info(f"User Question:\n{question}")
+
+        matched_documents = self.rank_documents(documents=self.documents, query=question)
+        prompt = self.prepare_prompt(question, matched_documents)
+        response = self.generate_response(prompt, matched_documents)
+        formatted_output = self.format_response(response, matched_documents)
+
+        return formatted_output
+
+
+@dataclass
+class ChatbotConfig:
+    """Configuration object for a chatbot.
+
+    documents_csv: Path to the csv file containing the documents and their embeddings.
+    embedding_model: OpenAI model to use to get embeddings.
+    top_k: Max number of documents to retrieve, ordered by cosine similarity.
+    thresh: Threshold for cosine similarity to be considered.
+    max_chars: Maximum number of characters the retrieved documents can be. Will truncate otherwise.
+    completion_kwargs: kwargs for the openai.Completion.create() method.
+    separator: The separator to use, can be either "\n" or <p> depending on rendering.
+    link_format: The type of format to render links with, e.g. slack or markdown.
+    unknown_prompt: Prompt to use to generate the "I don't know" embedding to compare to.
+    text_before_prompt: Text to prompt GPT with before the user prompt, but after the documentation.
+    text_after_response: Generic response to add to the chatbot's reply.
+    """
+
+    documents_csv: str = "buster/data/document_embeddings.csv"
+    embedding_model: str = "text-embedding-ada-002"
+    top_k: int = 3
+    thresh: float = 0.7
+    max_chars: int = 3000
+
+    completion_kwargs: dict = field(
+        default_factory=lambda: {
+            "engine": "text-davinci-003",
+            "max_tokens": 200,
+            "temperature": None,
+            "top_p": None,
+            "frequency_penalty": 1,
+            "presence_penalty": 1,
+        }
+    )
+    separator: str = "\n"
+    link_format: str = "slack"
+    unknown_prompt: str = "I Don't know how to answer your question."
+    text_before_prompt: str = "I'm a chatbot, bleep bloop."
+    text_after_response: str = "Answer the following question:\n"
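
Two details of the new formatting path are worth noting: format_response only appends sources when the answer's embedding is sufficiently different from the unknown_prompt embedding (UNK score below 0.9), and add_sources renders each source according to link_format. A rough illustration of the two link branches, with a made-up document name, URL, and score:

    # link_format="slack"    ->  "• <https://docs.mila.quebec/Userguide.html|Userguide>, score: 0.871\n"
    # link_format="markdown" ->  "\n[Userguide](https://docs.mila.quebec/Userguide.html)\n"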