jerpint committed on
Commit
e112463
·
unverified ·
1 Parent(s): bed2402

add chatbot (#2)

Browse files

* update requirements

* Add chatbot functionality

* isort

* put embedding caching in docparser, add error handling

* add typehints

* black

* package the project

* isort

buster/chatbot.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import pickle
3
+
4
+ import numpy as np
5
+ import openai
6
+ import pandas as pd
7
+ from docparser import EMBEDDING_MODEL
8
+ from openai.embeddings_utils import cosine_similarity, get_embedding
9
+
10
+ logger = logging.getLogger(__name__)
11
+ logging.basicConfig(level=logging.INFO)
12
+
13
def rank_documents(df: pd.DataFrame, query: str, top_k: int = 3) -> pd.DataFrame:
    """Rank the documents in ``df`` by cosine similarity to ``query``.

    The query is embedded with ``EMBEDDING_MODEL`` and compared against every
    row's ``embedding``. The scores are written to a ``similarity`` column on
    ``df`` itself, so the caller's frame is modified.

    Args:
        df: Frame with an ``embedding`` column holding one vector per row.
        query: Text the documents are ranked against.
        top_k: Number of best matches to return; ``-1`` returns every row.

    Returns:
        The ``top_k`` most similar rows, sorted by descending similarity.
    """
    query_embedding = get_embedding(
        query,
        engine=EMBEDDING_MODEL,
    )
    df["similarity"] = df.embedding.apply(lambda x: cosine_similarity(x, query_embedding))

    if top_k == -1:
        # -1 is the "return all results" sentinel. Passing it straight to
        # head() would instead drop the last (least similar) row.
        top_k = len(df)

    return df.sort_values("similarity", ascending=False).head(top_k)
27
+
28
+
29
def engineer_prompt(question: str, documents: list[str]) -> str:
    """Build the completion prompt: the documents first, then the question.

    The documents are joined with single spaces, followed by a fixed
    instruction line and the user's question.
    """
    context = " ".join(documents)
    instruction = "\nNow answer the following question:\n"
    return context + instruction + question
31
+
32
+
33
def get_gpt_response(question: str, df: pd.DataFrame) -> str:
    """Answer ``question`` with GPT, using the best-matching document as context.

    The single highest-ranked document from ``df`` is turned into a prompt and
    sent to the completion API.

    Args:
        question: The user's question.
        df: Frame of documents and embeddings, as accepted by ``rank_documents``.

    Returns:
        The completion text, or a fixed apology message if the completion
        request raises.
    """
    # rank the documents, get the highest scoring doc and generate the prompt
    candidates = rank_documents(df, query=question, top_k=1)
    documents = candidates.documents.to_list()
    prompt = engineer_prompt(question, documents)

    logger.info("querying GPT...")
    logger.info(f"User Question:\n{question}")
    # Call the API to generate a response
    try:
        response = openai.Completion.create(
            engine="text-davinci-003",
            prompt=prompt,
            max_tokens=200,
            # temperature=0,
            # top_p=0,
            frequency_penalty=1,
            presence_penalty=1,
        )
        # Get the response text
        response_text = response["choices"][0]["text"]
    except Exception:
        # Top-level boundary: log the traceback through the module logger
        # (not the root logger) and hand the user a friendly message.
        logger.exception("GPT query failed")
        return "Oops, something went wrong. Try again later!"

    logger.info(f"\nGPT Response:\n{response_text}\n")
    return response_text
66
+
67
+
68
def load_embeddings(path: str) -> pd.DataFrame:
    """Read the embeddings CSV at ``path`` and parse its ``embedding`` column.

    Args:
        path: Location of a CSV file that has an ``embedding`` column.

    Returns:
        The frame read from ``path`` with each ``embedding`` cell converted
        to a numpy array.

    Raises:
        ValueError: If an ``embedding`` cell is not a valid Python literal
            (raised by ``ast.literal_eval``; a ``SyntaxError`` is also possible).
    """
    # Function-scope import: ast is not among this module's top-level imports.
    from ast import literal_eval

    logger.info(f"loading embeddings from {path}...")
    df = pd.read_csv(path)
    # Cells are text read from disk (presumably lists of floats written by
    # docparser -- confirm). literal_eval parses plain literals only, whereas
    # eval would execute whatever expression the file contains.
    df["embedding"] = df.embedding.apply(literal_eval).apply(np.array)
    logger.info("embeddings loaded.")
    return df
74
+
75
+
76
if __name__ == "__main__":
    # The embeddings CSV is produced beforehand by docparser.py.
    df = load_embeddings(path="data/document_embeddings.csv")

    question = "Where should I put my datasets when I am running a job?"
    response = get_gpt_response(question=question, df=df)
buster/data/document_embeddings.csv ADDED
The diff for this file is too large to render. See raw diff
 
buster/data/sections.pkl ADDED
Binary file (276 kB). View file
 
docparser.py → buster/docparser.py RENAMED
@@ -2,7 +2,13 @@ import glob
2
  import os
3
  import pickle
4
 
 
 
5
  from bs4 import BeautifulSoup
 
 
 
 
6
 
7
 
8
  def get_all_sections(root_dir: str, max_section_length: int = 3000) -> list[str]:
@@ -53,6 +59,35 @@ def read_sections(filepath: str) -> list[str]:
53
  return sections
54
 
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  if __name__ == "__main__":
57
  root_dir = "/home/hadrien/perso/mila-docs/output/"
58
  save_filepath = os.path.join(root_dir, "sections.pkl")
@@ -63,3 +98,6 @@ if __name__ == "__main__":
63
 
64
  # How to load
65
  sections = read_sections(save_filepath)
 
 
 
 
2
  import os
3
  import pickle
4
 
5
+ import pandas as pd
6
+ import tiktoken
7
  from bs4 import BeautifulSoup
8
+ from openai.embeddings_utils import cosine_similarity, get_embedding
9
+
10
+ EMBEDDING_MODEL = "text-embedding-ada-002"
11
+ EMBEDDING_ENCODING = "cl100k_base" # this is the encoding for text-embedding-ada-002
12
 
13
 
14
  def get_all_sections(root_dir: str, max_section_length: int = 3000) -> list[str]:
 
59
  return sections
60
 
61
 
62
def load_documents(fname: str) -> pd.DataFrame:
    """Load pickled documents into a single-column DataFrame.

    Args:
        fname: Path to the pickle file holding the documents.

    Returns:
        A frame whose ``documents`` column holds the unpickled entries.
    """
    # NOTE(review): pickle.load can execute code embedded in the file;
    # only point this at files you produced yourself.
    with open(fname, "rb") as pickle_file:
        loaded = pickle.load(pickle_file)

    frame = pd.DataFrame()
    frame["documents"] = loaded
    return frame
69
+
70
+
71
def compute_n_tokens(df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``n_tokens`` column holding each document's token count.

    Tokens are counted with the tiktoken encoding named by
    ``EMBEDDING_ENCODING``. The column is written onto ``df`` itself and the
    same frame is returned.
    """
    tokenizer = tiktoken.get_encoding(EMBEDDING_ENCODING)

    def count_tokens(text):
        return len(tokenizer.encode(text))

    df["n_tokens"] = df["documents"].apply(count_tokens)
    return df
75
+
76
+
77
def precompute_embeddings(df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``embedding`` column computed from the ``documents`` column.

    ``get_embedding`` is called once per row with ``EMBEDDING_MODEL``. The
    column is written onto ``df`` itself and the same frame is returned.
    """

    def embed(text):
        return get_embedding(text, engine=EMBEDDING_MODEL)

    df["embedding"] = df["documents"].apply(embed)
    return df
80
+
81
+
82
def generate_embeddings(filepath: str, output_csv: str) -> pd.DataFrame:
    """Load documents, count their tokens, embed them, and save the result.

    Args:
        filepath: Pickle file of documents, as read by ``load_documents``.
        output_csv: Destination path for the resulting CSV.

    Returns:
        The frame that was written to ``output_csv``.
    """
    # Pipeline: load the documents -> count tokens -> compute embeddings.
    embedded = precompute_embeddings(compute_n_tokens(load_documents(filepath)))
    embedded.to_csv(output_csv)
    return embedded
89
+
90
+
91
  if __name__ == "__main__":
92
  root_dir = "/home/hadrien/perso/mila-docs/output/"
93
  save_filepath = os.path.join(root_dir, "sections.pkl")
 
98
 
99
  # How to load
100
  sections = read_sections(save_filepath)
101
+
102
+ # precompute the document embeddings
103
+ df = generate_embeddings(filepath=save_filepath, output_csv="data/document_embeddings.csv")
pyproject.toml CHANGED
@@ -3,11 +3,11 @@ requires = ["setuptools", "setuptools-scm"]
3
  build-backend = "setuptools.build_meta"
4
 
5
  [project]
6
- name = "mila-cluster-chatbot"
7
  version = "0.0.1"
8
- description = "Chatbot to answer Mila cluster questions"
9
  readme = "README.md"
10
- requires-python = ">=3.10"
11
  dynamic = ["dependencies"]
12
 
13
  [tool.setuptools.dynamic]
 
3
  build-backend = "setuptools.build_meta"
4
 
5
  [project]
6
+ name = "buster"
7
  version = "0.0.1"
8
+ description = "buster the bot for the mila cluster"
9
  readme = "README.md"
10
+ requires-python = ">=3.9"
11
  dynamic = ["dependencies"]
12
 
13
  [tool.setuptools.dynamic]
requirements.txt CHANGED
@@ -1,3 +1,4 @@
1
  pandas
2
  openai
3
  numpy
 
 
1
  pandas
2
  openai
3
  numpy
4
+ tiktoken