Bohui Zhang committed on
Commit
9257999
·
1 Parent(s): 1a4d7a2

Update the fourth version

Browse files
Files changed (4) hide show
  1. app.py +22 -37
  2. data/music_meta_cqs.txt +28 -0
  3. ontochat/analysis.py +19 -84
  4. ontochat/functions.py +69 -26
app.py CHANGED
@@ -37,6 +37,18 @@ with gr.Blocks() as user_story_interface:
37
  label="Chatbot input",
38
  placeholder="Please type your message here and press Enter to interact with the chatbot :)"
39
  )
 
 
 
 
 
 
 
 
 
 
 
 
40
  user_story = gr.TextArea(
41
  label="User story",
42
  interactive=True
@@ -56,33 +68,9 @@ with gr.Blocks() as cq_interface:
56
  gr.Markdown(
57
  """
58
  # OntoChat
59
- This is the second step of OntoChat. Please copy the generated user story from the previous
60
- step and use it here. You can also modify the user story before using it for generating competency questions.
61
- **Recommended prompt workflow:**
62
- 1. Obtain competency questions from the user story.
63
- - Zero-shot learning:
64
- - Prompt template: Given the user story: {user story}, generate {number} competency questions base on it.
65
- - Few-shot learning (i.e., provide examples to give more instructions on how to generate competency questions):
66
- - Prompt template: Here are some good examples of competency questions generated from example data.
67
- Formatted in {"Example data": "Competency questions"}.
68
- {"Yesterday was performed by Armando Rocca.": "Who performs the song?"},
69
- {"The Church was built in 1619.": "When (what year) was the building built?"},
70
- {"The Church is located in a periurban context.": "In which context is the building located?"},
71
- {"The mounting system of the bells is the falling clapper.": "Which is the mounting system of the bell?"}
72
- 2. Clean and refine competency questions.
73
- - Obtain multiple competency questions.
74
- - Prompt template: Take the generated competency questions and check if any of them can be divided into
75
- multiple questions. If they do, split the competency question into multiple competency questions. If it
76
- does not, leave the competency question as it is. For example, the competency question "Who wrote The
77
- Hobbit and in what year was the book written?" must be split into two competency questions: "Who wrote
78
- the book?" and "In what year was the book written?". Another example is the competency question, "When
79
- was the person born?". This competency question cannot be divided into multiple questions.
80
- - Remove specific named entities.
81
- - Prompt template: Take the competency questions and check if they contain real-world entities, like
82
- "Freddy Mercury" or "1837". If they do, change those real-world entities from these competency questions
83
- to more general concepts. For example, the competency question "Which is the author of Harry Potter?"
84
- should be changed to "Which is the author of the book?". Similarly, the competency question "Who wrote
85
- the book in 2018?" should be changed to "Who wrote the book, and in what year was the book written?"
86
  """
87
  )
88
 
@@ -100,7 +88,8 @@ with gr.Blocks() as cq_interface:
100
  with gr.Column():
101
  cq_chatbot = gr.Chatbot([
102
  [None, "I am OntoChat, your conversational ontology engineering assistant. Here is the second step of "
103
- "the system. Please give me your user story and tell me how many competency questions you want."]
 
104
  ])
105
  cq_input = gr.Textbox(
106
  label="Chatbot input",
@@ -145,18 +134,14 @@ clustering_interface = gr.Interface(
145
  ),
146
  gr.Dropdown(
147
  value="LLM clustering",
148
- choices=["LLM clustering", "Agglomerative clustering", "HDBSCAN"],
149
  label="Clustering method",
150
  info="Please select the clustering method."
151
  ),
152
- gr.Slider(
153
- minimum=2,
154
- maximum=50,
155
- step=1,
156
- label="Number of clusters",
157
- info="Please select the number of clusters you want to generate. Please note that for HDBSCAN, this value "
158
- "is used as the minimum size of a cluster. And please do not input a number that exceeds the total "
159
- "number of competency questions."
160
  )
161
  ],
162
  outputs=[
 
37
  label="Chatbot input",
38
  placeholder="Please type your message here and press Enter to interact with the chatbot :)"
39
  )
40
+ # gr.Markdown(
41
+ # """
42
+ # ### User story generation prompt
43
+ # Click the button below to use a user story generation prompt that provides better instructions to the chatbot.
44
+ # """
45
+ # )
46
+ # prompt_btn = gr.Button(value="User story generation prompt")
47
+ # prompt_btn.click(
48
+ # fn=load_user_story_prompt,
49
+ # inputs=[],
50
+ # outputs=[user_story_input]
51
+ # )
52
  user_story = gr.TextArea(
53
  label="User story",
54
  interactive=True
 
68
  gr.Markdown(
69
  """
70
  # OntoChat
71
+ This is the second step of OntoChat. This functionality provides support for the extraction of competency
72
+ questions from a user story. Please, provide a user story to start extracting competency questions with the
73
+ chatbot, or simply load the example story below.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  """
75
  )
76
 
 
88
  with gr.Column():
89
  cq_chatbot = gr.Chatbot([
90
  [None, "I am OntoChat, your conversational ontology engineering assistant. Here is the second step of "
91
+ "the system. Please give me your user story and tell me how many competency questions you want "
92
+ "me to generate from the user story."]
93
  ])
94
  cq_input = gr.Textbox(
95
  label="Chatbot input",
 
134
  ),
135
  gr.Dropdown(
136
  value="LLM clustering",
137
+ choices=["LLM clustering", "Agglomerative clustering"],
138
  label="Clustering method",
139
  info="Please select the clustering method."
140
  ),
141
+ gr.Textbox(
142
+ label="Number of clusters (optional for LLM clustering)",
143
+ info="Please input the number of clusters you want to generate. And please do not input a number that "
144
+ "exceeds the total number of competency questions."
 
 
 
 
145
  )
146
  ],
147
  outputs=[
data/music_meta_cqs.txt ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Which is the composer of a musical piece?
2
+ Is the composer of a musical piece known?
3
+ Which are the members of a music ensemble?
4
+ Which role did a music artist play within a music ensemble?
5
+ In which time interval has a music artist been a member of a music ensemble?
6
+ Where was a music ensemble formed?
7
+ Which award was a music artist nominated for?
8
+ Which award was received by a music artist?
9
+ Which music artists has a music artist been influenced by?
10
+ Which music artist has a music artist collaborated with?
11
+ Which is the start date of the activity of a music artist?
12
+ Which is the end date of the activity of a music artist?
13
+ Which is the name of a music artist?
14
+ Which is the alias of a music artist?
15
+ Which is the language of the name/alias of a music artist?
16
+ Which music dataset has a music algorithm been trained on?
17
+ Which is the process that led to the creation of a musical piece?
18
+ In which time interval did the creation process take place?
19
+ Where did the creation process take place?
20
+ Which are the creative actions composing the creation process of a musical piece?
21
+ Which task was executed by a creative action?
22
+ Which are the parts of a musical piece?
23
+ Which collection is a musical piece member of?
24
+ Where was a musical piece performed?
25
+ When was a musical piece performed?
26
+ Which music artists took part in a musical performance?
27
+ Which is the recording process that recorded a musical performance?
28
+ Which is the recording produced by a recording process?
ontochat/analysis.py CHANGED
@@ -26,7 +26,7 @@ def preprocess_competency_questions(cqs):
26
  # # keep index
27
  # cqs = [re.split(r'\.\s', cq, 1) for cq in cqs]
28
  # cqs = [{cq[0]: cq[1]} for cq in cqs]
29
- cqs = [re.split(r'\.\s', cq, 1)[1] for cq in cqs]
30
 
31
  # clean
32
  cleaned_cqs = []
@@ -139,81 +139,6 @@ def plot_dendrogram(model, **kwargs):
139
  return Image.open(buf)
140
 
141
 
142
- def hdbscan_clustering(cqs, embeddings, min_cluster_size=2):
143
- """
144
-
145
- :param cqs:
146
- :param embeddings:
147
- :param min_cluster_size:
148
- :return:
149
- """
150
- clusterer = HDBSCAN(
151
- min_cluster_size=min_cluster_size
152
- )
153
- clusterer.fit(embeddings)
154
- cluster_assignment = clusterer.labels_
155
-
156
- clustered_cqs = defaultdict(list)
157
- for sentence_id, cluster_id in enumerate(cluster_assignment):
158
- clustered_cqs[str(cluster_id)].append(cqs[sentence_id])
159
-
160
- fig, axis = plt.subplots(1, 1)
161
- image = plot_hdbscan_scatter(embeddings, cluster_assignment, parameters={"scale": 3, "eps": 0.9}, ax=axis)
162
- return clustered_cqs, image
163
-
164
-
165
- def plot_hdbscan_scatter(data, labels, probabilities=None, parameters=None, ground_truth=False, ax=None):
166
- """
167
- source: https://scikit-learn.org/stable/auto_examples/cluster/plot_hdbscan.html
168
-
169
- :param data:
170
- :param labels:
171
- :param probabilities:
172
- :param parameters:
173
- :param ground_truth:
174
- :param ax:
175
- :return:
176
- """
177
- if ax is None:
178
- _, ax = plt.subplots(figsize=(10, 4))
179
- labels = labels if labels is not None else np.ones(data.shape[0])
180
- probabilities = probabilities if probabilities is not None else np.ones(data.shape[0])
181
- # Black removed and is used for noise instead.
182
- unique_labels = set(labels)
183
- colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
184
- # The probability of a point belonging to its labeled cluster determines
185
- # the size of its marker
186
- proba_map = {idx: probabilities[idx] for idx in range(len(labels))}
187
- for k, col in zip(unique_labels, colors):
188
- if k == -1:
189
- # Black used for noise.
190
- col = [0, 0, 0, 1]
191
-
192
- class_index = np.where(labels == k)[0]
193
- for ci in class_index:
194
- ax.plot(
195
- data[ci, 0],
196
- data[ci, 1],
197
- "x" if k == -1 else "o",
198
- markerfacecolor=tuple(col),
199
- markeredgecolor="k",
200
- markersize=4 if k == -1 else 1 + 5 * proba_map[ci],
201
- )
202
- n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
203
- preamble = "True" if ground_truth else "Estimated"
204
- title = f"{preamble} number of clusters: {n_clusters_}"
205
- if parameters is not None:
206
- parameters_str = ", ".join(f"{k}={v}" for k, v in parameters.items())
207
- title += f" | {parameters_str}"
208
- ax.set_title(title)
209
- plt.tight_layout()
210
- fig = plt.gcf()
211
- buf = io.BytesIO()
212
- fig.savefig(buf)
213
- buf.seek(0)
214
- return Image.open(buf)
215
-
216
-
217
  def response_parser(response):
218
  try:
219
  response = ast.literal_eval(response)
@@ -222,7 +147,7 @@ def response_parser(response):
222
  return response
223
 
224
 
225
- def llm_cq_clustering(cqs: str, n_clusters: int, api_key, paraphrase_detection=False):
226
  """
227
 
228
  :param cqs:
@@ -241,21 +166,31 @@ def llm_cq_clustering(cqs: str, n_clusters: int, api_key, paraphrase_detection=F
241
  "Return a Python list of duplicate competency questions.".format(cqs)
242
 
243
  conversation_history.append({"role": "user", "content": prompt_1})
244
- response = chat_completion(conversation_history)
245
  print("{} CQs remaining after paraphrase detection.".format(len(cqs) - len(response_parser(response))))
246
 
247
  # 2. clustering
248
- prompt_2 = f"Clustering the competency questions into {n_clusters} clusters based on their topics. " \
249
- "Keep the granularity of the topic in each cluster at a similar level. " \
250
- "Return in JSON format, such as: {'cluster 1 topic': " \
251
- "['competency question 1', 'competency question 2']}:"
 
 
 
 
 
 
252
  conversation_history.append({"role": "assistant", "content": response}) # previous response
253
  conversation_history.append({"role": "user", "content": prompt_2})
254
- response = chat_completion(conversation_history)
255
  # print("Output is: \"{}\"".format(response))
256
 
257
  else: # clustering only
258
- prompt_2 = f"Given the competency questions: {cqs}, clustering them into {n_clusters} clusters based on the topics."
 
 
 
 
259
  prompt_2 += "Keep the granularity of the topic in each cluster at a similar level. " \
260
  "Return in JSON format, such as: {'cluster 1 topic': " \
261
  "['competency question 1', 'competency question 2']}:"
 
26
  # # keep index
27
  # cqs = [re.split(r'\.\s', cq, 1) for cq in cqs]
28
  # cqs = [{cq[0]: cq[1]} for cq in cqs]
29
+ # cqs = [re.split(r'\.\s', cq, 1)[1] for cq in cqs]
30
 
31
  # clean
32
  cleaned_cqs = []
 
139
  return Image.open(buf)
140
 
141
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  def response_parser(response):
143
  try:
144
  response = ast.literal_eval(response)
 
147
  return response
148
 
149
 
150
+ def llm_cq_clustering(cqs, n_clusters, api_key, paraphrase_detection=False):
151
  """
152
 
153
  :param cqs:
 
166
  "Return a Python list of duplicate competency questions.".format(cqs)
167
 
168
  conversation_history.append({"role": "user", "content": prompt_1})
169
+ response = chat_completion(api_key, conversation_history)
170
  print("{} CQs remaining after paraphrase detection.".format(len(cqs) - len(response_parser(response))))
171
 
172
  # 2. clustering
173
+ if n_clusters:
174
+ prompt_2 = f"Clustering the competency questions into {n_clusters} clusters based on their topics. " \
175
+ "Keep the granularity of the topic in each cluster at a similar level. " \
176
+ "Return in JSON format, such as: {'cluster 1 topic': " \
177
+ "['competency question 1', 'competency question 2']}:"
178
+ else:
179
+ prompt_2 = f"Clustering the competency questions into clusters based on their topics. " \
180
+ "Keep the granularity of the topic in each cluster at a similar level. " \
181
+ "Return in JSON format, such as: {'cluster 1 topic': " \
182
+ "['competency question 1', 'competency question 2']}:"
183
  conversation_history.append({"role": "assistant", "content": response}) # previous response
184
  conversation_history.append({"role": "user", "content": prompt_2})
185
+ response = chat_completion(api_key, conversation_history)
186
  # print("Output is: \"{}\"".format(response))
187
 
188
  else: # clustering only
189
+ if n_clusters:
190
+ prompt_2 = f"Given the competency questions: {cqs}, clustering them into {n_clusters} clusters based on " \
191
+ f"the topics."
192
+ else:
193
+ prompt_2 = f"Given the competency questions: {cqs}, clustering them into clusters based on the topics."
194
  prompt_2 += "Keep the granularity of the topic in each cluster at a similar level. " \
195
  "Return in JSON format, such as: {'cluster 1 topic': " \
196
  "['competency question 1', 'competency question 2']}:"
ontochat/functions.py CHANGED
@@ -5,7 +5,7 @@ Interface functions
5
  import json
6
 
7
  from ontochat.chatbot import chat_completion, build_messages
8
- from ontochat.analysis import compute_embeddings, agglomerative_clustering, hdbscan_clustering, llm_cq_clustering
9
  from ontochat.verbaliser import verbalise_ontology
10
 
11
 
@@ -27,7 +27,9 @@ def user_story_generator(message, history):
27
  "Persona: What are the name, occupation, skills and interests of the user? 2. The Goal: What is "
28
  "the goal of the user? Are they facing specific issues? 3. Example Data: Do you have examples of "
29
  "the specific data available? Make sure you have answers to all three questions before providing "
30
- "a user story. Only ask the next question once I have responded. And you should also ask questions "
 
 
31
  "to elaborate on more information after the user provides the initial information, and ask for "
32
  "feedback and suggestions after the user story is generated."
33
  }]
@@ -37,12 +39,42 @@ def user_story_generator(message, history):
37
  "content": message
38
  })
39
  bot_message = chat_completion(openai_api_key, instructions + messages)
40
- # post-processing response
41
  history.append([message, bot_message])
42
- print(history)
43
  return bot_message, history, ""
44
 
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  def cq_generator(message, history):
47
  """
48
  generate competency questions based on the user story
@@ -51,25 +83,35 @@ def cq_generator(message, history):
51
  :param history:
52
  :return:
53
  """
54
- if (len(history)) == 1: # initial round
55
- messages = [
56
- {
57
- "role": "system",
58
- "content": "I am OntoChat, your conversational ontology engineering assistant. Here is the second step "
59
- "of the system. Please give me your user story and tell me how many competency questions "
60
- "you want."
61
- }, {
62
- "role": "user",
63
- "content": message
64
- }
65
- ]
66
- else:
67
- messages = build_messages(history)
68
- messages.append({
69
- "role": "user",
70
- "content": message
71
- })
72
- bot_message = chat_completion(openai_api_key, messages)
 
 
 
 
 
 
 
 
 
 
73
  history.append([message, bot_message])
74
  return bot_message, history, ""
75
 
@@ -89,15 +131,16 @@ def clustering_generator(cqs, cluster_method, n_clusters):
89
 
90
  :param cqs:
91
  :param cluster_method:
92
- :param n_clusters:
93
  :return:
94
  """
 
 
 
95
  cqs, cq_embeddings = compute_embeddings(cqs)
96
 
97
  if cluster_method == "Agglomerative clustering":
98
  cq_clusters, cluster_image = agglomerative_clustering(cqs, cq_embeddings, n_clusters)
99
- elif cluster_method == "HDBSCAN":
100
- cq_clusters, cluster_image = hdbscan_clustering(cqs, cq_embeddings, n_clusters)
101
  else: # cluster_method == "LLM clustering"
102
  cq_clusters, cluster_image = llm_cq_clustering(cqs, n_clusters, openai_api_key)
103
 
 
5
  import json
6
 
7
  from ontochat.chatbot import chat_completion, build_messages
8
+ from ontochat.analysis import compute_embeddings, agglomerative_clustering, llm_cq_clustering
9
  from ontochat.verbaliser import verbalise_ontology
10
 
11
 
 
27
  "Persona: What are the name, occupation, skills and interests of the user? 2. The Goal: What is "
28
  "the goal of the user? Are they facing specific issues? 3. Example Data: Do you have examples of "
29
  "the specific data available? Make sure you have answers to all three questions before providing "
30
+ "a user story. The user story should be written in the following structure: title, persona, goal, "
31
+ "scenario (where the user could use a structured knowledge base to help with their work), and "
32
+ "example data. Only ask the next question once I have responded. And you should also ask questions "
33
  "to elaborate on more information after the user provides the initial information, and ask for "
34
  "feedback and suggestions after the user story is generated."
35
  }]
 
39
  "content": message
40
  })
41
  bot_message = chat_completion(openai_api_key, instructions + messages)
 
42
  history.append([message, bot_message])
 
43
  return bot_message, history, ""
44
 
45
 
46
+ # def load_user_story_prompt():
47
+ # """
48
+ #
49
+ # :return:
50
+ # """
51
+ # prompt = """
52
+ # Now create the full user story. The user story should be written in the following structure:
53
+ #
54
+ # Title: Which topics are covered by the user story?
55
+ #
56
+ # Persona: What is the occupation of the user and what are their goals?
57
+ #
58
+ # Goal:
59
+ # Keywords: provide 5-10 keywords related to the user story
60
+ # Provide the issues a user is facing and how our application can help reach their goals.
61
+ #
62
+ # Scenario:
63
+ # Write out a scenario, where the user could use a structured knowledge base to help with their work.
64
+ #
65
+ # Example Data:
66
+ #
67
+ # Think of a list of requirements and provide example data for each requirement. Structure the example data by requirements
68
+ # Example data should be simple sentences.
69
+ # These are possible formats:
70
+ # One sonata is a “Salmo alla Romana”.
71
+ # A concert played in San Pietro di Sturla for exhibition was recorded by ethnomusicologist Mauro Balma in 1994.
72
+ # The Church of San Pietro di Sturla is located in Carasco, Genova Province.
73
+ # The Sistema Ligure is described in the text “Campanari, campane e campanili di Liguria” By Mauro Balma, 1996.
74
+ # """
75
+ # return prompt
76
+
77
+
78
  def cq_generator(message, history):
79
  """
80
  generate competency questions based on the user story
 
83
  :param history:
84
  :return:
85
  """
86
+ instructions = [{
87
+ "role": "system",
88
+ "content": "You are a conversational ontology engineering assistant."
89
+ }, {
90
+ "role": "user",
91
+ "content": "Here are instructions for you on how to generate high-quality competency questions. First, here "
92
+ "are some good examples of competency questions generated from example data. Who performs the song? "
93
+ "from the data Yesterday was performed by Armando Rocca, When (what year) was the building built? "
94
+ "from the data The Church was built in 1619, In which context is the building located? from the "
95
+ "data The Church is located in a periurban context. Second, how to make them less complex. Take the "
96
+ "generated competency questions and check if any of them can be divided into multiple questions. If "
97
+ "they do, split the competency question into multiple competency questions. If it does not, leave "
98
+ "the competency question as it is. For example, the competency question Who wrote The Hobbit and in "
99
+ "what year was the book written? must be split into two competency questions: Who wrote the book? "
100
+ "and In what year was the book written?. Another example is the competency question, When was the "
101
+ "person born?. This competency question cannot be divided into multiple questions. Third, how to "
102
+ "remove real entities to abstract them. Take the competency questions and check if they contain "
103
+ "real-world entities, like Freddy Mercury or 1837. If they do, change those real-world entities "
104
+ "from these competency questions to more general concepts. For example, the competency question "
105
+ "Which is the author of Harry Potter? should be changed to Which is the author of the book?. "
106
+ "Similarly, the competency question Who wrote the book in 2018? should be changed to Who wrote the "
107
+ "book, and in what year was the book written?"
108
+ }]
109
+ messages = build_messages(history)
110
+ messages.append({
111
+ "role": "user",
112
+ "content": message
113
+ })
114
+ bot_message = chat_completion(openai_api_key, instructions + messages)
115
  history.append([message, bot_message])
116
  return bot_message, history, ""
117
 
 
131
 
132
  :param cqs:
133
  :param cluster_method:
134
+ :param n_clusters: default ''
135
  :return:
136
  """
137
+ if n_clusters:
138
+ n_clusters = int(n_clusters)
139
+
140
  cqs, cq_embeddings = compute_embeddings(cqs)
141
 
142
  if cluster_method == "Agglomerative clustering":
143
  cq_clusters, cluster_image = agglomerative_clustering(cqs, cq_embeddings, n_clusters)
 
 
144
  else: # cluster_method == "LLM clustering"
145
  cq_clusters, cluster_image = llm_cq_clustering(cqs, n_clusters, openai_api_key)
146