Bohui Zhang
committed on
Commit
·
9257999
1
Parent(s):
1a4d7a2
Update the fourth version
Browse files- app.py +22 -37
- data/music_meta_cqs.txt +28 -0
- ontochat/analysis.py +19 -84
- ontochat/functions.py +69 -26
app.py
CHANGED
@@ -37,6 +37,18 @@ with gr.Blocks() as user_story_interface:
|
|
37 |
label="Chatbot input",
|
38 |
placeholder="Please type your message here and press Enter to interact with the chatbot :)"
|
39 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
user_story = gr.TextArea(
|
41 |
label="User story",
|
42 |
interactive=True
|
@@ -56,33 +68,9 @@ with gr.Blocks() as cq_interface:
|
|
56 |
gr.Markdown(
|
57 |
"""
|
58 |
# OntoChat
|
59 |
-
This is the second step of OntoChat.
|
60 |
-
|
61 |
-
|
62 |
-
1. Obtain competency questions from the user story.
|
63 |
-
- Zero-shot learning:
|
64 |
-
- Prompt template: Given the user story: {user story}, generate {number} competency questions base on it.
|
65 |
-
- Few-shot learning (i.e., provide examples to give more instructions on how to generate competency questions):
|
66 |
-
- Prompt template: Here are some good examples of competency questions generated from example data.
|
67 |
-
Formatted in {"Example data": "Competency questions"}.
|
68 |
-
{"Yesterday was performed by Armando Rocca.": "Who performs the song?"},
|
69 |
-
{"The Church was built in 1619.": "When (what year) was the building built?"},
|
70 |
-
{"The Church is located in a periurban context.": "In which context is the building located?"},
|
71 |
-
{"The mounting system of the bells is the falling clapper.": "Which is the mounting system of the bell?"}
|
72 |
-
2. Clean and refine competency questions.
|
73 |
-
- Obtain multiple competency questions.
|
74 |
-
- Prompt template: Take the generated competency questions and check if any of them can be divided into
|
75 |
-
multiple questions. If they do, split the competency question into multiple competency questions. If it
|
76 |
-
does not, leave the competency question as it is. For example, the competency question "Who wrote The
|
77 |
-
Hobbit and in what year was the book written?" must be split into two competency questions: "Who wrote
|
78 |
-
the book?" and "In what year was the book written?". Another example is the competency question, "When
|
79 |
-
was the person born?". This competency question cannot be divided into multiple questions.
|
80 |
-
- Remove specific named entities.
|
81 |
-
- Prompt template: Take the competency questions and check if they contain real-world entities, like
|
82 |
-
"Freddy Mercury" or "1837". If they do, change those real-world entities from these competency questions
|
83 |
-
to more general concepts. For example, the competency question "Which is the author of Harry Potter?"
|
84 |
-
should be changed to "Which is the author of the book?". Similarly, the competency question "Who wrote
|
85 |
-
the book in 2018?" should be changed to "Who wrote the book, and in what year was the book written?"
|
86 |
"""
|
87 |
)
|
88 |
|
@@ -100,7 +88,8 @@ with gr.Blocks() as cq_interface:
|
|
100 |
with gr.Column():
|
101 |
cq_chatbot = gr.Chatbot([
|
102 |
[None, "I am OntoChat, your conversational ontology engineering assistant. Here is the second step of "
|
103 |
-
|
|
|
104 |
])
|
105 |
cq_input = gr.Textbox(
|
106 |
label="Chatbot input",
|
@@ -145,18 +134,14 @@ clustering_interface = gr.Interface(
|
|
145 |
),
|
146 |
gr.Dropdown(
|
147 |
value="LLM clustering",
|
148 |
-
choices=["LLM clustering", "Agglomerative clustering"
|
149 |
label="Clustering method",
|
150 |
info="Please select the clustering method."
|
151 |
),
|
152 |
-
gr.
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
label="Number of clusters",
|
157 |
-
info="Please select the number of clusters you want to generate. Please note that for HDBSCAN, this value "
|
158 |
-
"is used as the minimum size of a cluster. And please do not input a number that exceeds the total "
|
159 |
-
"number of competency questions."
|
160 |
)
|
161 |
],
|
162 |
outputs=[
|
|
|
37 |
label="Chatbot input",
|
38 |
placeholder="Please type your message here and press Enter to interact with the chatbot :)"
|
39 |
)
|
40 |
+
# gr.Markdown(
|
41 |
+
# """
|
42 |
+
# ### User story generation prompt
|
43 |
+
# Click the button below to use a user story generation prompt that provides better instructions to the chatbot.
|
44 |
+
# """
|
45 |
+
# )
|
46 |
+
# prompt_btn = gr.Button(value="User story generation prompt")
|
47 |
+
# prompt_btn.click(
|
48 |
+
# fn=load_user_story_prompt,
|
49 |
+
# inputs=[],
|
50 |
+
# outputs=[user_story_input]
|
51 |
+
# )
|
52 |
user_story = gr.TextArea(
|
53 |
label="User story",
|
54 |
interactive=True
|
|
|
68 |
gr.Markdown(
|
69 |
"""
|
70 |
# OntoChat
|
71 |
+
This is the second step of OntoChat. This functionality provides support for the extraction of competency
|
72 |
+
questions from a user story. Please, provide a user story to start extracting competency questions with the
|
73 |
+
chatbot, or simply load the example story below.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
"""
|
75 |
)
|
76 |
|
|
|
88 |
with gr.Column():
|
89 |
cq_chatbot = gr.Chatbot([
|
90 |
[None, "I am OntoChat, your conversational ontology engineering assistant. Here is the second step of "
|
91 |
+
"the system. Please give me your user story and tell me how many competency questions you want "
|
92 |
+
"me to generate from the user story."]
|
93 |
])
|
94 |
cq_input = gr.Textbox(
|
95 |
label="Chatbot input",
|
|
|
134 |
),
|
135 |
gr.Dropdown(
|
136 |
value="LLM clustering",
|
137 |
+
choices=["LLM clustering", "Agglomerative clustering"],
|
138 |
label="Clustering method",
|
139 |
info="Please select the clustering method."
|
140 |
),
|
141 |
+
gr.Textbox(
|
142 |
+
label="Number of clusters (optional for LLM clustering)",
|
143 |
+
info="Please input the number of clusters you want to generate. And please do not input a number that "
|
144 |
+
"exceeds the total number of competency questions."
|
|
|
|
|
|
|
|
|
145 |
)
|
146 |
],
|
147 |
outputs=[
|
data/music_meta_cqs.txt
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Which is the composer of a musical piece?
|
2 |
+
Is the composer of a musical piece known?
|
3 |
+
Which are the members of a music ensemble?
|
4 |
+
Which role did a music artist play within a music ensemble?
|
5 |
+
In which time interval has a music artist been a member of a music ensemble?
|
6 |
+
Where was a music ensemble formed?
|
7 |
+
Which award was a music artist nominated for?
|
8 |
+
Which award was received by a music artist?
|
9 |
+
Which music artists has a music artist been influenced by?
|
10 |
+
Which music artist has a music artist collaborated with?
|
11 |
+
Which is the start date of the activity of a music artist?
|
12 |
+
Which is the end date of the activity of a music artist?
|
13 |
+
Which is the name of a music artist?
|
14 |
+
Which is the alias of a music artist?
|
15 |
+
Which is the language of the name/alias of a music artist?
|
16 |
+
Which music dataset has a music algorithm been trained on?
|
17 |
+
Which is the process that led to the creation of a musical piece?
|
18 |
+
In which time interval did the creation process take place?
|
19 |
+
Where did the creation process take place?
|
20 |
+
Which are the creative actions composing the creation process of a musical piece?
|
21 |
+
Which task was executed by a creative action?
|
22 |
+
Which are the parts of a musical piece?
|
23 |
+
Which collection is a musical piece member of?
|
24 |
+
Where was a musical piece performed?
|
25 |
+
When was a musical piece performed?
|
26 |
+
Which music artists took part in a musical performance?
|
27 |
+
Which is the recording process that recorded a musical performance?
|
28 |
+
Which is the recording produced by a recording process?
|
ontochat/analysis.py
CHANGED
@@ -26,7 +26,7 @@ def preprocess_competency_questions(cqs):
|
|
26 |
# # keep index
|
27 |
# cqs = [re.split(r'\.\s', cq, 1) for cq in cqs]
|
28 |
# cqs = [{cq[0]: cq[1]} for cq in cqs]
|
29 |
-
cqs = [re.split(r'\.\s', cq, 1)[1] for cq in cqs]
|
30 |
|
31 |
# clean
|
32 |
cleaned_cqs = []
|
@@ -139,81 +139,6 @@ def plot_dendrogram(model, **kwargs):
|
|
139 |
return Image.open(buf)
|
140 |
|
141 |
|
142 |
-
def hdbscan_clustering(cqs, embeddings, min_cluster_size=2):
|
143 |
-
"""
|
144 |
-
|
145 |
-
:param cqs:
|
146 |
-
:param embeddings:
|
147 |
-
:param min_cluster_size:
|
148 |
-
:return:
|
149 |
-
"""
|
150 |
-
clusterer = HDBSCAN(
|
151 |
-
min_cluster_size=min_cluster_size
|
152 |
-
)
|
153 |
-
clusterer.fit(embeddings)
|
154 |
-
cluster_assignment = clusterer.labels_
|
155 |
-
|
156 |
-
clustered_cqs = defaultdict(list)
|
157 |
-
for sentence_id, cluster_id in enumerate(cluster_assignment):
|
158 |
-
clustered_cqs[str(cluster_id)].append(cqs[sentence_id])
|
159 |
-
|
160 |
-
fig, axis = plt.subplots(1, 1)
|
161 |
-
image = plot_hdbscan_scatter(embeddings, cluster_assignment, parameters={"scale": 3, "eps": 0.9}, ax=axis)
|
162 |
-
return clustered_cqs, image
|
163 |
-
|
164 |
-
|
165 |
-
def plot_hdbscan_scatter(data, labels, probabilities=None, parameters=None, ground_truth=False, ax=None):
|
166 |
-
"""
|
167 |
-
source: https://scikit-learn.org/stable/auto_examples/cluster/plot_hdbscan.html
|
168 |
-
|
169 |
-
:param data:
|
170 |
-
:param labels:
|
171 |
-
:param probabilities:
|
172 |
-
:param parameters:
|
173 |
-
:param ground_truth:
|
174 |
-
:param ax:
|
175 |
-
:return:
|
176 |
-
"""
|
177 |
-
if ax is None:
|
178 |
-
_, ax = plt.subplots(figsize=(10, 4))
|
179 |
-
labels = labels if labels is not None else np.ones(data.shape[0])
|
180 |
-
probabilities = probabilities if probabilities is not None else np.ones(data.shape[0])
|
181 |
-
# Black removed and is used for noise instead.
|
182 |
-
unique_labels = set(labels)
|
183 |
-
colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
|
184 |
-
# The probability of a point belonging to its labeled cluster determines
|
185 |
-
# the size of its marker
|
186 |
-
proba_map = {idx: probabilities[idx] for idx in range(len(labels))}
|
187 |
-
for k, col in zip(unique_labels, colors):
|
188 |
-
if k == -1:
|
189 |
-
# Black used for noise.
|
190 |
-
col = [0, 0, 0, 1]
|
191 |
-
|
192 |
-
class_index = np.where(labels == k)[0]
|
193 |
-
for ci in class_index:
|
194 |
-
ax.plot(
|
195 |
-
data[ci, 0],
|
196 |
-
data[ci, 1],
|
197 |
-
"x" if k == -1 else "o",
|
198 |
-
markerfacecolor=tuple(col),
|
199 |
-
markeredgecolor="k",
|
200 |
-
markersize=4 if k == -1 else 1 + 5 * proba_map[ci],
|
201 |
-
)
|
202 |
-
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
|
203 |
-
preamble = "True" if ground_truth else "Estimated"
|
204 |
-
title = f"{preamble} number of clusters: {n_clusters_}"
|
205 |
-
if parameters is not None:
|
206 |
-
parameters_str = ", ".join(f"{k}={v}" for k, v in parameters.items())
|
207 |
-
title += f" | {parameters_str}"
|
208 |
-
ax.set_title(title)
|
209 |
-
plt.tight_layout()
|
210 |
-
fig = plt.gcf()
|
211 |
-
buf = io.BytesIO()
|
212 |
-
fig.savefig(buf)
|
213 |
-
buf.seek(0)
|
214 |
-
return Image.open(buf)
|
215 |
-
|
216 |
-
|
217 |
def response_parser(response):
|
218 |
try:
|
219 |
response = ast.literal_eval(response)
|
@@ -222,7 +147,7 @@ def response_parser(response):
|
|
222 |
return response
|
223 |
|
224 |
|
225 |
-
def llm_cq_clustering(cqs
|
226 |
"""
|
227 |
|
228 |
:param cqs:
|
@@ -241,21 +166,31 @@ def llm_cq_clustering(cqs: str, n_clusters: int, api_key, paraphrase_detection=F
|
|
241 |
"Return a Python list of duplicate competency questions.".format(cqs)
|
242 |
|
243 |
conversation_history.append({"role": "user", "content": prompt_1})
|
244 |
-
response = chat_completion(conversation_history)
|
245 |
print("{} CQs remaining after paraphrase detection.".format(len(cqs) - len(response_parser(response))))
|
246 |
|
247 |
# 2. clustering
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
252 |
conversation_history.append({"role": "assistant", "content": response}) # previous response
|
253 |
conversation_history.append({"role": "user", "content": prompt_2})
|
254 |
-
response = chat_completion(conversation_history)
|
255 |
# print("Output is: \"{}\"".format(response))
|
256 |
|
257 |
else: # clustering only
|
258 |
-
|
|
|
|
|
|
|
|
|
259 |
prompt_2 += "Keep the granularity of the topic in each cluster at a similar level. " \
|
260 |
"Return in JSON format, such as: {'cluster 1 topic': " \
|
261 |
"['competency question 1', 'competency question 2']}:"
|
|
|
26 |
# # keep index
|
27 |
# cqs = [re.split(r'\.\s', cq, 1) for cq in cqs]
|
28 |
# cqs = [{cq[0]: cq[1]} for cq in cqs]
|
29 |
+
# cqs = [re.split(r'\.\s', cq, 1)[1] for cq in cqs]
|
30 |
|
31 |
# clean
|
32 |
cleaned_cqs = []
|
|
|
139 |
return Image.open(buf)
|
140 |
|
141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
def response_parser(response):
|
143 |
try:
|
144 |
response = ast.literal_eval(response)
|
|
|
147 |
return response
|
148 |
|
149 |
|
150 |
+
def llm_cq_clustering(cqs, n_clusters, api_key, paraphrase_detection=False):
|
151 |
"""
|
152 |
|
153 |
:param cqs:
|
|
|
166 |
"Return a Python list of duplicate competency questions.".format(cqs)
|
167 |
|
168 |
conversation_history.append({"role": "user", "content": prompt_1})
|
169 |
+
response = chat_completion(api_key, conversation_history)
|
170 |
print("{} CQs remaining after paraphrase detection.".format(len(cqs) - len(response_parser(response))))
|
171 |
|
172 |
# 2. clustering
|
173 |
+
if n_clusters:
|
174 |
+
prompt_2 = f"Clustering the competency questions into {n_clusters} clusters based on their topics. " \
|
175 |
+
"Keep the granularity of the topic in each cluster at a similar level. " \
|
176 |
+
"Return in JSON format, such as: {'cluster 1 topic': " \
|
177 |
+
"['competency question 1', 'competency question 2']}:"
|
178 |
+
else:
|
179 |
+
prompt_2 = f"Clustering the competency questions into clusters based on their topics. " \
|
180 |
+
"Keep the granularity of the topic in each cluster at a similar level. " \
|
181 |
+
"Return in JSON format, such as: {'cluster 1 topic': " \
|
182 |
+
"['competency question 1', 'competency question 2']}:"
|
183 |
conversation_history.append({"role": "assistant", "content": response}) # previous response
|
184 |
conversation_history.append({"role": "user", "content": prompt_2})
|
185 |
+
response = chat_completion(api_key, conversation_history)
|
186 |
# print("Output is: \"{}\"".format(response))
|
187 |
|
188 |
else: # clustering only
|
189 |
+
if n_clusters:
|
190 |
+
prompt_2 = f"Given the competency questions: {cqs}, clustering them into {n_clusters} clusters based on " \
|
191 |
+
f"the topics."
|
192 |
+
else:
|
193 |
+
prompt_2 = f"Given the competency questions: {cqs}, clustering them into clusters based on the topics."
|
194 |
prompt_2 += "Keep the granularity of the topic in each cluster at a similar level. " \
|
195 |
"Return in JSON format, such as: {'cluster 1 topic': " \
|
196 |
"['competency question 1', 'competency question 2']}:"
|
ontochat/functions.py
CHANGED
@@ -5,7 +5,7 @@ Interface functions
|
|
5 |
import json
|
6 |
|
7 |
from ontochat.chatbot import chat_completion, build_messages
|
8 |
-
from ontochat.analysis import compute_embeddings, agglomerative_clustering,
|
9 |
from ontochat.verbaliser import verbalise_ontology
|
10 |
|
11 |
|
@@ -27,7 +27,9 @@ def user_story_generator(message, history):
|
|
27 |
"Persona: What are the name, occupation, skills and interests of the user? 2. The Goal: What is "
|
28 |
"the goal of the user? Are they facing specific issues? 3. Example Data: Do you have examples of "
|
29 |
"the specific data available? Make sure you have answers to all three questions before providing "
|
30 |
-
"a user story.
|
|
|
|
|
31 |
"to elaborate on more information after the user provides the initial information, and ask for "
|
32 |
"feedback and suggestions after the user story is generated."
|
33 |
}]
|
@@ -37,12 +39,42 @@ def user_story_generator(message, history):
|
|
37 |
"content": message
|
38 |
})
|
39 |
bot_message = chat_completion(openai_api_key, instructions + messages)
|
40 |
-
# post-processing response
|
41 |
history.append([message, bot_message])
|
42 |
-
print(history)
|
43 |
return bot_message, history, ""
|
44 |
|
45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
def cq_generator(message, history):
|
47 |
"""
|
48 |
generate competency questions based on the user story
|
@@ -51,25 +83,35 @@ def cq_generator(message, history):
|
|
51 |
:param history:
|
52 |
:return:
|
53 |
"""
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
history.append([message, bot_message])
|
74 |
return bot_message, history, ""
|
75 |
|
@@ -89,15 +131,16 @@ def clustering_generator(cqs, cluster_method, n_clusters):
|
|
89 |
|
90 |
:param cqs:
|
91 |
:param cluster_method:
|
92 |
-
:param n_clusters:
|
93 |
:return:
|
94 |
"""
|
|
|
|
|
|
|
95 |
cqs, cq_embeddings = compute_embeddings(cqs)
|
96 |
|
97 |
if cluster_method == "Agglomerative clustering":
|
98 |
cq_clusters, cluster_image = agglomerative_clustering(cqs, cq_embeddings, n_clusters)
|
99 |
-
elif cluster_method == "HDBSCAN":
|
100 |
-
cq_clusters, cluster_image = hdbscan_clustering(cqs, cq_embeddings, n_clusters)
|
101 |
else: # cluster_method == "LLM clustering"
|
102 |
cq_clusters, cluster_image = llm_cq_clustering(cqs, n_clusters, openai_api_key)
|
103 |
|
|
|
5 |
import json
|
6 |
|
7 |
from ontochat.chatbot import chat_completion, build_messages
|
8 |
+
from ontochat.analysis import compute_embeddings, agglomerative_clustering, llm_cq_clustering
|
9 |
from ontochat.verbaliser import verbalise_ontology
|
10 |
|
11 |
|
|
|
27 |
"Persona: What are the name, occupation, skills and interests of the user? 2. The Goal: What is "
|
28 |
"the goal of the user? Are they facing specific issues? 3. Example Data: Do you have examples of "
|
29 |
"the specific data available? Make sure you have answers to all three questions before providing "
|
30 |
+
"a user story. The user story should be written in the following structure: title, persona, goal, "
|
31 |
+
"scenario (where the user could use a structured knowledge base to help with their work), and "
|
32 |
+
"example data. Only ask the next question once I have responded. And you should also ask questions "
|
33 |
"to elaborate on more information after the user provides the initial information, and ask for "
|
34 |
"feedback and suggestions after the user story is generated."
|
35 |
}]
|
|
|
39 |
"content": message
|
40 |
})
|
41 |
bot_message = chat_completion(openai_api_key, instructions + messages)
|
|
|
42 |
history.append([message, bot_message])
|
|
|
43 |
return bot_message, history, ""
|
44 |
|
45 |
|
46 |
+
# def load_user_story_prompt():
|
47 |
+
# """
|
48 |
+
#
|
49 |
+
# :return:
|
50 |
+
# """
|
51 |
+
# prompt = """
|
52 |
+
# Now create the full user story. The user story should be written in the following structure:
|
53 |
+
#
|
54 |
+
# Title: Which topics are covered by the user story?
|
55 |
+
#
|
56 |
+
# Persona: What is the occupation of the user and what are their goals?
|
57 |
+
#
|
58 |
+
# Goal:
|
59 |
+
# Keywords: provide 5-10 keywords related to the user story
|
60 |
+
# Provide the issues a user is facing and how our application can help reach their goals.
|
61 |
+
#
|
62 |
+
# Scenario:
|
63 |
+
# Write out a scenario, where the user could use a structured knowledge base to help with their work.
|
64 |
+
#
|
65 |
+
# Example Data:
|
66 |
+
#
|
67 |
+
# Think of a list of requirements and provide example data for each requirement. Structure the example data by requirements
|
68 |
+
# Example data should be simple sentences.
|
69 |
+
# These are possible formats:
|
70 |
+
# One sonata is a “Salmo alla Romana”.
|
71 |
+
# A concert played in San Pietro di Sturla for exhibition was recorded by ethnomusicologist Mauro Balma in 1994.
|
72 |
+
# The Church of San Pietro di Sturla is located in Carasco, Genova Province.
|
73 |
+
# The Sistema Ligure is described in the text “Campanari, campane e campanili di Liguria” By Mauro Balma, 1996.
|
74 |
+
# """
|
75 |
+
# return prompt
|
76 |
+
|
77 |
+
|
78 |
def cq_generator(message, history):
|
79 |
"""
|
80 |
generate competency questions based on the user story
|
|
|
83 |
:param history:
|
84 |
:return:
|
85 |
"""
|
86 |
+
instructions = [{
|
87 |
+
"role": "system",
|
88 |
+
"content": "You are a conversational ontology engineering assistant."
|
89 |
+
}, {
|
90 |
+
"role": "user",
|
91 |
+
"content": "Here are instructions for you on how to generate high-quality competency questions. First, here "
|
92 |
+
"are some good examples of competency questions generated from example data. Who performs the song? "
|
93 |
+
"from the data Yesterday was performed by Armando Rocca, When (what year) was the building built? "
|
94 |
+
"from the data The Church was built in 1619, In which context is the building located? from the "
|
95 |
+
"data The Church is located in a periurban context. Second, how to make them less complex. Take the "
|
96 |
+
"generated competency questions and check if any of them can be divided into multiple questions. If "
|
97 |
+
"they do, split the competency question into multiple competency questions. If it does not, leave "
|
98 |
+
"the competency question as it is. For example, the competency question Who wrote The Hobbit and in "
|
99 |
+
"what year was the book written? must be split into two competency questions: Who wrote the book? "
|
100 |
+
"and In what year was the book written?. Another example is the competency question, When was the "
|
101 |
+
"person born?. This competency question cannot be divided into multiple questions. Third, how to "
|
102 |
+
"remove real entities to abstract them. Take the competency questions and check if they contain "
|
103 |
+
"real-world entities, like Freddy Mercury or 1837. If they do, change those real-world entities "
|
104 |
+
"from these competency questions to more general concepts. For example, the competency question "
|
105 |
+
"Which is the author of Harry Potter? should be changed to Which is the author of the book?. "
|
106 |
+
"Similarly, the competency question Who wrote the book in 2018? should be changed to Who wrote the "
|
107 |
+
"book, and in what year was the book written?"
|
108 |
+
}]
|
109 |
+
messages = build_messages(history)
|
110 |
+
messages.append({
|
111 |
+
"role": "user",
|
112 |
+
"content": message
|
113 |
+
})
|
114 |
+
bot_message = chat_completion(openai_api_key, instructions + messages)
|
115 |
history.append([message, bot_message])
|
116 |
return bot_message, history, ""
|
117 |
|
|
|
131 |
|
132 |
:param cqs:
|
133 |
:param cluster_method:
|
134 |
+
:param n_clusters: default ''
|
135 |
:return:
|
136 |
"""
|
137 |
+
if n_clusters:
|
138 |
+
n_clusters = int(n_clusters)
|
139 |
+
|
140 |
cqs, cq_embeddings = compute_embeddings(cqs)
|
141 |
|
142 |
if cluster_method == "Agglomerative clustering":
|
143 |
cq_clusters, cluster_image = agglomerative_clustering(cqs, cq_embeddings, n_clusters)
|
|
|
|
|
144 |
else: # cluster_method == "LLM clustering"
|
145 |
cq_clusters, cluster_image = llm_cq_clustering(cqs, n_clusters, openai_api_key)
|
146 |
|