TGRL-bad-smells / app.py
nouf-sst's picture
Update app.py
06e1dab
raw
history blame
18.2 kB
import json
import os
import re

import gradio as gr
import nltk
import numpy as np
import stanza
import torch
from autocorrect import Speller
from sentence_transformers import CrossEncoder
from stanza.models.constituency.parse_tree import Tree
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline
from transformers import BertTokenizer, BertForSequenceClassification
# ***************************** Load needed models *****************************
# Constituency parser used by the complex-sentence check.
nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency')
# POS tagger used by the element-syntax checks.
pos_tokenizer = AutoTokenizer.from_pretrained("QCRI/bert-base-multilingual-cased-pos-english")
pos_model = AutoModelForTokenClassification.from_pretrained("QCRI/bert-base-multilingual-cased-pos-english")
#sentences_similarity_model = CrossEncoder('cross-encoder/stsb-roberta-base')
sentences_similarity_model = CrossEncoder('WillHeld/roberta-base-stsb')
# SECURITY: the Hugging Face access token must never be committed to source.
# It is read from the HF_TOKEN environment variable (e.g. a Space secret);
# when the variable is unset, None is passed and the download is anonymous.
# NOTE(review): the token that used to be hard-coded here should be revoked.
HF_AUTH_TOKEN = os.environ.get("HF_TOKEN")
nli_model = BertForSequenceClassification.from_pretrained("nouf-sst/bert-base-MultiNLI", use_auth_token=HF_AUTH_TOKEN)
nli_tokenizer = BertTokenizer.from_pretrained("nouf-sst/bert-base-MultiNLI", use_auth_token=HF_AUTH_TOKEN, do_lower_case=True)
# ***************************** TGRL Parsing *****************************
def parse_tgrl(file_obj):
    """Read a TGRL file and return its content as a single line.

    ``file_obj`` only needs a ``name`` attribute holding the path to read.
    Tab and newline characters are deleted (not replaced by spaces).
    """
    with open(file_obj.name, 'r') as handle:
        raw_text = handle.read()
    # Remove every tab and newline in one pass.
    return raw_text.translate({ord('\t'): None, ord('\n'): None})
def _find_declarations(keyword, tgrl_text):
    """Return ``(offset, name)`` for each ``<keyword> <id> {name = "<name>"`` declaration."""
    pattern = keyword + r'\s\S+\s?\{\s?name\s?=\s?"([A-Za-z\s;.,!?:-]*)"'
    return [(match.start(), match.group(1)) for match in re.finditer(pattern, tgrl_text)]


def _owning_actor(offset, actor_declarations):
    """Return the ID of the last actor declared before ``offset`` (None if there is none)."""
    owner = None
    for start, actor_id in actor_declarations:
        if start >= offset:
            break
        owner = actor_id
    return owner


def _parse_decompositions(tgrl_text):
    """Map each decomposed element ID to the IDs listed in its ``decomposedBy`` statements."""
    decompositions = {}
    for match in re.finditer(r'(\w+)\s+decomposedBy\s+([^;{}]*);', tgrl_text):
        children = [child.strip() for child in match.group(2).split(',')]
        decompositions.setdefault(match.group(1), []).extend(
            child for child in children if child
        )
    return decompositions


def _resolve_name(identifier, tgrl_text):
    """Return the declared name of ``identifier``, or the identifier itself if none is found."""
    match = re.search(
        r'\b' + re.escape(identifier) + r'\s*\{\s*name\s*=\s*"([A-Za-z\s;.,!?:-]*)',
        tgrl_text,
    )
    return match.group(1) if match else identifier


def extract_elements(tgrl_text):
    """Extract element names, per-actor groupings and decompositions from TGRL text.

    Args:
        tgrl_text: TGRL source as produced by ``parse_tgrl`` (tabs and
            newlines removed).

    Returns:
        A 3-tuple ``(elements, elements_per_actor, decomposed_elements)``:

        * ``elements`` maps ``"actors"``, ``"goals"``, ``"softGoals"``,
          ``"tasks"`` and ``"resources"`` to the list of declared names.
        * ``elements_per_actor`` maps an actor ID to the names of its goals,
          then softgoals, then tasks (resources are not included).
        * ``decomposed_elements`` maps the name of each decomposed element to
          the names of its sub-elements.
    """
    declarations = {
        "actors": _find_declarations("actor", tgrl_text),
        "goals": _find_declarations("goal", tgrl_text),
        "softGoals": _find_declarations("softGoal", tgrl_text),
        "tasks": _find_declarations("task", tgrl_text),
        "resources": _find_declarations("resource", tgrl_text),
    }
    elements = {
        kind: [name for _, name in found] for kind, found in declarations.items()
    }

    # Attribute each element to the closest preceding actor declaration.
    # Offsets of the declarations themselves are used (instead of searching
    # for the name text) so duplicate names, or names that also occur earlier
    # in the file, are still attributed to the right actor. Block nesting is
    # not tracked: an element after an actor's closing brace still belongs to
    # that actor.
    actor_declarations = [
        (match.start(), match.group(1))
        for match in re.finditer(r'actor\s+([^\s{]+)\s*\{', tgrl_text)
    ]
    elements_per_actor = {}
    for kind in ("goals", "softGoals", "tasks"):
        for offset, name in declarations[kind]:
            owner = _owning_actor(offset, actor_declarations)
            if owner is not None:
                elements_per_actor.setdefault(owner, []).append(name)

    # Parse every "X decomposedBy a, b, c;" statement directly, then replace
    # element IDs with their declared names.
    decomposed_elements = {}
    for parent_id, child_ids in _parse_decompositions(tgrl_text).items():
        parent_name = _resolve_name(parent_id, tgrl_text)
        decomposed_elements.setdefault(parent_name, []).extend(
            _resolve_name(child_id, tgrl_text) for child_id in child_ids
        )

    return elements, elements_per_actor, decomposed_elements
# ************************************************************************
# ************************* Bad Smells Detection *************************
# ########### Long Elements ###########
def get_long_elements(elements, size_threshold): # Using RegEx
    """Report the element names that contain more than ``size_threshold`` words.

    ``elements`` maps a category to a list of names. A word is a run of
    ``\\w`` characters. The report lists the offending names one per line
    under a "Long elements:" header, or "None." when nothing is too long.
    """
    too_long = [
        text
        for texts in elements.values()
        for text in texts
        if len(re.findall(r'\w+', text)) > size_threshold
    ]
    if not too_long:
        return "Long elements:\nNone."
    return "Long elements:\n" + "\n".join(too_long)
# #####################################
# ######### Complex Sentences #########
def is_complex_sentence(sentence):
    """Return True when the constituency parse of ``sentence`` contains an ``SBAR`` label.

    The text is parsed with the module-level Stanza pipeline ``nlp``; every
    sentence Stanza splits the text into is inspected. Returns False when no
    parsed sentence carries the label, including when nothing is parsed.
    """
    # Reuse the shared pipeline: constructing a new stanza.Pipeline on every
    # call reloads the models for each element that is checked.
    doc = nlp(sentence)
    return any(
        'SBAR' in Tree.get_unique_constituent_labels(parsed.constituency)
        for parsed in doc.sentences
    )
def get_complex_sentences(elements):
    """Report the element names that ``is_complex_sentence`` flags.

    ``elements`` maps a category to a list of names. The report lists the
    flagged names one per line under a "Complex sentences:" header, or
    "None." when no name is flagged.
    """
    flagged = []
    for texts in elements.values():
        flagged.extend(text for text in texts if is_complex_sentence(text))
    if not flagged:
        return "Complex sentences:\nNone."
    return "Complex sentences:\n" + "\n".join(flagged)
# #####################################
# ########## Punctuations #########
def get_punctuations(elements):
    """Report the element names that contain punctuation.

    A name is reported when it contains any character that is not
    whitespace, a word character, a digit or a hyphen. ``elements`` maps a
    category to a list of names. The report lists the offending names one
    per line under a "Punctuations:" header, or "None." when there are none.
    """
    # Raw string: "\s", "\w" and "\d" are invalid escape sequences in an
    # ordinary string literal and trigger a SyntaxWarning on recent Pythons.
    punctuation = re.compile(r"[^\s\w\d-]")
    flagged = [
        text
        for texts in elements.values()
        for text in texts
        if punctuation.search(text)
    ]
    if not flagged:
        return "Punctuations:\nNone."
    return "Punctuations:\n" + "\n".join(flagged)
# #################################
# ########## Incorrect Actor Syntax ##########
def find_non_NPs(sentences):
    """Return the sentences whose first token is tagged with a label starting with 'V'.

    The sentences are tagged with a ``TokenClassificationPipeline`` built
    from ``pos_model`` and ``pos_tokenizer``. An empty input list yields an
    empty result without invoking the model, and a sentence for which the
    tagger returns no tokens (e.g. an empty name) is not reported.
    """
    if not sentences:
        return []
    tagger = TokenClassificationPipeline(model=pos_model, tokenizer=pos_tokenizer)
    outputs = tagger(sentences)
    return [
        sentence
        for sentence, tokens in zip(sentences, outputs)
        # Guard against an empty token list before looking at the first tag.
        if tokens and tokens[0]['entity'].startswith('V')
    ]
def check_actor_syntax(actors):
    """Report the actor names flagged by ``find_non_NPs``.

    Returns the flagged names one per line under an "Incorrect Actors
    Syntax:" header, or a message saying all actors are correct.
    """
    offenders = find_non_NPs(actors)
    if not offenders:
        return "All actors are syntactically correct."
    return "Incorrect Actors Syntax:\n" + "\n".join(offenders)
# ############################################
# ########## Incorrect Goal Syntax ###########
def check_goal_syntax(goals):
    """Report the goal names flagged by ``find_non_NPs``.

    Returns the flagged names one per line under an "Incorrect Goals
    Syntax:" header, or a message saying all goals are correct.
    """
    offenders = find_non_NPs(goals)
    if not offenders:
        return "All goals are syntactically correct."
    return "Incorrect Goals Syntax:\n" + "\n".join(offenders)
# ############################################
# ########## Incorrect Softgoal Syntax ###########
def check_softgoal_syntax(softgoals):
    """Report the softgoal names flagged by ``find_non_NPs``.

    Returns the flagged names one per line under an "Incorrect Softgoals
    Syntax:" header, or a message saying all softgoals are correct.
    """
    offenders = find_non_NPs(softgoals)
    if not offenders:
        return "All softgoal are syntactically correct."
    return "Incorrect Softgoals Syntax:\n" + "\n".join(offenders)
# ############################################
# ########## Incorrect Task Syntax ###########
def find_NPs(sentences):
    """Return the sentences whose first token is NOT tagged with a label starting with 'V'.

    The sentences are tagged with a ``TokenClassificationPipeline`` built
    from ``pos_model`` and ``pos_tokenizer``. An empty input list yields an
    empty result without invoking the model, and a sentence for which the
    tagger returns no tokens is not reported.
    """
    if not sentences:
        return []
    # The original referenced undefined names ``model`` / ``tokenizer``,
    # which raised NameError; the POS model loaded at module level is meant.
    tagger = TokenClassificationPipeline(model=pos_model, tokenizer=pos_tokenizer)
    outputs = tagger(sentences)
    return [
        sentence
        for sentence, tokens in zip(sentences, outputs)
        # Guard against an empty token list before looking at the first tag.
        if tokens and not tokens[0]['entity'].startswith('V')
    ]
def check_task_syntax(tasks):
    """Report the task names returned by ``find_NPs``.

    Returns those names one per line under an "Incorrect Tasks Syntax:"
    header, or a message saying all tasks are correct.
    """
    offenders = find_NPs(tasks)
    if not offenders:
        return "All tasks are syntactically correct."
    return "Incorrect Tasks Syntax:\n" + "\n".join(offenders)
# ############################################
# ########## Incorrect Resource Syntax ###########
def check_resource_syntax(resources):
    """Report the resource names flagged by ``find_non_NPs``.

    Returns the flagged names one per line under an "Incorrect Resources
    Syntax:" header, or a message saying all resources are correct.
    """
    offenders = find_non_NPs(resources)
    if not offenders:
        return "All resources are syntactically correct."
    return "Incorrect Resources Syntax:\n" + "\n".join(offenders)
# ############################################
# ########## Similarity ###########
def get_similar_elements(elements_per_actor, similarity_threshold):
    """Report pairs of elements of the same actor that score above a similarity threshold.

    Args:
        elements_per_actor: maps an actor to the list of its element names.
        similarity_threshold: a pair is reported when the score predicted by
            ``sentences_similarity_model`` is strictly greater than this.

    Returns:
        A report listing each similar pair as "<first> and <second>", one per
        line, or "There are no similar elements." when no pair qualifies.
    """
    # Every unordered pair of elements belonging to the same actor.
    sentence_pairs = [
        [first, second]
        for names in elements_per_actor.values()
        for position, first in enumerate(names)
        for second in names[position + 1:]
    ]
    # Nothing to compare: skip the model call, which is not guaranteed to
    # accept an empty batch.
    if not sentence_pairs:
        return "There are no similar elements."

    scores = sentences_similarity_model.predict(sentence_pairs, show_progress_bar=True)
    similar_pairs = [
        pair
        for pair, score in zip(sentence_pairs, scores)
        if score > similarity_threshold
    ]
    if not similar_pairs:
        return "There are no similar elements."
    listing = "\n".join(' and '.join(pair) for pair in similar_pairs)
    return "The following elements are semantically similar:\n" + listing
# #################################
# ########## Misspelling ###########
def get_misspelled_words(sentence):
    """Return ``[word, correction]`` pairs for the words the speller would change.

    The sentence is split on whitespace and each word is passed through an
    ``autocorrect.Speller`` created with ``only_replacements=True``; a word
    is reported when the speller returns something different.
    """
    speller = Speller(only_replacements=True)
    corrections = ((word, speller(word)) for word in sentence.split())
    return [[word, fixed] for word, fixed in corrections if word != fixed]
def check_spelling(elements):
    """Report the spelling corrections suggested for every element name.

    ``elements`` maps a category to a list of names. Each suggestion adds a
    line of the form "<element>: <word> should be written as <correction>",
    each preceded by a newline. Returns an empty string when nothing is
    flagged.
    """
    report_lines = []
    for texts in elements.values():
        for text in texts:
            # Run the spell checker once per element; the original ran it
            # twice (once to test, once to collect the results).
            for wrong, suggestion in get_misspelled_words(text):
                report_lines.append(
                    "\n" + text + ": " + wrong + ' should be written as ' + suggestion
                )
    return "".join(report_lines)
# ##################################
# ########## NLI ###########
def do_nli(premise, hypothesis):
    """Classify a premise/hypothesis pair with the NLI model.

    The pair is encoded as ``[CLS] premise [SEP] hypothesis [SEP]`` with
    segment id 0 for the premise part and 1 for the hypothesis part, and is
    run through ``nli_model`` without gradient tracking.

    Returns:
        "Entailment", "Contradiction" or "Neutral" for predicted class
        0, 1 or 2 respectively, and an empty string for any other class.
        NOTE(review): this index-to-label order is assumed to match how the
        model was trained -- confirm against the model card.
    """
    premise_ids = nli_tokenizer.encode(premise, add_special_tokens=False)
    hypothesis_ids = nli_tokenizer.encode(hypothesis, add_special_tokens=False)
    cls_id = nli_tokenizer.cls_token_id
    sep_id = nli_tokenizer.sep_token_id

    # Assemble the single-example batch by hand.
    pair_ids = [cls_id] + premise_ids + [sep_id] + hypothesis_ids + [sep_id]
    segment_ids = [0] * (len(premise_ids) + 2) + [1] * (len(hypothesis_ids) + 1)
    attention_mask = [1] * (len(premise_ids) + len(hypothesis_ids) + 3)

    batch_tokens = pad_sequence([torch.tensor(pair_ids)], batch_first=True)
    batch_mask = pad_sequence([torch.tensor(attention_mask)], batch_first=True)
    batch_segments = pad_sequence([torch.tensor(segment_ids)], batch_first=True)

    with torch.no_grad():
        output = nli_model(batch_tokens,
                           token_type_ids=batch_segments,
                           attention_mask=batch_mask)

    predicted_class = int(np.argmax(output.logits.cpu().numpy()))
    labels = {0: "Entailment", 1: "Contradiction", 2: "Neutral"}
    return labels.get(predicted_class, "")
# Entailment
def check_entailment(decomposed_elements):
    """Report element/sub-element pairs that the NLI model does not label as entailment.

    ``decomposed_elements`` maps an element name to the names of its
    sub-elements. Each (element, sub-element) pair is passed to ``do_nli``
    with the element as premise; pairs whose result is not "Entailment" are
    listed as "<element> and <sub-element>", one per line.
    """
    mismatched = [
        [parent, child]
        for parent, children in decomposed_elements.items()
        for child in children
        if do_nli(parent, child) != "Entailment"
    ]
    # The original ended with an unreachable ``return result`` (``result`` is
    # unbound when there are no pairs) and printed every NLI result to
    # stdout; both were debugging leftovers and are dropped.
    if not mismatched:
        return "There are no miss matched elements."
    listing = "\n".join(' and '.join(pair) for pair in mismatched)
    return "The following elements are miss matching:\n" + listing
# Contradiction
def check_contradiction(elements_per_actor):
    """Report pairs of elements of the same actor that the NLI model labels as contradicting.

    ``elements_per_actor`` maps an actor to the list of its element names.
    Every unordered pair within one actor is passed to ``do_nli``; pairs
    labelled "Contradiction" are listed as "<first> and <second>", one per
    line.
    """
    # Build all pairs first, then classify them in that order.
    candidate_pairs = []
    for names in elements_per_actor.values():
        for position, first in enumerate(names):
            for second in names[position + 1:]:
                candidate_pairs.append([first, second])

    contradicting = [
        pair for pair in candidate_pairs
        if do_nli(pair[0], pair[1]) == "Contradiction"
    ]
    if not contradicting:
        return "There are no contradicting elements."
    listing = "\n".join(' and '.join(pair) for pair in contradicting)
    return "The following elements are contradicting:\n" + listing
# ##########################
# ************************* User Interface *************************
def identify_bad_smells(tgrl_file, selected_bad_smells, size_threshold, similarity_threshold):
    """Run the selected bad-smell detectors on a TGRL file and concatenate their reports.

    Args:
        tgrl_file: file object accepted by ``parse_tgrl``.
        selected_bad_smells: collection of detector labels to run.
        size_threshold: word-count limit passed to ``get_long_elements``.
        similarity_threshold: score limit passed to ``get_similar_elements``.

    Returns:
        The reports of the selected detectors, each followed by a blank
        line, always in the fixed order of the table below.
    """
    tgrl_text = parse_tgrl(tgrl_file)
    elements, elements_per_actor, decomposed_elements = extract_elements(tgrl_text)

    # (label, detector) in report order; detectors run only when selected.
    detectors = [
        ('Size', lambda: get_long_elements(elements, size_threshold)),
        ('Complexity', lambda: get_complex_sentences(elements)),
        ('Punctuations', lambda: get_punctuations(elements)),
        ('Actors Syntax', lambda: check_actor_syntax(elements['actors'])),
        ('Goals Syntax', lambda: check_goal_syntax(elements['goals'])),
        ('Softgoals Syntax', lambda: check_softgoal_syntax(elements['softGoals'])),
        ('Tasks Syntax', lambda: check_task_syntax(elements['tasks'])),
        ('Resources Syntax', lambda: check_resource_syntax(elements['resources'])),
        ('Similar Elements', lambda: get_similar_elements(elements_per_actor, similarity_threshold)),
        ('Spelling Mistakes', lambda: check_spelling(elements)),
        ('Goal-Subgoal Mismatch', lambda: check_entailment(decomposed_elements)),
        ('Contradicting Elements', lambda: check_contradiction(elements_per_actor)),
    ]
    return "".join(
        detect() + "\n\n"
        for label, detect in detectors
        if label in selected_bad_smells
    )
# Labels offered in the checkbox group; identify_bad_smells matches on these
# exact strings.
_bad_smell_choices = [
    "Size",
    "Complexity",
    "Punctuations",
    "Actors Syntax",
    "Goals Syntax",
    "Softgoals Syntax",
    "Tasks Syntax",
    "Resources Syntax",
    "Similar Elements",
    "Spelling Mistakes",
    "Goal-Subgoal Mismatch",
    "Contradicting Elements",
]

# Gradio UI: file upload, detector selection and the two thresholds in,
# one text report out.
interface = gr.Interface(
    fn=identify_bad_smells,
    inputs=[
        gr.File(label="TGRL File"),
        gr.CheckboxGroup(_bad_smell_choices, label="Which bad smells you want to detect?"),
        gr.Slider(label="Size threshold", value=5, minimum=2, maximum=10, step=1),
        gr.Slider(label="Similarity threshold", value=0.9, minimum=0, maximum=1, step=0.1),
    ],
    outputs=[gr.Textbox(label="Detected bad smells:")],
    title="TGRL Bad Smells Detection",
    description="Upload your .xgrl file and we will find the bad smells for you!",
    theme=gr.themes.Soft(),
)
interface.launch(inline=False)