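"""TGRL Bad Smells Detection.

A Gradio app that parses an uploaded .xgrl file and flags "bad smells" in the
TGRL model it contains: overly long or complex element names, stray
punctuation, incorrect actor/goal/softgoal/task/resource syntax, semantically
similar elements, spelling mistakes, goal-subgoal mismatches, and
contradicting elements.
"""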
import os
import re

import gradio as gr
import numpy as np
import stanza
import torch
from autocorrect import Speller
from sentence_transformers import CrossEncoder
from stanza.models.constituency.parse_tree import Tree
from torch.nn.utils.rnn import pad_sequence
from transformers import (AutoTokenizer, AutoModelForTokenClassification,
                          TokenClassificationPipeline, BertTokenizer,
                          BertForSequenceClassification)
# ***************************** Load needed models *****************************
nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency')

pos_tokenizer = AutoTokenizer.from_pretrained("QCRI/bert-base-multilingual-cased-pos-english")
pos_model = AutoModelForTokenClassification.from_pretrained("QCRI/bert-base-multilingual-cased-pos-english")
# Build the POS tagging pipeline once here instead of on every syntax check
pos_pipeline = TokenClassificationPipeline(model=pos_model, tokenizer=pos_tokenizer)

#sentences_similarity_model = CrossEncoder('cross-encoder/stsb-roberta-base')
sentences_similarity_model = CrossEncoder('WillHeld/roberta-base-stsb')

# Read the Hugging Face access token from the environment; never hardcode
# credentials in source code.
HF_TOKEN = os.environ.get("HF_TOKEN")
nli_model = BertForSequenceClassification.from_pretrained("nouf-sst/bert-base-MultiNLI", use_auth_token=HF_TOKEN)
nli_tokenizer = BertTokenizer.from_pretrained("nouf-sst/bert-base-MultiNLI", use_auth_token=HF_TOKEN, do_lower_case=True)
# ***************************** TGRL Parsing *****************************
def parse_tgrl(file_obj):
    with open(file_obj.name, 'r') as f:
        tgrl_text = f.read()
        tgrl_text = tgrl_text.replace('\t', '')
        tgrl_text = tgrl_text.replace('\n', '')
    return tgrl_text
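
# The extraction below assumes TGRL declarations of the following form
# (illustrative, inferred from the regular expressions):
#   actor Customer { name = "Customer" ... }
#   goal G1 { name = "Place an order" ... }
#   G1 decomposedBy G2, G3;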
def extract_elements(tgrl_text):
    # Extract actors
    actors = re.findall(r"(?:.*?actor\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s;.,!?:-]*)(?:\")", tgrl_text)
    # Extract goals
    goals = re.findall(r"(?:.*?goal\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s;.,!?:-]*)(?:\")", tgrl_text)
    # Extract softGoals
    softGoals = re.findall(r"(?:.*?softGoal\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s;.,!?:-]*)(?:\")", tgrl_text)
    # Extract tasks
    tasks = re.findall(r"(?:.*?task\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s;.,!?:-]*)(?:\")", tgrl_text)
    # Extract resources
    resources = re.findall(r"(?:.*?resource\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s;.,!?:-]*)(?:\")", tgrl_text)
    elements = {
        "actors": actors,
        "goals": goals,
        "softGoals": softGoals,
        "tasks": tasks,
        "resources": resources
    }
    # Get elements (goals, softgoals and tasks) per actor: each element is
    # attributed to the closest preceding "actor" declaration
    elements_per_actor = {}
    for element in goals + softGoals + tasks:
        actor_position = tgrl_text.rfind('actor', 0, tgrl_text.index(element))
        corresponding_actor = re.split(r' |\{', tgrl_text[actor_position:])[1]
        if corresponding_actor not in elements_per_actor:
            elements_per_actor[corresponding_actor] = []
        elements_per_actor[corresponding_actor].append(element)
    # Get decomposed elements: an "X decomposedBy A, B, C;" statement is parsed
    # one sub-element at a time until the rebuilt statement matches the source
    new_lines = tgrl_text
    decomposed_elements = {}
    main_elements = re.findall(r"\w+(?=\s+decomposedBy)", new_lines)
    for main_element in main_elements:
        sub_elements = []
        while True:
            sub_element = re.findall(main_element + r"(?: decomposedBy )([A-Za-z\s]*)", new_lines)[0]
            sub_elements.append(sub_element)
            new_lines = new_lines.replace(sub_element + ', ', '')
            temp = main_element + " decomposedBy " + ", ".join(sub_elements) + ";"
            if temp in tgrl_text:
                break
        decomposed_elements[main_element] = sub_elements
    # Replace element IDs with names
    new_decomposed_elements = {}
    for key in decomposed_elements:
        new_key = re.findall(r"(?:" + key + r"\s*{\s*name\s=\s\")([A-Za-z\s;.,!?:-]*)", tgrl_text)[0]
        new_values = []
        for element in decomposed_elements[key]:
            new_value = re.findall(r"(?:" + element + r"\s*{\s*name\s=\s\")([A-Za-z\s;.,!?:-]*)", tgrl_text)[0]
            new_values.append(new_value)
        new_decomposed_elements[new_key] = new_values
    return elements, elements_per_actor, new_decomposed_elements
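
# Shapes of the returned structures (illustrative):
#   elements                -> {"actors": [...], "goals": [...], "softGoals": [...], "tasks": [...], "resources": [...]}
#   elements_per_actor      -> {"ActorID": ["element name", ...], ...}
#   new_decomposed_elements -> {"parent element name": ["sub-element name", ...], ...}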
# ************************************************************************

# ************************* Bad Smells Detection *************************

# ########### Long Elements ###########
def get_long_elements(elements, size_threshold):  # Using RegEx
    long_elements = []
    for key in elements:
        for element in elements[key]:
            # An element is "long" when its name exceeds size_threshold words
            if len(re.findall(r'\w+', element)) > size_threshold:
                long_elements.append(element)
    if long_elements:
        return "Long elements:\n" + "\n".join(long_elements)
    else:
        return "Long elements:\nNone."
# #####################################

# ######### Complex Sentences #########
def is_complex_sentence(sentence):
    # Reuse the global stanza pipeline rather than rebuilding it on every call
    doc = nlp(sentence)
    for sent in doc.sentences:
        unique_constituent_labels = Tree.get_unique_constituent_labels(sent.constituency)
        # SBAR marks a subordinate clause in the constituency parse
        if 'SBAR' in unique_constituent_labels:
            return True
    return False
def get_complex_sentences(elements):
    complex_sentences = []
    for key in elements:
        for element in elements[key]:
            if is_complex_sentence(element):
                complex_sentences.append(element)
    if complex_sentences:
        return "Complex sentences:\n" + "\n".join(complex_sentences)
    else:
        return "Complex sentences:\nNone."
# #####################################

# ########## Punctuations #########
def get_punctuations(elements):
    punctuations = []
    for key in elements:
        for element in elements[key]:
            # Flag any character that is not whitespace, alphanumeric or a hyphen
            if len(re.findall(r"[^\s\w\d-]", element)) > 0:
                punctuations.append(element)
    if punctuations:
        return "Punctuations:\n" + "\n".join(punctuations)
    else:
        return "Punctuations:\nNone."

# #################################
# ########## Incorrect Actor Syntax ##########
def find_non_NPs(sentences):
    # Elements whose first token is POS-tagged as a verb (V*) are not noun phrases
    if not sentences:
        return []
    outputs = pos_pipeline(sentences)
    Non_NPs = []
    for idx, output in enumerate(outputs):
        if output[0]['entity'].startswith('V'):
            Non_NPs.append(sentences[idx])
    return Non_NPs

def check_actor_syntax(actors):
    # Actor names should be noun phrases
    incorrect_actor_syntax = find_non_NPs(actors)
    if incorrect_actor_syntax:
        return "Incorrect Actors Syntax:\n" + "\n".join(incorrect_actor_syntax)
    else:
        return "All actors are syntactically correct."
# ############################################

# ########## Incorrect Goal Syntax ###########
def check_goal_syntax(goals):
    # Goal names should be noun phrases
    incorrect_goal_syntax = find_non_NPs(goals)
    if incorrect_goal_syntax:
        return "Incorrect Goals Syntax:\n" + "\n".join(incorrect_goal_syntax)
    else:
        return "All goals are syntactically correct."

# ############################################

# ########## Incorrect Softgoal Syntax ###########
def check_softgoal_syntax(softgoals):
    # Softgoal names should be noun phrases
    incorrect_softgoal_syntax = find_non_NPs(softgoals)
    if incorrect_softgoal_syntax:
        return "Incorrect Softgoals Syntax:\n" + "\n".join(incorrect_softgoal_syntax)
    else:
        return "All softgoals are syntactically correct."

# ############################################
# ########## Incorrect Task Syntax ###########
def find_NPs(sentences):
    # Elements whose first token is NOT tagged as a verb are treated as noun phrases
    if not sentences:
        return []
    outputs = pos_pipeline(sentences)
    NPs = []
    for idx, output in enumerate(outputs):
        if not output[0]['entity'].startswith('V'):
            NPs.append(sentences[idx])
    return NPs

def check_task_syntax(tasks):
    # Task names should start with a verb, so noun phrases are flagged
    incorrect_task_syntax = find_NPs(tasks)
    if incorrect_task_syntax:
        return "Incorrect Tasks Syntax:\n" + "\n".join(incorrect_task_syntax)
    else:
        return "All tasks are syntactically correct."

# ############################################
# ########## Incorrect Resource Syntax ###########
def check_resource_syntax(resources):
    # Resource names should be noun phrases
    incorrect_resource_syntax = find_non_NPs(resources)
    if incorrect_resource_syntax:
        return "Incorrect Resources Syntax:\n" + "\n".join(incorrect_resource_syntax)
    else:
        return "All resources are syntactically correct."

# ############################################
# ########## Similarity ###########
def get_similar_elements(elements_per_actor, similarity_threshold):
    # Prepare the sentence pair array: every pair of elements of the same actor
    sentence_pairs = []
    for key in elements_per_actor:
        for i in range(len(elements_per_actor[key])):
            for j in range(i + 1, len(elements_per_actor[key])):
                sentence_pairs.append([elements_per_actor[key][i], elements_per_actor[key][j]])
    # Predict semantic similarity
    semantic_similarity_scores = sentences_similarity_model.predict(sentence_pairs, show_progress_bar=True)
    similar_elements = []
    for index, value in enumerate(sentence_pairs):
        if semantic_similarity_scores[index] > similarity_threshold:
            similar_elements.append(value)
    if similar_elements:
        similar_elements = [' and '.join(ele) for ele in similar_elements]
        return "The following elements are semantically similar:\n" + "\n".join(similar_elements)
    else:
        return "There are no similar elements."

# #################################
# ########## Misspelling ###########
def get_misspelled_words(sentence):
    spell = Speller(only_replacements=True)
    misspelled = []
    for word in sentence.split():
        correct_word = spell(word)
        if word != correct_word:
            misspelled.append([word, correct_word])
    return misspelled

def check_spelling(elements):
    spelling_mistakes = []
    spelling_mistakes_string = ""
    for key in elements:
        for element in elements[key]:
            # Run the speller once per element and keep elements with mistakes
            misspelled_words = get_misspelled_words(element)
            if misspelled_words:
                spelling_mistakes.append([element, misspelled_words])
    for element in spelling_mistakes:
        for spelling_mistake in element[1]:
            temp = ' should be written as '.join(spelling_mistake)
            spelling_mistakes_string = spelling_mistakes_string + "\n" + element[0] + ": " + temp
    if spelling_mistakes_string:
        return "Spelling mistakes:" + spelling_mistakes_string
    else:
        return "Spelling mistakes:\nNone."

# ##################################
# ########## NLI ###########
def do_nli(premise, hypothesis):
    # Tokenization: [CLS] premise [SEP] hypothesis [SEP]
    token_ids = []
    seg_ids = []
    mask_ids = []
    premise_id = nli_tokenizer.encode(premise, add_special_tokens=False)
    hypothesis_id = nli_tokenizer.encode(hypothesis, add_special_tokens=False)
    pair_token_ids = [nli_tokenizer.cls_token_id] + premise_id + [nli_tokenizer.sep_token_id] + hypothesis_id + [nli_tokenizer.sep_token_id]
    premise_len = len(premise_id)
    hypothesis_len = len(hypothesis_id)
    segment_ids = torch.tensor([0] * (premise_len + 2) + [1] * (hypothesis_len + 1))  # sentence 0 and sentence 1
    attention_mask_ids = torch.tensor([1] * (premise_len + hypothesis_len + 3))  # mask padded values
    token_ids.append(torch.tensor(pair_token_ids))
    seg_ids.append(segment_ids)
    mask_ids.append(attention_mask_ids)
    # Forward pass
    token_ids = pad_sequence(token_ids, batch_first=True)
    mask_ids = pad_sequence(mask_ids, batch_first=True)
    seg_ids = pad_sequence(seg_ids, batch_first=True)
    with torch.no_grad():
        output = nli_model(token_ids,
                           token_type_ids=seg_ids,
                           attention_mask=mask_ids)
    # Output prediction: 0 = entailment, 1 = contradiction, 2 = neutral
    result = ""
    prediction = np.argmax(output.logits.cpu().numpy()).item()
    if prediction == 0:
        result = "Entailment"
    elif prediction == 1:
        result = "Contradiction"
    elif prediction == 2:
        result = "Neutral"
    return result
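
# Illustrative use: do_nli("The user places an order", "An order is placed")
# returns one of "Entailment", "Contradiction" or "Neutral".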
# Entailment
def check_entailment(decomposed_elements):
    sentence_pairs = []
    non_matching_elements = []
    # Pair each parent element with each of its sub-elements
    for key in decomposed_elements:
        for sub_element in decomposed_elements[key]:
            sentence_pairs.append([key, sub_element])
    for sentence_pair in sentence_pairs:
        result = do_nli(sentence_pair[0], sentence_pair[1])
        if result != "Entailment":
            non_matching_elements.append(sentence_pair)
    if non_matching_elements:
        non_matching_elements = [' and '.join(ele) for ele in non_matching_elements]
        return "The following elements are mismatched:\n" + "\n".join(non_matching_elements)
    else:
        return "There are no mismatched elements."
# Contradiction
def check_contradiction(elements_per_actor):
    sentence_pairs = []
    contradicting_elements = []
    for key in elements_per_actor:
        for i in range(len(elements_per_actor[key])):
            for j in range(i + 1, len(elements_per_actor[key])):
                sentence_pairs.append([elements_per_actor[key][i], elements_per_actor[key][j]])
    # Check contradiction
    for sentence_pair in sentence_pairs:
        result = do_nli(sentence_pair[0], sentence_pair[1])
        if result == "Contradiction":
            contradicting_elements.append(sentence_pair)
    if contradicting_elements:
        contradicting_elements = [' and '.join(ele) for ele in contradicting_elements]
        return "The following elements are contradicting:\n" + "\n".join(contradicting_elements)
    else:
        return "There are no contradicting elements."

# ##########################
# ************************* User Interface *************************
def identify_bad_smells(tgrl_file, selected_bad_smells, size_threshold, similarity_threshold):
    output = ""
    tgrl_text = parse_tgrl(tgrl_file)
    elements, elements_per_actor, decomposed_elements = extract_elements(tgrl_text)
    if 'Size' in selected_bad_smells:
        output += get_long_elements(elements, size_threshold) + "\n\n"
    if 'Complexity' in selected_bad_smells:
        output += get_complex_sentences(elements) + "\n\n"
    if 'Punctuations' in selected_bad_smells:
        output += get_punctuations(elements) + "\n\n"
    if 'Actors Syntax' in selected_bad_smells:
        output += check_actor_syntax(elements['actors']) + "\n\n"
    if 'Goals Syntax' in selected_bad_smells:
        output += check_goal_syntax(elements['goals']) + "\n\n"
    if 'Softgoals Syntax' in selected_bad_smells:
        output += check_softgoal_syntax(elements['softGoals']) + "\n\n"
    if 'Tasks Syntax' in selected_bad_smells:
        output += check_task_syntax(elements['tasks']) + "\n\n"
    if 'Resources Syntax' in selected_bad_smells:
        output += check_resource_syntax(elements['resources']) + "\n\n"
    if 'Similar Elements' in selected_bad_smells:
        output += get_similar_elements(elements_per_actor, similarity_threshold) + "\n\n"
    if 'Spelling Mistakes' in selected_bad_smells:
        output += check_spelling(elements) + "\n\n"
    if 'Goal-Subgoal Mismatch' in selected_bad_smells:
        output += check_entailment(decomposed_elements) + "\n\n"
    if 'Contradicting Elements' in selected_bad_smells:
        output += check_contradiction(elements_per_actor) + "\n\n"
    return output
interface = gr.Interface(fn=identify_bad_smells,
                         inputs=[gr.File(label="TGRL File"),
                                 gr.CheckboxGroup(["Size", "Complexity", "Punctuations", "Actors Syntax", "Goals Syntax",
                                                   "Softgoals Syntax", "Tasks Syntax", "Resources Syntax", "Similar Elements",
                                                   "Spelling Mistakes", "Goal-Subgoal Mismatch", "Contradicting Elements"],
                                                  label="Which bad smells do you want to detect?"),
                                 gr.Slider(label="Size threshold", value=5, minimum=2, maximum=10, step=1),
                                 gr.Slider(label="Similarity threshold", value=0.9, minimum=0, maximum=1, step=0.1)],
                         outputs=[gr.Textbox(label="Detected bad smells:")],
                         title="TGRL Bad Smells Detection",
                         description="Upload your .xgrl file and we will find the bad smells for you!",
                         theme=gr.themes.Soft())

interface.launch(inline=False)
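# Note: the models are downloaded at startup, so the first launch can take a
# while. When running outside Hugging Face Spaces, pass share=True to launch()
# to expose a temporary public link.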