Spaces:

nouf-sst
/

TGRL-bad-smells

Runtime error

File size: 18,184 Bytes

import gradio as gr
import re
import json
import numpy as np
import nltk
import stanza
from stanza.models.constituency.parse_tree import Tree
from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline
from sentence_transformers import CrossEncoder
from autocorrect import Speller
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.nn.utils.rnn import pad_sequence

# ***************************** Load needed models *****************************
import os

# Access token for the Hugging Face Hub, read from the environment (for
# example a Space secret named HF_TOKEN) so that no credential lives in the
# source. When the variable is unset, None is passed and the libraries fall
# back to their default authentication behaviour.
# NOTE(review): the token that used to be hard-coded here has been published
# and should be revoked.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Constituency parser shared by the complexity check.
nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency')

# Part-of-speech tagger used by the syntax checks.
pos_tokenizer = AutoTokenizer.from_pretrained("QCRI/bert-base-multilingual-cased-pos-english")
pos_model = AutoModelForTokenClassification.from_pretrained("QCRI/bert-base-multilingual-cased-pos-english")

# Sentence-pair similarity scorer.
#sentences_similarity_model = CrossEncoder('cross-encoder/stsb-roberta-base')
sentences_similarity_model = CrossEncoder('WillHeld/roberta-base-stsb')

# Natural language inference model and its tokenizer.
nli_model = BertForSequenceClassification.from_pretrained("nouf-sst/bert-base-MultiNLI", use_auth_token=HF_TOKEN)
nli_tokenizer = BertTokenizer.from_pretrained("nouf-sst/bert-base-MultiNLI", use_auth_token=HF_TOKEN, do_lower_case=True)

# ***************************** TGRL Parsing *****************************

def parse_tgrl(file_obj):
  """Read an uploaded TGRL file and return its text on a single line.

  Args:
    file_obj: object whose `name` attribute is the path of the file to read.

  Returns:
    The file content with every tab and newline character removed.
  """
  with open(file_obj.name, 'r') as handle:
    raw_text = handle.read()

  # Flatten the model so the regex-based extraction can treat it as one line.
  return raw_text.replace('\t', '').replace('\n', '')

# Characters accepted inside a quoted element name.
_NAME_CHARS = r'[A-Za-z\s;.,!?:-]*'


def _find_names(keyword, tgrl_text):
  """Return the quoted names of every `<keyword> <id> {name = "..."}` declaration."""
  pattern = (r'(?:.*?' + keyword + r'\s\S+\s?\{\s?name\s?=\s?")('
             + _NAME_CHARS + r')(?:")')
  return re.findall(pattern, tgrl_text)


def _owning_actor(element_name, tgrl_text):
  """Return the ID following the last 'actor' keyword that precedes the element name.

  The first occurrence of the name in the text is used. Returns None when no
  'actor' keyword precedes it.
  """
  actor_pos = tgrl_text.rfind('actor', 0, tgrl_text.index(element_name))
  if actor_pos == -1:
    return None
  parts = re.split(r' |\{', tgrl_text[actor_pos:])
  return parts[1] if len(parts) > 1 else None


def _parse_decompositions(tgrl_text):
  """Map each decomposed element ID to the IDs listed in its `decomposedBy` statements."""
  decomposed = {}
  for main_id, sub_list in re.findall(r'(\w+)\s+decomposedBy\s+([^;]*);', tgrl_text):
    sub_ids = [part.strip() for part in sub_list.split(',') if part.strip()]
    decomposed.setdefault(main_id, []).extend(sub_ids)
  return decomposed


def _lookup_name(element_id, tgrl_text):
  """Return the declared name of an element ID, or the ID itself when no declaration is found."""
  # The word boundary keeps an ID from matching the tail of a longer ID.
  pattern = r'\b' + re.escape(element_id) + r'\s*\{\s*name\s=\s"(' + _NAME_CHARS + r')'
  match = re.search(pattern, tgrl_text)
  return match.group(1) if match else element_id


def extract_elements(tgrl_text):
  """Extract element names, per-actor groupings and decompositions from TGRL text.

  Args:
    tgrl_text: the TGRL model as a single string.

  Returns:
    A tuple `(elements, elements_per_actor, decomposed_elements)` where
    * `elements` maps "actors", "goals", "softGoals", "tasks" and
      "resources" to the list of names declared for that kind;
    * `elements_per_actor` maps an actor ID to the names of the goals,
      softgoals and tasks (in that order) found after that actor's keyword;
    * `decomposed_elements` maps the name of each decomposed element to the
      names of its sub-elements. It is an empty dict when the model
      contains no `decomposedBy` statement.
  """
  elements = {
    "actors": _find_names('actor', tgrl_text),
    "goals": _find_names('goal', tgrl_text),
    "softGoals": _find_names('softGoal', tgrl_text),
    "tasks": _find_names('task', tgrl_text),
    "resources": _find_names('resource', tgrl_text),
  }

  # Group goals, softgoals and tasks under the actor they are declared in.
  elements_per_actor = {}
  for name in elements["goals"] + elements["softGoals"] + elements["tasks"]:
    actor_id = _owning_actor(name, tgrl_text)
    if actor_id is None:
      # No actor keyword precedes this element; nothing to group it under.
      continue
    elements_per_actor.setdefault(actor_id, []).append(name)

  # Decomposition statements reference IDs; translate them to declared names.
  new_decomposed_elements = {}
  for main_id, sub_ids in _parse_decompositions(tgrl_text).items():
    main_name = _lookup_name(main_id, tgrl_text)
    new_decomposed_elements[main_name] = [
      _lookup_name(sub_id, tgrl_text) for sub_id in sub_ids
    ]

  return elements, elements_per_actor, new_decomposed_elements

# ************************************************************************

# ************************* Bad Smells Detection *************************

# ########### Long Elements ###########
def get_long_elements(elements, size_threshold): # Using RegEx
  """Report the element names that contain more words than the threshold.

  Args:
    elements: dict mapping an element kind to a list of element names.
    size_threshold: a name is reported when its number of `\\w+` runs is
      strictly greater than this value.

  Returns:
    A report string starting with "Long elements:" followed by one name per
    line, or by "None." when no name exceeds the threshold.
  """
  too_long = [
    name
    for names in elements.values()
    for name in names
    if len(re.findall(r'\w+', name)) > size_threshold
  ]

  if not too_long:
    return "Long elements:\nNone."
  return "Long elements:\n" + "\n".join(too_long)
# #####################################

# ######### Complex Sentences #########
def is_complex_sentence(sentence):
  """Tell whether a text contains a subordinate clause.

  The text is parsed with the module-level stanza pipeline `nlp` instead of
  building a new pipeline on every call, and every sentence stanza finds in
  the text is inspected rather than only the first one.

  Args:
    sentence: the text to analyse.

  Returns:
    True when the constituency tree of any parsed sentence contains an
    'SBAR' label, False otherwise (including when nothing was parsed).
  """
  doc = nlp(sentence)
  return any(
    'SBAR' in Tree.get_unique_constituent_labels(parsed.constituency)
    for parsed in doc.sentences
  )

def get_complex_sentences(elements):
  """Report the element names that `is_complex_sentence` flags as complex.

  Args:
    elements: dict mapping an element kind to a list of element names.

  Returns:
    A report string starting with "Complex sentences:" followed by one name
    per line, or by "None." when no name is flagged.
  """
  flagged = [
    name
    for names in elements.values()
    for name in names
    if is_complex_sentence(name)
  ]

  if not flagged:
    return "Complex sentences:\nNone."
  return "Complex sentences:\n" + "\n".join(flagged)

# #####################################
            
# ########## Punctuations ######### 
def get_punctuations(elements):
  """Report the element names that contain punctuation.

  A name is reported when it holds at least one character that is neither
  whitespace, a word character, a digit nor a hyphen.

  Args:
    elements: dict mapping an element kind to a list of element names.

  Returns:
    A report string starting with "Punctuations:" followed by one name per
    line, or by "None." when no name contains punctuation.
  """
  # Raw string: the pattern is unchanged, but the escapes are no longer
  # invalid string escape sequences.
  punctuation = re.compile(r"[^\s\w\d-]")

  flagged = [
    name
    for names in elements.values()
    for name in names
    if punctuation.search(name)
  ]

  if not flagged:
    return "Punctuations:\nNone."
  return "Punctuations:\n" + "\n".join(flagged)
# #################################

# ########## Incorrect Actor Syntax ##########
def find_non_NPs(sentences):
  """Return the sentences whose first token is tagged with a label starting with 'V'.

  Args:
    sentences: list of strings to tag with the POS model.

  Returns:
    The subset of `sentences`, in input order, whose first tagged token has
    an 'entity' label beginning with 'V'.
  """
  tagger = TokenClassificationPipeline(model=pos_model, tokenizer=pos_tokenizer)
  tagged = tagger(sentences)

  return [
    sentences[position]
    for position, tokens in enumerate(tagged)
    if tokens[0]['entity'].startswith('V')
  ]

def check_actor_syntax(actors):
  """Report the actor names that `find_non_NPs` flags.

  Args:
    actors: list of actor names.

  Returns:
    A report listing the flagged names one per line, or a message stating
    that all actors are syntactically correct.
  """
  flagged = find_non_NPs(actors)

  if not flagged:
    return "All actors are syntactically correct."
  return "Incorrect Actors Syntax:\n" + "\n".join(flagged)
# ############################################

# ########## Incorrect Goal Syntax ###########
def check_goal_syntax(goals):
  """Report the goal names that `find_non_NPs` flags.

  Args:
    goals: list of goal names.

  Returns:
    A report listing the flagged names one per line, or a message stating
    that all goals are syntactically correct.
  """
  flagged = find_non_NPs(goals)

  if not flagged:
    return "All goals are syntactically correct."
  return "Incorrect Goals Syntax:\n" + "\n".join(flagged)
# ############################################

# ########## Incorrect Softgoal Syntax ###########
def check_softgoal_syntax(softgoals):
  """Report the softgoal names that `find_non_NPs` flags.

  Args:
    softgoals: list of softgoal names.

  Returns:
    A report listing the flagged names one per line, or a message stating
    that no softgoal was flagged.
  """
  flagged = find_non_NPs(softgoals)

  if not flagged:
    return "All softgoal are syntactically correct."
  return "Incorrect Softgoals Syntax:\n" + "\n".join(flagged)
# ############################################

# ########## Incorrect Task Syntax ###########
def find_NPs(sentences):
  """Return the sentences whose first token is NOT tagged with a label starting with 'V'.

  Uses the module-level `pos_model` / `pos_tokenizer`; the previous
  references to `model` and `tokenizer` named objects that are not defined
  in this file.

  Args:
    sentences: list of strings to tag with the POS model.

  Returns:
    The subset of `sentences`, in input order, whose first tagged token has
    an 'entity' label that does not begin with 'V'.
  """
  pipeline = TokenClassificationPipeline(model=pos_model, tokenizer=pos_tokenizer)

  outputs = pipeline(sentences)

  return [
    sentences[idx]
    for idx, output in enumerate(outputs)
    if not output[0]['entity'].startswith('V')
  ]

def check_task_syntax(tasks):
  """Report the task names that `find_NPs` flags.

  Args:
    tasks: list of task names.

  Returns:
    A report listing the flagged names one per line, or a message stating
    that all tasks are syntactically correct.
  """
  flagged = find_NPs(tasks)

  if not flagged:
    return "All tasks are syntactically correct."
  return "Incorrect Tasks Syntax:\n" + "\n".join(flagged)
# ############################################

# ########## Incorrect Resource Syntax ###########
def check_resource_syntax(resources):
  """Report the resource names that `find_non_NPs` flags.

  Args:
    resources: list of resource names.

  Returns:
    A report listing the flagged names one per line, or a message stating
    that all resources are syntactically correct.
  """
  flagged = find_non_NPs(resources)

  if not flagged:
    return "All resources are syntactically correct."
  return "Incorrect Resources Syntax:\n" + "\n".join(flagged)
# ############################################

# ########## Similarity ###########
def get_similar_elements(elements_per_actor, similarity_threshold):
  """Report pairs of elements of the same actor that score above a similarity threshold.

  Args:
    elements_per_actor: dict mapping an actor ID to a list of element names.
    similarity_threshold: a pair is reported when the score predicted by
      `sentences_similarity_model` is strictly greater than this value.

  Returns:
    A report listing each similar pair as "<first> and <second>", one per
    line, or "There are no similar elements." when no pair qualifies.
  """
  # Every unordered pair of elements belonging to the same actor.
  sentence_pairs = [
    [names[i], names[j]]
    for names in elements_per_actor.values()
    for i in range(len(names))
    for j in range(i + 1, len(names))
  ]

  # Nothing to compare: do not call the model with an empty batch.
  if not sentence_pairs:
    return "There are no similar elements."

  # Predict semantic similarity
  scores = sentences_similarity_model.predict(sentence_pairs, show_progress_bar=True)

  similar_pairs = [
    pair for pair, score in zip(sentence_pairs, scores)
    if score > similarity_threshold
  ]

  if not similar_pairs:
    return "There are no similar elements."

  listing = "\n".join(' and '.join(pair) for pair in similar_pairs)
  return "The following elements are semantically similar:\n" + listing
# #################################

# ########## Misspelling ###########
def get_misspelled_words(sentence):
  """Return the words of a sentence that the spell checker would change.

  Args:
    sentence: text that is split on whitespace into words.

  Returns:
    A list of `[word, suggestion]` pairs, in sentence order, for every word
    whose suggestion from `Speller(only_replacements=True)` differs from it.
  """
  checker = Speller(only_replacements=True)

  candidates = [[token, checker(token)] for token in sentence.split()]

  return [pair for pair in candidates if pair[0] != pair[1]]

def check_spelling(elements):
  """Report the spelling corrections suggested for every element name.

  Each name is passed to `get_misspelled_words` exactly once; the previous
  code called it twice for every name that had a suggestion.

  Args:
    elements: dict mapping an element kind to a list of element names.

  Returns:
    A string with one line per suggestion, each preceded by a newline and
    formatted as "<name>: <word> should be written as <suggestion>". The
    string is empty when there is no suggestion.
  """
  report_lines = []

  for names in elements.values():
    for name in names:
      for spelling_mistake in get_misspelled_words(name):
        report_lines.append(name + ": " + ' should be written as '.join(spelling_mistake))

  return "".join("\n" + line for line in report_lines)
# ##################################

# ########## NLI ###########
def do_nli(premise, hypothesis):
  """Classify a premise/hypothesis pair with the NLI model.

  Args:
    premise: first sentence of the pair.
    hypothesis: second sentence of the pair.

  Returns:
    "Entailment", "Contradiction" or "Neutral" when the highest logit has
    index 0, 1 or 2 respectively; an empty string for any other index.
  """
  # Tokenization: [CLS] premise [SEP] hypothesis [SEP]
  premise_ids = nli_tokenizer.encode(premise, add_special_tokens = False)
  hypothesis_ids = nli_tokenizer.encode(hypothesis, add_special_tokens = False)
  cls_id = nli_tokenizer.cls_token_id
  sep_id = nli_tokenizer.sep_token_id
  pair_ids = [cls_id] + premise_ids + [sep_id] + hypothesis_ids + [sep_id]

  # Segment 0 covers [CLS] + premise + [SEP]; segment 1 covers hypothesis + [SEP].
  segment_row = [0] * (len(premise_ids) + 2) + [1] * (len(hypothesis_ids) + 1)
  # Every position is a real token, so the mask is all ones.
  mask_row = [1] * (len(premise_ids) + len(hypothesis_ids) + 3)

  # A batch holding the single pair.
  input_ids = torch.tensor(pair_ids).unsqueeze(0)
  token_type_ids = torch.tensor(segment_row).unsqueeze(0)
  attention_mask = torch.tensor(mask_row).unsqueeze(0)

  # Forward pass
  with torch.no_grad():
    output = nli_model(input_ids,
                       token_type_ids=token_type_ids,
                       attention_mask=attention_mask)

  # Output prediction
  prediction = int(np.argmax(output.logits.cpu().numpy()))
  labels = {0: "Entailment", 1: "Contradiction", 2: "Neutral"}

  return labels.get(prediction, "")

# Entailment
def check_entailment(decomposed_elements):
  """Report element/sub-element pairs that `do_nli` does not classify as entailment.

  Args:
    decomposed_elements: dict mapping a decomposed element name to the list
      of names of its sub-elements.

  Returns:
    A report listing each non-entailed pair as "<element> and
    <sub-element>", one per line, or a message stating that there is no
    mismatch.
  """
  sentence_pairs = [
    [parent, child]
    for parent, children in decomposed_elements.items()
    for child in children
  ]

  non_matching_elements = [
    pair for pair in sentence_pairs
    if do_nli(pair[0], pair[1]) != "Entailment"
  ]

  if not non_matching_elements:
    return "There are no miss matched elements."

  listing = "\n".join(' and '.join(pair) for pair in non_matching_elements)
  return "The following elements are miss matching:\n" + listing

# Contradiction
def check_contradiction(elements_per_actor):
  """Report pairs of elements of the same actor that `do_nli` classifies as contradicting.

  Args:
    elements_per_actor: dict mapping an actor ID to a list of element names.

  Returns:
    A report listing each contradicting pair as "<first> and <second>", one
    per line, or "There are no contradicting elements." when none is found.
  """
  # Every unordered pair of elements belonging to the same actor.
  candidate_pairs = [
    [names[first], names[second]]
    for names in elements_per_actor.values()
    for first in range(len(names))
    for second in range(first + 1, len(names))
  ]

  # Check contradiction
  conflicting = [
    pair for pair in candidate_pairs
    if do_nli(pair[0], pair[1]) == "Contradiction"
  ]

  if not conflicting:
    return "There are no contradicting elements."

  listing = "\n".join(' and '.join(pair) for pair in conflicting)
  return "The following elements are contradicting:\n" + listing
# ##########################

# ************************* User Interface *************************

def identify_bad_smells(tgrl_file, selected_bad_smells, size_threshold, similarity_threshold):
  """Run the selected bad-smell checks on a TGRL file and return the combined report.

  Args:
    tgrl_file: uploaded file object, passed to `parse_tgrl`.
    selected_bad_smells: collection of the check labels to run.
    size_threshold: word-count threshold passed to the 'Size' check.
    similarity_threshold: score threshold passed to the 'Similar Elements' check.

  Returns:
    The reports of the selected checks, each followed by a blank line,
    concatenated in the fixed order of the table below.
  """
  tgrl_text = parse_tgrl(tgrl_file)
  elements, elements_per_actor, decomposed_elements = extract_elements(tgrl_text)

  # (label, check) pairs in report order; each check runs only when selected.
  checks = [
    ('Size', lambda: get_long_elements(elements, size_threshold)),
    ('Complexity', lambda: get_complex_sentences(elements)),
    ('Punctuations', lambda: get_punctuations(elements)),
    ('Actors Syntax', lambda: check_actor_syntax(elements['actors'])),
    ('Goals Syntax', lambda: check_goal_syntax(elements['goals'])),
    ('Softgoals Syntax', lambda: check_softgoal_syntax(elements['softGoals'])),
    ('Tasks Syntax', lambda: check_task_syntax(elements['tasks'])),
    ('Resources Syntax', lambda: check_resource_syntax(elements['resources'])),
    ('Similar Elements', lambda: get_similar_elements(elements_per_actor, similarity_threshold)),
    ('Spelling Mistakes', lambda: check_spelling(elements)),
    ('Goal-Subgoal Mismatch', lambda: check_entailment(decomposed_elements)),
    ('Contradicting Elements', lambda: check_contradiction(elements_per_actor)),
  ]

  sections = []
  for label, run_check in checks:
    if label in selected_bad_smells:
      sections.append(run_check() + "\n\n")

  return "".join(sections)


# Labels offered in the checkbox group; identify_bad_smells matches on these exact strings.
_BAD_SMELL_CHOICES = [
  "Size",
  "Complexity",
  "Punctuations",
  "Actors Syntax",
  "Goals Syntax",
  "Softgoals Syntax",
  "Tasks Syntax",
  "Resources Syntax",
  "Similar Elements",
  "Spelling Mistakes",
  "Goal-Subgoal Mismatch",
  "Contradicting Elements",
]

# Web form: file upload, check selection and the two thresholds map, in order,
# to the parameters of identify_bad_smells.
interface = gr.Interface(
  fn=identify_bad_smells,
  inputs=[
    gr.File(label="TGRL File"),
    gr.CheckboxGroup(_BAD_SMELL_CHOICES, label="Which bad smells you want to detect?"),
    gr.Slider(label="Size threshold", value=5, minimum=2, maximum=10, step=1),
    gr.Slider(label="Similarity threshold", value=0.9, minimum=0, maximum=1, step=0.1),
  ],
  outputs=[gr.Textbox(label="Detected bad smells:")],
  title="TGRL Bad Smells Detection",
  description="Upload your .xgrl file and we will find the bad smells for you!",
  theme=gr.themes.Soft(),
)


interface.launch(inline=False)