File size: 13,394 Bytes
1ef8976
 
 
 
 
 
 
a54c5a9
1ef8976
a54c5a9
7f254c6
 
a54c5a9
7f254c6
a54c5a9
 
 
7f254c6
 
 
 
1ef8976
 
 
 
 
40d9906
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ef8976
 
 
 
 
 
 
ac89c56
 
 
1ef8976
 
 
ac89c56
1ef8976
 
 
 
ac89c56
 
1ef8976
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ac89c56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ef8976
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a54c5a9
a2897e4
 
 
 
abdb6f5
b531590
0579617
a2897e4
 
4193cf5
a2897e4
 
 
 
7f254c6
 
 
 
 
 
a6006b3
7f254c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a54c5a9
 
 
 
 
 
 
 
 
 
 
 
 
 
40d9906
 
 
 
1ef8976
a54c5a9
1ef8976
 
 
 
ac89c56
 
 
1683f88
 
ac89c56
1ef8976
c52a63e
1ef8976
 
 
 
 
7f254c6
 
 
 
 
 
9c88bfd
 
7f254c6
 
 
9c88bfd
7f254c6
1ef8976
 
e2740ad
a54c5a9
 
89e9fa4
a54c5a9
 
 
4d0d80e
a54c5a9
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
# Datetime
import datetime
# Manipulate
import re
import json
import pandas as pd
# App
import gradio as gr
# GLiNER Model
from gliner import GLiNER
# Transformers
from transformers import pipeline

# Load GLiNER Model
# Multilingual PII / real-estate entity extractor; eval() switches to inference mode.
model = GLiNER.from_pretrained("chris32/gliner_multi_pii_real_state-v2")
model.eval()

# BERT Model
# Spanish DistilBERT classifier; used below to predict NIVELES_CASA (house levels).
model_name = "chris32/distilbert-base-spanish-uncased-finetuned-text-intelligence"
pipe = pipeline(model = model_name, device = "cpu")

# Global Variables: For Post Cleaning Inferences
# Maximum accepted age (in years) of a predicted remodeling year.
YEAR_OF_REMODELING_LIMIT = 100
# Current calendar year, used to convert "N years ago" into a year.
CURRENT_YEAR = int(datetime.date.today().year)
# NOTE(review): not referenced anywhere in this file's visible code — confirm before removing.
SCORE_LIMIT_SIMILARITY_NAMES = 70

def clean_text(text):
    """Normalize raw listing text: strip HTML artifacts and collapse whitespace."""
    # Turn HTML line breaks into a visible separator token.
    text = re.sub(r'<br\s*\/?>', " # ", text)

    # Strip remaining HTML tags, then decode the common HTML entities.
    cleaned = re.sub(r'<[^>]*>', '', text)
    cleaned = re.sub(r'&nbsp;', ' ', cleaned)
    cleaned = re.sub(r'&amp;', '&', cleaned)

    # Collapse whitespace runs into single spaces and trim both ends.
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    # Collapse doubled punctuation left behind by the substitutions above.
    return cleaned.replace("..", ".").replace(",,", ",")

def format_gliner_predictions(prediction):
    """Flatten GLiNER span predictions into a single dict.

    Keeps only the highest-scoring span per label and emits three keys per
    label: ``pred_<label>`` (text), ``prob_<label>`` (score) and
    ``pos_<label>`` ((start, end)). Returns an empty dict for no predictions.
    """
    if not prediction:
        return dict()

    # Keep the best-scoring span for each label.
    best = pd.DataFrame(prediction) \
             .sort_values("score", ascending=False) \
             .drop_duplicates(subset="label", keep="first")

    # Build the three key groups separately so the merged dict keeps the
    # pred_* / prob_* / pos_* grouping (mirrors the raw-output JSON layout).
    texts, scores, spans = {}, {}, {}
    for record in best.to_dict("records"):
        label = record["label"]
        texts[f"pred_{label}"] = record["text"]
        scores[f"prob_{label}"] = record["score"]
        spans[f"pos_{label}"] = (record["start"], record["end"])

    return {**texts, **scores, **spans}
    
def clean_prediction(row, feature_name, threshols_dict, clean_functions_dict):
    """Apply the feature's cleaning function when its prediction clears the threshold.

    NOTE(review): an identical definition of this function appears again later
    in this file; that later definition is the one bound at runtime, making
    this copy dead code — consider removing one of the two.
    """
    # Prediction and Probability
    prediction = row[f"pred_{feature_name}"]
    prob = row[f"prob_{feature_name}"]
    
    # Clean and Return Prediction only if the Threshold is lower.
    if prob > threshols_dict[feature_name]:
        clean_function = clean_functions_dict[feature_name]
        prediction_clean = clean_function(prediction)
        return prediction_clean
    else:
        return None
    
# Surface predictions containing any of these substrings are discarded —
# they indicate non-m2 units (hectares, liters) or "mil" multipliers.
surfaces_words_to_omit = ["ha", "hect", "lts", "litros", "mil"]
# Substrings identifying tower names ("torre"/"tower" prefixes).
# NOTE(review): not referenced in this file's visible code — confirm before removing.
tower_name_key_words_to_keep = ["torr", "towe"]

def has_number(string):
    """Return True when *string* contains at least one digit."""
    return re.search(r'\d', string) is not None

def contains_multiplication(string):
    """Return True when *string* looks like a dimension product, e.g. "10 x 20".

    Matches two (possibly comma/decimal) numbers joined by an 'x'/'X', with
    optional words in between ("10 mts x 20 mts"). Used to reject dimension
    expressions that are not a single surface value.
    """
    # Two numbers separated by an x/X, each optionally followed by unit words.
    pattern = r'\b([\d,]+(?:\.\d+)?)\s*(?:\w+\s*)*[xX]\s*([\d,]+(?:\.\d+)?)\s*(?:\w+\s*)*\b'
    # bool(...) replaces the manual if/else that returned True/False.
    return bool(re.search(pattern, string))

def extract_first_number_from_string(text):
    """Locate the first number in *text*.

    Returns (value, start, end) where value is the matched number truncated
    to int; returns (None, None, None) for non-string input or no match.
    """
    if not isinstance(text, str):
        return None, None, None

    match = re.search(r'\b\d*\.?\d+\b|\d*\.?\d+', text)
    if match is None:
        return None, None, None

    # Parse via float first so decimals like "12.5" truncate to 12.
    value = int(float(match.group()))
    return value, match.start(), match.end()
    
def get_character(string, index):
    """Return the character at *index*, or None when the index is out of range."""
    if index < len(string):
        return string[index]
    return None
    
def find_valid_comma_separated_number(string):
    """Parse a leading thousands-separated number ("1,500") into an int.

    The string must begin with 1-3 digits, a comma, then exactly 3 digits,
    followed by either end-of-string or a non-digit, non-comma character.
    Returns None when that shape is not present.
    """
    match = re.match(r'^(\d{1,3},\d{3})(?:[^0-9,]|$)', string)
    if not match:
        return None
    return int(match.group(1).replace(",", ""))

def extract_surface_from_string(string: str) -> int:
    """Extract an integer surface area from a free-text surface description.

    Returns None when the input is not a string, contains no number, looks
    like a dimension product ("10 x 20"), or mentions a unit/word we must
    ignore (hectares, liters, "mil").
    """
    if not isinstance(string, str):
        return None

    # 1. Must contain at least one digit.
    if not has_number(string):
        return None

    # 2. Dimension products ("10 x 20") are not a single area value.
    if contains_multiplication(string):
        return None

    # 3. Reject non-m2 units and "mil" multipliers.
    lowered = string.lower()
    if any(word in lowered for word in surfaces_words_to_omit):
        return None

    # 4. Take the first number in the text.
    number, start_pos, end_pos = extract_first_number_from_string(string)
    if not isinstance(number, int):
        return None

    # 5. A comma right after the number may be a thousands separator
    #    ("1,500"): re-parse from the number's start to recover the full value.
    if get_character(string, end_pos) == ",":
        # BUG FIX: the slice previously ended at -1, dropping the final
        # character and breaking inputs that end with the number (e.g.
        # "1,500" was sliced to "1,50" and parsed as None). The downstream
        # regex already accepts end-of-string, so no trailing trim is needed.
        return find_valid_comma_separated_number(string[start_pos:])
    return number
    
def clean_prediction(row, feature_name, threshols_dict, clean_functions_dict):
    """Run the feature's cleaning function on its raw predicted value.

    Returns the cleaned value when the prediction's probability strictly
    exceeds the feature's threshold, otherwise None.
    """
    raw_value = row[f"pred_{feature_name}"]
    confidence = row[f"prob_{feature_name}"]

    # Negated form of the original `prob > threshold` guard so that any
    # non-comparable confidence value is rejected exactly as before.
    if not (confidence > threshols_dict[feature_name]):
        return None

    return clean_functions_dict[feature_name](raw_value)

def extract_remodeling_year_from_string(string):
    """Derive a remodeling calendar year from free text.

    Accepts either an explicit 4-digit year or an age phrase such as
    "5 años" / "5 years" (converted via CURRENT_YEAR). Returns None for
    non-string input, when nothing matches, or when the derived year is in
    the future or older than YEAR_OF_REMODELING_LIMIT years.
    """
    if not isinstance(string, str):
        return None

    # 1. Prefer an explicit 4-digit year anywhere in the text.
    year_match = re.search(r'\b\d{4}\b', string)
    if year_match:
        year = int(year_match.group())
    else:
        # 2. Otherwise interpret "<n> year(s)/anio/año" as an age in years.
        age_match = re.search(r'(\d+) (year|years|anio|año|an|añ)', string.lower(), re.IGNORECASE)
        if age_match is None:
            return None
        year = CURRENT_YEAR - int(age_match.group(1))

    # 3. Keep only plausible years: not in the future, not absurdly old.
    if year > CURRENT_YEAR or CURRENT_YEAR - year >= YEAR_OF_REMODELING_LIMIT:
        return None
    return year

def extract_valid_string_left_dotted(string, text, pos):
    """Re-attach digits that the entity span cut off at a decimal point.

    If the predicted *string* starts with a digit and the full *text* has
    "<digit>." immediately to its left (the span apparently starts inside a
    decimal number such as "120.5"), try to extend the span left over the
    integer part; otherwise return *string* unchanged.

    pos -- (start, end) character offsets of *string* within *text*.
    Returns the extended substring, the original string, or None (non-string
    input, span too close to the start of the text, or an ambiguous left
    context that fails the pattern check below).
    """
    if isinstance(string, str):
        # Unpack the span of the prediction inside the full text.
        left_pos, rigth_pos = pos

        # Not enough left context to inspect (needs 5 characters of lookback).
        if left_pos < 5:
            return None

        if string[0].isdigit():
            # 1. Take a sub-text with 5 extra characters to the left of the span.
            sub_text = text[left_pos - 5: rigth_pos]

            # 2. No dot immediately to the left: the span is already whole.
            if text[left_pos - 1] == ".":

                # 3. A left dot with no digit before it is sentence punctuation,
                #    so the original span is kept as-is.
                if text[left_pos - 2].isdigit():

                    # 4. Extend left over "1-3 digits + dot", but only when the
                    #    sub-text does not itself start with a digit, '.' or ','
                    #    (which would mean the number extends even further left).
                    # NOTE(review): match.group(0) also includes any non-digit
                    #    characters matched by \D* before the integer part —
                    #    confirm callers tolerate that leading context.
                    pattern = r'^(?![\d.,])\D*\d{1,3}\.' + re.escape(string)
                    match = re.search(pattern, sub_text)
                    if match:
                        return match.group(0)
                    else:
                        return None
                else:
                    return string
            else:
                return string
        else:
            return string
    else:
        return None
    
# Cleaning
# Maps each GLiNER label to the post-processing function applied to its raw
# predicted text: surface labels are parsed into integer areas, the
# remodeling year is parsed/validated, and name-like labels pass through.
clean_functions_dict = {
    "SUPERFICIE_TERRAZA": extract_surface_from_string,
    "SUPERFICIE_JARDIN": extract_surface_from_string,
    "SUPERFICIE_TERRENO": extract_surface_from_string,
    "SUPERFICIE_HABITABLE": extract_surface_from_string,
    "SUPERFICIE_BALCON": extract_surface_from_string,
    "AÑO_REMODELACIÓN": extract_remodeling_year_from_string, 
    "NOMBRE_COMPLETO_ARQUITECTO": lambda x: x,
    'NOMBRE_CLUB_GOLF': lambda x: x, 
    'NOMBRE_TORRE': lambda x: x,
    'NOMBRE_CONDOMINIO': lambda x: x,
    'NOMBRE_DESARROLLO': lambda x: x,
}

# Per-label probability thresholds: a prediction is kept only when its score
# strictly exceeds its label's threshold (uniform 0.9 baseline).
# NOTE(review): dead at runtime — this dict is immediately re-assigned with
# tuned values just below; consider deleting one of the two assignments.
threshols_dict = {
    "SUPERFICIE_TERRAZA": 0.9,
    "SUPERFICIE_JARDIN": 0.9,
    "SUPERFICIE_TERRENO": 0.9,
    "SUPERFICIE_HABITABLE": 0.9,
    "SUPERFICIE_BALCON": 0.9,
    "AÑO_REMODELACIÓN": 0.9,
    "NOMBRE_COMPLETO_ARQUITECTO": 0.9,
    'NOMBRE_CLUB_GOLF': 0.9, 
    'NOMBRE_TORRE': 0.9,
    'NOMBRE_CONDOMINIO': 0.9,
    'NOMBRE_DESARROLLO': 0.9,
}

# Tuned per-label probability thresholds (apparently selected offline; the
# arithmetic offsets are manual loosening adjustments). This assignment is
# the one actually in effect at runtime.
threshols_dict = {
    "SUPERFICIE_BALCON": 0.7697697697697697,
    "SUPERFICIE_TERRAZA": 0.953953953953954,
    "SUPERFICIE_JARDIN": 0.9519519519519519,  # TODO(review): original note flagged this value as uncertain
    "SUPERFICIE_TERRENO": 0.980980980980981 - 0.05,
    "SUPERFICIE_HABITABLE": 0.978978978978979 - 0.02,  # alternative considered: 0.988988988988989
    "AÑO_REMODELACIÓN": 0.996996996996997 - 0.01,
    "NOMBRE_COMPLETO_ARQUITECTO": 0.8878878878878879,
    "NOMBRE_CLUB_GOLF": 0.8708708708708709,  # alternative considered: 0.9729729729729729
    "NOMBRE_TORRE": 0.8458458458458459 - 0.04,
    "NOMBRE_CONDOMINIO": 0.965965965965966,
    "NOMBRE_DESARROLLO": 0.9229229229229229
}

# Maps the BERT classifier's raw output labels to the number of house levels
# (LABEL_0 means "no level detected" and is dropped from the clean result).
label_names_dict = {
    'LABEL_0': None, 
    'LABEL_1': 1,
    'LABEL_2': 2, 
    'LABEL_3': 3,
}
# Minimum classifier score required to accept a NIVELES_CASA prediction.
BERT_SCORE_LIMIT = 0.980819808198082

def extract_max_label_score(probabilities):
    """Return (label, score) of the highest-scoring classifier entry.

    probabilities -- iterable of {"label": ..., "score": ...} dicts.
    """
    best = max(probabilities, key=lambda entry: entry['score'])
    return best['label'], best['score']

def clean_prediction_bert(label, score):
    """Map a BERT label to its house-level value when confidence is high enough.

    Returns None for low-confidence predictions or unknown labels.
    """
    if score > BERT_SCORE_LIMIT:
        return label_names_dict.get(label)
    return None
    
# BERT Inference Config
# Keyword arguments forwarded to the transformers pipeline call.
# NOTE(review): "return_all_scores" is deprecated in newer transformers
# releases in favor of "top_k=None" — confirm against the installed version.
pipe_config = {
    "batch_size": 8,
    "truncation": True,           # clip inputs longer than max_length
    "max_length": 250,
    "add_special_tokens": True,
    "return_all_scores": True,    # extract_max_label_score needs every label's score
    "padding": True,
}

def generate_answer(text):
    """End-to-end inference: GLiNER entity extraction + BERT level classification.

    Cleans the input text, extracts and post-processes the real-estate
    entities, classifies the number of house levels, and returns a display
    string with the cleaned (thresholded, parsed) entities followed by the
    raw model outputs, both JSON-formatted.
    """
    labels = [
        'SUPERFICIE_JARDIN',
        'NOMBRE_CLUB_GOLF',
        'SUPERFICIE_TERRENO',
        'SUPERFICIE_HABITABLE',
        'SUPERFICIE_TERRAZA',
        'NOMBRE_COMPLETO_ARQUITECTO',
        'SUPERFICIE_BALCON',
        'NOMBRE_DESARROLLO',
        'NOMBRE_TORRE',
        'NOMBRE_CONDOMINIO',
        'AÑO_REMODELACIÓN',
    ]

    # Normalize the raw listing text before any model sees it.
    text = clean_text(text)

    # GLiNER inference, then flatten to pred_/prob_/pos_ keys per label.
    entities = model.predict_entities(text, labels, threshold=0.4)
    entities_formatted = format_gliner_predictions(entities)

    # Surface spans cut at a decimal point are re-extended from the full text.
    feature_surfaces = ['SUPERFICIE_BALCON', 'SUPERFICIE_TERRAZA', 'SUPERFICIE_JARDIN', 'SUPERFICIE_TERRENO', 'SUPERFICIE_HABITABLE']
    for feature_name in feature_surfaces:
        # `is not None` replaces the non-idiomatic `!= None` comparison.
        if entities_formatted.get(f"pred_{feature_name}") is not None:
            entities_formatted[f"pred_{feature_name}"] = extract_valid_string_left_dotted(
                entities_formatted[f"pred_{feature_name}"], text, entities_formatted[f"pos_{feature_name}"]
            )

    # Threshold + parse each predicted entity; keep only usable str/int values.
    entities_names = list({c.replace("pred_", "").replace("prob_", "").replace("pos_", "") for c in entities_formatted})
    entities_cleaned = dict()
    for feature_name in entities_names:
        entity_prediction_cleaned = clean_prediction(entities_formatted, feature_name, threshols_dict, clean_functions_dict)
        if isinstance(entity_prediction_cleaned, (str, int)):
            entities_cleaned[feature_name] = entity_prediction_cleaned

    # BERT inference for the number of house levels (NIVELES_CASA).
    predictions = pipe([text], **pipe_config)
    label, score = extract_max_label_score(predictions[0])
    entities_formatted["NIVELES_CASA"] = label
    entities_formatted["prob_NIVELES_CASA"] = score
    prediction_cleaned = clean_prediction_bert(label, score)
    if isinstance(prediction_cleaned, int):
        entities_cleaned["NIVELES_CASA"] = prediction_cleaned

    result_json = json.dumps(entities_cleaned, indent = 4, ensure_ascii = False)

    return "Clean Result:" + result_json + "\n \n" + "Raw Result:" + json.dumps(entities_formatted, indent = 4, ensure_ascii = False)

# Change to a multi-line text input (kept for reference):
#text_input = gr.inputs.Textbox(lines=15, label="Input Text")

# Gradio UI: one free-text input, one text output (clean + raw JSON results).
# NOTE(review): the title's "Real State" is likely a typo for "Real Estate",
# but it is a user-facing string — confirm before changing.
iface = gr.Interface(
    fn=generate_answer, 
    inputs="text", 
    outputs="text",
    title="Text Intelligence for Real State",
    description="Input text describing the property."
)

# Launch the web app (blocking call).
iface.launch()