Spaces:

chris32
/

Text-Intelligence-Real-State

Sleeping

App Files Files Community

Christopher Román Jaimes committed on May 23, 2024

Commit

1ef8976

1 Parent(s): 518184e

fix: add cleaning post inference.

Browse files

Files changed (1) hide show

app.py +232 -8

app.py CHANGED Viewed

@@ -1,14 +1,225 @@
 import gradio as gr
 from gliner import GLiNER
 model = GLiNER.from_pretrained("chris32/gliner_multi_pii_real_state-v2")
 model.eval()
-#text = """
-#Casa en venta Valle de San Ángel, San Pedro Garza García**Para remodelar o demoler Casa de 1220 m2 de terreno y 1400 m2 de construcciónCasa de 3 niveles-Sala-Comedor-Cocina-Estancia-Preparación para alberca-Cochera para 3 autos techada -4 recamaras -Lavandería"Esta es una de las varias opciones que tenemos para ti. Somos una agencia de bienes raíces especializada en la venta y renta de vivienda residencial, te brindamos un servicio personalizado y de alta calidad. Si necesitas ayuda para comprar o rentar, contáctanos y uno de nuestros asesores te atenderá.
-#"""
-#for entity in entities:
-#    print(entity["text"], "=>", entity["label"])
 def generate_answer(text):
     labels = [
@@ -24,10 +235,23 @@ def generate_answer(text):
     'NOMBRE_CONDOMINIO',
     'AÑO_REMODELACIÓN'
     ]
     entities = model.predict_entities(text, labels, threshold=0.4)
-    result_dict = entities
-    return result_dict
 # Cambiar a entrada de texto
 #text_input = gr.inputs.Textbox(lines=15, label="Input Text")

+# Datetime
+import datetime
+# Manipulate
+import os
+import re
+import json
+import numpy as np
+import pandas as pd
+# App
 import gradio as gr
+# GLiNER Model
 from gliner import GLiNER
+# Load Model
 model = GLiNER.from_pretrained("chris32/gliner_multi_pii_real_state-v2")
 model.eval()
# Global settings used by the post-inference cleaning step.
# A remodeling year is accepted only when it lies fewer than this many years in the past.
YEAR_OF_REMODELING_LIMIT = 100
# Calendar year, captured once when the module is imported.
CURRENT_YEAR = datetime.date.today().year
# Score limit for name similarity (not referenced by the visible code).
SCORE_LIMIT_SIMILARITY_NAMES = 70
def format_gliner_predictions(prediction):
    """Flatten raw entity predictions into a single ``pred_*`` / ``prob_*`` mapping.

    For every distinct ``label`` only the entry with the highest ``score`` is
    kept. The result holds one ``pred_<label>`` key with the entity text and one
    ``prob_<label>`` key with its score; all ``pred_*`` keys come first,
    followed by all ``prob_*`` keys, each group ordered by descending score.

    Args:
        prediction: Sequence of dicts that each provide at least the keys
            ``"label"``, ``"text"`` and ``"score"``.

    Returns:
        The flattened dict, or an empty dict when ``prediction`` is empty.
    """
    if len(prediction) == 0:
        return {}

    # Highest score first, so keeping the first row per label keeps the best one.
    ranked = pd.DataFrame(prediction).sort_values("score", ascending=False)
    best_per_label = ranked.drop_duplicates(subset="label", keep="first").set_index("label")

    texts = best_per_label["text"].to_dict()
    scores = best_per_label["score"].to_dict()

    flattened = {f"pred_{label}": text for label, text in texts.items()}
    flattened.update({f"prob_{label}": score for label, score in scores.items()})
    return flattened
def clean_prediction(row, feature_name, threshols_dict, clean_functions_dict):
    """Return the cleaned prediction for one feature when its score is high enough.

    Args:
        row: Mapping that provides ``pred_<feature_name>`` (raw prediction) and
            ``prob_<feature_name>`` (its score).
        feature_name: Name of the feature to clean.
        threshols_dict: Mapping from feature name to the score that must be exceeded.
        clean_functions_dict: Mapping from feature name to its cleaning callable.

    Returns:
        The cleaning callable's result when the score is strictly greater than
        the feature's threshold; otherwise ``None``.
    """
    raw_value = row[f"pred_{feature_name}"]
    score = row[f"prob_{feature_name}"]

    # The cleaner is looked up only once the score has cleared the threshold.
    if not score > threshols_dict[feature_name]:
        return None
    return clean_functions_dict[feature_name](raw_value)
# Substrings that, when present in a lower-cased surface prediction, make the
# surface extractor discard that prediction.
surfaces_words_to_omit = [
    "ha",
    "hect",
    "lts",
    "litros",
    "mil",
]
# Key-word stems for tower names (not referenced by the visible code).
tower_name_key_words_to_keep = [
    "torr",
    "towe",
]
def has_number(string):
    """Tell whether ``string`` contains at least one digit character."""
    return re.search(r'\d', string) is not None
def contains_multiplication(string):
    """Tell whether ``string`` contains a "number x number" expression.

    A match is a number (digits and commas, optional decimal part), optionally
    followed by words, then ``x`` or ``X``, then a second number, e.g.
    ``"10 x 20"``, ``"10x20"`` or ``"5 m x 3 m"``.

    Args:
        string: Text to inspect.

    Returns:
        ``True`` when such an expression is found, ``False`` otherwise.
    """
    # The optional "words" part is written as ``(?:\w[\w\s]*)?`` rather than
    # ``(?:\w+\s*)*``. Both accept exactly the same text (empty, or a run of
    # word/space characters starting with a word character), but the nested
    # quantifier form backtracks exponentially on a long word without an "x".
    pattern = (
        r'\b([\d,]+(?:\.\d+)?)\s*(?:\w[\w\s]*)?'
        r'[xX]\s*([\d,]+(?:\.\d+)?)\s*(?:\w[\w\s]*)?\b'
    )
    return re.search(pattern, string) is not None
def extract_first_number_from_string(text):
    """Locate the first number in ``text``.

    Args:
        text: Value to search; anything that is not a ``str`` yields no result.

    Returns:
        A ``(number, start, end)`` tuple where ``number`` is the matched text
        converted through ``float`` and truncated to ``int``, and ``start`` /
        ``end`` are the match offsets in ``text``. ``(None, None, None)`` is
        returned when ``text`` is not a string or holds no number.
    """
    nothing_found = (None, None, None)
    if not isinstance(text, str):
        return nothing_found

    hit = re.search(r'\b\d*\.?\d+\b|\d*\.?\d+', text)
    if hit is None:
        return nothing_found
    return int(float(hit.group())), hit.start(), hit.end()
def get_character(string, index):
    """Return ``string[index]``, or ``None`` when ``index`` is not below ``len(string)``."""
    return string[index] if index < len(string) else None
def find_valid_comma_separated_number(string):
    """Parse a leading ``D,DDD``-style number (one thousands separator) from ``string``.

    The string must start with 1 to 3 digits, a comma and exactly 3 digits, and
    that group must be followed by the end of the string or by a character that
    is neither a digit nor a comma.

    Returns:
        The number as an ``int`` with the comma removed, or ``None`` when the
        string does not start with such a number.
    """
    hit = re.match(r'^(\d{1,3},\d{3})(?:[^0-9,]|$)', string)
    if hit is None:
        return None
    digits_only = hit.group(1).replace(",", "")
    return int(digits_only)
def extract_surface_from_string(string: str) -> "int | None":
    """Extract a surface value, as an integer, from a predicted text span.

    The prediction is rejected (``None``) when it is not a string, has no
    digit, looks like a multiplication ("10 x 20"), or contains one of the
    ``surfaces_words_to_omit`` substrings. Otherwise the first number is
    returned; when that number is immediately followed by a comma, the text is
    parsed as a single thousands-separated number ("1,200" -> 1200) and
    ``None`` is returned if it does not have that form.

    Args:
        string: Raw predicted text, e.g. ``"1,220 m2"``.

    Returns:
        The surface as an ``int``, or ``None`` when no valid value is found.
    """
    if not isinstance(string, str):
        return None
    # 1. There must be at least one digit.
    if not has_number(string):
        return None
    # 2. Dimension expressions such as "10 x 20" are not a single surface.
    if contains_multiplication(string):
        return None
    # 3. Discard predictions containing an omitted word.
    # NOTE(review): this is a plain substring test, so short entries such as
    # "ha" also match inside longer words - confirm this is intended.
    lowered = string.lower()
    if any(word in lowered for word in surfaces_words_to_omit):
        return None
    # 4. Take the first number and its position.
    number, start_pos, end_pos = extract_first_number_from_string(string)
    if not isinstance(number, int):
        return None
    # 5. A comma right after the number means a thousands separator. Parse
    # from the number's start to the END of the string: slicing with ``-1``
    # dropped the last character, so "1,200" was rejected and "1,2000" was
    # wrongly read as 1200.
    if get_character(string, end_pos) == ",":
        return find_valid_comma_separated_number(string[start_pos:])
    return number
def clean_prediction(row, feature_name, threshols_dict, clean_functions_dict):
    """Clean one feature's prediction if its score exceeds the feature threshold.

    NOTE: a function with this same name and body is defined earlier in this
    file; this later definition is the one in effect after import.

    Args:
        row: Mapping with the keys ``pred_<feature_name>`` (raw prediction) and
            ``prob_<feature_name>`` (its score).
        feature_name: Name of the feature to clean.
        threshols_dict: Mapping from feature name to the score that must be exceeded.
        clean_functions_dict: Mapping from feature name to its cleaning callable.

    Returns:
        The cleaned prediction when the score is strictly greater than the
        threshold; otherwise ``None``.
    """
    raw_value = row[f"pred_{feature_name}"]
    score = row[f"prob_{feature_name}"]

    score_is_high_enough = score > threshols_dict[feature_name]
    if score_is_high_enough:
        cleaner = clean_functions_dict[feature_name]
        return cleaner(raw_value)
    return None
+def calculate_metrics(X, feature_name, data_type):
+    true_positives = 0
+    true_negatives = 0
+    false_positives = 0
+    false_negatives = 0
+    for pred, true in zip(X[f"clean_pred_{feature_name}"], X[f"clean_{feature_name}"]):
+        if isinstance(pred, data_type):
+            if isinstance(true, data_type):
+                if pred == true:
+                    true_positives += 1
+                else:
+                    false_positives += 1
+            else:
+                false_positives += 1
+        else:
+            if isinstance(true, data_type):
+                false_negatives += 1
+            else:
+                true_negatives += 1
+    # Calculate Metrics
+    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) != 0 else np.nan
+    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) != 0 else np.nan
+    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else np.nan
+    metrics = {
+        "precision": precision,
+        "recall": recall,
+        "f1_score": f1_score,
+    }
+    return metrics
def extract_remodeling_year_from_string(string):
    """Extract a remodeling year from a predicted text span.

    A standalone 4-digit number is taken as the year. Failing that, a quantity
    followed by a single space and a "year"-like word (e.g. "5 años") is read
    as a number of years ago and subtracted from ``CURRENT_YEAR``.

    Args:
        string: Raw predicted text; anything that is not a ``str`` yields ``None``.

    Returns:
        The year as an ``int`` when it is not in the future and is fewer than
        ``YEAR_OF_REMODELING_LIMIT`` years old; otherwise ``None``.
    """
    if not isinstance(string, str):
        return None

    # 1. Prefer an explicit 4-digit year.
    explicit_year = re.search(r'\b\d{4}\b', string)
    if explicit_year:
        candidate = int(explicit_year.group())
    else:
        # 2. Fall back to "<n> years"-style wording.
        elapsed = re.search(r'(\d+) (year|years|anio|año|an|añ)', string.lower(), re.IGNORECASE)
        if not elapsed:
            return None
        candidate = CURRENT_YEAR - int(elapsed.group(1))

    # 3. Accept only years that are not in the future and are recent enough.
    if candidate <= CURRENT_YEAR and YEAR_OF_REMODELING_LIMIT > CURRENT_YEAR - candidate:
        return candidate
    return None
# Cleaning
# Cleaning function applied to each entity label: surfaces are parsed to an
# integer, the remodeling year to a year, and names are passed through as-is.
clean_functions_dict = {
    **dict.fromkeys(
        (
            "SUPERFICIE_TERRAZA",
            "SUPERFICIE_JARDIN",
            "SUPERFICIE_TERRENO",
            "SUPERFICIE_HABITABLE",
            "SUPERFICIE_BALCON",
        ),
        extract_surface_from_string,
    ),
    "AÑO_REMODELACIÓN": extract_remodeling_year_from_string,
    **{
        name: (lambda x: x)
        for name in (
            "NOMBRE_COMPLETO_ARQUITECTO",
            "NOMBRE_CLUB_GOLF",
            "NOMBRE_TORRE",
            "NOMBRE_CONDOMINIO",
            "NOMBRE_DESARROLLO",
        )
    },
}
# Score each label's prediction must exceed to be cleaned and kept; every
# label uses the same value.
threshols_dict = dict.fromkeys(clean_functions_dict, 0.9)
 def generate_answer(text):
     labels = [
     'NOMBRE_CONDOMINIO',
     'AÑO_REMODELACIÓN'
     ]
+    # Inference
     entities = model.predict_entities(text, labels, threshold=0.4)
+    # Format Prediction Entities
+    entities_formatted = format_gliner_predictions(entities)
+    # Clean Entities
+    entities_names = list({c.replace("pred_", "").replace("prob_", "") for c in list(entities_formatted.keys())})
+    entities_cleaned = dict()
+    for feature_name in entities_names:
+        entity_prediction_cleaned = clean_prediction(entities_formatted, feature_name, threshols_dict, clean_functions_dict)
+        if isinstance(entity_prediction_cleaned, str) or isinstance(entity_prediction_cleaned, int):
+            entities_cleaned[feature_name] = entity_prediction_cleaned
+    result_json = json.dumps(entities_cleaned, indent = 4, ensure_ascii = False)
+    return result_json
 # Cambiar a entrada de texto
 #text_input = gr.inputs.Textbox(lines=15, label="Input Text")