File size: 13,394 Bytes
1ef8976
 
 
 
 
 
 
a54c5a9
1ef8976
a54c5a9
7f254c6
 
a54c5a9
7f254c6
a54c5a9
 
 
7f254c6
 
 
 
1ef8976
 
 
 
 
40d9906
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ef8976
 
 
 
 
 
 
ac89c56
 
 
1ef8976
 
 
ac89c56
1ef8976
 
 
 
ac89c56
 
1ef8976
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ac89c56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ef8976
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a54c5a9
a2897e4
 
 
 
abdb6f5
b531590
0579617
a2897e4
 
4193cf5
a2897e4
 
 
 
7f254c6
 
 
 
 
 
a6006b3
7f254c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a54c5a9
 
 
 
 
 
 
 
 
 
 
 
 
 
40d9906
 
 
 
1ef8976
a54c5a9
1ef8976
 
 
 
ac89c56
 
 
1683f88
 
ac89c56
1ef8976
c52a63e
1ef8976
 
 
 
 
7f254c6
 
 
 
 
 
9c88bfd
 
7f254c6
 
 
9c88bfd
7f254c6
1ef8976
 
e2740ad
a54c5a9
 
89e9fa4
a54c5a9
 
 
4d0d80e
a54c5a9
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
# Datetime
import datetime
# Manipulate
import re
import json
import pandas as pd
# App
import gradio as gr
# GLiNER Model
from gliner import GLiNER
# Transformers
from transformers import pipeline

# Load GLiNER Model
# Multilingual PII / real-estate entity extractor; eval() switches to inference mode.
model = GLiNER.from_pretrained("chris32/gliner_multi_pii_real_state-v2")
model.eval()

# BERT Model
# Spanish DistilBERT classifier; used below to predict NIVELES_CASA (house levels).
model_name = "chris32/distilbert-base-spanish-uncased-finetuned-text-intelligence"
pipe = pipeline(model = model_name, device = "cpu")

# Global Variables: For Post Cleaning Inferences
# Maximum accepted age (in years) of a predicted remodeling year.
YEAR_OF_REMODELING_LIMIT = 100
# Current calendar year, used to convert "N years ago" into a year.
CURRENT_YEAR = int(datetime.date.today().year)
# NOTE(review): not referenced anywhere in this file's visible code — confirm before removing.
SCORE_LIMIT_SIMILARITY_NAMES = 70

def clean_text(text):
    """Normalize raw listing text: strip HTML artifacts and collapse whitespace."""
    # Turn HTML line breaks into a visible separator token.
    text = re.sub(r'<br\s*\/?>', " # ", text)

    # Strip remaining HTML tags, then decode the common HTML entities.
    cleaned = re.sub(r'<[^>]*>', '', text)
    cleaned = re.sub(r'&nbsp;', ' ', cleaned)
    cleaned = re.sub(r'&amp;', '&', cleaned)

    # Collapse whitespace runs into single spaces and trim both ends.
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    # Collapse doubled punctuation left behind by the substitutions above.
    return cleaned.replace("..", ".").replace(",,", ",")

def format_gliner_predictions(prediction):
    """Flatten GLiNER span predictions into a single dict.

    Keeps only the highest-scoring span per label and emits three keys per
    label: ``pred_<label>`` (text), ``prob_<label>`` (score) and
    ``pos_<label>`` ((start, end)). Returns an empty dict for no predictions.
    """
    if not prediction:
        return dict()

    # Keep the best-scoring span for each label.
    best = pd.DataFrame(prediction) \
             .sort_values("score", ascending=False) \
             .drop_duplicates(subset="label", keep="first")

    # Build the three key groups separately so the merged dict keeps the
    # pred_* / prob_* / pos_* grouping (mirrors the raw-output JSON layout).
    texts, scores, spans = {}, {}, {}
    for record in best.to_dict("records"):
        label = record["label"]
        texts[f"pred_{label}"] = record["text"]
        scores[f"prob_{label}"] = record["score"]
        spans[f"pos_{label}"] = (record["start"], record["end"])

    return {**texts, **scores, **spans}
    
def clean_prediction(row, feature_name, threshols_dict, clean_functions_dict):
    """Apply the feature's cleaning function when its prediction clears the threshold.

    NOTE(review): an identical definition of this function appears again later
    in this file; that later definition is the one bound at runtime, making
    this copy dead code — consider removing one of the two.
    """
    # Prediction and Probability
    prediction = row[f"pred_{feature_name}"]
    prob = row[f"prob_{feature_name}"]
    
    # Clean and Return Prediction only if the Threshold is lower.
    if prob > threshols_dict[feature_name]:
        clean_function = clean_functions_dict[feature_name]
        prediction_clean = clean_function(prediction)
        return prediction_clean
    else:
        return None
    
# Surface predictions containing any of these substrings are discarded —
# they indicate non-m2 units (hectares, liters) or "mil" multipliers.
surfaces_words_to_omit = ["ha", "hect", "lts", "litros", "mil"]
# Substrings identifying tower names ("torre"/"tower" prefixes).
# NOTE(review): not referenced in this file's visible code — confirm before removing.
tower_name_key_words_to_keep = ["torr", "towe"]

def has_number(string):
    """Return True when *string* contains at least one digit."""
    return re.search(r'\d', string) is not None

def contains_multiplication(string):
    """Return True when *string* looks like a dimension product, e.g. "10 x 20".

    Matches two (possibly comma/decimal) numbers joined by an 'x'/'X', with
    optional words in between ("10 mts x 20 mts"). Used to reject dimension
    expressions that are not a single surface value.
    """
    # Two numbers separated by an x/X, each optionally followed by unit words.
    pattern = r'\b([\d,]+(?:\.\d+)?)\s*(?:\w+\s*)*[xX]\s*([\d,]+(?:\.\d+)?)\s*(?:\w+\s*)*\b'
    # bool(...) replaces the manual if/else that returned True/False.
    return bool(re.search(pattern, string))

def extract_first_number_from_string(text):
    """Locate the first number in *text*.

    Returns (value, start, end) where value is the matched number truncated
    to int; returns (None, None, None) for non-string input or no match.
    """
    if not isinstance(text, str):
        return None, None, None

    match = re.search(r'\b\d*\.?\d+\b|\d*\.?\d+', text)
    if match is None:
        return None, None, None

    # Parse via float first so decimals like "12.5" truncate to 12.
    value = int(float(match.group()))
    return value, match.start(), match.end()
    
def get_character(string, index):
    """Return the character at *index*, or None when the index is out of range."""
    if index < len(string):
        return string[index]
    return None
    
def find_valid_comma_separated_number(string):
    """Parse a leading thousands-separated number ("1,500") into an int.

    The string must begin with 1-3 digits, a comma, then exactly 3 digits,
    followed by either end-of-string or a non-digit, non-comma character.
    Returns None when that shape is not present.
    """
    match = re.match(r'^(\d{1,3},\d{3})(?:[^0-9,]|$)', string)
    if not match:
        return None
    return int(match.group(1).replace(",", ""))

def extract_surface_from_string(string: str) -> int:
    """Extract an integer surface area from a free-text surface description.

    Returns None when the input is not a string, contains no number, looks
    like a dimension product ("10 x 20"), or mentions a unit/word we must
    ignore (hectares, liters, "mil").
    """
    if not isinstance(string, str):
        return None

    # 1. Must contain at least one digit.
    if not has_number(string):
        return None

    # 2. Dimension products ("10 x 20") are not a single area value.
    if contains_multiplication(string):
        return None

    # 3. Reject non-m2 units and "mil" multipliers.
    lowered = string.lower()
    if any(word in lowered for word in surfaces_words_to_omit):
        return None

    # 4. Take the first number in the text.
    number, start_pos, end_pos = extract_first_number_from_string(string)
    if not isinstance(number, int):
        return None

    # 5. A comma right after the number may be a thousands separator
    #    ("1,500"): re-parse from the number's start to recover the full value.
    if get_character(string, end_pos) == ",":
        # BUG FIX: the slice previously ended at -1, dropping the final
        # character and breaking inputs that end with the number (e.g.
        # "1,500" was sliced to "1,50" and parsed as None). The downstream
        # regex already accepts end-of-string, so no trailing trim is needed.
        return find_valid_comma_separated_number(string[start_pos:])
    return number
    
def clean_prediction(row, feature_name, threshols_dict, clean_functions_dict):
    """Run the feature's cleaning function on its raw predicted value.

    Returns the cleaned value when the prediction's probability strictly
    exceeds the feature's threshold, otherwise None.
    """
    raw_value = row[f"pred_{feature_name}"]
    confidence = row[f"prob_{feature_name}"]

    # Negated form of the original `prob > threshold` guard so that any
    # non-comparable confidence value is rejected exactly as before.
    if not (confidence > threshols_dict[feature_name]):
        return None

    return clean_functions_dict[feature_name](raw_value)

def extract_remodeling_year_from_string(string):
    """Derive a remodeling calendar year from free text.

    Accepts either an explicit 4-digit year or an age phrase such as
    "5 años" / "5 years" (converted via CURRENT_YEAR). Returns None for
    non-string input, when nothing matches, or when the derived year is in
    the future or older than YEAR_OF_REMODELING_LIMIT years.
    """
    if not isinstance(string, str):
        return None

    # 1. Prefer an explicit 4-digit year anywhere in the text.
    year_match = re.search(r'\b\d{4}\b', string)
    if year_match:
        year = int(year_match.group())
    else:
        # 2. Otherwise interpret "<n> year(s)/anio/año" as an age in years.
        age_match = re.search(r'(\d+) (year|years|anio|año|an|añ)', string.lower(), re.IGNORECASE)
        if age_match is None:
            return None
        year = CURRENT_YEAR - int(age_match.group(1))

    # 3. Keep only plausible years: not in the future, not absurdly old.
    if year > CURRENT_YEAR or CURRENT_YEAR - year >= YEAR_OF_REMODELING_LIMIT:
        return None
    return year

def extract_valid_string_left_dotted(string, text, pos):
    """Re-attach digits that the entity span cut off at a decimal point.

    If the predicted *string* starts with a digit and the full *text* has
    "<digit>." immediately to its left (the span apparently starts inside a
    decimal number such as "120.5"), try to extend the span left over the
    integer part; otherwise return *string* unchanged.

    pos -- (start, end) character offsets of *string* within *text*.
    Returns the extended substring, the original string, or None (non-string
    input, span too close to the start of the text, or an ambiguous left
    context that fails the pattern check below).
    """
    if isinstance(string, str):
        # Unpack the span of the prediction inside the full text.
        left_pos, rigth_pos = pos

        # Not enough left context to inspect (needs 5 characters of lookback).
        if left_pos < 5:
            return None

        if string[0].isdigit():
            # 1. Take a sub-text with 5 extra characters to the left of the span.
            sub_text = text[left_pos - 5: rigth_pos]

            # 2. No dot immediately to the left: the span is already whole.
            if text[left_pos - 1] == ".":

                # 3. A left dot with no digit before it is sentence punctuation,
                #    so the original span is kept as-is.
                if text[left_pos - 2].isdigit():

                    # 4. Extend left over "1-3 digits + dot", but only when the
                    #    sub-text does not itself start with a digit, '.' or ','
                    #    (which would mean the number extends even further left).
                    # NOTE(review): match.group(0) also includes any non-digit
                    #    characters matched by \D* before the integer part —
                    #    confirm callers tolerate that leading context.
                    pattern = r'^(?![\d.,])\D*\d{1,3}\.' + re.escape(string)
                    match = re.search(pattern, sub_text)
                    if match:
                        return match.group(0)
                    else:
                        return None
                else:
                    return string
            else:
                return string
        else:
            return string
    else:
        return None
    
# Cleaning
# Maps each GLiNER label to the post-processing function applied to its raw
# predicted text: surface labels are parsed into integer areas, the
# remodeling year is parsed/validated, and name-like labels pass through.
clean_functions_dict = {
    "SUPERFICIE_TERRAZA": extract_surface_from_string,
    "SUPERFICIE_JARDIN": extract_surface_from_string,
    "SUPERFICIE_TERRENO": extract_surface_from_string,
    "SUPERFICIE_HABITABLE": extract_surface_from_string,
    "SUPERFICIE_BALCON": extract_surface_from_string,
    "AÑO_REMODELACIÓN": extract_remodeling_year_from_string, 
    "NOMBRE_COMPLETO_ARQUITECTO": lambda x: x,
    'NOMBRE_CLUB_GOLF': lambda x: x, 
    'NOMBRE_TORRE': lambda x: x,
    'NOMBRE_CONDOMINIO': lambda x: x,
    'NOMBRE_DESARROLLO': lambda x: x,
}

# Per-label probability thresholds: a prediction is kept only when its score
# strictly exceeds its label's threshold (uniform 0.9 baseline).
# NOTE(review): dead at runtime — this dict is immediately re-assigned with
# tuned values just below; consider deleting one of the two assignments.
threshols_dict = {
    "SUPERFICIE_TERRAZA": 0.9,
    "SUPERFICIE_JARDIN": 0.9,
    "SUPERFICIE_TERRENO": 0.9,
    "SUPERFICIE_HABITABLE": 0.9,
    "SUPERFICIE_BALCON": 0.9,
    "AÑO_REMODELACIÓN": 0.9,
    "NOMBRE_COMPLETO_ARQUITECTO": 0.9,
    'NOMBRE_CLUB_GOLF': 0.9, 
    'NOMBRE_TORRE': 0.9,
    'NOMBRE_CONDOMINIO': 0.9,
    'NOMBRE_DESARROLLO': 0.9,
}

# Tuned per-label probability thresholds (apparently selected offline; the
# arithmetic offsets are manual loosening adjustments). This assignment is
# the one actually in effect at runtime.
threshols_dict = {
    "SUPERFICIE_BALCON": 0.7697697697697697,
    "SUPERFICIE_TERRAZA": 0.953953953953954,
    "SUPERFICIE_JARDIN": 0.9519519519519519,  # TODO(review): original note flagged this value as uncertain
    "SUPERFICIE_TERRENO": 0.980980980980981 - 0.05,
    "SUPERFICIE_HABITABLE": 0.978978978978979 - 0.02,  # alternative considered: 0.988988988988989
    "AÑO_REMODELACIÓN": 0.996996996996997 - 0.01,
    "NOMBRE_COMPLETO_ARQUITECTO": 0.8878878878878879,
    "NOMBRE_CLUB_GOLF": 0.8708708708708709,  # alternative considered: 0.9729729729729729
    "NOMBRE_TORRE": 0.8458458458458459 - 0.04,
    "NOMBRE_CONDOMINIO": 0.965965965965966,
    "NOMBRE_DESARROLLO": 0.9229229229229229
}

# Maps the BERT classifier's raw output labels to the number of house levels
# (LABEL_0 means "no level detected" and is dropped from the clean result).
label_names_dict = {
    'LABEL_0': None, 
    'LABEL_1': 1,
    'LABEL_2': 2, 
    'LABEL_3': 3,
}
# Minimum classifier score required to accept a NIVELES_CASA prediction.
BERT_SCORE_LIMIT = 0.980819808198082

def extract_max_label_score(probabilities):
    """Return (label, score) of the highest-scoring classifier entry.

    probabilities -- iterable of {"label": ..., "score": ...} dicts.
    """
    best = max(probabilities, key=lambda entry: entry['score'])
    return best['label'], best['score']

def clean_prediction_bert(label, score):
    """Map a BERT label to its house-level value when confidence is high enough.

    Returns None for low-confidence predictions or unknown labels.
    """
    if score > BERT_SCORE_LIMIT:
        return label_names_dict.get(label)
    return None
    
# BERT Inference Config
# Keyword arguments forwarded to the transformers pipeline call.
# NOTE(review): "return_all_scores" is deprecated in newer transformers
# releases in favor of "top_k=None" — confirm against the installed version.
pipe_config = {
    "batch_size": 8,
    "truncation": True,           # clip inputs longer than max_length
    "max_length": 250,
    "add_special_tokens": True,
    "return_all_scores": True,    # extract_max_label_score needs every label's score
    "padding": True,
}

def generate_answer(text):
    """End-to-end inference: GLiNER entity extraction + BERT level classification.

    Cleans the input text, extracts and post-processes the real-estate
    entities, classifies the number of house levels, and returns a display
    string with the cleaned (thresholded, parsed) entities followed by the
    raw model outputs, both JSON-formatted.
    """
    labels = [
        'SUPERFICIE_JARDIN',
        'NOMBRE_CLUB_GOLF',
        'SUPERFICIE_TERRENO',
        'SUPERFICIE_HABITABLE',
        'SUPERFICIE_TERRAZA',
        'NOMBRE_COMPLETO_ARQUITECTO',
        'SUPERFICIE_BALCON',
        'NOMBRE_DESARROLLO',
        'NOMBRE_TORRE',
        'NOMBRE_CONDOMINIO',
        'AÑO_REMODELACIÓN',
    ]

    # Normalize the raw listing text before any model sees it.
    text = clean_text(text)

    # GLiNER inference, then flatten to pred_/prob_/pos_ keys per label.
    entities = model.predict_entities(text, labels, threshold=0.4)
    entities_formatted = format_gliner_predictions(entities)

    # Surface spans cut at a decimal point are re-extended from the full text.
    feature_surfaces = ['SUPERFICIE_BALCON', 'SUPERFICIE_TERRAZA', 'SUPERFICIE_JARDIN', 'SUPERFICIE_TERRENO', 'SUPERFICIE_HABITABLE']
    for feature_name in feature_surfaces:
        # `is not None` replaces the non-idiomatic `!= None` comparison.
        if entities_formatted.get(f"pred_{feature_name}") is not None:
            entities_formatted[f"pred_{feature_name}"] = extract_valid_string_left_dotted(
                entities_formatted[f"pred_{feature_name}"], text, entities_formatted[f"pos_{feature_name}"]
            )

    # Threshold + parse each predicted entity; keep only usable str/int values.
    entities_names = list({c.replace("pred_", "").replace("prob_", "").replace("pos_", "") for c in entities_formatted})
    entities_cleaned = dict()
    for feature_name in entities_names:
        entity_prediction_cleaned = clean_prediction(entities_formatted, feature_name, threshols_dict, clean_functions_dict)
        if isinstance(entity_prediction_cleaned, (str, int)):
            entities_cleaned[feature_name] = entity_prediction_cleaned

    # BERT inference for the number of house levels (NIVELES_CASA).
    predictions = pipe([text], **pipe_config)
    label, score = extract_max_label_score(predictions[0])
    entities_formatted["NIVELES_CASA"] = label
    entities_formatted["prob_NIVELES_CASA"] = score
    prediction_cleaned = clean_prediction_bert(label, score)
    if isinstance(prediction_cleaned, int):
        entities_cleaned["NIVELES_CASA"] = prediction_cleaned

    result_json = json.dumps(entities_cleaned, indent = 4, ensure_ascii = False)

    return "Clean Result:" + result_json + "\n \n" + "Raw Result:" + json.dumps(entities_formatted, indent = 4, ensure_ascii = False)

# Change to a multi-line text input (kept for reference):
#text_input = gr.inputs.Textbox(lines=15, label="Input Text")

# Gradio UI: one free-text input, one text output (clean + raw JSON results).
# NOTE(review): the title's "Real State" is likely a typo for "Real Estate",
# but it is a user-facing string — confirm before changing.
iface = gr.Interface(
    fn=generate_answer, 
    inputs="text", 
    outputs="text",
    title="Text Intelligence for Real State",
    description="Input text describing the property."
)

# Launch the web app (blocking call).
iface.launch()