Christopher Román Jaimes
committed on
Commit
·
ac89c56
1
Parent(s):
e2740ad
feat: add extraction of a valid left-dotted number string.
Browse files
app.py
CHANGED
@@ -33,14 +33,19 @@ def format_gliner_predictions(prediction):
|
|
33 |
.sort_values("score", ascending = False)\
|
34 |
.drop_duplicates(subset = "label", keep = "first")
|
35 |
|
|
|
|
|
|
|
36 |
# Add Columns Label for Text and Probability
|
37 |
prediction_df["label_text"] = prediction_df["label"].apply(lambda x: f"pred_{x}")
|
38 |
prediction_df["label_prob"] = prediction_df["label"].apply(lambda x: f"prob_{x}")
|
|
|
39 |
|
40 |
# Format Predictions
|
41 |
entities = prediction_df.set_index("label_text")["text"].to_dict()
|
42 |
entities_probs = prediction_df.set_index("label_prob")["score"].to_dict()
|
43 |
-
|
|
|
44 |
|
45 |
return predictions_formatted
|
46 |
else:
|
@@ -166,6 +171,41 @@ def extract_remodeling_year_from_string(string):
|
|
166 |
|
167 |
return None
|
168 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
169 |
# Cleaning
|
170 |
clean_functions_dict = {
|
171 |
"SUPERFICIE_TERRAZA": extract_surface_from_string,
|
@@ -249,6 +289,11 @@ def generate_answer(text):
|
|
249 |
# Format Prediction Entities
|
250 |
entities_formatted = format_gliner_predictions(entities)
|
251 |
|
|
|
|
|
|
|
|
|
|
|
252 |
# Clean Entities
|
253 |
entities_names = list({c.replace("pred_", "").replace("prob_", "") for c in list(entities_formatted.keys())})
|
254 |
entities_cleaned = dict()
|
|
|
33 |
.sort_values("score", ascending = False)\
|
34 |
.drop_duplicates(subset = "label", keep = "first")
|
35 |
|
36 |
+
# Add Position Column
|
37 |
+
prediction_df["position"] = prediction_df.apply(lambda x: (x["start"], x["end"]) ,axis = 1)
|
38 |
+
|
39 |
# Add Columns Label for Text and Probability
|
40 |
prediction_df["label_text"] = prediction_df["label"].apply(lambda x: f"pred_{x}")
|
41 |
prediction_df["label_prob"] = prediction_df["label"].apply(lambda x: f"prob_{x}")
|
42 |
+
prediction_df["label_position"] = prediction_df["label"].apply(lambda x: f"pos_{x}")
|
43 |
|
44 |
# Format Predictions
|
45 |
entities = prediction_df.set_index("label_text")["text"].to_dict()
|
46 |
entities_probs = prediction_df.set_index("label_prob")["score"].to_dict()
|
47 |
+
entities_positions = prediction_df.set_index("label_position")["position"].to_dict()
|
48 |
+
predictions_formatted = {**entities, **entities_probs, **entities_positions}
|
49 |
|
50 |
return predictions_formatted
|
51 |
else:
|
|
|
171 |
|
172 |
return None
|
173 |
|
174 |
+
def extract_valid_string_left_dotted(string, text, pos):
    """Recover the digit group that a leading dot cut off from a number.

    An entity such as ``"200"`` may have been extracted from ``"1.200"``.
    This function inspects the characters of ``text`` immediately to the
    left of the entity and decides whether to keep the entity, rebuild it
    together with its left digit group, or reject it.

    Args:
        string: Entity text. Anything that is not a ``str`` yields ``None``.
        text: Full text the entity was extracted from.
        pos: ``(start, end)`` offsets of the entity inside ``text``.

    Returns:
        * ``None`` when ``string`` is not a ``str``.
        * ``string`` unchanged when it is empty, does not start with a
          digit, has no ``"."`` directly to its left, or that dot is not
          preceded by a digit.
        * Otherwise the regex match over the window that starts up to five
          characters before the entity. The match begins at the window
          start, so it may include leading non-digit characters
          (e.g. ``"ea 1.200"``). ``None`` is returned when the window does
          not match, e.g. more than three digits precede the dot.
    """
    if not isinstance(string, str):
        return None

    # Empty strings and strings that do not start with a digit are never
    # rebuilt; slicing (instead of string[0]) avoids IndexError on "".
    if not string[:1].isdigit():
        return string

    left_pos, right_pos = pos

    # No dot directly to the left (or the entity opens the text): keep it.
    # The explicit bound check prevents text[-1] from wrapping around.
    if left_pos < 1 or text[left_pos - 1] != ".":
        return string

    # A left dot that is not preceded by a digit is plain punctuation.
    if left_pos < 2 or not text[left_pos - 2].isdigit():
        return string

    # Window of up to 5 characters to the left of the entity, plus the entity.
    sub_text = text[max(0, left_pos - 5):right_pos]

    if left_pos < 5:
        # The window is cut short by the start of the text, so that boundary
        # delimits the number: a leading digit is acceptable here, but a
        # leading separator still is not.
        boundary = r'^(?![.,])'
    else:
        # Full window: its first character must not be a digit, '.' or ',',
        # otherwise the number continues further to the left.
        boundary = r'^(?![\d.,])'

    pattern = boundary + r'\D*\d{1,3}\.' + re.escape(string)
    match = re.search(pattern, sub_text)
    return match.group(0) if match else None
|
208 |
+
|
209 |
# Cleaning
|
210 |
clean_functions_dict = {
|
211 |
"SUPERFICIE_TERRAZA": extract_surface_from_string,
|
|
|
289 |
# Format Prediction Entities
|
290 |
entities_formatted = format_gliner_predictions(entities)
|
291 |
|
292 |
+
# Extract valid string left dotted
|
293 |
+
feature_surfaces = ['SUPERFICIE_BALCON', 'SUPERFICIE_TERRAZA', 'SUPERFICIE_JARDIN', 'SUPERFICIE_TERRENO', 'SUPERFICIE_HABITABLE']
|
294 |
+
for feature_name in feature_surfaces:
|
295 |
+
entities_formatted[f"pred_{feature_name}"] = extract_valid_string_left_dotted(entities_formatted[f"pred_{feature_name}"], text, entities_formatted[f"pos_{feature_name}"])
|
296 |
+
|
297 |
# Clean Entities
|
298 |
entities_names = list({c.replace("pred_", "").replace("prob_", "") for c in list(entities_formatted.keys())})
|
299 |
entities_cleaned = dict()
|