Christopher Román Jaimes committed on
Commit
ac89c56
·
1 Parent(s): e2740ad

feat: add extraction of a valid number string that has a dot on its left.

Browse files
Files changed (1) hide show
  1. app.py +46 -1
app.py CHANGED
@@ -33,14 +33,19 @@ def format_gliner_predictions(prediction):
33
  .sort_values("score", ascending = False)\
34
  .drop_duplicates(subset = "label", keep = "first")
35
 
 
 
 
36
  # Add Columns Label for Text and Probability
37
  prediction_df["label_text"] = prediction_df["label"].apply(lambda x: f"pred_{x}")
38
  prediction_df["label_prob"] = prediction_df["label"].apply(lambda x: f"prob_{x}")
 
39
 
40
  # Format Predictions
41
  entities = prediction_df.set_index("label_text")["text"].to_dict()
42
  entities_probs = prediction_df.set_index("label_prob")["score"].to_dict()
43
- predictions_formatted = {**entities, **entities_probs}
 
44
 
45
  return predictions_formatted
46
  else:
@@ -166,6 +171,41 @@ def extract_remodeling_year_from_string(string):
166
 
167
  return None
168
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  # Cleaning
170
  clean_functions_dict = {
171
  "SUPERFICIE_TERRAZA": extract_surface_from_string,
@@ -249,6 +289,11 @@ def generate_answer(text):
249
  # Format Prediction Entities
250
  entities_formatted = format_gliner_predictions(entities)
251
 
 
 
 
 
 
252
  # Clean Entities
253
  entities_names = list({c.replace("pred_", "").replace("prob_", "") for c in list(entities_formatted.keys())})
254
  entities_cleaned = dict()
 
33
  .sort_values("score", ascending = False)\
34
  .drop_duplicates(subset = "label", keep = "first")
35
 
36
+ # Add Position Column
37
+ prediction_df["position"] = prediction_df.apply(lambda x: (x["start"], x["end"]) ,axis = 1)
38
+
39
  # Add Columns Label for Text and Probability
40
  prediction_df["label_text"] = prediction_df["label"].apply(lambda x: f"pred_{x}")
41
  prediction_df["label_prob"] = prediction_df["label"].apply(lambda x: f"prob_{x}")
42
+ prediction_df["label_position"] = prediction_df["label"].apply(lambda x: f"pos_{x}")
43
 
44
  # Format Predictions
45
  entities = prediction_df.set_index("label_text")["text"].to_dict()
46
  entities_probs = prediction_df.set_index("label_prob")["score"].to_dict()
47
+ entities_positions = prediction_df.set_index("label_position")["position"].to_dict()
48
+ predictions_formatted = {**entities, **entities_probs, **entities_positions}
49
 
50
  return predictions_formatted
51
  else:
 
171
 
172
  return None
173
 
174
def extract_valid_string_left_dotted(string, text, pos):
    """Extend a numeric entity leftwards when it is the tail of a dotted number.

    When the entity text starts right after ``<digits>.`` in the source text
    (for example the entity is ``"200"`` inside ``"1.200"``), the leading
    group is glued back on so the full dotted number can be cleaned later.

    Args:
        string: Entity text. Anything that is not a ``str`` yields ``None``.
        text: Full text the entity was found in.
        pos: ``(start, end)`` offsets of the entity inside ``text``.
            NOTE(review): assumes ``text[start:end]`` equals ``string`` --
            confirm against the caller.

    Returns:
        * ``None`` if ``string`` is not a ``str``, if the entity starts less
          than 5 characters into ``text``, or if a ``<digit>.`` prefix is
          present but the 5-character window does not match the expected
          shape (e.g. four or more digits, or another ``.``/``,`` in front).
        * ``string`` unchanged if it is empty, does not start with a digit,
          or is not directly preceded by ``<digit>.``.
        * Otherwise the matched window text ending in ``string``. The match
          starts at the beginning of the 5-character window, so it may carry
          leading non-digit characters (e.g. ``"de 1.200"``).
    """
    if not isinstance(string, str):
        return None

    # String position.
    left_pos, right_pos = pos

    # The checks below look up to 5 characters to the left of the entity, so
    # entities too close to the beginning of the text are rejected.
    if left_pos < 5:
        return None

    # Nothing to repair on an empty string or one that is not numeric at its
    # start. The emptiness check avoids an IndexError on ``string[0]``.
    if not string or not string[0].isdigit():
        return string

    # No dot immediately to the left: the entity is already complete.
    if text[left_pos - 1] != ".":
        return string

    # A left dot that is not preceded by a digit is punctuation, not a
    # thousands separator: keep the entity as is.
    if not text[left_pos - 2].isdigit():
        return string

    # Window made of the 5 characters before the entity plus the entity.
    sub_text = text[left_pos - 5:right_pos]

    # The window must not start with a digit, '.' or ','; after optional
    # non-digits it must hold 1-3 digits, a dot, and then the entity itself.
    pattern = r'^(?![\d.,])\D*\d{1,3}\.' + re.escape(string)
    match = re.search(pattern, sub_text)
    if match:
        return match.group(0)
    return None
208
+
209
  # Cleaning
210
  clean_functions_dict = {
211
  "SUPERFICIE_TERRAZA": extract_surface_from_string,
 
289
  # Format Prediction Entities
290
  entities_formatted = format_gliner_predictions(entities)
291
 
292
+ # Extract valid string left dotted
293
+ feature_surfaces = ['SUPERFICIE_BALCON', 'SUPERFICIE_TERRAZA', 'SUPERFICIE_JARDIN', 'SUPERFICIE_TERRENO', 'SUPERFICIE_HABITABLE']
294
+ for feature_name in feature_surfaces:
295
+ entities_formatted[f"pred_{feature_name}"] = extract_valid_string_left_dotted(entities_formatted[f"pred_{feature_name}"], text, entities_formatted[f"pos_{feature_name}"])
296
+
297
  # Clean Entities
298
  entities_names = list({c.replace("pred_", "").replace("prob_", "") for c in list(entities_formatted.keys())})
299
  entities_cleaned = dict()