Christopher Román Jaimes committed on
Commit
40d9906
·
1 Parent(s): c52a63e

chore: add clean text.

Browse files
Files changed (1) hide show
  1. app.py +29 -2
app.py CHANGED
@@ -1,10 +1,8 @@
1
  # Datetime
2
  import datetime
3
  # Manipulate
4
- import os
5
  import re
6
  import json
7
- import numpy as np
8
  import pandas as pd
9
  # App
10
  import gradio as gr
@@ -26,6 +24,31 @@ YEAR_OF_REMODELING_LIMIT = 100
26
  CURRENT_YEAR = int(datetime.date.today().year)
27
  SCORE_LIMIT_SIMILARITY_NAMES = 70
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  def format_gliner_predictions(prediction):
30
  if len(prediction) > 0:
31
  # Select the Entity value with the Greater Score for each Entity Name
@@ -283,6 +306,10 @@ def generate_answer(text):
283
  'NOMBRE_CONDOMINIO',
284
  'AÑO_REMODELACIÓN'
285
  ]
 
 
 
 
286
  # Inference
287
  entities = model.predict_entities(text, labels, threshold=0.4)
288
 
 
1
  # Datetime
2
  import datetime
3
  # Manipulate
 
4
  import re
5
  import json
 
6
  import pandas as pd
7
  # App
8
  import gradio as gr
 
24
  CURRENT_YEAR = int(datetime.date.today().year)
25
  SCORE_LIMIT_SIMILARITY_NAMES = 70
26
 
27
def clean_text(text):
    """Strip HTML markup from *text* and normalize whitespace and punctuation.

    Processing order:
      1. ``<br>`` / ``<br/>`` / ``<br />`` (any letter case) become ``" # "``
         so that line boundaries survive as a visible separator.
      2. All remaining HTML tags are removed; ``&nbsp;`` and ``&amp;`` are
         decoded to a space and ``&`` respectively.
      3. Runs of whitespace collapse to a single space and the result is
         trimmed.
      4. Runs of two or more ``.`` or ``,`` collapse to a single character.

    Args:
        text: Raw input string, possibly containing HTML.

    Returns:
        The cleaned string.
    """
    # Replace HTML line breaks with the specified character.
    # IGNORECASE so "<BR>" is treated as a break instead of being silently
    # dropped by the generic tag stripper below (which would glue words).
    replacement_char = " # "
    text = re.sub(r'<br\s*/?>', replacement_char, text, flags=re.IGNORECASE)

    # Remove HTML tags and special characters
    cleaned_text = re.sub(r'<[^>]*>', '', text)
    cleaned_text = re.sub(r'&nbsp;', ' ', cleaned_text)
    cleaned_text = re.sub(r'&amp;', '&', cleaned_text)

    # Replace multiple spaces with a single one
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)

    # Remove leading and trailing spaces
    cleaned_text = cleaned_text.strip()

    # Collapse repeated "." and ",". A single str.replace("..", ".") pass
    # leaves ".." behind for inputs like "...", so match the whole run.
    cleaned_text = re.sub(r'\.{2,}', '.', cleaned_text)
    cleaned_text = re.sub(r',{2,}', ',', cleaned_text)

    return cleaned_text
51
+
52
  def format_gliner_predictions(prediction):
53
  if len(prediction) > 0:
54
  # Select the Entity value with the Greater Score for each Entity Name
 
306
  'NOMBRE_CONDOMINIO',
307
  'AÑO_REMODELACIÓN'
308
  ]
309
+
310
+ # Clean Text
311
+ text = clean_text(text)
312
+
313
  # Inference
314
  entities = model.predict_entities(text, labels, threshold=0.4)
315