Spaces:

chris32
/

Text-Intelligence-Real-State

Sleeping

Christopher Román Jaimes committed on May 30, 2024

Commit

40d9906

1 Parent(s): c52a63e

chore: add clean text.

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,10 +1,8 @@
 # Datetime
 import datetime
 # Manipulate
-import os
 import re
 import json
-import numpy as np
 import pandas as pd
 # App
 import gradio as gr
@@ -26,6 +24,31 @@ YEAR_OF_REMODELING_LIMIT = 100
 CURRENT_YEAR = int(datetime.date.today().year)
 SCORE_LIMIT_SIMILARITY_NAMES = 70
 def format_gliner_predictions(prediction):
     if len(prediction) > 0:
         # Select the Entity value with the Greater Score for each Entity Name
@@ -283,6 +306,10 @@ def generate_answer(text):
     'NOMBRE_CONDOMINIO',
     'AÑO_REMODELACIÓN'
     ]
     # Inference
     entities = model.predict_entities(text, labels, threshold=0.4)

 # Datetime
 import datetime
 # Manipulate
 import re
 import json
 import pandas as pd
 # App
 import gradio as gr
 CURRENT_YEAR = int(datetime.date.today().year)
 SCORE_LIMIT_SIMILARITY_NAMES = 70
def clean_text(text):
    """Strip HTML markup from *text* and normalize its whitespace and punctuation.

    Steps, in order:
      1. Replace HTML line breaks (``<br>``, ``<br/>``, ``<BR />`` ...) with
         the separator ``" # "`` so line boundaries survive tag stripping.
      2. Remove every remaining HTML tag.
      3. Decode the ``&nbsp;`` and ``&amp;`` entities.
      4. Collapse any run of whitespace into a single space and trim the ends.
      5. Collapse runs of repeated ``.`` or ``,`` into a single character.

    Args:
        text: Raw string, possibly containing HTML.

    Returns:
        The cleaned string.
    """
    # Replace HTML line breaks with the specified character.
    # IGNORECASE: otherwise "<BR>" falls through to the generic tag stripper
    # below and the two lines it separated get glued together.
    replacement_char = " # "
    text = re.sub(r'<br\s*/?>', replacement_char, text, flags=re.IGNORECASE)
    # Remove HTML tags and special characters
    cleaned_text = re.sub(r'<[^>]*>', '', text)
    cleaned_text = cleaned_text.replace('&nbsp;', ' ')
    cleaned_text = cleaned_text.replace('&amp;', '&')
    # Replace multiple spaces with a single one
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    # Remove leading and trailing spaces
    cleaned_text = cleaned_text.strip()
    # Collapse duplicated "." and "," of any length; a single
    # str.replace("..", ".") pass would leave "..." as "..".
    cleaned_text = re.sub(r'\.{2,}', '.', cleaned_text)
    cleaned_text = re.sub(r',{2,}', ',', cleaned_text)
    return cleaned_text
 def format_gliner_predictions(prediction):
     if len(prediction) > 0:
         # Select the Entity value with the Greater Score for each Entity Name
     'NOMBRE_CONDOMINIO',
     'AÑO_REMODELACIÓN'
     ]
+    # Clean Text
+    text = clean_text(text)
     # Inference
     entities = model.predict_entities(text, labels, threshold=0.4)