import re

import fitz  # PyMuPDF
import yaml
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer
from unidecode import unidecode
def remove_accents(input_str):
    # Transliterate accented/non-ASCII characters to their closest ASCII equivalents.
    return unidecode(input_str)
def remove_special_characters(text):
    # Strip model sentinel tokens, fold accents, then drop any character
    # outside the allowed set (word chars, whitespace, common punctuation).
    text = text.replace("<s>", "").replace("</s>", "")
    text = remove_accents(text)
    # The hyphen is placed last in the character class so it is treated
    # literally; the original "()-;" accidentally formed a range matching
    # every character between ')' and ';'.
    pattern = r'[^\w\s.,!?\'"();-]+'
    text = re.sub(pattern, "", text)
    return text
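# Example (illustrative): sentinel tokens are stripped and accents folded.
# remove_special_characters("<s>Héllo wörld…</s>")  ->  'Hello world...'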
def remove_special_characters_2(text):
    # Stricter cleanup: keep only ASCII letters, digits, and spaces.
    pattern = r"[^a-zA-Z0-9 ]+"
    return re.sub(pattern, "", text)
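# Example (illustrative) contrasting the two cleaners: the stricter
# variant also drops punctuation that the first one keeps.
# remove_special_characters("Done? Yes; 100%.")    ->  'Done? Yes; 100.'
# remove_special_characters_2("Done? Yes; 100%.")  ->  'Done Yes 100'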
def update_character_count(text):
    # Human-readable character count for display.
    return f"{len(text)} characters"
| with open("config.yaml", "r") as file: | |
| params = yaml.safe_load(file) | |
| text_bc_model_path = params["TEXT_BC_MODEL_PATH"] | |
| text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path) | |
def len_validator(text):
    # Warn when the input is too short for reliable classification.
    min_tokens = 200
    # tokenize() returns a list of token strings; its length is the token
    # count (the original passed return_tensors="pt", which tokenize() ignores).
    length = len(text_bc_tokenizer.tokenize(text))
    if length < min_tokens:
        return f"Warning! Input length is {length}. Please input a text that is at least {min_tokens} tokens long. Recommended length {min_tokens*2} tokens."
    else:
        return f"Input length ({length}) is satisfied."
def extract_text_from_pdf(pdf_path):
    # Concatenate the plain text of every page in the PDF (via PyMuPDF).
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    doc.close()
    return text
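# Example (illustrative; "sample.pdf" is a placeholder path):
# full_text = extract_text_from_pdf("sample.pdf")
# print(update_character_count(full_text))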
WORD = re.compile(r"\w+")  # precompiled matcher for word tokens

# Sentence embedding model used for semantic similarity.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
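
# Sketch (assumption): the MiniLM model and `util` are presumably used
# downstream for semantic similarity; this shows the typical pattern.
if __name__ == "__main__":
    sentences = ["The cat sits outside.", "A feline rests outdoors."]
    embeddings = model.encode(sentences, convert_to_tensor=True)
    # util.cos_sim returns a 1x1 tensor for a single pair of embeddings
    score = util.cos_sim(embeddings[0], embeddings[1])
    print(f"Cosine similarity: {score.item():.3f}")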