Spaces:
Running
Running
import html
import re
import unicodedata
| # words_to_remove = ["FC","HEVC","ษดแดแดแด:","-","BuLMoviee" ,"๐๐ผ๐ถ๐ป ๐จ๐ ๐ข๐ป ๐ง๐ฒ๐น๐ฒ๐ด๐ฟ๐ฎ๏ฟฝ","SIDHUU 591","๐ฑ๐๐๐ ๐ผ๐ ๐ถษด ๐ปแดสแดษขสแดแด","Tษชแดสแด :"] | |
# Watermarks, channel tags and release labels meant to be passed to
# remove_words().  Matching is literal, so the stylized entries (Unicode small
# capitals, mathematical bold letters) must keep their exact code points.
# NOTE(review): the stylized entries were restored from a mis-decoded form of
# this file; confirm them against real incoming names.
words_to_remove = [
    "Fɪʟᴍ",
    "Fɪʟᴍ ɴᴀᴍᴇ :",
    "FC",
    "HEVC",
    "ɴᴀᴍᴇ",
    "Tɪᴛʟᴇ :",
    "BuLMoviee",
    # Cut-off copy of the next entry, ending in U+FFFD (replacement character).
    "𝗝𝗼𝗶𝗻 𝗨𝘀 𝗢𝗻 𝗧𝗲𝗹𝗲𝗴𝗿𝗮\ufffd",
    "𝗝𝗼𝗶𝗻 𝗨𝘀 𝗢𝗻 𝗧𝗲𝗹𝗲𝗴𝗿𝗮𝗺",
    "SIDHUU 591",
    "𝑱𝒐𝒊𝒏 𝑼𝒔 𝑶ɴ 𝑻ᴇʟᴇɢʀᴀᴍ",
    "Bollywood",
    "mkv",
    "Mᴏᴠɪᴇ",
    "ɢʀᴏᴜᴘ",
    "TGxMALLU_MOVIE",
    "[Tg-@New_Movies_OnTG]",
    "[@ClipmateEmpire]",
    "@Horek_Rokom2020",
    "ClipmateEmpire",
]
def _phrase_pattern(phrase):
    """Return a regex fragment matching ``phrase`` literally as a whole token."""
    # A boundary check is only meaningful next to a word character.  Anchoring
    # an edge such as "[" or ":" with \b would demand a word character right
    # outside it, so entries like "Title :" or "[@tag]" would rarely match.
    lead = r'(?<!\w)' if re.match(r'\w', phrase[0]) else ''
    trail = r'(?!\w)' if re.match(r'\w', phrase[-1]) else ''
    return lead + re.escape(phrase) + trail


def remove_words(text, words_to_remove):
    """Delete every listed word or phrase from ``text`` and tidy whitespace.

    Args:
        text: The string to clean.
        words_to_remove: Iterable of literal words/phrases (not regexes).
            Empty strings and duplicates are ignored.

    Returns:
        ``text`` without the listed phrases, with every run of whitespace
        collapsed to one space and leading/trailing whitespace stripped.
    """
    # De-duplicate (order-preserving), then try longer phrases first so that
    # "Film name :" is removed as a unit instead of only its prefix "Film".
    phrases = sorted(
        dict.fromkeys(w for w in words_to_remove if w),
        key=len,
        reverse=True,
    )
    if phrases:
        pattern = '|'.join(_phrase_pattern(p) for p in phrases)
        text = re.sub(pattern, '', text)
    # Removing phrases leaves gaps behind; normalise them.
    return re.sub(r'\s+', ' ', text).strip()
def convert_special_to_normal(text):
    """Reduce ``text`` to plain ASCII letters, digits and spaces.

    Relies on the module-level ``html``, ``unicodedata`` and ``re`` imports.

    Args:
        text: Input string; may contain HTML entities and stylized Unicode.

    Returns:
        The string after HTML-unescaping, NFKD decomposition, dropping of every
        non-ASCII code point, and removal of anything that is not
        ``A-Z``, ``a-z``, ``0-9`` or a space.
    """
    unescaped = html.unescape(text)
    # NFKD splits accented letters into base + combining mark and maps
    # compatibility forms (e.g. mathematical bold letters) to plain letters,
    # so the ASCII encode below keeps the base letter and drops the rest.
    decomposed = unicodedata.normalize('NFKD', unescaped)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')
    return re.sub(r'[^A-Za-z0-9 ]+', '', ascii_only)
def clean_string_special_old(input_string):
    """Delete every run of characters that is neither word nor whitespace.

    Letters, digits, underscores and all whitespace are kept as they are;
    punctuation and other symbols are removed without a replacement.
    """
    return re.sub(r'[^\w\s]+', '', input_string)
def clean_string_special(input_string):
    """Turn each run of underscores and non-word characters into one space.

    Whitespace counts as non-word, so it is folded into the same run; the
    result is not stripped and may start or end with a space.
    """
    return re.sub(r'[_\W]+', ' ', input_string)
def clean_text(input_text):
    """Strip newlines, ``@`` signs and emoji/pictographs from ``input_text``.

    Newlines and ``@`` are deleted outright (nothing is put in their place),
    then every run of characters in the ranges below is removed.
    """
    # Code-point ranges treated as emoji; joined into one character class.
    emoji_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F700-\U0001F77F",  # alchemical symbols
        "\U00002600-\U000026FF",  # Miscellaneous Symbols
        "\U00002700-\U000027BF",  # Dingbats
        "\U0001F900-\U0001F9FF",  # Supplemental Symbols and Pictographs
        "\U0001FA70-\U0001FAFF",  # Symbols and Pictographs Extended-A
        "\U0001F1E0-\U0001F1FF",  # Flags (iOS)
    )
    emoji_re = re.compile("[" + "".join(emoji_ranges) + "]+", flags=re.UNICODE)
    without_markers = input_text.replace('\n', '').replace('@', '')
    return emoji_re.sub('', without_markers)
def Get_Title_Year(name):
    """Split a raw name into a cleaned title and a four-digit year.

    The module-level ``words_to_remove`` phrases are stripped first, then the
    shortest leading text followed by a year is taken as the title.

    Args:
        name: Raw name/caption string.

    Returns:
        ``(title, year)`` where ``title`` is a string with special characters
        replaced by single spaces and ``year`` is an ``int``; ``(None, None)``
        when no year is found.
    """
    # Use the shared module-level list rather than a second, identical copy.
    name = remove_words(name, words_to_remove)
    # The year must be a standalone 18xx-20xx number: the digit guards keep it
    # from being cut out of a longer number, and the century prefix keeps
    # resolution tags such as "1080p" or "2160p" from being read as a year.
    match = re.search(
        r'(?P<title>.+?)[\s\.\(\)]*(?<!\d)(?P<year>(?:18|19|20)\d{2})(?!\d)',
        name,
    )
    if match is None:
        return None, None
    # Strip after cleaning: clean_string_special turns leading/trailing
    # punctuation (e.g. "(") into spaces that would otherwise stay in the title.
    title = clean_string_special(match.group('title')).strip()
    return title, int(match.group('year'))