"""Helpers for cleaning noisy media captions and extracting a title and year."""

import html
import re
import unicodedata
from typing import Iterable, Optional, Pattern, Tuple

# The stylised entries below use Unicode small-capital and mathematical
# alphanumeric letters.  They are written as escape sequences so this file
# stays pure ASCII and the entries cannot be corrupted by a wrong-codec
# round trip; the trailing comment on each gives its plain-ASCII reading.
_JOIN_US_ON_TELEGRA_BOLD = (  # "Join Us On Telegra" (sans-serif bold)
    "\U0001d5dd\U0001d5fc\U0001d5f6\U0001d5fb "
    "\U0001d5e8\U0001d600 "
    "\U0001d5e2\U0001d5fb "
    "\U0001d5e7\U0001d5f2\U0001d5f9\U0001d5f2\U0001d5f4\U0001d5ff\U0001d5ee"
)
_JOIN_US_ON_TELEGRAM_ITALIC = (  # "Join Us On TELEGRAM" (bold italic + small caps)
    "\U0001d471\U0001d490\U0001d48a\U0001d48f "
    "\U0001d47c\U0001d494 "
    "\U0001d476\u0274 "
    "\U0001d47b\u1d07\u029f\u1d07\u0262\u0280\u1d00\u1d0d"
)

# Caption labels, codec markers and channel tags stripped before parsing.
words_to_remove = [
    "F\u026a\u029f\u1d07",                             # "FILE"
    "F\u026a\u029f\u1d07 \u0274\u1d00\u1d0d\u1d07 :",  # "FILE NAME :"
    "FC",
    "HEVC",
    "\u0274\u1d00\u1d0d\u1d07",                        # "NAME"
    "T\u026a\u1d1b\u029f\u1d07 :",                     # "TITLE :"
    "BuLMoviee",
    _JOIN_US_ON_TELEGRA_BOLD + "\ufffd",               # truncated copy ending in U+FFFD
    _JOIN_US_ON_TELEGRA_BOLD + "\U0001d5fa",           # "... Telegram"
    "SIDHUU 591",
    _JOIN_US_ON_TELEGRAM_ITALIC,
    "T\u026a\u1d1b\u029f\u1d07 :",                     # "TITLE :" (duplicate kept from the original list)
    "Bollywood",
    "mkv",
    "M\u1d0f\u1d20\u026a\u1d07",                       # "MOVIE"
    "\u0262\u0280\u1d0f\u1d1c\u1d18",                  # "GROUP"
    "TGxMALLU_MOVIE",
    "[Tg-@New_Movies_OnTG]",
    "[@ClipmateEmpire]",
    "@Horek_Rokom2020",
    "ClipmateEmpire",
]

_WORD_CHAR = re.compile(r'\w')
_WHITESPACE_RUN = re.compile(r'\s+')

# Compiled once at import time instead of on every clean_text() call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U00002600-\U000026FF"  # Miscellaneous Symbols
    "\U00002700-\U000027BF"  # Dingbats
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U0001F1E0-\U0001F1FF"  # Flags (iOS)
    "]+",
    flags=re.UNICODE,
)

_TITLE_YEAR_PATTERN = re.compile(r'(?P<title>.+?)[\s\.\(\)]*(?P<year>\d{4})')


def _build_removal_pattern(words: Iterable[str]) -> Optional[Pattern[str]]:
    """Compile one alternation matching any of *words*, or None if there are none."""
    # Longest first, so "X Y :" wins over its own prefix "X".
    unique = sorted(dict.fromkeys(w for w in words if w), key=len, reverse=True)
    if not unique:
        return None
    alternatives = []
    for word in unique:
        # A blanket \b...\b can never match entries that begin or end with
        # punctuation (e.g. "[@Tag]" or "Label :"), so a boundary is required
        # only on the sides where the entry itself has a word character.
        lead = r'(?<!\w)' if _WORD_CHAR.match(word[0]) else ''
        trail = r'(?!\w)' if _WORD_CHAR.match(word[-1]) else ''
        alternatives.append(lead + re.escape(word) + trail)
    return re.compile('|'.join(alternatives))


def remove_words(text: str, words_to_remove: Iterable[str]) -> str:
    """Delete every occurrence of the given words/phrases from *text*.

    Matching is case-sensitive and whole-word on whichever sides of an entry
    are word characters.  Runs of whitespace left behind are collapsed to a
    single space and the result is stripped.

    Args:
        text: The string to clean.
        words_to_remove: Literal words or phrases to delete.

    Returns:
        The cleaned, whitespace-normalised string.
    """
    pattern = _build_removal_pattern(words_to_remove)
    cleaned_text = pattern.sub('', text) if pattern is not None else text
    return _WHITESPACE_RUN.sub(' ', cleaned_text).strip()


def convert_special_to_normal(text: str) -> str:
    """Reduce *text* to plain ASCII letters, digits and spaces.

    HTML entities are unescaped, the text is NFKD-normalised so stylised or
    accented letters decompose to their ASCII base where one exists, anything
    still non-ASCII is dropped, and finally every character other than
    A-Z, a-z, 0-9 and space is removed.
    """
    text = html.unescape(text)
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
    return re.sub(r'[^A-Za-z0-9 ]+', '', text)


def clean_string_special_old(input_string: str) -> str:
    """Remove every character that is neither a word character nor whitespace.

    Word characters (``\\w``) include letters, digits and the underscore, so
    underscores are kept by this variant.
    """
    return re.sub(r'[^\w\s]+', '', input_string)


def clean_string_special(input_string: str) -> str:
    """Replace each run of underscores and non-word characters with one space.

    The result is not stripped, so leading or trailing separators in the
    input become a leading or trailing space.
    """
    return re.sub(r'[_\W]+', ' ', input_string)


def clean_text(input_text: str) -> str:
    """Remove newline characters, ``@`` signs and emoji from *input_text*.

    Newlines are deleted rather than replaced by a space, so the text on
    either side of a line break is joined directly.
    """
    text = input_text.replace('\n', '').replace('@', '')
    return _EMOJI_PATTERN.sub(r'', text)


def Get_Title_Year(name: str) -> Tuple[Optional[str], Optional[int]]:
    """Extract ``(title, year)`` from a noisy file name or caption.

    Known tags and labels (the module-level ``words_to_remove``) are stripped
    first; the title is then everything before the first four-digit run, with
    punctuation and underscores turned into spaces.

    Args:
        name: Raw file name or caption text.

    Returns:
        ``(title, year)`` with ``year`` as an int, or ``(None, None)`` when
        *name* is empty or contains no four-digit number after some text.
    """
    if not name:
        return None, None
    name = remove_words(name, words_to_remove)
    # NOTE(review): the first four-digit run is taken as the year, so a
    # resolution such as "1080" that precedes the real year would be picked.
    match = _TITLE_YEAR_PATTERN.search(name)
    if match:
        # Strip after cleaning: leading punctuation becomes a space otherwise.
        title = clean_string_special(match.group('title')).strip()
        return title, int(match.group('year'))
    return None, None