|
import html
import re
import unicodedata
|
|
|
|
|
# Promotional tags, channel handles and caption labels that are stripped from
# a file name / caption before the title and year are parsed out of it.
#
# NOTE(review): the stylised (small-caps / mathematical-alphabet) entries were
# stored as mojibake - UTF-8 bytes rendered through a Thai code page - and so
# could never match a real caption.  They are reconstructed here from the
# visible bytes; confirm them against live captions.
#
# A phrase is listed before any shorter entry that is its prefix so that the
# full phrase gets the first chance to match in a regex alternation.
words_to_remove = [
    "Fɪʟᴇ ɴᴀᴍᴇ :",
    "Fɪʟᴇ",
    "FC",
    "HEVC",
    "ɴᴀᴍᴇ",
    "Tɪᴛʟᴇ :",
    "BuLMoviee",
    # Variant whose last glyph is U+FFFD (the final character was already
    # replaced by a replacement character in the stored literal).
    "𝗝𝗼𝗶𝗻 𝗨𝘀 𝗢𝗻 𝗧𝗲𝗹𝗲𝗴𝗿𝗮\ufffd",
    "𝗝𝗼𝗶𝗻 𝗨𝘀 𝗢𝗻 𝗧𝗲𝗹𝗲𝗴𝗿𝗮𝗺",
    "SIDHUU 591",
    "𝑱𝒐𝒊𝒏 𝑼𝒔 𝑶ɴ 𝑻ᴇʟᴇɢʀᴀᴍ",
    "Bollywood",
    "mkv",
    "Mᴏᴠɪᴇ",
    "ɢʀᴏᴜᴘ",
    "TGxMALLU_MOVIE",
    "[Tg-@New_Movies_OnTG]",
    "[@ClipmateEmpire]",
    "@Horek_Rokom2020",
    "ClipmateEmpire",
]
|
|
|
def _boundary_guarded(word):
    """Return a regex fragment matching *word* literally as a whole token."""
    # A word-boundary guard is only meaningful on an edge that is itself a
    # word character.  Putting ``\b`` next to '[' , '@' or ':' would instead
    # demand an adjacent word character and make the entry almost unmatchable.
    head = r'(?<!\w)' if re.match(r'\w', word[0]) else ''
    tail = r'(?!\w)' if re.match(r'\w', word[-1]) else ''
    return head + re.escape(word) + tail


def remove_words(text, words_to_remove):
    """Remove every listed word/phrase from *text* and tidy the whitespace.

    Args:
        text: The string to clean.
        words_to_remove: Iterable of literal words or phrases to delete.
            Empty strings and duplicates are ignored.

    Returns:
        *text* with all whole-token occurrences of the entries removed, runs
        of whitespace collapsed to one space, and the ends stripped.
    """
    # De-duplicate (order-preserving) and try longer phrases first, so that
    # e.g. "File name :" is removed as a unit instead of "File" winning the
    # alternation and leaving "name :" behind.
    candidates = sorted(
        dict.fromkeys(word for word in words_to_remove if word),
        key=len,
        reverse=True,
    )

    if candidates:
        pattern = '|'.join(_boundary_guarded(word) for word in candidates)
        text = re.sub(pattern, '', text)

    # Removal leaves gaps behind; normalise them to single spaces.
    return re.sub(r'\s+', ' ', text).strip()
|
|
|
def convert_special_to_normal(text):
    """Reduce *text* to plain ASCII letters, digits and spaces.

    Steps, in order:
      1. HTML entities are unescaped (``&amp;`` -> ``&``).
      2. The text is NFKD-normalised so compatibility characters decompose
         into their base form plus combining marks.
      3. Everything that is not ASCII after decomposition is dropped.
      4. Every remaining character other than A-Z, a-z, 0-9 and the space
         character is removed.

    Whitespace is not collapsed, so a removed symbol that sat between two
    spaces leaves both spaces in the result.

    Args:
        text: The string to convert.

    Returns:
        The ASCII-only, punctuation-free string.
    """
    unescaped = html.unescape(text)

    decomposed = unicodedata.normalize('NFKD', unescaped)
    # The bytes produced by the ASCII encoder are pure ASCII, so decode them
    # with the same codec rather than a different one.
    ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')

    return re.sub(r'[^A-Za-z0-9 ]+', '', ascii_only)
|
|
|
def clean_string_special_old(input_string):
    """Delete every character that is neither a word character nor whitespace.

    Underscores count as word characters and are therefore kept.  Nothing is
    inserted in place of the removed characters.
    """
    symbol_run = re.compile(r'[^\w\s]+')
    return symbol_run.sub('', input_string)
|
|
|
def clean_string_special(input_string):
    """Replace each run of underscores / non-word characters by one space.

    Whitespace is itself a non-word character, so it is folded into the same
    runs.  The result is not stripped: a leading or trailing run becomes a
    single leading or trailing space.
    """
    # Splitting on the separator runs and re-joining with a space is the same
    # as substituting each run with a single space.
    pieces = re.split(r'[_\W]+', input_string)
    return ' '.join(pieces)
|
|
|
def clean_text(input_text):
    """Strip newlines, '@' signs and emoji/pictograph characters from text.

    Newlines and '@' are deleted outright (no space is put in their place),
    then every run of characters falling in the code-point ranges listed
    below is removed.
    """
    # Delete both unwanted characters in a single pass.
    without_marks = input_text.translate({ord('\n'): None, ord('@'): None})

    # Code-point ranges treated as emoji / symbols.
    symbol_ranges = (
        "\U0001F600-\U0001F64F",
        "\U0001F300-\U0001F5FF",
        "\U0001F680-\U0001F6FF",
        "\U0001F700-\U0001F77F",
        "\U00002600-\U000026FF",
        "\U00002700-\U000027BF",
        "\U0001F900-\U0001F9FF",
        "\U0001FA70-\U0001FAFF",
        "\U0001F1E0-\U0001F1FF",
    )
    symbol_run = re.compile("[" + "".join(symbol_ranges) + "]+", flags=re.UNICODE)

    return symbol_run.sub(r'', without_marks)
|
|
|
|
|
def Get_Title_Year(name):
    """Extract a cleaned title and a release year from a file name / caption.

    The entries of the module-level ``words_to_remove`` list are stripped
    from *name* first.  The title is the text before the first plausible
    year; it is passed through ``clean_string_special`` and then stripped.

    A year is a standalone four-digit number starting with 18, 19 or 20.
    Requiring that shape keeps tokens such as "1080p", or a slice of a longer
    digit run, from being reported as the year.

    Args:
        name: Raw file name or caption.

    Returns:
        ``(title, year)`` with *year* as an ``int``, or ``(None, None)`` when
        *name* is empty/``None`` or contains no year preceded by at least one
        character.
    """
    if not name:
        return None, None

    # Use the shared module-level list instead of a shadowing local copy, so
    # there is a single place to maintain the entries.
    name = remove_words(name, words_to_remove)

    match = re.search(
        r'(?P<title>.+?)[\s.()]*(?<!\d)(?P<year>(?:18|19|20)\d{2})(?!\d)',
        name,
    )
    if match is None:
        return None, None

    # Strip after cleaning: the cleaner turns leading/trailing punctuation
    # into spaces, which would otherwise survive at the edges of the title.
    title = clean_string_special(match.group('title')).strip()
    return title, int(match.group('year'))
|
|