File size: 3,867 Bytes
e566133 c140fac 0d8a300 5560e59 0d8a300 e566133 1a39b92 87159d4 1a39b92 e566133 c140fac 5560e59 c140fac e566133 87159d4 e566133 87159d4 e566133 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
import html
import re
import unicodedata
# words_to_remove = ["FC","HEVC","ษดแดแดแด:","-","BuLMoviee" ,"๐๐ผ๐ถ๐ป ๐จ๐ ๐ข๐ป ๐ง๐ฒ๐น๐ฒ๐ด๐ฟ๐ฎ๏ฟฝ","SIDHUU 591","๐ฑ๐๐๐ ๐ผ๐ ๐ถษด ๐ปแดสแดษขสแดแด","Tษชแดสแด :"]
# Noise tokens (release tags, channel names, file-type markers) meant to be
# passed as the ``words_to_remove`` argument of ``remove_words``.
# NOTE(review): several entries appear to be mis-encoded (mojibake) renderings
# of stylised Unicode text, so they will presumably only match input that was
# garbled the same way -- confirm against the real data source.
# NOTE(review): the entry "Tษชแดสแด :" is listed twice below.
words_to_remove = [
    "Fษชสแด",
    "Fษชสแด ษดแดแดแด :",
    "FC",
    "HEVC",
    "ษดแดแดแด",
    "Tษชแดสแด :",
    "BuLMoviee",
    "๐๐ผ๐ถ๐ป ๐จ๐ ๐ข๐ป ๐ง๐ฒ๐น๐ฒ๐ด๐ฟ๐ฎ๏ฟฝ",
    "๐๐ผ๐ถ๐ป ๐จ๐ ๐ข๐ป ๐ง๐ฒ๐น๐ฒ๐ด๐ฟ๐ฎ๐บ",
    "SIDHUU 591",
    "๐ฑ๐๐๐ ๐ผ๐ ๐ถษด ๐ปแดสแดษขสแดแด",
    "Tษชแดสแด :",
    "Bollywood",
    "mkv",
    "Mแดแด ษชแด",
    "ษขสแดแดแด",
    "TGxMALLU_MOVIE",
    "[Tg-@New_Movies_OnTG]",
    "[@ClipmateEmpire]",
    "@Horek_Rokom2020",
    "ClipmateEmpire"
]
def remove_words(text, words_to_remove):
    """Delete every occurrence of the given words/phrases from *text*.

    Args:
        text: The string to clean.
        words_to_remove: Iterable of literal strings to delete. Entries are
            matched case-sensitively and are never treated as regex syntax.
            Empty entries and duplicates are ignored.

    Returns:
        *text* with all entries removed, runs of whitespace collapsed to a
        single space and leading/trailing whitespace stripped.
    """
    # Longest entries first: regex alternation takes the first alternative
    # that matches, so a short entry would otherwise win over a longer one
    # that starts with it and leave the tail behind.
    candidates = sorted(
        dict.fromkeys(word for word in words_to_remove if word),
        key=len,
        reverse=True,
    )

    def guarded(word):
        # Only demand a word boundary on a side where the entry itself has a
        # word character. A plain \b next to "[", "@" or ":" would require an
        # adjacent word character in the text and almost never match.
        prefix = r'(?<!\w)' if re.match(r'\w', word[0]) else ''
        suffix = r'(?!\w)' if re.match(r'\w', word[-1]) else ''
        return prefix + re.escape(word) + suffix

    cleaned_text = text
    if candidates:
        pattern = '|'.join(guarded(word) for word in candidates)
        cleaned_text = re.sub(pattern, '', cleaned_text)

    # Removing words leaves gaps behind; squeeze them into single spaces.
    return re.sub(r'\s+', ' ', cleaned_text).strip()
def convert_special_to_normal(text):
    """Reduce *text* to plain ASCII letters, digits and spaces.

    Relies on the ``html`` and ``unicodedata`` standard-library modules being
    imported at module level.

    Args:
        text: Input string, possibly containing HTML entities, accented or
            stylised Unicode characters, and punctuation.

    Returns:
        The string after HTML-unescaping, NFKD normalisation with every
        non-ASCII code point dropped, and removal of every character that is
        not ``A-Z``, ``a-z``, ``0-9`` or a space.
    """
    # Turn entities such as "&amp;" back into the characters they stand for.
    text = html.unescape(text)
    # NFKD splits accented/compatibility characters into a base character plus
    # marks; encoding to ASCII with errors ignored then discards whatever has
    # no ASCII form. The bytes are pure ASCII, so decode them as such.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
    # Drop everything except ASCII alphanumerics and the space character.
    text = re.sub(r'[^A-Za-z0-9 ]+', '', text)
    return text
def clean_string_special_old(input_string):
    """Delete punctuation and symbol characters from *input_string*.

    Every run of characters that is neither a regex word character (letter,
    digit or underscore) nor whitespace is removed outright. Underscores and
    all whitespace are kept as they are.
    """
    punctuation_runs = re.compile(r'[^\w\s]+')
    return punctuation_runs.sub('', input_string)
def clean_string_special(input_string):
    """Replace separators in *input_string* with single spaces.

    Each run of underscores and/or non-word characters (which includes
    whitespace) becomes exactly one space. A run at the very start or end of
    the string therefore leaves a leading or trailing space.
    """
    # Splitting on the separator runs and re-joining with a space is the same
    # as substituting each run with one space.
    pieces = re.split(r'[_\W]+', input_string)
    return ' '.join(pieces)
def clean_text(input_text):
    """Strip newlines, ``@`` signs and emoji/symbol characters from text.

    Newline and ``@`` characters are deleted (not replaced by a space), then
    every character inside the Unicode ranges listed below is removed.
    """
    stripped = input_text
    for unwanted in ('\n', '@'):
        stripped = stripped.replace(unwanted, '')

    emoji_class = (
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F700-\U0001F77F"  # alchemical symbols
        "\U00002600-\U000026FF"  # Miscellaneous Symbols
        "\U00002700-\U000027BF"  # Dingbats
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U0001F1E0-\U0001F1FF"  # Flags (iOS)
        "]+"
    )
    return re.sub(emoji_class, r'', stripped, flags=re.UNICODE)
def Get_Title_Year(name):
    """Extract a cleaned title and a four-digit year from a raw name.

    The name is first passed through ``remove_words`` with the module-level
    ``words_to_remove`` list. The title is then the text that precedes the
    first run of four digits found after at least one character, with any
    whitespace, dots or parentheses directly before the digits skipped.

    Args:
        name: Raw name string, e.g. a file name or caption.

    Returns:
        A ``(title, year)`` tuple where ``title`` has been passed through
        ``clean_string_special`` and stripped, and ``year`` is an ``int``;
        ``(None, None)`` when no four-digit run is found.
    """
    # Use the shared module-level list instead of a private copy, so the two
    # cannot drift apart.
    name = remove_words(name, words_to_remove)
    match = re.search(r'(?P<title>.+?)[\s\.\(\)]*(?P<year>\d{4})', name)
    if match is None:
        return None, None
    # Strip after cleaning: clean_string_special turns punctuation at the
    # edges of the title into spaces, which would otherwise be left on.
    title = clean_string_special(match.group('title')).strip()
    return title, int(match.group('year'))
|