Spaces:

privateone
/

teleapi

Running

App Files Files Community

teleapi / FileStream /Tools /cleanup.py

privateone

Update FileStream/Tools/cleanup.py

5560e59 verified 7 months ago

raw

history blame

3.87 kB

	import html
	import re
	import unicodedata

	# Channel watermarks, release-group tags and filler tokens that are stripped
	# from file names/captions before a title and year are extracted.
	# Each entry is matched literally (it is regex-escaped by remove_words).
	words_to_remove = [
	    "Fɪʟᴇ",
	    "Fɪʟᴇ ɴᴀᴍᴇ :",
	    "FC",
	    "HEVC",
	    "ɴᴀᴍᴇ",
	    "Tɪᴛʟᴇ :",
	    "BuLMoviee",
	    # Ends in U+FFFD (replacement character); presumably matches captions
	    # whose last glyph was lost in transit -- confirm before removing.
	    "𝗝𝗼𝗶𝗻 𝗨𝘀 𝗢𝗻 𝗧𝗲𝗹𝗲𝗴𝗿𝗮�",
	    "𝗝𝗼𝗶𝗻 𝗨𝘀 𝗢𝗻 𝗧𝗲𝗹𝗲𝗴𝗿𝗮𝗺",
	    "SIDHUU 591",
	    "𝑱𝒐𝒊𝒏 𝑼𝒔 𝑶ɴ 𝑻ᴇʟᴇɢʀᴀᴍ",
	    "Bollywood",
	    "mkv",
	    "Mᴏᴠɪᴇ",
	    "ɢʀᴏᴜᴘ",
	    "TGxMALLU_MOVIE",
	    "[Tg-@New_Movies_OnTG]",
	    "[@ClipmateEmpire]",
	    "@Horek_Rokom2020",
	    "ClipmateEmpire",
	]

	def remove_words(text, words_to_remove):
	    """Delete every whole-token occurrence of the given words from *text*.

	    Args:
	        text: String to clean.
	        words_to_remove: Iterable of literal strings to delete. Entries are
	            regex-escaped, so they are never interpreted as patterns. Empty
	            strings and duplicates are ignored.

	    Returns:
	        *text* without the listed words, with every run of whitespace
	        collapsed to a single space and leading/trailing whitespace removed.
	    """
	    # Longest first: regex alternation takes the first alternative that
	    # matches, so a phrase must be tried before any shorter entry that is
	    # a prefix of it.
	    candidates = sorted(
	        dict.fromkeys(word for word in words_to_remove if word),
	        key=len,
	        reverse=True,
	    )
	    if candidates:
	        # Join with a bare '|'. An escaped pipe would match a literal '|'
	        # character and fuse the whole list into one unmatchable literal.
	        pattern = '|'.join(_word_alternative(word) for word in candidates)
	        text = re.sub(pattern, '', text)
	    # Remove extra spaces that might have been left after removing the words
	    return re.sub(r'\s+', ' ', text).strip()


	def _word_alternative(word):
	    """Return a regex fragment matching *word* literally as a whole token."""
	    # Guard an edge only when that edge is a word character. A plain \b next
	    # to punctuation (e.g. a leading '[' or trailing ':') would instead
	    # demand an adjacent word character and the entry would rarely match.
	    lead = r'(?<!\w)' if re.match(r'\w', word[0]) else ''
	    trail = r'(?!\w)' if re.match(r'\w', word[-1]) else ''
	    return lead + re.escape(word) + trail

	def convert_special_to_normal(text):
	    """Reduce *text* to plain ASCII letters, digits and spaces.

	    Requires the ``html`` and ``unicodedata`` standard-library modules to be
	    imported at module level.

	    Args:
	        text: Input string; may contain HTML entities and stylised or
	            accented Unicode characters.

	    Returns:
	        The text with HTML entities decoded, characters folded to their
	        ASCII compatibility form where one exists, and every remaining
	        character other than A-Z, a-z, 0-9 and space removed. Spaces are
	        not collapsed.
	    """
	    # Unescape HTML entities
	    text = html.unescape(text)
	    # NFKD splits accented/stylised characters into an ASCII base plus
	    # combining marks; encoding with 'ignore' then drops everything that
	    # has no ASCII representation.
	    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
	    # Remove special characters
	    return re.sub(r'[^A-Za-z0-9 ]+', '', text)

	def clean_string_special_old(input_string):
	    """Strip every character that is neither a word character nor whitespace.

	    Word characters (letters, digits and the underscore, including
	    non-ASCII ones) and all whitespace are kept as they are; everything
	    else is deleted without leaving a replacement.
	    """
	    unwanted = re.compile(r'[^\w\s]+')
	    return unwanted.sub('', input_string)

	def clean_string_special(input_string):
	    """Replace each run of underscores and non-word characters with one space.

	    Whitespace counts as non-word, so runs of spaces are collapsed too.
	    The result is not stripped: a separator at either end leaves a space.
	    """
	    separators = re.compile(r'[_\W]+')
	    return separators.sub(' ', input_string)

	def clean_text(input_text):
	    """Drop newlines, '@' signs and emoji/symbol characters from *input_text*.

	    Newlines and '@' are deleted outright (nothing is inserted in their
	    place), then every run of characters in the emoji ranges below is
	    removed.
	    """
	    emoji_ranges = (
	        "\U0001F600-\U0001F64F",  # emoticons
	        "\U0001F300-\U0001F5FF",  # symbols & pictographs
	        "\U0001F680-\U0001F6FF",  # transport & map symbols
	        "\U0001F700-\U0001F77F",  # alchemical symbols
	        "\U00002600-\U000026FF",  # Miscellaneous Symbols
	        "\U00002700-\U000027BF",  # Dingbats
	        "\U0001F900-\U0001F9FF",  # Supplemental Symbols and Pictographs
	        "\U0001FA70-\U0001FAFF",  # Symbols and Pictographs Extended-A
	        "\U0001F1E0-\U0001F1FF",  # Flags (iOS)
	    )
	    emoji_pattern = re.compile("[" + "".join(emoji_ranges) + "]+", flags=re.UNICODE)
	    without_markers = input_text.replace('\n', '').replace('@', '')
	    return emoji_pattern.sub(r'', without_markers)


	def Get_Title_Year(name):
	    """Extract a ``(title, year)`` pair from a raw file name or caption.

	    The module-level ``words_to_remove`` entries are stripped first, then
	    the text before the first standalone four-digit number is taken as the
	    title and that number as the year.

	    Args:
	        name: Raw file name or caption text.

	    Returns:
	        ``(title, year)`` where *title* has underscores and punctuation
	        replaced by single spaces and is trimmed, and *year* is an ``int``;
	        ``(None, None)`` when no four-digit number preceded by at least one
	        character is found.
	    """
	    # Use the shared module-level list instead of a second local copy, so
	    # the two cannot drift apart.
	    name = remove_words(name, words_to_remove)
	    # The digit guards keep the year from being sliced out of a longer
	    # number (e.g. the first four digits of "12345").
	    match = re.search(
	        r'(?P<title>.+?)[\s\.\(\)]*(?<!\d)(?P<year>\d{4})(?!\d)', name)
	    if match is None:
	        return None, None
	    # Strip after cleaning: punctuation at either end of the raw title
	    # becomes a space that would otherwise be left on the result.
	    title = clean_string_special(match.group('title')).strip()
	    return title, int(match.group('year'))