File size: 3,867 Bytes
e566133
 
c140fac
0d8a300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5560e59
 
0d8a300
e566133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a39b92
87159d4
 
 
 
1a39b92
 
 
 
 
e566133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c140fac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5560e59
 
c140fac
 
e566133
87159d4
e566133
87159d4
e566133
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import html
import re
import unicodedata

# Literal words and phrases to strip from raw names (channel tags, codec and
# container labels, "file name"/"title" style prefixes).
# Get_Title_Year defines an identical local copy and passes it to
# remove_words(); nothing in this file reads this module-level list directly.
# NOTE(review): presumably imported by other modules -- verify before removing.
# NOTE(review): several entries look like mis-encoded (mojibake) renderings of
# stylised Unicode text; confirm the file encoding before editing them.
# NOTE(review): one "T... :" entry is listed twice; the duplicate is harmless
# for matching.
words_to_remove = [
        "FษชสŸแด‡",
        "FษชสŸแด‡ ษดแด€แดแด‡ :",
        "FC",
        "HEVC",
        "ษดแด€แดแด‡",
        "Tษชแด›สŸแด‡ :",
        "BuLMoviee",
        "๐—๐—ผ๐—ถ๐—ป ๐—จ๐˜€ ๐—ข๐—ป ๐—ง๐—ฒ๐—น๐—ฒ๐—ด๐—ฟ๐—ฎ๏ฟฝ",
        "๐—๐—ผ๐—ถ๐—ป ๐—จ๐˜€ ๐—ข๐—ป ๐—ง๐—ฒ๐—น๐—ฒ๐—ด๐—ฟ๐—ฎ๐—บ",
        "SIDHUU 591",
        "๐‘ฑ๐’๐’Š๐’ ๐‘ผ๐’” ๐‘ถษด ๐‘ปแด‡สŸแด‡ษขส€แด€แด",
        "Tษชแด›สŸแด‡ :",
        "Bollywood",
        "mkv",
        "Mแดแด ษชแด‡",
        "ษขส€แดแดœแด˜",
        "TGxMALLU_MOVIE",
        "[Tg-@New_Movies_OnTG]",
        "[@ClipmateEmpire]",
        "@Horek_Rokom2020",
        "ClipmateEmpire"
    ]

def remove_words(text, words_to_remove):
    """Delete every occurrence of the given literal words/phrases from ``text``.

    Entries are matched literally (regex metacharacters are escaped) and as
    whole tokens: where an entry starts or ends with a word character, that
    edge must not touch another word character in ``text``. Edges that are
    punctuation (for example ``[``, ``@`` or ``:``) carry no such constraint,
    so entries like ``"[@Tag]"`` are removed even at the start of the string
    or after a space.

    Args:
        text: The string to clean.
        words_to_remove: Iterable of literal words or phrases to delete.
            Empty entries are ignored.

    Returns:
        ``text`` with all matches removed, every whitespace run collapsed to
        a single space, and leading/trailing whitespace stripped.
    """
    # Drop empty entries and duplicates, then try longer phrases first so a
    # short entry cannot consume the prefix of a longer one that also matches.
    unique_words = dict.fromkeys(word for word in words_to_remove if word)
    ordered_words = sorted(unique_words, key=len, reverse=True)

    cleaned_text = text
    if ordered_words:
        alternatives = []
        for word in ordered_words:
            # A plain \b next to punctuation demands an adjacent word
            # character, which made entries such as "[@Tag]" unmatchable in
            # most positions. Only guard the edges that are word characters.
            prefix = r'(?<!\w)' if re.match(r'\w', word[0]) else ''
            suffix = r'(?!\w)' if re.match(r'\w', word[-1]) else ''
            alternatives.append(prefix + re.escape(word) + suffix)
        pattern = '(?:' + '|'.join(alternatives) + ')'
        cleaned_text = re.sub(pattern, '', cleaned_text)

    # Removing words can leave doubled or dangling spaces behind.
    return re.sub(r'\s+', ' ', cleaned_text).strip()

def convert_special_to_normal(text):
    """Reduce ``text`` to plain ASCII letters, digits and spaces.

    Args:
        text: Input string; may contain HTML entities and non-ASCII
            characters.

    Returns:
        The text with HTML entities decoded, characters folded to their
        ASCII compatibility form where one exists, and every remaining
        character other than ``A-Z``, ``a-z``, ``0-9`` and space removed.
        Removed characters are deleted, not replaced, so neighbouring
        spaces are kept as they are.
    """
    # Decode entities such as "&amp;" first so their payload is filtered like
    # any other character.
    text = html.unescape(text)
    # NFKD splits accented/stylised characters into a base character plus
    # combining marks; encoding to ASCII with "ignore" then drops whatever
    # has no ASCII form.
    ascii_bytes = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
    text = ascii_bytes.decode('ascii')
    # Finally strip punctuation and any other leftover symbols.
    return re.sub(r'[^A-Za-z0-9 ]+', '', text)

def clean_string_special_old(input_string):
    """Delete every character that is neither a word character nor whitespace.

    Word characters are letters, digits and the underscore, so underscores
    survive alongside alphanumerics and whitespace.
    """
    unwanted_run = re.compile(r'[^\w\s]+')
    return unwanted_run.sub('', input_string)

def clean_string_special(input_string):
    """Collapse each run of underscores and non-word characters into one space.

    Whitespace counts as a non-word character, so mixed runs of spaces and
    punctuation become a single space. Leading or trailing runs are turned
    into a space as well; they are not stripped.
    """
    separator_run = re.compile(r'[_\W]+')
    return separator_run.sub(' ', input_string)

def clean_text(input_text):
    """Strip newlines, ``@`` signs and emoji/pictographic symbols from text.

    Newlines and ``@`` are deleted outright (no space is inserted), then any
    run of characters from the listed symbol ranges is removed.
    """
    # Delete both unwanted characters in a single pass.
    without_markers = input_text.translate({ord('\n'): None, ord('@'): None})
    symbol_ranges = (
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F700-\U0001F77F"  # alchemical symbols
        "\U00002600-\U000026FF"  # Miscellaneous Symbols
        "\U00002700-\U000027BF"  # Dingbats
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U0001F1E0-\U0001F1FF"  # Flags (iOS)
    )
    return re.sub("[" + symbol_ranges + "]+", r'', without_markers, flags=re.UNICODE)


def Get_Title_Year(name):
    """Extract a cleaned title and a four-digit year from a raw name string.

    The name is first stripped of the module-level ``words_to_remove``
    entries. The title is everything before the first standalone year
    (a 19xx or 20xx number that is not part of a longer digit run), with
    punctuation and underscores turned into spaces and surrounding
    whitespace removed.

    Args:
        name: Raw name string to parse.

    Returns:
        A ``(title, year)`` tuple with ``year`` as an ``int``, or
        ``(None, None)`` when ``name`` is empty/``None`` or no year is found.
    """
    if not name:
        return None, None

    # Use the shared module-level list rather than a duplicated local copy so
    # the two cannot drift apart.
    name = remove_words(name, words_to_remove)

    # Restrict the year to 19xx/20xx bounded by non-digits; a bare \d{4}
    # also matched things like the "1080" in "1080p" or part of a longer
    # number.
    match = re.search(
        r'(?P<title>.+?)[\s\.\(\)]*(?<!\d)(?P<year>(?:19|20)\d{2})(?!\d)',
        name,
    )
    if match is None:
        return None, None

    # Strip after cleaning: punctuation at the edges of the title is turned
    # into spaces by clean_string_special and would otherwise be returned.
    title = clean_string_special(match.group('title')).strip()
    return title, int(match.group('year'))