Spaces:
Running
on
L40S
Running
on
L40S
File size: 1,858 Bytes
05d3571 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
import emoji
def legacy_demojizer(x: str) -> str:
    """Return *x* without the characters that ``emoji.is_emoji`` accepts.

    Each character is tested on its own, so the check is per character and
    not per multi-character sequence.
    """
    kept_chars = (ch for ch in x if not emoji.is_emoji(ch))
    return "".join(kept_chars)
class Demojizer:
    """Replace emoji found in a string with a caller-supplied replacement.

    Matching uses a character trie built from
    ``emoji.unicode_codes.EMOJI_DATA``.

    based on:
    https://github.com/carpedm20/emoji/blob/d8bbfe455c6fcd12b96ed1dce6e0978fe7a47431/emoji/core.py#L141
    """

    def _get_search_tree(self):
        """Build the trie of emoji keys.

        Every key of ``EMOJI_DATA`` is inserted one character per level.
        The node reached by a key's last character stores that key's value
        under ``"data"``, which marks the node as the end of a complete
        emoji.
        """
        search_tree = {}
        for emj, data in emoji.unicode_codes.EMOJI_DATA.items():
            if not emj:
                # An empty key has no last character, so it marks nothing.
                continue
            node = search_tree
            for char in emj:
                node = node.setdefault(char, {})
            node["data"] = data
        return search_tree

    def __init__(self) -> None:
        """Build the trie once and keep it on the instance."""
        self.search_tree = self._get_search_tree()

    def __call__(self, string: str, replace_str: str) -> str:
        """Return *string* with every emoji replaced by *replace_str*.

        At each position the longest emoji starting there is replaced.
        Whitespace directly following a replaced emoji is dropped, and the
        variation selectors U+FE0E / U+FE0F are never copied to the output.

        Args:
            string: Text to process.
            replace_str: Text emitted in place of each matched emoji.

        Returns:
            The processed text.
        """
        result = []
        i = 0
        length = len(string)
        # True while only whitespace has been seen since the last emoji.
        after_emoji = False
        while i < length:
            consumed = False
            char = string[i]
            if char in self.search_tree:
                node = self.search_tree[char]
                # Exclusive end index of the longest complete emoji so far.
                match_end = i + 1 if "data" in node else None
                j = i + 1
                while j < length and string[j] in node:
                    node = node[string[j]]
                    j += 1
                    if "data" in node:
                        match_end = j
                if match_end is not None:
                    # The walk may have run past the last complete emoji
                    # (e.g. a dangling joiner); resume right after the
                    # match so those characters are scanned again.
                    after_emoji = True
                    consumed = True
                    result.append(replace_str)
                    i = match_end - 1
                else:
                    after_emoji = False
            elif after_emoji:
                if char.isspace():
                    consumed = True
                else:
                    after_emoji = False
            if not consumed and char not in ("\ufe0e", "\ufe0f"):
                result.append(char)
            i += 1
        return "".join(result)
|