import os
from typing import Dict, List, Optional

import jieba
import nltk

# Bundled data lives in <project root>/resources, one level above this file.
resource_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "resources")
class NLTKHelper:
    """Lazily loads and caches NLTK stopword lists per language."""

    # Cache of stopword lists keyed by NLTK language name; None until first use.
    _stopwords: Dict[str, Optional[List[str]]] = {
        "english": None,
        "chinese": None,
    }

    def __init__(self):
        # Build jieba's dictionary up front so the first Chinese cut is not slow.
        jieba.initialize()

    def get_stopwords(self, lang: str) -> List[str]:
        # Make the bundled NLTK data visible to the resolver, without
        # appending a duplicate entry on every call.
        nltk_data_dir = os.path.join(resource_path, "nltk_data")
        if nltk_data_dir not in nltk.data.path:
            nltk.data.path.append(nltk_data_dir)
        if self._stopwords[lang] is None:
            # Download the stopwords corpus on first use if it is missing.
            try:
                nltk.data.find("corpora/stopwords")
            except LookupError:
                nltk.download("stopwords", download_dir=nltk_data_dir)
            self._stopwords[lang] = nltk.corpus.stopwords.words(lang)
        return self._stopwords[lang]
def word_tokenize(text: str, lang: str) -> List[str]:
    """Tokenize text with jieba for Chinese, NLTK's punkt model otherwise."""
    if lang == "zh":
        # Chinese has no whitespace word boundaries, so use jieba's segmenter.
        return jieba.lcut(text)
    nltk_data_dir = os.path.join(resource_path, "nltk_data")
    if nltk_data_dir not in nltk.data.path:
        nltk.data.path.append(nltk_data_dir)
    # Download the punkt_tab tokenizer models on first use if they are missing.
    try:
        nltk.data.find("tokenizers/punkt_tab")
    except LookupError:
        nltk.download("punkt_tab", download_dir=nltk_data_dir)
    return nltk.word_tokenize(text)
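
# Minimal usage sketch, assuming network access for the first-time NLTK
# downloads; the sample strings below are illustrative placeholders.
if __name__ == "__main__":
    helper = NLTKHelper()
    # English stopwords are downloaded (if needed) and cached on first call.
    print(helper.get_stopwords("english")[:5])
    # Non-"zh" languages go through NLTK's punkt tokenizer.
    print(word_tokenize("NLTK makes tokenization easy.", lang="en"))
    # "zh" routes to jieba's segmenter.
    print(word_tokenize("自然语言处理很有趣。", lang="zh"))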