GraphGen / graphgen /utils /help_nltk.py
chenzihong-gavin
init
acd7cf4
import os
from typing import Dict, List, Optional
import nltk
import jieba
# Bundled resources live in "<parent of this file's directory>/resources".
resource_path = os.path.join(
    os.path.dirname(os.path.dirname(__file__)),
    "resources",
)
class NLTKHelper:
    """Helper around NLTK stopwords/tokenization, using jieba for Chinese text.

    NLTK resources are searched for in, and downloaded on demand into, the
    ``nltk_data`` directory located under the module-level ``resource_path``.
    """

    # Stopword lists cached per NLTK language name. The dict lives on the
    # class, so the cache is shared by every instance. ``None`` marks a
    # language whose list has not been loaded yet.
    _stopwords: Dict[str, Optional[List[str]]] = {
        "english": None,
        "chinese": None,
    }

    # Short language codes accepted as aliases for the NLTK corpus names, so
    # that both methods of this class understand the same identifiers.
    _LANG_ALIASES: Dict[str, str] = {
        "en": "english",
        "zh": "chinese",
    }

    def __init__(self):
        """Create the helper and call ``jieba.initialize()`` up front."""
        jieba.initialize()

    @staticmethod
    def _nltk_data_dir() -> str:
        """Return the project NLTK data directory, registering it with NLTK once."""
        data_dir = os.path.join(resource_path, "nltk_data")
        # Only append when missing: appending on every call would make
        # ``nltk.data.path`` grow without bound with duplicate entries.
        if data_dir not in nltk.data.path:
            nltk.data.path.append(data_dir)
        return data_dir

    @classmethod
    def _ensure_resource(cls, resource: str, package: str) -> None:
        """Download NLTK ``package`` into the data directory if ``resource`` is missing."""
        data_dir = cls._nltk_data_dir()
        try:
            nltk.data.find(resource)
        except LookupError:
            # NOTE(review): the return value of nltk.download is not checked;
            # a failed download presumably surfaces when the resource is
            # read afterwards -- confirm.
            nltk.download(package, download_dir=data_dir)

    def get_stopwords(self, lang: str) -> List[str]:
        """Return the NLTK stopword list for ``lang``, loading it on first use.

        Args:
            lang: NLTK stopwords language name (e.g. ``"english"``,
                ``"chinese"``). The short codes ``"en"`` and ``"zh"`` are
                accepted as aliases. Languages that are not pre-seeded in the
                cache are looked up in the NLTK corpus instead of raising
                ``KeyError``.

        Returns:
            The cached list of stopwords. The same list object is returned on
            every call, so callers should not mutate it.
        """
        lang = self._LANG_ALIASES.get(lang, lang)
        cached = self._stopwords.get(lang)
        if cached is not None:
            return cached

        self._ensure_resource("corpora/stopwords", "stopwords")
        words = nltk.corpus.stopwords.words(lang)
        self._stopwords[lang] = words
        return words

    @staticmethod
    def word_tokenize(text: str, lang: str) -> List[str]:
        """Split ``text`` into tokens.

        Args:
            text: The text to tokenize.
            lang: ``"zh"`` (or ``"chinese"``, matching ``get_stopwords``)
                selects ``jieba.lcut``; any other value uses
                ``nltk.word_tokenize`` with its default settings.

        Returns:
            The list of tokens.
        """
        if lang in ("zh", "chinese"):
            return jieba.lcut(text)

        NLTKHelper._ensure_resource("tokenizers/punkt_tab", "punkt_tab")
        return nltk.word_tokenize(text)