Spaces:
Runtime error
Runtime error
| """Convert two lists of str (texts) to correlation matrix.""" | |
| # pylint: disable=too-many-arguments, too-many-locals, unused-import | |
| from typing import Dict, Iterable, List, Optional, Union # noqa | |
| import numpy as np | |
| from textacy.representations import Vectorizer | |
| from fastlid import fastlid | |
| from radiobee.en2zh_tokens import en2zh_tokens | |
| from radiobee.insert_spaces import insert_spaces | |
| from radiobee.gen_model import gen_model | |
| from radiobee.smatrix import smatrix | |
| # fmt: off | |
def lists2cmat(
    text1: Union[str, Iterable[str]],
    text2: Union[str, Iterable[str]],
    lang1: Optional[str] = None,
    lang2: Optional[str] = None,
    model: Optional[Vectorizer] = None,
    tf_type: str = "linear",
    idf_type: Optional[str] = "smooth",
    dl_type: Optional[str] = None,
    norm: Optional[str] = "l2",
    min_df: Union[int, float] = 1,
    max_df: Union[int, float] = 1.0,
    max_n_terms: Optional[int] = None,
    vocabulary_terms: Optional[Union[Dict[str, int], Iterable[str]]] = None
) -> np.ndarray:
    # fmt: on
    """Convert two lists of texts to a correlation matrix (cmat).

    Each side is tokenized according to its language code, then both token
    lists are handed to ``smatrix`` together with the weighting options.

    Args:
        text1: a single str or an iterable of str; a bare str is treated as
            a one-element list.
        text2: same as ``text1``, for the second side.
        lang1: optional language code of ``text1``; when None it is detected
            with fastlid, restricted to "en" and "zh".
        lang2: optional language code of ``text2``; detected like ``lang1``.
        model: optional model; when None one is built with ``gen_model``
            from the tokens of ``text1``.
        tf_type: term frequency type, passed through to ``smatrix``.
        idf_type: idf type, passed through to ``smatrix``.
        dl_type: doc length type, passed through to ``smatrix``.
        norm: norm, passed through to ``smatrix``.
        min_df: min doc freq, passed through to ``smatrix``.
        max_df: max doc freq, passed through to ``smatrix``.
        max_n_terms: max n terms, passed through to ``smatrix``.
        vocabulary_terms: vocabulary, passed through to ``smatrix``.

    Returns:
        The result of ``smatrix`` converted to a ``np.ndarray``.
    """
    # Both inputs are read twice (language detection, then tokenizing), so
    # materialize them: a one-shot iterator would be empty on the 2nd pass.
    text1 = [text1] if isinstance(text1, str) else list(text1)
    text2 = [text2] if isinstance(text2, str) else list(text2)

    if lang1 is None or lang2 is None:
        # fastlid.set_languages is module-level state: narrow it to en/zh
        # for the detection only and always put the previous value back,
        # even if detection raises.
        saved_languages = fastlid.set_languages
        fastlid.set_languages = ["en", "zh"]
        try:
            if lang1 is None:
                lang1, _ = fastlid(" ".join(text1))
            if lang2 is None:
                lang2, _ = fastlid(" ".join(text2))
        finally:
            fastlid.set_languages = saved_languages

    def zh_tokens(textzh):
        """Split each text into tokens after ``insert_spaces``."""
        return [insert_spaces(elm).split() for elm in textzh]

    if lang1 == "zh" and lang2 == "en":
        vec1 = zh_tokens(text1)
        vec2 = en2zh_tokens(text2)
    elif lang1 == "zh" and lang2 == "zh":
        vec1 = zh_tokens(text1)
        vec2 = zh_tokens(text2)
    elif lang1 == "en" and lang2 == "en":
        vec1 = en2zh_tokens(text1)
        vec2 = en2zh_tokens(text2)
    else:
        # NOTE(review): intended for lang1 == "en" and lang2 == "zh", but any
        # other combination (e.g. an unrecognized code passed explicitly)
        # also lands here -- confirm that fallback is wanted.
        vec1 = en2zh_tokens(text1)
        vec2 = zh_tokens(text2)

    if model is None:
        model = gen_model(vec1)

    cmat = smatrix(
        vec1,
        vec2,
        model=model,
        tf_type=tf_type,
        idf_type=idf_type,
        dl_type=dl_type,
        norm=norm,
        min_df=min_df,
        max_df=max_df,
        max_n_terms=max_n_terms,
        vocabulary_terms=vocabulary_terms,
    )
    return np.array(cmat)