Subsystem_OS_Command_Access

Build error

File size: 3,118 Bytes

57b1c4f
 
 
 
 
 
 
 
 
a4a35d8
57b1c4f
a4a35d8
 
 
57b1c4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a4a35d8
57b1c4f
 
a4a35d8
57b1c4f
 
 
a4a35d8
57b1c4f
 
 
a4a35d8
57b1c4f
a4a35d8
57b1c4f
 
 
a4a35d8
57b1c4f
 
 
 
 
 
a4a35d8
 
 
 
 
57b1c4f
 
a4a35d8
 
57b1c4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a4a35d8
 
 
 
57b1c4f
a4a35d8

"""Split text to sentences.

Use sentence_splitter if supported,
else use polyglot.text.Text

!apt install libicu-dev
!pip install pyicu pycld2 Morfessor
!pip install polyglot sentence_splitter
"""
# pylint: disable=

from typing import List, Optional, Union

import re
from tqdm.auto import tqdm
from polyglot.detect.base import logger as polyglot_logger
from polyglot.text import Detector, Text
from sentence_splitter import split_text_into_sentences

from logzero import logger

# Turn off polyglot.text.Detector warnings: raise the polyglot detector
# logger's threshold to ERROR.
polyglot_logger.setLevel("ERROR")


# fmt: off
# Language codes that are routed to sentence_splitter in _seg_text;
# any other language falls back to polyglot.text.Text.
LANG_S = ["ca", "cs", "da", "nl", "en", "fi", "fr", "de",
          "el", "hu", "is", "it", "lv", "lt", "no", "pl",
          "pt", "ro", "ru", "sk", "sl", "es", "sv", "tr"]


def _seg_text(
        text: str,
        lang: Optional[str] = None,
        # qmode: bool = False,
        maxlines: int = 1000
) -> List[str]:
    # fmt: on
    """Split text to sentences.

    Use sentence_splitter if supported,
    else use polyglot.text.Text.sentences
    Blank lines will be removed.

    qmode: quick mode, skip split_text_into_sentences if True, default False
        vectors for all books are based on qmode=False.
        qmode=True is for quick test purpose only

    maxlines (default 1000), threshold for turn on tqdm progressbar
        set to <1 or a large number to turn it off
    """
    if lang is None:
        try:
            lang = Detector(text).language.code
        except Exception as exc:
            logger.info("text[:30]: %s", text[:30])
            logger.warning(
                "polyglot.text.Detector exc: %s, setting to 'en'",
                exc
            )
            lang = "en"

    # if not qmode and lang in LANG_S:
    if lang in LANG_S:
        _ = []
        lines = text.splitlines()
        # if maxlines > 1 and len(lines) > maxlines:
        if len(lines) > maxlines > 1:
            for para in tqdm(lines):
                if para.strip():
                    _.extend(split_text_into_sentences(para, lang))
        else:
            for para in lines:
                if para.strip():
                    _.extend(split_text_into_sentences(para, lang))
        return _

        # return split_text_into_sentences(text, lang)

    # empty "" text or blank to avoid Exception
    if not text.strip():
        return []

    return [elm.string for elm in Text(text, lang).sentences]


# fmt: off
def seg_text(
        lst: Union[str, List[str]],
        lang: Optional[str] = None,
        maxlines: int = 1000,
        extra: Optional[str] = None,
) -> List[str]:
    # fmt: on
    """Split a text or a list of texts to sentences.

    Arguments:
        lst: text or text list; a single str is treated as a
            one-element list.
        lang: language code passed on to _seg_text.
        maxlines: progressbar threshold passed on to _seg_text.
        extra: optional regex; a newline is inserted after every
            match before the text is handed to _seg_text.
    Returns:
        list of splitted text, in input order.
    """
    texts = [lst] if isinstance(lst, str) else lst

    if extra:
        # keep the matched text and append \n to it
        pattern = rf"({extra})"
        texts = [re.sub(pattern, r"\1\n", item) for item in texts]

    return [
        sent
        for item in texts
        for sent in _seg_text(item, lang=lang, maxlines=maxlines)
    ]