Spaces:
				
			
			
	
			
			
		Build error
		
	
	
	
			
			
	
	
	
	
		
		
		Build error
		
	File size: 3,118 Bytes
			
"""Split text to sentences.
Use sentence_splitter if supported,
else use polyglot.text.Text
!apt install libicu-dev
!pip install pyicu pycld2 Morfessor
!pip install polyglot sentence_splitter
"""
# pylint: disable=
from typing import List, Optional, Union
import re
from tqdm.auto import tqdm
from polyglot.detect.base import logger as polyglot_logger
from polyglot.text import Detector, Text
from sentence_splitter import split_text_into_sentences
from logzero import logger
# Turn off polyglot.text.Detector warnings: raise the threshold of
# polyglot's detector logger so only ERROR and above get through.
polyglot_logger.setLevel("ERROR")
# fmt: off
# Language codes that are handed to sentence_splitter; text in any other
# language is split with polyglot instead.
LANG_S = [
    "ca", "cs", "da", "nl", "en", "fi",
    "fr", "de", "el", "hu", "is", "it",
    "lv", "lt", "no", "pl", "pt", "ro",
    "ru", "sk", "sl", "es", "sv", "tr",
]
def _seg_text(
        text: str,
        lang: Optional[str] = None,
        # qmode: bool = False,
        maxlines: int = 1000
) -> List[str]:
    # fmt: on
    """Split text to sentences.
    Use sentence_splitter if supported,
    else use polyglot.text.Text.sentences
    Blank lines will be removed.
    qmode: quick mode, skip split_text_into_sentences if True, default False
        vectors for all books are based on qmode=False.
        qmode=True is for quick test purpose only
    maxlines (default 1000), threshold for turn on tqdm progressbar
        set to <1 or a large number to turn it off
    """
    if lang is None:
        try:
            lang = Detector(text).language.code
        except Exception as exc:
            logger.info("text[:30]: %s", text[:30])
            logger.warning(
                "polyglot.text.Detector exc: %s, setting to 'en'",
                exc
            )
            lang = "en"
    # if not qmode and lang in LANG_S:
    if lang in LANG_S:
        _ = []
        lines = text.splitlines()
        # if maxlines > 1 and len(lines) > maxlines:
        if len(lines) > maxlines > 1:
            for para in tqdm(lines):
                if para.strip():
                    _.extend(split_text_into_sentences(para, lang))
        else:
            for para in lines:
                if para.strip():
                    _.extend(split_text_into_sentences(para, lang))
        return _
        # return split_text_into_sentences(text, lang)
    # empty "" text or blank to avoid Exception
    if not text.strip():
        return []
    return [elm.string for elm in Text(text, lang).sentences]
# fmt: off
def seg_text(
        lst: Union[str, List[str]],
        lang: Optional[str] = None,
        maxlines: int = 1000,
        extra: Optional[str] = None,
) -> List[str]:
    # fmt:on
    """Split one text or a list of texts into sentences.

    Arguments:
        lst: a single text or a list of texts
        lang: language code, passed to _seg_text for every text
        maxlines: passed to _seg_text for every text
        extra: optional regex; a newline is inserted after each match
            before the text is split

    Returns:
        flat list of the sentences from all texts, in input order.
    """
    # A bare string is treated as a one-element list.
    texts = [lst] if isinstance(lst, str) else lst

    if extra:
        # Keep the matched text and append a line break right after it.
        pattern = rf"({extra})"
        texts = [re.sub(pattern, r"\1\n", text) for text in texts]

    return [
        sent
        for text in texts
        for sent in _seg_text(text, lang=lang, maxlines=maxlines)
    ]
 | 
