File size: 1,130 Bytes
844aef2
 
 
 
89d669f
844aef2
 
5ae3f92
89d669f
844aef2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5ae3f92
844aef2
d40a61e
844aef2
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
"""Align via ubee,"""
# pylint: disable=
from typing import Iterable, List, Tuple
from itertools import zip_longest

from logzero import logger
from ubee.uclas import uclas
from icecream import ic


def ubee(
    sents_zh: Iterable[str],
    sents_en: Iterable[str],
    thresh: float = 0.5,
) -> Tuple[List[Tuple[str, str, float]], List[Tuple[str, str]]]:
    """Align blocks.

    Every item of ``sents_zh`` is classified with ``uclas`` against the
    full list of ``sents_en`` items, which serve as the candidate labels.

    Args:
        sents_zh: texts to align; can be any language supported by
            clas-l-user. Each item is passed to ``uclas`` as the
            sequence to classify.
        sents_en: ditto; these texts are used as the candidate labels.
        thresh: threshold forwarded unchanged to ``uclas``.

    Returns:
        A 2-tuple ``(aligned, leftovers)``:

        * aligned: ``(seq, label, likelihood)`` triples, one for each
          item of ``sents_zh`` for which ``uclas`` returned a truthy
          label; ``likelihood`` is rounded to two decimals.
        * leftovers: the unaligned items of ``sents_zh`` paired
          positionally (via ``zip_longest``) with the items of
          ``sents_en`` that were never matched; the shorter side is
          padded with ``None``.
    """
    labels = [*sents_en]

    aligned = []
    leftover_zh = []
    # Every label starts out unaligned; matched labels are removed below.
    leftover_en = labels[:]

    for seq in sents_zh:
        logger.debug("seq: %s", seq)
        label, likelihood = uclas(seq, labels, thresh=thresh)
        if not label:
            leftover_zh.append(seq)
            continue

        # The likelihood is only needed (and only converted) for a match.
        aligned.append((seq, label, round(float(likelihood), 2)))
        try:
            leftover_en.remove(label)
        except ValueError as exc:
            # Each sequence is classified against the full label list, so
            # the same label may be returned more than once; on a repeat
            # it is no longer in leftover_en and list.remove raises.
            logger.error(exc)
            logger.info("seq: %s, label: %s", seq, label)
    return aligned, [*zip_longest(leftover_zh, leftover_en)]