File size: 3,494 Bytes
bccd6e8
 
4ba22a4
 
5cb1a08
 
bccd6e8
 
 
 
 
 
677f95f
bccd6e8
 
 
 
 
4ba22a4
 
 
 
 
 
bccd6e8
 
 
 
 
 
 
 
 
 
 
6f41e86
b496854
5cb1a08
bccd6e8
5cb1a08
bccd6e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b496854
 
 
 
 
 
 
 
 
bccd6e8
 
8ccf616
 
 
 
 
 
 
 
 
 
 
 
 
bccd6e8
 
fc2ede0
bccd6e8
 
 
 
 
b496854
 
 
 
5cb1a08
fc2ede0
 
 
 
 
 
 
5cb1a08
bccd6e8
145bd44
5cb1a08
bccd6e8
 
 
 
 
 
 
46a9657
b496854
5cb1a08
bccd6e8
964db97
 
 
145bd44
bccd6e8
a943def
bccd6e8
b496854
5cb1a08
 
bccd6e8
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
"""Create entry."""
# pylint: disable=invalid-name
import os
import time
from pathlib import Path
from typing import Optional, Tuple

import gradio as gr
import pandas as pd
from about_time import about_time
from aset2pairs import aset2pairs
from cmat2aset import cmat2aset
from logzero import logger
from seg_text import seg_text

from radio_mlbee import __version__
from radio_mlbee.gen_cmat import gen_cmat
from radio_mlbee.utils import text1, text2

# Pin the process timezone so timestamps are rendered in Asia/Shanghai time.
os.environ.update(TZ="Asia/Shanghai")
try:
    # Re-read TZ from the environment; best effort only.
    time.tzset()  # type: ignore
except Exception as exc:
    logger.warning(
        "time.tzset() error: %s. Probably running Windows, we let it pass.",
        exc,
    )


def greet(name):
    """Return a placeholder greeting for *name*.

    Any falsy *name* (empty string, ``None``) is greeted as "world".
    """
    who = name or "world"
    return "Hello " + who + "!! (coming sooooon...)"


def _to_paras(text) -> list:
    """Split *text* into stripped, non-empty lines (one block per line)."""
    # An unset input must not turn into the literal paragraph "None".
    if text is None:
        return []
    stripped = (line.strip() for line in str(text).splitlines())
    return [line for line in stripped if line]


def ml_fn(
    text1: str,
    text2: str,
    split_to_sents: bool = False,
    preview: bool = False,
    download_csv: bool = False,
) -> Tuple[pd.DataFrame, Optional[str], Optional[Path]]:
    """Align multilingual (50+ pairs) text1 text2.

    Args:
        text1: first text; every non-blank line is one block to align.
        text2: second text; treated the same way as ``text1``.
        split_to_sents: if true, both block lists are passed through
            ``seg_text`` before aligning (errors are logged and the
            unsplit blocks are kept).
        preview: if true, also render the aligned frame as HTML.
        download_csv: if true, also write the aligned frame to
            ``aligned-blocks.csv`` in the current working directory.

    Returns:
        A ``(df, html, dl_csv)`` tuple: ``df`` is the aligned frame with
        columns ``text1``, ``text2`` and ``llh``; ``html`` is
        ``df.to_html()`` or ``None`` when ``preview`` is false; ``dl_csv``
        is the path of the written csv, or ``None`` when no csv was
        requested or writing it failed.
    """
    paras1 = _to_paras(text1)
    paras2 = _to_paras(text2)

    if split_to_sents:  # TODO
        # NOTE(review): seg_text is handed the list of blocks, not a single
        # string -- confirm that this is what seg_text expects.
        try:
            paras1 = seg_text(paras1)
        except Exception as exc:
            logger.error(exc)
        try:
            paras2 = seg_text(paras2)
        except Exception as exc:
            logger.error(exc)

    with about_time() as t:
        try:
            cmat = gen_cmat(paras1, paras2)
        except Exception as exc:
            logger.exception(exc)
            logger.info(paras1)
            logger.info(paras2)
            logger.info("len(paras1): %s, len(paras2): %s", len(paras1), len(paras2))
            cmat = [[]]
        try:
            aset = cmat2aset(cmat)
        except Exception as exc:
            logger.exception(exc)
            aset = [["", "", ""]]

    n_blocks = len(paras1) + len(paras2)
    # Both inputs can be empty/blank; do not divide by zero in that case.
    av = f"{t.duration / n_blocks * 1000:.2f}" if n_blocks else "n/a"
    logger.info(
        " %s blocks, took %s, av. %s s/1000 blk", n_blocks, t.duration_human, av
    )

    pairs = aset2pairs(paras1, paras2, aset)
    df = pd.DataFrame(pairs, columns=["text1", "text2", "llh"])

    html = df.to_html() if preview else None

    dl_csv = None
    if download_csv:
        dl_csv = Path("aligned-blocks.csv")
        try:
            # Let pandas write the file itself so line endings are not
            # translated a second time by a text-mode write.
            df.to_csv(dl_csv, index=False, encoding="utf8")
        except Exception as exc:
            logger.exception(exc)
            # Do not hand back a path to a file that was not (re)written.
            dl_csv = None

    return df, html, dl_csv


# Gradio UI around ml_fn: the two text areas and three checkboxes map, in
# order, onto ml_fn's five parameters; the three outputs receive the
# (df, html, dl_csv) tuple that ml_fn returns.
mlbee = gr.Interface(
    fn=ml_fn,
    inputs=[
        "textarea",
        "textarea",
        gr.Checkbox(label="Split to sents?"),
        gr.Checkbox(label="Preview?"),
        gr.Checkbox(label="Download csv?"),
    ],
    # NOTE(review): gr.outputs.File looks like the legacy output namespace;
    # newer gradio releases may expect gr.File -- confirm against the pinned
    # gradio version.
    outputs=["dataframe", "html", gr.outputs.File(
        label="Click to download csv",
    )],
    # outputs="html",
    title=f"radio-mlbee {__version__}",
    description="mlbee rest api on dev ",
    examples=[
        # [text1, text2, False],
        # One example row: the first fifth of text1 and of text2 (both
        # imported from radio_mlbee.utils), with all three options off.
        [text1[: len(text1) // 5], text2[: len(text2) // 5], False, False, False],
    ],
)

# Launch the interface as soon as this module is executed (there is no
# ``if __name__ == "__main__"`` guard around it).
# NOTE(review): enable_queue appears to be deprecated in newer gradio
# releases -- confirm against the pinned gradio version.
mlbee.launch(
    show_error=True,
    enable_queue=True,
)