File size: 3,313 Bytes
138996e
 
b27ddf3
138996e
 
 
 
01f1fbb
138996e
 
 
 
 
 
55b7ba9
 
138996e
 
 
 
 
 
 
55b7ba9
 
138996e
 
 
 
01f1fbb
138996e
 
 
 
 
 
 
 
 
 
 
 
b27ddf3
138996e
 
 
b27ddf3
55b7ba9
 
 
 
 
 
 
 
 
 
 
01f1fbb
 
 
55b7ba9
 
 
 
 
 
 
 
01f1fbb
55b7ba9
 
 
 
138996e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import json

import gradio as gr
from dingo.exec import Executor
from dingo.io import InputArgs


def dingo_demo(input_path, data_format, column_content, rule_list, prompt_list, model, key, api_url):
    if not input_path:
        return 'ValueError: input_path can not be empty, please input.'
    if not data_format:
        return 'ValueError: data_format can not be empty, please input.'
    if not column_content:
        return 'ValueError: column_content can not be empty, please input.'
    if not rule_list and not prompt_list:
        return 'ValueError: rule_list and prompt_list can not be empty at the same time.'

    input_data = {
        "input_path": input_path,
        "data_format": data_format,
        "column_content": column_content,
        "custom_config":
            {
                "rule_list": rule_list,
                "prompt_list": prompt_list,
                "llm_config":
                    {
                        "detect_text_quality_detail":
                            {
                                "model": model,
                                "key": key,
                                "api_url": api_url,
                            }
                    }
            }
    }
    input_args = InputArgs(**input_data)
    executor = Executor.exec_map["local"](input_args)
    result = executor.execute()
    summary = result[0].to_dict()
    return json.dumps(summary, indent=4)


if __name__ == '__main__':
    rule_options = ['RuleAbnormalChar', 'RuleAbnormalHtml', 'RuleContentNull', 'RuleContentShort', 'RuleEnterAndSpace', 'RuleOnlyUrl']
    prompt_options = ['PromptRepeat', 'PromptContentChaos']

    with open("header.html", "r") as file:
        header = file.read()
    with gr.Blocks() as demo:
        gr.HTML(header)
        with gr.Row():
            with gr.Column():
                input_path = gr.Textbox(value='chupei/format-jsonl', placeholder="please input huggingface dataset path", label="input_path")
                data_format = gr.Dropdown(["jsonl", "json", "plaintext", "listjson"], label="data_format")
                column_content = gr.Textbox(value="content", placeholder="please input column name of content in dataset", label="column_content")
                rule_list = gr.CheckboxGroup(choices=rule_options, label="rule_list")
                prompt_list = gr.CheckboxGroup(choices=prompt_options, label="prompt_list")
                model = gr.Textbox(placeholder="If want to use llm, please input model, such as: deepseek-chat", label="model")
                key = gr.Textbox(placeholder="If want to use llm, please input key, such as: 123456789012345678901234567890xx", label="key")
                api_url = gr.Textbox(placeholder="If want to use llm, please input api_url, such as: https://api.deepseek.com/v1", label="api_url")
                with gr.Row():
                    submit_single = gr.Button(value="Submit", interactive=True, variant="primary")
            with gr.Column():
                # 输出组件
                output = gr.Textbox(label="output")

        submit_single.click(
            fn=dingo_demo,
            inputs=[input_path, data_format, column_content, rule_list, prompt_list, model, key, api_url],
            outputs=output
        )

    # 启动界面
    demo.launch()