|
import json |
|
|
|
import gradio as gr |
|
from dingo.exec import Executor |
|
from dingo.io import InputArgs |
|
|
|
|
|
def dingo_demo(input_path, data_format, column_content, rule_list, prompt_list, model, key, api_url):
    """Run a local dingo evaluation from the form values and return its summary.

    Args:
        input_path: Value forwarded as ``input_path`` to ``InputArgs`` (required).
        data_format: Value forwarded as ``data_format`` to ``InputArgs`` (required).
        column_content: Value forwarded as ``column_content`` to ``InputArgs``
            (required).
        rule_list: Selected rule names; may be empty only if ``prompt_list`` is not.
        prompt_list: Selected prompt names; may be empty only if ``rule_list`` is not.
        model: Forwarded as ``model`` in the ``detect_text_quality_detail`` LLM config.
        key: Forwarded as ``key`` in the same LLM config.
        api_url: Forwarded as ``api_url`` in the same LLM config.

    Returns:
        str: The first summary returned by the executor, serialized as JSON with
        a 4-space indent. When validation fails, or the executor returns no
        summary, a one-line error message is returned instead so that it can be
        shown directly in the output component.
    """
    # Blank-only text is as unusable as empty text, and stray whitespace from a
    # copy/paste should not end up inside a path or column name.
    input_path = _strip_if_text(input_path)
    data_format = _strip_if_text(data_format)
    column_content = _strip_if_text(column_content)

    # Checked in this order so the first missing field is the one reported.
    required_fields = (
        ("input_path", input_path),
        ("data_format", data_format),
        ("column_content", column_content),
    )
    for field_name, field_value in required_fields:
        if not field_value:
            return f'ValueError: {field_name} can not be empty, please input.'
    if not rule_list and not prompt_list:
        return 'ValueError: rule_list and prompt_list can not be empty at the same time.'

    input_data = {
        "input_path": input_path,
        "data_format": data_format,
        "column_content": column_content,
        "custom_config": {
            "rule_list": rule_list,
            "prompt_list": prompt_list,
            "llm_config": {
                "detect_text_quality_detail": {
                    "model": model,
                    "key": key,
                    "api_url": api_url,
                },
            },
        },
    }
    input_args = InputArgs(**input_data)
    executor = Executor.exec_map["local"](input_args)
    result = executor.execute()

    # Guard the index below: report an empty result instead of an IndexError.
    if not result:
        return 'RuntimeError: executor returned no summary.'

    summary = result[0].to_dict()
    # ensure_ascii=False keeps non-ASCII text readable instead of \uXXXX escapes.
    return json.dumps(summary, indent=4, ensure_ascii=False)


def _strip_if_text(value):
    """Return *value* without surrounding whitespace when it is a string."""
    return value.strip() if isinstance(value, str) else value
|
|
|
|
|
if __name__ == '__main__':
    # Names offered as checkbox choices in the form.
    rule_options = [
        'RuleAbnormalChar',
        'RuleAbnormalHtml',
        'RuleContentNull',
        'RuleContentShort',
        'RuleEnterAndSpace',
        'RuleOnlyUrl',
    ]
    prompt_options = ['PromptRepeat', 'PromptContentChaos']

    # Explicit encoding so the header is decoded the same way on every
    # platform instead of depending on the locale default.
    with open("header.html", "r", encoding="utf-8") as file:
        header = file.read()

    with gr.Blocks() as demo:
        gr.HTML(header)
        with gr.Row():
            # Left column: all inputs plus the submit button.
            with gr.Column():
                input_path = gr.Textbox(
                    value='chupei/format-jsonl',
                    placeholder="please input huggingface dataset path",
                    label="input_path",
                )
                data_format = gr.Dropdown(
                    ["jsonl", "json", "plaintext", "listjson"],
                    label="data_format",
                )
                column_content = gr.Textbox(
                    value="content",
                    placeholder="please input column name of content in dataset",
                    label="column_content",
                )
                rule_list = gr.CheckboxGroup(choices=rule_options, label="rule_list")
                prompt_list = gr.CheckboxGroup(choices=prompt_options, label="prompt_list")
                model = gr.Textbox(
                    placeholder="If want to use llm, please input model, such as: deepseek-chat",
                    label="model",
                )
                # The key is a secret: mask it on screen rather than echoing it.
                key = gr.Textbox(
                    type="password",
                    placeholder="If want to use llm, please input key, such as: 123456789012345678901234567890xx",
                    label="key",
                )
                api_url = gr.Textbox(
                    placeholder="If want to use llm, please input api_url, such as: https://api.deepseek.com/v1",
                    label="api_url",
                )
                with gr.Row():
                    submit_single = gr.Button(value="Submit", interactive=True, variant="primary")
            # Right column: the text returned by dingo_demo.
            with gr.Column():
                output = gr.Textbox(label="output")

        # The inputs list must stay in the same order as dingo_demo's parameters.
        submit_single.click(
            fn=dingo_demo,
            inputs=[input_path, data_format, column_content, rule_list, prompt_list, model, key, api_url],
            outputs=output,
        )

    demo.launch()
|
|