|
import json |
|
import os |
|
import shutil |
|
|
|
import gradio as gr |
|
from dingo.exec import Executor |
|
from dingo.io import InputArgs |
|
|
|
|
|
def dingo_demo(dataset_source, input_path, uploaded_file, data_format, column_content, rule_list, prompt_list, model,
               key, api_url):
    """Run a dingo evaluation from the demo form and return its results.

    Args:
        dataset_source: ``"hugging_face"`` to read ``input_path``; any other
            value is treated as a local upload and reads ``uploaded_file``.
        input_path: Dataset path, required for the ``"hugging_face"`` source.
        uploaded_file: Uploaded file object exposing a ``.name`` path,
            required for every other source.
        data_format: Data format name passed through to ``InputArgs``.
        column_content: Name of the content column passed to ``InputArgs``.
        rule_list: Selected rule names.
        prompt_list: Selected prompt names.
        model: LLM model name placed in the ``llm_config`` section.
        key: API key placed in the ``llm_config`` section.
        api_url: API URL placed in the ``llm_config`` section.

    Returns:
        A ``(summary, detail)`` tuple. When validation fails, ``summary`` is
        an error message string and ``detail`` is ``None``. Otherwise
        ``summary`` is the executor summary as an indented JSON string and
        ``detail`` is the list of raw dicts for the reported bad items.
    """
    # Validation problems are returned as text so they show up in the
    # summary output instead of raising.
    if not data_format:
        return 'ValueError: data_format can not be empty, please input.', None
    if not column_content:
        return 'ValueError: column_content can not be empty, please input.', None
    if not rule_list and not prompt_list:
        return 'ValueError: rule_list and prompt_list can not be empty at the same time.', None

    if dataset_source == "hugging_face":
        if not input_path:
            return 'ValueError: input_path can not be empty for hugging_face dataset, please input.', None
        final_input_path = input_path
    else:
        if not uploaded_file:
            return 'ValueError: Please upload a file for local dataset.', None
        final_input_path = uploaded_file.name

    input_data = {
        "dataset": dataset_source,
        "input_path": final_input_path,
        # Local uploads write results next to the uploaded file; the
        # hugging_face source passes an empty output path.
        "output_path": "" if dataset_source == 'hugging_face' else os.path.dirname(final_input_path),
        "save_data": True,
        "save_raw": True,
        "data_format": data_format,
        "column_content": column_content,
        "custom_config": {
            "rule_list": rule_list,
            "prompt_list": prompt_list,
            "llm_config": {
                "detect_text_quality_detail": {
                    "model": model,
                    "key": key,
                    "api_url": api_url,
                },
            },
        },
    }
    input_args = InputArgs(**input_data)
    executor = Executor.exec_map["local"](input_args)
    executor.execute()
    summary = executor.get_summary().to_dict()

    try:
        new_detail = [item.to_raw_dict() for item in executor.get_bad_info_list()]
    finally:
        # The results are returned in memory, so the directory the run wrote
        # is removed even if collecting the details fails.
        if summary['output_path']:
            shutil.rmtree(summary['output_path'])

    return json.dumps(summary, indent=4), new_detail
|
|
|
|
|
def update_input_components(dataset_source):
    """Return visibility updates for the dataset input widgets.

    The path textbox is shown only for the ``"hugging_face"`` source and the
    file upload only for every other source.

    Returns:
        A two-item list: the ``gr.Textbox`` update followed by the
        ``gr.File`` update.
    """
    from_hub = dataset_source == "hugging_face"
    return [
        gr.Textbox(visible=from_hub),
        gr.File(visible=not from_hub),
    ]
|
|
|
|
|
if __name__ == '__main__':
    # Choices offered in the rule and prompt checkbox groups.
    rule_options = ['RuleAbnormalChar', 'RuleAbnormalHtml', 'RuleContentNull', 'RuleContentShort', 'RuleEnterAndSpace', 'RuleOnlyUrl']
    prompt_options = ['PromptRepeat', 'PromptContentChaos']

    # Decode the header explicitly as UTF-8 so the page renders the same
    # regardless of the platform's default encoding.
    with open("header.html", "r", encoding="utf-8") as file:
        header = file.read()

    with gr.Blocks() as demo:
        gr.HTML(header)
        with gr.Row():
            # Left side: the input form and the submit button.
            with gr.Column():
                with gr.Column():
                    dataset_source = gr.Dropdown(
                        choices=["hugging_face", "local"],
                        value="hugging_face",
                        label="dataset [source]"
                    )
                    # Only one of the next two widgets is visible at a time;
                    # the dataset_source.change handler below toggles them.
                    input_path = gr.Textbox(
                        value='chupei/format-jsonl',
                        placeholder="please input hugging_face dataset path",
                        label="input_path",
                        visible=True
                    )
                    uploaded_file = gr.File(
                        label="upload file",
                        visible=False
                    )

                    data_format = gr.Dropdown(
                        ["jsonl", "json", "plaintext", "listjson"],
                        label="data_format"
                    )
                    column_content = gr.Textbox(
                        value="content",
                        placeholder="please input column name of content in dataset",
                        label="column_content"
                    )

                    rule_list = gr.CheckboxGroup(
                        choices=rule_options,
                        value=['RuleAbnormalChar', 'RuleAbnormalHtml'],
                        label="rule_list"
                    )
                    prompt_list = gr.CheckboxGroup(
                        choices=prompt_options,
                        label="prompt_list"
                    )
                    model = gr.Textbox(
                        placeholder="If want to use llm, please input model, such as: deepseek-chat",
                        label="model"
                    )
                    key = gr.Textbox(
                        placeholder="If want to use llm, please input key, such as: 123456789012345678901234567890xx",
                        label="API KEY"
                    )
                    api_url = gr.Textbox(
                        placeholder="If want to use llm, please input api_url, such as: https://api.deepseek.com/v1",
                        label="API URL"
                    )

                with gr.Row():
                    submit_single = gr.Button(value="Submit", interactive=True, variant="primary")

            # Right side: results, split into summary and detail tabs.
            with gr.Column():
                with gr.Tabs():
                    with gr.Tab("Result Summary"):
                        summary_output = gr.Textbox(label="summary", max_lines=50)
                    with gr.Tab("Result Detail"):
                        detail_output = gr.JSON(label="detail", max_height=800)

        # Swap the path textbox / file upload when the source changes.
        dataset_source.change(
            fn=update_input_components,
            inputs=dataset_source,
            outputs=[input_path, uploaded_file]
        )

        # Run the evaluation and fill both result tabs.
        submit_single.click(
            fn=dingo_demo,
            inputs=[dataset_source, input_path, uploaded_file, data_format, column_content, rule_list, prompt_list,
                    model, key, api_url],
            outputs=[summary_output, detail_output]
        )

    demo.launch()
|
|