import json import os import pprint import shutil from functools import partial from pathlib import Path import gradio as gr from dingo.exec import Executor from dingo.io import InputArgs def dingo_demo( uploaded_file, dataset_source, data_format, input_path, max_workers, batch_size, column_id, column_prompt, column_content, column_image, rule_list, prompt_list, scene_list, model, key, api_url ): if not data_format: raise gr.Error('ValueError: data_format can not be empty, please input.') if not column_content: raise gr.Error('ValueError: column_content can not be empty, please input.') if not rule_list and not prompt_list: raise gr.Error('ValueError: rule_list and prompt_list can not be empty at the same time.') # Handle input path based on dataset source if dataset_source == "hugging_face": if not input_path: raise gr.Error('ValueError: input_path can not be empty for hugging_face dataset, please input.') final_input_path = input_path else: # local if not uploaded_file: raise gr.Error('Please upload a file for local dataset.') file_base_name = os.path.basename(uploaded_file.name) if not str(file_base_name).endswith(('.jsonl', '.json', '.txt')): raise gr.Error('File format must be \'.jsonl\', \'.json\' or \'.txt\'') final_input_path = uploaded_file.name if max_workers <= 0: raise gr.Error('Please input value > 0 in max_workers.') if batch_size <= 0: raise gr.Error('Please input value > 0 in batch_size.') try: input_data = { "dataset": dataset_source, "data_format": data_format, "input_path": final_input_path, "output_path": "" if dataset_source == 'hugging_face' else os.path.dirname(final_input_path), "save_data": True, "save_raw": True, "max_workers": max_workers, "batch_size": batch_size, "column_content": column_content, "custom_config":{ "rule_list": rule_list, "prompt_list": prompt_list, "llm_config": { scene_list: { "model": model, "key": key, "api_url": api_url, } } } } if column_id: input_data['column_id'] = column_id if column_prompt: input_data['column_prompt'] = column_prompt if column_image: input_data['column_image'] = column_image # print(input_data) # exit(0) input_args = InputArgs(**input_data) executor = Executor.exec_map["local"](input_args) summary = executor.execute().to_dict() detail = executor.get_bad_info_list() new_detail = [] for item in detail: new_detail.append(item) if summary['output_path']: shutil.rmtree(summary['output_path']) # 返回两个值:概要信息和详细信息 return json.dumps(summary, indent=4), new_detail except Exception as e: raise gr.Error(str(e)) def update_input_components(dataset_source): # 根据数据源的不同,返回不同的输入组件 if dataset_source == "hugging_face": # 如果数据源是huggingface,返回一个可见的文本框和一个不可见的文件组件 return [ gr.Textbox(visible=True), gr.File(visible=False), ] else: # local # 如果数据源是本地,返回一个不可见的文本框和一个可见的文件组件 return [ gr.Textbox(visible=False), gr.File(visible=True), ] def update_rule_list(rule_type_mapping, rule_type): return gr.CheckboxGroup( choices=rule_type_mapping.get(rule_type, []), value=[], label="rule_list" ) def update_prompt_list(scene_prompt_mapping, scene): """根据选择的场景更新可用的prompt列表,并清空所有勾选""" return gr.CheckboxGroup( choices=scene_prompt_mapping.get(scene, []), value=[], # 清空所有勾选 label="prompt_list" ) # prompt_list变化时,动态控制model、key、api_url的显示 def toggle_llm_fields(prompt_values): visible = bool(prompt_values) return ( gr.update(visible=visible), gr.update(visible=visible), gr.update(visible=visible) ) # 控制column_id、column_prompt、column_content、column_image的显示 def update_column_fields(rule_list, prompt_list): rule_type_mapping = get_rule_type_mapping() scene_prompt_mapping = get_scene_prompt_mapping() data_column_mapping = get_data_column_mapping() status_mapping = { 'id': False, 'prompt': False, 'content': False, 'image': False, } res = ( gr.update(visible=status_mapping['id']), gr.update(visible=status_mapping['prompt']), gr.update(visible=status_mapping['content']), gr.update(visible=status_mapping['image']) ) if not rule_list and not prompt_list: return res key_list = [] key_list += get_key_by_mapping(rule_type_mapping, rule_list) key_list += get_key_by_mapping(scene_prompt_mapping, prompt_list) data_column = [] for key in key_list: if not data_column: data_column = data_column_mapping[key] else: new_data_column = data_column_mapping[key] if data_column != new_data_column: raise gr.Error(f'ConflictError: {key} need data type is different from other.') for c in data_column: status_mapping[c] = True res = ( gr.update(visible=status_mapping['id']), gr.update(visible=status_mapping['prompt']), gr.update(visible=status_mapping['content']), gr.update(visible=status_mapping['image']) ) return res def get_rule_type_mapping(): return { 'QUALITY_BAD_COMPLETENESS': ['RuleLineEndWithEllipsis', 'RuleLineEndWithTerminal', 'RuleSentenceNumber', 'RuleWordNumber'], 'QUALITY_BAD_EFFECTIVENESS': ['RuleAbnormalChar', 'RuleAbnormalHtml', 'RuleAlphaWords', 'RuleCharNumber', 'RuleColonEnd', 'RuleContentNull', 'RuleContentShort', 'RuleContentShortMultiLan', 'RuleEnterAndSpace', 'RuleEnterMore', 'RuleEnterRatioMore', 'RuleHtmlEntity', 'RuleHtmlTag', 'RuleInvisibleChar', 'RuleLineJavascriptCount', 'RuleLoremIpsum', 'RuleMeanWordLength', 'RuleSpaceMore', 'RuleSpecialCharacter', 'RuleStopWord', 'RuleSymbolWordRatio', 'RuleOnlyUrl'], 'QUALITY_BAD_FLUENCY': ['RuleAbnormalNumber', 'RuleCharSplit', 'RuleNoPunc', 'RuleWordSplit', 'RuleWordStuck'], 'QUALITY_BAD_RELEVANCE': ['RuleHeadWordAr'], 'QUALITY_BAD_SIMILARITY': ['RuleDocRepeat'], 'QUALITY_BAD_UNDERSTANDABILITY': ['RuleCapitalWords', 'RuleCurlyBracket', 'RuleLineStartWithBulletpoint', 'RuleUniqueWords'], 'QUALITY_BAD_IMG_EFFECTIVENESS': ['RuleImageValid', 'RuleImageSizeValid', 'RuleImageQuality'], 'QUALITY_BAD_IMG_RELEVANCE': ['RuleImageTextSimilarity'], 'QUALITY_BAD_IMG_SIMILARITY': ['RuleImageRepeat'] } def get_scene_prompt_mapping(): return { # 示例映射关系,你可以根据实际需求修改 "LLMTextQualityPromptBase": ['PromptRepeat', 'PromptContentChaos'], 'LLMTextQualityModelBase': ['PromptTextQualityV3', 'PromptTextQualityV4'], 'LLMSecurityPolitics': ['PromptPolitics'], 'LLMSecurityProhibition': ['PromptProhibition'], 'LLMText3HHarmless': ['PromptTextHelpful'], 'LLMText3HHelpful': ['PromptTextHelpful'], 'LLMText3HHonest': ['PromptTextHonest'], 'LLMClassifyTopic': ['PromptClassifyTopic'], 'LLMClassifyQR': ['PromptClassifyQR'], "VLMImageRelevant": ["PromptImageRelevant"], } def get_key_by_mapping(map_dict: dict, value_list: list): key_list = [] for k,v in map_dict.items(): if bool(set(v) & set(value_list)): key_list.append(k) return key_list def get_data_column_mapping(): return { 'LLMTextQualityPromptBase': ['content'], 'LLMTextQualityModelBase': ['content'], 'LLMSecurityPolitics': ['content'], 'LLMSecurityProhibition': ['content'], 'LLMText3HHarmless': ['content'], 'LLMText3HHelpful': ['content'], 'LLMText3HHonest': ['content'], 'LLMClassifyTopic': ['content'], 'LLMClassifyQR': ['content'], 'VLMImageRelevant': ['prompt', 'content'], 'QUALITY_BAD_COMPLETENESS': ['content'], 'QUALITY_BAD_EFFECTIVENESS': ['content'], 'QUALITY_BAD_FLUENCY': ['content'], 'QUALITY_BAD_RELEVANCE': ['content'], 'QUALITY_BAD_SIMILARITY': ['content'], 'QUALITY_BAD_UNDERSTANDABILITY': ['content'], 'QUALITY_BAD_IMG_EFFECTIVENESS': ['image'], 'QUALITY_BAD_IMG_RELEVANCE': ['content','image'], 'QUALITY_BAD_IMG_SIMILARITY': ['content'], } if __name__ == '__main__': rule_type_mapping = get_rule_type_mapping() rule_type_options = list(rule_type_mapping.keys()) scene_prompt_mapping = get_scene_prompt_mapping() scene_options = list(scene_prompt_mapping.keys()) current_dir = Path(__file__).parent with open(os.path.join(current_dir, 'header.html'), "r") as file: header = file.read() with gr.Blocks() as demo: gr.HTML(header) with gr.Row(): with gr.Column(): with gr.Column(): dataset_source = gr.Dropdown( choices=["hugging_face", "local"], value="hugging_face", label="dataset [source]" ) input_path = gr.Textbox( value='chupei/format-jsonl', placeholder="please input hugging_face dataset path", label="input_path", visible=True ) uploaded_file = gr.File( label="upload file", visible=False ) data_format = gr.Dropdown( ["jsonl", "json", "plaintext", "listjson"], label="data_format" ) with gr.Row(): max_workers = gr.Number( value=1, # placeholder="", label="max_workers", precision=0 ) batch_size = gr.Number( value=1, # placeholder="", label="batch_size", precision=0 ) # Add the rule_type dropdown near where scene_list is defined rule_type = gr.Dropdown( choices=rule_type_options, value=rule_type_options[0], label="rule_type", interactive=True ) rule_list = gr.CheckboxGroup( choices=rule_type_mapping.get(rule_type_options[0], []), label="rule_list" ) # 添加场景选择下拉框 scene_list = gr.Dropdown( choices=scene_options, value=scene_options[0], label="scene_list", interactive=True ) prompt_list = gr.CheckboxGroup( choices=scene_prompt_mapping.get(scene_options[0], []), label="prompt_list" ) # LLM模型名 model = gr.Textbox( placeholder="If want to use llm, please input model, such as: deepseek-chat", label="model", visible=False ) # LLM API KEY key = gr.Textbox( placeholder="If want to use llm, please input key, such as: 123456789012345678901234567890xx", label="API KEY", visible=False ) # LLM API URL api_url = gr.Textbox( placeholder="If want to use llm, please input api_url, such as: https://api.deepseek.com/v1", label="API URL", visible=False ) with gr.Row(): # 字段映射说明文本,带示例链接 with gr.Column(): gr.Markdown("Field Matching: Please input the column name of dataset in the input boxes below ( [examples](https://github.com/MigoXLab/dingo/tree/main/examples) )") column_id = gr.Textbox( value="", placeholder="Column name of id in the input file. If exists multiple levels, use '.' separate", label="column_id", visible=False ) column_prompt = gr.Textbox( value="", placeholder="Column name of prompt in the input file. If exists multiple levels, use '.' separate", label="column_prompt", visible=False ) column_content = gr.Textbox( value="content", placeholder="Column name of content in the input file. If exists multiple levels, use '.' separate", label="column_content", visible=False ) column_image = gr.Textbox( value="", placeholder="Column name of image in the input file. If exists multiple levels, use '.' separate", label="column_image", visible=False ) with gr.Row(): submit_single = gr.Button(value="Submit", interactive=True, variant="primary") with gr.Column(): # 修改输出组件部分,使用Tabs with gr.Tabs(): with gr.Tab("Result Summary"): summary_output = gr.Textbox(label="summary", max_lines=50) with gr.Tab("Result Detail"): detail_output = gr.JSON(label="detail", max_height=800) # 使用JSON组件来更好地展示结构化数据 dataset_source.change( fn=update_input_components, inputs=dataset_source, outputs=[input_path, uploaded_file] ) rule_type.change( fn=partial(update_rule_list, rule_type_mapping), inputs=rule_type, outputs=rule_list ) # 场景变化时更新prompt列表 scene_list.change( fn=partial(update_prompt_list, scene_prompt_mapping), inputs=scene_list, outputs=prompt_list ) prompt_list.change( fn=toggle_llm_fields, inputs=prompt_list, outputs=[model, key, api_url] ) # column字段显示控制 for comp in [rule_list, prompt_list]: comp.change( fn=update_column_fields, inputs=[rule_list, prompt_list], outputs=[column_id, column_prompt, column_content, column_image] ) submit_single.click( fn=dingo_demo, inputs=[ uploaded_file, dataset_source, data_format, input_path, max_workers, batch_size, column_id, column_prompt, column_content, column_image, rule_list, prompt_list, scene_list, model, key, api_url ], outputs=[summary_output, detail_output] # 修改输出为两个组件 ) # 启动界面 demo.launch()