DataEval committed (verified)
Commit 2b4b49c · Parent(s): bea9e81

feat: gradio add new function #123

Files changed (1):
  1. app.py +318 -53
app.py CHANGED
@@ -1,67 +1,99 @@
 import json
 import os
+import pprint
 import shutil
+from functools import partial
+from pathlib import Path
 
 import gradio as gr
+
 from dingo.exec import Executor
 from dingo.io import InputArgs
 
 
-def dingo_demo(dataset_source, input_path, uploaded_file, data_format, column_content, rule_list, prompt_list, model,
-               key, api_url):
+def dingo_demo(
+        uploaded_file,
+        dataset_source, data_format, input_path, max_workers, batch_size,
+        column_id, column_prompt, column_content, column_image,
+        rule_list, prompt_list, scene_list,
+        model, key, api_url
+):
     if not data_format:
-        return 'ValueError: data_format can not be empty, please input.', None
+        raise gr.Error('ValueError: data_format can not be empty, please input.')
     if not column_content:
-        return 'ValueError: column_content can not be empty, please input.', None
+        raise gr.Error('ValueError: column_content can not be empty, please input.')
     if not rule_list and not prompt_list:
-        return 'ValueError: rule_list and prompt_list can not be empty at the same time.', None
+        raise gr.Error('ValueError: rule_list and prompt_list can not be empty at the same time.')
 
     # Handle input path based on dataset source
     if dataset_source == "hugging_face":
         if not input_path:
-            return 'ValueError: input_path can not be empty for hugging_face dataset, please input.', None
+            raise gr.Error('ValueError: input_path can not be empty for hugging_face dataset, please input.')
         final_input_path = input_path
     else: # local
         if not uploaded_file:
-            return 'ValueError: Please upload a file for local dataset.', None
+            raise gr.Error('Please upload a file for local dataset.')
+
+        file_base_name = os.path.basename(uploaded_file.name)
+        if not str(file_base_name).endswith(('.jsonl', '.json', '.txt')):
+            raise gr.Error('File format must be \'.jsonl\', \'.json\' or \'.txt\'')
+
         final_input_path = uploaded_file.name
 
-    input_data = {
-        "dataset": dataset_source,
-        "input_path": final_input_path,
-        "output_path": "" if dataset_source == 'hugging_face' else os.path.dirname(final_input_path),
-        "save_data": True,
-        "save_raw": True,
-        "data_format": data_format,
-        "column_content": column_content,
-        "custom_config":
-            {
+    if max_workers <= 0:
+        raise gr.Error('Please input value > 0 in max_workers.')
+    if batch_size <= 0:
+        raise gr.Error('Please input value > 0 in batch_size.')
+
+    try:
+        input_data = {
+            "dataset": dataset_source,
+            "data_format": data_format,
+            "input_path": final_input_path,
+            "output_path": "" if dataset_source == 'hugging_face' else os.path.dirname(final_input_path),
+            "save_data": True,
+            "save_raw": True,
+
+            "max_workers": max_workers,
+            "batch_size": batch_size,
+
+            "column_content": column_content,
+            "custom_config":{
                 "rule_list": rule_list,
                 "prompt_list": prompt_list,
-                "llm_config":
-                    {
-                        "detect_text_quality_detail":
-                            {
-                                "model": model,
-                                "key": key,
-                                "api_url": api_url,
-                            }
+                "llm_config": {
+                    scene_list: {
+                        "model": model,
+                        "key": key,
+                        "api_url": api_url,
                     }
+                }
             }
-    }
-    input_args = InputArgs(**input_data)
-    executor = Executor.exec_map["local"](input_args)
-    executor.execute()
-    summary = executor.get_summary().to_dict()
-    detail = executor.get_bad_info_list()
-    new_detail = []
-    for item in detail:
-        new_detail.append(item.to_raw_dict())
-    if summary['output_path']:
-        shutil.rmtree(summary['output_path'])
+        }
+        if column_id:
+            input_data['column_id'] = column_id
+        if column_prompt:
+            input_data['column_prompt'] = column_prompt
+        if column_image:
+            input_data['column_image'] = column_image
 
-    # return two values: the summary and the detail list
-    return json.dumps(summary, indent=4), new_detail
+        # print(input_data)
+        # exit(0)
+
+        input_args = InputArgs(**input_data)
+        executor = Executor.exec_map["local"](input_args)
+        summary = executor.execute().to_dict()
+        detail = executor.get_bad_info_list()
+        new_detail = []
+        for item in detail:
+            new_detail.append(item)
+        if summary['output_path']:
+            shutil.rmtree(summary['output_path'])
+
+        # return two values: the summary and the detail list
+        return json.dumps(summary, indent=4), new_detail
+    except Exception as e:
+        raise gr.Error(str(e))
 
 
 def update_input_components(dataset_source):
@@ -80,11 +112,159 @@ def update_input_components(dataset_source):
     ]
 
 
+def update_rule_list(rule_type_mapping, rule_type):
+    return gr.CheckboxGroup(
+        choices=rule_type_mapping.get(rule_type, []),
+        # value=[],
+        label="rule_list"
+    )
+
+
+def update_prompt_list(scene_prompt_mapping, scene):
+    """Update the available prompt list according to the selected scene and clear all selections."""
+    return gr.CheckboxGroup(
+        choices=scene_prompt_mapping.get(scene, []),
+        value=[], # clear all selections
+        label="prompt_list"
+    )
+
+
+# when prompt_list changes, dynamically toggle the visibility of model, key and api_url
+def toggle_llm_fields(prompt_values):
+    visible = bool(prompt_values)
+    return (
+        gr.update(visible=visible),
+        gr.update(visible=visible),
+        gr.update(visible=visible)
+    )
+
+
+# control the visibility of column_id, column_prompt, column_content and column_image
+def update_column_fields(rule_list, prompt_list):
+    rule_type_mapping = get_rule_type_mapping()
+    scene_prompt_mapping = get_scene_prompt_mapping()
+    data_column_mapping = get_data_column_mapping()
+    status_mapping = {
+        'id': False,
+        'prompt': False,
+        'content': False,
+        'image': False,
+    }
+
+    res = (
+        gr.update(visible=status_mapping['id']),
+        gr.update(visible=status_mapping['prompt']),
+        gr.update(visible=status_mapping['content']),
+        gr.update(visible=status_mapping['image'])
+    )
+    if not rule_list and not prompt_list:
+        return res
+
+    key_list = []
+    key_list += get_key_by_mapping(rule_type_mapping, rule_list)
+    key_list += get_key_by_mapping(scene_prompt_mapping, prompt_list)
+
+    data_column = []
+    for key in key_list:
+        if not data_column:
+            data_column = data_column_mapping[key]
+        else:
+            new_data_column = data_column_mapping[key]
+            if data_column != new_data_column:
+                raise gr.Error(f'ConflictError: {key} need data type is different from other.')
+
+    for c in data_column:
+        status_mapping[c] = True
+    res = (
+        gr.update(visible=status_mapping['id']),
+        gr.update(visible=status_mapping['prompt']),
+        gr.update(visible=status_mapping['content']),
+        gr.update(visible=status_mapping['image'])
+    )
+    return res
+
+
+def get_rule_type_mapping():
+    return {
+        'QUALITY_BAD_COMPLETENESS': ['RuleLineEndWithEllipsis', 'RuleLineEndWithTerminal', 'RuleSentenceNumber',
+                                     'RuleWordNumber'],
+        'QUALITY_BAD_EFFECTIVENESS': ['RuleAbnormalChar', 'RuleAbnormalHtml', 'RuleAlphaWords', 'RuleCharNumber',
+                                      'RuleColonEnd', 'RuleContentNull', 'RuleContentShort', 'RuleContentShortMultiLan',
+                                      'RuleEnterAndSpace', 'RuleEnterMore', 'RuleEnterRatioMore', 'RuleHtmlEntity',
+                                      'RuleHtmlTag', 'RuleInvisibleChar', 'RuleLineJavascriptCount', 'RuleLoremIpsum',
+                                      'RuleMeanWordLength', 'RuleSpaceMore', 'RuleSpecialCharacter', 'RuleStopWord',
+                                      'RuleSymbolWordRatio', 'RuleOnlyUrl'],
+        'QUALITY_BAD_FLUENCY': ['RuleAbnormalNumber', 'RuleCharSplit', 'RuleNoPunc', 'RuleWordSplit', 'RuleWordStuck'],
+        'QUALITY_BAD_RELEVANCE': ['RuleHeadWordAr'],
+        'QUALITY_BAD_SIMILARITY': ['RuleDocRepeat'],
+        'QUALITY_BAD_UNDERSTANDABILITY': ['RuleCapitalWords', 'RuleCurlyBracket', 'RuleLineStartWithBulletpoint',
+                                          'RuleUniqueWords'],
+        'QUALITY_BAD_IMG_EFFECTIVENESS': ['RuleImageValid', 'RuleImageSizeValid', 'RuleImageQuality'],
+        'QUALITY_BAD_IMG_RELEVANCE': ['RuleImageTextSimilarity'],
+        'QUALITY_BAD_IMG_SIMILARITY': ['RuleImageRepeat']
+    }
+
+
+def get_scene_prompt_mapping():
+    return {
+        # example mapping; adjust it to actual needs
+        "LLMTextQualityPromptBase": ['PromptRepeat', 'PromptContentChaos'],
+        'LLMTextQualityModelBase': ['PromptTextQualityV3', 'PromptTextQualityV4'],
+        'LLMSecurityPolitics': ['PromptPolitics'],
+        'LLMSecurityProhibition': ['PromptProhibition'],
+        'LLMText3HHarmless': ['PromptTextHelpful'],
+        'LLMText3HHelpful': ['PromptTextHelpful'],
+        'LLMText3HHonest': ['PromptTextHonest'],
+        'LLMClassifyTopic': ['PromptClassifyTopic'],
+        'LLMClassifyQR': ['PromptClassifyQR'],
+        "VLMImageRelevant": ["PromptImageRelevant"],
+    }
+
+
+def get_key_by_mapping(map_dict: dict, value_list: list):
+    key_list = []
+    for k,v in map_dict.items():
+        if bool(set(v) & set(value_list)):
+            key_list.append(k)
+
+    return key_list
+
+
+def get_data_column_mapping():
+    return {
+        'LLMTextQualityPromptBase': ['content'],
+        'LLMTextQualityModelBase': ['content'],
+        'LLMSecurityPolitics': ['content'],
+        'LLMSecurityProhibition': ['content'],
+        'LLMText3HHarmless': ['content'],
+        'LLMText3HHelpful': ['content'],
+        'LLMText3HHonest': ['content'],
+        'LLMClassifyTopic': ['content'],
+        'LLMClassifyQR': ['content'],
+        'VLMImageRelevant': ['prompt', 'content'],
+        'QUALITY_BAD_COMPLETENESS': ['content'],
+        'QUALITY_BAD_EFFECTIVENESS': ['content'],
+        'QUALITY_BAD_FLUENCY': ['content'],
+        'QUALITY_BAD_RELEVANCE': ['content'],
+        'QUALITY_BAD_SIMILARITY': ['content'],
+        'QUALITY_BAD_UNDERSTANDABILITY': ['content'],
+        'QUALITY_BAD_IMG_EFFECTIVENESS': ['image'],
+        'QUALITY_BAD_IMG_RELEVANCE': ['content','image'],
+        'QUALITY_BAD_IMG_SIMILARITY': ['content'],
+    }
+
+
 if __name__ == '__main__':
     rule_options = ['RuleAbnormalChar', 'RuleAbnormalHtml', 'RuleContentNull', 'RuleContentShort', 'RuleEnterAndSpace', 'RuleOnlyUrl']
-    prompt_options = ['PromptRepeat', 'PromptContentChaos']
+    rule_type_mapping = get_rule_type_mapping()
+    rule_type_options = list(rule_type_mapping.keys())
 
-    with open("header.html", "r") as file:
+    # prompt_options = ['PromptRepeat', 'PromptContentChaos']
+    scene_prompt_mapping = get_scene_prompt_mapping()
+    scene_options = list(scene_prompt_mapping.keys())
+
+    current_dir = Path(__file__).parent
+    with open(os.path.join(current_dir, 'header.html'), "r") as file:
         header = file.read()
     with gr.Blocks() as demo:
        gr.HTML(header)
@@ -111,34 +291,87 @@ if __name__ == '__main__':
             ["jsonl", "json", "plaintext", "listjson"],
             label="data_format"
         )
-        column_content = gr.Textbox(
-            value="content",
-            placeholder="please input column name of content in dataset",
-            label="column_content"
-        )
+        with gr.Row():
+            max_workers = gr.Number(
+                value=1,
+                # placeholder="",
+                label="max_workers",
+                precision=0
+            )
+            batch_size = gr.Number(
+                value=1,
+                # placeholder="",
+                label="batch_size",
+                precision=0
+            )
 
+        # Add the rule_type dropdown near where scene_list is defined
+        rule_type = gr.Dropdown(
+            choices=rule_type_options,
+            value=rule_type_options[0],
+            label="rule_type",
+            interactive=True
+        )
         rule_list = gr.CheckboxGroup(
-            choices=rule_options,
-            value=['RuleAbnormalChar', 'RuleAbnormalHtml'],
+            choices=rule_type_mapping.get(rule_type_options[0], []),
             label="rule_list"
         )
+        # add the scene selection dropdown
+        scene_list = gr.Dropdown(
+            choices=scene_options,
+            value=scene_options[0],
+            label="scene_list",
+            interactive=True
+        )
         prompt_list = gr.CheckboxGroup(
-            choices=prompt_options,
+            choices=scene_prompt_mapping.get(scene_options[0], []),
            label="prompt_list"
        )
+        # LLM model name
        model = gr.Textbox(
            placeholder="If want to use llm, please input model, such as: deepseek-chat",
-            label="model"
+            label="model",
+            visible=False
        )
+        # LLM API key
        key = gr.Textbox(
            placeholder="If want to use llm, please input key, such as: 123456789012345678901234567890xx",
-            label="API KEY"
+            label="API KEY",
+            visible=False
        )
+        # LLM API URL
        api_url = gr.Textbox(
            placeholder="If want to use llm, please input api_url, such as: https://api.deepseek.com/v1",
-            label="API URL"
+            label="API URL",
+            visible=False
        )
 
+        with gr.Row():
+            column_id = gr.Textbox(
+                value="",
+                # placeholder="please input column name of data id in dataset",
+                label="column_id",
+                visible=False
+            )
+            column_prompt = gr.Textbox(
+                value="",
+                # placeholder="please input column name of prompt in dataset",
+                label="column_prompt",
+                visible=False
+            )
+            column_content = gr.Textbox(
+                value="content",
+                # placeholder="please input column name of content in dataset",
+                label="column_content",
+                visible=False
+            )
+            column_image = gr.Textbox(
+                value="",
+                # placeholder="please input column name of image in dataset",
+                label="column_image",
+                visible=False
+            )
+
         with gr.Row():
             submit_single = gr.Button(value="Submit", interactive=True, variant="primary")
 
@@ -156,10 +389,42 @@
             outputs=[input_path, uploaded_file]
         )
 
+        rule_type.change(
+            fn=partial(update_rule_list, rule_type_mapping),
+            inputs=rule_type,
+            outputs=rule_list
+        )
+
+        # update the prompt list when the scene changes
+        scene_list.change(
+            fn=partial(update_prompt_list, scene_prompt_mapping),
+            inputs=scene_list,
+            outputs=prompt_list
+        )
+
+        prompt_list.change(
+            fn=toggle_llm_fields,
+            inputs=prompt_list,
+            outputs=[model, key, api_url]
+        )
+
+        # control visibility of the column_* fields
+        for comp in [rule_list, prompt_list]:
+            comp.change(
+                fn=update_column_fields,
+                inputs=[rule_list, prompt_list],
+                outputs=[column_id, column_prompt, column_content, column_image]
+            )
+
         submit_single.click(
             fn=dingo_demo,
-            inputs=[dataset_source, input_path, uploaded_file, data_format, column_content, rule_list, prompt_list,
-                    model, key, api_url],
+            inputs=[
+                uploaded_file,
+                dataset_source, data_format, input_path, max_workers, batch_size,
+                column_id, column_prompt, column_content, column_image,
+                rule_list, prompt_list, scene_list,
+                model, key, api_url
+            ],
             outputs=[summary_output, detail_output] # changed to output two components
         )
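
For reference, a minimal sketch of driving the same pipeline outside Gradio with the configuration shape this commit introduces: the selected scene name becomes the key inside custom_config.llm_config. It uses only calls that appear in the diff (InputArgs, Executor.exec_map["local"], execute(), get_bad_info_list()); the dataset source, file path, scene name, model, key and api_url values below are placeholder assumptions, not values taken from the commit.

import json

from dingo.exec import Executor
from dingo.io import InputArgs

# Placeholder config mirroring the structure built in dingo_demo (assumed values).
input_data = {
    "dataset": "local",                    # assumed dataset source
    "data_format": "jsonl",
    "input_path": "./data.jsonl",          # hypothetical local file
    "output_path": "./outputs",
    "save_data": True,
    "save_raw": True,
    "max_workers": 1,
    "batch_size": 1,
    "column_content": "content",
    "custom_config": {
        "rule_list": ["RuleAbnormalChar", "RuleAbnormalHtml"],
        "prompt_list": ["PromptTextQualityV3"],
        "llm_config": {
            "LLMTextQualityModelBase": {   # assumed scene, one key of get_scene_prompt_mapping()
                "model": "deepseek-chat",
                "key": "sk-xxxx",          # placeholder API key
                "api_url": "https://api.deepseek.com/v1",
            }
        }
    }
}

input_args = InputArgs(**input_data)
executor = Executor.exec_map["local"](input_args)
summary = executor.execute().to_dict()     # summary dict, as used by dingo_demo
detail = executor.get_bad_info_list()      # per-item bad-data records
print(json.dumps(summary, indent=4))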