DataEval committed on
Commit
e368b32
·
verified ·
1 Parent(s): 01f1fbb

add datasource local and result detail

Browse files
Files changed (1) hide show
  1. app.py +113 -22
app.py CHANGED
@@ -1,22 +1,37 @@
1
  import json
 
 
2
 
3
  import gradio as gr
4
  from dingo.exec import Executor
5
  from dingo.io import InputArgs
6
 
7
 
8
- def dingo_demo(input_path, data_format, column_content, rule_list, prompt_list, model, key, api_url):
9
- if not input_path:
10
- return 'ValueError: input_path can not be empty, please input.'
11
  if not data_format:
12
- return 'ValueError: data_format can not be empty, please input.'
13
  if not column_content:
14
- return 'ValueError: column_content can not be empty, please input.'
15
  if not rule_list and not prompt_list:
16
- return 'ValueError: rule_list and prompt_list can not be empty at the same time.'
 
 
 
 
 
 
 
 
 
 
17
 
18
  input_data = {
19
- "input_path": input_path,
 
 
 
 
20
  "data_format": data_format,
21
  "column_content": column_content,
22
  "custom_config":
@@ -36,9 +51,33 @@ def dingo_demo(input_path, data_format, column_content, rule_list, prompt_list,
36
  }
37
  input_args = InputArgs(**input_data)
38
  executor = Executor.exec_map["local"](input_args)
39
- result = executor.execute()
40
- summary = result[0].to_dict()
41
- return json.dumps(summary, indent=4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
 
44
  if __name__ == '__main__':
@@ -51,24 +90,76 @@ if __name__ == '__main__':
51
  gr.HTML(header)
52
  with gr.Row():
53
  with gr.Column():
54
- input_path = gr.Textbox(value='chupei/format-jsonl', placeholder="please input huggingface dataset path", label="input_path")
55
- data_format = gr.Dropdown(["jsonl", "json", "plaintext", "listjson"], label="data_format")
56
- column_content = gr.Textbox(value="content", placeholder="please input column name of content in dataset", label="column_content")
57
- rule_list = gr.CheckboxGroup(choices=rule_options, label="rule_list")
58
- prompt_list = gr.CheckboxGroup(choices=prompt_options, label="prompt_list")
59
- model = gr.Textbox(placeholder="If want to use llm, please input model, such as: deepseek-chat", label="model")
60
- key = gr.Textbox(placeholder="If want to use llm, please input key, such as: 123456789012345678901234567890xx", label="key")
61
- api_url = gr.Textbox(placeholder="If want to use llm, please input api_url, such as: https://api.deepseek.com/v1", label="api_url")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  with gr.Row():
63
  submit_single = gr.Button(value="Submit", interactive=True, variant="primary")
 
64
  with gr.Column():
65
- # 输出组件
66
- output = gr.Textbox(label="output")
 
 
 
 
 
 
 
 
 
 
67
 
68
  submit_single.click(
69
  fn=dingo_demo,
70
- inputs=[input_path, data_format, column_content, rule_list, prompt_list, model, key, api_url],
71
- outputs=output
 
72
  )
73
 
74
  # 启动界面
 
1
  import json
2
+ import os
3
+ import shutil
4
 
5
  import gradio as gr
6
  from dingo.exec import Executor
7
  from dingo.io import InputArgs
8
 
9
 
10
+ def dingo_demo(dataset_source, input_path, uploaded_file, data_format, column_content, rule_list, prompt_list, model,
11
+ key, api_url):
 
12
  if not data_format:
13
+ return 'ValueError: data_format can not be empty, please input.', None
14
  if not column_content:
15
+ return 'ValueError: column_content can not be empty, please input.', None
16
  if not rule_list and not prompt_list:
17
+ return 'ValueError: rule_list and prompt_list can not be empty at the same time.', None
18
+
19
+ # Handle input path based on dataset source
20
+ if dataset_source == "hugging_face":
21
+ if not input_path:
22
+ return 'ValueError: input_path can not be empty for hugging_face dataset, please input.', None
23
+ final_input_path = input_path
24
+ else: # local
25
+ if not uploaded_file:
26
+ return 'ValueError: Please upload a file for local dataset.', None
27
+ final_input_path = uploaded_file.name
28
 
29
  input_data = {
30
+ "dataset": dataset_source,
31
+ "input_path": final_input_path,
32
+ "output_path": "" if dataset_source == 'hugging_face' else os.path.dirname(final_input_path),
33
+ "save_data": True,
34
+ "save_raw": True,
35
  "data_format": data_format,
36
  "column_content": column_content,
37
  "custom_config":
 
51
  }
52
  input_args = InputArgs(**input_data)
53
  executor = Executor.exec_map["local"](input_args)
54
+ executor.execute()
55
+ summary = executor.get_summary().to_dict()
56
+ detail = executor.get_bad_info_list()
57
+ new_detail = []
58
+ for item in detail:
59
+ new_detail.append(item.to_raw_dict())
60
+ if summary['output_path']:
61
+ shutil.rmtree(summary['output_path'])
62
+
63
+ # 返回两个值:概要信息和详细信息
64
+ return json.dumps(summary, indent=4), new_detail
65
+
66
+
67
def update_input_components(dataset_source):
    """Toggle which dataset input widget is shown for the chosen source.

    Returns a two-element list of component updates, in order: the
    ``input_path`` textbox and the upload-file widget. The textbox is
    visible only when ``dataset_source`` is ``"hugging_face"``; for any
    other value (i.e. ``"local"``) the file widget is visible instead.
    """
    # Exactly one of the two widgets is visible at any time.
    from_hub = dataset_source == "hugging_face"
    return [
        gr.Textbox(visible=from_hub),
        gr.File(visible=not from_hub),
    ]
81
 
82
 
83
  if __name__ == '__main__':
 
90
  gr.HTML(header)
91
  with gr.Row():
92
  with gr.Column():
93
+ with gr.Column():
94
+ dataset_source = gr.Dropdown(
95
+ choices=["hugging_face", "local"],
96
+ value="local",
97
+ label="dataset [source]"
98
+ )
99
+ input_path = gr.Textbox(
100
+ value='chupei/format-jsonl',
101
+ placeholder="please input hugging_face dataset path",
102
+ label="input_path",
103
+ visible=False
104
+ )
105
+ uploaded_file = gr.File(
106
+ label="upload file",
107
+ visible=True
108
+ )
109
+
110
+ data_format = gr.Dropdown(
111
+ ["jsonl", "json", "plaintext", "listjson"],
112
+ label="data_format"
113
+ )
114
+ column_content = gr.Textbox(
115
+ value="content",
116
+ placeholder="please input column name of content in dataset",
117
+ label="column_content"
118
+ )
119
+
120
+ rule_list = gr.CheckboxGroup(
121
+ choices=rule_options,
122
+ label="rule_list"
123
+ )
124
+ prompt_list = gr.CheckboxGroup(
125
+ choices=prompt_options,
126
+ label="prompt_list"
127
+ )
128
+ model = gr.Textbox(
129
+ placeholder="If want to use llm, please input model, such as: deepseek-chat",
130
+ label="model"
131
+ )
132
+ key = gr.Textbox(
133
+ placeholder="If want to use llm, please input key, such as: 123456789012345678901234567890xx",
134
+ label="API KEY"
135
+ )
136
+ api_url = gr.Textbox(
137
+ placeholder="If want to use llm, please input api_url, such as: https://api.deepseek.com/v1",
138
+ label="API URL"
139
+ )
140
+
141
  with gr.Row():
142
  submit_single = gr.Button(value="Submit", interactive=True, variant="primary")
143
+
144
  with gr.Column():
145
+ # 修改输出组件部分,使用Tabs
146
+ with gr.Tabs():
147
+ with gr.Tab("Result Summary"):
148
+ summary_output = gr.Textbox(label="summary", max_lines=50)
149
+ with gr.Tab("Result Detail"):
150
+ detail_output = gr.JSON(label="detail", max_height=800) # 使用JSON组件来更好地展示结构化数据
151
+
152
+ dataset_source.change(
153
+ fn=update_input_components,
154
+ inputs=dataset_source,
155
+ outputs=[input_path, uploaded_file]
156
+ )
157
 
158
  submit_single.click(
159
  fn=dingo_demo,
160
+ inputs=[dataset_source, input_path, uploaded_file, data_format, column_content, rule_list, prompt_list,
161
+ model, key, api_url],
162
+ outputs=[summary_output, detail_output] # 修改输出为两个组件
163
  )
164
 
165
  # 启动界面