admin committed on
Commit
8cf49e1
·
1 Parent(s): 0f02a44
Files changed (5) hide show
  1. .gitignore +3 -0
  2. README.md +3 -4
  3. app.py +207 -0
  4. requirements.txt +2 -0
  5. utils.py +12 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ *.gif
2
+ test.*
3
+ *__pycache__*
README.md CHANGED
@@ -1,12 +1,11 @@
1
  ---
2
  title: Data Converter
3
- emoji: 📚
4
  colorFrom: green
5
  colorTo: blue
6
  sdk: gradio
7
- sdk_version: 5.38.0
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Data Converter
3
+ emoji: 🔄
4
  colorFrom: green
5
  colorTo: blue
6
  sdk: gradio
7
+ sdk_version: 5.22.0
8
  app_file: app.py
9
  pinned: false
10
+ short_description: Convert data format
11
  ---
 
 
app.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import json
3
+ import gradio as gr
4
+ import pandas as pd
5
+ from utils import clean_dir, TMP_DIR, EN_US
6
+
7
+
8
# Current conversion direction. change_mode() rewrites the two entries in
# place and infer() reads them to pick the reader and the writer.
MODE: dict[str, str] = {"from": "jsonl", "to": "csv"}

# Chinese UI text -> English UI text, looked up by _L().
ZH2EN: dict[str, str] = {
    # widget labels
    "模式": "Mode",
    "上传原数据": "Upload input file",
    "转换": "Convert",
    "下载转换数据": "Download output file",
    "数据预览": "Data viewer",
    # headings of the format reference
    "支持的 JSON 格式": "Supported JSON format",
    "支持的 JSON Lines 格式": "Supported jsonl format",
    "支持的 CSV 格式": "Supported CSV format",
    # status box and page title
    "状态栏": "Status",
    "# 数据文件转换": "# Data Converter",
}
21
+
22
+
23
def _L(zh_txt: str):
    """Return the UI text for the active language.

    When EN_US is falsy the Chinese text is returned unchanged; otherwise
    its English counterpart is looked up in ZH2EN (a missing key raises
    KeyError).
    """
    if not EN_US:
        return zh_txt

    return ZH2EN[zh_txt]
25
+
26
+
27
def encoder_json(file_path: str):
    """Load a JSON file and return its records as a list.

    Args:
        file_path: Path of the JSON file to read (UTF-8, with or without BOM).

    Returns:
        The top-level array as a list. A single top-level object is treated
        as one record and returned as a one-element list.

    Raises:
        TypeError: If the top-level value is neither an array nor an object.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # "utf-8-sig" also accepts plain UTF-8 and strips a leading BOM, which
    # json.load() would otherwise reject.
    with open(file_path, "r", encoding="utf-8-sig") as file:
        payload = json.load(file)

    if isinstance(payload, list):
        return payload

    # list(dict) would silently yield only the keys, so wrap the object.
    if isinstance(payload, dict):
        return [payload]

    raise TypeError(
        f"Expected a JSON array of objects, got {type(payload).__name__}"
    )
32
+
33
+
34
def encoder_jsonl(file_path: str):
    """Load a JSON Lines file and return one parsed value per non-blank line.

    Args:
        file_path: Path of the .jsonl file to read (UTF-8, with or without BOM).

    Returns:
        A list with the parsed JSON value of every non-blank line, in file
        order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data_list = []
    # "utf-8-sig" strips a leading BOM that would break parsing of line 1.
    with open(file_path, "r", encoding="utf-8-sig") as file:
        for line in file:
            record = line.strip()
            # Blank lines (typically a trailing empty line) carry no record
            # and would make json.loads("") fail.
            if not record:
                continue

            data_list.append(json.loads(record))

    return data_list
43
+
44
+
45
def encoder_csv(file_path: str):
    """Load a CSV file with a header row and return its rows as dicts.

    The file is decoded as UTF-8 first (a leading BOM is stripped) and, if
    that fails, as GBK.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A list with one dict per data row, keyed by the header names.

    Raises:
        UnicodeDecodeError: If the file decodes as neither UTF-8 nor GBK.
    """
    last_error = None
    for encoding in ("utf-8-sig", "GBK"):
        try:
            # newline="" lets the csv module handle line endings itself,
            # as its documentation recommends.
            with open(file_path, "r", encoding=encoding, newline="") as file:
                # The list is built from scratch for every attempt: a decode
                # error can surface after some rows were already read, and
                # those partial rows must not leak into the retry's result.
                return [dict(row) for row in csv.DictReader(file)]

        except UnicodeDecodeError as err:
            last_error = err

    raise last_error
60
+
61
+
62
def decoder_json(data_list: list, file_path: str):
    """Write the records to *file_path* as one indented JSON array.

    The file is always written, so the returned path exists even when
    *data_list* is empty (the file then contains an empty array).

    Args:
        data_list: Records to serialize.
        file_path: Destination path; an existing file is overwritten.

    Returns:
        file_path, unchanged.
    """
    with open(file_path, "w", encoding="utf-8") as file:
        # Dump the whole list as a single JSON document; non-ASCII text is
        # kept readable instead of being \u-escaped.
        json.dump(list(data_list or []), file, ensure_ascii=False, indent=4)

    return file_path
69
+
70
+
71
def decoder_csv(data_list: list, file_path: str):
    """Write the records to *file_path* as CSV with a header row.

    The header is the union of the keys of all records, in order of first
    appearance, so records with missing or additional keys are supported;
    missing values are written as empty cells. The file is always created,
    so the returned path exists even when *data_list* is empty.

    Args:
        data_list: Records (dicts) to serialize.
        file_path: Destination path; an existing file is overwritten.

    Returns:
        file_path, unchanged.
    """
    records = data_list or []
    # dict.fromkeys de-duplicates while keeping first-seen order.
    header = list(dict.fromkeys(key for item in records for key in item))
    with open(file_path, "w", newline="", encoding="utf-8") as file:
        if records:
            csv_writer = csv.DictWriter(file, fieldnames=header, restval="")
            csv_writer.writeheader()
            csv_writer.writerows(records)

    return file_path
81
+
82
+
83
def decoder_jsonl(data_list: list, file_path: str):
    """Write the records to *file_path* in JSON Lines form.

    Each record is serialized on its own line. The file is always created,
    so the returned path exists even when *data_list* is empty (the file is
    then empty).

    Args:
        data_list: Records to serialize.
        file_path: Destination path; an existing file is overwritten.

    Returns:
        file_path, unchanged.
    """
    with open(file_path, "w", encoding="utf-8") as file:
        # One JSON document per line, non-ASCII text kept readable.
        file.writelines(
            f"{json.dumps(data, ensure_ascii=False)}\n"
            for data in (data_list or [])
        )

    return file_path
92
+
93
+
94
def change_mode(input: str):
    """Update the module-level MODE from a dropdown label.

    The label is split on single spaces into "<left> <arrow> <right>".
    With the arrow "→" the conversion goes left to right; with any other
    arrow it goes right to left. MODE is mutated in place.
    """
    parts = input.split(" ")
    if parts[1] != "→":
        # Reverse direction: the right-hand format is the source.
        MODE["from"] = parts[2]
        MODE["to"] = parts[0]
        return

    MODE["from"] = parts[0]
    MODE["to"] = parts[2]
104
+
105
+
106
+ # outer func
107
def infer(input_file: str, cache=f"{TMP_DIR}/data"):
    """Convert *input_file* according to the current MODE.

    The cache directory is emptied, the input is parsed with the reader for
    MODE["from"], and the records are written to "<cache>/output.<to>" with
    the writer for MODE["to"].

    Args:
        input_file: Path of the uploaded file.
        cache: Directory that receives the converted file; it is wiped on
            every call.

    Returns:
        A tuple (status, output_file, previews): status is "Success" or the
        text of the exception that occurred, output_file is the path of the
        converted file (None if conversion failed before writing), and
        previews is a pandas DataFrame of the records (None on failure).
    """
    # Explicit dispatch tables instead of eval(): the format names originate
    # from UI-supplied text, which must never be evaluated as code.
    encoders = {
        "json": encoder_json,
        "jsonl": encoder_jsonl,
        "csv": encoder_csv,
    }
    decoders = {
        "json": decoder_json,
        "jsonl": decoder_jsonl,
        "csv": decoder_csv,
    }
    status = "Success"
    output_file = previews = None
    try:
        clean_dir(cache)
        src_fmt = MODE["from"]
        dst_fmt = MODE["to"]
        if src_fmt not in encoders or dst_fmt not in decoders:
            raise ValueError(f"Unsupported conversion: {src_fmt} → {dst_fmt}")

        data_list = encoders[src_fmt](input_file)
        output_file = decoders[dst_fmt](data_list, f"{cache}/output.{dst_fmt}")
        previews = pd.DataFrame(data_list)

    except Exception as e:
        # UI boundary: report the failure in the status box instead of
        # letting the exception escape the handler.
        status = f"{e}"

    return status, output_file, previews
123
+
124
+
125
if __name__ == "__main__":
    # Build one tab per format pair; each tab offers both conversion
    # directions, an upload box, a convert button and the result widgets.
    # NOTE(review): the nesting of the `with` blocks was reconstructed from
    # a diff view that had lost its indentation; confirm the placement of
    # the event bindings and of the format-reference row.
    tab_cfgs = ["jsonl ⇆ csv", "json ⇆ csv", "json ⇆ jsonl"]
    with gr.Blocks() as data:
        gr.Markdown(_L("# 数据文件转换"))
        for item in tab_cfgs:
            types = item.split(" ⇆ ")
            with gr.Tab(item) as tab:
                with gr.Row():
                    with gr.Column():
                        option = gr.Dropdown(
                            choices=[
                                f"{types[0]} → {types[1]}",
                                f"{types[0]} ← {types[1]}",
                            ],
                            label=_L("模式"),
                            value=f"{types[0]} → {types[1]}",
                        )
                        input_file = gr.File(
                            type="filepath",
                            label=_L("上传原数据"),
                            file_types=[f".{types[0]}", f".{types[1]}"],
                        )
                        convert_btn = gr.Button(_L("转换"))

                    with gr.Column():
                        status_bar = gr.Textbox(
                            label=_L("状态栏"),
                            show_copy_button=True,
                        )
                        output_file = gr.File(
                            type="filepath", label=_L("下载转换数据"))
                        # The label must match a ZH2EN key exactly, otherwise
                        # _L() raises KeyError in English mode.
                        data_viewer = gr.Dataframe(label=_L("数据预览"))

                # Keep MODE in sync with the dropdown of the visible tab.
                option.change(change_mode, inputs=option)
                tab.select(change_mode, inputs=option)
                convert_btn.click(
                    infer,
                    inputs=input_file,
                    outputs=[status_bar, output_file, data_viewer],
                )

        with gr.Row():
            with gr.Column():
                gr.Markdown(
                    f"""
## {_L('支持的 JSON Lines 格式')}
```
{{"key1": "val11", "key2": "val12", ...}}
{{"key1": "val21", "key2": "val22", ...}}
...
```
## {_L('支持的 CSV 格式')}
```
key1, key2, ...
val11, val12, ...
val21, val22, ...
...
```
"""
                )

            with gr.Column():
                gr.Markdown(
                    f"""
## {_L('支持的 JSON 格式')}
```
[
{{
"key1": "val11",
"key2": "val12",
...
}},
{{
"key1": "val21",
"key2": "val22",
...
}},
...
]
```"""
                )

    data.launch()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ exifread
2
+ moviepy==1.0.3
utils.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+
4
# True unless the LANG environment variable is exactly "zh_CN.UTF-8".
EN_US = os.environ.get("LANG") != "zh_CN.UTF-8"
# Base directory name that app.py imports to build its cache path.
TMP_DIR = "./__pycache__"
6
+
7
+
8
def clean_dir(dir_path: str):
    """Make *dir_path* an existing, empty directory.

    Whatever currently occupies the path is removed first: a directory is
    deleted with all its contents, a regular file or symbolic link is
    unlinked. The directory (and any missing parents) is then created.

    Args:
        dir_path: Path of the directory to reset.
    """
    if os.path.isdir(dir_path) and not os.path.islink(dir_path):
        shutil.rmtree(dir_path)
    elif os.path.lexists(dir_path):
        # A plain file or a (possibly dangling) symlink: rmtree cannot
        # remove these, and makedirs would fail while they are in the way.
        os.remove(dir_path)

    os.makedirs(dir_path)