yokoha committed on
Commit
4e21102
·
verified ·
1 Parent(s): 7098b45

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +126 -53
app.py CHANGED
@@ -1,58 +1,93 @@
1
  import gradio as gr
2
  import pandas as pd
3
  from io import BytesIO
 
4
 
5
def convert_file(input_file, conversion_type):
    """Convert an uploaded CSV file to Parquet, or a Parquet file to CSV.

    Args:
        input_file: The upload, either a file-like object exposing ``read()``
            and ``name``, or a filesystem path string. ``None`` means that
            nothing was uploaded.
        conversion_type: ``"CSV to Parquet"`` or ``"Parquet to CSV"``.

    Returns:
        A ``(output_path, info_message)`` tuple. ``output_path`` is
        ``"output.parquet"`` or ``"output.csv"``; ``info_message`` is a short
        summary followed by a preview of the first 10 rows.

    Raises:
        ValueError: If no file was uploaded, the file extension does not
            match the selected conversion, the conversion type is unknown,
            or the CSV content cannot be decoded.
    """
    # Check if a file was uploaded
    if input_file is None:
        raise ValueError("Please upload a file.")

    # The upload may be a file-like object or just a path string.
    try:
        file_bytes = input_file.read()
        file_name = input_file.name
    except AttributeError:
        # No read()/name attribute: treat input_file as a file path.
        file_name = input_file
        with open(file_name, "rb") as f:
            file_bytes = f.read()

    file_extension = file_name.lower().split('.')[-1]

    # Conversion: CSV to Parquet
    if conversion_type == "CSV to Parquet":
        if file_extension != "csv":
            raise ValueError("For CSV to Parquet conversion, please upload a CSV file.")
        # CSV files are frequently not UTF-8. Try UTF-8 first, then the
        # common Windows code page, and finally latin1, which maps every
        # byte and therefore acts as a last resort.
        for candidate in ("utf-8", "cp1252", "latin1"):
            try:
                df = pd.read_csv(BytesIO(file_bytes), encoding=candidate)
                break
            except UnicodeError:
                continue
        else:
            raise ValueError("Failed to decode the CSV file.")
        output_file = "output.parquet"
        df.to_parquet(output_file, index=False)
        converted_format = "Parquet"

    # Conversion: Parquet to CSV
    elif conversion_type == "Parquet to CSV":
        if file_extension != "parquet":
            raise ValueError("For Parquet to CSV conversion, please upload a Parquet file.")
        df = pd.read_parquet(BytesIO(file_bytes))
        output_file = "output.csv"
        df.to_csv(output_file, index=False)
        converted_format = "CSV"
    else:
        raise ValueError("Invalid conversion type selected.")

    # Generate a preview of the top 10 rows
    preview = df.head(10).to_string(index=False)
    info_message = (
        f"Input file: {file_name}\n"
        f"Converted file format: {converted_format}\n"
        f"Total rows: {len(df)}\n"
        f"Total columns: {len(df.columns)}\n\n"
        f"Preview (Top 10 Rows):\n{preview}"
    )
    return output_file, info_message
56
 
57
  # Enhanced custom CSS for a more visually appealing interface
58
  custom_css = """
@@ -184,6 +219,24 @@ h2 {
184
  background: linear-gradient(to right, transparent, #ddd, transparent);
185
  margin: 25px 0;
186
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  """
188
 
189
  with gr.Blocks(css=custom_css, title="DataFormat Converter") as demo:
@@ -200,13 +253,14 @@ with gr.Blocks(css=custom_css, title="DataFormat Converter") as demo:
200
  <h3>How It Works</h3>
201
  <div class="instruction-step">1. Upload your CSV or Parquet file</div>
202
  <div class="instruction-step">2. Select the conversion direction</div>
203
- <div class="instruction-step">3. Click "Convert" and download your transformed file</div>
 
204
  </div>
205
 
206
  <div class="info-section">
207
  <div class="info-tag">Fast Conversion</div>
208
  <div class="info-tag">Data Preview</div>
209
- <div class="info-tag">No Size Limits</div>
210
  <div class="info-tag">Maintains Structure</div>
211
  </div>
212
  """)
@@ -240,6 +294,11 @@ with gr.Blocks(css=custom_css, title="DataFormat Converter") as demo:
240
  value="CSV to Parquet",
241
  elem_classes=["conversion-radio"]
242
  )
 
 
 
 
 
243
  convert_button = gr.Button("Convert Now", elem_classes=["convert-button"])
244
  gr.HTML('</div>') # Close the file-box div
245
 
@@ -263,8 +322,22 @@ with gr.Blocks(css=custom_css, title="DataFormat Converter") as demo:
263
 
264
  convert_button.click(
265
  fn=convert_file,
266
- inputs=[input_file, conversion_type],
267
  outputs=[output_file, preview]
268
  )
269
 
270
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import pandas as pd
3
  from io import BytesIO
4
+ import chardet
5
 
6
def detect_encoding(file_bytes):
    """Guess the text encoding of raw file content using chardet.

    Only the first 10,000 bytes are inspected so that detection stays fast
    on large uploads.

    Args:
        file_bytes: The raw bytes of the file.

    Returns:
        The codec name reported by chardet, or ``'utf-8'`` when chardet
        cannot decide (empty or binary input) or reports plain ASCII.
    """
    result = chardet.detect(file_bytes[:10000])
    encoding = result.get('encoding')
    # chardet reports None when it has no guess. An 'ascii' verdict only
    # describes the sampled prefix; UTF-8 decodes ASCII identically and also
    # copes with multi-byte characters that appear later in the file.
    if not encoding or encoding.lower() == 'ascii':
        return 'utf-8'
    return encoding
11
+
12
def convert_file(input_file, conversion_type, encoding_option):
    """Convert an uploaded CSV file to Parquet, or a Parquet file to CSV.

    Args:
        input_file: The upload, either a file-like object exposing ``read()``
            and ``name``, or a filesystem path string. ``None`` means that
            nothing was uploaded.
        conversion_type: ``"CSV to Parquet"`` or ``"Parquet to CSV"``.
        encoding_option: ``"Auto-detect"`` or an explicit codec name. It is
            used to decode CSV input and, when given explicitly, to encode
            CSV output.

    Returns:
        A ``(output_path, message)`` tuple. On success ``output_path`` is
        ``"output.parquet"`` or ``"output.csv"`` and ``message`` is a summary
        followed by a preview of the first 10 rows. On any failure
        ``output_path`` is ``None`` and ``message`` describes the problem;
        exceptions are reported through the message instead of being raised.
    """
    try:
        # Check if a file was uploaded
        if input_file is None:
            return None, "Please upload a file."

        # The upload may be a file-like object or just a path string.
        try:
            file_bytes = input_file.read()
            file_name = input_file.name
        except AttributeError:
            # No read()/name attribute: treat input_file as a file path.
            file_name = input_file
            with open(file_name, "rb") as f:
                file_bytes = f.read()

        file_extension = file_name.lower().split('.')[-1]
        auto_detect = encoding_option == "Auto-detect"

        # Conversion: CSV to Parquet
        if conversion_type == "CSV to Parquet":
            if file_extension != "csv":
                return None, "For CSV to Parquet conversion, please upload a CSV file."

            # Detection only makes sense for text input, so it happens here
            # and not for the Parquet branch.
            requested = detect_encoding(file_bytes) if auto_detect else encoding_option

            # Try the requested codec first, then progressively more lenient
            # ones. latin1 maps every byte, so it must come last or it would
            # mask the better candidates. dict.fromkeys removes duplicates
            # while keeping the order.
            candidates = dict.fromkeys([requested or "utf-8", "utf-8", "cp1252", "latin1"])
            df = None
            encoding = None
            for candidate in candidates:
                try:
                    df = pd.read_csv(BytesIO(file_bytes), encoding=candidate)
                except (UnicodeError, LookupError):
                    # UnicodeError also covers "UTF-16 stream does not start
                    # with BOM"; LookupError covers codec names Python does
                    # not know.
                    continue
                encoding = candidate  # Remember the codec that worked
                break
            if df is None:
                return None, f"Failed to decode the CSV file. Auto-detected encoding was '{requested}'. Please try selecting a specific encoding."

            output_file = "output.parquet"
            df.to_parquet(output_file, index=False)
            converted_format = "Parquet"

        # Conversion: Parquet to CSV
        elif conversion_type == "Parquet to CSV":
            if file_extension != "parquet":
                return None, "For Parquet to CSV conversion, please upload a Parquet file."

            # Parquet is binary, so there is nothing to detect; write UTF-8
            # unless the user explicitly picked another codec.
            encoding = "utf-8" if auto_detect else encoding_option
            df = pd.read_parquet(BytesIO(file_bytes))
            output_file = "output.csv"
            df.to_csv(output_file, index=False, encoding=encoding)
            converted_format = "CSV"
        else:
            return None, "Invalid conversion type selected."

        # Generate a preview of the top 10 rows
        preview = df.head(10).to_string(index=False)
        info_message = (
            f"Input file: {file_name}\n"
            f"Converted file format: {converted_format}\n"
            f"Encoding used: {encoding}\n"
            f"Total rows: {len(df)}\n"
            f"Total columns: {len(df.columns)}\n\n"
            f"Preview (Top 10 Rows):\n{preview}"
        )
        return output_file, info_message

    except Exception as e:
        # UI boundary: report the failure in the preview box instead of
        # letting the callback crash.
        return None, f"Error during conversion: {str(e)}"
 
 
 
 
 
 
 
 
91
 
92
  # Enhanced custom CSS for a more visually appealing interface
93
  custom_css = """
 
219
  background: linear-gradient(to right, transparent, #ddd, transparent);
220
  margin: 25px 0;
221
  }
222
+
223
+ .error-message {
224
+ color: #d93025;
225
+ background-color: #fce8e6;
226
+ padding: 10px;
227
+ border-radius: 8px;
228
+ margin-top: 10px;
229
+ font-size: 0.9rem;
230
+ }
231
+
232
+ .success-message {
233
+ color: #188038;
234
+ background-color: #e6f4ea;
235
+ padding: 10px;
236
+ border-radius: 8px;
237
+ margin-top: 10px;
238
+ font-size: 0.9rem;
239
+ }
240
  """
241
 
242
  with gr.Blocks(css=custom_css, title="DataFormat Converter") as demo:
 
253
  <h3>How It Works</h3>
254
  <div class="instruction-step">1. Upload your CSV or Parquet file</div>
255
  <div class="instruction-step">2. Select the conversion direction</div>
256
+ <div class="instruction-step">3. Choose encoding (or leave as auto-detect)</div>
257
+ <div class="instruction-step">4. Click "Convert" and download your transformed file</div>
258
  </div>
259
 
260
  <div class="info-section">
261
  <div class="info-tag">Fast Conversion</div>
262
  <div class="info-tag">Data Preview</div>
263
+ <div class="info-tag">Multi-Encoding Support</div>
264
  <div class="info-tag">Maintains Structure</div>
265
  </div>
266
  """)
 
294
  value="CSV to Parquet",
295
  elem_classes=["conversion-radio"]
296
  )
297
+ encoding_option = gr.Dropdown(
298
+ choices=["Auto-detect", "utf-8", "latin1", "iso-8859-1", "cp1252", "utf-16"],
299
+ value="Auto-detect",
300
+ label="Select CSV Encoding"
301
+ )
302
  convert_button = gr.Button("Convert Now", elem_classes=["convert-button"])
303
  gr.HTML('</div>') # Close the file-box div
304
 
 
322
 
323
  convert_button.click(
324
  fn=convert_file,
325
+ inputs=[input_file, conversion_type, encoding_option],
326
  outputs=[output_file, preview]
327
  )
328
 
329
+ # Add dependency handling to show/hide encoding options based on conversion type
330
def update_encoding_visibility(conversion_type):
    """Show the encoding dropdown only for the CSV-to-Parquet direction.

    Returns a Gradio update that makes the target component visible when
    ``conversion_type`` is ``"CSV to Parquet"`` and hides it otherwise.
    """
    show_dropdown = conversion_type == "CSV to Parquet"
    return gr.update(visible=show_dropdown)
335
+
336
+ conversion_type.change(
337
+ fn=update_encoding_visibility,
338
+ inputs=conversion_type,
339
+ outputs=encoding_option
340
+ )
341
+
342
+ if __name__ == "__main__":
343
+ demo.launch()