acecalisto3 commited on
Commit
af05e7c
·
verified ·
1 Parent(s): 829ae99

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -10
app.py CHANGED
@@ -2,6 +2,7 @@ import io
2
  import os
3
  import re
4
  import time
 
5
  from itertools import islice
6
  from functools import partial
7
  from multiprocessing.pool import ThreadPool
@@ -79,17 +80,11 @@ tags:
79
  - infinite-dataset-hub
80
  - synthetic
81
  ---
82
-
83
  {title}
84
-
85
  _Note: This is an AI-generated dataset so its content may be inaccurate or false_
86
-
87
  {content}
88
-
89
  **Source of the data:**
90
-
91
  The dataset was generated using the [Infinite Dataset Hub]({url}) and {model_id} using the query '{search_query}':
92
-
93
  - **Dataset Generation Page**: {dataset_url}
94
  - **Model**: https://huggingface.co/{model_id}
95
  - **More Datasets**: https://huggingface.co/datasets?other=infinite-dataset-hub
@@ -99,7 +94,6 @@ css = """
99
  a {
100
  color: var(--body-text-color);
101
  }
102
-
103
  .datasetButton {
104
  justify-content: start;
105
  justify-content: left;
@@ -149,7 +143,6 @@ a {
149
  .insivibleButtonGroup {
150
  display: none;
151
  }
152
-
153
  @keyframes placeHolderShimmer{
154
  0%{
155
  background-position: -468px 0
@@ -377,7 +370,16 @@ with gr.Blocks(css=css) as demo:
377
  try:
378
  generated_df = parse_csv_df(generated_csv.strip(), csv_header=csv_header)
379
  if len(generated_df) > nb_samples:
380
- output[indices_to_generate[nb_samples]] = generated_df.iloc[-1].to_dict()
 
 
 
 
 
 
 
 
 
381
  nb_samples += 1
382
  yield 1
383
  except Exception:
@@ -409,6 +411,48 @@ with gr.Blocks(css=css) as demo:
409
  ]
410
 
411
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
412
  def parse_preview_df(content: str) -> tuple[str, pd.DataFrame]:
413
  _in_csv = False
414
  csv = "\n".join(
@@ -418,7 +462,27 @@ with gr.Blocks(css=css) as demo:
418
  )
419
  if not csv:
420
  raise gr.Error("Failed to parse CSV Preview")
421
- return csv.split("\n")[0], parse_csv_df(csv)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
422
 
423
 
424
  def parse_csv_df(csv: str, csv_header: Optional[str] = None) -> pd.DataFrame:
 
2
  import os
3
  import re
4
  import time
5
+ from typing import Any, Dict, List
6
  from itertools import islice
7
  from functools import partial
8
  from multiprocessing.pool import ThreadPool
 
80
  - infinite-dataset-hub
81
  - synthetic
82
  ---
 
83
  {title}
 
84
  _Note: This is an AI-generated dataset so its content may be inaccurate or false_
 
85
  {content}
 
86
  **Source of the data:**
 
87
  The dataset was generated using the [Infinite Dataset Hub]({url}) and {model_id} using the query '{search_query}':
 
88
  - **Dataset Generation Page**: {dataset_url}
89
  - **Model**: https://huggingface.co/{model_id}
90
  - **More Datasets**: https://huggingface.co/datasets?other=infinite-dataset-hub
 
94
  a {
95
  color: var(--body-text-color);
96
  }
 
97
  .datasetButton {
98
  justify-content: start;
99
  justify-content: left;
 
143
  .insivibleButtonGroup {
144
  display: none;
145
  }
 
146
  @keyframes placeHolderShimmer{
147
  0%{
148
  background-position: -468px 0
 
370
  try:
371
  generated_df = parse_csv_df(generated_csv.strip(), csv_header=csv_header)
372
  if len(generated_df) > nb_samples:
373
+ # Convert latest record to dict and refine it
374
+ record = generated_df.iloc[-1].to_dict()
375
+ refined_record = refine_data_generic([record])[0]
376
+
377
+ # Add quality flags if any
378
+ flags = detect_anomalies(refined_record)
379
+ if flags:
380
+ refined_record['_quality_flags'] = flags
381
+
382
+ output[indices_to_generate[nb_samples]] = refined_record
383
  nb_samples += 1
384
  yield 1
385
  except Exception:
 
411
  ]
412
 
413
 
414
+ def refine_data_generic(dataset: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
415
+ """
416
+ Universally refine any dataset.
417
+ Works on list of dicts. Detects field types and applies general cleanup.
418
+ """
419
+ def normalize_value(value):
420
+ if isinstance(value, str):
421
+ # Trim, title-case common descriptors, remove duplicate whitespace
422
+ value = re.sub(r'\s+', ' ', value.strip())
423
+ value = value.replace('_', ' ')
424
+ if any(k in value.lower() for k in ['color', 'material', 'type', 'status']):
425
+ value = value.title()
426
+ return value
427
+
428
+ def clean_record(record):
429
+ cleaned = {}
430
+ for key, value in record.items():
431
+ # Normalize key and value
432
+ clean_key = key.strip().lower().replace(" ", "_")
433
+ if isinstance(value, list):
434
+ cleaned[clean_key] = [normalize_value(v) for v in value]
435
+ elif isinstance(value, dict):
436
+ cleaned[clean_key] = clean_record(value)
437
+ else:
438
+ cleaned[clean_key] = normalize_value(value)
439
+ return cleaned
440
+
441
+ return [clean_record(entry) for entry in dataset]
442
+
443
def detect_anomalies(record: Dict[str, Any]) -> List[str]:
    """
    Detect potential anomalies in a single record.

    Checks performed per field:
      * string values longer than 300 characters are flagged as verbose;
      * missing values — ``None``, empty/whitespace-only strings, or common
        placeholder sentinels (``n/a``, ``none``, ``undefined``) — are
        flagged as missing or undefined.

    Args:
        record: Mapping of field name to value.

    Returns:
        A list of human-readable flag messages; empty when no issues found.
    """
    # Sentinels that commonly stand in for "no data" in generated CSVs.
    # The empty string covers blank cells; strip() before comparing so
    # whitespace-padded sentinels such as ' N/A ' are caught too.
    missing_sentinels = {'n/a', 'none', 'undefined', ''}
    flags = []
    for k, v in record.items():
        if v is None:
            flags.append(f"{k} is missing or undefined.")
        elif isinstance(v, str):
            if len(v) > 300:
                flags.append(f"{k} looks too verbose.")
            if v.strip().lower() in missing_sentinels:
                flags.append(f"{k} is missing or undefined.")
    return flags
455
+
456
  def parse_preview_df(content: str) -> tuple[str, pd.DataFrame]:
457
  _in_csv = False
458
  csv = "\n".join(
 
462
  )
463
  if not csv:
464
  raise gr.Error("Failed to parse CSV Preview")
465
+
466
+ # Get header and parse initial DataFrame
467
+ csv_header = csv.split("\n")[0]
468
+ df = parse_csv_df(csv)
469
+
470
+ # Convert DataFrame to list of dicts for refinement
471
+ records = df.to_dict('records')
472
+
473
+ # Apply refinement
474
+ refined_records = refine_data_generic(records)
475
+
476
+ # Add quality flags
477
+ for record in refined_records:
478
+ flags = detect_anomalies(record)
479
+ if flags:
480
+ record['_quality_flags'] = flags
481
+
482
+ # Convert back to DataFrame
483
+ refined_df = pd.DataFrame(refined_records)
484
+
485
+ return csv_header, refined_df
486
 
487
 
488
  def parse_csv_df(csv: str, csv_header: Optional[str] = None) -> pd.DataFrame: