Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -2,6 +2,7 @@ import io
|
|
2 |
import os
|
3 |
import re
|
4 |
import time
|
|
|
5 |
from itertools import islice
|
6 |
from functools import partial
|
7 |
from multiprocessing.pool import ThreadPool
|
@@ -79,17 +80,11 @@ tags:
|
|
79 |
- infinite-dataset-hub
|
80 |
- synthetic
|
81 |
---
|
82 |
-
|
83 |
{title}
|
84 |
-
|
85 |
_Note: This is an AI-generated dataset so its content may be inaccurate or false_
|
86 |
-
|
87 |
{content}
|
88 |
-
|
89 |
**Source of the data:**
|
90 |
-
|
91 |
The dataset was generated using the [Infinite Dataset Hub]({url}) and {model_id} using the query '{search_query}':
|
92 |
-
|
93 |
- **Dataset Generation Page**: {dataset_url}
|
94 |
- **Model**: https://huggingface.co/{model_id}
|
95 |
- **More Datasets**: https://huggingface.co/datasets?other=infinite-dataset-hub
|
@@ -99,7 +94,6 @@ css = """
|
|
99 |
a {
|
100 |
color: var(--body-text-color);
|
101 |
}
|
102 |
-
|
103 |
.datasetButton {
|
104 |
justify-content: start;
|
105 |
justify-content: left;
|
@@ -149,7 +143,6 @@ a {
|
|
149 |
.insivibleButtonGroup {
|
150 |
display: none;
|
151 |
}
|
152 |
-
|
153 |
@keyframes placeHolderShimmer{
|
154 |
0%{
|
155 |
background-position: -468px 0
|
@@ -377,7 +370,16 @@ with gr.Blocks(css=css) as demo:
|
|
377 |
try:
|
378 |
generated_df = parse_csv_df(generated_csv.strip(), csv_header=csv_header)
|
379 |
if len(generated_df) > nb_samples:
|
380 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
381 |
nb_samples += 1
|
382 |
yield 1
|
383 |
except Exception:
|
@@ -409,6 +411,48 @@ with gr.Blocks(css=css) as demo:
|
|
409 |
]
|
410 |
|
411 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
412 |
def parse_preview_df(content: str) -> tuple[str, pd.DataFrame]:
|
413 |
_in_csv = False
|
414 |
csv = "\n".join(
|
@@ -418,7 +462,27 @@ with gr.Blocks(css=css) as demo:
|
|
418 |
)
|
419 |
if not csv:
|
420 |
raise gr.Error("Failed to parse CSV Preview")
|
421 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
422 |
|
423 |
|
424 |
def parse_csv_df(csv: str, csv_header: Optional[str] = None) -> pd.DataFrame:
|
|
|
2 |
import os
|
3 |
import re
|
4 |
import time
|
5 |
+
from typing import Any, Dict, List
|
6 |
from itertools import islice
|
7 |
from functools import partial
|
8 |
from multiprocessing.pool import ThreadPool
|
|
|
80 |
- infinite-dataset-hub
|
81 |
- synthetic
|
82 |
---
|
|
|
83 |
{title}
|
|
|
84 |
_Note: This is an AI-generated dataset so its content may be inaccurate or false_
|
|
|
85 |
{content}
|
|
|
86 |
**Source of the data:**
|
|
|
87 |
The dataset was generated using the [Infinite Dataset Hub]({url}) and {model_id} using the query '{search_query}':
|
|
|
88 |
- **Dataset Generation Page**: {dataset_url}
|
89 |
- **Model**: https://huggingface.co/{model_id}
|
90 |
- **More Datasets**: https://huggingface.co/datasets?other=infinite-dataset-hub
|
|
|
94 |
a {
|
95 |
color: var(--body-text-color);
|
96 |
}
|
|
|
97 |
.datasetButton {
|
98 |
justify-content: start;
|
99 |
justify-content: left;
|
|
|
143 |
.insivibleButtonGroup {
|
144 |
display: none;
|
145 |
}
|
|
|
146 |
@keyframes placeHolderShimmer{
|
147 |
0%{
|
148 |
background-position: -468px 0
|
|
|
370 |
try:
|
371 |
generated_df = parse_csv_df(generated_csv.strip(), csv_header=csv_header)
|
372 |
if len(generated_df) > nb_samples:
|
373 |
+
# Convert latest record to dict and refine it
|
374 |
+
record = generated_df.iloc[-1].to_dict()
|
375 |
+
refined_record = refine_data_generic([record])[0]
|
376 |
+
|
377 |
+
# Add quality flags if any
|
378 |
+
flags = detect_anomalies(refined_record)
|
379 |
+
if flags:
|
380 |
+
refined_record['_quality_flags'] = flags
|
381 |
+
|
382 |
+
output[indices_to_generate[nb_samples]] = refined_record
|
383 |
nb_samples += 1
|
384 |
yield 1
|
385 |
except Exception:
|
|
|
411 |
]
|
412 |
|
413 |
|
414 |
+
def refine_data_generic(dataset: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Universally refine any dataset.

    Works on a list of dicts. Keys are normalized to snake_case and string
    values get a general cleanup: whitespace is trimmed and collapsed,
    underscores become spaces, and values containing common descriptor words
    ('color', 'material', 'type', 'status') are title-cased. Nested dicts are
    cleaned recursively — including dicts nested inside list values, which the
    previous implementation left untouched.

    Args:
        dataset: Records to refine; each record is a (possibly nested) dict.

    Returns:
        A new list of cleaned records; the input is not mutated.
    """
    # Values containing any of these substrings are title-cased for consistency.
    descriptor_keywords = ('color', 'material', 'type', 'status')

    def normalize_value(value: Any) -> Any:
        # Only strings are normalized; every other scalar passes through as-is.
        if isinstance(value, str):
            # Trim, remove duplicate whitespace, then drop underscores.
            value = re.sub(r'\s+', ' ', value.strip())
            value = value.replace('_', ' ')
            if any(k in value.lower() for k in descriptor_keywords):
                value = value.title()
        return value

    def clean_item(value: Any) -> Any:
        # Recurse into containers so dicts inside lists are cleaned too
        # (fixes the gap where list elements skipped record cleaning).
        if isinstance(value, dict):
            return clean_record(value)
        if isinstance(value, list):
            return [clean_item(v) for v in value]
        return normalize_value(value)

    def clean_record(record: Dict[str, Any]) -> Dict[str, Any]:
        # Normalize each key to snake_case and clean its value.
        return {
            key.strip().lower().replace(" ", "_"): clean_item(value)
            for key, value in record.items()
        }

    return [clean_record(entry) for entry in dataset]
|
442 |
+
|
443 |
+
def detect_anomalies(record: Dict[str, Any]) -> List[str]:
    """
    Detect potential anomalies in a record.

    Two checks are applied to every string value:
      * longer than 300 characters -> flagged as too verbose;
      * a placeholder value ('n/a', 'none', 'undefined', case-insensitive)
        -> flagged as missing or undefined.

    Non-string values are never flagged.

    Args:
        record: Field-name -> value mapping to inspect.

    Returns:
        A list of human-readable flag messages; empty when nothing is suspect.
    """
    # Hoisted constant: set membership is O(1) and is not rebuilt per field.
    placeholder_values = {'n/a', 'none', 'undefined'}
    flags: List[str] = []
    for key, value in record.items():
        if not isinstance(value, str):
            # Only string fields are checked; other types pass silently.
            continue
        if len(value) > 300:
            flags.append(f"{key} looks too verbose.")
        if value.lower() in placeholder_values:
            flags.append(f"{key} is missing or undefined.")
    return flags
|
455 |
+
|
456 |
def parse_preview_df(content: str) -> tuple[str, pd.DataFrame]:
|
457 |
_in_csv = False
|
458 |
csv = "\n".join(
|
|
|
462 |
)
|
463 |
if not csv:
|
464 |
raise gr.Error("Failed to parse CSV Preview")
|
465 |
+
|
466 |
+
# Get header and parse initial DataFrame
|
467 |
+
csv_header = csv.split("\n")[0]
|
468 |
+
df = parse_csv_df(csv)
|
469 |
+
|
470 |
+
# Convert DataFrame to list of dicts for refinement
|
471 |
+
records = df.to_dict('records')
|
472 |
+
|
473 |
+
# Apply refinement
|
474 |
+
refined_records = refine_data_generic(records)
|
475 |
+
|
476 |
+
# Add quality flags
|
477 |
+
for record in refined_records:
|
478 |
+
flags = detect_anomalies(record)
|
479 |
+
if flags:
|
480 |
+
record['_quality_flags'] = flags
|
481 |
+
|
482 |
+
# Convert back to DataFrame
|
483 |
+
refined_df = pd.DataFrame(refined_records)
|
484 |
+
|
485 |
+
return csv_header, refined_df
|
486 |
|
487 |
|
488 |
def parse_csv_df(csv: str, csv_header: Optional[str] = None) -> pd.DataFrame:
|