Update app.py
Browse files
app.py
CHANGED
|
@@ -47,9 +47,9 @@ except ImportError:
|
|
| 47 |
# βββββββββββββββββ Config & Constants βββββββββββββββββββββββββββββββββββββββ
|
| 48 |
TMP_ROOT = Path(tempfile.gettempdir()) / "rf_datasets"
|
| 49 |
TMP_ROOT.mkdir(parents=True, exist_ok=True)
|
| 50 |
-
CPU_COUNT
|
| 51 |
-
BATCH_SIZE
|
| 52 |
-
SAMPLE_LIMIT = int(os.getenv("QC_SAMPLE",
|
| 53 |
|
| 54 |
DEFAULT_W = {
|
| 55 |
"Integrity": 0.25,
|
|
@@ -70,7 +70,7 @@ class QCConfig:
|
|
| 70 |
weights: str | None
|
| 71 |
cpu_count: int = CPU_COUNT
|
| 72 |
batch_size: int = BATCH_SIZE
|
| 73 |
-
sample_limit:int = SAMPLE_LIMIT
|
| 74 |
|
| 75 |
# βββββββββββ Helpers & Caching βββββββββββββββββββββββββββββββββββββββββββββ
|
| 76 |
def load_yaml(path: Path) -> Dict:
|
|
@@ -171,12 +171,12 @@ def qc_class_balance(lbls: List[Path], cfg: QCConfig) -> Dict:
|
|
| 171 |
boxes.append(len(bs))
|
| 172 |
counts.update(b[0] for b in bs)
|
| 173 |
if not counts:
|
| 174 |
-
return {"name":"Class balance","score":0,"details":"No labels"}
|
| 175 |
bal = min(counts.values()) / max(counts.values()) * 100
|
| 176 |
return {
|
| 177 |
-
"name":"Class balance",
|
| 178 |
-
"score":bal,
|
| 179 |
-
"details":{
|
| 180 |
"class_counts": dict(counts),
|
| 181 |
"boxes_per_image": {
|
| 182 |
"min": min(boxes),
|
|
@@ -188,7 +188,7 @@ def qc_class_balance(lbls: List[Path], cfg: QCConfig) -> Dict:
|
|
| 188 |
|
| 189 |
def qc_image_quality(imgs: List[Path], cfg: QCConfig) -> Dict:
|
| 190 |
if cv2 is None:
|
| 191 |
-
return {"name":"Image quality","score":100,"details":"cv2 missing"}
|
| 192 |
blurry, dark, bright = [], [], []
|
| 193 |
sample = imgs[:cfg.sample_limit]
|
| 194 |
with ThreadPoolExecutor(max_workers=cfg.cpu_count) as ex:
|
|
@@ -200,9 +200,9 @@ def qc_image_quality(imgs: List[Path], cfg: QCConfig) -> Dict:
|
|
| 200 |
bad = len({*blurry, *dark, *bright})
|
| 201 |
score = 100 - bad / max(len(sample), 1) * 100
|
| 202 |
return {
|
| 203 |
-
"name":"Image quality",
|
| 204 |
-
"score":score,
|
| 205 |
-
"details":{
|
| 206 |
"blurry": [str(p) for p in blurry],
|
| 207 |
"dark": [str(p) for p in dark],
|
| 208 |
"bright": [str(p) for p in bright]
|
|
@@ -214,31 +214,36 @@ def qc_duplicates(imgs: List[Path], cfg: QCConfig) -> Dict:
|
|
| 214 |
try:
|
| 215 |
fd = fastdup.create(
|
| 216 |
input_dir=str(Path(imgs[0]).parent.parent),
|
| 217 |
-
work_dir=str(TMP_ROOT/
|
| 218 |
)
|
| 219 |
fd.run()
|
| 220 |
-
|
| 221 |
-
|
|
|
|
| 222 |
return {
|
| 223 |
-
"name":"Duplicates",
|
| 224 |
-
"score":100-dup/len(imgs)*100,
|
| 225 |
-
"details":{"groups":clusters[:50]}
|
| 226 |
}
|
| 227 |
except Exception as e:
|
| 228 |
-
return {
|
| 229 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
|
| 231 |
def qc_model_qa(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
|
| 232 |
model = get_model(cfg.weights)
|
| 233 |
if model is None:
|
| 234 |
-
return {"name":"Model QA","score":100,"details":"skipped"}
|
| 235 |
ious, mism = [], []
|
| 236 |
sample = imgs[:cfg.sample_limit]
|
| 237 |
for i in range(0, len(sample), cfg.batch_size):
|
| 238 |
-
batch = sample[i:i+cfg.batch_size]
|
| 239 |
results = model.predict(batch, verbose=False, half=True, dynamic=True)
|
| 240 |
for p, res in zip(batch, results):
|
| 241 |
-
gt = parse_label_file(Path(p).parent.parent/'labels'/f"{Path(p).stem}.txt")
|
| 242 |
for cls, x, y, w, h in gt:
|
| 243 |
best = 0.0
|
| 244 |
for b, c, conf in zip(
|
|
@@ -254,52 +259,51 @@ def qc_model_qa(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
|
|
| 254 |
mism.append(str(p))
|
| 255 |
miou = float(np.mean(ious)) if ious else 1.0
|
| 256 |
return {
|
| 257 |
-
"name":"Model QA",
|
| 258 |
-
"score":miou*100,
|
| 259 |
-
"details":{"mean_iou":miou, "mismatches":mism[:50]}
|
| 260 |
}
|
| 261 |
|
| 262 |
def qc_label_issues(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
|
| 263 |
if get_noise_indices is None:
|
| 264 |
-
return {"name":"Label issues","score":100,"details":"skipped"}
|
| 265 |
labels, idxs = [], []
|
| 266 |
sample = imgs[:cfg.sample_limit]
|
| 267 |
-
model = get_model(cfg.weights)
|
| 268 |
for i, p in enumerate(sample):
|
| 269 |
bs = parse_label_file(lbls[i]) if lbls[i] else []
|
| 270 |
for cls, *_ in bs:
|
| 271 |
labels.append(int(cls))
|
| 272 |
idxs.append(i)
|
| 273 |
if not labels:
|
| 274 |
-
return {"name":"Label issues","score":100,"details":"no GT"}
|
| 275 |
labels_arr = np.array(labels)
|
| 276 |
uniq = sorted(set(labels_arr))
|
| 277 |
probs = np.eye(len(uniq))[np.searchsorted(uniq, labels_arr)]
|
| 278 |
noise = get_noise_indices(labels=labels_arr, probabilities=probs)
|
| 279 |
flags = sorted({idxs[n] for n in noise})
|
| 280 |
files = [str(sample[i]) for i in flags]
|
| 281 |
-
score = 100 - len(flags)/len(labels)*100
|
| 282 |
return {
|
| 283 |
-
"name":"Label issues",
|
| 284 |
-
"score":score,
|
| 285 |
-
"details":{"files":files[:50]}
|
| 286 |
}
|
| 287 |
|
| 288 |
def _rel_iou(b1, b2):
|
| 289 |
x1, y1, w1, h1 = b1
|
| 290 |
x2, y2, w2, h2 = b2
|
| 291 |
-
xa1, ya1 = x1-w1/2, y1-h1/2
|
| 292 |
-
xa2, ya2 = x1+w1/2, y1+h1/2
|
| 293 |
-
xb1, yb1 = x2-w2/2, y2-h2/2
|
| 294 |
-
xb2, yb2 = x2+w2/2, y2+h2/2
|
| 295 |
ix1 = max(xa1, xb1); iy1 = max(ya1, yb1)
|
| 296 |
ix2 = min(xa2, xb2); iy2 = min(ya2, yb2)
|
| 297 |
-
inter = max(ix2-ix1, 0) * max(iy2-iy1, 0)
|
| 298 |
-
union = w1*h1 + w2*h2 - inter
|
| 299 |
-
return inter/union if union else 0.0
|
| 300 |
|
| 301 |
def aggregate(results: List[Dict]) -> float:
|
| 302 |
-
return sum(DEFAULT_W[r[
|
| 303 |
|
| 304 |
RF_RE = re.compile(r"https?://universe\.roboflow\.com/([^/]+)/([^/]+)/dataset/(\d+)")
|
| 305 |
|
|
@@ -308,7 +312,7 @@ def download_rf_dataset(url: str, rf_api: Roboflow, dest: Path) -> Path:
|
|
| 308 |
if not m:
|
| 309 |
raise ValueError(f"Bad RF URL: {url}")
|
| 310 |
ws, proj, ver = m.groups()
|
| 311 |
-
ds_dir = dest/f"{ws}_{proj}_v{ver}"
|
| 312 |
if ds_dir.exists():
|
| 313 |
return ds_dir
|
| 314 |
pr = rf_api.workspace(ws).project(proj)
|
|
@@ -328,9 +332,9 @@ def run_quality(
|
|
| 328 |
qc_integrity(imgs, lbls, cfg),
|
| 329 |
qc_class_balance(lbls, cfg),
|
| 330 |
qc_image_quality(imgs, cfg),
|
| 331 |
-
qc_duplicates(imgs, cfg) if run_dup else {"name":"Duplicates","score":100,"details":"skipped"},
|
| 332 |
-
qc_model_qa(imgs, lbls, cfg) if run_modelqa else {"name":"Model QA","score":100,"details":"skipped"},
|
| 333 |
-
qc_label_issues(imgs, lbls, cfg) if run_modelqa else {"name":"Label issues","score":100,"details":"skipped"},
|
| 334 |
]
|
| 335 |
final = aggregate(results)
|
| 336 |
|
|
@@ -338,14 +342,14 @@ def run_quality(
|
|
| 338 |
for r in results:
|
| 339 |
md.append(f"### {r['name']} {r['score']:.1f}")
|
| 340 |
md.append("<details><summary>details</summary>\n```json")
|
| 341 |
-
md.append(json.dumps(r[
|
| 342 |
md.append("```\n</details>\n")
|
| 343 |
|
| 344 |
df = pd.DataFrame.from_dict(
|
| 345 |
-
next(r for r in results if r[
|
| 346 |
-
orient=
|
| 347 |
)
|
| 348 |
-
df.index.name =
|
| 349 |
return "\n".join(md), df
|
| 350 |
|
| 351 |
with gr.Blocks(title="YOLO Dataset Quality Evaluator v3") as demo:
|
|
@@ -394,9 +398,9 @@ with gr.Blocks(title="YOLO Dataset Quality Evaluator v3") as demo:
|
|
| 394 |
if not line.strip():
|
| 395 |
continue
|
| 396 |
try:
|
| 397 |
-
ds = download_rf_dataset(line, rf, TMP_ROOT)
|
| 398 |
-
|
| 399 |
-
|
| 400 |
Path(weights.name) if weights else None,
|
| 401 |
cfg, run_dup, run_modelqa
|
| 402 |
)
|
|
|
|
| 47 |
# βββββββββββββββββ Config & Constants βββββββββββββββββββββββββββββββββββββββ
|
| 48 |
TMP_ROOT = Path(tempfile.gettempdir()) / "rf_datasets"
|
| 49 |
TMP_ROOT.mkdir(parents=True, exist_ok=True)
|
| 50 |
+
CPU_COUNT = int(os.getenv("QC_CPU", 1)) # force single-core by default
|
| 51 |
+
BATCH_SIZE = int(os.getenv("QC_BATCH", 4)) # small batches
|
| 52 |
+
SAMPLE_LIMIT = int(os.getenv("QC_SAMPLE", 200))
|
| 53 |
|
| 54 |
DEFAULT_W = {
|
| 55 |
"Integrity": 0.25,
|
|
|
|
| 70 |
weights: str | None
|
| 71 |
cpu_count: int = CPU_COUNT
|
| 72 |
batch_size: int = BATCH_SIZE
|
| 73 |
+
sample_limit: int = SAMPLE_LIMIT
|
| 74 |
|
| 75 |
# βββββββββββ Helpers & Caching βββββββββββββββββββββββββββββββββββββββββββββ
|
| 76 |
def load_yaml(path: Path) -> Dict:
|
|
|
|
| 171 |
boxes.append(len(bs))
|
| 172 |
counts.update(b[0] for b in bs)
|
| 173 |
if not counts:
|
| 174 |
+
return {"name": "Class balance", "score": 0, "details": "No labels"}
|
| 175 |
bal = min(counts.values()) / max(counts.values()) * 100
|
| 176 |
return {
|
| 177 |
+
"name": "Class balance",
|
| 178 |
+
"score": bal,
|
| 179 |
+
"details": {
|
| 180 |
"class_counts": dict(counts),
|
| 181 |
"boxes_per_image": {
|
| 182 |
"min": min(boxes),
|
|
|
|
| 188 |
|
| 189 |
def qc_image_quality(imgs: List[Path], cfg: QCConfig) -> Dict:
|
| 190 |
if cv2 is None:
|
| 191 |
+
return {"name": "Image quality", "score": 100, "details": "cv2 missing"}
|
| 192 |
blurry, dark, bright = [], [], []
|
| 193 |
sample = imgs[:cfg.sample_limit]
|
| 194 |
with ThreadPoolExecutor(max_workers=cfg.cpu_count) as ex:
|
|
|
|
| 200 |
bad = len({*blurry, *dark, *bright})
|
| 201 |
score = 100 - bad / max(len(sample), 1) * 100
|
| 202 |
return {
|
| 203 |
+
"name": "Image quality",
|
| 204 |
+
"score": score,
|
| 205 |
+
"details": {
|
| 206 |
"blurry": [str(p) for p in blurry],
|
| 207 |
"dark": [str(p) for p in dark],
|
| 208 |
"bright": [str(p) for p in bright]
|
|
|
|
| 214 |
try:
|
| 215 |
fd = fastdup.create(
|
| 216 |
input_dir=str(Path(imgs[0]).parent.parent),
|
| 217 |
+
work_dir=str(TMP_ROOT / "fastdup")
|
| 218 |
)
|
| 219 |
fd.run()
|
| 220 |
+
cc = fd.connected_components_grouped(sort_by="comp_size", ascending=False)
|
| 221 |
+
clusters = cc["files"].tolist()
|
| 222 |
+
dup = sum(len(c) - 1 for c in clusters)
|
| 223 |
return {
|
| 224 |
+
"name": "Duplicates",
|
| 225 |
+
"score": max(0.0, 100 - dup / len(imgs) * 100),
|
| 226 |
+
"details": {"groups": clusters[:50]}
|
| 227 |
}
|
| 228 |
except Exception as e:
|
| 229 |
+
return {
|
| 230 |
+
"name": "Duplicates",
|
| 231 |
+
"score": 100.0,
|
| 232 |
+
"details": {"fastdup_error": str(e)}
|
| 233 |
+
}
|
| 234 |
+
return {"name": "Duplicates", "score": 100.0, "details": {"note": "skipped"}}
|
| 235 |
|
| 236 |
def qc_model_qa(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
|
| 237 |
model = get_model(cfg.weights)
|
| 238 |
if model is None:
|
| 239 |
+
return {"name": "Model QA", "score": 100, "details": "skipped"}
|
| 240 |
ious, mism = [], []
|
| 241 |
sample = imgs[:cfg.sample_limit]
|
| 242 |
for i in range(0, len(sample), cfg.batch_size):
|
| 243 |
+
batch = sample[i:i + cfg.batch_size]
|
| 244 |
results = model.predict(batch, verbose=False, half=True, dynamic=True)
|
| 245 |
for p, res in zip(batch, results):
|
| 246 |
+
gt = parse_label_file(Path(p).parent.parent / 'labels' / f"{Path(p).stem}.txt")
|
| 247 |
for cls, x, y, w, h in gt:
|
| 248 |
best = 0.0
|
| 249 |
for b, c, conf in zip(
|
|
|
|
| 259 |
mism.append(str(p))
|
| 260 |
miou = float(np.mean(ious)) if ious else 1.0
|
| 261 |
return {
|
| 262 |
+
"name": "Model QA",
|
| 263 |
+
"score": miou * 100,
|
| 264 |
+
"details": {"mean_iou": miou, "mismatches": mism[:50]}
|
| 265 |
}
|
| 266 |
|
| 267 |
def qc_label_issues(imgs: List[Path], lbls: List[Path], cfg: QCConfig) -> Dict:
    """Flag images whose ground-truth class labels look noisy.

    Collects every box's class id from the first ``cfg.sample_limit`` images,
    feeds them to cleanlab-style ``get_noise_indices``, and scores the dataset
    by the fraction of images NOT flagged.

    Returns a dict with keys ``name``, ``score`` (0-100) and ``details``.
    """
    # Optional dependency: if the noise-detection helper failed to import,
    # skip the check entirely with a perfect score.
    if get_noise_indices is None:
        return {"name": "Label issues", "score": 100, "details": "skipped"}
    labels, idxs = [], []
    sample = imgs[:cfg.sample_limit]
    # Flatten all boxes: one (class, image-index) pair per ground-truth box.
    # NOTE(review): assumes lbls[i] is the label file paired with imgs[i]
    # (same ordering) — confirm against the caller that builds both lists.
    for i, p in enumerate(sample):
        bs = parse_label_file(lbls[i]) if lbls[i] else []
        for cls, *_ in bs:
            labels.append(int(cls))
            idxs.append(i)
    if not labels:
        return {"name": "Label issues", "score": 100, "details": "no GT"}
    labels_arr = np.array(labels)
    uniq = sorted(set(labels_arr))
    # Build one-hot "predicted probabilities" straight from the labels
    # themselves (searchsorted maps each class id to its rank in `uniq`).
    # NOTE(review): self-consistent one-hot probs give the detector no
    # independent signal, so few/no items should be flagged — presumably a
    # placeholder until real model probabilities are wired in; verify intent.
    probs = np.eye(len(uniq))[np.searchsorted(uniq, labels_arr)]
    noise = get_noise_indices(labels=labels_arr, probabilities=probs)
    # Map flagged box indices back to unique image indices.
    flags = sorted({idxs[n] for n in noise})
    files = [str(sample[i]) for i in flags]
    # NOTE(review): ratio mixes units — flagged IMAGES over total BOXES.
    # Harmless as a heuristic score but confirm it is intentional.
    score = 100 - len(flags) / len(labels) * 100
    return {
        "name": "Label issues",
        "score": score,
        "details": {"files": files[:50]}
    }
|
| 291 |
|
| 292 |
def _rel_iou(b1, b2):
|
| 293 |
x1, y1, w1, h1 = b1
|
| 294 |
x2, y2, w2, h2 = b2
|
| 295 |
+
xa1, ya1 = x1 - w1/2, y1 - h1/2
|
| 296 |
+
xa2, ya2 = x1 + w1/2, y1 + h1/2
|
| 297 |
+
xb1, yb1 = x2 - w2/2, y2 - h2/2
|
| 298 |
+
xb2, yb2 = x2 + w2/2, y2 + h2/2
|
| 299 |
ix1 = max(xa1, xb1); iy1 = max(ya1, yb1)
|
| 300 |
ix2 = min(xa2, xb2); iy2 = min(ya2, yb2)
|
| 301 |
+
inter = max(ix2 - ix1, 0) * max(iy2 - iy1, 0)
|
| 302 |
+
union = w1 * h1 + w2 * h2 - inter
|
| 303 |
+
return inter / union if union else 0.0
|
| 304 |
|
| 305 |
def aggregate(results: List[Dict]) -> float:
    """Combine per-check results into one weighted quality score.

    Each result's ``score`` is weighted by the ``DEFAULT_W`` entry matching
    its ``name``; the weighted contributions are summed and returned.
    """
    total = 0.0
    for result in results:
        total += DEFAULT_W[result["name"]] * result["score"]
    return total
|
| 307 |
|
| 308 |
RF_RE = re.compile(r"https?://universe\.roboflow\.com/([^/]+)/([^/]+)/dataset/(\d+)")
|
| 309 |
|
|
|
|
| 312 |
if not m:
|
| 313 |
raise ValueError(f"Bad RF URL: {url}")
|
| 314 |
ws, proj, ver = m.groups()
|
| 315 |
+
ds_dir = dest / f"{ws}_{proj}_v{ver}"
|
| 316 |
if ds_dir.exists():
|
| 317 |
return ds_dir
|
| 318 |
pr = rf_api.workspace(ws).project(proj)
|
|
|
|
| 332 |
qc_integrity(imgs, lbls, cfg),
|
| 333 |
qc_class_balance(lbls, cfg),
|
| 334 |
qc_image_quality(imgs, cfg),
|
| 335 |
+
qc_duplicates(imgs, cfg) if run_dup else {"name": "Duplicates", "score": 100, "details": "skipped"},
|
| 336 |
+
qc_model_qa(imgs, lbls, cfg) if run_modelqa else {"name": "Model QA", "score": 100, "details": "skipped"},
|
| 337 |
+
qc_label_issues(imgs, lbls, cfg) if run_modelqa else {"name": "Label issues", "score": 100, "details": "skipped"},
|
| 338 |
]
|
| 339 |
final = aggregate(results)
|
| 340 |
|
|
|
|
| 342 |
for r in results:
|
| 343 |
md.append(f"### {r['name']} {r['score']:.1f}")
|
| 344 |
md.append("<details><summary>details</summary>\n```json")
|
| 345 |
+
md.append(json.dumps(r["details"], indent=2))
|
| 346 |
md.append("```\n</details>\n")
|
| 347 |
|
| 348 |
df = pd.DataFrame.from_dict(
|
| 349 |
+
next(r for r in results if r["name"] == "Class balance")["details"]["class_counts"],
|
| 350 |
+
orient="index", columns=["count"]
|
| 351 |
)
|
| 352 |
+
df.index.name = "class"
|
| 353 |
return "\n".join(md), df
|
| 354 |
|
| 355 |
with gr.Blocks(title="YOLO Dataset Quality Evaluator v3") as demo:
|
|
|
|
| 398 |
if not line.strip():
|
| 399 |
continue
|
| 400 |
try:
|
| 401 |
+
ds, md, df = download_rf_dataset(line, rf, TMP_ROOT), *run_quality(
|
| 402 |
+
download_rf_dataset(line, rf, TMP_ROOT),
|
| 403 |
+
None,
|
| 404 |
Path(weights.name) if weights else None,
|
| 405 |
cfg, run_dup, run_modelqa
|
| 406 |
)
|