Yiming-M committed on 2025-07-31 22:29 🚀
Commit 9cb167a · 1 Parent(s): ae09b40

Files changed (1)
  1. app.py +287 -54

app.py CHANGED
@@ -306,14 +306,14 @@ def predict(image: Image.Image, variant_dataset_metric: str):
 
     # If the selection is a separator line, return an error message
     if "━━━━━━" in variant_dataset_metric:
-        return image, None, None, "Please select a valid model configuration", None, None, None
+        return image, None, None, "⚠️ Please select a valid model configuration", None, None, None
 
     # Make sure the model is loaded correctly
     update_model_if_needed(variant_dataset_metric)
 
     parts = variant_dataset_metric.split(" @ ")
     if len(parts) != 3:
-        return image, None, None, "Invalid model configuration format", None, None, None
+        return image, None, None, "❌ Invalid model configuration format", None, None, None
 
     variant, dataset, metric = parts[0], parts[1], parts[2].lower()
 
@@ -326,7 +326,7 @@ def predict(image: Image.Image, variant_dataset_metric: str):
     elif dataset == "NWPU-Crowd":
         dataset_name = "nwpu"
     else:
-        return image, None, None, f"Unknown dataset: {dataset}", None, None, None
+        return image, None, None, f"❌ Unknown dataset: {dataset}", None, None, None
 
     if not hasattr(loaded_model, "input_size"):
         if dataset_name == "sha":
@@ -363,7 +363,7 @@ def predict(image: Image.Image, variant_dataset_metric: str):
     image_height, image_width = new_height, new_width
 
     with torch.no_grad():
-        if hasattr(loaded_model, "num_vpt") and loaded_model.num_vpt > 0:  # For ViT models, use sliding window prediction
+        if hasattr(loaded_model, "num_vpt") and loaded_model.num_vpt is not None and loaded_model.num_vpt > 0:  # For ViT models, use sliding window prediction
             # For ViT models with VPT
             pi_map, lambda_map = _sliding_window_predict(
                 model=loaded_model,
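
The `_sliding_window_predict` helper called in this hunk is not part of the diff. As context only, a minimal sketch of the general sliding-window technique is shown below; the window size, stride, and the assumption that the model returns full-resolution `(pi, lam)` maps per tile are illustrative and do not come from app.py.

```python
import torch

def sliding_window_predict(model, image, window=224, stride=112):
    """Sketch of sliding-window inference: tile a (1, C, H, W) image, run the
    model on each tile, and average predictions where tiles overlap.
    Assumes H, W >= window and that model(tile) returns (pi, lam) maps of
    shape (1, 1, window, window); the real helper in app.py may differ."""
    _, _, H, W = image.shape
    pi_sum = torch.zeros(1, 1, H, W)
    lam_sum = torch.zeros(1, 1, H, W)
    cover = torch.zeros(1, 1, H, W)  # how many tiles touched each pixel

    # Tile start positions, always including the last full window at the border
    ys = sorted(set(range(0, H - window + 1, stride)) | {H - window})
    xs = sorted(set(range(0, W - window + 1, stride)) | {W - window})
    for y in ys:
        for x in xs:
            pi, lam = model(image[:, :, y:y + window, x:x + window])
            pi_sum[:, :, y:y + window, x:x + window] += pi
            lam_sum[:, :, y:y + window, x:x + window] += lam
            cover[:, :, y:y + window, x:x + window] += 1

    # Average the overlapping predictions
    return pi_sum / cover, lam_sum / cover
```
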
@@ -458,7 +458,18 @@ def predict(image: Image.Image, variant_dataset_metric: str):
     lambda_map = Image.blend(image_rgba, lambda_map, alpha=alpha)
     complete_zero_map = Image.blend(image_rgba, complete_zero_map, alpha=alpha)
 
-    return image, den_map, lambda_map, round(count, 2), strucrual_zero_map, sampling_zero_map, complete_zero_map
+    # Format the count for display
+    count_display = f"👥 {round(count, 2)} people detected"
+    if count < 1:
+        count_display = "👤 Less than 1 person detected"
+    elif count == 1:
+        count_display = "👤 1 person detected"
+    elif count < 10:
+        count_display = f"👥 {round(count, 1)} people detected"
+    else:
+        count_display = f"👥 {round(count)} people detected"
+
+    return image, den_map, lambda_map, count_display, strucrual_zero_map, sampling_zero_map, complete_zero_map
 
 
 # -----------------------------
@@ -512,45 +523,228 @@ select option[value*="━━━━━━"] {
     text-align: center !important;
     opacity: 0.6 !important;
 }
+
+/* Overall theme styling */
+.gradio-container {
+    max-width: 1400px !important;
+    margin: 0 auto !important;
+    font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif !important;
+}
+
+/* Title style */
+.gr-markdown h1 {
+    text-align: center !important;
+    color: #2563eb !important;
+    font-weight: 700 !important;
+    font-size: 2.5rem !important;
+    margin-bottom: 0.5rem !important;
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
+    -webkit-background-clip: text !important;
+    -webkit-text-fill-color: transparent !important;
+}
+
+/* Subtitle style */
+.gr-markdown p {
+    text-align: center !important;
+    color: #6b7280 !important;
+    font-size: 1.1rem !important;
+    margin-bottom: 2rem !important;
+}
+
+/* Button styling */
+.gr-button {
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
+    border: none !important;
+    border-radius: 8px !important;
+    color: white !important;
+    font-weight: 600 !important;
+    font-size: 1rem !important;
+    padding: 12px 24px !important;
+    transition: all 0.3s ease !important;
+    box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1) !important;
+}
+
+.gr-button:hover {
+    transform: translateY(-2px) !important;
+    box-shadow: 0 8px 25px -8px rgba(0, 0, 0, 0.3) !important;
+}
+
+/* Input field style */
+.gr-textbox, .gr-dropdown {
+    border-radius: 8px !important;
+    border: 2px solid #e5e7eb !important;
+    transition: border-color 0.3s ease !important;
+}
+
+.gr-textbox:focus, .gr-dropdown:focus {
+    border-color: #667eea !important;
+    box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1) !important;
+}
+
+/* Image container styling */
+.gr-image {
+    border-radius: 12px !important;
+    overflow: hidden !important;
+    box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1) !important;
+    transition: all 0.3s ease !important;
+}
+
+.gr-image:hover {
+    box-shadow: 0 10px 15px -3px rgba(0, 0, 0, 0.1) !important;
+    transform: translateY(-2px) !important;
+}
+
+/* Column spacing optimization */
+.gr-column {
+    padding: 0 8px !important;
+}
+
+/* Label styling */
+.gr-label {
+    font-weight: 600 !important;
+    color: #374151 !important;
+    margin-bottom: 8px !important;
+}
+
+/* Special style for the model status box */
+.gr-textbox[data-testid*="model-status"] {
+    background: linear-gradient(135deg, #f3f4f6 0%, #e5e7eb 100%) !important;
+    font-family: 'Monaco', 'Menlo', monospace !important;
+    font-size: 0.9rem !important;
+}
+
+/* Examples area styling */
+.gr-examples {
+    background: #f9fafb !important;
+    border-radius: 12px !important;
+    padding: 20px !important;
+    margin-top: 24px !important;
+    border: 1px solid #e5e7eb !important;
+}
+
+/* Responsive design */
+@media (max-width: 768px) {
+    .gradio-container {
+        padding: 16px !important;
+    }
+
+    .gr-column {
+        margin-bottom: 16px !important;
+    }
+
+    .gr-markdown h1 {
+        font-size: 2rem !important;
+    }
+}
+
+/* Loading animation */
+@keyframes pulse {
+    0%, 100% { opacity: 1; }
+    50% { opacity: 0.5; }
+}
+
+.gr-loading .gr-image {
+    animation: pulse 2s cubic-bezier(0.4, 0, 0.6, 1) infinite !important;
+}
+
+/* Success state indicator */
+.status-success {
+    color: #059669 !important;
+    background-color: #d1fae5 !important;
+    border: 1px solid #a7f3d0 !important;
+}
+
+/* Error state indicator */
+.status-error {
+    color: #dc2626 !important;
+    background-color: #fee2e2 !important;
+    border: 1px solid #fecaca !important;
+}
 """
 
-with gr.Blocks(css=css) as demo:
-    gr.Markdown("# Crowd Counting by ZIP")
-    gr.Markdown("Upload an image or select an example below to see the predicted crowd density map and total count.")
+with gr.Blocks(css=css, theme=gr.themes.Soft(), title="ZIP Crowd Counting") as demo:
+    gr.Markdown("""
+    # 🎯 Crowd Counting by ZIP
+    ### Upload an image and get precise crowd density predictions with advanced zero-inflated models
+    """)
+
+    # Add an info panel
+    with gr.Accordion("ℹ️ About ZIP Models", open=False):
+        gr.Markdown("""
+        **ZIP (Zero-Inflated Poisson)** models are designed to handle crowd counting with:
+        - **Structural Zeros**: Areas where people cannot exist (walls, sky, etc.)
+        - **Sampling Zeros**: Areas where people could exist but don't
+        - **Advanced Metrics**: MAE (Mean Absolute Error) and NAE (Normalized Absolute Error)
+
+        Choose from different model variants: **ZIP-B** (Base), **ZIP-S** (Small), **ZIP-T** (Tiny), **ZIP-N** (Nano), **ZIP-P** (Pico)
+        """)
 
     with gr.Row():
-        with gr.Column():
-            # Dropdown for model variant
-            model_dropdown = gr.Dropdown(
-                choices=pretrained_models,
-                value="ZIP-B @ NWPU-Crowd @ MAE",
-                label="Select a pretrained model"
-            )
-            model_status = gr.Textbox(
-                label="Model Status",
-                value="No model loaded",
-                interactive=False
-            )
-
-            input_img = gr.Image(label="Input Image", sources=["upload", "clipboard"], type="pil")
-            submit_btn = gr.Button("Predict")
+        with gr.Column(scale=1):
+            # Model selection area
+            with gr.Group():
+                gr.Markdown("### 🤖 Model Configuration")
+                model_dropdown = gr.Dropdown(
+                    choices=pretrained_models,
+                    value="ZIP-B @ NWPU-Crowd @ MAE",
+                    label="🎛️ Select Model & Dataset",
+                    info="Choose model variant, dataset, and evaluation metric"
+                )
+                model_status = gr.Textbox(
+                    label="📊 Model Status",
+                    value="🔄 No model loaded",
+                    interactive=False,
+                    elem_classes=["status-display"]
+                )
+
+            # Image input area
+            with gr.Group():
+                gr.Markdown("### 📸 Image Input")
+                input_img = gr.Image(
+                    label="🖼️ Upload Image",
+                    sources=["upload", "clipboard"],
+                    type="pil",
+                    height=400
+                )
+                submit_btn = gr.Button(
+                    "🚀 Analyze Crowd",
+                    variant="primary",
+                    size="lg"
+                )
 
-        with gr.Column():
-            output_den_map = gr.Image(label="Predicted Density Map", type="pil")
-            output_lambda_map = gr.Image(label="Lambda Map", type="pil")
-            output_text = gr.Textbox(label="Predicted Count")
+        with gr.Column(scale=1):
+            with gr.Group():
+                gr.Markdown("### 📊 Main Results")
+                output_den_map = gr.Image(label="🎯 Predicted Density Map", type="pil")
+                output_lambda_map = gr.Image(label="📈 Lambda Map", type="pil")
+                output_text = gr.Textbox(
+                    label="👥 Predicted Count",
+                    info="Total number of people detected"
+                )
 
-        with gr.Column():
-            output_structural_zero_map = gr.Image(label="Structural Zero Map", type="pil")
-            output_sampling_zero_map = gr.Image(label="Sampling Zero Map", type="pil")
-            output_complete_zero_map = gr.Image(label="Complete Zero Map", type="pil")
+        with gr.Column(scale=1):
+            with gr.Group():
+                gr.Markdown("### 🔍 Zero Analysis")
+                output_structural_zero_map = gr.Image(label="🏗️ Structural Zero Map", type="pil")
+                output_sampling_zero_map = gr.Image(label="📊 Sampling Zero Map", type="pil")
+
+        with gr.Column(scale=1):
+            with gr.Group():
+                gr.Markdown("### 📈 Combined Analysis")
+                output_complete_zero_map = gr.Image(label="🎯 Complete Zero Map", type="pil")
 
     # When the model changes, automatically update the loaded model
     def on_model_change(variant_dataset_metric):
         # If a separator line is selected, keep the current selection unchanged
         if "━━━━━━" in variant_dataset_metric:
-            return "Please select a valid model configuration"
-        return update_model_if_needed(variant_dataset_metric)
+            return "⚠️ Please select a valid model configuration"
+        result = update_model_if_needed(variant_dataset_metric)
+        if "Model loaded:" in result:
+            return f"✅ {result}"
+        elif "Model already loaded:" in result:
+            return f"🔄 {result}"
+        else:
+            return f"❌ {result}"
 
     model_dropdown.change(
         fn=on_model_change,
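
For readers of the "About ZIP Models" panel added in the hunk above: under a zero-inflated Poisson head, the per-pixel probability of a zero is pi + (1 - pi) * exp(-lambda) and the expected count is (1 - pi) * lambda, which a count read-out then integrates over the image. Below is a minimal sketch of that arithmetic; the names `pi_map` and `lambda_map` follow the variables used in the hunks above, but the exact post-processing inside app.py is not shown in this commit.

```python
import torch

def zip_expected_count(pi_map: torch.Tensor, lambda_map: torch.Tensor) -> float:
    """Expected total count under a zero-inflated Poisson (ZIP) head.

    Per pixel: P(zero) = pi + (1 - pi) * exp(-lambda), and the expected value
    is (1 - pi) * lambda, where pi is the structural-zero probability and
    lambda the Poisson rate. Sketch of the standard ZIP mean only; app.py's
    actual read-out may differ.
    """
    expected_density = (1.0 - pi_map) * lambda_map  # per-pixel expected count
    return expected_density.sum().item()            # integrate over the image


# Toy 2x2 example
pi = torch.tensor([[0.9, 0.1], [0.5, 0.0]])
lam = torch.tensor([[0.2, 1.5], [0.8, 2.0]])
print(zip_expected_count(pi, lam))  # 0.1*0.2 + 0.9*1.5 + 0.5*0.8 + 1.0*2.0 ≈ 3.77
```
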
 
@@ -560,7 +754,7 @@ with gr.Blocks(css=css) as demo:
 
     # Automatically load the default model when the page loads
     demo.load(
-        fn=lambda: update_model_if_needed("ZIP-B @ NWPU-Crowd @ MAE"),
+        fn=lambda: f"🔄 {update_model_if_needed('ZIP-B @ NWPU-Crowd @ MAE')}",
         outputs=[model_status]
     )
 
@@ -570,23 +764,62 @@ with gr.Blocks(css=css) as demo:
         outputs=[input_img, output_den_map, output_lambda_map, output_text, output_structural_zero_map, output_sampling_zero_map, output_complete_zero_map]
     )
 
-    gr.Examples(
-        examples=[
-            ["example1.jpg"],
-            ["example2.jpg"],
-            ["example3.jpg"],
-            ["example4.jpg"],
-            ["example5.jpg"],
-            ["example6.jpg"],
-            ["example7.jpg"],
-            ["example8.jpg"],
-            ["example9.jpg"],
-            ["example10.jpg"],
-            ["example11.jpg"],
-            ["example12.jpg"]
-        ],
-        inputs=input_img,
-        label="Try an example"
-    )
-
-demo.launch()
+    # Styled examples area
+    with gr.Accordion("🖼️ Try Example Images", open=True):
+        gr.Markdown("**Click on any example below to test the model:**")
+        gr.Examples(
+            examples=[
+                ["example1.jpg"], ["example2.jpg"], # ["example3.jpg"], ["example4.jpg"],
+                ["example5.jpg"], ["example6.jpg"], ["example7.jpg"], ["example8.jpg"],
+                ["example9.jpg"], ["example10.jpg"], ["example11.jpg"], ["example12.jpg"]
+            ],
+            inputs=input_img,
+            label="📚 Example Gallery",
+            examples_per_page=6
+        )
+
+    # Add usage instructions
+    with gr.Accordion("📖 How to Use", open=False):
+        gr.Markdown("""
+        ### Step-by-step Guide:
+
+        1. **🎛️ Select Model**: Choose your preferred model variant, dataset, and metric from the dropdown
+        2. **📸 Upload Image**: Click the image area to upload your crowd photo or use the clipboard
+        3. **🚀 Analyze**: Click the "Analyze Crowd" button to start processing
+        4. **📊 View Results**: Examine the density maps and crowd count in the output panels
+
+        ### Understanding the Outputs:
+
+        - **🎯 Density Map**: Shows where people are located with color intensity
+        - **📈 Lambda Map**: Represents the expected count per pixel
+        - **🏗️ Structural Zero Map**: Areas where people cannot exist (buildings, sky)
+        - **📊 Sampling Zero Map**: Areas where people could be but aren't
+        - **🎯 Complete Zero Map**: Combined zero probability map
+        """)
+
+    # Add technical information
+    with gr.Accordion("🔬 Technical Details", open=False):
+        gr.Markdown("""
+        ### Model Variants:
+        - **ZIP-B**: Base model with the best performance
+        - **ZIP-S**: Smaller model for faster inference
+        - **ZIP-T**: Tiny model for resource-constrained environments
+        - **ZIP-N**: Nano model for mobile applications
+        - **ZIP-P**: Pico model for edge devices
+
+        ### Datasets:
+        - **ShanghaiTech A/B**: Dense crowd scenes
+        - **UCF-QNRF**: Ultra high-resolution crowd images
+        - **NWPU-Crowd**: Large-scale crowd counting dataset
+
+        ### Metrics:
+        - **MAE**: Mean Absolute Error - average counting error
+        - **NAE**: Normalized Absolute Error - relative counting error
+        """)
+
+demo.launch(
+    server_name="0.0.0.0",
+    server_port=7860,
+    show_api=False,
+    share=False
+)
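
The "Technical Details" panel above names MAE and NAE. As a reference for how these benchmark metrics are conventionally computed over a test set, a short sketch follows; the evaluation code itself is not part of this commit, and the handling of images with a ground-truth count of zero is an assumption here.

```python
def mae_nae(pred_counts, gt_counts):
    """Conventional crowd-counting metrics over a test set.

    MAE = mean(|pred - gt|); NAE = mean(|pred - gt| / gt), skipping
    images whose ground-truth count is zero (an assumption in this sketch).
    """
    abs_err = [abs(p - g) for p, g in zip(pred_counts, gt_counts)]
    mae = sum(abs_err) / len(abs_err)
    rel_err = [e / g for e, g in zip(abs_err, gt_counts) if g > 0]
    nae = sum(rel_err) / len(rel_err)
    return mae, nae


# Toy example: three images
print(mae_nae([10.0, 52.0, 0.5], [12.0, 50.0, 1.0]))  # ≈ (1.5, 0.236)
```
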