hanszhu committed
Commit eb4d305 · 0 Parent(s)

build(space): initial Docker Space with Gradio app, MMDet, SAM integration
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,46 @@
+ # Python cache files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+
+ # Model files (downloaded automatically)
+ models/models--*/
+ models/.locks/
+ *.pth
+ *.pkl
+ *.h5
+ *.onnx
+
+ # Environment files
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # IDE files
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+
+ # OS files
+ .DS_Store
+ .DS_Store?
+ ._*
+ .Spotlight-V100
+ .Trashes
+ ehthumbs.db
+ Thumbs.db
+
+ # Logs
+ *.log
+ logs/
+
+ # Temporary files
+ *.tmp
+ *.temp
Dockerfile ADDED
@@ -0,0 +1,29 @@
+ FROM python:3.10-slim
+
+ ENV DEBIAN_FRONTEND=noninteractive \
+     PIP_NO_CACHE_DIR=1 \
+     MPLBACKEND=Agg \
+     MIM_IGNORE_INSTALL_PYTORCH=1
+
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     libgl1 libglib2.0-0 git && \
+     rm -rf /var/lib/apt/lists/*
+
+ WORKDIR /app
+
+ COPY requirements.txt /app/requirements.txt
+
+ # Install pip deps and the mm stack with openmim
+ RUN python -m pip install -U pip openmim && \
+     pip install -r requirements.txt && \
+     mim install "mmengine==0.10.4" && \
+     mim install "mmcv==2.1.0" && \
+     mim install "mmdet==3.3.0" && \
+     pip install git+https://github.com/facebookresearch/segment-anything.git
+
+ # Copy the rest of the application
+ COPY . /app
+
+ EXPOSE 7860
+
+ CMD ["python", "app.py"]
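After the image builds, it can be useful to confirm that the mm stack the Dockerfile pins (mmengine, mmcv, mmdet, segment-anything) is actually importable inside the container. The snippet below is a hypothetical sanity check, not part of the repo; `check_mm_stack` is an illustrative name.

```python
# Hypothetical post-build sanity check for the mm stack installed by the
# Dockerfile. Uses find_spec so nothing heavy is actually imported.
from importlib.util import find_spec


def check_mm_stack(modules=("mmengine", "mmcv", "mmdet", "segment_anything")):
    """Return {module_name: True/False} indicating which packages are importable."""
    return {name: find_spec(name) is not None for name in modules}


if __name__ == "__main__":
    for name, ok in check_mm_stack().items():
        print(("OK      " if ok else "MISSING ") + name)
```

Running it via `docker run <image> python check.py` (assuming the file is copied in) would flag a broken mim install before the Gradio app ever starts.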
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: Dense Captioning Platform
+ emoji: 🐒
+ colorFrom: purple
+ colorTo: purple
+ sdk: gradio
+ sdk_version: 5.39.0
+ app_file: app.py
+ pinned: false
+ license: apache-2.0
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
README_API.md ADDED
@@ -0,0 +1,309 @@
+ # 📊 Dense Captioning Platform API Documentation
+
+ ## Overview
+
+ The Dense Captioning Platform provides comprehensive chart analysis through a Gradio-based API. It can classify chart types, detect chart elements, and segment data points from uploaded images.
+
+ ## API Access
+
+ **Base URL:** `https://hanszhu-dense-captioning-platform.hf.space`
+
+ **API Type:** Gradio Client API (not RESTful)
+
+ ## Installation
+
+ ### Prerequisites
+
+ ```bash
+ pip install gradio-client
+ ```
+
+ ## Quick Start
+
+ ### Python Client (Recommended)
+
+ ```python
+ from gradio_client import Client, handle_file
+
+ # Initialize client with direct URL
+ client = Client("https://hanszhu-dense-captioning-platform.hf.space")
+
+ # Analyze a chart image using file path
+ result = client.predict(
+     image=handle_file('path/to/your/chart.png'),
+     fn_index=0
+ )
+
+ print(result)
+ ```
+
+ ### Using a URL
+
+ ```python
+ from gradio_client import Client, handle_file
+
+ client = Client("https://hanszhu-dense-captioning-platform.hf.space")
+
+ # Use a publicly accessible image URL
+ result = client.predict(
+     image=handle_file("https://example.com/chart.png"),
+     fn_index=0
+ )
+
+ print(result)
+ ```
+
+ ## Input Parameters
+
+ | Parameter | Type | Required | Description |
+ |-----------|------|----------|-------------|
+ | `image` | File/URL | Yes | Chart image to analyze (PNG, JPG, JPEG supported) |
+
+ ## Important Notes
+
+ ### ✅ Working Approach
+ - **Use `fn_index=0`** instead of `api_name="/predict"`
+ - **Use the direct URL** `"https://hanszhu-dense-captioning-platform.hf.space"`
+ - **Always use `handle_file()`** for both local files and URLs
+ - **This is a Gradio Client API**, not a RESTful API
+
+ ### ❌ What Doesn't Work
+ - Direct HTTP POST requests to `/predict`
+ - Using `api_name="/predict"` with this setup
+ - Using `Client("hanszhu/Dense-Captioning-Platform")` (use the direct URL instead)
+
+ ## Output Format
+
+ The API returns a JSON object with the following structure:
+
+ ```json
+ {
+   "chart_type_id": 4,
+   "chart_type_label": "Bar plot",
+   "element_result": {
+     "bboxes": [...],
+     "segments": [...]
+   },
+   "datapoint_result": {
+     "bboxes": [...],
+     "segments": [...]
+   },
+   "status": "Full analysis completed",
+   "processing_time": 2.345
+ }
+ ```
+
+ ### Output Fields
+
+ | Field | Type | Description |
+ |-------|------|-------------|
+ | `chart_type_id` | int | Numeric identifier for chart type (0-27) |
+ | `chart_type_label` | string | Human-readable chart type name |
+ | `element_result` | object/string | Detected chart elements (titles, axes, legends, etc.) |
+ | `datapoint_result` | object/string | Segmented data points and regions |
+ | `status` | string | Processing status message |
+ | `processing_time` | float | Time taken for analysis in seconds |
+
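Because `element_result` and `datapoint_result` are typed object/string (they degrade to error strings on failure), client code should not index into them blindly. The helper below is an illustrative sketch, not part of the API; `count_detections` is an assumed name.

```python
def count_detections(result: dict) -> dict:
    """Count bboxes/segments per detection field, tolerating the
    string (error) form that element_result/datapoint_result may take."""
    counts = {}
    for field in ("element_result", "datapoint_result"):
        value = result.get(field)
        if isinstance(value, dict):
            counts[field] = {
                "bboxes": len(value.get("bboxes", [])),
                "segments": len(value.get("segments", [])),
            }
        else:
            # Field held an error string or was absent entirely
            counts[field] = None
    return counts
```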
+ ## Supported Chart Types
+
+ The platform can classify 28 different chart types (IDs follow the `CHART_TYPE_LABELS` list in `app.py`):
+
+ | ID | Chart Type | ID | Chart Type |
+ |----|------------|----|------------|
+ | 0 | Line graph | 14 | Confusion matrix |
+ | 1 | Natural image | 15 | Histogram |
+ | 2 | Table | 16 | Box plot |
+ | 3 | 3D object | 17 | Vector plot |
+ | 4 | Bar plot | 18 | Pie chart |
+ | 5 | Scatter plot | 19 | Surface plot |
+ | 6 | Medical image | 20 | Algorithm |
+ | 7 | Sketch | 21 | Contour plot |
+ | 8 | Geographic map | 22 | Tree diagram |
+ | 9 | Flow chart | 23 | Bubble chart |
+ | 10 | Heat map | 24 | Polar plot |
+ | 11 | Mask | 25 | Area chart |
+ | 12 | Block diagram | 26 | Pareto chart |
+ | 13 | Venn diagram | 27 | Radar chart |
+
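For offline post-processing, the numeric ID can be mapped back to a label with the same `CHART_TYPE_LABELS` list that the Space's `app.py` defines; `label_for` below is an illustrative helper, not part of the API.

```python
# Label list copied verbatim from app.py (DocFigure, 28 classes); order matters.
CHART_TYPE_LABELS = [
    'Line graph', 'Natural image', 'Table', '3D object', 'Bar plot', 'Scatter plot',
    'Medical image', 'Sketch', 'Geographic map', 'Flow chart', 'Heat map', 'Mask',
    'Block diagram', 'Venn diagram', 'Confusion matrix', 'Histogram', 'Box plot',
    'Vector plot', 'Pie chart', 'Surface plot', 'Algorithm', 'Contour plot',
    'Tree diagram', 'Bubble chart', 'Polar plot', 'Area chart', 'Pareto chart', 'Radar chart'
]


def label_for(chart_type_id):
    """Translate a numeric chart_type_id (0-27) to its label.
    Returns 'Unknown' for out-of-range IDs or error strings."""
    if isinstance(chart_type_id, int) and 0 <= chart_type_id < len(CHART_TYPE_LABELS):
        return CHART_TYPE_LABELS[chart_type_id]
    return "Unknown"
```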
+ ## Chart Elements Detected
+
+ The element detection model identifies:
+
+ - **Titles & Labels**: Chart title, subtitle, axis labels
+ - **Axes**: X-axis, Y-axis, tick labels
+ - **Legend**: Legend title, legend items, legend text
+ - **Data Elements**: Data points, data lines, data bars, data areas
+ - **Structural Elements**: Grid lines, plot areas
+
+ ## Error Handling
+
+ The API returns error messages in the response fields when issues occur:
+
+ ```json
+ {
+   "chart_type_id": "Error: Model not available",
+   "chart_type_label": "Error: Model not available",
+   "element_result": "Error: Invalid image format",
+   "datapoint_result": "Error: Processing failed",
+   "status": "Error in chart classification",
+   "processing_time": 0.0
+ }
+ ```
+
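Since these failures arrive as `Error: ...` strings inside an otherwise successful response, one way to surface them in client code is to scan the fields and raise. `ChartAnalysisError` and `raise_on_error` are illustrative names under that assumption, not part of the API.

```python
class ChartAnalysisError(RuntimeError):
    """Raised when the API response embeds 'Error ...' strings in its fields."""


def raise_on_error(result: dict) -> dict:
    """Return the result unchanged, or raise if any field carries an error string."""
    errors = [f"{k}: {v}" for k, v in result.items()
              if isinstance(v, str) and v.startswith("Error")]
    if errors:
        raise ChartAnalysisError("; ".join(errors))
    return result
```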
+ ## Rate Limits
+
+ - **Free Tier**: Limited requests per hour
+ - **Processing Time**: Typically 2-5 seconds per image
+ - **Image Size**: Recommended max 10MB
+
+ ## Complete Working Example
+
+ Here's a complete example that demonstrates all the working patterns:
+
+ ```python
+ from gradio_client import Client, handle_file
+ import json
+
+ def analyze_chart(image_path_or_url):
+     """
+     Analyze a chart image using the Dense Captioning Platform API
+
+     Args:
+         image_path_or_url (str): Path to local image file or URL to image
+
+     Returns:
+         dict: Analysis results with chart type, elements, and data points
+     """
+     try:
+         # Initialize client with direct URL
+         client = Client("https://hanszhu-dense-captioning-platform.hf.space")
+
+         # Make prediction using the working approach
+         result = client.predict(
+             image=handle_file(image_path_or_url),
+             fn_index=0
+         )
+
+         return result
+
+     except Exception as e:
+         return {
+             "error": f"API call failed: {str(e)}",
+             "status": "Error",
+             "processing_time": 0.0
+         }
+
+ # Example usage
+ if __name__ == "__main__":
+     # Test with a local file
+     local_result = analyze_chart("path/to/your/chart.png")
+     print("Local file result:", json.dumps(local_result, indent=2))
+
+     # Test with a URL
+     url_result = analyze_chart("https://example.com/chart.png")
+     print("URL result:", json.dumps(url_result, indent=2))
+ ```
+
+ ## Examples
+
+ ### Example 1: Bar Chart Analysis
+
+ ```python
+ from gradio_client import Client, handle_file
+
+ client = Client("https://hanszhu-dense-captioning-platform.hf.space")
+
+ # Analyze a bar chart
+ result = client.predict(
+     image=handle_file('bar_chart.png'),
+     fn_index=0
+ )
+
+ print(f"Chart Type: {result['chart_type_label']}")
+ print(f"Processing Time: {result['processing_time']}s")
+ ```
+
+ ### Example 2: Batch Processing
+
+ ```python
+ from gradio_client import Client, handle_file
+ import os
+
+ client = Client("https://hanszhu-dense-captioning-platform.hf.space")
+
+ # Process multiple charts
+ chart_files = ['chart1.png', 'chart2.png', 'chart3.png']
+ results = []
+
+ for chart_file in chart_files:
+     if os.path.exists(chart_file):
+         result = client.predict(
+             image=handle_file(chart_file),
+             fn_index=0
+         )
+         results.append(result)
+         print(f"Processed {chart_file}: {result['chart_type_label']}")
+ ```
+
+ ### Example 3: Test with Public Image
+
+ ```python
+ from gradio_client import Client, handle_file
+
+ client = Client("https://hanszhu-dense-captioning-platform.hf.space")
+
+ # Test with a public image URL
+ result = client.predict(
+     image=handle_file("https://raw.githubusercontent.com/gradio-app/gradio/main/test/test_files/bus.png"),
+     fn_index=0
+ )
+
+ print("✅ API Test Successful!")
+ print(f"Chart Type: {result['chart_type_label']}")
+ print(f"Status: {result['status']}")
+ ```
+
+ ## Troubleshooting
+
+ ### Common Issues
+
+ 1. **"Model not available"**: The models are still loading; wait a moment and retry
+ 2. **"Invalid image format"**: Ensure the image is in PNG, JPG, or JPEG format
+ 3. **"Processing failed"**: The image might be corrupted or too large
+ 4. **"Expecting value: line 1 column 1"**: Use `fn_index=0` instead of `api_name="/predict"`
+ 5. **"Cannot find a function with api_name"**: Use the direct URL and `fn_index=0`
+
+ ### Best Practices
+
+ 1. **Image Quality**: Use clear, high-resolution images for best results
+ 2. **Format**: PNG or JPG formats work best
+ 3. **Size**: Keep images under 10MB for faster processing
+ 4. **Client Setup**: Always use the direct URL and `fn_index=0`
+ 5. **File Handling**: Always use `handle_file()` for both local files and URLs
+ 6. **Retry Logic**: Implement retry logic for failed requests
+
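The retry advice can be sketched as a thin wrapper around `client.predict`; `predict_with_retry` and its backoff values are illustrative assumptions, not part of `gradio_client`.

```python
# Illustrative retry wrapper with exponential backoff between attempts.
import time


def predict_with_retry(client, image, retries=3, base_delay=2.0):
    """Call client.predict(image=..., fn_index=0), retrying transient failures.

    `client` is expected to be a gradio_client.Client (or anything with a
    compatible predict method). Re-raises the last error after `retries` tries.
    """
    last_err = None
    for attempt in range(retries):
        try:
            return client.predict(image=image, fn_index=0)
        except Exception as e:  # e.g. model still loading, network hiccup
            last_err = e
            time.sleep(base_delay * (2 ** attempt))
    raise last_err
```

Note this only covers calls that raise on the client side; errors embedded as strings in a successful response still need the field checks shown under Error Handling.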
+ ### Quick Test
+
+ To verify the API is working, run this test:
+
+ ```python
+ from gradio_client import Client, handle_file
+
+ try:
+     client = Client("https://hanszhu-dense-captioning-platform.hf.space")
+     result = client.predict(
+         image=handle_file("https://raw.githubusercontent.com/gradio-app/gradio/main/test/test_files/bus.png"),
+         fn_index=0
+     )
+     print("✅ API is working!")
+     print(f"Chart Type: {result['chart_type_label']}")
+ except Exception as e:
+     print(f"❌ API test failed: {e}")
+ ```
+
+ ## Support
+
+ For issues or questions:
+ - Check the [Hugging Face Space](https://huggingface.co/spaces/hanszhu/Dense-Captioning-Platform)
+ - Review the error messages in the API response
+ - Ensure your image format and size are within limits
app.py ADDED
@@ -0,0 +1,926 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import gradio as gr
4
+ from PIL import Image
5
+ import torch
6
+ import numpy as np
7
+ import cv2
8
+
9
+ # Add custom modules to path - try multiple possible locations
10
+ possible_paths = [
11
+ "./custom_models",
12
+ "../custom_models",
13
+ "./Dense-Captioning-Platform/custom_models"
14
+ ]
15
+
16
+ for path in possible_paths:
17
+ if os.path.exists(path):
18
+ sys.path.insert(0, os.path.abspath(path))
19
+ break
20
+
21
+ # Add mmcv to path if it exists
22
+ if os.path.exists('./mmcv'):
23
+ sys.path.insert(0, os.path.abspath('./mmcv'))
24
+ print("βœ… Added local mmcv to path")
25
+
26
+ # Import and register custom modules
27
+ try:
28
+ from custom_models import register
29
+ print("βœ… Custom modules registered successfully")
30
+ except Exception as e:
31
+ print(f"⚠️ Warning: Could not register custom modules: {e}")
32
+
33
+ # ----------------------
34
+ # Optional MedSAM integration
35
+ # ----------------------
36
+ class MedSAMIntegrator:
37
+ def __init__(self):
38
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
39
+ self.medsam_model = None
40
+ self.current_image = None
41
+ self.current_image_path = None
42
+ self.embedding = None
43
+ self._load_medsam_model()
44
+
45
+ def _ensure_segment_anything(self):
46
+ try:
47
+ import segment_anything # noqa: F401
48
+ return True
49
+ except Exception as e:
50
+ print(f"⚠ segment_anything not available: {e}. It must be installed at build time (Dockerfile).")
51
+ return False
52
+
53
+ def _load_medsam_model(self):
54
+ try:
55
+ # Ensure library is present
56
+ if not self._ensure_segment_anything():
57
+ print("MedSAM features disabled (segment_anything not available)")
58
+ return
59
+
60
+ from segment_anything import sam_model_registry as _reg
61
+ import torch as _torch
62
+
63
+ # Preferred local path
64
+ medsam_ckpt_path = "models/medsam_vit_b.pth"
65
+
66
+ # If not present, fetch from HF Hub using provided repo or default
67
+ if not os.path.exists(medsam_ckpt_path):
68
+ try:
69
+ from huggingface_hub import hf_hub_download, list_repo_files
70
+ repo_id = os.environ.get("HF_MEDSAM_REPO", "Aniketg6/Fine-Tuned-MedSAM")
71
+ # Try to find a .pth/.pt in the repo
72
+ print(f"πŸ”„ Trying to download MedSAM checkpoint from {repo_id} ...")
73
+ files = list_repo_files(repo_id)
74
+ candidate = None
75
+ for f in files:
76
+ lf = f.lower()
77
+ if lf.endswith(".pth") or lf.endswith(".pt"):
78
+ candidate = f
79
+ break
80
+ if candidate is None:
81
+ # Fallback to a common name
82
+ candidate = "medsam_vit_b.pth"
83
+ ckpt_path = hf_hub_download(repo_id=repo_id, filename=candidate, cache_dir="./models")
84
+ medsam_ckpt_path = ckpt_path
85
+ print(f"βœ… Downloaded MedSAM checkpoint: {medsam_ckpt_path}")
86
+ except Exception as dl_err:
87
+ print(f"⚠ Could not fetch MedSAM checkpoint from HF Hub: {dl_err}")
88
+ print("MedSAM features disabled (no checkpoint)")
89
+ return
90
+
91
+ # Load checkpoint
92
+ checkpoint = _torch.load(medsam_ckpt_path, map_location='cpu')
93
+ self.medsam_model = _reg["vit_b"](checkpoint=None)
94
+ self.medsam_model.load_state_dict(checkpoint)
95
+ self.medsam_model.to(self.device)
96
+ self.medsam_model.eval()
97
+ print("βœ“ MedSAM model loaded successfully")
98
+ except Exception as e:
99
+ print(f"⚠ MedSAM model not available: {e}. MedSAM features disabled.")
100
+
101
+ def is_available(self):
102
+ return self.medsam_model is not None
103
+
104
+ def load_image(self, image_path, precomputed_embedding=None):
105
+ try:
106
+ from skimage import transform, io # local import to avoid hard dep if unused
107
+ img_np = io.imread(image_path)
108
+ if len(img_np.shape) == 2:
109
+ img_3c = np.repeat(img_np[:, :, None], 3, axis=-1)
110
+ else:
111
+ img_3c = img_np
112
+ self.current_image = img_3c
113
+ self.current_image_path = image_path
114
+ if precomputed_embedding is not None:
115
+ if not self.set_precomputed_embedding(precomputed_embedding):
116
+ self.get_embeddings()
117
+ else:
118
+ self.get_embeddings()
119
+ return True
120
+ except Exception as e:
121
+ print(f"Error loading image for MedSAM: {e}")
122
+ return False
123
+
124
+ @torch.no_grad()
125
+ def get_embeddings(self):
126
+ if self.current_image is None or self.medsam_model is None:
127
+ return None
128
+ from skimage import transform
129
+ img_1024 = transform.resize(
130
+ self.current_image, (1024, 1024), order=3, preserve_range=True, anti_aliasing=True
131
+ ).astype(np.uint8)
132
+ img_1024 = (img_1024 - img_1024.min()) / np.clip(img_1024.max() - img_1024.min(), a_min=1e-8, a_max=None)
133
+ img_1024_tensor = (
134
+ torch.tensor(img_1024).float().permute(2, 0, 1).unsqueeze(0).to(self.device)
135
+ )
136
+ self.embedding = self.medsam_model.image_encoder(img_1024_tensor)
137
+ return self.embedding
138
+
139
+ def set_precomputed_embedding(self, embedding_array):
140
+ try:
141
+ if isinstance(embedding_array, np.ndarray):
142
+ embedding_tensor = torch.tensor(embedding_array).to(self.device)
143
+ self.embedding = embedding_tensor
144
+ return True
145
+ return False
146
+ except Exception as e:
147
+ print(f"Error setting precomputed embedding: {e}")
148
+ return False
149
+
150
+ @torch.no_grad()
151
+ def medsam_inference(self, box_1024, height, width):
152
+ if self.embedding is None or self.medsam_model is None:
153
+ return None
154
+ box_torch = torch.as_tensor(box_1024, dtype=torch.float, device=self.embedding.device)
155
+ if len(box_torch.shape) == 2:
156
+ box_torch = box_torch[:, None, :]
157
+ sparse_embeddings, dense_embeddings = self.medsam_model.prompt_encoder(
158
+ points=None, boxes=box_torch, masks=None,
159
+ )
160
+ low_res_logits, _ = self.medsam_model.mask_decoder(
161
+ image_embeddings=self.embedding,
162
+ image_pe=self.medsam_model.prompt_encoder.get_dense_pe(),
163
+ sparse_prompt_embeddings=sparse_embeddings,
164
+ dense_prompt_embeddings=dense_embeddings,
165
+ multimask_output=False,
166
+ )
167
+ low_res_pred = torch.sigmoid(low_res_logits)
168
+ low_res_pred = torch.nn.functional.interpolate(
169
+ low_res_pred, size=(height, width), mode="bilinear", align_corners=False,
170
+ )
171
+ low_res_pred = low_res_pred.squeeze().cpu().numpy()
172
+ medsam_seg = (low_res_pred > 0.5).astype(np.uint8)
173
+ return medsam_seg
174
+
175
+ def segment_with_box(self, bbox):
176
+ if self.embedding is None or self.current_image is None:
177
+ return None
178
+ try:
179
+ H, W, _ = self.current_image.shape
180
+ x1, y1, x2, y2 = bbox
181
+ x1 = max(0, min(int(x1), W - 1))
182
+ y1 = max(0, min(int(y1), H - 1))
183
+ x2 = max(0, min(int(x2), W - 1))
184
+ y2 = max(0, min(int(y2), H - 1))
185
+ if x2 <= x1:
186
+ x2 = min(x1 + 10, W - 1)
187
+ if y2 <= y1:
188
+ y2 = min(y1 + 10, H - 1)
189
+ box_np = np.array([[x1, y1, x2, y2]], dtype=float)
190
+ box_1024 = box_np / np.array([W, H, W, H]) * 1024.0
191
+ medsam_mask = self.medsam_inference(box_1024, H, W)
192
+ if medsam_mask is not None:
193
+ return {"mask": medsam_mask, "confidence": 1.0, "method": "medsam_box"}
194
+ return None
195
+ except Exception as e:
196
+ print(f"Error in MedSAM box-based segmentation: {e}")
197
+ return None
198
+
199
+ # Single global instance
200
+ _medsam = MedSAMIntegrator()
201
+
202
+ # Cache for SAM automatic mask generator
203
+ _sam_auto_generator = None
204
+ _sam_auto_ckpt_path = None
205
+
206
+
207
+ def _get_sam_generator():
208
+ """Load and cache SAM ViT-H automatic mask generator with faster params if checkpoint exists."""
209
+ global _sam_auto_generator, _sam_auto_ckpt_path
210
+ if _sam_auto_generator is not None:
211
+ return _sam_auto_generator
212
+ try:
213
+ from segment_anything import sam_model_registry, SamAutomaticMaskGenerator
214
+ ckpt = "models/sam_vit_h_4b8939.pth"
215
+ if not os.path.exists(ckpt):
216
+ try:
217
+ from huggingface_hub import hf_hub_download
218
+ ckpt = hf_hub_download(
219
+ repo_id="Aniketg6/SAM",
220
+ filename="sam_vit_h_4b8939.pth",
221
+ cache_dir="./models"
222
+ )
223
+ print(f"βœ… Downloaded SAM ViT-H checkpoint to: {ckpt}")
224
+ except Exception as e:
225
+ print(f"⚠ Failed to download SAM ViT-H checkpoint: {e}")
226
+ return None
227
+ _sam_auto_ckpt_path = ckpt
228
+ sam = sam_model_registry["vit_h"](checkpoint=ckpt)
229
+ # Speed-tuned generator params
230
+ _sam_auto_generator = SamAutomaticMaskGenerator(
231
+ sam,
232
+ points_per_side=16,
233
+ pred_iou_thresh=0.88,
234
+ stability_score_thresh=0.9,
235
+ crop_n_layers=0,
236
+ box_nms_thresh=0.7,
237
+ min_mask_region_area=512 # filter tiny masks
238
+ )
239
+ return _sam_auto_generator
240
+ except Exception as e:
241
+ print(f"_get_sam_generator failed: {e}")
242
+ return None
243
+
244
+
245
+ def _extract_bboxes_from_mmdet_result(det_result):
246
+ """Extract Nx4 xyxy bboxes from various MMDet result formats."""
247
+ boxes = []
248
+ try:
249
+ # MMDet 3.x: list of DetDataSample
250
+ if isinstance(det_result, list) and len(det_result) > 0:
251
+ sample = det_result[0]
252
+ if hasattr(sample, 'pred_instances'):
253
+ inst = sample.pred_instances
254
+ if hasattr(inst, 'bboxes'):
255
+ b = inst.bboxes
256
+ # mmengine structures may use .tensor for boxes
257
+ if hasattr(b, 'tensor'):
258
+ b = b.tensor
259
+ boxes = b.detach().cpu().numpy().tolist()
260
+ # Single DetDataSample
261
+ elif hasattr(det_result, 'pred_instances'):
262
+ inst = det_result.pred_instances
263
+ if hasattr(inst, 'bboxes'):
264
+ b = inst.bboxes
265
+ if hasattr(b, 'tensor'):
266
+ b = b.tensor
267
+ boxes = b.detach().cpu().numpy().tolist()
268
+ # MMDet 2.x: tuple of (bbox_result, segm_result)
269
+ elif isinstance(det_result, tuple) and len(det_result) >= 1:
270
+ bbox_result = det_result[0]
271
+ # bbox_result is list per class, each Nx5 [x1,y1,x2,y2,score]
272
+ if isinstance(bbox_result, (list, tuple)):
273
+ for arr in bbox_result:
274
+ try:
275
+ arr_np = np.array(arr)
276
+ if arr_np.ndim == 2 and arr_np.shape[1] >= 4:
277
+ boxes.extend(arr_np[:, :4].tolist())
278
+ except Exception:
279
+ continue
280
+ except Exception as e:
281
+ print(f"Failed to parse MMDet result for boxes: {e}")
282
+ return boxes
283
+
284
+
285
+ def _overlay_masks_on_image(image_pil, mask_list, alpha=0.4):
286
+ """Overlay binary masks on an image with random colors."""
287
+ if image_pil is None or not mask_list:
288
+ return image_pil
289
+ img = np.array(image_pil.convert('RGB'))
290
+ overlay = img.copy()
291
+ for idx, m in enumerate(mask_list):
292
+ if m is None or 'mask' not in m or m['mask'] is None:
293
+ continue
294
+ mask = m['mask'].astype(bool)
295
+ color = np.random.RandomState(seed=idx + 1234).randint(0, 255, size=3)
296
+ overlay[mask] = (0.5 * overlay[mask] + 0.5 * color).astype(np.uint8)
297
+ blended = (alpha * overlay + (1 - alpha) * img).astype(np.uint8)
298
+ return Image.fromarray(blended)
299
+
300
+
301
+ def _mask_to_polygons(mask: np.ndarray):
302
+ """Convert a binary mask (H,W) to a list of polygons ([[x,y], ...]) using OpenCV contours."""
303
+ try:
304
+ mask_u8 = (mask.astype(np.uint8) * 255)
305
+ contours, _ = cv2.findContours(mask_u8, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
306
+ polygons = []
307
+ for cnt in contours:
308
+ if cnt is None or len(cnt) < 3:
309
+ continue
310
+ # Simplify contour slightly
311
+ epsilon = 0.002 * cv2.arcLength(cnt, True)
312
+ approx = cv2.approxPolyDP(cnt, epsilon, True)
313
+ poly = approx.reshape(-1, 2).tolist()
314
+ polygons.append(poly)
315
+ return polygons
316
+ except Exception as e:
317
+ print(f"_mask_to_polygons failed: {e}")
318
+ return []
319
+
320
+
321
+ def _find_largest_foreground_bbox(pil_img: Image.Image):
322
+ """Heuristic: find largest foreground region bbox via Otsu threshold on grayscale.
323
+ Returns [x1, y1, x2, y2] or full-image bbox if none found."""
324
+ try:
325
+ img = np.array(pil_img.convert('RGB'))
326
+ gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
327
+ # Otsu threshold (invert if needed by checking mean)
328
+ _, th = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
329
+ # Assume foreground is darker; invert if threshold yields background as white majority
330
+ if th.mean() > 127:
331
+ th = 255 - th
332
+ # Morph close to connect regions
333
+ kernel = np.ones((5, 5), np.uint8)
334
+ th = cv2.morphologyEx(th, cv2.MORPH_CLOSE, kernel, iterations=2)
335
+ contours, _ = cv2.findContours(th, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
336
+ if not contours:
337
+ W, H = pil_img.size
338
+ return [0, 0, W - 1, H - 1]
339
+ # Largest contour by area
340
+ cnt = max(contours, key=cv2.contourArea)
341
+ x, y, w, h = cv2.boundingRect(cnt)
342
+ # Pad a little
343
+ pad = int(0.02 * max(w, h))
344
+ x1 = max(0, x - pad)
345
+ y1 = max(0, y - pad)
346
+ x2 = min(img.shape[1] - 1, x + w + pad)
347
+ y2 = min(img.shape[0] - 1, y + h + pad)
348
+ return [x1, y1, x2, y2]
349
+ except Exception as e:
350
+ print(f"_find_largest_foreground_bbox failed: {e}")
351
+ W, H = pil_img.size
352
+ return [0, 0, W - 1, H - 1]
353
+
354
+
355
+ def _find_topk_foreground_bboxes(pil_img: Image.Image, max_regions: int = 20, min_area: int = 100):
356
+ """Find top-K foreground bboxes via Otsu threshold + morphology. Returns list of [x1,y1,x2,y2]."""
357
+ try:
358
+ img = np.array(pil_img.convert('RGB'))
359
+ gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
360
+ _, th = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
361
+ if th.mean() > 127:
362
+ th = 255 - th
363
+ kernel = np.ones((3, 3), np.uint8)
364
+ th = cv2.morphologyEx(th, cv2.MORPH_OPEN, kernel, iterations=1)
365
+ th = cv2.morphologyEx(th, cv2.MORPH_CLOSE, kernel, iterations=2)
366
+ contours, _ = cv2.findContours(th, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
367
+ if not contours:
368
+ return []
369
+ contours = sorted(contours, key=cv2.contourArea, reverse=True)
370
+ bboxes = []
371
+ H, W = img.shape[:2]
372
+ for cnt in contours:
373
+ area = cv2.contourArea(cnt)
374
+ if area < min_area:
375
+ continue
376
+ x, y, w, h = cv2.boundingRect(cnt)
377
+ # Filter very thin shapes
378
+ if w < 5 or h < 5:
379
+ continue
380
+ pad = int(0.01 * max(w, h))
381
+ x1 = max(0, x - pad)
382
+ y1 = max(0, y - pad)
383
+ x2 = min(W - 1, x + w + pad)
384
+ y2 = min(H - 1, y + h + pad)
385
+ bboxes.append([x1, y1, x2, y2])
386
+ if len(bboxes) >= max_regions:
387
+ break
388
+ return bboxes
389
+ except Exception as e:
390
+ print(f"_find_topk_foreground_bboxes failed: {e}")
391
+ return []
392
+
393
+ # Try to import mmdet for inference
+ try:
+     from mmdet.apis import init_detector, inference_detector
+     MM_DET_AVAILABLE = True
+     print("βœ… MMDetection available for inference")
+ except ImportError as e:
+     print(f"⚠️ MMDetection import failed: {e}")
+     print("πŸ”„ Attempting to install MMDetection dependencies...")
+     try:
+         import subprocess
+         import sys
+
+         # Use the working solution with mim install
+         print("πŸ”„ Installing MMDetection dependencies with mim...")
+
+         # Install openmim if not already installed
+         subprocess.check_call([sys.executable, "-m", "pip", "install", "openmim"])
+
+         # Install mmengine
+         subprocess.check_call([sys.executable, "-m", "mim", "install", "mmengine"])
+
+         # Install mmcv with mim (this handles compilation properly)
+         subprocess.check_call([sys.executable, "-m", "mim", "install", "mmcv==2.1.0"])
+
+         # Install mmdet
+         subprocess.check_call([sys.executable, "-m", "mim", "install", "mmdet"])
+
+         # Try importing again
+         from mmdet.apis import init_detector, inference_detector
+         MM_DET_AVAILABLE = True
+         print("βœ… MMDetection installed and available for inference")
+     except Exception as install_error:
+         print(f"❌ Failed to install MMDetection: {install_error}")
+         MM_DET_AVAILABLE = False
+
428
+ # === Chart Type Classification (DocFigure) ===
+ print("πŸ”„ Loading Chart Classification Model...")
+
+ # Chart type labels from DocFigure dataset (28 classes)
+ CHART_TYPE_LABELS = [
+     'Line graph', 'Natural image', 'Table', '3D object', 'Bar plot', 'Scatter plot',
+     'Medical image', 'Sketch', 'Geographic map', 'Flow chart', 'Heat map', 'Mask',
+     'Block diagram', 'Venn diagram', 'Confusion matrix', 'Histogram', 'Box plot',
+     'Vector plot', 'Pie chart', 'Surface plot', 'Algorithm', 'Contour plot',
+     'Tree diagram', 'Bubble chart', 'Polar plot', 'Area chart', 'Pareto chart', 'Radar chart'
+ ]
+
+ try:
+     # Load the chart_type.pth model file from Hugging Face Hub
+     from huggingface_hub import hf_hub_download
+     import torch
+     from torchvision import transforms
+
+     print("πŸ”„ Downloading chart_type.pth from Hugging Face Hub...")
+     chart_type_path = hf_hub_download(
+         repo_id="hanszhu/ChartTypeNet-DocFigure",
+         filename="chart_type.pth",
+         cache_dir="./models"
+     )
+     print(f"βœ… Downloaded to: {chart_type_path}")
+
+     # Load the PyTorch model
+     loaded_data = torch.load(chart_type_path, map_location='cpu')
+
+     # Check if it's a state dict or a complete model
+     if isinstance(loaded_data, dict):
+         # Check if it's a checkpoint with model_state_dict
+         if "model_state_dict" in loaded_data:
+             print("πŸ”„ Loading checkpoint, extracting model_state_dict...")
+             state_dict = loaded_data["model_state_dict"]
+         else:
+             # It's a direct state dict
+             print("πŸ”„ Loading state dict, creating model architecture...")
+             state_dict = loaded_data
+
+         # Strip "backbone." prefix from state dict keys if present
+         cleaned_state_dict = {}
+         for key, value in state_dict.items():
+             if key.startswith("backbone."):
+                 # Remove the "backbone." prefix (9 characters)
+                 new_key = key[9:]
+                 cleaned_state_dict[new_key] = value
+             else:
+                 cleaned_state_dict[key] = value
+
+         print(f"πŸ”„ Cleaned state dict: {len(cleaned_state_dict)} keys")
+
+         # Create the model architecture
+         from torchvision.models import resnet50
+         chart_type_model = resnet50(pretrained=False)
+
+         # Create the correct classifier structure to match the state dict
+         import torch.nn as nn
+         in_features = chart_type_model.fc.in_features
+         dropout = nn.Dropout(0.5)
+
+         chart_type_model.fc = nn.Sequential(
+             nn.Linear(in_features, 512),
+             nn.ReLU(inplace=True),
+             dropout,
+             nn.Linear(512, 28)
+         )
+
+         # Load the cleaned state dict
+         chart_type_model.load_state_dict(cleaned_state_dict)
+     else:
+         # It's a complete model
+         chart_type_model = loaded_data
+
+     chart_type_model.eval()
+
+     # Create a simple processor for the model
+     chart_type_processor = transforms.Compose([
+         transforms.Resize((224, 224)),
+         transforms.ToTensor(),
+         transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+     ])
+
+     CHART_TYPE_AVAILABLE = True
+     print("βœ… Chart classification model loaded")
+ except Exception as e:
+     print(f"⚠️ Failed to load chart classification model: {e}")
+     import traceback
+     print("πŸ” Full traceback:")
+     traceback.print_exc()
+     CHART_TYPE_AVAILABLE = False
+
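Downstream, the classifier's prediction path reduces to an argmax over 28 logits mapped through `CHART_TYPE_LABELS`, with an out-of-range guard. A dependency-free sketch of that mapping (label list truncated, `label_from_logits` is our name for it):

```python
# Truncated label list, just for the sketch; app.py defines all 28.
SKETCH_LABELS = ['Line graph', 'Natural image', 'Table']

def label_from_logits(logits, labels):
    """Pick the highest-scoring class index and map it to a label,
    falling back to 'Unknown (i)' for out-of-range indices."""
    idx = max(range(len(logits)), key=lambda i: logits[i])
    label = labels[idx] if 0 <= idx < len(labels) else f"Unknown ({idx})"
    return idx, label

print(label_from_logits([0.1, 2.3, -0.4], SKETCH_LABELS))  # (1, 'Natural image')
```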
520
+ # === Chart Element Detection (Cascade R-CNN) ===
+ element_model = None
+ datapoint_model = None
+
+ print(f"πŸ” MM_DET_AVAILABLE: {MM_DET_AVAILABLE}")
+
+ if MM_DET_AVAILABLE:
+     # Check if config files exist
+     element_config = "models/chart_elementnet_swin.py"
+     point_config = "models/chart_pointnet_swin.py"
+
+     print("πŸ” Checking config files...")
+     print(f"πŸ” Element config exists: {os.path.exists(element_config)}")
+     print(f"πŸ” Point config exists: {os.path.exists(point_config)}")
+     print(f"πŸ” Current working directory: {os.getcwd()}")
+     print(f"πŸ” Files in models directory: {os.listdir('models') if os.path.exists('models') else 'models directory not found'}")
+
+     try:
+         print("πŸ”„ Loading ChartElementNet-MultiClass (Cascade R-CNN)...")
+         print(f"πŸ” Config path: {element_config}")
+         print("πŸ” Weights repo: hanszhu/ChartElementNet-MultiClass")
+         print("πŸ” About to call init_detector...")
+
+         # Download model from Hugging Face Hub
+         from huggingface_hub import hf_hub_download
+         print("πŸ”„ Downloading ChartElementNet weights from Hugging Face Hub...")
+         element_checkpoint = hf_hub_download(
+             repo_id="hanszhu/ChartElementNet-MultiClass",
+             filename="chart_label+.pth",
+             cache_dir="./models"
+         )
+         print(f"βœ… Downloaded to: {element_checkpoint}")
+
+         # Use local config with downloaded weights
+         element_model = init_detector(element_config, element_checkpoint, device="cpu")
+         print("βœ… ChartElementNet loaded successfully")
+     except Exception as e:
+         print(f"❌ Failed to load ChartElementNet: {e}")
+         print(f"πŸ” Error type: {type(e).__name__}")
+         print(f"πŸ” Error details: {str(e)}")
+         import traceback
+         print("πŸ” Full traceback:")
+         traceback.print_exc()
+
+     try:
+         print("πŸ”„ Loading ChartPointNet-InstanceSeg (Mask R-CNN)...")
+         print(f"πŸ” Config path: {point_config}")
+         print("πŸ” Weights repo: hanszhu/ChartPointNet-InstanceSeg")
+         print("πŸ” About to call init_detector...")
+
+         # Download model from Hugging Face Hub
+         print("πŸ”„ Downloading ChartPointNet weights from Hugging Face Hub...")
+         datapoint_checkpoint = hf_hub_download(
+             repo_id="hanszhu/ChartPointNet-InstanceSeg",
+             filename="chart_datapoint.pth",
+             cache_dir="./models"
+         )
+         print(f"βœ… Downloaded to: {datapoint_checkpoint}")
+
+         # Use local config with downloaded weights
+         datapoint_model = init_detector(point_config, datapoint_checkpoint, device="cpu")
+         print("βœ… ChartPointNet loaded successfully")
+     except Exception as e:
+         print(f"❌ Failed to load ChartPointNet: {e}")
+         print(f"πŸ” Error type: {type(e).__name__}")
+         print(f"πŸ” Error details: {str(e)}")
+         import traceback
+         print("πŸ” Full traceback:")
+         traceback.print_exc()
+ else:
+     print("❌ MMDetection not available - cannot load custom models")
+     print("πŸ” MM_DET_AVAILABLE was False")
+
+ print("πŸ” Final model status:")
+ print(f"πŸ” element_model: {element_model is not None}")
+ print(f"πŸ” datapoint_model: {datapoint_model is not None}")
+
597
+ # === Main prediction function ===
+ def analyze(image):
+     """
+     Analyze a chart image and return comprehensive results.
+
+     Args:
+         image: Input chart image (filepath string or PIL.Image)
+
+     Returns:
+         dict: Analysis results containing:
+             - chart_type_id (int): Numeric chart type identifier (0-27)
+             - chart_type_label (str): Human-readable chart type name
+             - element_result (str): Detected chart elements (titles, axes, legends, etc.)
+             - datapoint_result (str): Segmented data points and regions
+             - status (str): Processing status message
+             - processing_time (float): Time taken for analysis in seconds
+     """
+     import time
+     from PIL import Image
+
+     start_time = time.time()
+
+     # Handle filepath input (convert to PIL Image)
+     if isinstance(image, str):
+         # It's a filepath, load the image
+         image = Image.open(image).convert("RGB")
+     elif image is None:
+         return {"error": "No image provided"}
+
+     # Ensure we have a PIL Image
+     if not isinstance(image, Image.Image):
+         return {"error": "Invalid image format"}
+
+     result = {
+         "chart_type_id": "Model not available",
+         "chart_type_label": "Model not available",
+         "element_result": "MMDetection models not available",
+         "datapoint_result": "MMDetection models not available",
+         "status": "Basic chart classification only",
+         "processing_time": 0.0,
+         "medsam": {"available": False}
+     }
+
+     # Chart Type Classification
+     if CHART_TYPE_AVAILABLE:
+         try:
+             # Preprocess image for PyTorch model
+             processed_image = chart_type_processor(image).unsqueeze(0)  # Add batch dimension
+
+             # Get prediction
+             with torch.no_grad():
+                 outputs = chart_type_model(processed_image)
+                 # Handle different output formats
+                 if isinstance(outputs, torch.Tensor):
+                     logits = outputs
+                 elif hasattr(outputs, 'logits'):
+                     logits = outputs.logits
+                 else:
+                     logits = outputs
+
+             predicted_class = logits.argmax(dim=-1).item()
+
+             result["chart_type_id"] = predicted_class
+             result["chart_type_label"] = CHART_TYPE_LABELS[predicted_class] if 0 <= predicted_class < len(CHART_TYPE_LABELS) else f"Unknown ({predicted_class})"
+             result["status"] = "Chart classification completed"
+
+         except Exception as e:
+             result["chart_type_id"] = f"Error: {str(e)}"
+             result["chart_type_label"] = f"Error: {str(e)}"
+             result["status"] = "Error in chart classification"
+
+     # Chart Element Detection (Cascade R-CNN)
+     if element_model is not None:
+         try:
+             # If medical image, skip heavy MMDet to speed up
+             if isinstance(result.get("chart_type_label"), str) and result["chart_type_label"].lower() == "medical image":
+                 result["element_result"] = "skipped_for_medical"
+             else:
+                 # Convert PIL image to numpy array for MMDetection
+                 np_img = np.array(image.convert("RGB"))[:, :, ::-1]  # RGB β†’ BGR
+
+                 element_result = inference_detector(element_model, np_img)
+
+                 # Convert result to more API-friendly format
+                 if isinstance(element_result, tuple):
+                     bbox_result, segm_result = element_result
+                     element_data = {
+                         "bboxes": bbox_result.tolist() if hasattr(bbox_result, 'tolist') else str(bbox_result),
+                         "segments": segm_result.tolist() if hasattr(segm_result, 'tolist') else str(segm_result)
+                     }
+                 else:
+                     element_data = str(element_result)
+
+                 result["element_result"] = element_data
+                 result["status"] = "Chart classification + element detection completed"
+         except Exception as e:
+             result["element_result"] = f"Error: {str(e)}"
+
+     # Chart Data Point Segmentation (Mask R-CNN)
+     if datapoint_model is not None:
+         try:
+             # If medical image, skip heavy MMDet to speed up
+             if isinstance(result.get("chart_type_label"), str) and result["chart_type_label"].lower() == "medical image":
+                 result["datapoint_result"] = "skipped_for_medical"
+             else:
+                 # Convert PIL image to numpy array for MMDetection
+                 np_img = np.array(image.convert("RGB"))[:, :, ::-1]  # RGB β†’ BGR
+
+                 datapoint_result = inference_detector(datapoint_model, np_img)
+
+                 # Convert result to more API-friendly format
+                 if isinstance(datapoint_result, tuple):
+                     bbox_result, segm_result = datapoint_result
+                     datapoint_data = {
+                         "bboxes": bbox_result.tolist() if hasattr(bbox_result, 'tolist') else str(bbox_result),
+                         "segments": segm_result.tolist() if hasattr(segm_result, 'tolist') else str(segm_result)
+                     }
+                 else:
+                     datapoint_data = str(datapoint_result)
+
+                 result["datapoint_result"] = datapoint_data
+                 result["status"] = "Full analysis completed"
+         except Exception as e:
+             result["datapoint_result"] = f"Error: {str(e)}"
+
+     # If predicted as medical image and MedSAM is available, include mask data (polygons)
+     try:
+         label_lower = str(result.get("chart_type_label", "")).strip().lower()
+         if label_lower == "medical image":
+             if _medsam.is_available():
+                 # Do not run heuristics here. Prompts are required and handled in the UI then-chain.
+                 # Indicate availability and that prompts are needed for segmentation.
+                 result["medsam"] = {"available": True, "reason": "provide bbox/points prompts to generate segmentations"}
+             else:
+                 # Not available; include reason
+                 result["medsam"] = {"available": False, "reason": "segment_anything or checkpoint missing"}
+     except Exception as e:
+         print(f"MedSAM JSON augmentation failed: {e}")
+
+     result["processing_time"] = round(time.time() - start_time, 3)
+     return result
+
739
+
+ def analyze_with_medsam(base_result, image):
+     """Auto-generate segmentations for medical images using SAM ViT-H if available,
+     otherwise fall back to MedSAM over top-K foreground boxes. Returns updated JSON and overlay image."""
+     try:
+         if not isinstance(base_result, dict):
+             return base_result, None
+         label = str(base_result.get("chart_type_label", "")).strip().lower()
+         if label != "medical image":
+             return base_result, None
+
+         pil_img = Image.open(image).convert("RGB") if isinstance(image, str) else image
+         if pil_img is None:
+             return base_result, None
+
+         segmentations = []
+         masks_for_overlay = []
+
+         # Try fast SAM generator first; avoid MedSAM embedding when SAM is available
+         gen = _get_sam_generator()
+         if gen is not None and _sam_auto_ckpt_path is not None and os.path.exists(_sam_auto_ckpt_path):
+             try:
+                 import cv2 as _cv2
+                 img_path = image if isinstance(image, str) else None
+                 if img_path is None:
+                     tmp_path = "./_tmp_input_image.png"
+                     pil_img.save(tmp_path)
+                     img_path = tmp_path
+                 img_bgr = _cv2.imread(img_path)
+                 masks = gen.generate(img_bgr)
+                 # Keep top-K by stability_score or area
+                 def _score(m):
+                     s = float(m.get('stability_score', 0.0))
+                     seg = m.get('segmentation', None)
+                     area = int(seg.sum()) if isinstance(seg, np.ndarray) else 0
+                     return (s, area)
+                 masks = sorted(masks, key=_score, reverse=True)[:8]
+                 for m in masks:
+                     seg = m.get('segmentation', None)
+                     if seg is None:
+                         continue
+                     seg_u8 = seg.astype(np.uint8)
+                     segmentations.append({
+                         "mask": seg_u8.tolist(),
+                         "confidence": float(m.get('stability_score', 1.0)),
+                         "method": "sam_auto"
+                     })
+                     masks_for_overlay.append({"mask": seg_u8})
+             except Exception as e:
+                 print(f"SAM generator segmentation failed: {e}")
+
+         # Fall back to MedSAM boxes only if nothing was produced
+         if not segmentations and _medsam.is_available():
+             try:
+                 # Prepare embedding once
+                 img_path = image if isinstance(image, str) else None
+                 if img_path is None:
+                     tmp_path = "./_tmp_input_image.png"
+                     pil_img.save(tmp_path)
+                     img_path = tmp_path
+                 _medsam.load_image(img_path)
+                 cand_bboxes = _find_topk_foreground_bboxes(pil_img, max_regions=5, min_area=400)
+                 for bbox in cand_bboxes:
+                     m = _medsam.segment_with_box(bbox)
+                     if m is None or not isinstance(m.get('mask'), np.ndarray):
+                         continue
+                     segmentations.append({
+                         "mask": m['mask'].astype(np.uint8).tolist(),
+                         "confidence": float(m.get('confidence', 1.0)),
+                         "method": m.get("method", "medsam_box_auto")
+                     })
+                     masks_for_overlay.append(m)
+             except Exception as auto_e:
+                 print(f"MedSAM fallback segmentation failed: {auto_e}")
+
+         W, H = pil_img.size
+         base_result["medsam"] = {
+             "available": True,
+             "height": H,
+             "width": W,
+             "segmentations": segmentations,
+             "num_segments": len(segmentations)
+         }
+
+         overlay_img = _overlay_masks_on_image(pil_img, masks_for_overlay) if masks_for_overlay else None
+         return base_result, overlay_img
+     except Exception as e:
+         print(f"analyze_with_medsam failed: {e}")
+         return base_result, None
+
829
+ # === Gradio UI with API enhancements ===
+ # Create Blocks interface with explicit API name for a stable API surface
+ with gr.Blocks(
+     title="πŸ“Š Dense Captioning Platform"
+ ) as demo:
+
+     gr.Markdown("# πŸ“Š Dense Captioning Platform")
+     gr.Markdown("""
+ **Comprehensive Chart Analysis API**
+
+ Upload a chart image to get:
+ - **Chart Type Classification**: Identifies the type of chart (line, bar, scatter, etc.)
+ - **Element Detection**: Detects chart elements like titles, axes, legends, data points
+ - **Data Point Segmentation**: Segments individual data points and regions
+
+ Masks will be automatically generated for medical images when supported.
+
+ **API Usage:**
+ ```python
+ from gradio_client import Client, handle_file
+
+ client = Client("hanszhu/Dense-Captioning-Platform")
+ result = client.predict(
+     image=handle_file('path/to/your/chart.png'),
+     api_name="/predict"
+ )
+ print(result)
+ ```
+
+ **Supported Chart Types:** Line graphs, Bar plots, Scatter plots, Pie charts, Heat maps, and 23+ more
+ """)
+
+     with gr.Row():
+         with gr.Column():
+             # Input
+             image_input = gr.Image(
+                 type="filepath",  # βœ… REQUIRED for gradio_client
+                 label="Upload Chart Image",
+                 height=400
+             )
+
+             # Analyze button (single)
+             analyze_btn = gr.Button(
+                 "πŸ” Analyze",
+                 variant="primary",
+                 size="lg"
+             )
+
+         with gr.Column():
+             # Output JSON
+             result_output = gr.JSON(
+                 label="Analysis Results",
+                 height=400
+             )
+             # Overlay image output (populated only for medical images)
+             overlay_output = gr.Image(
+                 label="MedSAM Overlay (Medical images)",
+                 height=400
+             )
+
+     # Single API endpoint for JSON
+     analyze_event = analyze_btn.click(
+         fn=analyze,
+         inputs=image_input,
+         outputs=result_output,
+         api_name="/predict"  # βœ… Standard API name that gradio_client expects
+     )
+
+     # Automatic overlay generation step for medical images
+     analyze_event.then(
+         fn=analyze_with_medsam,
+         inputs=[result_output, image_input],
+         outputs=[result_output, overlay_output],
+     )
+
+     # Add some examples
+     gr.Examples(
+         examples=[
+             ["https://raw.githubusercontent.com/gradio-app/gradio/main/test/test_files/bus.png"]
+         ],
+         inputs=image_input,
+         label="Try with this example"
+     )
+
+ # Launch with API-friendly settings
+ if __name__ == "__main__":
+     launch_kwargs = {
+         "server_name": "0.0.0.0",  # Allow external connections
+         "server_port": 7860,
+         "share": False,  # Set to True if you want a public link
+         "show_error": True,  # Show detailed errors for debugging
+         "quiet": False,  # Show startup messages
+         "show_api": True  # Enable API documentation
+     }
+
+     # Enable queue for gradio_client compatibility
+     demo.queue().launch(**launch_kwargs)  # βœ… required for gradio_client to work
+
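On the client side, the `/predict` endpoint above returns the JSON dict built in `analyze`. A hedged sketch of unpacking it once received (`summarize_result` is our helper name, not part of the Space; the dict shape follows the `result` keys defined in app.py):

```python
def summarize_result(result):
    """Condense the /predict JSON into a one-line summary string."""
    label = result.get("chart_type_label", "unknown")
    status = result.get("status", "")
    medsam = result.get("medsam")
    n_segs = medsam.get("num_segments", 0) if isinstance(medsam, dict) else 0
    return f"{label} | {status} | segments: {n_segs}"

# Example payload mirroring the keys analyze() populates
example = {
    "chart_type_label": "Line graph",
    "status": "Full analysis completed",
    "medsam": {"available": False},
}
print(summarize_result(example))  # Line graph | Full analysis completed | segments: 0
```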
custom_models/custom_cascade_with_meta.py ADDED
@@ -0,0 +1,152 @@
+ from mmdet.models.detectors import CascadeRCNN
+ from mmdet.registry import MODELS
+ from mmdet.structures import DetDataSample  # needed by simple_test below
+ import torch
+ import torch.nn as nn
+
+ @MODELS.register_module()
+ class CustomCascadeWithMeta(CascadeRCNN):
+     """Custom Cascade R-CNN with metadata prediction heads."""
+
+     def __init__(self,
+                  *args,
+                  chart_cls_head=None,
+                  plot_reg_head=None,
+                  axes_info_head=None,
+                  data_series_head=None,
+                  data_points_count_head=None,
+                  coordinate_standardization=None,
+                  data_series_config=None,
+                  axis_aware_feature=None,
+                  **kwargs):
+         super().__init__(*args, **kwargs)
+
+         # Initialize metadata prediction heads
+         if chart_cls_head is not None:
+             self.chart_cls_head = MODELS.build(chart_cls_head)
+         if plot_reg_head is not None:
+             self.plot_reg_head = MODELS.build(plot_reg_head)
+         if axes_info_head is not None:
+             self.axes_info_head = MODELS.build(axes_info_head)
+         if data_series_head is not None:
+             self.data_series_head = MODELS.build(data_series_head)
+         if data_points_count_head is not None:
+             self.data_points_count_head = MODELS.build(data_points_count_head)
+         else:
+             # Default simple regression head for data point count
+             self.data_points_count_head = nn.Sequential(
+                 nn.Linear(2048, 512),  # Assuming ResNet-50 backbone features
+                 nn.ReLU(),
+                 nn.Dropout(0.1),
+                 nn.Linear(512, 1)  # Single output for count
+             )
+
+         # Store configurations
+         self.coordinate_standardization = coordinate_standardization
+         self.data_series_config = data_series_config
+         self.axis_aware_feature = axis_aware_feature
+
+     def forward_train(self, img, img_metas, gt_bboxes, gt_labels, **kwargs):
+         """Forward function during training."""
+         # Get base detector predictions
+         x = self.extract_feat(img)
+         losses = dict()
+
+         # RPN forward and loss
+         if self.with_rpn:
+             proposal_cfg = self.train_cfg.get('rpn_proposal',
+                                               self.test_cfg.rpn)
+             rpn_losses, proposal_list = self.rpn_head.forward_train(
+                 x,
+                 img_metas,
+                 gt_bboxes,
+                 gt_labels=None,
+                 ann_weight=None,
+                 proposal_cfg=proposal_cfg)
+             losses.update(rpn_losses)
+         else:
+             proposal_list = kwargs.get('proposals', None)
+
+         # ROI forward and loss
+         roi_losses = self.roi_head.forward_train(x, img_metas, proposal_list,
+                                                  gt_bboxes, gt_labels, **kwargs)
+         losses.update(roi_losses)
+
+         # Get global features for metadata prediction
+         global_feat = x[-1].mean(dim=[2, 3])  # Global average pooling
+
+         # Extract ground truth data point counts from img_metas
+         gt_data_point_counts = []
+         for img_meta in img_metas:
+             count = img_meta.get('img_info', {}).get('num_data_points', 0)
+             gt_data_point_counts.append(count)
+         gt_data_point_counts = torch.tensor(gt_data_point_counts, dtype=torch.float32, device=global_feat.device)
+
+         # Predict data point counts and compute loss
+         pred_data_point_counts = self.data_points_count_head(global_feat).squeeze(-1)
+         data_points_count_loss = nn.MSELoss()(pred_data_point_counts, gt_data_point_counts)
+         losses['data_points_count_loss'] = data_points_count_loss
+
+         # Use predicted data point count as additional feature for ROI head:
+         # expand the global feature with data point count information
+         normalized_counts = torch.sigmoid(pred_data_point_counts / 100.0)  # Normalize to 0-1 range
+         enhanced_global_feat = torch.cat([global_feat, normalized_counts.unsqueeze(-1)], dim=-1)
+
+         # Metadata prediction losses
+         if hasattr(self, 'chart_cls_head'):
+             chart_cls_loss = self.chart_cls_head(enhanced_global_feat)
+             losses['chart_cls_loss'] = chart_cls_loss
+
+         if hasattr(self, 'plot_reg_head'):
+             plot_reg_loss = self.plot_reg_head(enhanced_global_feat)
+             losses['plot_reg_loss'] = plot_reg_loss
+
+         if hasattr(self, 'axes_info_head'):
+             axes_info_loss = self.axes_info_head(enhanced_global_feat)
+             losses['axes_info_loss'] = axes_info_loss
+
+         if hasattr(self, 'data_series_head'):
+             data_series_loss = self.data_series_head(enhanced_global_feat)
+             losses['data_series_loss'] = data_series_loss
+
+         return losses
+
+     def simple_test(self, img, img_metas, **kwargs):
+         """Test without augmentation."""
+         x = self.extract_feat(img)
+         proposal_list = self.rpn_head.simple_test_rpn(x, img_metas)
+         det_bboxes, det_labels = self.roi_head.simple_test_bboxes(
+             x, img_metas, proposal_list, self.test_cfg.rcnn, **kwargs)
+
+         # Get global features for metadata prediction
+         global_feat = x[-1].mean(dim=[2, 3])  # Global average pooling
+
+         # Predict data point counts
+         pred_data_point_counts = self.data_points_count_head(global_feat).squeeze(-1)
+
+         # Use predicted data point count as additional feature
+         normalized_counts = torch.sigmoid(pred_data_point_counts / 100.0)  # Normalize to 0-1 range
+         enhanced_global_feat = torch.cat([global_feat, normalized_counts.unsqueeze(-1)], dim=-1)
+
+         # Get metadata predictions
+         results = []
+         for i, (bboxes, labels) in enumerate(zip(det_bboxes, det_labels)):
+             result = DetDataSample()
+             result.bboxes = bboxes
+             result.labels = labels
+
+             # Add data point count prediction
+             result.predicted_data_points = pred_data_point_counts[i].item()
+
+             # Add metadata predictions using enhanced features
+             if hasattr(self, 'chart_cls_head'):
+                 result.chart_type = self.chart_cls_head(enhanced_global_feat[i:i+1])
+             if hasattr(self, 'plot_reg_head'):
+                 result.plot_bb = self.plot_reg_head(enhanced_global_feat[i:i+1])
+             if hasattr(self, 'axes_info_head'):
+                 result.axes_info = self.axes_info_head(enhanced_global_feat[i:i+1])
+             if hasattr(self, 'data_series_head'):
+                 result.data_series = self.data_series_head(enhanced_global_feat[i:i+1])
+
+             results.append(result)
+
+         return results
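The count-conditioning step in this detector (sigmoid-squash the predicted count, append it as one extra feature channel before the metadata heads) can be illustrated without torch; `augment_with_count` is our name for this sketch, not a function in the file:

```python
import math

def augment_with_count(features, predicted_count, scale=100.0):
    """Append sigmoid(predicted_count / scale) to a flat feature vector,
    mirroring torch.cat([global_feat, normalized_counts.unsqueeze(-1)], dim=-1)."""
    normalized = 1.0 / (1.0 + math.exp(-predicted_count / scale))
    return features + [normalized]

feat = augment_with_count([0.5, -1.2], 0.0)
print(feat)  # [0.5, -1.2, 0.5]  (sigmoid(0) == 0.5)
```

The division by 100 keeps typical per-chart counts (tens to a few hundred points) away from sigmoid saturation, so the appended channel remains informative.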
custom_models/custom_dataset.py ADDED
@@ -0,0 +1,537 @@
1
+ import json
2
+ import os.path as osp
3
+ import numpy as np
4
+ from mmcv.transforms import BaseTransform
5
+ from mmcv.transforms import LoadImageFromFile
6
+ from mmdet.registry import DATASETS, TRANSFORMS
7
+ from mmdet.datasets.transforms import PackDetInputs
8
+ from mmdet.datasets.base_det_dataset import BaseDetDataset
9
+ import warnings
10
+
11
+ # ─── Enhanced robust image loader for real images ───
12
+ @TRANSFORMS.register_module()
13
+ class RobustLoadImageFromFile(LoadImageFromFile):
14
+ """Enhanced image loader: tries real images first, falls back to dummy if needed."""
15
+
16
+ # Class variable to track missing images
17
+ missing_count = 0
18
+
19
+ def __init__(self, try_real_images=True, fallback_to_dummy=True, **kwargs):
20
+ super().__init__(**kwargs)
21
+ self.try_real_images = try_real_images
22
+ self.fallback_to_dummy = fallback_to_dummy
23
+
24
+ def transform(self, results):
25
+ """Try to load real image first, fall back to dummy if not found."""
26
+ if self.try_real_images:
27
+ try:
28
+ # Try standard MMDet image loading first
29
+ results = super().transform(results)
30
+ return results
31
+
32
+ except (FileNotFoundError, OSError, Exception) as e:
33
+ # Count missing image
34
+ RobustLoadImageFromFile.missing_count += 1
35
+
36
+ # Log warning every 10 missing images to avoid spam
37
+ if RobustLoadImageFromFile.missing_count % 10 == 1:
38
+ warnings.warn(f"Missing image #{RobustLoadImageFromFile.missing_count}: {results.get('img_path', 'unknown')}. "
39
+ f"Total missing so far: {RobustLoadImageFromFile.missing_count}",
40
+ UserWarning)
41
+
42
+ if not self.fallback_to_dummy:
43
+ raise e
44
+ # Fall through to create dummy image
45
+
46
+ # Create dummy image (either by choice or because real image loading failed)
47
+ if 'img_shape' in results:
48
+ h, w = results['img_shape'][:2]
49
+ else:
50
+ h = results.get('height', 800)
51
+ w = results.get('width', 600)
52
+
53
+ results['img'] = np.zeros((h, w, 3), dtype=np.uint8)
54
+ results['img_shape'] = (h, w, 3)
55
+ results['ori_shape'] = (h, w, 3)
56
+ return results
57
+
58
+ @classmethod
59
+ def get_missing_count(cls):
60
+ """Get the total count of missing images."""
61
+ return cls.missing_count
62
+
63
+ @classmethod
64
+ def reset_missing_count(cls):
65
+ """Reset the missing image counter."""
66
+ cls.missing_count = 0
67
+
68
+ # ─── Legacy support for old transform name ───
69
+ @TRANSFORMS.register_module()
70
+ class CreateDummyImg(RobustLoadImageFromFile):
71
+ """Legacy alias for RobustLoadImageFromFile."""
72
+ pass
73
+
74
+ @TRANSFORMS.register_module()
75
+ class ClampBBoxes(BaseTransform):
76
+ """Simple bbox clamping transform - only clamps coordinates, doesn't filter."""
77
+ def __init__(self, min_size=1):
78
+ self.min_size = min_size
79
+
80
+ def transform(self, results):
81
+ """Clamp bboxes to image bounds without removing any boxes."""
82
+ if 'gt_bboxes' not in results:
83
+ return results
84
+
85
+ h, w = results['img_shape'][:2]
86
+
87
+ # Handle both numpy arrays and MMDet's HorizontalBoxes objects
88
+ gt_bboxes = results['gt_bboxes']
89
+ if hasattr(gt_bboxes, 'tensor'):
90
+ # MMDet HorizontalBoxes object - clamp in place
91
+ gt_bboxes.tensor[:, 0].clamp_(0, w) # x1
92
+ gt_bboxes.tensor[:, 1].clamp_(0, h) # y1
93
+ gt_bboxes.tensor[:, 2].clamp_(0, w) # x2
94
+ gt_bboxes.tensor[:, 3].clamp_(0, h) # y2
95
+ else:
96
+ # Regular numpy array - clamp in place
97
+ if len(gt_bboxes) > 0:
98
+ gt_bboxes[:, 0] = np.clip(gt_bboxes[:, 0], 0, w) # x1
99
+ gt_bboxes[:, 1] = np.clip(gt_bboxes[:, 1], 0, h) # y1
100
+ gt_bboxes[:, 2] = np.clip(gt_bboxes[:, 2], 0, w) # x2
101
+ gt_bboxes[:, 3] = np.clip(gt_bboxes[:, 3], 0, h) # y2
102
+
103
+ # Don't drop anything here - let filter_cfg handle empty GT filtering
104
+ results['gt_bboxes'] = gt_bboxes
105
+ return results
106
+
107
+ @TRANSFORMS.register_module()
108
+ class SetScaleFactor(BaseTransform):
109
+ """Compute scale_factor from data_series & plot_bb before any Resize."""
110
+ def __init__(self, default_scale=(1.0, 1.0)):
111
+ self.default_scale = default_scale
112
+
113
+ def calculate_scale_factor(self, results):
114
+ bb = results.get('plot_bb', {})
115
+ w, h = bb.get('width', 0), bb.get('height', 0)
116
+ xs, ys = [], []
117
+ for series in results.get('data_series', []):
118
+ for pt in series.get('data', []):
119
+ x, y = pt.get('x'), pt.get('y')
120
+ if isinstance(x, (int, float)): xs.append(x)
121
+ if isinstance(y, (int, float)): ys.append(y)
122
+ if xs and max(xs) != min(xs):
123
+ x_scale = w / (max(xs) - min(xs))
124
+ else:
125
+ x_scale = self.default_scale[0]
126
+ if ys and max(ys) != min(ys):
127
+ y_scale = -h / (max(ys) - min(ys))
128
+ else:
129
+ y_scale = self.default_scale[1]
130
+ return (x_scale, y_scale)
131
+
132
+ def transform(self, results):
133
+ try:
134
+ sf = self.calculate_scale_factor(results)
135
+ results['scale_factor'] = np.array(sf, dtype=np.float32)
136
+ except Exception:
137
+ results['scale_factor'] = np.array(self.default_scale, dtype=np.float32)
138
+ H, W = results.get('height', 0), results.get('width', 0)
139
+ results['img_shape'] = (H, W, 3)
140
+ return results
141
+
+
+@TRANSFORMS.register_module()
+class EnsureScaleFactor(BaseTransform):
+    """Fallback if no scale_factor has been set yet."""
+
+    def transform(self, results):
+        # Only fill in a default; don't clobber a factor set upstream
+        if 'scale_factor' not in results:
+            results['scale_factor'] = np.array([1.0, 1.0], dtype=np.float32)
+        return results
+
+
+@TRANSFORMS.register_module()
+class SetInputs(BaseTransform):
+    """Copy dummy img into inputs for DetDataPreprocessor."""
+
+    def transform(self, results):
+        if 'img' in results:
+            results['inputs'] = results['img'].copy()
+        return results
+
+
+@TRANSFORMS.register_module()
+class CustomPackDetInputs(PackDetInputs):
+    """Final packing into DetDataSample; ensure inputs are present."""
+
+    def transform(self, results):
+        if 'img' in results:
+            results['inputs'] = results['img'].copy()
+        return super().transform(results)
+
+
+@DATASETS.register_module()
+class ChartDataset(BaseDetDataset):
+    """Enhanced dataset for comprehensive chart element detection and analysis."""
+
+    # METAINFO with 21 enhanced categories
+    METAINFO = {
+        'classes': [
+            'title', 'subtitle', 'x-axis', 'y-axis', 'x-axis-label', 'y-axis-label',
+            'x-tick-label', 'y-tick-label', 'legend', 'legend-title', 'legend-item',
+            'data-point', 'data-line', 'data-bar', 'data-area', 'grid-line',
+            'axis-title', 'tick-label', 'data-label', 'legend-text', 'plot-area'
+        ]
+    }
+
+    # Chart-type specific element filtering based on the actual dataset
+    # distribution (from analyze_chart_types.py):
+    #   - line (41.9%): 1710 images -> data-line only
+    #   - scatter (18.2%): 742 images -> data-point only
+    #   - vertical_bar (30.5%): 1246 images -> data-bar only
+    #   - dot (9.2%): 374 images -> data-point only
+    #   - horizontal_bar (0.2%): 9 images -> data-bar only
+    CHART_TYPE_ELEMENT_MAPPING = {
+        'line': {
+            'allowed_data_elements': {'data-line'},
+            'forbidden_data_elements': {'data-point', 'data-bar', 'data-area'}
+        },
+        'scatter': {
+            'allowed_data_elements': {'data-point'},
+            'forbidden_data_elements': {'data-line', 'data-bar', 'data-area'}
+        },
+        'vertical_bar': {
+            'allowed_data_elements': {'data-bar'},
+            'forbidden_data_elements': {'data-point', 'data-line', 'data-area'}
+        },
+        'dot': {
+            'allowed_data_elements': {'data-point'},
+            'forbidden_data_elements': {'data-line', 'data-bar', 'data-area'}
+        },
+        'horizontal_bar': {
+            'allowed_data_elements': {'data-bar'},
+            'forbidden_data_elements': {'data-point', 'data-line', 'data-area'}
+        }
+    }
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.metainfo.update(self.METAINFO)
+
+        # Print configuration info
+        print(f"📊 ChartDataset initialized with {len(self.METAINFO['classes'])} categories:")
+        for i, cls_name in enumerate(self.METAINFO['classes']):
+            print(f"   {i}: {cls_name}")
+
+        # Print chart-type filtering info
+        print("🎯 Chart-type specific filtering enabled:")
+        for chart_type, mapping in self.CHART_TYPE_ELEMENT_MAPPING.items():
+            allowed = mapping.get('allowed_data_elements', set())
+            forbidden = mapping.get('forbidden_data_elements', set())
+            print(f"   - {chart_type}: ✅ {allowed} | 🚫 {forbidden}")
+
+        # Debug-print the data configuration
+        print("📁 Dataset configuration:")
+        print(f"   - data_root: {getattr(self, 'data_root', 'None')}")
+        print(f"   - data_prefix: {getattr(self, 'data_prefix', 'None')}")
+        print(f"   - ann_file: {getattr(self, 'ann_file', 'None')}")
+
+    def load_data_list(self):
+        """Load enhanced annotation files with a priority order."""
+
+        # Auto-detect the best annotation file (same logic as the config)
+        def get_best_ann_file(split):
+            ann_dir = osp.join(self.data_root, 'annotations_JSON')
+
+            # Priority order with flexible naming
+            candidates = [
+                f'{split}_enriched_with_info.json',
+                f'{split}_enriched.json',
+                f'{split}_with_info.json',  # handles val_with_info.json
+                f'{split}.json',
+                f'{split}_cleaned.json'
+            ]
+
+            for candidate in candidates:
+                full_path = osp.join(ann_dir, candidate)
+                if osp.exists(full_path):
+                    print(f"📁 ChartDataset using {candidate}")
+                    return full_path
+
+            # Fall back to ann_file if specified
+            if hasattr(self, 'ann_file') and self.ann_file:
+                fallback_path = osp.join(self.data_root, self.ann_file)
+                if osp.exists(fallback_path):
+                    print(f"📁 Using fallback annotation file: {self.ann_file}")
+                    return fallback_path
+
+            raise FileNotFoundError(f"No annotation files found in {ann_dir}")
+
+        # Determine the file path
+        if hasattr(self, 'ann_file') and self.ann_file:
+            ann_file_path = osp.join(self.data_root, self.ann_file)
+        else:
+            # Try to auto-detect based on common split names
+            for split in ['train', 'val']:
+                try:
+                    ann_file_path = get_best_ann_file(split)
+                    break
+                except FileNotFoundError:
+                    continue
+            else:
+                raise FileNotFoundError("Could not find any annotation files")
+
+        # Load the annotation file
+        with open(ann_file_path, 'r') as f:
+            ann = json.load(f)
+
+        print(f"📊 Loading from {ann_file_path}")
+        print(f"   - Images: {len(ann.get('images', []))}")
+        print(f"   - Annotations: {len(ann.get('annotations', []))}")
+
+        # Build the image lookup
+        img_id_to_info = {img['id']: img for img in ann['images']}
+
+        # Group annotations by image
+        img_id_to_anns = {}
+        for ann_data in ann.get('annotations', []):
+            img_id_to_anns.setdefault(ann_data['image_id'], []).append(ann_data)
+
+        # Create the data list with enhanced metadata
+        data_list = []
+        for img_id, img_info in img_id_to_info.items():
+            annotations = img_id_to_anns.get(img_id, [])
+
+            # Skip images without annotations if filter_empty_gt is enabled
+            if not annotations and (self.filter_cfg or {}).get('filter_empty_gt', False):
+                continue
+
+            # Convert annotations to the instances format
+            instances = []
+            for ann_item in annotations:  # renamed from `ann` to avoid shadowing the loaded file
+                bbox = ann_item['bbox']  # COCO [x, y, width, height]
+                # Convert to [x1, y1, x2, y2] format for MMDet
+                bbox_xyxy = [bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3]]
+
+                instance = {
+                    'bbox': bbox_xyxy,
+                    'bbox_label': ann_item['category_id'],
+                    'ignore_flag': 0,
+                    'annotation_id': ann_item.get('id', -1),
+                    'area': ann_item.get('area', bbox[2] * bbox[3]),
+                    'element_type': ann_item.get('element_type', 'unknown')
+                }
+
+                # Add additional annotation metadata if available
+                for key in ['text', 'role', 'data_point', 'chart_type', 'total_data_points']:
+                    if key in ann_item:
+                        instance[key] = ann_item[key]
+
+                instances.append(instance)
+
+            # Construct the full image path using data_prefix (like standard MMDet datasets)
+            filename = img_info['file_name']
+            if self.data_prefix.get('img'):
+                img_path = osp.join(self.data_prefix['img'], filename)
+            else:
+                img_path = filename  # fall back to the original filename
+
+            data_info = {
+                'img_id': img_info['id'],
+                'img_path': img_path,
+                'height': img_info['height'],
+                'width': img_info['width'],
+                'instances': instances,
+                # Enhanced metadata from enriched annotations
+                'chart_type': img_info.get('chart_type', ''),
+                'plot_bb': img_info.get('plot_bb', {}),
+                'data_series': img_info.get('data_series', []),
+                'data_series_stats': img_info.get('data_series_stats', {}),
+                'axes_info': img_info.get('axes_info', {}),
+                'element_counts': img_info.get('element_counts', {}),
+                'source': img_info.get('source', 'unknown')
+            }
+
+            data_list.append(data_info)
+
+        print(f"✅ Loaded {len(data_list)} images with enhanced metadata")
+        return data_list
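load_data_list converts COCO-style [x, y, width, height] boxes to the [x1, y1, x2, y2] layout MMDet expects, and parse_data_info later clamps them to the image bounds. Both steps combined, as a small hypothetical helper:

```python
def coco_to_xyxy(bbox, img_w, img_h):
    # COCO [x, y, w, h] -> MMDet [x1, y1, x2, y2], clamped to the image bounds
    x, y, w, h = bbox
    x1 = max(0, min(x, img_w))
    y1 = max(0, min(y, img_h))
    x2 = max(x1, min(x + w, img_w))
    y2 = max(y1, min(y + h, img_h))
    return [x1, y1, x2, y2]

print(coco_to_xyxy([10, 20, 30, 40], 100, 100))  # [10, 20, 40, 60]
```

Clamping x2/y2 against x1/y1 guarantees a non-negative width and height even for boxes that start outside the image.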
+
+    def parse_data_info(self, raw_data_info):
+        """Parse data info with enhanced metadata support."""
+        d = raw_data_info.copy()
+
+        # Debug logging for the first few images to verify path construction
+        if hasattr(self, '_debug_count'):
+            self._debug_count += 1
+        else:
+            self._debug_count = 1
+
+        if self._debug_count <= 3:
+            print(f"🔍 Path verification debug #{self._debug_count}:")
+            print(f"   - img_path from load_data_list: {d['img_path']}")
+            print(f"   - data_root: {getattr(self, 'data_root', 'None')}")
+            full_path = osp.join(self.data_root, d['img_path']) if hasattr(self, 'data_root') else d['img_path']
+            print(f"   - Full absolute path: {full_path}")
+            print(f"   - Path exists: {osp.exists(full_path)}")
+
+        # Image dimensions
+        img_h, img_w = d['height'], d['width']
+
+        # Class names for class-specific filtering
+        class_names = self.METAINFO['classes']
+
+        # Filter configuration (guard against filter_cfg=None)
+        filter_cfg = self.filter_cfg or {}
+        min_size = filter_cfg.get('min_size', 1)
+        class_specific_min_sizes = filter_cfg.get('class_specific_min_sizes', {})
+
+        # Handle bboxes and labels from instances with enhanced filtering
+        bboxes, labels = [], []
+        filtered_count = 0
+        enlarged_count = 0
+        chart_type_filtered_count = 0
+
+        # Chart type for filtering
+        chart_type = d.get('chart_type', '').lower()
+        chart_mapping = self.CHART_TYPE_ELEMENT_MAPPING.get(chart_type, {})
+        allowed_data_elements = chart_mapping.get('allowed_data_elements', set())
+        forbidden_data_elements = chart_mapping.get('forbidden_data_elements', set())
+
+        for inst in d.get('instances', []):
+            bbox = inst['bbox']
+            label_id = inst['bbox_label']
+
+            # Class name for this label
+            class_name = class_names[label_id] if 0 <= label_id < len(class_names) else 'unknown'
+
+            # Chart-type specific filtering: skip forbidden data elements
+            if chart_type and class_name in forbidden_data_elements:
+                chart_type_filtered_count += 1
+                if self._debug_count <= 3 and chart_type_filtered_count <= 3:
+                    print(f"   🚫 Filtered {class_name} from {chart_type} chart (inappropriate data element)")
+                continue
+
+            # Chart-type specific validation: log allowed data elements
+            if chart_type and class_name in allowed_data_elements:
+                if self._debug_count <= 3:
+                    print(f"   ✅ Keeping {class_name} for {chart_type} chart (appropriate data element)")
+
+            # Validate and clamp the bbox
+            x1, y1, x2, y2 = bbox
+            x1 = max(0, min(x1, img_w))
+            y1 = max(0, min(y1, img_h))
+            x2 = max(x1, min(x2, img_w))
+            y2 = max(y1, min(y2, img_h))
+
+            # Skip invalid bboxes
+            if x2 <= x1 or y2 <= y1:
+                filtered_count += 1
+                continue
+
+            # Current bbox dimensions
+            bbox_w = x2 - x1
+            bbox_h = y2 - y1
+            bbox_min_dim = min(bbox_w, bbox_h)
+
+            # Class-specific minimum size
+            required_min_size = class_specific_min_sizes.get(class_name, min_size)
+
+            # If the bbox is smaller than required, enlarge it to meet the minimum size
+            if bbox_min_dim < required_min_size:
+                # Expansion needed on each side
+                expand_w = max(0, required_min_size - bbox_w) / 2
+                expand_h = max(0, required_min_size - bbox_h) / 2
+
+                # Expand while keeping the bbox within image bounds
+                new_x1 = max(0, x1 - expand_w)
+                new_y1 = max(0, y1 - expand_h)
+                new_x2 = min(img_w, x2 + expand_w)
+                new_y2 = min(img_h, y2 + expand_h)
+
+                x1, y1, x2, y2 = new_x1, new_y1, new_x2, new_y2
+                enlarged_count += 1
+
+                if self._debug_count <= 3 and enlarged_count <= 3:
+                    print(f"   📏 Enlarged {class_name} bbox: {bbox_w:.1f}x{bbox_h:.1f} -> {(x2-x1):.1f}x{(y2-y1):.1f}")
+
+            bboxes.append([x1, y1, x2, y2])
+            labels.append(label_id)
+
+        # Log filtering and enlargement statistics for the first few images
+        if self._debug_count <= 3:
+            print(f"   📊 Bbox processing: {len(bboxes)} kept, {filtered_count} filtered (invalid), "
+                  f"{chart_type_filtered_count} filtered (chart-type), {enlarged_count} enlarged")
+            if chart_type:
+                print(f"   📈 Chart type: {chart_type} | Allowed data elements: {allowed_data_elements}")
+                if forbidden_data_elements:
+                    print(f"   🚫 Forbidden data elements for {chart_type}: {forbidden_data_elements}")
+
+        # Convert to arrays
+        d['gt_bboxes'] = np.array(bboxes, dtype=np.float32) if bboxes else np.zeros((0, 4), dtype=np.float32)
+        d['gt_bboxes_labels'] = np.array(labels, dtype=np.int64) if labels else np.zeros((0,), dtype=np.int64)
+
+        # Enhanced scale factor calculation using data_series_stats
+        d['scale_factor'] = np.array([1.0, 1.0], dtype=np.float32)
+
+        data_series_stats = d.get('data_series_stats', {})
+        plot_bb = d.get('plot_bb', {})
+
+        if data_series_stats and plot_bb and all(k in plot_bb for k in ['width', 'height']):
+            x_range = data_series_stats.get('x_range')
+            y_range = data_series_stats.get('y_range')
+
+            if x_range and len(x_range) == 2 and x_range[1] != x_range[0]:
+                d['scale_factor'][0] = plot_bb['width'] / (x_range[1] - x_range[0])
+            if y_range and len(y_range) == 2 and y_range[1] != y_range[0]:
+                d['scale_factor'][1] = -plot_bb['height'] / (y_range[1] - y_range[0])
+
+        # Required MMDet fields
+        d.update({
+            'img_shape': (img_h, img_w, 3),
+            'ori_shape': (img_h, img_w, 3),
+            'pad_shape': (img_h, img_w, 3),
+            'flip': False,
+            'flip_direction': None,
+            'img_fields': ['img'],
+            'bbox_fields': ['bbox'],
+        })
+
+        # Additional metadata for training
+        d['img_info'] = {
+            'height': img_h,
+            'width': img_w,
+            'img_shape': d['img_shape'],
+            'ori_shape': d['ori_shape'],
+            'pad_shape': d['pad_shape'],
+            'scale_factor': d['scale_factor'].copy(),
+            'flip': d['flip'],
+            'flip_direction': d['flip_direction'],
+            # Enhanced metadata
+            'chart_type': d.get('chart_type', ''),
+            'num_data_points': data_series_stats.get('num_data_points', 0),
+            'element_counts': d.get('element_counts', {})
+        }
+
+        return d
+
+
+def print_missing_image_summary():
+    """Print a summary of missing images."""
+    count = RobustLoadImageFromFile.get_missing_count()
+    if count > 0:
+        print(f"📊 MISSING IMAGES SUMMARY: {count} images were not found and replaced with dummy images")
+    else:
+        print("✅ All images loaded successfully!")
+
+
+def print_dataset_summary():
+    """Print a summary of the dataset configuration."""
+    print("📊 ENHANCED CHART DATASET SUMMARY:")
+    print("   - 21 categories supported for comprehensive chart element detection")
+    print("   - Auto-detects the best annotation files (enriched_with_info > enriched > regular)")
+    print("   - Enhanced metadata: chart_type, data_series_stats, element_counts, axes_info")
+    print("   - Robust image loading with fallback to dummy images")
+    print("   - Multiple annotations per image (not just plot areas)")
+
+
+print("✅ [PLUGIN] Enhanced ChartDataset + transforms registered!")
+print_dataset_summary()
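The chart-type filtering inside parse_data_info reduces to a set-membership test against CHART_TYPE_ELEMENT_MAPPING; a minimal sketch with an abbreviated mapping (only two chart types shown for illustration):

```python
MAPPING = {
    'line': {'forbidden_data_elements': {'data-point', 'data-bar', 'data-area'}},
    'scatter': {'forbidden_data_elements': {'data-line', 'data-bar', 'data-area'}},
}

def keep_instance(chart_type, class_name):
    # An instance is dropped only when its class is forbidden for the chart
    # type; unknown chart types fall through to an empty mapping and keep everything.
    mapping = MAPPING.get(chart_type.lower(), {})
    return class_name not in mapping.get('forbidden_data_elements', set())

print(keep_instance('line', 'data-point'))  # False
```

Non-data classes (titles, axes, legends) never appear in a forbidden set, so they survive regardless of chart type.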
custom_models/custom_faster_rcnn_with_meta.py ADDED
@@ -0,0 +1,166 @@
+# custom_faster_rcnn_with_meta.py - Faster R-CNN with coordinate handling for chart data
+import torch
+import torch.nn as nn
+
+from mmdet.models.detectors.faster_rcnn import FasterRCNN
+from mmdet.registry import MODELS
+
+
+@MODELS.register_module()
+class CustomFasterRCNNWithMeta(FasterRCNN):
+    """Faster R-CNN with coordinate standardization for chart detection.
+
+    NOTE: forward_train/simple_test follow the MMDet 2.x detector API; under
+    the MMDet 3.x registry imported above, the equivalent hooks are
+    loss()/predict().
+    """
+
+    def __init__(self,
+                 *args,
+                 coordinate_standardization=None,
+                 data_points_count_head=None,
+                 **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # Store coordinate standardization settings
+        self.coord_std = coordinate_standardization or {}
+
+        # Initialize the data points count head
+        if data_points_count_head is not None:
+            self.data_points_count_head = MODELS.build(data_points_count_head)
+        else:
+            # Default simple regression head for the data point count
+            self.data_points_count_head = nn.Sequential(
+                nn.Linear(2048, 512),  # assumes ResNet-50 C5 features (2048 channels)
+                nn.ReLU(),
+                nn.Dropout(0.1),
+                nn.Linear(512, 1)  # single output for the count
+            )
+
+        print("🎯 CustomFasterRCNNWithMeta initialized with coordinate handling:")
+        print(f"   - Enabled: {self.coord_std.get('enabled', False)}")
+        print(f"   - Origin: {self.coord_std.get('origin', 'top_left')}")
+        print(f"   - Normalize: {self.coord_std.get('normalize', False)}")
+        print("   - Data points count prediction: Enabled")
+
+    def transform_coordinates(self, coords, img_shape, plot_bb=None, axes_info=None):
+        """Transform coordinates based on the standardization settings."""
+        if not self.coord_std.get('enabled', False):
+            return coords
+
+        # Image dimensions
+        img_height, img_width = img_shape[-2:]
+
+        # Convert to a tensor if not already
+        if not isinstance(coords, torch.Tensor):
+            coords = torch.tensor(coords, device=img_shape.device if hasattr(img_shape, 'device') else 'cpu')
+
+        # Ensure coords is 2D
+        if coords.dim() == 1:
+            coords = coords.view(-1, 2)
+
+        # Normalize coordinates if needed
+        if self.coord_std.get('normalize', True):
+            coords = coords / torch.tensor([img_width, img_height], device=coords.device)
+
+        # Handle bottom-left to top-left origin conversion
+        if self.coord_std.get('origin', 'bottom_left') == 'bottom_left':
+            # Flip y-coordinates to convert from a bottom-left to a top-left origin
+            coords[:, 1] = 1.0 - coords[:, 1]
+
+        # Convert back to pixel coordinates
+        if self.coord_std.get('normalize', True):
+            coords = coords * torch.tensor([img_width, img_height], device=coords.device)
+
+        return coords
+
+    def forward_train(self,
+                      img,
+                      img_metas,
+                      gt_bboxes,
+                      gt_labels,
+                      gt_bboxes_ignore=None,
+                      **kwargs):
+        """Forward function during training with coordinate transformation."""
+
+        # Transform ground-truth bboxes if coordinate standardization is enabled
+        if self.coord_std.get('enabled', False) and gt_bboxes is not None:
+            transformed_gt_bboxes = []
+            for i, bboxes in enumerate(gt_bboxes):
+                if len(bboxes) > 0:
+                    # MMDet uses the [x1, y1, x2, y2] format; transform the box centers
+                    centers = torch.stack([
+                        (bboxes[:, 0] + bboxes[:, 2]) / 2,  # center_x
+                        (bboxes[:, 1] + bboxes[:, 3]) / 2   # center_y
+                    ], dim=1)
+
+                    img_shape = img.shape if hasattr(img, 'shape') else (img_metas[i]['img_shape'][0], img_metas[i]['img_shape'][1])
+                    transformed_centers = self.transform_coordinates(
+                        centers, img_shape,
+                        plot_bb=img_metas[i].get('plot_bb'),
+                        axes_info=img_metas[i].get('axes_info')
+                    )
+
+                    # Reconstruct bboxes around the transformed centers
+                    widths = bboxes[:, 2] - bboxes[:, 0]
+                    heights = bboxes[:, 3] - bboxes[:, 1]
+
+                    transformed_bboxes = torch.stack([
+                        transformed_centers[:, 0] - widths / 2,   # x1
+                        transformed_centers[:, 1] - heights / 2,  # y1
+                        transformed_centers[:, 0] + widths / 2,   # x2
+                        transformed_centers[:, 1] + heights / 2   # y2
+                    ], dim=1)
+
+                    transformed_gt_bboxes.append(transformed_bboxes)
+                else:
+                    transformed_gt_bboxes.append(bboxes)
+
+            gt_bboxes = transformed_gt_bboxes
+
+        # Call the parent forward_train with transformed coordinates to get losses
+        losses = super().forward_train(
+            img, img_metas, gt_bboxes, gt_labels, gt_bboxes_ignore, **kwargs)
+
+        # Extract features for data point count prediction
+        x = self.extract_feat(img)
+        global_feat = x[-1].mean(dim=[2, 3])  # global average pooling
+
+        # Ground-truth data point counts from img_metas
+        gt_data_point_counts = []
+        for img_meta in img_metas:
+            gt_data_point_counts.append(img_meta.get('img_info', {}).get('num_data_points', 0))
+        gt_data_point_counts = torch.tensor(gt_data_point_counts, dtype=torch.float32, device=global_feat.device)
+
+        # Predict data point counts and compute the loss
+        pred_data_point_counts = self.data_points_count_head(global_feat).squeeze(-1)
+        losses['data_points_count_loss'] = nn.MSELoss()(pred_data_point_counts, gt_data_point_counts)
+
+        return losses
+
+    def simple_test(self, img, img_metas, proposals=None, rescale=False):
+        """Simple test function with coordinate inverse transformation."""
+        # Predictions from the parent
+        results = super().simple_test(img, img_metas, proposals, rescale)
+
+        # Extract features for data point count prediction
+        x = self.extract_feat(img)
+        global_feat = x[-1].mean(dim=[2, 3])  # global average pooling
+
+        # Predict data point counts
+        pred_data_point_counts = self.data_points_count_head(global_feat).squeeze(-1)
+
+        # Attach data point count predictions to the results
+        if results is not None:
+            for i, result in enumerate(results):
+                if hasattr(result, 'pred_instances'):
+                    result.pred_instances.predicted_data_points = pred_data_point_counts[i].item()
+                elif hasattr(result, 'bboxes'):
+                    # For older MMDet versions, add as an extra attribute
+                    result.predicted_data_points = pred_data_point_counts[i].item()
+
+        # Note: no inverse transform is applied at test time; the coordinate
+        # system is kept consistent with training.
+
+        return results
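The normalize / flip / denormalize sequence in transform_coordinates amounts to mirroring y about the image midline; the same arithmetic as a torch-free sketch (hypothetical helper, for illustration only):

```python
def flip_y_origin(points, img_w, img_h):
    # Convert a bottom-left-origin y-coordinate to a top-left origin:
    # normalize, flip (y' = 1 - y), then scale back to pixels.
    out = []
    for x, y in points:
        y_norm = y / img_h
        out.append((x, (1.0 - y_norm) * img_h))
    return out

print(flip_y_origin([(50, 20)], 100, 100))  # y=20 from the bottom becomes y=80 from the top
```

x is untouched because only the vertical axis direction differs between the two conventions.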
custom_models/custom_heads.py ADDED
@@ -0,0 +1,267 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from mmdet.registry import MODELS
5
+
6
+ @MODELS.register_module()
7
+ class FCHead(nn.Module):
8
+ """Enhanced fully connected head for classification tasks with attention."""
9
+
10
+ def __init__(self, in_channels, num_classes, loss=None):
11
+ super().__init__()
12
+ self.attention = nn.MultiheadAttention(in_channels, num_heads=8)
13
+ self.fc1 = nn.Linear(in_channels, in_channels // 2)
14
+ self.fc2 = nn.Linear(in_channels // 2, num_classes)
15
+ self.loss = loss
16
+
17
+ def forward(self, x):
18
+ # Apply self-attention
19
+ x = self.attention(x, x, x)[0]
20
+ # Apply MLP
21
+ x = F.relu(self.fc1(x))
22
+ return self.fc2(x)
23
+
24
+ @MODELS.register_module()
25
+ class RegHead(nn.Module):
26
+ """Enhanced regression head for coordinate prediction with distance-based loss."""
27
+
28
+ def __init__(self, in_channels, out_dims, max_points=None, loss=None, attention=False, use_axis_info=False):
29
+ super().__init__()
30
+ self.fc = nn.Linear(in_channels, out_dims)
31
+ self.max_points = max_points
32
+ self.loss = loss
33
+ self.attention = attention
34
+ self.use_axis_info = use_axis_info
35
+
36
+ if attention:
37
+ self.attention_layer = nn.MultiheadAttention(in_channels, num_heads=8)
38
+
39
+ # Add axis orientation detection
40
+ if use_axis_info:
41
+ self.axis_orientation = nn.Linear(in_channels, 2) # 2 for x/y axis orientation
42
+
43
+ def compute_distance_loss(self, pred_points, gt_points):
44
+ """Compute distance-based loss between predicted and ground truth points."""
45
+ # Ensure points are in the same format
46
+ if pred_points.dim() == 2:
47
+ pred_points = pred_points.unsqueeze(0)
48
+ if gt_points.dim() == 2:
49
+ gt_points = gt_points.unsqueeze(0)
50
+
51
+ # Compute pairwise distances
52
+ dist = torch.cdist(pred_points, gt_points)
53
+
54
+ # Get minimum distance for each point
55
+ min_dist, _ = torch.min(dist, dim=2)
56
+
57
+ # Compute loss (using smooth L1 loss for robustness)
58
+ return F.smooth_l1_loss(min_dist, torch.zeros_like(min_dist))
59
+
60
+ def forward(self, x):
61
+ if self.attention:
62
+ x = self.attention_layer(x, x, x)[0]
63
+
64
+ # Get base predictions
65
+ pred = self.fc(x)
66
+
67
+ # If using axis info, also predict axis orientation
68
+ if self.use_axis_info:
69
+ axis_orientation = self.axis_orientation(x)
70
+ return pred, axis_orientation
71
+
72
+ return pred
73
+
74
+ class CoordinateTransformer:
75
+ """Helper class to transform coordinates between different spaces."""
76
+
77
+ @staticmethod
78
+ def to_axis_relative(points, axis_info):
79
+ """Transform points to be relative to axis coordinates.
80
+
81
+ Args:
82
+ points (torch.Tensor): Points in image coordinates (N, 2)
83
+ axis_info (torch.Tensor): Axis information [x_min, x_max, y_min, y_max, x_origin, y_origin, x_scale, y_scale]
84
+ """
85
+ # Extract axis information
86
+ x_min, x_max, y_min, y_max, x_origin, y_origin, x_scale, y_scale = axis_info.unbind(1)
87
+
88
+ # Normalize to [0, 1] range
89
+ x_norm = (points[..., 0] - x_min) / (x_max - x_min)
90
+ y_norm = (points[..., 1] - y_min) / (y_max - y_min)
91
+
92
+ # Scale to axis units
93
+ x_axis = x_norm * x_scale + x_origin
94
+ y_axis = y_norm * y_scale + y_origin
95
+
96
+ return torch.stack([x_axis, y_axis], dim=-1)
97
+
98
+ @staticmethod
99
+ def to_image_coordinates(points, axis_info):
100
+ """Transform points from axis coordinates to image coordinates."""
101
+ # Extract axis information
102
+ x_min, x_max, y_min, y_max, x_origin, y_origin, x_scale, y_scale = axis_info.unbind(1)
103
+
104
+ # Convert from axis units to normalized coordinates
105
+ x_norm = (points[..., 0] - x_origin) / x_scale
106
+ y_norm = (points[..., 1] - y_origin) / y_scale
107
+
108
+ # Convert to image coordinates
109
+ x_img = x_norm * (x_max - x_min) + x_min
110
+ y_img = y_norm * (y_max - y_min) + y_min
111
+
112
+ return torch.stack([x_img, y_img], dim=-1)
113
+
114
+ @MODELS.register_module()
115
+ class DataSeriesHead(nn.Module):
116
+ """Specialized head for data series prediction with dual attention to coordinates and axis-relative positions."""
117
+
118
+ def __init__(self, in_channels, max_points=50, loss=None):
119
+ super().__init__()
120
+ self.max_points = max_points
121
+ self.loss = loss
122
+
123
+ # Feature extraction
124
+ self.fc1 = nn.Linear(in_channels, in_channels // 2)
125
+
126
+ # Separate branches for absolute and relative coordinates
127
+ self.absolute_branch = nn.Sequential(
128
+ nn.Linear(in_channels // 2, in_channels // 4),
129
+ nn.ReLU(),
130
+ nn.Linear(in_channels // 4, max_points * 2) # 2 coordinates per point
131
+ )
132
+
133
+ self.relative_branch = nn.Sequential(
134
+ nn.Linear(in_channels // 2, in_channels // 4),
135
+ nn.ReLU(),
136
+ nn.Linear(in_channels // 4, max_points * 2) # 2 coordinates per point
137
+ )
138
+
139
+ # Attention mechanisms
140
+ self.coord_attention = nn.MultiheadAttention(in_channels, num_heads=8)
141
+ self.axis_attention = nn.MultiheadAttention(in_channels, num_heads=8)
142
+ self.sequence_attention = nn.MultiheadAttention(in_channels, num_heads=8)
143
+
144
+ # Sequence-aware processing
145
+ self.sequence_encoder = nn.TransformerEncoder(
146
+ nn.TransformerEncoderLayer(
147
+ d_model=in_channels,
148
+ nhead=8,
149
+ dim_feedforward=in_channels * 4,
150
+ dropout=0.1
151
+ ),
152
+ num_layers=2
153
+ )
154
+
155
+ # Pattern recognition
156
+ self.pattern_recognizer = nn.Sequential(
157
+ nn.Linear(in_channels, in_channels // 2),
158
+ nn.ReLU(),
159
+ nn.Linear(in_channels // 2, 5) # 5 for different chart patterns
160
+ )
161
+
162
+ # Coordinate transformer
163
+ self.coord_transformer = CoordinateTransformer()
164
+
165
+ def check_monotonicity(self, points, chart_type):
166
+ """Check if points follow expected monotonicity based on chart type."""
167
+ if chart_type in ['line', 'scatter']:
168
+ # For line/scatter, check if points are generally increasing or decreasing
169
+ diffs = points[..., 1].diff()
170
+ return torch.all(diffs >= 0) or torch.all(diffs <= 0)
171
+ return True
172
+
173
+ def forward(self, x, axis_info=None, chart_type=None):
174
+ # Apply coordinate attention
175
+ coord_feat = self.coord_attention(x, x, x)[0]
176
+
177
+ # Apply axis attention if axis info is available
178
+ if axis_info is not None:
179
+ axis_feat = self.axis_attention(x, x, x)[0]
180
+ # Combine features
181
+ x = coord_feat + axis_feat
182
+ else:
183
+ x = coord_feat
184
+
185
+ # Apply sequence attention
186
+ seq_feat = self.sequence_attention(x, x, x)[0]
187
+ x = x + seq_feat
188
+
189
+ # Process through sequence encoder
190
+ x = self.sequence_encoder(x.unsqueeze(0)).squeeze(0)
191
+
192
+         # Extract base features
+         x = F.relu(self.fc1(x))
+
+         # Get predictions from both branches
+         absolute_points = self.absolute_branch(x)
+         relative_points = self.relative_branch(x)
+
+         # Reshape to (batch_size, max_points, 2)
+         absolute_points = absolute_points.view(-1, self.max_points, 2)
+         relative_points = relative_points.view(-1, self.max_points, 2)
+
+         # If axis information is provided, transform relative points
+         if axis_info is not None:
+             relative_points = self.coord_transformer.to_axis_relative(relative_points, axis_info)
+
+         # Get pattern prediction
+         pattern_logits = self.pattern_recognizer(x)
+
+         # Check monotonicity if chart type is provided
+         if chart_type is not None:
+             monotonicity = self.check_monotonicity(absolute_points, chart_type)
+         else:
+             monotonicity = None
+
+         return absolute_points, relative_points, pattern_logits, monotonicity
+
+     def compute_loss(self, pred_absolute, pred_relative, gt_absolute, gt_relative,
+                      pattern_logits, gt_pattern, axis_info=None, chart_type=None):
+         """Compute combined loss for both absolute and relative coordinates."""
+         # Ensure points are batched: (batch_size, max_points, 2)
+         if pred_absolute.dim() == 2:
+             pred_absolute = pred_absolute.unsqueeze(0)
+         if pred_relative.dim() == 2:
+             pred_relative = pred_relative.unsqueeze(0)
+         if gt_absolute.dim() == 2:
+             gt_absolute = gt_absolute.unsqueeze(0)
+         if gt_relative.dim() == 2:
+             gt_relative = gt_relative.unsqueeze(0)
+
+         # Compute absolute coordinate loss
+         absolute_loss = self.compute_distance_loss(pred_absolute, gt_absolute)
+
+         # Compute relative coordinate loss
+         if axis_info is not None:
+             # Transform predicted absolute points to relative coordinates
+             pred_absolute_relative = self.coord_transformer.to_axis_relative(pred_absolute, axis_info)
+             relative_loss = self.compute_distance_loss(pred_absolute_relative, gt_relative)
+         else:
+             relative_loss = torch.tensor(0.0, device=pred_absolute.device)
+
+         # Compute pattern recognition loss
+         pattern_loss = F.cross_entropy(pattern_logits, gt_pattern)
+
+         # Add monotonicity penalty if applicable
+         if chart_type is not None:
+             monotonicity = self.check_monotonicity(pred_absolute, chart_type)
+             monotonicity_loss = F.binary_cross_entropy(
+                 monotonicity.float(), torch.ones_like(monotonicity.float()))
+         else:
+             monotonicity_loss = torch.tensor(0.0, device=pred_absolute.device)
+
+         # Combine losses with weights
+         total_loss = (absolute_loss + relative_loss +
+                       0.5 * pattern_loss + 0.3 * monotonicity_loss)
+
+         return total_loss
+
+     def compute_distance_loss(self, pred_points, gt_points):
+         """Compute distance-based loss between predicted and ground truth points."""
+         # Compute pairwise distances
+         dist = torch.cdist(pred_points, gt_points)
+
+         # Get minimum distance for each predicted point
+         min_dist, _ = torch.min(dist, dim=2)
+
+         # Compute loss (using smooth L1 loss for robustness)
+         return F.smooth_l1_loss(min_dist, torch.zeros_like(min_dist))
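The one-sided nearest-neighbour matching inside `compute_distance_loss` (each predicted point is pulled toward its closest ground-truth point) can be exercised without any framework; a stdlib-only sketch of the same rule, with hypothetical example points:

```python
import math

def chamfer_min_distances(pred_points, gt_points):
    """For each predicted (x, y) point, return the distance to its nearest
    ground-truth point -- the quantity the head's smooth-L1 term penalizes."""
    return [min(math.hypot(px - gx, py - gy) for gx, gy in gt_points)
            for px, py in pred_points]

pred = [(0.0, 0.0), (2.0, 2.0)]
gt = [(0.0, 1.0), (2.0, 2.0)]
print(chamfer_min_distances(pred, gt))  # [1.0, 0.0]
```

Note the matching is one-sided: predictions that all collapse onto a single ground-truth point also reach zero distance, which is one reason the combined loss adds the pattern and monotonicity terms.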
custom_models/flexible_load_annotations.py ADDED
@@ -0,0 +1,191 @@
+ import numpy as np
+ from typing import Dict, Optional
+ from mmcv.transforms.base import BaseTransform
+ from mmdet.registry import TRANSFORMS
+ from mmdet.datasets.transforms.loading import LoadAnnotations
+ import logging
+ from mmdet.structures.mask import BitmapMasks
+
+ logger = logging.getLogger(__name__)
+
+ @TRANSFORMS.register_module()
+ class FlexibleLoadAnnotations(LoadAnnotations):
+     """Flexible annotation loader that handles mixed mask/bbox datasets."""
+
+     def __init__(self,
+                  with_bbox: bool = True,
+                  with_mask: bool = True,
+                  with_seg: bool = False,
+                  poly2mask: bool = True,
+                  **kwargs):
+         super().__init__(
+             with_bbox=with_bbox,
+             with_mask=with_mask,
+             with_seg=with_seg,
+             poly2mask=poly2mask,
+             **kwargs
+         )
+         self.mask_stats = {'total': 0, 'with_masks': 0, 'without_masks': 0}
+
+     def _load_masks(self, results: dict) -> dict:
+         """Load mask annotations from COCO format instances."""
+         if not self.with_mask or not isinstance(results, dict):
+             return results
+
+         # Check for ann_info format (what the COCO dataset actually provides)
+         ann_info = results.get('ann_info')
+         if isinstance(ann_info, dict):
+             # Check if segmentation is in ann_info
+             if 'segmentation' in ann_info:
+                 segmentation = ann_info['segmentation']
+                 if segmentation and isinstance(segmentation, list) and len(segmentation) > 0:
+                     # Convert to mask format; the parent modifies results in place
+                     ann_info['masks'] = segmentation
+                     super()._load_masks(results)
+                     return results
+
+             # Check for polygon data in ann_info
+             if 'polygon' in ann_info:
+                 polygon = ann_info['polygon']
+                 if polygon and isinstance(polygon, dict):
+                     try:
+                         # Convert polygon to COCO segmentation format
+                         coords = []
+                         for j in range(4):  # Assuming 4-point polygons
+                             x_key = f'x{j}'
+                             y_key = f'y{j}'
+                             if x_key in polygon and y_key in polygon:
+                                 coords.extend([polygon[x_key], polygon[y_key]])
+
+                         if len(coords) >= 6:  # Need at least 3 points (6 coordinates)
+                             # COCO format: [x1, y1, x2, y2, x3, y3, ...]
+                             segmentation = [coords]
+                             ann_info['segmentation'] = segmentation
+                             ann_info['masks'] = segmentation
+                             super()._load_masks(results)
+                             return results
+                     except Exception as e:
+                         logger.debug(f"Polygon conversion failed: {e}")
+
+         # Handle COCO format: instances with segmentation
+         instances = results.get('instances')
+         if isinstance(instances, list):
+             # Process ALL instances - keep both with and without masks
+             valid_instances = []
+
+             for instance in instances:
+                 self.mask_stats['total'] += 1
+
+                 # The COCO dataset stores segmentation in the 'mask' field
+                 segmentation = instance.get('mask') or instance.get('segmentation')
+                 if segmentation and isinstance(segmentation, list) and len(segmentation) > 0:
+                     # Handle nested list format: [[x1, y1, x2, y2, ...]]
+                     if isinstance(segmentation[0], list):
+                         # Nested format - check if the inner list has enough coordinates
+                         inner_seg = segmentation[0]
+                         if len(inner_seg) >= 6:  # Need at least 3 points (6 coordinates)
+                             instance['mask'] = segmentation  # Keep nested format for the parent
+                             valid_instances.append(instance)
+                             self.mask_stats['with_masks'] += 1
+                         else:
+                             # Keep instance for bbox training even without a valid mask
+                             instance['mask'] = []
+                             valid_instances.append(instance)
+                             self.mask_stats['without_masks'] += 1
+                     else:
+                         # Flat format - already correct
+                         instance['mask'] = segmentation
+                         valid_instances.append(instance)
+                         self.mask_stats['with_masks'] += 1
+                 else:
+                     # Check for polygon data and convert to segmentation
+                     polygon = instance.get('polygon')
+                     if polygon and isinstance(polygon, dict):
+                         try:
+                             # Convert polygon to COCO segmentation format
+                             coords = []
+                             for j in range(4):  # Assuming 4-point polygons
+                                 x_key = f'x{j}'
+                                 y_key = f'y{j}'
+                                 if x_key in polygon and y_key in polygon:
+                                     coords.extend([polygon[x_key], polygon[y_key]])
+
+                             if len(coords) >= 6:  # Need at least 3 points (6 coordinates)
+                                 # COCO format: [x1, y1, x2, y2, x3, y3, ...]
+                                 segmentation = [coords]
+                                 instance['segmentation'] = segmentation
+                                 instance['mask'] = segmentation
+                                 valid_instances.append(instance)
+                                 self.mask_stats['with_masks'] += 1
+                             else:
+                                 # Keep instance for bbox training; an empty mask
+                                 # field prevents a KeyError in the parent class
+                                 instance['mask'] = []
+                                 valid_instances.append(instance)
+                                 self.mask_stats['without_masks'] += 1
+                         except Exception:
+                             # Keep instance for bbox training even if conversion fails
+                             instance['mask'] = []
+                             valid_instances.append(instance)
+                             self.mask_stats['without_masks'] += 1
+                     else:
+                         # Keep instance for bbox training even without segmentation
+                         instance['mask'] = []
+                         valid_instances.append(instance)
+                         self.mask_stats['without_masks'] += 1
+
+             # Update results with the processed instances
+             results['instances'] = valid_instances
+
+             if valid_instances:
+                 super()._load_masks(results)  # Parent modifies results in place
+                 return results
+             else:
+                 # No valid masks, create an empty mask structure
+                 h, w = results.get('img_shape', (0, 0))
+                 results['gt_masks'] = BitmapMasks([], h, w)
+                 results['gt_ignore_flags'] = np.array([], dtype=bool)
+                 return results
+
+         # Check for direct segmentation in results
+         if 'segmentation' in results:
+             segmentation = results['segmentation']
+             if segmentation and isinstance(segmentation, list) and len(segmentation) > 0:
+                 results['masks'] = segmentation
+                 super()._load_masks(results)
+                 return results
+
+         return results
+
+     def transform(self, results: dict) -> dict:
+         """Transform function to load annotations."""
+         # Ensure we always return a dict
+         if not isinstance(results, dict):
+             logger.error(f"Expected dict, got {type(results)}")
+             return {}
+
+         # Call the parent transform to handle bbox loading
+         results = super().transform(results)
+
+         # Handle mask loading with our custom logic
+         results = self._load_masks(results)
+
+         # Periodic logging
+         total = self.mask_stats['total']
+         if total > 0 and total % 1000 == 0:
+             logger.info(
+                 f"Mask stats - total: {total}, "
+                 f"with_masks: {self.mask_stats['with_masks']}, "
+                 f"without_masks: {self.mask_stats['without_masks']}")
+
+         return results
+
+     def __repr__(self) -> str:
+         """String representation."""
+         return (f'{self.__class__.__name__}('
+                 f'with_bbox={self.with_bbox}, '
+                 f'with_mask={self.with_mask}, '
+                 f'with_seg={self.with_seg}, '
+                 f'poly2mask={self.poly2mask})')
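The polygon-to-COCO conversion that `FlexibleLoadAnnotations` applies in both branches is small enough to exercise on its own; a stdlib-only sketch of the same rule (the `x0..y3` key layout follows the loader above):

```python
def polygon_to_segmentation(polygon, num_points=4):
    """Flatten {'x0': ..., 'y0': ..., ...} into COCO [[x1, y1, x2, y2, ...]];
    return None when fewer than 3 points (6 coordinates) are present."""
    coords = []
    for j in range(num_points):
        if f'x{j}' in polygon and f'y{j}' in polygon:
            coords.extend([polygon[f'x{j}'], polygon[f'y{j}']])
    return [coords] if len(coords) >= 6 else None

box = {'x0': 0, 'y0': 0, 'x1': 10, 'y1': 0, 'x2': 10, 'y2': 5, 'x3': 0, 'y3': 5}
print(polygon_to_segmentation(box))            # [[0, 0, 10, 0, 10, 5, 0, 5]]
print(polygon_to_segmentation({'x0': 1, 'y0': 1}))  # None (only one point)
```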
custom_models/mask_filter.py ADDED
@@ -0,0 +1,48 @@
+ import numpy as np
+ from mmcv.transforms.base import BaseTransform
+ from mmdet.registry import TRANSFORMS
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+ @TRANSFORMS.register_module()
+ class MaskFilter(BaseTransform):
+     """Filter out images with no valid masks during training.
+
+     This transform checks whether the image carries any valid masks and
+     returns None when none are found, which causes the sample to be skipped.
+     """
+
+     def __init__(self, min_masks=1):
+         self.min_masks = min_masks
+
+     def transform(self, results):
+         """Filter results based on mask availability.
+
+         Args:
+             results (dict): Result dict from the dataset.
+
+         Returns:
+             dict or None: ``results`` if enough valid masks are found, else None.
+         """
+         gt_masks = results.get('gt_masks')
+
+         if gt_masks is None:
+             logger.warning("MaskFilter: No gt_masks found, skipping image")
+             return None
+
+         # Count valid masks (BitmapMasks exposes .masks, PolygonMasks .polygons)
+         if hasattr(gt_masks, 'masks'):
+             num_masks = len(gt_masks.masks)
+         elif hasattr(gt_masks, 'polygons'):
+             num_masks = len(gt_masks.polygons)
+         else:
+             num_masks = 0
+
+         if num_masks < self.min_masks:
+             logger.info(f"MaskFilter: Only {num_masks} masks found (min: {self.min_masks}), skipping image")
+             return None
+
+         logger.debug(f"MaskFilter: {num_masks} masks found, keeping image for training")
+         return results
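In an MMDetection config, the filter would typically sit right after annotation loading; a hypothetical pipeline fragment (the resize scale and the later steps are illustrative placeholders, and the type strings resolve through mmdet's TRANSFORMS registry at runtime):

```python
# Hypothetical train pipeline using the custom transforms above.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='FlexibleLoadAnnotations', with_bbox=True, with_mask=True),
    dict(type='MaskFilter', min_masks=1),   # drop samples with no usable masks
    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
    dict(type='PackDetInputs'),
]

assert [step['type'] for step in train_pipeline][:3] == [
    'LoadImageFromFile', 'FlexibleLoadAnnotations', 'MaskFilter']
```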
custom_models/nan_recovery_hook.py ADDED
@@ -0,0 +1,181 @@
+ # nan_recovery_hook.py - Graceful NaN loss recovery for Cascade R-CNN
+ import math
+ from typing import Optional, Dict, Any
+
+ import torch
+ from mmengine.hooks import Hook
+ from mmengine.runner import Runner
+ from mmdet.registry import HOOKS
+
+
+ @HOOKS.register_module()
+ class NanRecoveryHook(Hook):
+     """Hook that handles NaN losses gracefully without crashing training.
+
+     This hook detects NaN losses and handles them by:
+     1. Replacing NaN losses with the last valid loss value
+     2. Skipping gradient updates for that iteration (the replacement is detached)
+     3. Logging the recovery for monitoring
+     4. Allowing training to continue normally
+     """
+
+     def __init__(self,
+                  fallback_loss: float = 0.5,
+                  max_consecutive_nans: int = 100,  # increased from 10
+                  log_interval: int = 50):  # log less frequently
+         self.fallback_loss = fallback_loss
+         self.max_consecutive_nans = max_consecutive_nans
+         self.log_interval = log_interval
+
+         # State tracking
+         self.last_valid_loss = fallback_loss
+         self.consecutive_nans = 0
+         self.total_nans = 0
+         self.nan_iterations = []
+
+     def before_train_iter(self,
+                           runner: Runner,
+                           batch_idx: int,
+                           data_batch: Optional[dict] = None) -> None:
+         """Reset any state before the training iteration."""
+         pass
+
+     def after_train_iter(self,
+                          runner: Runner,
+                          batch_idx: int,
+                          data_batch: Optional[dict] = None,
+                          outputs: Optional[Dict[str, Any]] = None) -> None:
+         """Handle NaN losses after the training iteration."""
+         if outputs is None:
+             return
+
+         # Check ALL loss components for NaN, not just the main loss
+         has_nan = False
+
+         # Check main loss
+         total_loss = outputs.get('loss')
+         if total_loss is not None and (torch.isnan(total_loss) or torch.isinf(total_loss)):
+             has_nan = True
+
+         # Check all individual loss components
+         for key, value in outputs.items():
+             if isinstance(value, torch.Tensor) and 'loss' in key.lower():
+                 if torch.isnan(value) or torch.isinf(value):
+                     has_nan = True
+                     break
+
+         if has_nan:
+             self._handle_nan_loss(runner, batch_idx, outputs)
+         else:
+             # Valid loss - update tracking
+             if total_loss is not None:
+                 self.last_valid_loss = float(total_loss.item())
+             if self.consecutive_nans > 0:
+                 runner.logger.info(
+                     f"πŸŽ‰ Loss recovered after {self.consecutive_nans} NaN iterations")
+             self.consecutive_nans = 0
+
+     def _handle_nan_loss(self, runner: Runner, batch_idx: int, outputs: Dict[str, Any]) -> None:
+         """Handle a NaN loss by replacing it with a detached fallback and managing state."""
+         self.consecutive_nans += 1
+         self.total_nans += 1
+         self.nan_iterations.append(batch_idx)
+
+         # Try to get the last good state from SkipBadSamplesHook if available
+         last_good_iteration = batch_idx
+         last_good_loss = self.last_valid_loss
+
+         for hook in runner.hooks:
+             if hasattr(hook, 'last_good_iteration') and hasattr(hook, 'last_good_loss'):
+                 if hook.last_good_loss is not None:
+                     last_good_iteration = hook.last_good_iteration
+                     last_good_loss = hook.last_good_loss
+                     break
+
+         # Replace the NaN loss with a detached fallback (no gradients = true no-op)
+         if 'loss' in outputs and outputs['loss'] is not None:
+             fallback_tensor = torch.tensor(
+                 last_good_loss,
+                 device=outputs['loss'].device,
+                 dtype=outputs['loss'].dtype
+                 # NOTE: no requires_grad=True - the replacement stays detached
+             )
+             outputs['loss'] = fallback_tensor
+
+         # Also fix individual loss components with detached tensors
+         self._fix_loss_components(outputs, last_good_loss)
+
+         # Log the recovery with state info
+         if self.consecutive_nans <= 5 or self.consecutive_nans % self.log_interval == 0:
+             runner.logger.warning(
+                 f"πŸ”„ NaN Recovery at iteration {batch_idx}: "
+                 f"Using last good loss {last_good_loss:.4f} from iteration {last_good_iteration}. "
+                 f"Consecutive NaNs: {self.consecutive_nans}, Total: {self.total_nans}"
+             )
+
+         # Reset training state if there are too many consecutive NaNs
+         if self.consecutive_nans >= self.max_consecutive_nans:
+             self._reset_nan_state(runner, last_good_iteration)
+
+     def _reset_nan_state(self, runner: Runner, last_good_iteration: int) -> None:
+         """Reset training state after too many consecutive NaNs."""
+         runner.logger.error(
+             f"πŸ”„ Too many consecutive NaN losses ({self.consecutive_nans}). "
+             f"Resetting to last good state from iteration {last_good_iteration}"
+         )
+
+         try:
+             # Clear model gradients
+             if hasattr(runner.model, 'zero_grad'):
+                 runner.model.zero_grad()
+
+             # Clear the CUDA cache
+             if torch.cuda.is_available():
+                 torch.cuda.empty_cache()
+
+             # Reset the consecutive counter
+             self.consecutive_nans = 0
+
+             runner.logger.info("βœ… NaN state reset. Resuming training...")
+
+         except Exception as e:
+             runner.logger.error(f"❌ Failed to reset NaN state: {e}")
+
+     def _fix_loss_components(self, outputs: Dict[str, Any],
+                              fallback_loss: Optional[float] = None) -> None:
+         """Fix ALL loss components with detached tensors (no gradients)."""
+         if fallback_loss is None:
+             fallback_loss = self.last_valid_loss
+
+         fallback_small = max(0.01, fallback_loss * 0.1)  # ensure a non-zero minimum
+
+         # Fix all tensors with 'loss' in the key name using detached tensors
+         for key, value in outputs.items():
+             if isinstance(value, torch.Tensor) and 'loss' in key.lower():
+                 if torch.isnan(value) or torch.isinf(value):
+                     # Create a detached replacement tensor (no gradients)
+                     replacement = torch.tensor(
+                         fallback_small,
+                         device=value.device,
+                         dtype=value.dtype
+                         # NOTE: no requires_grad=True - detached for a true no-op
+                     )
+                     outputs[key] = replacement
+                     print(f"   πŸ”§ Fixed {key}: {value.item():.4f} -> detached {fallback_small:.4f}")
+
+         # Also fix any scalar values that might be NaN
+         for key, value in list(outputs.items()):
+             if isinstance(value, (int, float)) and 'loss' in key.lower():
+                 if not math.isfinite(value):
+                     outputs[key] = fallback_small
+                     print(f"   πŸ”§ Fixed scalar {key}: {value} -> {fallback_small:.4f}")
+
+     def after_train_epoch(self, runner: Runner) -> None:
+         """Summary statistics after each epoch."""
+         if self.total_nans > 0:
+             runner.logger.info(
+                 f"πŸ“Š NaN Recovery Summary for Epoch: "
+                 f"{self.total_nans} NaN losses recovered. "
+                 f"Training continued successfully."
+             )
+
+         # Reset for the next epoch
+         self.consecutive_nans = 0
+         self.total_nans = 0
+         self.nan_iterations.clear()
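The core replacement rule (swap any non-finite loss entry for a known-good value, with a scaled-down floor for individual components) is framework-independent; a stdlib-only sketch, with hypothetical names and values:

```python
import math

def sanitize_losses(losses, last_valid, component_scale=0.1):
    """Return a copy of `losses` with non-finite entries replaced: the total
    'loss' gets last_valid, components get a scaled non-zero floor."""
    floor = max(0.01, last_valid * component_scale)
    return {key: (value if math.isfinite(value)
                  else (last_valid if key == 'loss' else floor))
            for key, value in losses.items()}

out = sanitize_losses(
    {'loss': float('nan'), 'loss_bbox': float('inf'), 'loss_cls': 0.2},
    last_valid=0.5)
print(out)  # {'loss': 0.5, 'loss_bbox': 0.05, 'loss_cls': 0.2}
```

Unlike the hook, this sketch ignores gradients entirely; the hook additionally keeps the replacements detached so the iteration becomes a no-op for the optimizer.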
custom_models/progressive_loss_hook.py ADDED
@@ -0,0 +1,286 @@
+ # progressive_loss_hook.py - Progressive Loss Switching Hook for Cascade R-CNN
+ import torch
+ from mmengine.hooks import Hook
+ from mmdet.registry import HOOKS
+ from mmdet.models.losses import SmoothL1Loss, GIoULoss, CIoULoss, DIoULoss
+
+ _LOSS_CLASSES = {'GIoULoss': GIoULoss, 'CIoULoss': CIoULoss, 'DIoULoss': DIoULoss}
+
+
+ def _build_target_loss(loss_type, loss_weight):
+     """Instantiate one of the supported IoU-family losses."""
+     if loss_type not in _LOSS_CLASSES:
+         raise ValueError(f"Unsupported target loss type: {loss_type}")
+     return _LOSS_CLASSES[loss_type](loss_weight=loss_weight)
+
+
+ @HOOKS.register_module()
+ class ProgressiveLossHook(Hook):
+     """Progressive Loss Switching Hook for Cascade R-CNN.
+
+     Starts with SmoothL1Loss for all stages, then switches stage 3 (the
+     final stage, index 2) to GIoU/CIoU/DIoU once the model has stabilized.
+
+     Args:
+         switch_epoch (int): Epoch at which to switch stage 3 from SmoothL1 to the target loss.
+         target_loss_type (str): Target loss for stage 3 ('GIoULoss', 'CIoULoss', or 'DIoULoss').
+         loss_weight (float): Loss weight for the new loss function.
+         warmup_epochs (int): Number of epochs to monitor after the switch.
+         monitor_stage_weights (bool): Whether to log the per-stage loss configuration.
+         nan_detection (bool): Whether to enable NaN detection and rollback.
+         max_nan_tolerance (int): Maximum consecutive NaN losses before rollback.
+     """
+
+     def __init__(self,
+                  switch_epoch=5,
+                  target_loss_type='GIoULoss',
+                  loss_weight=1.0,
+                  warmup_epochs=2,
+                  monitor_stage_weights=True,
+                  nan_detection=False,
+                  max_nan_tolerance=5):
+         super().__init__()
+         self.switch_epoch = switch_epoch
+         self.target_loss_type = target_loss_type
+         self.loss_weight = loss_weight
+         self.warmup_epochs = warmup_epochs
+         self.monitor_stage_weights = monitor_stage_weights
+         self.nan_detection = nan_detection
+         self.max_nan_tolerance = max_nan_tolerance
+         self.switched = False
+         self.original_loss = None
+         self.consecutive_nans = 0
+         self.rollback_performed = False
+
+     def before_train_epoch(self, runner):
+         """Check whether the loss function should be switched."""
+         current_epoch = runner.epoch
+
+         # Switch at the specified epoch
+         if current_epoch >= self.switch_epoch and not self.switched:
+             self._switch_stage3_loss(runner)
+             self.switched = True
+             runner.logger.info(
+                 f"Epoch {current_epoch}: Switched Stage 3 loss to {self.target_loss_type}")
+
+         # Monitor during the warmup period
+         elif self.switch_epoch <= current_epoch < self.switch_epoch + self.warmup_epochs:
+             if self.monitor_stage_weights:
+                 self._log_loss_info(runner, current_epoch)
+
+     def _switch_stage3_loss(self, runner):
+         """Switch the stage 3 bbox loss from SmoothL1 to the target loss."""
+         try:
+             # Navigate to the stage 3 bbox head (index 2), handling the DDP wrapper
+             model = runner.model
+             roi_head = model.module.roi_head if hasattr(model, 'module') else model.roi_head
+             stage3_head = roi_head.bbox_head[2]
+
+             # Store the original loss for comparison
+             self.original_loss = stage3_head.loss_bbox
+
+             # Create the new loss; IoU-family losses regress decoded boxes
+             new_loss = _build_target_loss(self.target_loss_type, self.loss_weight)
+             stage3_head.reg_decoded_bbox = True
+
+             # Log loss-specific benefits
+             if self.target_loss_type == 'CIoULoss':
+                 runner.logger.info("🎯 CIoU Loss Benefits for Data Points:")
+                 runner.logger.info("   β€’ Directly optimizes center point distance")
+                 runner.logger.info("   β€’ Enforces aspect ratio consistency (square-ish data points)")
+                 runner.logger.info("   β€’ Better convergence for small objects")
+                 runner.logger.info("   β€’ Most complete bounding box quality metric")
+             elif self.target_loss_type == 'DIoULoss':
+                 runner.logger.info("🎯 DIoU Loss Benefits for Data Points:")
+                 runner.logger.info("   β€’ Directly optimizes center point distance")
+                 runner.logger.info("   β€’ Better convergence for small objects")
+                 runner.logger.info("   β€’ More precise localization for data points")
+             elif self.target_loss_type == 'GIoULoss':
+                 runner.logger.info("🎯 GIoU Loss Benefits:")
+                 runner.logger.info("   β€’ Improved IoU-based optimization")
+                 runner.logger.info("   β€’ Better than standard IoU loss")
+
+             # Replace the loss function
+             stage3_head.loss_bbox = new_loss
+
+             runner.logger.info(
+                 f"Progressive Loss Switch: Stage 3 changed from "
+                 f"{type(self.original_loss).__name__} to {self.target_loss_type}")
+
+         except Exception as e:
+             runner.logger.error(f"Failed to switch loss function: {e}")
+
+     def _log_loss_info(self, runner, epoch):
+         """Log the current per-stage loss configuration."""
+         try:
+             model = runner.model
+             roi_head = model.module.roi_head if hasattr(model, 'module') else model.roi_head
+             bbox_heads = roi_head.bbox_head
+
+             loss_info = {}
+             for i, head in enumerate(bbox_heads):
+                 loss_type = type(head.loss_bbox).__name__
+                 loss_weight = head.loss_bbox.loss_weight
+                 loss_info[f'stage_{i + 1}'] = f"{loss_type}(w={loss_weight})"
+
+             runner.logger.info(f"Epoch {epoch} Loss Configuration: {loss_info}")
+
+         except Exception as e:
+             runner.logger.warning(f"Could not log loss info: {e}")
+
+     def after_train_iter(self, runner, batch_idx, data_batch=None, outputs=None):
+         """Monitor loss values during training and detect NaN."""
+         if not (self.switched and outputs is not None and isinstance(outputs, dict)):
+             return
+
+         # NaN detection and rollback logic
+         if self.nan_detection and not self.rollback_performed:
+             total_loss = outputs.get('loss', None)
+             if total_loss is not None and torch.isnan(total_loss):
+                 self.consecutive_nans += 1
+                 runner.logger.warning(
+                     f"🚨 NaN detected in total loss! Consecutive: "
+                     f"{self.consecutive_nans}/{self.max_nan_tolerance}")
+
+                 if self.consecutive_nans >= self.max_nan_tolerance:
+                     self._rollback_loss(runner)
+                     self.consecutive_nans = 0
+                     self.rollback_performed = True
+                     runner.logger.error(
+                         f"πŸ”„ EMERGENCY ROLLBACK: Switched back to SmoothL1Loss due to "
+                         f"{self.max_nan_tolerance} consecutive NaN losses")
+                     return
+             elif total_loss is not None and torch.isfinite(total_loss):
+                 # Reset the NaN counter on a successful iteration
+                 self.consecutive_nans = 0
+
+         # Log individual stage losses if available
+         log_vars = outputs.get('log_vars', {})
+         stage_losses = {k: v for k, v in log_vars.items()
+                         if 'loss_bbox' in k and isinstance(v, (int, float))}
+
+         if stage_losses and self.monitor_stage_weights:
+             # Log every 100 iterations to avoid spam
+             if runner.iter % 100 == 0:
+                 loss_summary = ", ".join(f"{k}: {v:.4f}" for k, v in stage_losses.items())
+                 runner.logger.info(f"Stage Losses - {loss_summary}")
+
+     def after_train_epoch(self, runner):
+         """Report epoch completion and NaN status."""
+         if self.nan_detection and self.switched:
+             if self.consecutive_nans > 0:
+                 runner.logger.warning(
+                     f"Epoch {runner.epoch} completed with {self.consecutive_nans} NaN occurrences")
+             else:
+                 runner.logger.info(
+                     f"Epoch {runner.epoch} completed successfully with {self.target_loss_type}")
+
+     def _rollback_loss(self, runner):
+         """Roll stage 3 back to SmoothL1Loss."""
+         try:
+             model = runner.model
+             roi_head = model.module.roi_head if hasattr(model, 'module') else model.roi_head
+             stage3_head = roi_head.bbox_head[2]
+
+             # Restore SmoothL1Loss, which regresses encoded deltas
+             stage3_head.loss_bbox = SmoothL1Loss(beta=1.0, loss_weight=1.0)
+             stage3_head.reg_decoded_bbox = False
+
+             runner.logger.info(
+                 f"βœ… Successfully rolled back Stage 3 from {self.target_loss_type} to SmoothL1Loss")
+
+         except Exception as e:
+             runner.logger.error(f"❌ Failed to rollback loss function: {e}")
+
+
+ @HOOKS.register_module()
+ class AdaptiveLossHook(Hook):
+     """Adaptive variant that switches based on training stability metrics.
+
+     Monitors IoU overlap quality and switches when the model is stable.
+     """
+
+     def __init__(self,
+                  min_epoch=3,
+                  min_avg_iou=0.4,
+                  target_loss_type='GIoULoss',
+                  loss_weight=1.0,
+                  check_interval=100):
+         super().__init__()
+         self.min_epoch = min_epoch
+         self.min_avg_iou = min_avg_iou
+         self.target_loss_type = target_loss_type
+         self.loss_weight = loss_weight
+         self.check_interval = check_interval
+         self.switched = False
+         self.iou_history = []
+
+     def after_train_iter(self, runner, batch_idx, data_batch=None, outputs=None):
+         """Monitor training stability through IoU metrics."""
+         if (not self.switched and
+                 runner.epoch >= self.min_epoch and
+                 runner.iter % self.check_interval == 0):
+
+             # Extract IoU information from outputs if available
+             if outputs and isinstance(outputs, dict):
+                 log_vars = outputs.get('log_vars', {})
+
+                 # Look for any IoU-related metrics
+                 iou_metrics = [v for k, v in log_vars.items()
+                                if 'iou' in k.lower() and isinstance(v, (int, float))]
+
+                 if iou_metrics:
+                     avg_iou = sum(iou_metrics) / len(iou_metrics)
+                     self.iou_history.append(avg_iou)
+
+                     # Keep only recent history
+                     if len(self.iou_history) > 10:
+                         self.iou_history.pop(0)
+
+                     # Check whether to switch
+                     if (len(self.iou_history) >= 5 and
+                             sum(self.iou_history[-5:]) / 5 >= self.min_avg_iou):
+
+                         self._switch_stage3_loss(runner)
+                         self.switched = True
+
+                         recent_iou = sum(self.iou_history[-5:]) / 5
+                         runner.logger.info(
+                             f"Adaptive switch at epoch {runner.epoch}, iter {runner.iter}: "
+                             f"avg IoU {recent_iou:.3f} >= {self.min_avg_iou}")
+
+     def _switch_stage3_loss(self, runner):
+         """Same switching logic as ProgressiveLossHook."""
+         try:
+             model = runner.model
+             roi_head = model.module.roi_head if hasattr(model, 'module') else model.roi_head
+             stage3_head = roi_head.bbox_head[2]
+
+             stage3_head.loss_bbox = _build_target_loss(self.target_loss_type, self.loss_weight)
+             stage3_head.reg_decoded_bbox = True
+
+             runner.logger.info(f"Adaptive Loss Switch: Stage 3 β†’ {self.target_loss_type}")
+
+         except Exception as e:
+             runner.logger.error(f"Failed to switch loss function: {e}")
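Wiring these hooks into a training config is a matter of listing them under `custom_hooks`; a hypothetical fragment (the epochs and thresholds are illustrative, and the type strings resolve through mmdet's HOOKS registry once `custom_models` is imported):

```python
# Hypothetical custom_hooks section combining NaN recovery with the
# progressive SmoothL1 -> CIoU switch sketched above.
custom_hooks = [
    dict(type='NanRecoveryHook', fallback_loss=0.5, max_consecutive_nans=100),
    dict(type='ProgressiveLossHook',
         switch_epoch=5,           # keep SmoothL1 for the first 5 epochs
         target_loss_type='CIoULoss',
         nan_detection=True,       # roll back to SmoothL1 on repeated NaNs
         max_nan_tolerance=5),
]

assert {h['type'] for h in custom_hooks} == {'NanRecoveryHook', 'ProgressiveLossHook'}
```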
custom_models/register.py ADDED
@@ -0,0 +1,29 @@
+ from mmdet.registry import MODELS, DATASETS, TRANSFORMS, HOOKS
+ from .custom_heads import FCHead, RegHead, DataSeriesHead
+ from .custom_cascade_with_meta import CustomCascadeWithMeta
+ from .custom_dataset import (
+     ChartDataset, RobustLoadImageFromFile, CreateDummyImg,
+     ClampBBoxes, SetScaleFactor, EnsureScaleFactor, SetInputs, CustomPackDetInputs
+ )
+ from .flexible_load_annotations import FlexibleLoadAnnotations
+ from .custom_hooks import (
+     ChartTypeDistributionHook, SkipInvalidLossHook, RuntimeErrorHook,
+     MissingImageReportHook, SkipBadSamplesHook, CompatibleCheckpointHook
+ )
+ from .nan_recovery_hook import NanRecoveryHook
+ from .progressive_loss_hook import ProgressiveLossHook, AdaptiveLossHook
+ from .square_fcn_mask_head import SquareFCNMaskHead
+
+ def register_all_modules():
+     """Report the enhanced modules; registration itself already happened via
+     the @*.register_module() decorators when this package was imported."""
+     print("βœ… Enhanced chart detection modules registered via decorators:")
+     print("   πŸ“Š Models: FCHead, RegHead, DataSeriesHead, CustomCascadeWithMeta, SquareFCNMaskHead")
+     print("   πŸ“ Datasets: ChartDataset (21 categories)")
+     print("   πŸ”„ Transforms: RobustLoadImageFromFile, CreateDummyImg, ClampBBoxes, SetScaleFactor, EnsureScaleFactor, SetInputs, CustomPackDetInputs, FlexibleLoadAnnotations")
+     print("   🎯 Hooks: ChartTypeDistributionHook, SkipInvalidLossHook, RuntimeErrorHook, MissingImageReportHook, SkipBadSamplesHook, NanRecoveryHook, CompatibleCheckpointHook, ProgressiveLossHook, AdaptiveLossHook")
+
+ # Run once at import time; the decorators handle the actual registration.
+ register_all_modules()
custom_models/square_fcn_mask_head.py ADDED
@@ -0,0 +1,170 @@
+ # Copyright (c) OpenMMLab. All rights reserved.
+ from typing import List, Union, Dict, Any
+
+ import torch
+ from torch import Tensor
+ from mmengine.config import ConfigDict
+ from mmengine.structures import InstanceData
+
+ from mmdet.models.roi_heads.mask_heads.fcn_mask_head import FCNMaskHead
+ from mmdet.registry import MODELS
+ from mmdet.structures.mask.mask_target import mask_target
+
+ from .square_mask_target import square_mask_target
+
+
+ @MODELS.register_module()
+ class SquareFCNMaskHead(FCNMaskHead):
+     """FCN mask head that forces square mask targets.
+
+     This head ensures that all mask targets are square regardless of the
+     original aspect ratio, to avoid tensor size mismatches during training.
+     """
+
+     def __init__(self, *args, **kwargs):
+         print("πŸ” SQUARE_FCN_MASK_HEAD: Initializing SquareFCNMaskHead")
+         print(f"πŸ” SQUARE_FCN_MASK_HEAD: args: {args}")
+         print(f"πŸ” SQUARE_FCN_MASK_HEAD: kwargs: {kwargs}")
+         super().__init__(*args, **kwargs)
+         print("πŸ” SQUARE_FCN_MASK_HEAD: SquareFCNMaskHead initialized successfully")
33
+ def forward(self, x: Tensor) -> Tensor:
34
+ """Forward features from the upstream network.
35
+
36
+ Args:
37
+ x (Tensor): Extract mask RoI features.
38
+
39
+ Returns:
40
+ Tensor: Predicted foreground masks.
41
+ """
42
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: Input shape: {x.shape}")
43
+
44
+ for i, conv in enumerate(self.convs):
45
+ x = conv(x)
46
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: After conv {i} shape: {x.shape}")
47
+
48
+ if self.upsample is not None:
49
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: Upsampling from {x.shape}")
50
+ x = self.upsample(x)
51
+ if self.upsample_method == 'deconv':
52
+ x = self.relu(x)
53
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: After upsample shape: {x.shape}")
54
+ else:
55
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: No upsampling, shape: {x.shape}")
56
+
57
+ mask_preds = self.conv_logits(x)
58
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: Final mask_preds shape: {mask_preds.shape}")
59
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: mask_preds device: {mask_preds.device}")
60
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: mask_preds dtype: {mask_preds.dtype}")
61
+
62
+ return mask_preds
63
+
64
+ def loss_and_target(self,
65
+ mask_preds: Tensor,
66
+ sampling_results: List[Any],
67
+ batch_gt_instances: List[InstanceData],
68
+ rcnn_train_cfg: Union[Dict[str, Any], ConfigDict]) -> dict:
69
+ """Calculate the loss based on the features extracted by the mask head.
70
+
71
+ Args:
72
+ mask_preds (Tensor): Predicted foreground masks, has shape
73
+ (num_pos, num_classes, mask_h, mask_w).
74
+ sampling_results (List[:obj:`SamplingResult`]): Assign results of
75
+ all images in a batch after sampling.
76
+ batch_gt_instances (List[:obj:`InstanceData`]): Batch of
77
+ gt_instance. It usually includes ``bboxes``, ``labels``,
78
+ and ``masks`` attributes.
79
+ rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN.
80
+
81
+ Returns:
82
+ dict: A dictionary of loss components.
83
+ """
84
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: loss_and_target called")
85
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: mask_preds shape: {mask_preds.shape}")
86
+
87
+ # Get mask targets
88
+ mask_targets = self.get_targets(sampling_results, batch_gt_instances,
89
+ rcnn_train_cfg)
90
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: mask_targets shape: {mask_targets.shape}")
91
+
92
+ # Get labels for positive proposals
93
+ pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results])
94
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: pos_labels shape: {pos_labels.shape}")
95
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: pos_labels: {pos_labels}")
96
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: pos_labels min: {pos_labels.min()}")
97
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: pos_labels max: {pos_labels.max()}")
98
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: num_classes: {self.num_classes}")
99
+
100
+ # Check for out-of-bounds labels
101
+ if pos_labels.max() >= self.num_classes:
102
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: ERROR! Found label {pos_labels.max()} >= num_classes {self.num_classes}")
103
+ # Clamp labels to valid range
104
+ pos_labels = torch.clamp(pos_labels, 0, self.num_classes - 1)
105
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: Clamped pos_labels max: {pos_labels.max()}")
106
+
107
+ # Check for size mismatch between predictions and targets
108
+ if mask_preds.shape[-2:] != mask_targets.shape[-2:]:
109
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: SIZE MISMATCH!")
110
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: mask_preds shape: {mask_preds.shape}")
111
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: mask_targets shape: {mask_targets.shape}")
112
+
113
+ # Calculate loss - use the original approach like FCNMaskHead
114
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: About to call loss_mask")
115
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: mask_preds shape: {mask_preds.shape}")
116
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: mask_targets shape: {mask_targets.shape}")
117
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: pos_labels shape: {pos_labels.shape}")
118
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: mask_preds device: {mask_preds.device}")
119
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: mask_targets device: {mask_targets.device}")
120
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: pos_labels device: {pos_labels.device}")
121
+
122
+ # Call loss function with full mask_preds and pos_labels like the original FCN mask head
123
+ loss_mask = self.loss_mask(mask_preds, mask_targets, pos_labels)
124
+
125
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: Loss calculated successfully: {loss_mask}")
126
+
127
+ # only return the *nested* loss dict that StandardRoIHead.update() expects
128
+ return dict(
129
+ loss_mask={'loss_mask': loss_mask},
130
+ # if you really need mask_targets downstream you can still return it under a
131
+ # different key, but it will be ignored by the standard loss updater
132
+ mask_targets=mask_targets
133
+ )
134
+
135
+ def get_targets(self,
136
+ sampling_results: List[Any],
137
+ batch_gt_instances: List[InstanceData],
138
+ rcnn_train_cfg: Union[Dict[str, Any], ConfigDict]) -> Tensor:
139
+ """Calculate the ground truth for all samples in a batch according to
140
+ the sampling_results.
141
+
142
+ Args:
143
+ sampling_results (List[:obj:`SamplingResult`]): Assign results of
144
+ all images in a batch after sampling.
145
+ batch_gt_instances (List[:obj:`InstanceData`]): Batch of
146
+ gt_instance. It usually includes ``bboxes``, ``labels``,
147
+ and ``masks`` attributes.
148
+ rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN.
149
+
150
+ Returns:
151
+ Tensor: Mask targets of each positive proposals in the image,
152
+ has shape (num_pos, mask_h, mask_w).
153
+ """
154
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: get_targets called")
155
+
156
+ pos_proposals_list = [res.pos_priors for res in sampling_results]
157
+ pos_assigned_gt_inds_list = [
158
+ res.pos_assigned_gt_inds for res in sampling_results
159
+ ]
160
+ gt_masks_list = [res.masks for res in batch_gt_instances]
161
+
162
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: Number of sampling results: {len(sampling_results)}")
163
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: rcnn_train_cfg: {rcnn_train_cfg}")
164
+
165
+ # Use our custom square mask target function
166
+ mask_targets = square_mask_target(pos_proposals_list, pos_assigned_gt_inds_list,
167
+ gt_masks_list, rcnn_train_cfg)
168
+
169
+ print(f"πŸ” SQUARE_FCN_MASK_HEAD: Final mask_targets shape: {mask_targets.shape}")
170
+ return mask_targets
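The out-of-bounds guard in `loss_and_target` clamps labels into `[0, num_classes - 1]` before the loss call. The same idea in a dependency-free sketch (a hypothetical helper on plain lists, standing in for `torch.clamp` on tensors):

```python
def clamp_labels(labels, num_classes):
    """Clamp class indices into [0, num_classes - 1], mirroring the
    torch.clamp guard in SquareFCNMaskHead.loss_and_target."""
    return [min(max(label, 0), num_classes - 1) for label in labels]


# Labels 21 and 25 are out of range for a 21-class head (valid: 0..20).
print(clamp_labels([0, 5, 21, 25], 21))  # → [0, 5, 20, 20]
```

Note that clamping only avoids the indexing crash; an out-of-range label still signals an annotation/category mismatch that is worth fixing upstream.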
custom_models/square_mask_target.py ADDED
@@ -0,0 +1,67 @@
+ # Copyright (c) OpenMMLab. All rights reserved.
+ import torch
+ from torch.nn.modules.utils import _pair
+
+ from mmengine.config import ConfigDict
+
+ from mmdet.registry import MODELS
+ from mmdet.structures.mask.mask_target import mask_target as original_mask_target
+
+
+ def square_mask_target(pos_proposals_list, pos_assigned_gt_inds_list, gt_masks_list, cfg):
+     """Compute square mask targets for positive proposals in multiple images.
+
+     This function forces all mask targets to be square regardless of the
+     original aspect ratio, to avoid tensor size mismatches.
+
+     Args:
+         pos_proposals_list (list[Tensor]): Positive proposals in multiple images.
+         pos_assigned_gt_inds_list (list[Tensor]): Assigned GT indices for each
+             positive proposal.
+         gt_masks_list (list[:obj:`BaseInstanceMasks`]): Ground truth masks of
+             each image.
+         cfg (dict): Config dict that specifies the mask size.
+
+     Returns:
+         Tensor: Square mask target of each image, has shape (num_pos, size, size).
+     """
+     # Get the target size (a tuple such as (14, 14))
+     mask_size = _pair(cfg.mask_size)
+
+     # Force a square size by using the minimum dimension
+     square_size = min(mask_size)
+
+     # Create a proper ConfigDict object with the square size
+     square_cfg = ConfigDict({'mask_size': (square_size, square_size)})
+
+     # Call the original mask target function with the square size
+     mask_targets = original_mask_target(pos_proposals_list, pos_assigned_gt_inds_list,
+                                         gt_masks_list, square_cfg)
+
+     print(f"🔍 SQUARE_MASK_TARGET: Original mask_targets shape: {mask_targets.shape}")
+     print(f"🔍 SQUARE_MASK_TARGET: Expected square_size: {square_size}")
+
+     # Force a square shape by padding or cropping if necessary
+     if mask_targets.size(1) != square_size or mask_targets.size(2) != square_size:
+         print(f"🔍 SQUARE_MASK_TARGET: Forcing square shape from {mask_targets.shape} to ({mask_targets.size(0)}, {square_size}, {square_size})")
+
+         # Create a new tensor with the square shape
+         num_masks = mask_targets.size(0)
+         square_targets = torch.zeros(num_masks, square_size, square_size,
+                                      device=mask_targets.device, dtype=mask_targets.dtype)
+
+         # Copy the mask data, padding with zeros if necessary
+         h, w = mask_targets.size(1), mask_targets.size(2)
+         h_copy = min(h, square_size)
+         w_copy = min(w, square_size)
+
+         square_targets[:, :h_copy, :w_copy] = mask_targets[:, :h_copy, :w_copy]
+         mask_targets = square_targets
+
+         print(f"🔍 SQUARE_MASK_TARGET: Final mask_targets shape: {mask_targets.shape}")
+     else:
+         print(f"🔍 SQUARE_MASK_TARGET: Masks already square: {mask_targets.shape}")
+
+     return mask_targets
+
+
+ # Register the custom function
+ MODELS.register_module(name='square_mask_target', module=square_mask_target)
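The pad-or-crop step in `square_mask_target` can be exercised in isolation. Below is a plain-Python sketch of the same logic on nested lists instead of tensors (the helper name `to_square` is illustrative): the region that overlaps the target square is copied, everything else is zero-filled.

```python
def to_square(mask, size):
    """Zero-pad or crop a 2D mask to (size, size), mirroring the tensor
    copy in square_mask_target: the overlapping region is kept, the
    remainder is filled with zeros."""
    out = [[0] * size for _ in range(size)]
    h_copy = min(len(mask), size)
    w_copy = min(len(mask[0]), size) if mask else 0
    for i in range(h_copy):
        for j in range(w_copy):
            out[i][j] = mask[i][j]
    return out


# A 1x3 mask forced into a 2x2 square: width is cropped, height is padded.
print(to_square([[1, 1, 1]], 2))  # → [[1, 1], [0, 0]]
```

Cropping silently discards mask content outside the square, which is the trade-off this project accepts in exchange for uniform target shapes.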
debug_api.py ADDED
@@ -0,0 +1,123 @@
+ #!/usr/bin/env python3
+ """
+ Debug script to check API status and endpoints.
+ """
+
+ import requests
+
+
+ def check_space_status():
+     """Check whether the Space is running."""
+     print("🔍 Checking space status...")
+
+     try:
+         # Check the main page
+         response = requests.get("https://hanszhu-dense-captioning-platform.hf.space/", timeout=30)
+         print(f"Main page status: {response.status_code}")
+
+         if response.status_code == 200:
+             print("✅ Space is accessible")
+         else:
+             print("❌ Space is not accessible")
+
+     except Exception as e:
+         print(f"❌ Error checking space: {e}")
+
+
+ def check_api_endpoints():
+     """Check various API endpoints."""
+     print("\n🔍 Checking API endpoints...")
+
+     base_url = "https://hanszhu-dense-captioning-platform.hf.space"
+
+     endpoints = [
+         "/",
+         "/api",
+         "/api/predict",
+         "/predict",
+         "/api/predict/",
+         "/predict/"
+     ]
+
+     for endpoint in endpoints:
+         try:
+             response = requests.get(f"{base_url}{endpoint}", timeout=30)
+             print(f"{endpoint}: {response.status_code} - {response.headers.get('content-type', 'unknown')}")
+
+             if response.status_code == 200:
+                 print(f"  Content preview: {response.text[:100]}...")
+
+         except Exception as e:
+             print(f"{endpoint}: Error - {e}")
+
+
+ def test_post_request():
+     """Test POST requests to the predict endpoint."""
+     print("\n🔍 Testing POST request...")
+
+     try:
+         # Test URL
+         test_url = "https://raw.githubusercontent.com/gradio-app/gradio/main/test/test_files/bus.png"
+
+         # Try different POST payload formats
+         test_data = [
+             {"data": [test_url]},
+             {"fn_index": 0, "data": [test_url]},
+             {"data": test_url},
+             test_url
+         ]
+
+         for i, data in enumerate(test_data):
+             print(f"\nTest {i+1}: {type(data)}")
+
+             try:
+                 response = requests.post(
+                     "https://hanszhu-dense-captioning-platform.hf.space/predict",
+                     json=data,
+                     headers={"Content-Type": "application/json"},
+                     timeout=30
+                 )
+
+                 print(f"  Status: {response.status_code}")
+                 print(f"  Content-Type: {response.headers.get('content-type', 'unknown')}")
+                 print(f"  Response: {response.text[:200]}...")
+
+             except Exception as e:
+                 print(f"  Error: {e}")
+
+     except Exception as e:
+         print(f"❌ Error in POST test: {e}")
+
+
+ def test_gradio_client_detailed():
+     """Test gradio_client with detailed error handling."""
+     print("\n🔍 Testing gradio_client with detailed error handling...")
+
+     try:
+         from gradio_client import Client
+
+         print("Creating client...")
+         client = Client("hanszhu/Dense-Captioning-Platform")
+
+         print("Getting space info...")
+         info = client.view_api()
+         print(f"API info: {info}")
+
+         print("Making prediction...")
+         test_url = "https://raw.githubusercontent.com/gradio-app/gradio/main/test/test_files/bus.png"
+         result = client.predict(test_url, api_name="/predict")
+
+         print(f"✅ Success! Result: {result}")
+
+     except Exception as e:
+         print(f"❌ gradio_client error: {e}")
+         import traceback
+         traceback.print_exc()
+
+
+ if __name__ == "__main__":
+     print("🚀 Debugging Dense Captioning Platform API")
+     print("=" * 60)
+
+     check_space_status()
+     check_api_endpoints()
+     test_post_request()
+     test_gradio_client_detailed()
+
+     print("\n" + "=" * 60)
+     print("🏁 Debug completed!")
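The probes in this script can receive responses in several shapes: Gradio's REST layer typically wraps outputs as `{"data": [...]}`, while other endpoints return a bare JSON value. A small hypothetical normalizer for those shapes (the name `extract_result` is not part of any Gradio API):

```python
import json


def extract_result(payload):
    """Hypothetical helper: normalize the response shapes probed above.
    Accepts either a JSON string or an already-parsed object; unwraps a
    Gradio-style {"data": [...]} envelope, passes bare values through."""
    obj = json.loads(payload) if isinstance(payload, str) else payload
    if isinstance(obj, dict) and "data" in obj:
        data = obj["data"]
        if isinstance(data, list) and data:
            return data[0]  # single-output endpoints: first entry is the result
        return data
    return obj


print(extract_result('{"data": ["a caption"]}'))  # → a caption
```

Centralizing this unwrapping keeps the per-endpoint probe loops free of response-format special cases.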
find_api_endpoint.py ADDED
@@ -0,0 +1,167 @@
+ #!/usr/bin/env python3
+ """
+ Script to find the correct API endpoint.
+ """
+
+ import json
+
+ import requests
+
+
+ def try_different_endpoints():
+     """Try different possible API endpoints."""
+     print("🔍 Trying different API endpoints...")
+
+     base_url = "https://hanszhu-dense-captioning-platform.hf.space"
+
+     # Different possible endpoints
+     endpoints = [
+         "/api/predict",
+         "/predict",
+         "/api/run/predict",
+         "/run/predict",
+         "/api/0",
+         "/0",
+         "/api/1",
+         "/1",
+         "/api/2",
+         "/2",
+         "/api/3",
+         "/3",
+         "/api/4",
+         "/4",
+         "/api/5",
+         "/5"
+     ]
+
+     test_data = {
+         "data": ["https://raw.githubusercontent.com/gradio-app/gradio/main/test/test_files/bus.png"]
+     }
+
+     for endpoint in endpoints:
+         print(f"\nTrying POST to: {endpoint}")
+
+         try:
+             response = requests.post(
+                 f"{base_url}{endpoint}",
+                 json=test_data,
+                 headers={"Content-Type": "application/json"},
+                 timeout=30
+             )
+
+             print(f"  Status: {response.status_code}")
+             print(f"  Content-Type: {response.headers.get('content-type', 'unknown')}")
+
+             if response.status_code == 200:
+                 print("  ✅ SUCCESS! Found working endpoint!")
+                 print(f"  Response: {response.text[:200]}...")
+                 return endpoint
+             elif response.status_code == 405:
+                 print("  ⚠️ Method not allowed (endpoint exists but wrong method)")
+             elif response.status_code == 404:
+                 print("  ❌ Not found")
+             else:
+                 print(f"  ❌ Unexpected status: {response.text[:100]}...")
+
+         except Exception as e:
+             print(f"  ❌ Error: {e}")
+
+     return None
+
+
+ def try_get_endpoints():
+     """Try GET requests to find API info."""
+     print("\n🔍 Trying GET requests to find API info...")
+
+     base_url = "https://hanszhu-dense-captioning-platform.hf.space"
+
+     get_endpoints = [
+         "/api",
+         "/api/",
+         "/api/predict",
+         "/api/predict/",
+         "/api/run/predict",
+         "/api/run/predict/",
+         "/api/0",
+         "/api/1",
+         "/api/2",
+         "/api/3",
+         "/api/4",
+         "/api/5"
+     ]
+
+     for endpoint in get_endpoints:
+         print(f"\nTrying GET: {endpoint}")
+
+         try:
+             response = requests.get(f"{base_url}{endpoint}", timeout=30)
+
+             print(f"  Status: {response.status_code}")
+             print(f"  Content-Type: {response.headers.get('content-type', 'unknown')}")
+
+             if response.status_code == 200:
+                 content = response.text[:200]
+                 print(f"  Content: {content}...")
+
+                 # Check whether it's JSON
+                 if response.headers.get('content-type', '').startswith('application/json'):
+                     print("  ✅ JSON response - this might be the API info!")
+                     try:
+                         data = response.json()
+                         print(f"  API Info: {json.dumps(data, indent=2)}")
+                     except ValueError:
+                         pass
+
+         except Exception as e:
+             print(f"  ❌ Error: {e}")
+
+
+ def try_gradio_client_different_ways():
+     """Try gradio_client with different approaches."""
+     print("\n🔍 Trying gradio_client with different approaches...")
+
+     try:
+         from gradio_client import Client
+
+         print("Creating client...")
+         client = Client("hanszhu/Dense-Captioning-Platform")
+
+         print("Trying different API names...")
+
+         api_names = ["/predict", "/run/predict", "0", "1", "2", "3", "4", "5"]
+
+         for api_name in api_names:
+             print(f"\nTrying api_name: {api_name}")
+
+             try:
+                 test_url = "https://raw.githubusercontent.com/gradio-app/gradio/main/test/test_files/bus.png"
+                 result = client.predict(test_url, api_name=api_name)
+                 print(f"  ✅ SUCCESS with api_name={api_name}!")
+                 print(f"  Result: {result}")
+                 return api_name
+
+             except Exception as e:
+                 print(f"  ❌ Failed: {e}")
+
+     except Exception as e:
+         print(f"❌ gradio_client error: {e}")
+
+
+ if __name__ == "__main__":
+     print("🚀 Finding the correct API endpoint")
+     print("=" * 60)
+
+     # Try different POST endpoints
+     working_endpoint = try_different_endpoints()
+
+     # Try GET endpoints for API info
+     try_get_endpoints()
+
+     # Try gradio_client with different approaches
+     working_api_name = try_gradio_client_different_ways()
+
+     print("\n" + "=" * 60)
+     print("🏁 Endpoint discovery completed!")
+
+     if working_endpoint:
+         print(f"✅ Found working POST endpoint: {working_endpoint}")
+     if working_api_name:
+         print(f"✅ Found working gradio_client api_name: {working_api_name}")
+
+     if not working_endpoint and not working_api_name:
+         print("❌ No working endpoints found")
+         print("The space might still be loading or need different configuration")
models/chart_elementnet_swin.py ADDED
@@ -0,0 +1,394 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # cascade_rcnn_r50_fpn_meta.py - Enhanced config with Swin Transformer backbone
2
+ #
3
+ # PROGRESSIVE LOSS STRATEGY:
4
+ # - All 3 Cascade stages start with SmoothL1Loss for stable initial training
5
+ # - At epoch 5, Stage 3 (final stage) switches to GIoULoss via ProgressiveLossHook
6
+ # - Stage 1 & 2 remain SmoothL1Loss throughout training
7
+ # - This ensures model stability before introducing more complex IoU-based losses
8
+
9
+ # Custom imports - this registers our modules without polluting config namespace
10
+ custom_imports = dict(
11
+ imports=[
12
+ 'custom_models.custom_dataset',
13
+ 'custom_models.register',
14
+ 'custom_models.custom_hooks',
15
+ 'custom_models.progressive_loss_hook',
16
+ ],
17
+ allow_failed_imports=False
18
+ )
19
+
20
+ # Add to Python path
21
+ import sys
22
+ import os
23
+ # Use a simpler path approach that doesn't rely on __file__
24
+ sys.path.insert(0, os.path.join(os.getcwd(), '..', '..'))
25
+
26
+ # Custom Cascade model with coordinate handling for chart data
27
+ model = dict(
28
+ type='CustomCascadeWithMeta', # Use custom model with coordinate handling
29
+ coordinate_standardization=dict(
30
+ enabled=True,
31
+ origin='bottom_left', # Match annotation creation coordinate system
32
+ normalize=True,
33
+ relative_to_plot=False, # Keep simple for now
34
+ scale_to_axis=False # Keep simple for now
35
+ ),
36
+ data_preprocessor=dict(
37
+ type='DetDataPreprocessor',
38
+ mean=[123.675, 116.28, 103.53],
39
+ std=[58.395, 57.12, 57.375],
40
+ bgr_to_rgb=True,
41
+ pad_size_divisor=32),
42
+ # ----- Swin Transformer Base (22K) Backbone + FPN -----
43
+ backbone=dict(
44
+ type='SwinTransformer',
45
+ embed_dims=128, # Swin Base embedding dimensions
46
+ depths=[2, 2, 18, 2], # Swin Base depths
47
+ num_heads=[4, 8, 16, 32], # Swin Base attention heads
48
+ window_size=7,
49
+ mlp_ratio=4,
50
+ qkv_bias=True,
51
+ qk_scale=None,
52
+ drop_rate=0.0,
53
+ attn_drop_rate=0.0,
54
+ drop_path_rate=0.3, # Slightly higher for more complex model
55
+ patch_norm=True,
56
+ out_indices=(0, 1, 2, 3),
57
+ with_cp=False,
58
+ convert_weights=True,
59
+ init_cfg=dict(
60
+ type='Pretrained',
61
+ checkpoint='https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_base_patch4_window7_224_22k_20220317-4f79f7c0.pth'
62
+ )
63
+ ),
64
+ neck=dict(
65
+ type='FPN',
66
+ in_channels=[128, 256, 512, 1024], # Swin Base: embed_dims * 2^(stage)
67
+ out_channels=256,
68
+ num_outs=6,
69
+ start_level=0,
70
+ add_extra_convs='on_input'
71
+ ),
72
+ # Enhanced RPN with smaller anchors for tiny objects + improved losses
73
+ rpn_head=dict(
74
+ type='RPNHead',
75
+ in_channels=256,
76
+ feat_channels=256,
77
+ anchor_generator=dict(
78
+ type='AnchorGenerator',
79
+ scales=[1, 2, 4, 8], # Even smaller scales for tiny objects
80
+ ratios=[0.5, 1.0, 2.0], # Multiple aspect ratios
81
+ strides=[4, 8, 16, 32, 64, 128]), # Extended FPN strides
82
+ bbox_coder=dict(
83
+ type='DeltaXYWHBBoxCoder',
84
+ target_means=[.0, .0, .0, .0],
85
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
86
+ loss_cls=dict(
87
+ type='CrossEntropyLoss',
88
+ use_sigmoid=True,
89
+ loss_weight=1.0),
90
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)),
91
+ # Progressive Loss Strategy: Start with SmoothL1 for all 3 stages
92
+ # Stage 3 (final stage) will switch to GIoU at epoch 5 via ProgressiveLossHook
93
+ roi_head=dict(
94
+ type='CascadeRoIHead',
95
+ num_stages=3,
96
+ stage_loss_weights=[1, 0.5, 0.25],
97
+ bbox_roi_extractor=dict(
98
+ type='SingleRoIExtractor',
99
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
100
+ out_channels=256,
101
+ featmap_strides=[4, 8, 16, 32]),
102
+ bbox_head=[
103
+ # Stage 1: Always SmoothL1Loss (coarse detection)
104
+ dict(
105
+ type='Shared2FCBBoxHead',
106
+ in_channels=256,
107
+ fc_out_channels=1024,
108
+ roi_feat_size=7,
109
+ num_classes=21, # 21 enhanced categories
110
+ bbox_coder=dict(
111
+ type='DeltaXYWHBBoxCoder',
112
+ target_means=[0., 0., 0., 0.],
113
+ target_stds=[0.05, 0.05, 0.1, 0.1]),
114
+ reg_class_agnostic=True,
115
+ loss_cls=dict(
116
+ type='CrossEntropyLoss',
117
+ use_sigmoid=False,
118
+ loss_weight=1.0),
119
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)),
120
+ # Stage 2: Always SmoothL1Loss (intermediate refinement)
121
+ dict(
122
+ type='Shared2FCBBoxHead',
123
+ in_channels=256,
124
+ fc_out_channels=1024,
125
+ roi_feat_size=7,
126
+ num_classes=21, # 21 enhanced categories
127
+ bbox_coder=dict(
128
+ type='DeltaXYWHBBoxCoder',
129
+ target_means=[0., 0., 0., 0.],
130
+ target_stds=[0.033, 0.033, 0.067, 0.067]),
131
+ reg_class_agnostic=True,
132
+ loss_cls=dict(
133
+ type='CrossEntropyLoss',
134
+ use_sigmoid=False,
135
+ loss_weight=1.0),
136
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)),
137
+ # Stage 3: SmoothL1 β†’ GIoU at epoch 5 (progressive switching)
138
+ dict(
139
+ type='Shared2FCBBoxHead',
140
+ in_channels=256,
141
+ fc_out_channels=1024,
142
+ roi_feat_size=7,
143
+ num_classes=21, # 21 enhanced categories
144
+ bbox_coder=dict(
145
+ type='DeltaXYWHBBoxCoder',
146
+ target_means=[0., 0., 0., 0.],
147
+ target_stds=[0.02, 0.02, 0.05, 0.05]),
148
+ reg_class_agnostic=True,
149
+ loss_cls=dict(
150
+ type='CrossEntropyLoss',
151
+ use_sigmoid=False,
152
+ loss_weight=1.0),
153
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
154
+ ]),
155
+ train_cfg=dict(
156
+ rpn=dict(
157
+ assigner=dict(
158
+ type='MaxIoUAssigner',
159
+ pos_iou_thr=0.7,
160
+ neg_iou_thr=0.3,
161
+ min_pos_iou=0.3,
162
+ match_low_quality=True,
163
+ ignore_iof_thr=-1),
164
+ sampler=dict(
165
+ type='RandomSampler',
166
+ num=256,
167
+ pos_fraction=0.5,
168
+ neg_pos_ub=-1,
169
+ add_gt_as_proposals=False),
170
+ allowed_border=0,
171
+ pos_weight=-1,
172
+ debug=False),
173
+ rpn_proposal=dict(
174
+ nms_pre=2000,
175
+ max_per_img=2000,
176
+ nms=dict(type='nms', iou_threshold=0.8),
177
+ min_bbox_size=0),
178
+ rcnn=[
179
+ dict(
180
+ assigner=dict(
181
+ type='MaxIoUAssigner',
182
+ pos_iou_thr=0.4,
183
+ neg_iou_thr=0.4,
184
+ min_pos_iou=0.4,
185
+ match_low_quality=False,
186
+ ignore_iof_thr=-1),
187
+ sampler=dict(
188
+ type='RandomSampler',
189
+ num=512,
190
+ pos_fraction=0.25,
191
+ neg_pos_ub=-1,
192
+ add_gt_as_proposals=True),
193
+ pos_weight=-1,
194
+ debug=False),
195
+ dict(
196
+ assigner=dict(
197
+ type='MaxIoUAssigner',
198
+ pos_iou_thr=0.6,
199
+ neg_iou_thr=0.6,
200
+ min_pos_iou=0.6,
201
+ match_low_quality=False,
202
+ ignore_iof_thr=-1),
203
+ sampler=dict(
204
+ type='RandomSampler',
205
+ num=512,
206
+ pos_fraction=0.25,
207
+ neg_pos_ub=-1,
208
+ add_gt_as_proposals=True),
209
+ pos_weight=-1,
210
+ debug=False),
211
+ dict(
212
+ assigner=dict(
213
+ type='MaxIoUAssigner',
214
+ pos_iou_thr=0.7,
215
+ neg_iou_thr=0.7,
216
+ min_pos_iou=0.7,
217
+ match_low_quality=False,
218
+ ignore_iof_thr=-1),
219
+ sampler=dict(
220
+ type='RandomSampler',
221
+ num=512,
222
+ pos_fraction=0.25,
223
+ neg_pos_ub=-1,
224
+ add_gt_as_proposals=True),
225
+ pos_weight=-1,
226
+ debug=False)
227
+ ]),
228
+ # Enhanced test configuration with soft-NMS and multi-scale support
229
+ test_cfg=dict(
230
+ rpn=dict(
231
+ nms_pre=1000,
232
+ max_per_img=1000,
233
+ nms=dict(type='nms', iou_threshold=0.7),
234
+ min_bbox_size=0),
235
+ rcnn=dict(
236
+ score_thr=0.005, # Even lower threshold to catch more classes
237
+ nms=dict(
238
+ type='soft_nms', # Soft-NMS for better small object detection
239
+ iou_threshold=0.5,
240
+ min_score=0.005,
241
+ method='gaussian',
242
+ sigma=0.5),
243
+ max_per_img=500))) # Allow more detections
244
+
245
+ # Dataset settings - using cleaned annotations
246
+ dataset_type = 'ChartDataset'
247
+ data_root = '' # Remove data_root duplication
248
+
249
+ # Define the 21 chart element classes that match the annotations
250
+ CLASSES = (
251
+ 'title', 'subtitle', 'x-axis', 'y-axis', 'x-axis-label', 'y-axis-label',
252
+ 'x-tick-label', 'y-tick-label', 'legend', 'legend-title', 'legend-item',
253
+ 'data-point', 'data-line', 'data-bar', 'data-area', 'grid-line',
254
+ 'axis-title', 'tick-label', 'data-label', 'legend-text', 'plot-area'
255
+ )
256
+
257
+ # Updated to use cleaned annotation files
258
+ train_dataloader = dict(
259
+ batch_size=2, # Increased back to 2
260
+ num_workers=2,
261
+ persistent_workers=True,
262
+ sampler=dict(type='DefaultSampler', shuffle=True),
263
+ dataset=dict(
264
+ type=dataset_type,
265
+ data_root=data_root,
266
+ ann_file='legend_data/annotations_JSON_cleaned/train_enriched.json', # Full path
267
+ data_prefix=dict(img='legend_data/train/images/'), # Full path
268
+ metainfo=dict(classes=CLASSES), # Tell dataset what classes to expect
269
+ filter_cfg=dict(filter_empty_gt=True, min_size=0, class_specific_min_sizes={
270
+ 'data-point': 16, # Back to 16x16 from 32x32
271
+ 'data-bar': 16, # Back to 16x16 from 32x32
272
+ 'tick-label': 16, # Back to 16x16 from 32x32
273
+ 'x-tick-label': 16, # Back to 16x16 from 32x32
274
+ 'y-tick-label': 16 # Back to 16x16 from 32x32
275
+ }),
276
+ pipeline=[
277
+ dict(type='LoadImageFromFile'),
278
+ dict(type='LoadAnnotations', with_bbox=True),
279
+ dict(type='Resize', scale=(1600, 1000), keep_ratio=True), # Higher resolution for tiny objects
280
+ dict(type='RandomFlip', prob=0.5),
281
+ dict(type='ClampBBoxes'), # Ensure bboxes stay within image bounds
282
+ dict(type='PackDetInputs')
283
+ ]
284
+ )
285
+ )
286
+
287
+ val_dataloader = dict(
288
+ batch_size=1,
289
+ num_workers=2,
290
+ persistent_workers=True,
291
+ drop_last=False,
292
+ sampler=dict(type='DefaultSampler', shuffle=False),
293
+ dataset=dict(
294
+ type=dataset_type,
295
+ data_root=data_root,
296
+ ann_file='legend_data/annotations_JSON_cleaned/val_enriched_with_info.json', # Full path
297
+ data_prefix=dict(img='legend_data/train/images/'), # All images are in train/images
298
+ metainfo=dict(classes=CLASSES), # Tell dataset what classes to expect
299
+ test_mode=True,
300
+ pipeline=[
301
+ dict(type='LoadImageFromFile'),
302
+ dict(type='Resize', scale=(1600, 1000), keep_ratio=True), # Base resolution for validation
303
+ dict(type='LoadAnnotations', with_bbox=True),
304
+ dict(type='ClampBBoxes'), # Ensure bboxes stay within image bounds
305
+ dict(type='PackDetInputs', meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'scale_factor'))
306
+ ]
307
+ )
308
+ )
309
+
310
+ test_dataloader = val_dataloader
311
+
312
+ # Enhanced evaluators with debugging
313
+ val_evaluator = dict(
314
+ type='CocoMetric',
315
+ ann_file='legend_data/annotations_JSON_cleaned/val_enriched_with_info.json', # Using cleaned annotations
316
+ metric='bbox',
317
+ format_only=False,
318
+ classwise=True, # Enable detailed per-class metrics table
319
+ proposal_nums=(100, 300, 1000)) # More detailed AR metrics
320
+
321
+ test_evaluator = val_evaluator
322
+
323
+ # Add custom hooks for debugging empty results
324
+ default_hooks = dict(
325
+ timer=dict(type='IterTimerHook'),
326
+ logger=dict(type='LoggerHook', interval=50),
327
+ param_scheduler=dict(type='ParamSchedulerHook'),
328
+ checkpoint=dict(type='CompatibleCheckpointHook', interval=1, save_best='auto', max_keep_ckpts=3),
329
+ sampler_seed=dict(type='DistSamplerSeedHook'),
330
+ visualization=dict(type='DetVisualizationHook'))
331
+
332
+ # Add NaN recovery hook for graceful handling like Faster R-CNN
333
+ custom_hooks = [
334
+ dict(type='SkipBadSamplesHook', interval=1), # Skip samples with bad GT data
335
+ dict(type='ChartTypeDistributionHook', interval=500), # Monitor class distribution
336
+ dict(type='MissingImageReportHook', interval=1000), # Track missing images
337
+ dict(type='NanRecoveryHook', # For logging & monitoring
338
+ fallback_loss=1.0,
339
+ max_consecutive_nans=100,
340
+ log_interval=50),
341
+ dict(type='ProgressiveLossHook', # Progressive loss switching
342
+ switch_epoch=5, # Switch stage 3 to GIoU at epoch 5
343
+ target_loss_type='GIoULoss', # Use GIoU for stage 3 (final stage)
344
+ loss_weight=1.0, # Keep same loss weight
345
+ warmup_epochs=2, # Monitor for 2 epochs after switch
346
+ monitor_stage_weights=True), # Log stage loss details
347
+ ]
348
+
349
+ # Training configuration - extended to 40 epochs for Swin Base on small objects
350
+ train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=40, val_interval=1)
351
+ val_cfg = dict(type='ValLoop')
352
+ test_cfg = dict(type='TestLoop')
353
+
354
+ # Optimizer with standard stable settings
355
+ optim_wrapper = dict(
356
+ type='OptimWrapper',
357
+ optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001),
358
+ clip_grad=dict(max_norm=35.0, norm_type=2)
359
+ )
360
+
361
+ # Extended learning rate schedule with cosine annealing for Swin Base
362
+ param_scheduler = [
363
+ dict(
364
+ type='LinearLR',
365
+ start_factor=0.05, # 0.05 * 2e-2 = 1e-3 (warmup from 1e-3 to the base lr of 2e-2)
366
+ by_epoch=False,
367
+ begin=0,
368
+ end=1000), # 1k iteration warmup
369
+ dict(
370
+ type='CosineAnnealingLR',
371
+ begin=0,
372
+ end=40, # Match max_epochs
373
+ by_epoch=True,
374
+ T_max=40,
375
+ eta_min=1e-6, # Minimum learning rate
376
+ convert_to_iter_based=True)
377
+ ]
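The combined linear-warmup-plus-cosine schedule above can be sanity-checked with a plain-Python approximation (the iteration counts are hypothetical; mmengine performs the actual iter-based conversion and scheduler composition):

```python
import math

BASE_LR = 0.02          # optimizer lr from optim_wrapper
START_FACTOR = 0.05     # LinearLR start_factor
WARMUP_ITERS = 1000     # LinearLR end (iteration-based)
MAX_EPOCHS = 40         # CosineAnnealingLR end / T_max
ETA_MIN = 1e-6          # CosineAnnealingLR eta_min

def lr_at(iteration, iters_per_epoch):
    """Approximate lr: linear warmup for the first 1k iters, then cosine decay."""
    if iteration < WARMUP_ITERS:
        frac = iteration / WARMUP_ITERS
        return BASE_LR * (START_FACTOR + (1 - START_FACTOR) * frac)
    t = iteration / (iters_per_epoch * MAX_EPOCHS)  # training progress in [0, 1]
    return ETA_MIN + 0.5 * (BASE_LR - ETA_MIN) * (1 + math.cos(math.pi * t))

# With e.g. 2000 iters/epoch: starts at 1e-3, reaches ~2e-2 after warmup,
# and decays to eta_min = 1e-6 by epoch 40.
```
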
378
+
379
+ # Work directory
380
+ work_dir = './work_dirs/cascade_rcnn_swin_base_40ep_cosine_fpn_meta'
381
+
382
+ # Multi-scale test configuration (uncomment to enable)
383
+ # img_scales = [(800, 500), (1600, 1000), (2400, 1500)] # 0.5x, 1.0x, 1.5x scales
384
+ # tta_model = dict(
385
+ # type='DetTTAModel',
386
+ # tta_cfg=dict(
387
+ # nms=dict(type='nms', iou_threshold=0.5),
388
+ # max_per_img=100)
389
+ # )
390
+
391
+ # Fresh start
392
+ resume = False
393
+ load_from = None
394
+
models/chart_pointnet_swin.py ADDED
@@ -0,0 +1,374 @@
1
+ # chart_pointnet_swin.py - Mask R-CNN with Swin Transformer for data point segmentation
2
+ #
3
+ # ADAPTED FROM CASCADE R-CNN CONFIG:
4
+ # - Uses same Swin Transformer Base backbone with optimizations
5
+ # - Maintains data-point class weighting (10x) and IoU strategies
6
+ # - Adds mask head for instance segmentation of data points
7
+ # - Uses enhanced annotation files with segmentation masks
8
+ # - Keeps custom hooks and progressive loss strategies
9
+ #
10
+ # MASK-SPECIFIC OPTIMIZATIONS:
11
+ # - RoI size 14x14 for mask extraction (matches data point size)
12
+ # - FCN mask head with 4 convolution layers
13
+ # - Mask loss weight balanced with bbox and classification losses
14
+ # - Enhanced test-time augmentation for better mask quality
15
+ #
16
+ # DATA POINT FOCUS:
17
+ # - Primary target: data-point class (ID 11) with 10x weight
18
+ # - Generates both bounding boxes AND instance masks
19
+ # - Optimized for 16x16 pixel data points in scientific charts
20
+ # Removed _base_ inheritance to avoid path issues - all configs are inlined below
21
+
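MMDetection's SingleRoIExtractor assigns each RoI to an FPN level by its scale; a sketch of that mapping (mirroring mmdet's `map_roi_levels` with its default `finest_scale=56`) shows why ~16x16 data points are pooled from the highest-resolution stride-4 map:

```python
import math

def map_roi_level(w, h, finest_scale=56, num_levels=4):
    """FPN level assignment as in MMDetection's SingleRoIExtractor:
    level = floor(log2(sqrt(w*h) / finest_scale)), clamped to [0, num_levels-1]."""
    scale = math.sqrt(w * h)
    lvl = int(math.floor(math.log2(scale / finest_scale + 1e-6)))
    return min(max(lvl, 0), num_levels - 1)

# A 16x16 data point maps to level 0 (stride 4), so the 14x14 RoIAlign
# samples from the finest feature grid available.
```
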
22
+ # Custom imports - same as Cascade R-CNN setup
23
+ custom_imports = dict(
24
+ imports=[
25
+ 'custom_models.register',
26
+ 'custom_models.custom_hooks',
27
+ 'custom_models.progressive_loss_hook',
28
+ 'custom_models.flexible_load_annotations',
29
+ ],
30
+ allow_failed_imports=False
31
+ )
32
+
33
+ # Add to Python path
34
+ import sys
35
+ sys.path.insert(0, '.')
36
+
37
+ # Mask R-CNN model with Swin Transformer backbone
38
+ model = dict(
39
+ type='MaskRCNN',
40
+ data_preprocessor=dict(
41
+ type='DetDataPreprocessor',
42
+ mean=[123.675, 116.28, 103.53],
43
+ std=[58.395, 57.12, 57.375],
44
+ bgr_to_rgb=True,
45
+ pad_size_divisor=32,
46
+ pad_mask=True, # Important for mask training
47
+ mask_pad_value=0,
48
+ ),
49
+ # Same Swin Transformer Base backbone as Cascade R-CNN
50
+ backbone=dict(
51
+ type='SwinTransformer',
52
+ embed_dims=128, # Swin Base embedding dimensions
53
+ depths=[2, 2, 18, 2], # Swin Base depths
54
+ num_heads=[4, 8, 16, 32], # Swin Base attention heads
55
+ window_size=7,
56
+ mlp_ratio=4,
57
+ qkv_bias=True,
58
+ qk_scale=None,
59
+ drop_rate=0.0,
60
+ attn_drop_rate=0.0,
61
+ drop_path_rate=0.3, # Same as Cascade config
62
+ patch_norm=True,
63
+ out_indices=(0, 1, 2, 3),
64
+ with_cp=False,
65
+ convert_weights=True,
66
+ init_cfg=dict(
67
+ type='Pretrained',
68
+ checkpoint='https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_base_patch4_window7_224_22k_20220317-4f79f7c0.pth'
69
+ )
70
+ ),
71
+ # Same FPN as Cascade R-CNN
72
+ neck=dict(
73
+ type='FPN',
74
+ in_channels=[128, 256, 512, 1024], # Swin Base: embed_dims * 2^(stage)
75
+ out_channels=256,
76
+ num_outs=5, # Standard for Mask R-CNN (was 6 in Cascade)
77
+ start_level=0,
78
+ add_extra_convs='on_input'
79
+ ),
80
+ # Same RPN configuration as Cascade R-CNN
81
+ rpn_head=dict(
82
+ type='RPNHead',
83
+ in_channels=256,
84
+ feat_channels=256,
85
+ anchor_generator=dict(
86
+ type='AnchorGenerator',
87
+ scales=[1, 2, 4, 8], # Same small scales for tiny objects
88
+ ratios=[0.5, 1.0, 2.0],
89
+ strides=[4, 8, 16, 32, 64]), # Standard FPN strides for Mask R-CNN
90
+ bbox_coder=dict(
91
+ type='DeltaXYWHBBoxCoder',
92
+ target_means=[.0, .0, .0, .0],
93
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
94
+ loss_cls=dict(
95
+ type='CrossEntropyLoss',
96
+ use_sigmoid=True,
97
+ loss_weight=1.0),
98
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)
99
+ ),
100
+ # Mask R-CNN ROI head with bbox + mask branches
101
+ roi_head=dict(
102
+ type='StandardRoIHead',
103
+ # Bbox ROI extractor (same as Cascade R-CNN final stage)
104
+ bbox_roi_extractor=dict(
105
+ type='SingleRoIExtractor',
106
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
107
+ out_channels=256,
108
+ featmap_strides=[4, 8, 16, 32]
109
+ ),
110
+ # Bbox head with data-point class weighting
111
+ bbox_head=dict(
112
+ type='Shared2FCBBoxHead',
113
+ in_channels=256,
114
+ fc_out_channels=1024,
115
+ roi_feat_size=7,
116
+ num_classes=22, # 22 enhanced categories including boxplot
117
+ bbox_coder=dict(
118
+ type='DeltaXYWHBBoxCoder',
119
+ target_means=[0., 0., 0., 0.],
120
+ target_stds=[0.1, 0.1, 0.2, 0.2]
121
+ ),
122
+ reg_class_agnostic=False,
123
+ loss_cls=dict(
124
+ type='CrossEntropyLoss',
125
+ use_sigmoid=False,
126
+ loss_weight=1.0,
127
+ class_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, # foreground classes 0-10
128
+ 10.0, # data-point at index 11 gets 10x weight
129
+ 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, # classes 12-21 (incl. boxplot)
130
+ 1.0] # background class (last index in MMDetection's softmax head)
131
+ ),
132
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)
133
+ ),
134
+ # Mask ROI extractor (optimized for 16x16 data points)
135
+ mask_roi_extractor=dict(
136
+ type='SingleRoIExtractor',
137
+ roi_layer=dict(type='RoIAlign', output_size=(14, 14), sampling_ratio=0, aligned=True), # Exact 14x14 output with aligned RoIAlign
138
+ out_channels=256,
139
+ featmap_strides=[4, 8, 16, 32]
140
+ ),
141
+ # Mask head optimized for data points with square mask targets
142
+ mask_head=dict(
143
+ type='SquareFCNMaskHead',
144
+ num_convs=4, # 4 conv layers for good feature extraction
145
+ in_channels=256,
146
+ roi_feat_size=14, # Explicitly set ROI feature size
147
+ conv_out_channels=256,
148
+ num_classes=22, # 22 enhanced categories including boxplot
149
+ upsample_cfg=dict(type=None), # No upsampling - keep 14x14
150
+ loss_mask=dict(
151
+ type='CrossEntropyLoss',
152
+ use_mask=True,
153
+ loss_weight=1.0 # Balanced with bbox loss
154
+ )
155
+ )
156
+ ),
157
+ # Training configuration adapted from Cascade R-CNN
158
+ train_cfg=dict(
159
+ rpn=dict(
160
+ assigner=dict(
161
+ type='MaxIoUAssigner',
162
+ pos_iou_thr=0.7,
163
+ neg_iou_thr=0.3,
164
+ min_pos_iou=0.3,
165
+ match_low_quality=True,
166
+ ignore_iof_thr=-1),
167
+ sampler=dict(
168
+ type='RandomSampler',
169
+ num=256,
170
+ pos_fraction=0.5,
171
+ neg_pos_ub=-1,
172
+ add_gt_as_proposals=False),
173
+ allowed_border=0,
174
+ pos_weight=-1,
175
+ debug=False),
176
+ rpn_proposal=dict(
177
+ nms_pre=2000,
178
+ max_per_img=1000,
179
+ nms=dict(type='nms', iou_threshold=0.7),
180
+ min_bbox_size=0),
181
+ # RCNN training (using Cascade stage 2 settings - balanced for mask training)
182
+ rcnn=dict(
183
+ assigner=dict(
184
+ type='MaxIoUAssigner',
185
+ pos_iou_thr=0.5, # Balanced IoU for bbox + mask training
186
+ neg_iou_thr=0.5,
187
+ min_pos_iou=0.5,
188
+ match_low_quality=True, # Important for small data points
189
+ ignore_iof_thr=-1),
190
+ sampler=dict(
191
+ type='RandomSampler',
192
+ num=512,
193
+ pos_fraction=0.25,
194
+ neg_pos_ub=-1,
195
+ add_gt_as_proposals=True),
196
+ mask_size=(14, 14), # Force exact 14x14 size for data points
197
+ pos_weight=-1,
198
+ debug=False)
199
+ ),
200
+ # Test configuration with soft NMS
201
+ test_cfg=dict(
202
+ rpn=dict(
203
+ nms_pre=1000,
204
+ max_per_img=1000,
205
+ nms=dict(type='nms', iou_threshold=0.7),
206
+ min_bbox_size=0),
207
+ rcnn=dict(
208
+ score_thr=0.005, # Low threshold to catch data points
209
+ nms=dict(
210
+ type='soft_nms', # Soft NMS for better small object detection
211
+ iou_threshold=0.3, # Low for data points
212
+ min_score=0.005,
213
+ method='gaussian',
214
+ sigma=0.5),
215
+ max_per_img=100,
216
+ mask_thr_binary=0.5 # Binary mask threshold
217
+ )
218
+ )
219
+ )
220
+
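The Gaussian soft-NMS configured in `test_cfg` decays the scores of overlapping boxes instead of suppressing them outright, which helps keep densely packed data points. The decay rule can be sketched standalone (illustrative function, using the `sigma=0.5` from the config):

```python
import math

def soft_nms_decay(score, iou, sigma=0.5):
    """Gaussian soft-NMS decay: score *= exp(-iou^2 / sigma)."""
    return score * math.exp(-(iou ** 2) / sigma)

# A box overlapping a higher-scoring one at IoU 0.3 keeps most of its score,
# while heavy overlap (IoU 0.9) is decayed much more strongly.
light = soft_nms_decay(0.8, 0.3)   # ~0.67
heavy = soft_nms_decay(0.8, 0.9)   # ~0.16
```
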
221
+ # Dataset settings - using standard COCO dataset for mask support
222
+ dataset_type = 'CocoDataset'
223
+ data_root = ''
224
+
225
+ # 22 enhanced categories including boxplot
226
+ CLASSES = (
227
+ 'title', 'subtitle', 'x-axis', 'y-axis', 'x-axis-label', 'y-axis-label', # 0-5
228
+ 'x-tick-label', 'y-tick-label', 'legend', 'legend-title', 'legend-item', # 6-10
229
+ 'data-point', 'data-line', 'data-bar', 'data-area', 'grid-line', # 11-15 (data-point at index 11)
230
+ 'axis-title', 'tick-label', 'data-label', 'legend-text', 'plot-area', # 16-20
231
+ 'boxplot' # 21
232
+ )
233
+
234
+ # Verify data-point class index
235
+ assert CLASSES[11] == 'data-point', f"Expected 'data-point' at index 11 in CLASSES tuple, got '{CLASSES[11]}'"
236
+
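Given the background-last convention MMDetection uses for softmax classification heads, the 23-entry `class_weight` vector is easy to get wrong by hand; a small sketch that builds it programmatically (names here are illustrative, not part of the config):

```python
NUM_CLASSES = 22          # number of foreground classes in CLASSES
DATA_POINT_IDX = 11       # index of 'data-point' in CLASSES
DATA_POINT_WEIGHT = 10.0  # 10x weight for the primary target class

# One weight per foreground class, plus background as the final entry.
class_weight = [1.0] * (NUM_CLASSES + 1)
class_weight[DATA_POINT_IDX] = DATA_POINT_WEIGHT

assert len(class_weight) == 23
assert class_weight[-1] == 1.0  # background stays unweighted
```
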
237
+ # Training dataloader with mask annotations
238
+ train_dataloader = dict(
239
+ batch_size=2, # Same as Cascade R-CNN
240
+ num_workers=2,
241
+ persistent_workers=True,
242
+ sampler=dict(type='DefaultSampler', shuffle=True),
243
+ dataset=dict(
244
+ type=dataset_type,
245
+ data_root=data_root,
246
+ ann_file='legend_match_swin/mask_generation/enhanced_datasets/train_filtered_with_masks_only.json',
247
+ data_prefix=dict(img='legend_data/train/images/'),
248
+ metainfo=dict(classes=CLASSES),
249
+ filter_cfg=dict(filter_empty_gt=False, min_size=12), # Keep images without GT; skip images smaller than 12px
250
+ # Disable any built-in filtering that might remove annotations
251
+ test_mode=False,
252
+ pipeline=[
253
+ dict(type='LoadImageFromFile'),
254
+ dict(type='FlexibleLoadAnnotations', with_bbox=True, with_mask=True),
255
+ dict(type='Resize', scale=(1120, 672), keep_ratio=True),
256
+ dict(type='RandomFlip', prob=0.5),
257
+ dict(type='ClampBBoxes'),
258
+ dict(type='PackDetInputs')
259
+ ]
260
+ )
261
+ )
262
+
263
+ # Validation dataloader with mask annotations
264
+ val_dataloader = dict(
265
+ batch_size=1,
266
+ num_workers=2,
267
+ persistent_workers=True,
268
+ drop_last=False,
269
+ sampler=dict(type='DefaultSampler', shuffle=False),
270
+ dataset=dict(
271
+ type=dataset_type,
272
+ data_root=data_root,
273
+ ann_file='legend_match_swin/mask_generation/enhanced_datasets/val_enriched_with_masks_only.json',
274
+ data_prefix=dict(img='legend_data/train/images/'),
275
+ metainfo=dict(classes=CLASSES),
276
+ test_mode=True,
277
+ pipeline=[
278
+ dict(type='LoadImageFromFile'),
279
+ dict(type='Resize', scale=(1120, 672), keep_ratio=True),
280
+ dict(type='FlexibleLoadAnnotations', with_bbox=True, with_mask=True),
281
+ dict(type='ClampBBoxes'),
282
+ dict(type='PackDetInputs', meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'scale_factor'))
283
+ ]
284
+ )
285
+ )
286
+
287
+ test_dataloader = val_dataloader
288
+
289
+ # Enhanced evaluators for both bbox and mask metrics
290
+ val_evaluator = dict(
291
+ type='CocoMetric',
292
+ ann_file='legend_match_swin/mask_generation/enhanced_datasets/val_enriched_with_masks_only.json',
293
+ metric=['bbox', 'segm'],
294
+ format_only=False,
295
+ classwise=True,
296
+ proposal_nums=(100, 300, 1000)
297
+ )
298
+
299
+ test_evaluator = val_evaluator
300
+
301
+ # Same custom hooks as Cascade R-CNN
302
+ default_hooks = dict(
303
+ timer=dict(type='IterTimerHook'),
304
+ logger=dict(type='LoggerHook', interval=50),
305
+ param_scheduler=dict(type='ParamSchedulerHook'),
306
+ checkpoint=dict(type='CompatibleCheckpointHook', interval=1, save_best='auto', max_keep_ckpts=3),
307
+ sampler_seed=dict(type='DistSamplerSeedHook'),
308
+ visualization=dict(type='DetVisualizationHook')
309
+ )
310
+
311
+ # Same custom hooks as Cascade R-CNN (adapted for Mask R-CNN)
312
+ custom_hooks = [
313
+ dict(type='SkipBadSamplesHook', interval=1),
314
+ dict(type='ChartTypeDistributionHook', interval=500),
315
+ dict(type='MissingImageReportHook', interval=1000),
316
+ dict(type='NanRecoveryHook',
317
+ fallback_loss=1.0,
318
+ max_consecutive_nans=50,
319
+ log_interval=25),
320
+ # Note: Progressive loss hook not used in standard Mask R-CNN
321
+ # but could be adapted if needed for bbox loss only
322
+ ]
323
+
324
+ # Training configuration - reduced to 20 epochs
325
+ train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=20, val_interval=1)
326
+ val_cfg = dict(type='ValLoop')
327
+ test_cfg = dict(type='TestLoop')
328
+
329
+ # Same optimizer settings as Cascade R-CNN
330
+ optim_wrapper = dict(
331
+ type='OptimWrapper',
332
+ optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001),
333
+ clip_grad=dict(max_norm=10.0, norm_type=2)
334
+ )
335
+
336
+ # Same learning rate schedule as Cascade R-CNN
337
+ param_scheduler = [
338
+ dict(
339
+ type='LinearLR',
340
+ start_factor=0.1,
341
+ by_epoch=False,
342
+ begin=0,
343
+ end=1000),
344
+ dict(
345
+ type='CosineAnnealingLR',
346
+ begin=0,
347
+ end=20,
348
+ by_epoch=True,
349
+ T_max=20,
350
+ eta_min=1e-5,
351
+ convert_to_iter_based=True)
352
+ ]
353
+
354
+ # Work directory
355
+ work_dir = '/content/drive/MyDrive/Research Summer 2025/Dense Captioning Toolkit/CHART-DeMatch/work_dirs/mask_rcnn_swin_base_20ep_meta'
356
+
357
+ # Fresh start
358
+ resume = False
359
+ load_from = None
360
+
361
+ # Default runtime settings (normally inherited from _base_)
362
+ default_scope = 'mmdet'
363
+ env_cfg = dict(
364
+ cudnn_benchmark=False,
365
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
366
+ dist_cfg=dict(backend='nccl'),
367
+ )
368
+
369
+ vis_backends = [dict(type='LocalVisBackend')]
370
+ visualizer = dict(
371
+ type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer')
372
+
373
+ log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)
374
+ log_level = 'INFO'
requirements.txt ADDED
@@ -0,0 +1,12 @@
1
+ gradio==5.39.0
2
+ torch>=2.0.0
3
+ torchvision>=0.15.0
4
+ transformers>=4.30.0
5
+ Pillow>=9.0.0
6
+ numpy>=1.21.0
7
+ opencv-python>=4.8.0
8
+ huggingface-hub>=0.16.0
9
+ openmim
10
+ mmdet
11
+ mmengine
12
+ scikit-image>=0.21.0
simple_test.py ADDED
@@ -0,0 +1,60 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Simple test script for the Dense Captioning Platform API
4
+ """
5
+
6
+ def test_gradio_client():
7
+ """Test using gradio_client"""
8
+ print("πŸ§ͺ Testing with gradio_client...")
9
+
10
+ try:
11
+ from gradio_client import Client, handle_file
12
+
13
+ # Initialize client with direct URL (working approach)
14
+ client = Client("https://hanszhu-dense-captioning-platform.hf.space")
15
+
16
+ # Test with a simple image URL
17
+ test_url = "https://raw.githubusercontent.com/gradio-app/gradio/main/test/test_files/bus.png"
18
+
19
+ print(f"Testing with URL: {test_url}")
20
+
21
+ # Make prediction using handle_file and fn_index (working approach)
22
+ result = client.predict(
23
+ image=handle_file(test_url),
24
+ fn_index=0 # Use fn_index instead of api_name
25
+ )
26
+
27
+ print("βœ… gradio_client test successful!")
28
+ print(f"Result: {result}")
29
+
30
+ return True
31
+
32
+ except Exception as e:
33
+ print(f"❌ gradio_client test failed: {e}")
34
+ return False
35
+
36
+ def test_direct_api():
37
+ """Test direct API call (Blocks don't support RESTful APIs)"""
38
+ print("\nπŸ§ͺ Testing direct API call...")
39
+
40
+ print("⚠️ Direct API calls not supported for Blocks-based Spaces")
41
+ print(" Use gradio_client instead for API access")
42
+ return False
43
+
44
+ if __name__ == "__main__":
45
+ print("πŸš€ Testing Dense Captioning Platform API")
46
+ print("=" * 50)
47
+
48
+ # Test both methods
49
+ gradio_success = test_gradio_client()
50
+ direct_success = test_direct_api()
51
+
52
+ print("\n" + "=" * 50)
53
+ print("🏁 Test Results:")
54
+ print(f"gradio_client: {'βœ… PASS' if gradio_success else '❌ FAIL'}")
55
+ print(f"Direct API: {'βœ… PASS' if direct_success else '❌ FAIL'}")
56
+
57
+ if gradio_success or direct_success:
58
+ print("\nπŸŽ‰ API is working!")
59
+ else:
60
+ print("\n⚠️ API needs more configuration")
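Rather than probing `fn_index` values blindly, gradio_client can report which endpoints a Space actually exposes. A sketch (the call needs network access to the live Space, so it is wrapped in a helper here):

```python
def list_space_endpoints(space="hanszhu/Dense-Captioning-Platform"):
    """Return the Space's API surface as reported by gradio_client.

    view_api(return_format="dict") describes each endpoint's name,
    parameters, and return types, which tells you whether to call
    predict() with api_name or fall back to fn_index.
    """
    from gradio_client import Client
    client = Client(space)
    return client.view_api(return_format="dict")
```
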
test_api.py ADDED
@@ -0,0 +1,95 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test script for the Dense Captioning Platform API
4
+ """
5
+
6
+ import requests
7
+ import json
8
+ from PIL import Image
9
+ import io
10
+ import base64
11
+
12
+ def test_api_with_url():
13
+ """Test the API using a URL"""
14
+ print("πŸ§ͺ Testing API with URL...")
15
+
16
+ # Test URL (a simple chart image)
17
+ test_url = "https://raw.githubusercontent.com/gradio-app/gradio/main/test/test_files/bus.png"
18
+
19
+ # API endpoint
20
+ api_url = "https://hanszhu-dense-captioning-platform.hf.space/predict"
21
+
22
+ try:
23
+ # Make the request
24
+ response = requests.post(
25
+ api_url,
26
+ json={"data": [test_url]},
27
+ headers={"Content-Type": "application/json"}
28
+ )
29
+
30
+ print(f"Status Code: {response.status_code}")
31
+ print(f"Response: {response.text[:500]}...")
32
+
33
+ if response.status_code == 200:
34
+ result = response.json()
35
+ print("βœ… API test successful!")
36
+ print(f"Chart Type: {result.get('data', [{}])[0].get('chart_type_label', 'Unknown')}")
37
+ else:
38
+ print("❌ API test failed!")
39
+
40
+ except Exception as e:
41
+ print(f"❌ Error testing API: {e}")
42
+
43
+ def test_api_with_gradio_client():
44
+ """Test the API using gradio_client"""
45
+ print("\nπŸ§ͺ Testing API with gradio_client...")
46
+
47
+ try:
48
+ from gradio_client import Client
49
+
50
+ # Initialize client
51
+ client = Client("hanszhu/Dense-Captioning-Platform")
52
+
53
+ # Test with a URL
54
+ result = client.predict(
55
+ "https://raw.githubusercontent.com/gradio-app/gradio/main/test/test_files/bus.png",
56
+ api_name="/predict"
57
+ )
58
+
59
+ print("βœ… gradio_client test successful!")
60
+ print(f"Result: {result}")
61
+
62
+ except Exception as e:
63
+ print(f"❌ Error with gradio_client: {e}")
64
+
65
+ def test_api_endpoints():
66
+ """Test available API endpoints"""
67
+ print("\nπŸ§ͺ Testing API endpoints...")
68
+
69
+ base_url = "https://hanszhu-dense-captioning-platform.hf.space"
70
+
71
+ endpoints = [
72
+ "/",
73
+ "/api",
74
+ "/api/predict",
75
+ "/predict"
76
+ ]
77
+
78
+ for endpoint in endpoints:
79
+ try:
80
+ response = requests.get(f"{base_url}{endpoint}")
81
+ print(f"{endpoint}: {response.status_code}")
82
+ except Exception as e:
83
+ print(f"{endpoint}: Error - {e}")
84
+
85
+ if __name__ == "__main__":
86
+ print("πŸš€ Testing Dense Captioning Platform API")
87
+ print("=" * 50)
88
+
89
+ # Test different approaches
90
+ test_api_endpoints()
91
+ test_api_with_url()
92
+ test_api_with_gradio_client()
93
+
94
+ print("\n" + "=" * 50)
95
+ print("🏁 API testing completed!")
test_api_endpoints.py ADDED
@@ -0,0 +1,152 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Comprehensive API endpoint testing
4
+ """
5
+
6
+ import requests
7
+ import json
8
+
9
+ def test_all_possible_endpoints():
10
+ """Test all possible API endpoint combinations"""
11
+ print("πŸ” Testing all possible API endpoints...")
12
+
13
+ base_url = "https://hanszhu-dense-captioning-platform.hf.space"
14
+ test_url = "https://raw.githubusercontent.com/gradio-app/gradio/main/test/test_files/bus.png"
15
+
16
+ # Different endpoint patterns
17
+ endpoints = [
18
+ "/api/predict",
19
+ "/predict",
20
+ "/api/run/predict",
21
+ "/run/predict",
22
+ "/api/0",
23
+ "/0",
24
+ "/api/1",
25
+ "/1",
26
+ "/api/2",
27
+ "/2"
28
+ ]
29
+
30
+ # Different request formats
31
+ request_formats = [
32
+ {"data": [test_url]},
33
+ {"data": [test_url], "fn_index": 0},
34
+ {"data": [test_url], "fn_index": 1},
35
+ {"data": test_url},
36
+ {"data": [test_url], "session_hash": "test123"}
37
+ ]
38
+
39
+ for endpoint in endpoints:
40
+ print(f"\nπŸ” Testing endpoint: {endpoint}")
41
+
42
+ for i, format_data in enumerate(request_formats):
43
+ print(f" Format {i+1}: {format_data}")
44
+
45
+ try:
46
+ response = requests.post(
47
+ f"{base_url}{endpoint}",
48
+ json=format_data,
49
+ headers={"Content-Type": "application/json"},
50
+ timeout=10
51
+ )
52
+
53
+ print(f" Status: {response.status_code}")
54
+
55
+ if response.status_code == 200:
56
+ print(" βœ… SUCCESS!")
57
+ print(f" Response: {response.text[:200]}...")
58
+ return endpoint, format_data
59
+ elif response.status_code == 405:
60
+ print(" ⚠️ Method not allowed (endpoint exists)")
61
+ elif response.status_code == 404:
62
+ print(" ❌ Not found")
63
+ else:
64
+ print(f" ❌ Unexpected: {response.text[:100]}...")
65
+
66
+ except Exception as e:
67
+ print(f" ❌ Error: {e}")
68
+
69
+ return None, None
70
+
71
+ def test_gradio_client_different_ways():
72
+ """Test gradio_client with different approaches"""
73
+ print("\nπŸ” Testing gradio_client with different approaches...")
74
+
75
+ try:
76
+ from gradio_client import Client
77
+
78
+ print("Creating client...")
79
+ client = Client("hanszhu/Dense-Captioning-Platform")
80
+
81
+ print("Trying different API names...")
82
+
83
+ api_names = ["/predict", "/run/predict", "0", "1", "2", "3", "4", "5"]
84
+
85
+ for api_name in api_names:
86
+ print(f"\nTrying api_name: {api_name}")
87
+
88
+ try:
89
+ test_url = "https://raw.githubusercontent.com/gradio-app/gradio/main/test/test_files/bus.png"
90
+ result = client.predict(test_url, api_name=api_name)
91
+ print(f" βœ… SUCCESS with api_name={api_name}!")
92
+ print(f" Result: {result}")
93
+ return api_name
94
+
95
+ except Exception as e:
96
+ print(f" ❌ Failed: {e}")
97
+
98
+ except Exception as e:
99
+ print(f"❌ gradio_client error: {e}")
100
+
101
+ def check_space_status():
102
+ """Check if the space is running and accessible"""
103
+ print("\nπŸ” Checking space status...")
104
+
105
+ try:
106
+ response = requests.get("https://hanszhu-dense-captioning-platform.hf.space/", timeout=10)
107
+ print(f"Space status: {response.status_code}")
108
+
109
+ if response.status_code == 200:
110
+ print("βœ… Space is running")
111
+
112
+ # Check for API-related content
113
+ content = response.text.lower()
114
+ if "api" in content:
115
+ print("βœ… API-related content found")
116
+ if "predict" in content:
117
+ print("βœ… Predict-related content found")
118
+ if "gradio" in content:
119
+ print("βœ… Gradio content found")
120
+
121
+ else:
122
+ print("❌ Space is not accessible")
123
+
124
+ except Exception as e:
125
+ print(f"❌ Error checking space: {e}")
126
+
127
+ if __name__ == "__main__":
128
+ print("πŸš€ Comprehensive API Endpoint Testing")
129
+ print("=" * 60)
130
+
131
+ # Check space status
132
+ check_space_status()
133
+
134
+ # Test all endpoints
135
+ working_endpoint, working_format = test_all_possible_endpoints()
136
+
137
+ # Test gradio_client
138
+ working_api_name = test_gradio_client_different_ways()
139
+
140
+ print("\n" + "=" * 60)
141
+ print("🏁 Testing completed!")
142
+
143
+ if working_endpoint and working_format:
144
+ print(f"βœ… Found working combination:")
145
+ print(f" Endpoint: {working_endpoint}")
146
+ print(f" Format: {working_format}")
147
+ if working_api_name:
148
+ print(f"βœ… Found working gradio_client api_name: {working_api_name}")
149
+
150
+ if not working_endpoint and not working_api_name:
151
+ print("❌ No working endpoints found")
152
+ print("The space might need different configuration or the API is not properly exposed")
web_test.py ADDED
@@ -0,0 +1,115 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test script to check web interface and understand API issue
4
+ """
5
+
6
+ import requests
7
+ import time
8
+
9
+ def check_web_interface():
10
+ """Check if the web interface is working"""
11
+ print("πŸ” Checking web interface...")
12
+
13
+ try:
14
+ response = requests.get("https://hanszhu-dense-captioning-platform.hf.space/")
15
+
16
+ if response.status_code == 200:
17
+ print("βœ… Web interface is accessible")
18
+
19
+ # Check if it contains our app content
20
+ if "Dense Captioning Platform" in response.text:
21
+ print("βœ… App is loaded correctly")
22
+ else:
23
+ print("❌ App content not found")
24
+
25
+ # Check if it contains Gradio elements
26
+ if "gradio" in response.text.lower():
27
+ print("βœ… Gradio is loaded")
28
+ else:
29
+ print("❌ Gradio not found")
30
+
31
+ else:
32
+ print(f"❌ Web interface not accessible: {response.status_code}")
33
+
34
+ except Exception as e:
35
+ print(f"❌ Error checking web interface: {e}")
36
+
37
+ def check_api_info():
38
+ """Check API info endpoint"""
39
+ print("\nπŸ” Checking API info...")
40
+
41
+ try:
42
+ # Try different API info endpoints
43
+ endpoints = [
44
+ "https://hanszhu-dense-captioning-platform.hf.space/api",
45
+ "https://hanszhu-dense-captioning-platform.hf.space/api/",
46
+ "https://hanszhu-dense-captioning-platform.hf.space/api/predict",
47
+ "https://hanszhu-dense-captioning-platform.hf.space/api/predict/"
48
+ ]
49
+
50
+ for endpoint in endpoints:
51
+ print(f"\nTrying: {endpoint}")
52
+
53
+ try:
54
+ response = requests.get(endpoint)
55
+ print(f" Status: {response.status_code}")
56
+ print(f" Content-Type: {response.headers.get('content-type', 'unknown')}")
57
+
58
+ if response.status_code == 200:
59
+ content = response.text[:200]
60
+ print(f" Content: {content}...")
61
+
62
+ # Check if it's JSON
63
+ if response.headers.get('content-type', '').startswith('application/json'):
64
+ print(" βœ… JSON response")
65
+ else:
66
+ print(" ❌ Not JSON response")
67
+
68
+ except Exception as e:
69
+ print(f" Error: {e}")
70
+
71
+ except Exception as e:
72
+ print(f"❌ Error checking API info: {e}")
73
+
74
+ def wait_and_retry():
75
+ """Wait and retry to see if the API becomes available"""
76
+ print("\n⏳ Waiting for API to become available...")
77
+
78
+ for i in range(5):
79
+ print(f"\nAttempt {i+1}/5:")
80
+
81
+ try:
82
+ response = requests.get("https://hanszhu-dense-captioning-platform.hf.space/api")
83
+
84
+ if response.status_code == 200 and response.headers.get('content-type', '').startswith('application/json'):
85
+ print("βœ… API is now available!")
86
+ return True
87
+ else:
88
+ print(f"❌ API not ready yet: {response.status_code}")
89
+
90
+ except Exception as e:
91
+ print(f"❌ Error: {e}")
92
+
93
+ if i < 4: # Don't sleep after the last attempt
94
+ print("Waiting 30 seconds...")
95
+ time.sleep(30)
96
+
97
+ return False
98
+
99
+ if __name__ == "__main__":
100
+ print("πŸš€ Testing Dense Captioning Platform Web Interface")
101
+ print("=" * 60)
102
+
103
+ check_web_interface()
104
+ check_api_info()
105
+
106
+ # Wait and retry
107
+ if not wait_and_retry():
108
+ print("\n⚠️ API is still not available after waiting")
109
+ print("This might indicate:")
110
+ print("1. The space is still loading models")
111
+ print("2. There's a configuration issue")
112
+ print("3. The API endpoints need different configuration")
113
+
114
+ print("\n" + "=" * 60)
115
+ print("🏁 Web interface test completed!")