File size: 11,978 Bytes
d411f8a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44072a9
d411f8a
 
 
 
44072a9
d411f8a
 
 
 
44072a9
d411f8a
44072a9
d411f8a
 
 
 
44072a9
d411f8a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44072a9
d411f8a
 
44072a9
 
d411f8a
 
 
 
 
 
 
 
 
 
 
 
 
 
44072a9
 
d411f8a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44072a9
d411f8a
44072a9
d411f8a
 
 
44072a9
 
 
 
 
 
 
 
 
 
 
 
d411f8a
 
 
 
 
44072a9
 
d411f8a
 
 
 
44072a9
 
d411f8a
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# Metric metadata keyed by metric identifier.
# Each entry maps a metric key (e.g. "snr", "pesq") to a record with:
#   "full_name":   human-readable metric name,
#   "description": one-paragraph explanation of the metric and its value range,
#   "link":        reference URL ("" when no canonical reference exists).
# NOTE(review): keys presumably match metric column names produced elsewhere
# in the project — confirm against the code that consumes this table.
DESCRIPTIONS: dict[str, dict[str, str]] = {
    "snr": {
        "full_name": "Signal-to-Noise Ratio",
        "description": "Measures the ratio between the power of a signal and the power of background noise, expressed in decibels (dB). Higher values indicate better quality, with less noise relative to the signal.",
        "link": "https://en.wikipedia.org/wiki/Signal-to-noise_ratio",
    },
    "sisnr": {
        "full_name": "Scale-Invariant Signal-to-Noise Ratio",
        "description": "A variant of SNR that is invariant to scaling of the signal, making it better for comparing audio quality across different amplitude levels. Higher values indicate better quality.",
        "link": "https://arxiv.org/abs/1811.02508",
    },
    "stoi": {
        "full_name": "Short-Time Objective Intelligibility",
        "description": "Measures the intelligibility of speech by comparing temporal envelopes of clean and degraded speech. Values range from 0 to 1, with higher values indicating better intelligibility.",
        "link": "https://ieeexplore.ieee.org/document/5495701",
    },
    "pesq": {
        "full_name": "Perceptual Evaluation of Speech Quality",
        "description": "An ITU-T standard for measuring speech quality, designed to mimic human perception. Scores range from -0.5 to 4.5, with higher values indicating better perceived quality.",
        "link": "https://www.itu.int/rec/T-REC-P.862",
    },
    "psnr": {
        "full_name": "Peak Signal-to-Noise Ratio",
        "description": "Measures the ratio between the maximum possible power of a signal and the power of corrupting noise. Used primarily for image and video quality assessment, with higher values (in dB) indicating better quality.",
        "link": "https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio",
    },
    "ssim": {
        "full_name": "Structural SIMilarity Index",
        "description": "Measures the perceived similarity between two images based on structural information, contrast, and luminance. Values range from -1 to 1, with 1 indicating perfect similarity.",
        "link": "https://en.wikipedia.org/wiki/Structural_similarity",
    },
    "msssim": {
        "full_name": "Multi-Scale Structural Similarity Index",
        "description": "An extension of SSIM that measures image quality at multiple scales, providing a more robust quality assessment that better mimics human visual perception. Values range from 0 to 1, with higher values indicating better quality.",
        "link": "https://ieeexplore.ieee.org/document/1292216",
    },
    "lpips": {
        "full_name": "Learned Perceptual Image Patch Similarity",
        "description": "A perceptual similarity metric that uses deep neural networks to better approximate human perception. Lower values indicate greater similarity, with 0 representing identical images.",
        "link": "https://arxiv.org/abs/1801.03924",
    },
    "vmaf": {
        "full_name": "Video Multi-method Assessment Fusion",
        "description": "A machine learning-based video quality metric developed by Netflix that combines multiple quality measurements to better correlate with human perception. Scores range from 0 to 100, with higher values indicating better quality.",
        "link": "https://github.com/Netflix/vmaf",
    },
    "decoder_time": {
        "full_name": "Decoder Time",
        "description": "The time (in seconds) taken by the watermark decoder to extract the watermark from the media. Lower values indicate more efficient decoding.",
        "link": "",
    },
    "bit_acc": {
        "full_name": "Bit Accuracy",
        "description": "The percentage of bits correctly decoded from a watermark. Values range from 0 to 1, with 1 indicating perfect extraction of the watermark message.",
        "link": "https://en.wikipedia.org/wiki/Bit_error_rate",
    },
    "word_acc": {
        "full_name": "Word Accuracy",
        "description": "A binary metric indicating whether the entire watermark message was correctly decoded (True) or not (False).",
        "link": "https://en.wikipedia.org/wiki/Word_error_rate",
    },
    "log10_p_value": {
        "full_name": "Log10 P-Value",
        # Fixed typo in user-facing text: "comparison that" -> "comparison than".
        "description": "The logarithm (base 10) of the probability that a decoded watermark could have occurred by chance. More negative values indicate stronger confidence that a real watermark was detected. This metric provides a better comparison than bit accuracy because it fairly compares different message sizes.",
        "link": "https://en.wikipedia.org/wiki/P-value",
    },
    "TPR": {
        "full_name": "True Positive Rate",
        "description": "The proportion of watermarked media correctly identified as containing a watermark. Also known as sensitivity or recall. Values range from 0 to 1, with higher values indicating better detection performance.",
        "link": "https://en.wikipedia.org/wiki/Sensitivity_and_specificity",
    },
    "FPR": {
        "full_name": "False Positive Rate",
        "description": "The proportion of unwatermarked media incorrectly identified as containing a watermark. Values range from 0 to 1, with lower values indicating better detection performance.",
        "link": "https://en.wikipedia.org/wiki/False_positive_rate",
    },
    "watermark_det_score": {
        "full_name": "Watermark Detection Score",
        "description": "A confidence score indicating the system's certainty that a watermark is present. Values typically range from 0 to 1, with higher values indicating greater confidence in watermark detection.",
        "link": "",
    },
}

# Watermarking-model metadata keyed by model identifier.
# Each entry maps a model key (e.g. "audioseal", "wam") to a record with:
#   "full_name":   human-readable model/method name,
#   "description": short summary of the approach,
#   "paper_link":  URL of the associated paper,
#   "github_link": URL of the reference implementation.
# NOTE(review): keys such as "wavmark_fast", "cine_jit", "mbrs_jit" presumably
# match model identifiers used by the benchmarking code — confirm with callers.
MODEL_DESCRIPTIONS: dict[str, dict[str, str]] = {
    # --- audio watermarking models ---
    "audioseal": {
        "full_name": "AudioSeal",
        "description": "AudioSeal is the first audio watermarking technique designed specifically for localized detection of AI-generated speech.",
        "paper_link": "https://arxiv.org/abs/2401.17264",
        "github_link": "https://github.com/facebookresearch/audioseal",
    },
    "wavmark_fast": {
        "full_name": "WavMark",
        "description": "WavMark uses invertible networks to hide 32 bits in 1-second audio segments. Detection is performed by sliding along the audio in 0.05-second steps and decoding the message for each window. If the first 10 decoded bits match a synchronization pattern, the rest of the payload is saved (22 bits), and the window can directly slide 1 second (instead of 0.05 seconds).",
        "paper_link": "https://arxiv.org/pdf/2308.12770",
        "github_link": "https://github.com/wavmark/wavmark",
    },
    "timbre": {
        "full_name": "Timbre",
        "description": "Timbre embeds the watermark into the frequency domain, which is inherently robust against common data processing methods.",
        "paper_link": "https://arxiv.org/abs/2312.03410",
        "github_link": "https://github.com/TimbreWatermarking/TimbreWatermarking",
    },
    # --- image watermarking models ---
    "wam": {
        "full_name": "Watermark Anything Model",
        "description": "The Watermark Anything Model (WAM) is designed for localized image watermarking.",
        "paper_link": "https://arxiv.org/abs/2411.07231",
        "github_link": "https://github.com/facebookresearch/watermark-anything",
    },
    "trustmark": {
        "full_name": "TrustMark - Universal Watermarking for Arbitrary Resolution Images",
        "description": "TrustMark - a GAN-based watermarking method with novel design in architecture and spatio-spectra losses to balance the trade-off between watermarked image quality with the watermark recovery accuracy.",
        "paper_link": "https://arxiv.org/abs/2311.18297",
        "github_link": "https://github.com/adobe/trustmark",
    },
    "ssl": {
        "full_name": "Self-Supervised Latent Spaces",
        "description": "This approach revisits watermarking techniques using pre-trained deep networks and self-supervised methods to embed marks and binary messages into latent spaces.",
        "paper_link": "https://arxiv.org/abs/2112.09581",
        "github_link": "https://github.com/facebookresearch/ssl_watermarking",
    },
    "fnns": {
        "full_name": "Fixed Neural Network Steganography",
        "description": "This approach revisits steganography through adversarial perturbation: it modifies the image such that a fixed decoder correctly outputs the desired message (similar to SSL but with a different network).",
        "paper_link": "https://openreview.net/pdf?id=hcMvApxGSzZ",
        "github_link": "https://github.com/varshakishore/FNNS",
    },
    "hidden": {
        "full_name": "Hiding Data With Deep Networks",
        "description": "First deep watermarking approach from 2018. We use the model trained and open-sourced here, which uses the same architecture and a similar training procedure. Note that this implementation uses a Just Noticeable Difference heatmap to modulate the watermark distortion for less visibility instead of using a perceptual loss during training like in the original paper.",
        "paper_link": "https://arxiv.org/abs/1807.09937",
        "github_link": "https://github.com/ando-khachatryan/HiDDeN",
    },
    "dctdwt": {
        "full_name": "Combined DCT-DWT",
        "description": "The algorithm watermarks a given image using a combination of the Discrete Wavelet Transform (DWT) and the Discrete Cosine Transform (DCT). Performance evaluation results show that combining the two transforms improved the performance of the watermarking algorithms that are based solely on the DWT transform.",
        "paper_link": "https://pdfs.semanticscholar.org/1c47/f281c00cffad4e30deff48a922553cb04d17.pdf",
        "github_link": "https://github.com/ShieldMnt/invisible-watermark",
    },
    "cine_jit": {
        "full_name": "CINE: Towards Blind Watermarking: Combining Invertible and Non-invertible Mechanisms",
        "description": "It remains a challenge to design a watermarking model with high imperceptibility and robustness against strong noise attacks. To resolve this issue, we present a framework Combining the Invertible and Non-invertible (CIN) mechanisms. The CIN is composed of the invertible part to achieve high imperceptibility and the non-invertible part to strengthen the robustness against strong noise attacks.",
        "paper_link": "https://arxiv.org/abs/2212.12678",
        "github_link": "https://github.com/rmpku/CIN",
    },
    "mbrs_jit": {
        "full_name": "MBRS: Mini-Batch of Real and Simulated JPEG Compression",
        "description": "An end-to-end auto-encoder watermarking framework that, during training, randomly applies one of three noise layers per mini-batch: a real JPEG compressor (with variable quality factors), a differentiable simulated JPEG layer, or a noise-free identity layer. To boost performance it incorporates Squeeze-and-Excitation blocks for richer feature learning, a message processor to expand the payload more effectively, and an additive diffusion block to guard against crop attacks. Under JPEG compression at Q=50, MBRS achieves a bit error rate <0.01% and PSNR >36 dB, while also demonstrating strong robustness to Gaussian filtering, cropping, crop-out, and dropout distortions.",
        "paper_link": "https://arxiv.org/pdf/2108.08211",
        "github_link": "https://github.com/jzyustc/mbrs",
    },
    # --- video watermarking models ---
    "videoseal": {
        "full_name": "VideoSeal",
        "description": "A neural video watermarking system designed to embed imperceptible watermarks that are robust against common video manipulations and processing operations.",
        "paper_link": "https://arxiv.org/abs/2412.09492",
        "github_link": "https://github.com/facebookresearch/videoseal",
    },
    "rivagan": {
        "full_name": "RivaGAN",
        "description": "A GAN-based approach for robust invisible video watermarking that maintains high visual quality while providing resistance against common video attacks and transformations.",
        "paper_link": "https://arxiv.org/abs/1909.01285",
        "github_link": "https://github.com/DAI-Lab/RivaGAN",
    },
}