# Spaces: azzandr / ID-gambling-website-detection (status: Running)
# File size: 19,458 Bytes

import gradio as gr
import os
import re
import time
import torch
import torch.nn as nn
from PIL import Image
import pytesseract
from playwright.sync_api import sync_playwright
import asyncio
from transformers import AutoTokenizer, BertTokenizerFast
from torchvision import transforms
from torchvision import models
from torchvision.transforms import functional as F
import pandas as pd
from huggingface_hub import hf_hub_download
import warnings
warnings.filterwarnings("ignore")
from pathlib import Path
import subprocess
import traceback

# =============================================
# CONFIGURATION
# =============================================

# Substrings that mark a request URL as ad/tracking/social noise. They are
# matched with a plain `in` test against the lower-cased request URL.
BLOCK_PATTERNS = [
    # Ad networks
    "doubleclick", "adservice", "googlesyndication", "ads", "adserver",
    # Cookie / consent prompts
    "cookie", "consent",
    # Analytics and telemetry
    "analytics", "tracker", "tracking", "stats", "metric", "telemetry",
    # Social widgets
    "social", "facebook", "twitter", "linkedin", "pinterest",
    # Overlays
    "popup", "notification", "banner",
]

# Playwright timeouts, in milliseconds.
PAGE_TIMEOUT = 30 * 1000          # navigation / DOM-ready budget (30 s)
WAIT_FOR_LOAD_TIMEOUT = 5 * 1000  # extra budget for the network to go idle (5 s)

# Phrases whose presence in the page HTML is treated as a Cloudflare challenge.
CLOUDFLARE_CHECK_KEYWORDS = ["Checking your browser", "Just a moment", "Cloudflare"]

# Upper bound on redirected requests tolerated while loading one page.
MAX_REDIRECTS = 5

# =============================================
# HELPER FUNCTIONS
# =============================================

def ensure_http(url):
    """Return *url* with an explicit scheme, defaulting to ``http://``.

    Surrounding whitespace is stripped first, and the scheme check is
    case-insensitive so inputs such as ``HTTPS://example.com`` are not
    given a second scheme prefix.

    Args:
        url: URL or bare host name as typed by the user.

    Returns:
        The stripped URL, prefixed with ``http://`` when it had no
        ``http://`` / ``https://`` scheme.
    """
    url = url.strip()
    if not url.lower().startswith(('http://', 'https://')):
        return 'http://' + url
    return url

def sanitize_filename(url, max_length=150):
    """Turn *url* into a string that is safe to use as a file name.

    Every character that is not a word character, ``-``, ``_``, ``.`` or a
    space is replaced by ``_``. Names longer than *max_length* characters are
    truncated and suffixed with a short digest of the full URL, so that very
    long URLs neither exceed file-name length limits nor collide with each
    other after truncation.

    Args:
        url: The URL (or any string) to sanitize.
        max_length: Maximum number of sanitized characters kept before the
            digest suffix is appended.

    Returns:
        The sanitized name; identical to the plain substitution result when
        it already fits within *max_length*.
    """
    import hashlib  # local import: only needed on the truncation path

    safe = re.sub(r'[^\w\-_\. ]', '_', url)
    if len(safe) <= max_length:
        return safe

    # Hash the original URL (not the sanitized form) so URLs that differ only
    # in replaced characters or beyond the cut-off still get distinct names.
    digest = hashlib.sha256(url.encode('utf-8', errors='replace')).hexdigest()[:12]
    return f"{safe[:max_length]}_{digest}"

def block_ads_and_cookies(page):
    """Register a catch-all route on *page* that aborts blocked requests.

    A request is aborted when its lower-cased URL contains any entry of
    BLOCK_PATTERNS; every other request is allowed to continue.
    """
    def route_intercept(route):
        target = route.request.url.lower()
        for pattern in BLOCK_PATTERNS:
            if pattern in target:
                route.abort()
                return
        route.continue_()

    page.route("**/*", route_intercept)

def wait_for_page_stable(page):
    """Best-effort wait until *page* looks settled enough to screenshot.

    Waits for ``domcontentloaded`` (up to PAGE_TIMEOUT), then for
    ``networkidle`` (up to WAIT_FOR_LOAD_TIMEOUT), then sleeps two more
    seconds. Failures are logged and never raised to the caller.
    """
    try:
        # First wait for DOM content
        page.wait_for_load_state('domcontentloaded', timeout=PAGE_TIMEOUT)

        # Then wait for network to be idle. Pages with long-polling or
        # streaming requests may never go idle, so a failure here is expected
        # and non-fatal. Catch Exception (not a bare except) so Ctrl-C and
        # SystemExit still propagate.
        try:
            page.wait_for_load_state('networkidle', timeout=WAIT_FOR_LOAD_TIMEOUT)
        except Exception:
            print("Network not fully idle, continuing anyway...")

        # Small additional wait
        time.sleep(2)
    except Exception as e:
        print(f"⚠️  Page not fully stable: {e}")

def detect_and_bypass_cloudflare(page):
    """Reload *page* once if its HTML looks like a Cloudflare challenge.

    The page content is searched case-insensitively for any of
    CLOUDFLARE_CHECK_KEYWORDS. On a hit the function sleeps five seconds,
    reloads the page and waits for it to stabilise. Errors are logged and
    swallowed.
    """
    try:
        html = page.content().lower()

        challenged = False
        for keyword in CLOUDFLARE_CHECK_KEYWORDS:
            if keyword.lower() in html:
                challenged = True
                break
        if not challenged:
            return

        print("⚡ Detected Cloudflare challenge, waiting 5 seconds...")
        time.sleep(5)
        page.reload()
        wait_for_page_stable(page)
    except Exception as e:
        print(f"⚠️  Failed to bypass Cloudflare: {e}")

# --- Setup ---

# Pick the compute device once at import time: CUDA when available, else CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Load the IndoBERT tokenizer from the Hugging Face Hub.
# NOTE(review): the fallback tokenizer has a different vocabulary than the
# primary one; presumably predictions degrade if it is ever used - confirm.
try:
    tokenizer = BertTokenizerFast.from_pretrained("indobenchmark/indobert-base-p1")
except Exception as load_error:
    print(f"Error loading tokenizer: {load_error}")
    # Last resort so the app can still start.
    print("Falling back to default BERT tokenizer")
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Image transformation
class ResizePadToSquare:
    """Shrink an image to fit a square canvas and pad the rest with black.

    The image is converted to RGB, scaled down in place with
    ``Image.thumbnail`` (aspect ratio preserved, never enlarged) and then
    padded with zeros to ``target_size`` x ``target_size``. Odd padding
    amounts put the extra pixel on the right / bottom edge.
    """

    def __init__(self, target_size=300):
        self.target_size = target_size

    def __call__(self, img):
        side = self.target_size

        canvas = img.convert("RGB")
        canvas.thumbnail((side, side), Image.BILINEAR)

        width, height = canvas.size
        pad_w = side - width
        pad_h = side - height
        left, top = pad_w // 2, pad_h // 2
        right, bottom = pad_w - left, pad_h - top

        # torchvision's pad expects (left, top, right, bottom).
        return F.pad(canvas, (left, top, right, bottom), fill=0, padding_mode='constant')

# Per-channel normalisation constants (the standard ImageNet mean / std).
_NORMALIZE_MEAN = [0.485, 0.456, 0.406]
_NORMALIZE_STD = [0.229, 0.224, 0.225]

# Preprocessing pipeline: square 300x300 canvas -> tensor -> normalised tensor.
transform = transforms.Compose([
    ResizePadToSquare(300),
    transforms.ToTensor(),
    transforms.Normalize(mean=_NORMALIZE_MEAN, std=_NORMALIZE_STD),
])

# Run this once at application startup (e.g. in the main file / before the models load)
def ensure_playwright_chromium():
    """Run ``playwright install chromium`` and log the outcome.

    Any failure (missing CLI, non-zero exit status, ...) is printed together
    with its traceback and otherwise ignored, so startup continues.
    """
    install_command = ["playwright", "install", "chromium"]
    try:
        print("Checking and installing Playwright Chromium if not present...")
        subprocess.run(install_command, check=True)
        print("Playwright Chromium installation completed.")
    except Exception as install_error:
        print("Error during Playwright Chromium installation:", install_error)
        traceback.print_exc()

# Make sure this is called at startup (outside the screenshot function), so
# the browser install happens once at import time rather than per request.
ensure_playwright_chromium()

# Directory where page screenshots are written; created at startup if missing.
SCREENSHOT_DIR = "screenshots"
os.makedirs(SCREENSHOT_DIR, exist_ok=True)

# Point pytesseract at the Tesseract executable. This sets the binary path
# only; it does not select an OCR language.
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'  # Path to tesseract in Docker
print("Tesseract OCR initialized.")

# --- Model ---
class LateFusionModel(nn.Module):
    """Late-fusion classifier combining an image model and a text model.

    Both sub-models are run without gradient tracking and each produces a
    logit tensor whose dimension 1 is squeezed away. The two logits are
    blended with softmax-normalised learnable weights (both initialised to
    0.5, i.e. an even split).
    """

    def __init__(self, image_model, text_model):
        super().__init__()
        self.image_model = image_model
        self.text_model = text_model
        # Raw (pre-softmax) mixing weights; equal values give a 50/50 blend.
        self.image_weight = nn.Parameter(torch.tensor(0.5))
        self.text_weight = nn.Parameter(torch.tensor(0.5))

    def forward(self, images, input_ids, attention_mask):
        """Return ``(fused_logits, image_logits, text_logits, weights)``.

        ``weights`` is the softmax over the two mixing parameters, ordered
        ``[image, text]``.
        """
        # The sub-models are only evaluated here, never back-propagated through.
        with torch.no_grad():
            image_logits = self.image_model(images).squeeze(1)
            text_output = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
            text_logits = text_output.logits.squeeze(1)

        raw_weights = torch.stack([self.image_weight, self.text_weight])
        weights = torch.softmax(raw_weights, dim=0)
        image_share, text_share = weights[0], weights[1]
        fused_logits = image_share * image_logits + text_share * text_logits

        return fused_logits, image_logits, text_logits, weights

# Load the fusion model: use the local checkpoint when present, otherwise
# fetch it from the Hugging Face Hub.
model_path = "models/best_fusion_model.pt"
if not os.path.exists(model_path):
    model_path = hf_hub_download(repo_id="azzandr/gambling-fusion-model", filename="best_fusion_model.pt")

# NOTE(review): weights_only=False unpickles a full Python object, which can
# execute arbitrary code from the checkpoint file. Only load checkpoints from
# a trusted source.
fusion_model = torch.load(model_path, map_location=device, weights_only=False)

fusion_model.to(device)
fusion_model.eval()
print("Fusion model loaded successfully!")

# Load the image-only model (EfficientNet-B3 with a single-logit head) from a
# state_dict: local file when present, otherwise downloaded from HuggingFace.
image_model_path = "models/best_image_model_Adam_lr0.0001_bs32_state_dict.pt"
_image_weights_are_local = os.path.exists(image_model_path)
if not _image_weights_are_local:
    image_model_path = hf_hub_download(repo_id="azzandr/gambling-image-model",
                                       filename="best_image_model_Adam_lr0.0001_bs32_state_dict.pt")

image_only_model = models.efficientnet_b3(weights=models.EfficientNet_B3_Weights.DEFAULT)
# Replace the stock classifier with one linear layer producing a single logit.
num_features = image_only_model.classifier[1].in_features
image_only_model.classifier = nn.Linear(num_features, 1)
image_only_model.load_state_dict(torch.load(image_model_path, map_location=device))
image_only_model.to(device)
image_only_model.eval()

if _image_weights_are_local:
    print("Image-only model loaded from state_dict successfully!")
else:
    print("Image-only model loaded from HuggingFace successfully!")

# --- Functions ---
def clean_text(text):
    """Normalise OCR text for the text model.

    Steps, in order: drop URLs, turn newlines and every character other than
    ASCII letters / apostrophes into spaces, collapse whitespace, lowercase,
    drop words of one or two characters (except "di", "ke", "ya"), remove
    vowel-only runs, vowel-free runs and words of 20+ characters, and collapse
    whitespace again.

    Returns:
        The cleaned text, or an empty string when fewer than five words
        remain (in which case a notice is printed).
    """
    short_words_to_keep = {"di", "ke", "ya"}

    # ----- BASIC CLEANING -----
    basic_substitutions = (
        (r"http\S+", ""),        # remove URLs
        (r"\n", " "),            # newline -> space
        (r"[^a-zA-Z']", " "),    # keep only letters and apostrophes
    )
    for pattern, replacement in basic_substitutions:
        text = re.sub(pattern, replacement, text)
    # Collapse repeated whitespace, then lowercase.
    text = re.sub(r"\s{2,}", " ", text).strip().lower()

    # ----- FILTERING -----
    # Keep words longer than two letters, plus the listed short words.
    kept_words = []
    for word in text.split():
        if len(word) > 2 or word in short_words_to_keep:
            kept_words.append(word)
    text = ' '.join(kept_words)

    # ----- REMOVE UNWANTED PATTERNS -----
    unwanted_patterns = (
        r'\b[aeiou]+\b',       # all-vowel words (any length)
        r'\b[^aeiou\s]+\b',    # vowel-free runs (any length; also matches apostrophes)
        r'\b\w{20,}\b',        # very long words (20+ characters)
    )
    for pattern in unwanted_patterns:
        text = re.sub(pattern, '', text)
    text = re.sub(r'\s+', ' ', text).strip()  # tidy leftover whitespace

    # Too little text is treated as no text at all.
    word_count = len(text.split())
    if word_count < 5:
        print(f"Cleaned text too short ({word_count} words). Ignoring text.")
        return ""  # empty return to use image-only
    return text

def create_browser_context(playwright):
    """Launch Chromium and return a new browser context configured for scraping.

    The context uses a 1280x800 viewport, a desktop Chrome user agent,
    ignores HTTPS errors, keeps JavaScript enabled and bypasses CSP.
    """
    launch_args = [
        '--disable-features=IsolateOrigins,site-per-process',
        '--disable-web-security',
        '--disable-site-isolation-trials',
        '--disable-setuid-sandbox',
        '--no-sandbox',
        '--disable-gpu',
        '--disable-dev-shm-usage'
    ]
    request_headers = {
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Connection": "keep-alive",
        "DNT": "1"
    }

    browser = playwright.chromium.launch(args=launch_args)
    return browser.new_context(
        viewport={"width": 1280, "height": 800},
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
        ignore_https_errors=True,
        java_script_enabled=True,
        bypass_csp=True,
        extra_http_headers=request_headers
    )

def setup_request_interception(page):
    """Install a catch-all route on *page* that filters noisy requests.

    * Requests whose lower-cased URL contains any BLOCK_PATTERNS entry are
      aborted, except the top-level navigation itself. The patterns are
      plain substrings ("ads", "stats", "social", ...), so without this
      exemption the page being analysed would be blocked whenever its own
      URL happens to contain one of them.
    * Requests that are the result of a redirect are counted; once more than
      MAX_REDIRECTS have been seen, further redirected requests are aborted.
    * Everything else continues unchanged.

    Args:
        page: The Playwright page to attach the route handler to.
    """
    redirect_count = 0

    def is_top_level_navigation(request):
        # Best effort: if the frame cannot be inspected, treat the request as
        # an ordinary sub-resource so that normal blocking still applies.
        try:
            return request.is_navigation_request() and request.frame.parent_frame is None
        except Exception:
            return False

    def handle_request(route):
        nonlocal redirect_count
        request = route.request
        request_url = request.url.lower()

        # Block known ad/tracking patterns (never the main document itself)
        if any(pattern in request_url for pattern in BLOCK_PATTERNS):
            if not is_top_level_navigation(request):
                print(f"Blocking request to: {request.url}")
                route.abort()
                return

        # Handle redirects. Playwright exposes the previous hop as
        # `redirected_from` (None when the request was not redirected).
        # NOTE(review): Playwright's docs say a route handler is only called
        # for the first URL of a redirect chain, so this limit may rarely
        # trigger - confirm.
        if request.redirected_from is not None:
            redirect_count += 1
            if redirect_count > MAX_REDIRECTS:
                print(f"Too many redirects ({redirect_count}), aborting request")
                route.abort()
                return

        # Continue with the request
        route.continue_()

    page.route("**/*", handle_request)

def _navigate_and_capture(page, url, filepath):
    """Navigate *page* to *url* and write a screenshot to *filepath*.

    If navigation or the first capture fails but the page has left
    ``about:blank``, one more capture of the partial page is attempted.
    The original navigation error is re-raised when nothing could be saved.
    """
    try:
        print("Attempting to navigate to URL...")
        response = page.goto(
            url,
            wait_until="commit",  # return as soon as the navigation is committed
            timeout=PAGE_TIMEOUT
        )

        if not response:
            print("No response received, attempting to continue...")
        elif response.status >= 400:
            print(f"Received error status code: {response.status}")

        # Try to wait for the page to be more stable
        wait_for_page_stable(page)

        # Take screenshot even if page might not be fully loaded
        print("Taking screenshot...")
        page.screenshot(path=filepath)

    except Exception as nav_error:
        print(f"Navigation error: {nav_error}")
        # Try to take screenshot anyway if we have any content
        try:
            has_content = page.url != "about:blank"
            if has_content:
                print("Taking screenshot of partial page...")
                page.screenshot(path=filepath)
        except Exception as capture_error:
            # Report the navigation failure (the root cause), keeping the
            # capture failure as context. Using Exception rather than a bare
            # except lets KeyboardInterrupt / SystemExit propagate.
            raise nav_error from capture_error
        if not has_content:
            raise nav_error


def take_screenshot(url):
    """Capture a screenshot of *url* and return the path of the PNG file.

    The URL is normalised with ``ensure_http`` and the file is stored in
    SCREENSHOT_DIR under a name derived from the URL.

    Args:
        url: Address of the page to capture.

    Returns:
        Path of the saved screenshot, or ``None`` when the capture failed
        (the error and traceback are printed).
    """
    url = ensure_http(url)
    filename = sanitize_filename(url) + '.png'
    filepath = os.path.join(SCREENSHOT_DIR, filename)

    try:
        print(f"\n=== [START SCREENSHOT] URL: {url} ===")

        with sync_playwright() as p:
            print("Launching browser with custom configuration...")
            context = create_browser_context(p)
            try:
                # Page creation and route setup are inside the try so the
                # context is closed even if either of them fails.
                page = context.new_page()

                print("Setting up request interception...")
                setup_request_interception(page)

                _navigate_and_capture(page, url, filepath)
            finally:
                context.close()

        if os.path.exists(filepath):
            print(f"Screenshot saved successfully to {filepath}")
            return filepath
        else:
            raise Exception("Screenshot file was not created")

    except Exception as e:
        print(f"[ERROR] Failed to take screenshot for URL: {url}")
        print(f"Exception: {e}")
        traceback.print_exc()
        return None

def resize_if_needed(image_path, max_mb=1, target_width=720):
    """Shrink the image at *image_path* in place when the file is too large.

    Nothing happens unless the file is bigger than *max_mb* megabytes and
    the image is wider than *target_width* pixels. In that case the image is
    scaled to *target_width* (aspect ratio kept) and written back over the
    original file. Errors while resizing are printed, not raised.
    """
    size_mb = os.path.getsize(image_path) / (1024 * 1024)  # in MB
    if size_mb <= max_mb:
        return

    try:
        with Image.open(image_path) as img:
            width, height = img.size
            if width <= target_width:
                return

            scale = target_width / float(width)
            new_height = int(float(height) * float(scale))
            resized = img.resize((target_width, new_height), Image.Resampling.LANCZOS)
            resized.save(image_path, optimize=True, quality=85)
            print(f"Image resized to {target_width}x{new_height}")
    except Exception as e:
        print(f"Resize error: {e}")

def extract_text_from_image(image_path):
    """Run Tesseract OCR on the image at *image_path*.

    The file is first shrunk in place if it is larger than 1 MB, then read
    with the Indonesian language model (``lang='ind'``).

    Args:
        image_path: Path of the image file to read.

    Returns:
        The recognised text with surrounding whitespace stripped, or an
        empty string if anything goes wrong (the error is printed).
    """
    try:
        resize_if_needed(image_path, max_mb=1, target_width=720)

        # Use Tesseract OCR with Indonesian language. The context manager
        # closes the image file handle as soon as OCR is done.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image, lang='ind')
        print(f"OCR text extracted with Tesseract: {len(text)} characters")

        return text.strip()
    except Exception as e:
        print(f"Tesseract OCR error: {e}")
        return ""

def prepare_data_for_model(image_path, text):
    """Build the fusion model's inputs from a screenshot and its OCR text.

    Args:
        image_path: Path of the screenshot image.
        text: Raw OCR text; it is cleaned with ``clean_text`` before
            tokenisation.

    Returns:
        ``(image_tensor, input_ids, attention_mask)``, all moved to the
        module-level ``device``. The image tensor has a leading batch
        dimension of 1; the text is padded / truncated to 128 tokens.
    """
    # Close the file handle once the tensor is built; the transform starts
    # with an RGB conversion, so the pixel data is read inside this block.
    with Image.open(image_path) as image:
        image_tensor = transform(image).unsqueeze(0).to(device)

    clean_text_data = clean_text(text)
    encoding = tokenizer.encode_plus(
        clean_text_data,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    return image_tensor, input_ids, attention_mask

def predict_single_url(url):
    """Classify one website as "Gambling" or "Non-Gambling".

    The page is screenshotted and OCR'd. If the cleaned OCR text is usable,
    the fusion (image + text) model decides; otherwise the image-only model
    is used. ``clean_text`` returns an empty string for too-short text
    precisely so that this fallback is taken, hence the branch is on the
    cleaned text rather than on the raw OCR output.

    Args:
        url: Website address; ``https://`` is prepended when no scheme is
            given.

    Returns:
        A 5-tuple ``(label, confidence_text, screenshot_path, raw_text,
        cleaned_text)``. On failure ``label`` is an ``"Error: ..."`` message
        and the remaining four items are ``None``.
    """
    url = (url or "").strip()
    if not url:
        # Avoid launching a browser for an empty / missing input.
        return "Error: No URL provided", None, None, None, None

    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url

    screenshot_path = take_screenshot(url)
    if not screenshot_path:
        return f"Error: Failed to take screenshot for {url}", None, None, None, None

    raw_text = extract_text_from_image(screenshot_path)
    cleaned_text = clean_text(raw_text) if raw_text.strip() else ""

    # Probability above which a page is labelled as gambling.
    threshold = 0.6

    if not cleaned_text:  # no text, or too little usable text
        print(f"No OCR text found for {url}. Using Image-Only Model.")
        with Image.open(screenshot_path) as image:
            image_tensor = transform(image).unsqueeze(0).to(device)

        with torch.no_grad():
            image_logits = image_only_model(image_tensor).squeeze(1)
            gambling_prob = torch.sigmoid(image_logits)[0].item()

        is_gambling = gambling_prob > threshold
        label = "Gambling" if is_gambling else "Non-Gambling"
        # Confidence is the probability of whichever label was chosen.
        confidence = gambling_prob if is_gambling else 1 - gambling_prob
        print(f"[Image-Only] URL: {url}")
        print(f"Prediction: {label} | Confidence: {confidence:.2f}\n")
        return label, f"Confidence: {confidence:.2f}", screenshot_path, raw_text, cleaned_text

    # prepare_data_for_model cleans the raw text itself before tokenising.
    image_tensor, input_ids, attention_mask = prepare_data_for_model(screenshot_path, raw_text)

    with torch.no_grad():
        fused_logits, image_logits, text_logits, weights = fusion_model(image_tensor, input_ids, attention_mask)
        gambling_prob = torch.sigmoid(fused_logits)[0].item()
        image_prob = torch.sigmoid(image_logits)[0].item()
        text_prob = torch.sigmoid(text_logits)[0].item()

    is_gambling = gambling_prob > threshold
    label = "Gambling" if is_gambling else "Non-Gambling"
    confidence = gambling_prob if is_gambling else 1 - gambling_prob

    # Log the per-modality probabilities alongside the fused decision.
    print(f"[Fusion Model] URL: {url}")
    print(f"Image Model Prediction Probability: {image_prob:.2f}")
    print(f"Text Model Prediction Probability: {text_prob:.2f}")
    print(f"Fusion Final Prediction: {label} | Confidence: {confidence:.2f}\n")

    return label, f"Confidence: {confidence:.2f}", screenshot_path, raw_text, cleaned_text

def _read_uploaded_text(file_obj):
    """Return the text content of an uploaded file in any supported form.

    Accepts ``None`` (no upload), raw bytes, a filesystem path, an object
    whose ``name`` attribute is an existing file path, or a readable
    file-like object. Text is decoded as UTF-8 and a leading byte-order mark
    is dropped so it cannot corrupt the first URL.
    """
    if file_obj is None:
        return ""
    if isinstance(file_obj, (bytes, bytearray)):
        return bytes(file_obj).decode('utf-8-sig')
    if isinstance(file_obj, (str, os.PathLike)):
        with open(file_obj, 'r', encoding='utf-8-sig') as handle:
            return handle.read()

    # Prefer re-opening by path: an upload wrapper may already be closed or
    # positioned at end-of-file.
    path = getattr(file_obj, 'name', None)
    if isinstance(path, str) and os.path.isfile(path):
        with open(path, 'r', encoding='utf-8-sig') as handle:
            return handle.read()

    data = file_obj.read()
    if isinstance(data, (bytes, bytearray)):
        return bytes(data).decode('utf-8-sig')
    return data.lstrip('\ufeff')


def predict_batch_urls(file_obj):
    """Classify every URL listed in an uploaded text file.

    Args:
        file_obj: The upload containing one URL per line; blank lines are
            ignored. May be ``None``, bytes, a path, or a file-like object.

    Returns:
        A pandas DataFrame with the columns ``url``, ``label``,
        ``confidence``, ``screenshot_path``, ``raw_text`` and
        ``cleaned_text`` - one row per URL (empty when there are no URLs).
        A URL whose prediction raises gets an ``"Error: ..."`` label instead
        of aborting the whole batch.
    """
    columns = ["url", "label", "confidence", "screenshot_path", "raw_text", "cleaned_text"]

    content = _read_uploaded_text(file_obj)
    urls = [line.strip() for line in content.splitlines() if line.strip()]

    results = []
    for url in urls:
        try:
            outcome = predict_single_url(url)
        except Exception as exc:
            # Keep going: one bad URL should not discard the other results.
            print(f"[ERROR] Prediction failed for URL: {url}")
            traceback.print_exc()
            outcome = (f"Error: {exc}", None, None, None, None)
        results.append(dict(zip(columns, (url, *outcome))))

    df = pd.DataFrame(results, columns=columns)
    print(f"Batch prediction completed for {len(urls)} URLs.")
    return df

# --- Gradio App ---

# Two-tab Gradio UI: one tab classifies a single URL, the other a batch of
# URLs from an uploaded text file. Components appear in creation order.
with gr.Blocks() as app:
    gr.Markdown("# 🕵️ Gambling Website Detection (URL Based)")
    gr.Markdown("### Using Playwright & Tesseract OCR")

    with gr.Tab("Single URL"):
        url_input = gr.Textbox(label="Enter Website URL")
        predict_button = gr.Button("Predict")

        # Row 1: prediction (label + confidence) next to the screenshot.
        with gr.Row():
            with gr.Column():
                label_output = gr.Label()
                confidence_output = gr.Textbox(label="Confidence", interactive=False)

            with gr.Column():
                screenshot_output = gr.Image(label="Screenshot", type="filepath")

        # Row 2: OCR text before and after cleaning, side by side.
        with gr.Row():
            with gr.Column():
                raw_text_output = gr.Textbox(label="Raw OCR Text", lines=5)
            with gr.Column():
                cleaned_text_output = gr.Textbox(label="Cleaned Text", lines=5)

        # The outputs list matches the order of predict_single_url's 5-tuple.
        predict_button.click(
            fn=predict_single_url, 
            inputs=url_input, 
            outputs=[label_output, confidence_output, screenshot_output, raw_text_output, cleaned_text_output]
        )

    with gr.Tab("Batch URLs"):
        file_input = gr.File(label="Upload .txt file with URLs (one per line)")
        batch_predict_button = gr.Button("Batch Predict")
        batch_output = gr.DataFrame()

        # predict_batch_urls returns a DataFrame, shown directly in the table.
        batch_predict_button.click(fn=predict_batch_urls, inputs=file_input, outputs=batch_output)

# Script entry point: serve the UI on all network interfaces, port 7860.
if __name__ == "__main__":
    app.launch(server_name="0.0.0.0", server_port=7860)