Spaces:

azzandr
/

ID-gambling-website-detection

Running

Azzan Dwi Riski

initial commit

c329d21 22 days ago

11.9 kB

	import gradio as gr
	import os
	import re
	import time
	import torch
	import torch.nn as nn
	from PIL import Image
	import pytesseract
	from playwright.sync_api import sync_playwright
	import asyncio
	from transformers import AutoTokenizer
	from torchvision import transforms
	from torchvision import models
	from torchvision.transforms import functional as F
	import pandas as pd
	from huggingface_hub import hf_hub_download
	import warnings
	warnings.filterwarnings("ignore")
	from pathlib import Path

	# --- Setup ---

	# Compute device: CUDA when torch reports it as available, otherwise CPU.
	_device_name = "cuda" if torch.cuda.is_available() else "cpu"
	device = torch.device(_device_name)
	print(f"Using device: {device}")

	# Tokenizer loaded from the 'indobenchmark/indobert-base-p1' checkpoint;
	# used later to encode the cleaned OCR text.
	tokenizer = AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1')

	# Image transformation
	class ResizePadToSquare:
	    """Callable transform that fits an image into a padded square.

	    The image is converted to RGB, shrunk with ``thumbnail`` so that
	    neither side exceeds ``target_size``, and then padded with zeros up to
	    ``target_size`` x ``target_size``.
	    """

	    def __init__(self, target_size=300):
	        # Side length, in pixels, of the square output.
	        self.target_size = target_size

	    def __call__(self, img):
	        """Return *img* as an RGB image padded to a square."""
	        side = self.target_size
	        rgb = img.convert("RGB")
	        # thumbnail() works in place on the converted copy.
	        rgb.thumbnail((side, side), Image.BILINEAR)

	        width, height = rgb.size
	        extra_w = side - width
	        extra_h = side - height
	        left = extra_w // 2
	        top = extra_h // 2
	        # Border order is (left, top, right, bottom); when the missing
	        # amount is odd the extra pixel goes to the right/bottom side.
	        borders = (left, top, extra_w - left, extra_h - top)
	        return F.pad(rgb, borders, fill=0, padding_mode='constant')

	# Preprocessing applied to every screenshot before it reaches a model:
	# pad to a 300 px square, convert to a tensor, normalize each channel.
	_NORMALIZE_MEAN = [0.485, 0.456, 0.406]
	_NORMALIZE_STD = [0.229, 0.224, 0.225]
	transform = transforms.Compose([
	    ResizePadToSquare(300),
	    transforms.ToTensor(),
	    transforms.Normalize(mean=_NORMALIZE_MEAN, std=_NORMALIZE_STD),
	])

	# Folder that receives page screenshots; created at startup if missing.
	SCREENSHOT_DIR = "screenshots"
	Path(SCREENSHOT_DIR).mkdir(parents=True, exist_ok=True)

	# Tell pytesseract where the Tesseract binary lives (path inside the
	# Docker image). The OCR language itself is chosen at call time.
	pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'
	print("Tesseract OCR initialized.")

	# --- Model ---

	class LateFusionModel(nn.Module):
	    """Combine an image model and a text model by weighting their logits.

	    Each branch produces one logit per sample; the two logits are mixed
	    with softmax-normalized weights derived from ``image_weight`` and
	    ``text_weight``.
	    """

	    def __init__(self, image_model, text_model):
	        super().__init__()
	        self.image_model = image_model
	        self.text_model = text_model
	        # Raw (pre-softmax) mixing weights; both start at 0.5, which the
	        # softmax in forward() turns into an even 50/50 split.
	        self.image_weight = nn.Parameter(torch.tensor(0.5))
	        self.text_weight = nn.Parameter(torch.tensor(0.5))

	    def forward(self, images, input_ids, attention_mask):
	        """Return ``(fused_logits, image_logits, text_logits, weights)``."""
	        # Both branches run without gradient tracking.
	        # NOTE(review): the original indentation was lost, so the exact
	        # extent of this no_grad block is assumed to be the two branch
	        # calls only - confirm against the training code.
	        with torch.no_grad():
	            image_logits = self.image_model(images).squeeze(1)
	            text_output = self.text_model(
	                input_ids=input_ids,
	                attention_mask=attention_mask,
	            )
	            text_logits = text_output.logits.squeeze(1)

	        raw_weights = torch.stack([self.image_weight, self.text_weight])
	        weights = torch.softmax(raw_weights, dim=0)
	        image_share, text_share = weights[0], weights[1]
	        fused_logits = image_share * image_logits + text_share * text_logits

	        return fused_logits, image_logits, text_logits, weights

	# Load the fusion model: use the local checkpoint when it exists,
	# otherwise download the same file from the Hugging Face Hub.
	model_path = "models/best_fusion_model.pt"
	if not os.path.exists(model_path):
	    model_path = hf_hub_download(repo_id="azzandr/gambling-fusion-model", filename="best_fusion_model.pt")

	# NOTE(review): weights_only=False unpickles a full Python object, which
	# can execute arbitrary code from the checkpoint file. Only load
	# checkpoints from a source you control.
	fusion_model = torch.load(model_path, map_location=device, weights_only=False)

	fusion_model.to(device)
	fusion_model.eval()
	print("Fusion model loaded successfully!")

	# Load Image-Only Model
	# The checkpoint is a state_dict: use the local file when present,
	# otherwise download it from the Hugging Face Hub.
	image_model_path = "models/best_image_model_Adam_lr0.0001_bs32_state_dict.pt"
	_image_model_source = "state_dict"
	if not os.path.exists(image_model_path):
	    image_model_path = hf_hub_download(repo_id="azzandr/gambling-fusion-model",
	                                       filename="best_image_model_Adam_lr0.0001_bs32_state_dict.pt")
	    _image_model_source = "HuggingFace"

	# weights=None builds the architecture without fetching pretrained
	# weights: load_state_dict() below replaces every parameter anyway, so
	# requesting the default weights only added a startup download.
	image_only_model = models.efficientnet_b3(weights=None)
	num_features = image_only_model.classifier[1].in_features
	# Swap the stock classifier head for a single-logit linear layer so the
	# module's keys line up with the saved state_dict.
	image_only_model.classifier = nn.Linear(num_features, 1)
	image_only_model.load_state_dict(torch.load(image_model_path, map_location=device))
	image_only_model.to(device)
	image_only_model.eval()
	print(f"Image-only model loaded from {_image_model_source} successfully!")

	# --- Functions ---
	def clean_text(text):
	    """Normalize OCR text and return it, or "" when too little remains.

	    Steps: strip URLs, keep only ASCII letters and apostrophes, lowercase,
	    drop words of two letters or fewer (except "di", "ke", "ya"), then
	    remove tokens made only of vowels, tokens containing no vowels, and
	    tokens of 20 or more characters. If fewer than five words are left,
	    a message is printed and an empty string is returned.
	    """
	    short_words_to_keep = {
	        "di", "ke", "ya"
	    }

	    # ----- BASIC CLEANING -----
	    basic_rules = (
	        (r"http\S+", ""),        # remove URLs
	        (r"\n", " "),            # newlines become spaces
	        (r"[^a-zA-Z']", " "),    # keep only letters and apostrophes
	        (r"\s{2,}", " "),        # collapse repeated whitespace
	    )
	    for pattern, replacement in basic_rules:
	        text = re.sub(pattern, replacement, text)
	    text = text.strip().lower()

	    # ----- FILTERING -----
	    kept_words = []
	    for word in text.split():
	        # Skip short words unless they are explicitly whitelisted.
	        if len(word) <= 2 and word not in short_words_to_keep:
	            continue
	        kept_words.append(word)
	    text = ' '.join(kept_words)

	    # ----- REMOVE UNWANTED PATTERNS -----
	    noise_patterns = (
	        r'\b[aeiou]+\b',       # words that are all vowels
	        r'\b[^aeiou\s]+\b',    # words with no vowels at all
	        r'\b\w{20,}\b',        # very long words (20+ characters)
	    )
	    for pattern in noise_patterns:
	        text = re.sub(pattern, '', text)
	    text = re.sub(r'\s+', ' ', text).strip()  # tidy leftover whitespace

	    # Too few words: signal the caller by returning an empty string.
	    word_count = len(text.split())
	    if word_count < 5:
	        print(f"Cleaned text too short ({word_count} words). Ignoring text.")
	        return ""
	    return text

	def _screenshot_filename(url, max_stem_len=150):
	    """Build a filesystem-safe PNG file name for *url*."""
	    import hashlib  # local import: only needed for over-long URLs

	    stem = url.replace('https://', '').replace('http://', '')
	    # Replace everything except letters, digits, '_' and '-' so query
	    # strings, ports and similar never end up in the file name. For URLs
	    # made only of those characters plus '/' and '.', the result is the
	    # same name the previous implementation produced.
	    stem = re.sub(r'[^A-Za-z0-9_-]', '_', stem)
	    if len(stem) > max_stem_len:
	        # Keep the name within file-system limits; the digest keeps
	        # distinct long URLs from colliding after truncation.
	        digest = hashlib.sha256(url.encode('utf-8')).hexdigest()[:12]
	        stem = f"{stem[:max_stem_len]}_{digest}"
	    return stem + '.png'


	def take_screenshot(url):
	    """Render *url* in headless Chromium and save a full-page screenshot.

	    Args:
	        url: Absolute http(s) URL to open.

	    Returns:
	        Path of the saved PNG inside ``SCREENSHOT_DIR``, or ``None`` when
	        anything goes wrong (the error is printed, not raised).
	    """
	    # NOTE(review): url comes straight from the user, so the browser will
	    # fetch whatever host it names, including internal addresses.
	    # Consider restricting allowed hosts before exposing this publicly.
	    filepath = os.path.join(SCREENSHOT_DIR, _screenshot_filename(url))

	    try:
	        print(f"Taking screenshot with Playwright for URL: {url}")
	        with sync_playwright() as p:
	            browser = p.chromium.launch()
	            try:
	                page = browser.new_page(viewport={"width": 1280, "height": 800})

	                # 60 seconds timeout for page operations
	                page.set_default_timeout(60000)

	                # Navigate and wait until the network has gone quiet
	                page.goto(url, wait_until="networkidle", timeout=60000)

	                # Give dynamic content a little extra time to render
	                page.wait_for_timeout(3000)

	                # Capture the whole page, not just the viewport
	                page.screenshot(path=filepath, full_page=True)
	            finally:
	                # Close the browser even when navigation or capture fails.
	                browser.close()

	        print(f"Screenshot taken for URL: {url}")
	        return filepath
	    except Exception as e:
	        print(f"Error taking screenshot with Playwright: {e}")
	        return None

	def resize_if_needed(image_path, max_mb=1, target_width=720):
	    """Shrink the image at *image_path* in place when the file is too big.

	    Nothing happens unless the file is larger than ``max_mb`` megabytes
	    and the image is wider than ``target_width``. In that case the image
	    is scaled to ``target_width`` (height follows proportionally) and
	    written back to the same path. Errors during resizing are printed
	    and otherwise ignored. Returns ``None``.
	    """
	    size_mb = os.path.getsize(image_path) / (1024 * 1024)  # size in MB
	    if size_mb <= max_mb:
	        return

	    try:
	        with Image.open(image_path) as img:
	            width, height = img.size
	            if width <= target_width:
	                return
	            ratio = target_width / float(width)
	            new_height = int((float(height) * float(ratio)))
	            resized = img.resize((target_width, new_height), Image.Resampling.LANCZOS)
	            # Overwrite the original file with the smaller version.
	            resized.save(image_path, optimize=True, quality=85)
	            print(f"Image resized to {target_width}x{new_height}")
	    except Exception as e:
	        print(f"Resize error: {e}")

	def extract_text_from_image(image_path):
	    """Run Tesseract OCR on the image at *image_path*.

	    The file is first passed through ``resize_if_needed`` (1 MB limit,
	    720 px target width), then read with the Indonesian language pack.

	    Returns:
	        The recognized text with surrounding whitespace stripped, or ""
	        when resizing or OCR raises (the error is printed).
	    """
	    try:
	        resize_if_needed(image_path, max_mb=1, target_width=720)

	        # Open the image in a context manager so the file handle is
	        # released as soon as OCR has finished.
	        with Image.open(image_path) as img:
	            text = pytesseract.image_to_string(img, lang='ind')
	        print(f"OCR text extracted with Tesseract: {len(text)} characters")

	        return text.strip()
	    except Exception as e:
	        print(f"Tesseract OCR error: {e}")
	        return ""

	def prepare_data_for_model(image_path, text):
	    """Build the fusion model's inputs from a screenshot and its OCR text.

	    Args:
	        image_path: Path of the screenshot to load.
	        text: Raw OCR text; it is passed through ``clean_text`` here.

	    Returns:
	        ``(image_tensor, input_ids, attention_mask)``, all moved to
	        ``device``. The image tensor gets a leading batch dimension via
	        ``unsqueeze(0)``; the text is padded/truncated to 128 tokens.
	    """
	    # Context manager closes the file handle once the tensor is built.
	    with Image.open(image_path) as image:
	        image_tensor = transform(image).unsqueeze(0).to(device)

	    clean_text_data = clean_text(text)
	    encoding = tokenizer.encode_plus(
	        clean_text_data,
	        add_special_tokens=True,
	        max_length=128,
	        padding='max_length',
	        truncation=True,
	        return_tensors='pt'
	    )

	    input_ids = encoding['input_ids'].to(device)
	    attention_mask = encoding['attention_mask'].to(device)

	    return image_tensor, input_ids, attention_mask

	def _label_and_confidence(prob, threshold):
	    """Map a scalar probability tensor to ``(label, confidence)``."""
	    is_gambling = bool(prob > threshold)
	    label = "Gambling" if is_gambling else "Non-Gambling"
	    value = prob.item()
	    # Confidence always refers to the predicted class.
	    confidence = value if is_gambling else 1 - value
	    return label, confidence


	def predict_single_url(url, threshold=0.6):
	    """Classify one website as "Gambling" or "Non-Gambling".

	    The page is screenshotted and OCR'd. When the OCR text is empty, or
	    ``clean_text`` rejects it as too short, only the image model is used;
	    otherwise the fusion model combines image and text.

	    Args:
	        url: Website address; "https://" is prepended when no scheme is
	            given.
	        threshold: Probability above which a page is labelled "Gambling".

	    Returns:
	        ``(label, "Confidence: x.xx")`` on success, or
	        ``("Error: ...", None)`` when no URL was given or the screenshot
	        could not be taken.
	    """
	    url = (url or "").strip()
	    if not url:
	        return "Error: No URL provided", None
	    if not url.startswith(('http://', 'https://')):
	        url = 'https://' + url

	    screenshot_path = take_screenshot(url)
	    if not screenshot_path:
	        return f"Error: Failed to take screenshot for {url}", None

	    text = extract_text_from_image(screenshot_path)

	    # clean_text() returns "" when too little usable text is left; treat
	    # that exactly like "no text" so the fusion model is never fed an
	    # empty string.
	    if not text.strip() or not clean_text(text):
	        print(f"No OCR text found for {url}. Using Image-Only Model.")
	        with Image.open(screenshot_path) as image:
	            image_tensor = transform(image).unsqueeze(0).to(device)

	        with torch.no_grad():
	            image_logits = image_only_model(image_tensor).squeeze(1)
	            image_probs = torch.sigmoid(image_logits)

	        label, confidence = _label_and_confidence(image_probs[0], threshold)
	        print(f"[Image-Only] URL: {url}")
	        print(f"Prediction: {label} | Confidence: {confidence:.2f}\n")
	        return label, f"Confidence: {confidence:.2f}"

	    image_tensor, input_ids, attention_mask = prepare_data_for_model(screenshot_path, text)

	    with torch.no_grad():
	        fused_logits, image_logits, text_logits, weights = fusion_model(image_tensor, input_ids, attention_mask)
	        fused_probs = torch.sigmoid(fused_logits)
	        image_probs = torch.sigmoid(image_logits)
	        text_probs = torch.sigmoid(text_logits)

	    label, confidence = _label_and_confidence(fused_probs[0], threshold)

	    # Log the per-branch probabilities alongside the fused decision
	    print(f"[Fusion Model] URL: {url}")
	    print(f"Image Model Prediction Probability: {image_probs[0].item():.2f}")
	    print(f"Text Model Prediction Probability: {text_probs[0].item():.2f}")
	    print(f"Fusion Final Prediction: {label} | Confidence: {confidence:.2f}\n")

	    return label, f"Confidence: {confidence:.2f}"

	def _read_uploaded_text(file_obj):
	    """Return the text content of an uploaded file in any supported form."""
	    if file_obj is None:
	        return ""
	    if isinstance(file_obj, (str, os.PathLike)):
	        # A plain path (str or PathLike) to the uploaded file.
	        with open(file_obj, "rb") as handle:
	            data = handle.read()
	    elif isinstance(file_obj, (bytes, bytearray)):
	        data = bytes(file_obj)
	    elif hasattr(file_obj, "read"):
	        data = file_obj.read()
	    else:
	        # Last resort: an object that only exposes the path as .name.
	        with open(file_obj.name, "rb") as handle:
	            data = handle.read()
	    if isinstance(data, str):
	        return data
	    # utf-8-sig also accepts plain UTF-8 and drops a leading BOM, which
	    # would otherwise become part of the first URL.
	    return data.decode("utf-8-sig")


	def predict_batch_urls(file_obj):
	    """Classify every URL listed in an uploaded text file.

	    Args:
	        file_obj: The upload: a binary or text file-like object, a path,
	            raw bytes, or ``None`` when nothing was uploaded. One URL per
	            line; blank lines are skipped.

	    Returns:
	        A DataFrame with columns ``url``, ``label`` and ``confidence``,
	        one row per URL (empty, but with those columns, when there are no
	        URLs).
	    """
	    content = _read_uploaded_text(file_obj)
	    urls = [line.strip() for line in content.splitlines() if line.strip()]

	    results = []
	    for url in urls:
	        label, confidence = predict_single_url(url)
	        results.append({"url": url, "label": label, "confidence": confidence})

	    # Explicit columns keep the table shape stable even with zero rows.
	    df = pd.DataFrame(results, columns=["url", "label", "confidence"])
	    print(f"Batch prediction completed for {len(urls)} URLs.")
	    return df

	# --- Gradio App ---

	with gr.Blocks() as app:
	    # Page header
	    gr.Markdown("# 🕵️ Gambling Website Detection (URL Based)")
	    gr.Markdown("### Using Playwright & Tesseract OCR")

	    # Tab 1: classify a single URL typed into a textbox.
	    with gr.Tab("Single URL"):
	        url_input = gr.Textbox(label="Enter Website URL")
	        predict_button = gr.Button("Predict")
	        label_output = gr.Label()
	        confidence_output = gr.Textbox(label="Confidence", interactive=False)

	        # predict_single_url returns (label, confidence text), which
	        # map onto the two output components in this order.
	        predict_button.click(
	            predict_single_url,
	            url_input,
	            [label_output, confidence_output],
	        )

	    # Tab 2: classify every URL in an uploaded text file.
	    with gr.Tab("Batch URLs"):
	        file_input = gr.File(label="Upload .txt file with URLs (one per line)")
	        batch_predict_button = gr.Button("Batch Predict")
	        batch_output = gr.DataFrame()

	        batch_predict_button.click(
	            predict_batch_urls,
	            file_input,
	            batch_output,
	        )

	if __name__ == "__main__":
	    # Listen on all interfaces on port 7860.
	    _launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
	    app.launch(**_launch_options)