Azzan Dwi Riski committed on
Commit
c329d21
·
1 Parent(s): 50c7c91

initial commit

Browse files
Files changed (3) hide show
  1. Dockerfile +44 -0
  2. app.py +306 -0
  3. requirements +11 -0
Dockerfile ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Pin the Debian release so apt package names stay stable when the
# python:3.10-slim tag moves to a newer distribution.
FROM python:3.10-slim-bookworm

# Install essential packages.
# libgl1 replaces libgl1-mesa-glx, a transitional package that current
# Debian releases no longer ship.
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    software-properties-common \
    git \
    tesseract-ocr \
    tesseract-ocr-ind \
    libgl1 \
    libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*

# NOTE: no separate Node.js install. Playwright for Python ships its own
# driver, so piping a remote setup script into bash is not needed.

# Set up working directory
WORKDIR /app

# Install Python dependencies.
# The dependency list is committed as "requirements" (no extension); copy it
# under the name that pip is given on the next line.
COPY requirements /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Install Chromium and the system libraries it needs. The playwright package
# itself is already installed from the requirements list above.
RUN playwright install --with-deps chromium \
    && rm -rf /var/lib/apt/lists/*

# Copy application code
COPY . /app/

# Create directories for screenshots and models
RUN mkdir -p screenshots models

# Make sure the app runs at port 7860 (Gradio default)
EXPOSE 7860

# Start the application
CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,306 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import re
4
+ import time
5
+ import torch
6
+ import torch.nn as nn
7
+ from PIL import Image
8
+ import pytesseract
9
+ from playwright.sync_api import sync_playwright
10
+ import asyncio
11
+ from transformers import AutoTokenizer
12
+ from torchvision import transforms
13
+ from torchvision import models
14
+ from torchvision.transforms import functional as F
15
+ import pandas as pd
16
+ from huggingface_hub import hf_hub_download
17
+ import warnings
18
+ warnings.filterwarnings("ignore")
19
+ from pathlib import Path
20
+
21
+ # --- Setup ---
22
+
23
# Device setup: use the GPU when torch can see one, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Tokenizer used to encode the OCR text for the text branch of the model.
tokenizer = AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
29
+
30
+ # Image transformation
31
class ResizePadToSquare:
    """Shrink an image to fit a square and pad the remainder with black.

    The image is converted to RGB, scaled down (never up) so that it fits
    inside ``target_size`` x ``target_size`` while keeping its aspect ratio,
    then padded with zeros so the result is exactly square.
    """

    def __init__(self, target_size=300):
        self.target_size = target_size

    def __call__(self, img):
        side = self.target_size
        # convert() returns a new image, so the in-place thumbnail() below
        # does not modify the caller's object.
        rgb = img.convert("RGB")
        rgb.thumbnail((side, side), Image.BILINEAR)

        width, height = rgb.size
        gap_w = side - width
        gap_h = side - height
        # (left, top, right, bottom); any odd pixel goes to right/bottom.
        left, top = gap_w // 2, gap_h // 2
        borders = (left, top, gap_w - left, gap_h - top)
        return F.pad(rgb, borders, fill=0, padding_mode='constant')
43
+
44
# Per-channel statistics applied after ToTensor().
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]

# Image preprocessing pipeline: square 300x300 RGB -> tensor -> normalized.
transform = transforms.Compose([
    ResizePadToSquare(300),
    transforms.ToTensor(),
    transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
])

# Folder where page screenshots are written; created at import time.
SCREENSHOT_DIR = "screenshots"
os.makedirs(SCREENSHOT_DIR, exist_ok=True)

# Point pytesseract at the tesseract binary (its location inside the Docker image).
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'
print("Tesseract OCR initialized.")
58
+
59
+ # --- Model ---
60
+
61
class LateFusionModel(nn.Module):
    """Combine an image classifier and a text classifier at the logit level.

    Both sub-models are run without gradient tracking. Their logits are
    mixed with two learnable scalars that are passed through a softmax, so
    the mixing weights are positive and sum to one.

    NOTE: the class name and attribute names must stay as they are; the
    fusion checkpoint is loaded elsewhere in this file as a pickled module.
    """

    def __init__(self, image_model, text_model):
        super().__init__()
        self.image_model = image_model
        self.text_model = text_model
        # Raw (pre-softmax) mixing scores; equal values mean a 50/50 blend.
        self.image_weight = nn.Parameter(torch.tensor(0.5))
        self.text_weight = nn.Parameter(torch.tensor(0.5))

    def forward(self, images, input_ids, attention_mask):
        """Return ``(fused_logits, image_logits, text_logits, weights)``."""
        with torch.no_grad():
            image_out = self.image_model(images)
            image_logits = image_out.squeeze(1)
            text_out = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
            text_logits = text_out.logits.squeeze(1)

        raw_scores = torch.stack([self.image_weight, self.text_weight])
        weights = torch.softmax(raw_scores, dim=0)
        image_share, text_share = weights[0], weights[1]
        fused_logits = image_share * image_logits + text_share * text_logits

        return fused_logits, image_logits, text_logits, weights
78
+
79
+ # Load model
80
MODEL_REPO_ID = "azzandr/gambling-fusion-model"


def _resolve_model_file(local_path, hub_filename):
    """Return ``(path, found_locally)`` for a checkpoint file.

    Uses ``local_path`` when it exists; otherwise downloads ``hub_filename``
    from ``MODEL_REPO_ID`` on the Hugging Face Hub.
    """
    if os.path.exists(local_path):
        return local_path, True
    return hf_hub_download(repo_id=MODEL_REPO_ID, filename=hub_filename), False


# --- Fusion model ---
# NOTE(security): this checkpoint is a fully pickled nn.Module, which is why
# weights_only=False is needed. Unpickling can execute arbitrary code, so the
# file must only ever come from a trusted source.
model_path, _ = _resolve_model_file("models/best_fusion_model.pt", "best_fusion_model.pt")
fusion_model = torch.load(model_path, map_location=device, weights_only=False)

fusion_model.to(device)
fusion_model.eval()
print("Fusion model loaded successfully!")

# --- Image-only model (loaded from a state_dict) ---
_IMAGE_STATE_DICT_NAME = "best_image_model_Adam_lr0.0001_bs32_state_dict.pt"
image_model_path, _image_found_locally = _resolve_model_file(
    "models/" + _IMAGE_STATE_DICT_NAME, _IMAGE_STATE_DICT_NAME
)

# weights=None: every parameter and buffer is overwritten by load_state_dict
# below (strict loading), so fetching pretrained weights first would only add
# a needless download at start-up.
image_only_model = models.efficientnet_b3(weights=None)
num_features = image_only_model.classifier[1].in_features
image_only_model.classifier = nn.Linear(num_features, 1)
image_only_model.load_state_dict(torch.load(image_model_path, map_location=device))
image_only_model.to(device)
image_only_model.eval()
if _image_found_locally:
    print("Image-only model loaded from state_dict successfully!")
else:
    print("Image-only model loaded from HuggingFace successfully!")
113
+
114
+ # --- Functions ---
115
def clean_text(text):
    """Normalize OCR text before tokenization.

    Steps: drop URLs, keep only letters and apostrophes, lowercase, drop
    words of two letters or fewer (except a small allow-list), then drop
    vowel-only words, consonant-only words and words of 20+ characters.

    Args:
        text: Raw text; ``None`` is treated as an empty string.

    Returns:
        The cleaned text, or ``""`` when fewer than five words remain
        (callers use the empty string as the signal to rely on the image only).
    """
    if text is None:
        text = ""

    # Short words that are kept despite the length filter.
    exceptions = {"di", "ke", "ya"}

    # ----- BASIC CLEANING -----
    text = re.sub(r"http\S+", "", text)        # remove URLs
    text = re.sub(r"\n", " ", text)            # newlines -> spaces
    text = re.sub(r"[^a-zA-Z']", " ", text)    # keep letters and apostrophes only
    text = re.sub(r"\s{2,}", " ", text).strip().lower()  # collapse spaces, lowercase

    # ----- FILTERING -----
    # Keep words longer than two letters, or listed in the exceptions.
    text = ' '.join(w for w in text.split() if len(w) > 2 or w in exceptions)

    # ----- REMOVE UNWANTED PATTERNS -----
    text = re.sub(r'\b[aeiou]+\b', '', text)       # vowel-only words (any length)
    text = re.sub(r'\b[^aeiou\s]+\b', '', text)    # consonant-only words (any length)
    text = re.sub(r'\b\w{20,}\b', '', text)        # very long words (>= 20 letters)
    text = re.sub(r'\s+', ' ', text).strip()       # tidy leftover whitespace

    # Too little text is treated as no text at all.
    word_count = len(text.split())
    if word_count < 5:
        print(f"Cleaned text too short ({word_count} words). Ignoring text.")
        return ""
    return text
144
+
145
def take_screenshot(url):
    """Capture a full-page screenshot of ``url`` with headless Chromium.

    Args:
        url: Absolute http(s) URL to open.

    Returns:
        Path of the PNG written under ``SCREENSHOT_DIR``, or ``None`` when
        navigation or capture fails (the error is printed).
    """
    import hashlib  # local import: only needed to shorten over-long names

    # Build a filesystem-safe file name. '/' and '.' become '_' as before;
    # query characters such as '?', '&', ':' and '%' are replaced as well.
    stem = url.replace('https://', '').replace('http://', '')
    stem = re.sub(r'[^A-Za-z0-9_-]', '_', stem)
    if len(stem) > 150:
        # Long URLs would exceed the file-name length limit; truncate and add
        # a short hash of the full URL so different URLs stay distinct.
        digest = hashlib.sha1(url.encode('utf-8')).hexdigest()[:10]
        stem = f"{stem[:150]}_{digest}"
    filepath = os.path.join(SCREENSHOT_DIR, stem + '.png')

    try:
        print(f"Taking screenshot with Playwright for URL: {url}")
        with sync_playwright() as p:
            browser = p.chromium.launch()
            try:
                page = browser.new_page(viewport={"width": 1280, "height": 800})

                # 60 seconds for every page operation, including navigation.
                page.set_default_timeout(60000)
                page.goto(url, wait_until="networkidle", timeout=60000)

                # Give late-loading dynamic content a moment to render.
                page.wait_for_timeout(3000)

                page.screenshot(path=filepath, full_page=True)
            finally:
                # Close the browser even when navigation or capture raised.
                browser.close()

        print(f"Screenshot taken for URL: {url}")
        return filepath
    except Exception as e:
        # Best effort: callers treat None as "could not capture".
        print(f"Error taking screenshot with Playwright: {e}")
        return None
173
+
174
def resize_if_needed(image_path, max_mb=1, target_width=720):
    """Downscale the image file in place when it is larger than ``max_mb``.

    Files at or below the size limit, and images no wider than
    ``target_width``, are left untouched. Errors while opening, resizing or
    saving are printed and not raised.
    """
    size_mb = os.path.getsize(image_path) / (1024 * 1024)  # size in MB
    if size_mb <= max_mb:
        return

    try:
        with Image.open(image_path) as img:
            width, height = img.size
            if width <= target_width:
                return
            # Keep the aspect ratio when scaling down to target_width.
            scale = target_width / float(width)
            new_height = int(float(height) * float(scale))
            resized = img.resize((target_width, new_height), Image.Resampling.LANCZOS)
            resized.save(image_path, optimize=True, quality=85)
            print(f"Image resized to {target_width}x{new_height}")
    except Exception as e:
        print(f"Resize error: {e}")
188
+
189
def extract_text_from_image(image_path):
    """Run Tesseract OCR (Indonesian language data) on an image file.

    The file is first downscaled in place if it is larger than 1 MB.

    Returns:
        The recognized text with surrounding whitespace stripped, or ``""``
        when anything fails (the error is printed).
    """
    try:
        resize_if_needed(image_path, max_mb=1, target_width=720)

        # Open via a context manager so the file handle is released after OCR.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image, lang='ind')
        print(f"OCR text extracted with Tesseract: {len(text)} characters")

        return text.strip()
    except Exception as e:
        # Best effort: an empty string makes callers fall back to the image.
        print(f"Tesseract OCR error: {e}")
        return ""
201
+
202
def prepare_data_for_model(image_path, text):
    """Build the fusion-model inputs for one screenshot and its OCR text.

    Args:
        image_path: Path of the screenshot image.
        text: Raw OCR text; it is passed through ``clean_text`` here.

    Returns:
        ``(image_tensor, input_ids, attention_mask)``, each with a leading
        batch dimension of 1 and moved to ``device``.
    """
    # Context manager so the image file handle is closed once transformed.
    with Image.open(image_path) as image:
        image_tensor = transform(image).unsqueeze(0).to(device)

    clean_text_data = clean_text(text)
    # Calling the tokenizer directly is the current API for what
    # encode_plus did; the arguments and the returned fields are the same.
    encoding = tokenizer(
        clean_text_data,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    return image_tensor, input_ids, attention_mask
220
+
221
def _label_and_confidence(prob, threshold):
    """Map a gambling probability (0-d tensor) to ``(label, confidence)``."""
    is_gambling = prob > threshold
    label = "Gambling" if is_gambling else "Non-Gambling"
    # Confidence is the probability of whichever class was chosen.
    confidence = prob.item() if is_gambling else 1 - prob.item()
    return label, confidence


def predict_single_url(url):
    """Classify one website as "Gambling" or "Non-Gambling".

    The page is screenshotted and OCR'd. When the OCR text is unusable
    (``clean_text`` returns an empty string) the image-only model decides;
    otherwise the fusion model is used.

    Args:
        url: Website address; ``https://`` is prepended when no scheme is given.

    Returns:
        ``(label, "Confidence: 0.xx")`` on success, or
        ``("Error: ...", None)`` when the URL is blank or the screenshot fails.
    """
    url = (url or "").strip()
    if not url:
        return "Error: No URL provided", None
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url

    screenshot_path = take_screenshot(url)
    if not screenshot_path:
        return f"Error: Failed to take screenshot for {url}", None

    text = extract_text_from_image(screenshot_path)
    threshold = 0.6

    # Decide on the CLEANED text: clean_text returns "" when too little usable
    # text remains, and that case must use the image-only model instead of
    # feeding the fusion model an empty string.
    if not clean_text(text):
        print(f"No OCR text found for {url}. Using Image-Only Model.")
        with Image.open(screenshot_path) as image:
            image_tensor = transform(image).unsqueeze(0).to(device)

        with torch.no_grad():
            image_logits = image_only_model(image_tensor).squeeze(1)
            image_probs = torch.sigmoid(image_logits)

        label, confidence = _label_and_confidence(image_probs[0], threshold)
        print(f"[Image-Only] URL: {url}")
        print(f"Prediction: {label} | Confidence: {confidence:.2f}\n")
        return label, f"Confidence: {confidence:.2f}"

    image_tensor, input_ids, attention_mask = prepare_data_for_model(screenshot_path, text)

    with torch.no_grad():
        fused_logits, image_logits, text_logits, weights = fusion_model(image_tensor, input_ids, attention_mask)
        fused_probs = torch.sigmoid(fused_logits)
        image_probs = torch.sigmoid(image_logits)
        text_probs = torch.sigmoid(text_logits)

    label, confidence = _label_and_confidence(fused_probs[0], threshold)

    # Log the per-branch probabilities alongside the fused decision.
    print(f"[Fusion Model] URL: {url}")
    print(f"Image Model Prediction Probability: {image_probs[0]:.2f}")
    print(f"Text Model Prediction Probability: {text_probs[0]:.2f}")
    print(f"Fusion Final Prediction: {label} | Confidence: {confidence:.2f}\n")

    return label, f"Confidence: {confidence:.2f}"
271
+
272
def _read_uploaded_text(file_obj):
    """Return the text content of an uploaded file in any form Gradio passes it.

    Accepts ``None``, raw bytes, a file path (str / PathLike), an object with
    a ``name`` attribute pointing at an existing file, or a file-like object
    with ``read()``. ``utf-8-sig`` is used so a leading BOM is dropped.
    """
    if file_obj is None:
        return ""
    if isinstance(file_obj, (bytes, bytearray)):
        return bytes(file_obj).decode('utf-8-sig')

    if isinstance(file_obj, (str, os.PathLike)):
        path = file_obj
    else:
        name = getattr(file_obj, "name", None)
        path = name if isinstance(name, str) and os.path.isfile(name) else None

    if path is not None:
        with open(path, encoding='utf-8-sig') as handle:
            return handle.read()

    data = file_obj.read()
    if isinstance(data, (bytes, bytearray)):
        return bytes(data).decode('utf-8-sig')
    return data


def predict_batch_urls(file_obj):
    """Classify every URL listed (one per line) in an uploaded text file.

    Args:
        file_obj: The upload as provided by the file component: a path
            string, a temp-file wrapper, a file-like object, or ``None``.

    Returns:
        A DataFrame with columns ``url``, ``label`` and ``confidence``; it is
        empty when nothing was uploaded or the file has no non-blank lines.
    """
    content = _read_uploaded_text(file_obj)
    urls = [line.strip() for line in content.splitlines() if line.strip()]

    results = []
    for url in urls:
        label, confidence = predict_single_url(url)
        results.append({"url": url, "label": label, "confidence": confidence})

    # Fixed column list keeps the table schema stable even with zero rows.
    df = pd.DataFrame(results, columns=["url", "label", "confidence"])
    print(f"Batch prediction completed for {len(urls)} URLs.")
    return df
283
+
284
+ # --- Gradio App ---
285
+
286
# Two-tab UI: one tab for a single URL, one for a text file of URLs.
with gr.Blocks() as app:
    gr.Markdown("# 🕵️ Gambling Website Detection (URL Based)")
    gr.Markdown("### Using Playwright & Tesseract OCR")

    with gr.Tab("Single URL"):
        url_input = gr.Textbox(label="Enter Website URL")
        predict_button = gr.Button("Predict")
        label_output = gr.Label()
        confidence_output = gr.Textbox(label="Confidence", interactive=False)

        # predict_single_url returns (label, confidence text), in that order.
        predict_button.click(
            fn=predict_single_url,
            inputs=[url_input],
            outputs=[label_output, confidence_output],
        )

    with gr.Tab("Batch URLs"):
        file_input = gr.File(label="Upload .txt file with URLs (one per line)")
        batch_predict_button = gr.Button("Batch Predict")
        batch_output = gr.DataFrame()

        # predict_batch_urls returns one DataFrame with a row per URL.
        batch_predict_button.click(
            fn=predict_batch_urls,
            inputs=[file_input],
            outputs=[batch_output],
        )

# Listen on all interfaces at port 7860 when run as a script.
if __name__ == "__main__":
    app.launch(server_name="0.0.0.0", server_port=7860)
requirements ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ gradio
3
+ torch
4
+ torchvision
5
+ pytesseract
6
+ playwright
7
+ Pillow
8
+ transformers
9
+ pandas
10
+ huggingface_hub
11
+ python-dotenv