Azzan Dwi Riski
committed on
Commit
·
4e933a0
1
Parent(s):
34765d1
fix tokenizer issues
Browse files
- .dockerignore +21 -0
- Dockerfile +19 -17
- app.py +22 -25
- entrypoint.sh +0 -22
- verify_environment.py +121 -0
.dockerignore
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Ignore common files and directories
|
2 |
+
.git
|
3 |
+
.gitignore
|
4 |
+
.github
|
5 |
+
.vscode
|
6 |
+
__pycache__
|
7 |
+
*.pyc
|
8 |
+
*.pyo
|
9 |
+
*.pyd
|
10 |
+
.Python
|
11 |
+
env/
|
12 |
+
venv/
|
13 |
+
.env
|
14 |
+
.venv
|
15 |
+
.idea/
|
16 |
+
|
17 |
+
# Don't ignore the model directory
|
18 |
+
!models/
|
19 |
+
|
20 |
+
# Ignore screenshots that might exist in repo but are generated at runtime
|
21 |
+
screenshots/*
|
Dockerfile
CHANGED
@@ -19,39 +19,41 @@ RUN curl -sL https://deb.nodesource.com/setup_18.x | bash - && \
|
|
19 |
# Set up working directory
|
20 |
WORKDIR /app
|
21 |
|
22 |
-
# Create non-root user for
|
23 |
RUN useradd -m -u 1000 user
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
RUN chown -R user:user
|
|
|
28 |
|
29 |
# Install Python dependencies
|
30 |
COPY requirements.txt /app/
|
31 |
RUN pip install --no-cache-dir -r requirements.txt
|
32 |
|
33 |
-
# Pre-download tokenizer
|
34 |
-
RUN pip install transformers
|
35 |
-
RUN python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1', use_fast=False)"
|
36 |
-
|
37 |
# Install Playwright
|
38 |
RUN pip install playwright && \
|
39 |
playwright install chromium && \
|
40 |
playwright install-deps chromium
|
41 |
|
|
|
|
|
|
|
|
|
|
|
42 |
# Copy application code
|
43 |
COPY . /app/
|
44 |
-
|
45 |
-
# Create directory for screenshots and models with proper permissions
|
46 |
-
RUN mkdir -p screenshots models
|
47 |
RUN chown -R user:user /app
|
48 |
|
49 |
-
#
|
50 |
-
|
51 |
-
RUN
|
52 |
|
53 |
# Make sure the app runs at port 7860 (Gradio default)
|
54 |
EXPOSE 7860
|
55 |
|
56 |
-
#
|
57 |
-
|
|
|
|
|
|
|
|
19 |
# Set up working directory
|
20 |
WORKDIR /app
|
21 |
|
22 |
+
# Create non-root user for better security and permissions
|
23 |
RUN useradd -m -u 1000 user
|
24 |
+
RUN chown -R user:user /app
|
25 |
+
|
26 |
+
# Create cache directories with proper permissions
|
27 |
+
RUN mkdir -p /.cache && chown -R user:user /.cache && chmod -R 777 /.cache
|
28 |
+
RUN mkdir -p /home/user/.cache && chown -R user:user /home/user/.cache
|
29 |
|
30 |
# Install Python dependencies
|
31 |
COPY requirements.txt /app/
|
32 |
RUN pip install --no-cache-dir -r requirements.txt
|
33 |
|
|
|
|
|
|
|
|
|
34 |
# Install Playwright
|
35 |
RUN pip install playwright && \
|
36 |
playwright install chromium && \
|
37 |
playwright install-deps chromium
|
38 |
|
39 |
+
# Download and cache the tokenizer
|
40 |
+
RUN mkdir -p /app/tokenizers/indobert-base-p1
|
41 |
+
RUN python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1', cache_dir='/app/tokenizers')"
|
42 |
+
RUN chown -R user:user /app/tokenizers
|
43 |
+
|
44 |
# Copy application code
|
45 |
COPY . /app/
|
|
|
|
|
|
|
46 |
RUN chown -R user:user /app
|
47 |
|
48 |
+
# Create directory for screenshots and models
|
49 |
+
RUN mkdir -p screenshots models
|
50 |
+
RUN chown -R user:user screenshots models
|
51 |
|
52 |
# Make sure the app runs at port 7860 (Gradio default)
|
53 |
EXPOSE 7860
|
54 |
|
55 |
+
# Switch to non-root user
|
56 |
+
USER user
|
57 |
+
|
58 |
+
# Start the application
|
59 |
+
CMD ["python", "app.py"]
|
app.py
CHANGED
@@ -24,27 +24,24 @@ from pathlib import Path
|
|
24 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
25 |
print(f"Using device: {device}")
|
26 |
|
27 |
-
# Load tokenizer with
|
28 |
try:
|
29 |
-
|
30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
except Exception as e:
|
32 |
print(f"Error loading tokenizer: {e}")
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
'indobenchmark/indobert-base-p1',
|
37 |
-
use_fast=False,
|
38 |
-
local_files_only=False,
|
39 |
-
cache_dir='/home/user/huggingface'
|
40 |
-
)
|
41 |
-
print("Tokenizer loaded with adjusted parameters!")
|
42 |
-
except Exception as e2:
|
43 |
-
print(f"Second attempt failed: {e2}")
|
44 |
-
# Fallback to basic tokenizer if absolutely necessary
|
45 |
-
from transformers import BertTokenizer
|
46 |
-
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
47 |
-
print("Loaded fallback tokenizer (bert-base-uncased)")
|
48 |
|
49 |
# Image transformation
|
50 |
class ResizePadToSquare:
|
@@ -190,24 +187,24 @@ def take_screenshot(url):
|
|
190 |
print(f"Error taking screenshot with Playwright: {e}")
|
191 |
return None
|
192 |
|
193 |
-
def resize_if_needed(image_path, max_mb=1,
|
194 |
file_size = os.path.getsize(image_path) / (1024 * 1024) # dalam MB
|
195 |
if file_size > max_mb:
|
196 |
try:
|
197 |
with Image.open(image_path) as img:
|
198 |
width, height = img.size
|
199 |
-
if
|
200 |
-
ratio =
|
201 |
-
|
202 |
-
img = img.resize((
|
203 |
img.save(image_path, optimize=True, quality=85)
|
204 |
-
print(f"Image resized to {
|
205 |
except Exception as e:
|
206 |
print(f"Resize error: {e}")
|
207 |
|
208 |
def extract_text_from_image(image_path):
|
209 |
try:
|
210 |
-
resize_if_needed(image_path, max_mb=1,
|
211 |
|
212 |
# Use Tesseract OCR with Indonesian language
|
213 |
text = pytesseract.image_to_string(Image.open(image_path), lang='ind')
|
|
|
24 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
25 |
print(f"Using device: {device}")
|
26 |
|
27 |
+
# Load tokenizer with proper error handling
|
28 |
try:
|
29 |
+
# Try to load from local tokenizer directory
|
30 |
+
tokenizer_path = '/app/tokenizers/indobert-base-p1'
|
31 |
+
if os.path.exists(tokenizer_path):
|
32 |
+
print(f"Loading tokenizer from local path: {tokenizer_path}")
|
33 |
+
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
|
34 |
+
else:
|
35 |
+
# If local not available, try direct download with cache
|
36 |
+
print("Local tokenizer not found, downloading from Hugging Face...")
|
37 |
+
tokenizer = AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1',
|
38 |
+
use_fast=True,
|
39 |
+
cache_dir='/app/tokenizers')
|
40 |
except Exception as e:
|
41 |
print(f"Error loading tokenizer: {e}")
|
42 |
+
# Fallback to default BERT tokenizer if needed
|
43 |
+
print("Falling back to default BERT tokenizer")
|
44 |
+
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
# Image transformation
|
47 |
class ResizePadToSquare:
|
|
|
187 |
print(f"Error taking screenshot with Playwright: {e}")
|
188 |
return None
|
189 |
|
190 |
+
def resize_if_needed(image_path, max_mb=1, target_width=720):
|
191 |
file_size = os.path.getsize(image_path) / (1024 * 1024) # dalam MB
|
192 |
if file_size > max_mb:
|
193 |
try:
|
194 |
with Image.open(image_path) as img:
|
195 |
width, height = img.size
|
196 |
+
if width > target_width:
|
197 |
+
ratio = target_width / float(width)
|
198 |
+
new_height = int((float(height) * float(ratio)))
|
199 |
+
img = img.resize((target_width, new_height), Image.Resampling.LANCZOS)
|
200 |
img.save(image_path, optimize=True, quality=85)
|
201 |
+
print(f"Image resized to {target_width}x{new_height}")
|
202 |
except Exception as e:
|
203 |
print(f"Resize error: {e}")
|
204 |
|
205 |
def extract_text_from_image(image_path):
|
206 |
try:
|
207 |
+
resize_if_needed(image_path, max_mb=1, target_width=720)
|
208 |
|
209 |
# Use Tesseract OCR with Indonesian language
|
210 |
text = pytesseract.image_to_string(Image.open(image_path), lang='ind')
|
entrypoint.sh
DELETED
@@ -1,22 +0,0 @@
|
|
1 |
-
#!/bin/bash
|
2 |
-
set -e
|
3 |
-
|
4 |
-
# Create necessary directories with correct permissions
|
5 |
-
mkdir -p /home/user/.cache
|
6 |
-
mkdir -p /home/user/huggingface
|
7 |
-
mkdir -p /app/screenshots
|
8 |
-
mkdir -p /app/models
|
9 |
-
|
10 |
-
# Ensure permissions
|
11 |
-
chown -R user:user /home/user
|
12 |
-
chown -R user:user /app/screenshots
|
13 |
-
chown -R user:user /app/models
|
14 |
-
|
15 |
-
# Download tokenizer explicitly if needed
|
16 |
-
if [ ! -d "/home/user/huggingface/models--indobenchmark--indobert-base-p1" ]; then
|
17 |
-
echo "Pre-downloading tokenizer..."
|
18 |
-
python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1', use_fast=False, cache_dir='/home/user/huggingface')"
|
19 |
-
fi
|
20 |
-
|
21 |
-
# Run the application
|
22 |
-
exec python app.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
verify_environment.py
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Script untuk memverifikasi apakah environment Docker memiliki semua yang diperlukan
|
4 |
+
untuk menjalankan aplikasi gambling detection.
|
5 |
+
"""
|
6 |
+
|
7 |
+
import os
|
8 |
+
import sys
|
9 |
+
import subprocess
|
10 |
+
import torch
|
11 |
+
from pathlib import Path
|
12 |
+
import importlib.util
|
13 |
+
|
14 |
+
def check_package(package_name):
|
15 |
+
"""Periksa apakah paket terinstall."""
|
16 |
+
return importlib.util.find_spec(package_name) is not None
|
17 |
+
|
18 |
+
def check_command(command):
|
19 |
+
"""Periksa apakah command tersedia di sistem."""
|
20 |
+
try:
|
21 |
+
subprocess.run([command, '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
22 |
+
return True
|
23 |
+
except FileNotFoundError:
|
24 |
+
return False
|
25 |
+
|
26 |
+
def main():
|
27 |
+
"""Fungsi utama untuk memeriksa environment."""
|
28 |
+
# Header
|
29 |
+
print("="*50)
|
30 |
+
print("Environment Verification for Gambling Detection App")
|
31 |
+
print("="*50)
|
32 |
+
|
33 |
+
# Check Python version
|
34 |
+
print(f"Python version: {sys.version}")
|
35 |
+
|
36 |
+
# Check CUDA
|
37 |
+
print(f"PyTorch version: {torch.__version__}")
|
38 |
+
print(f"CUDA available: {torch.cuda.is_available()}")
|
39 |
+
if torch.cuda.is_available():
|
40 |
+
print(f"CUDA version: {torch.version.cuda}")
|
41 |
+
print(f"GPU: {torch.cuda.get_device_name(0)}")
|
42 |
+
|
43 |
+
# Check dependencies
|
44 |
+
dependencies = ["gradio", "torch", "transformers", "pytesseract", "playwright", "PIL", "pandas"]
|
45 |
+
print("\nChecking required packages:")
|
46 |
+
for package in dependencies:
|
47 |
+
status = "✅ Installed" if check_package(package) else "❌ Missing"
|
48 |
+
print(f" - {package}: {status}")
|
49 |
+
|
50 |
+
# Check external tools
|
51 |
+
print("\nChecking external tools:")
|
52 |
+
|
53 |
+
# Check Tesseract
|
54 |
+
try:
|
55 |
+
result = subprocess.run(['tesseract', '--version'],
|
56 |
+
stdout=subprocess.PIPE,
|
57 |
+
stderr=subprocess.PIPE,
|
58 |
+
text=True)
|
59 |
+
print(f" - Tesseract OCR: ✅ Installed ({result.stdout.splitlines()[0] if result.stdout else 'version unknown'})")
|
60 |
+
except FileNotFoundError:
|
61 |
+
print(" - Tesseract OCR: ❌ Missing")
|
62 |
+
|
63 |
+
# Check language support in Tesseract
|
64 |
+
try:
|
65 |
+
result = subprocess.run(['tesseract', '--list-langs'],
|
66 |
+
stdout=subprocess.PIPE,
|
67 |
+
stderr=subprocess.STDOUT,
|
68 |
+
text=True)
|
69 |
+
if 'ind' in result.stdout:
|
70 |
+
print(" - Tesseract Indonesian language: ✅ Installed")
|
71 |
+
else:
|
72 |
+
print(" - Tesseract Indonesian language: ❌ Missing")
|
73 |
+
except FileNotFoundError:
|
74 |
+
print(" - Tesseract Indonesian language: β Could not check")
|
75 |
+
|
76 |
+
# Check Playwright
|
77 |
+
print("\nChecking Playwright:")
|
78 |
+
try:
|
79 |
+
playwright_installed = check_package("playwright")
|
80 |
+
print(f" - Playwright package: {'✅ Installed' if playwright_installed else '❌ Missing'}")
|
81 |
+
|
82 |
+
# Check if browsers are installed
|
83 |
+
if playwright_installed:
|
84 |
+
try:
|
85 |
+
from playwright.sync_api import sync_playwright
|
86 |
+
with sync_playwright() as p:
|
87 |
+
browser_types = []
|
88 |
+
try:
|
89 |
+
browser = p.chromium.launch()
|
90 |
+
browser.close()
|
91 |
+
browser_types.append("Chromium")
|
92 |
+
except Exception:
|
93 |
+
pass
|
94 |
+
|
95 |
+
print(f" - Installed browsers: {', '.join(browser_types) if browser_types else 'None'}")
|
96 |
+
except Exception as e:
|
97 |
+
print(f" - Error checking browsers: {e}")
|
98 |
+
except Exception as e:
|
99 |
+
print(f" - Error checking Playwright: {e}")
|
100 |
+
|
101 |
+
# Check directories and permissions
|
102 |
+
print("\nChecking directories and permissions:")
|
103 |
+
directories = [
|
104 |
+
'/app/tokenizers',
|
105 |
+
'/app/models',
|
106 |
+
'/app/screenshots',
|
107 |
+
'/.cache'
|
108 |
+
]
|
109 |
+
|
110 |
+
for directory in directories:
|
111 |
+
path = Path(directory)
|
112 |
+
if path.exists():
|
113 |
+
writable = os.access(path, os.W_OK)
|
114 |
+
print(f" - {directory}: ✅ Exists {'(Writable)' if writable else '(Not Writable)'}")
|
115 |
+
else:
|
116 |
+
print(f" - {directory}: ❌ Does not exist")
|
117 |
+
|
118 |
+
print("\nVerification complete!")
|
119 |
+
|
120 |
+
if __name__ == "__main__":
|
121 |
+
main()
|