Azzan Dwi Riski committed on
Commit
4e933a0
·
1 Parent(s): 34765d1

fix tokenizer issues

Browse files
Files changed (5) hide show
  1. .dockerignore +21 -0
  2. Dockerfile +19 -17
  3. app.py +22 -25
  4. entrypoint.sh +0 -22
  5. verify_environment.py +121 -0
.dockerignore ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ignore common files and directories
2
+ .git
3
+ .gitignore
4
+ .github
5
+ .vscode
6
+ __pycache__
7
+ *.pyc
8
+ *.pyo
9
+ *.pyd
10
+ .Python
11
+ env/
12
+ venv/
13
+ .env
14
+ .venv
15
+ .idea/
16
+
17
+ # Don't ignore the model directory
18
+ !models/
19
+
20
+ # Ignore screenshots that might exist in repo but are generated at runtime
21
+ screenshots/*
Dockerfile CHANGED
@@ -19,39 +19,41 @@ RUN curl -sL https://deb.nodesource.com/setup_18.x | bash - && \
19
  # Set up working directory
20
  WORKDIR /app
21
 
22
- # Create non-root user for cache permission
23
  RUN useradd -m -u 1000 user
24
- ENV HOME=/home/user
25
- ENV HF_HOME=/home/user/huggingface
26
- RUN mkdir -p $HOME/.cache $HOME/huggingface
27
- RUN chown -R user:user $HOME
 
28
 
29
  # Install Python dependencies
30
  COPY requirements.txt /app/
31
  RUN pip install --no-cache-dir -r requirements.txt
32
 
33
- # Pre-download tokenizer
34
- RUN pip install transformers
35
- RUN python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1', use_fast=False)"
36
-
37
  # Install Playwright
38
  RUN pip install playwright && \
39
  playwright install chromium && \
40
  playwright install-deps chromium
41
 
 
 
 
 
 
42
  # Copy application code
43
  COPY . /app/
44
-
45
- # Create directory for screenshots and models with proper permissions
46
- RUN mkdir -p screenshots models
47
  RUN chown -R user:user /app
48
 
49
- # Copy entrypoint script
50
- COPY entrypoint.sh /app/
51
- RUN chmod +x /app/entrypoint.sh
52
 
53
  # Make sure the app runs at port 7860 (Gradio default)
54
  EXPOSE 7860
55
 
56
- # Start the application using entrypoint script
57
- ENTRYPOINT ["/app/entrypoint.sh"]
 
 
 
 
19
  # Set up working directory
20
  WORKDIR /app
21
 
22
+ # Create non-root user for better security and permissions
23
  RUN useradd -m -u 1000 user
24
+ RUN chown -R user:user /app
25
+
26
+ # Create cache directories with proper permissions
27
+ RUN mkdir -p /.cache && chown -R user:user /.cache && chmod -R 777 /.cache
28
+ RUN mkdir -p /home/user/.cache && chown -R user:user /home/user/.cache
29
 
30
  # Install Python dependencies
31
  COPY requirements.txt /app/
32
  RUN pip install --no-cache-dir -r requirements.txt
33
 
 
 
 
 
34
  # Install Playwright
35
  RUN pip install playwright && \
36
  playwright install chromium && \
37
  playwright install-deps chromium
38
 
39
+ # Download and cache the tokenizer
40
+ RUN mkdir -p /app/tokenizers/indobert-base-p1
41
+ RUN python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1', cache_dir='/app/tokenizers')"
42
+ RUN chown -R user:user /app/tokenizers
43
+
44
  # Copy application code
45
  COPY . /app/
 
 
 
46
  RUN chown -R user:user /app
47
 
48
+ # Create directory for screenshots and models
49
+ RUN mkdir -p screenshots models
50
+ RUN chown -R user:user screenshots models
51
 
52
  # Make sure the app runs at port 7860 (Gradio default)
53
  EXPOSE 7860
54
 
55
+ # Switch to non-root user
56
+ USER user
57
+
58
+ # Start the application
59
+ CMD ["python", "app.py"]
app.py CHANGED
@@ -24,27 +24,24 @@ from pathlib import Path
24
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
25
  print(f"Using device: {device}")
26
 
27
- # Load tokenizer with fallback
28
  try:
29
- tokenizer = AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1', use_fast=False)
30
- print("Tokenizer loaded successfully!")
 
 
 
 
 
 
 
 
 
31
  except Exception as e:
32
  print(f"Error loading tokenizer: {e}")
33
- print("Attempting to load with additional parameters...")
34
- try:
35
- tokenizer = AutoTokenizer.from_pretrained(
36
- 'indobenchmark/indobert-base-p1',
37
- use_fast=False,
38
- local_files_only=False,
39
- cache_dir='/home/user/huggingface'
40
- )
41
- print("Tokenizer loaded with adjusted parameters!")
42
- except Exception as e2:
43
- print(f"Second attempt failed: {e2}")
44
- # Fallback to basic tokenizer if absolutely necessary
45
- from transformers import BertTokenizer
46
- tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
47
- print("Loaded fallback tokenizer (bert-base-uncased)")
48
 
49
  # Image transformation
50
  class ResizePadToSquare:
@@ -190,24 +187,24 @@ def take_screenshot(url):
190
  print(f"Error taking screenshot with Playwright: {e}")
191
  return None
192
 
193
- def resize_if_needed(image_path, max_mb=1, target_height=720):
194
  file_size = os.path.getsize(image_path) / (1024 * 1024) # dalam MB
195
  if file_size > max_mb:
196
  try:
197
  with Image.open(image_path) as img:
198
  width, height = img.size
199
- if height > target_height:
200
- ratio = target_height / float(height)
201
- new_width = int(float(width) * ratio)
202
- img = img.resize((new_width, target_height), Image.Resampling.LANCZOS)
203
  img.save(image_path, optimize=True, quality=85)
204
- print(f"Image resized to {new_width}x{target_height}")
205
  except Exception as e:
206
  print(f"Resize error: {e}")
207
 
208
  def extract_text_from_image(image_path):
209
  try:
210
- resize_if_needed(image_path, max_mb=1, target_height=720)
211
 
212
  # Use Tesseract OCR with Indonesian language
213
  text = pytesseract.image_to_string(Image.open(image_path), lang='ind')
 
24
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
25
  print(f"Using device: {device}")
26
 
27
+ # Load tokenizer with proper error handling
28
  try:
29
+ # Try to load from local tokenizer directory
30
+ tokenizer_path = '/app/tokenizers/indobert-base-p1'
31
+ if os.path.exists(tokenizer_path):
32
+ print(f"Loading tokenizer from local path: {tokenizer_path}")
33
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
34
+ else:
35
+ # If local not available, try direct download with cache
36
+ print("Local tokenizer not found, downloading from Hugging Face...")
37
+ tokenizer = AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1',
38
+ use_fast=True,
39
+ cache_dir='/app/tokenizers')
40
  except Exception as e:
41
  print(f"Error loading tokenizer: {e}")
42
+ # Fallback to default BERT tokenizer if needed
43
+ print("Falling back to default BERT tokenizer")
44
+ tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
  # Image transformation
47
  class ResizePadToSquare:
 
187
  print(f"Error taking screenshot with Playwright: {e}")
188
  return None
189
 
190
+ def resize_if_needed(image_path, max_mb=1, target_width=720):
191
  file_size = os.path.getsize(image_path) / (1024 * 1024) # dalam MB
192
  if file_size > max_mb:
193
  try:
194
  with Image.open(image_path) as img:
195
  width, height = img.size
196
+ if width > target_width:
197
+ ratio = target_width / float(width)
198
+ new_height = int((float(height) * float(ratio)))
199
+ img = img.resize((target_width, new_height), Image.Resampling.LANCZOS)
200
  img.save(image_path, optimize=True, quality=85)
201
+ print(f"Image resized to {target_width}x{new_height}")
202
  except Exception as e:
203
  print(f"Resize error: {e}")
204
 
205
  def extract_text_from_image(image_path):
206
  try:
207
+ resize_if_needed(image_path, max_mb=1, target_width=720)
208
 
209
  # Use Tesseract OCR with Indonesian language
210
  text = pytesseract.image_to_string(Image.open(image_path), lang='ind')
entrypoint.sh DELETED
@@ -1,22 +0,0 @@
1
- #!/bin/bash
2
- set -e
3
-
4
- # Create necessary directories with correct permissions
5
- mkdir -p /home/user/.cache
6
- mkdir -p /home/user/huggingface
7
- mkdir -p /app/screenshots
8
- mkdir -p /app/models
9
-
10
- # Ensure permissions
11
- chown -R user:user /home/user
12
- chown -R user:user /app/screenshots
13
- chown -R user:user /app/models
14
-
15
- # Download tokenizer explicitly if needed
16
- if [ ! -d "/home/user/huggingface/models--indobenchmark--indobert-base-p1" ]; then
17
- echo "Pre-downloading tokenizer..."
18
- python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1', use_fast=False, cache_dir='/home/user/huggingface')"
19
- fi
20
-
21
- # Run the application
22
- exec python app.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
verify_environment.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Script untuk memverifikasi apakah environment Docker memiliki semua yang diperlukan
4
+ untuk menjalankan aplikasi gambling detection.
5
+ """
6
+
7
+ import os
8
+ import sys
9
+ import subprocess
10
+ import torch
11
+ from pathlib import Path
12
+ import importlib.util
13
+
14
+ def check_package(package_name):
15
+ """Periksa apakah paket terinstall."""
16
+ return importlib.util.find_spec(package_name) is not None
17
+
18
+ def check_command(command):
19
+ """Periksa apakah command tersedia di sistem."""
20
+ try:
21
+ subprocess.run([command, '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
22
+ return True
23
+ except FileNotFoundError:
24
+ return False
25
+
26
+ def main():
27
+ """Fungsi utama untuk memeriksa environment."""
28
+ # Header
29
+ print("="*50)
30
+ print("Environment Verification for Gambling Detection App")
31
+ print("="*50)
32
+
33
+ # Check Python version
34
+ print(f"Python version: {sys.version}")
35
+
36
+ # Check CUDA
37
+ print(f"PyTorch version: {torch.__version__}")
38
+ print(f"CUDA available: {torch.cuda.is_available()}")
39
+ if torch.cuda.is_available():
40
+ print(f"CUDA version: {torch.version.cuda}")
41
+ print(f"GPU: {torch.cuda.get_device_name(0)}")
42
+
43
+ # Check dependencies
44
+ dependencies = ["gradio", "torch", "transformers", "pytesseract", "playwright", "PIL", "pandas"]
45
+ print("\nChecking required packages:")
46
+ for package in dependencies:
47
+ status = "✅ Installed" if check_package(package) else "❌ Missing"
48
+ print(f" - {package}: {status}")
49
+
50
+ # Check external tools
51
+ print("\nChecking external tools:")
52
+
53
+ # Check Tesseract
54
+ try:
55
+ result = subprocess.run(['tesseract', '--version'],
56
+ stdout=subprocess.PIPE,
57
+ stderr=subprocess.PIPE,
58
+ text=True)
59
+ print(f" - Tesseract OCR: ✅ Installed ({result.stdout.splitlines()[0] if result.stdout else 'version unknown'})")
60
+ except FileNotFoundError:
61
+ print(" - Tesseract OCR: ❌ Missing")
62
+
63
+ # Check language support in Tesseract
64
+ try:
65
+ result = subprocess.run(['tesseract', '--list-langs'],
66
+ stdout=subprocess.PIPE,
67
+ stderr=subprocess.STDOUT,
68
+ text=True)
69
+ if 'ind' in result.stdout:
70
+ print(" - Tesseract Indonesian language: ✅ Installed")
71
+ else:
72
+ print(" - Tesseract Indonesian language: ❌ Missing")
73
+ except FileNotFoundError:
74
+ print(" - Tesseract Indonesian language: ❌ Could not check")
75
+
76
+ # Check Playwright
77
+ print("\nChecking Playwright:")
78
+ try:
79
+ playwright_installed = check_package("playwright")
80
+ print(f" - Playwright package: {'✅ Installed' if playwright_installed else '❌ Missing'}")
81
+
82
+ # Check if browsers are installed
83
+ if playwright_installed:
84
+ try:
85
+ from playwright.sync_api import sync_playwright
86
+ with sync_playwright() as p:
87
+ browser_types = []
88
+ try:
89
+ browser = p.chromium.launch()
90
+ browser.close()
91
+ browser_types.append("Chromium")
92
+ except Exception:
93
+ pass
94
+
95
+ print(f" - Installed browsers: {', '.join(browser_types) if browser_types else 'None'}")
96
+ except Exception as e:
97
+ print(f" - Error checking browsers: {e}")
98
+ except Exception as e:
99
+ print(f" - Error checking Playwright: {e}")
100
+
101
+ # Check directories and permissions
102
+ print("\nChecking directories and permissions:")
103
+ directories = [
104
+ '/app/tokenizers',
105
+ '/app/models',
106
+ '/app/screenshots',
107
+ '/.cache'
108
+ ]
109
+
110
+ for directory in directories:
111
+ path = Path(directory)
112
+ if path.exists():
113
+ writable = os.access(path, os.W_OK)
114
+ print(f" - {directory}: ✅ Exists {'(Writable)' if writable else '(Not Writable)'}")
115
+ else:
116
+ print(f" - {directory}: ❌ Does not exist")
117
+
118
+ print("\nVerification complete!")
119
+
120
+ if __name__ == "__main__":
121
+ main()