Spaces:

azzandr
/

ID-gambling-website-detection

Running

App Files Files Community

Azzan Dwi Riski committed on May 11

Commit

4e933a0

1 Parent(s): 34765d1

fix tokenizer issues

Browse files

Files changed (5) hide show

.dockerignore +21 -0
Dockerfile +19 -17
app.py +22 -25
entrypoint.sh +0 -22
verify_environment.py +121 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,21 @@

+# Ignore common files and directories
+.git
+.gitignore
+.github
+.vscode
+__pycache__
+*.pyc
+*.pyo
+*.pyd
+.Python
+env/
+venv/
+.env
+.venv
+.idea/
+# Don't ignore the model directory
+!models/
+# Ignore screenshots that might exist in repo but are generated at runtime
+screenshots/*

Dockerfile CHANGED Viewed

@@ -19,39 +19,41 @@ RUN curl -sL https://deb.nodesource.com/setup_18.x | bash - && \
 # Set up working directory
 WORKDIR /app
-# Create non-root user for cache permission
 RUN useradd -m -u 1000 user
-ENV HOME=/home/user
-ENV HF_HOME=/home/user/huggingface
-RUN mkdir -p $HOME/.cache $HOME/huggingface
-RUN chown -R user:user $HOME
 # Install Python dependencies
 COPY requirements.txt /app/
 RUN pip install --no-cache-dir -r requirements.txt
-# Pre-download tokenizer
-RUN pip install transformers
-RUN python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1', use_fast=False)"
 # Install Playwright
 RUN pip install playwright && \
     playwright install chromium && \
     playwright install-deps chromium
 # Copy application code
 COPY . /app/
-# Create directory for screenshots and models with proper permissions
-RUN mkdir -p screenshots models
 RUN chown -R user:user /app
-# Copy entrypoint script
-COPY entrypoint.sh /app/
-RUN chmod +x /app/entrypoint.sh
 # Make sure the app runs at port 7860 (Gradio default)
 EXPOSE 7860
-# Start the application using entrypoint script
-ENTRYPOINT ["/app/entrypoint.sh"]

 # Set up working directory
 WORKDIR /app
+# Create non-root user for better security and permissions
 RUN useradd -m -u 1000 user
+RUN chown -R user:user /app
+# Create cache directories with proper permissions
+RUN mkdir -p /.cache && chown -R user:user /.cache && chmod -R 777 /.cache
+RUN mkdir -p /home/user/.cache && chown -R user:user /home/user/.cache
 # Install Python dependencies
 COPY requirements.txt /app/
 RUN pip install --no-cache-dir -r requirements.txt
 # Install Playwright
 RUN pip install playwright && \
     playwright install chromium && \
     playwright install-deps chromium
+# Download the tokenizer and save its files directly into the directory that
+# app.py loads from (/app/tokenizers/indobert-base-p1). Passing cache_dir alone
+# stores the files in the Hugging Face hub cache layout
+# (/app/tokenizers/models--indobenchmark--indobert-base-p1/...), which would
+# leave the directory created below empty and make the local load fail.
+RUN mkdir -p /app/tokenizers/indobert-base-p1
+RUN python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1', cache_dir='/app/tokenizers').save_pretrained('/app/tokenizers/indobert-base-p1')"
+RUN chown -R user:user /app/tokenizers
 # Copy application code
 COPY . /app/
 RUN chown -R user:user /app
+# Create directory for screenshots and models
+RUN mkdir -p screenshots models
+RUN chown -R user:user screenshots models
 # Make sure the app runs at port 7860 (Gradio default)
 EXPOSE 7860
+# Switch to non-root user
+USER user
+# Start the application
+CMD ["python", "app.py"]

app.py CHANGED Viewed

@@ -24,27 +24,24 @@ from pathlib import Path
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print(f"Using device: {device}")
-# Load tokenizer with fallback
 try:
-    tokenizer = AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1', use_fast=False)
-    print("Tokenizer loaded successfully!")
 except Exception as e:
     print(f"Error loading tokenizer: {e}")
-    print("Attempting to load with additional parameters...")
-    try:
-        tokenizer = AutoTokenizer.from_pretrained(
-            'indobenchmark/indobert-base-p1',
-            use_fast=False,
-            local_files_only=False,
-            cache_dir='/home/user/huggingface'
-        )
-        print("Tokenizer loaded with adjusted parameters!")
-    except Exception as e2:
-        print(f"Second attempt failed: {e2}")
-        # Fallback to basic tokenizer if absolutely necessary
-        from transformers import BertTokenizer
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        print("Loaded fallback tokenizer (bert-base-uncased)")
 # Image transformation
 class ResizePadToSquare:
@@ -190,24 +187,24 @@ def take_screenshot(url):
         print(f"Error taking screenshot with Playwright: {e}")
         return None
-def resize_if_needed(image_path, max_mb=1, target_height=720):
     file_size = os.path.getsize(image_path) / (1024 * 1024)  # dalam MB
     if file_size > max_mb:
         try:
             with Image.open(image_path) as img:
                 width, height = img.size
-                if height > target_height:
-                    ratio = target_height / float(height)
-                    new_width = int(float(width) * ratio)
-                    img = img.resize((new_width, target_height), Image.Resampling.LANCZOS)
                     img.save(image_path, optimize=True, quality=85)
-                    print(f"Image resized to {new_width}x{target_height}")
         except Exception as e:
             print(f"Resize error: {e}")
 def extract_text_from_image(image_path):
     try:
-        resize_if_needed(image_path, max_mb=1, target_height=720)
         # Use Tesseract OCR with Indonesian language
         text = pytesseract.image_to_string(Image.open(image_path), lang='ind')

 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print(f"Using device: {device}")
+# Load tokenizer with proper error handling
 try:
+    # Try to load from local tokenizer directory
+    tokenizer_path = '/app/tokenizers/indobert-base-p1'
+    # The directory must exist AND contain files: an empty directory would make
+    # from_pretrained() raise and silently push us onto the fallback tokenizer.
+    if os.path.isdir(tokenizer_path) and os.listdir(tokenizer_path):
+        print(f"Loading tokenizer from local path: {tokenizer_path}")
+        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
+    else:
+        # If local not available, load by model id; files already present in
+        # cache_dir are reused, otherwise they are downloaded.
+        print("Local tokenizer not found, downloading from Hugging Face...")
+        tokenizer = AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1',
+                                                 use_fast=True,
+                                                 cache_dir='/app/tokenizers')
 except Exception as e:
     print(f"Error loading tokenizer: {e}")
+    # Fallback to default BERT tokenizer if needed
+    # NOTE(review): bert-base-uncased has a different vocabulary from the
+    # IndoBERT tokenizer, so predictions made after this fallback are
+    # presumably unreliable - confirm whether failing fast would be preferable.
+    print("Falling back to default BERT tokenizer")
+    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
 # Image transformation
 class ResizePadToSquare:
         print(f"Error taking screenshot with Playwright: {e}")
         return None
+def resize_if_needed(image_path, max_mb=1, target_width=720):
+    """Shrink the image at ``image_path`` in place when the file is too large.
+
+    Nothing happens unless the file is larger than ``max_mb`` megabytes and
+    the image is wider than ``target_width`` pixels. In that case the image is
+    scaled down to ``target_width`` with its aspect ratio preserved and
+    written back over the original file. Any error while opening, resizing
+    or saving is printed and swallowed. Returns None.
+    """
     file_size = os.path.getsize(image_path) / (1024 * 1024)  # in MB
     if file_size > max_mb:
         try:
             with Image.open(image_path) as img:
                 width, height = img.size
+                if width > target_width:
+                    ratio = target_width / float(width)
+                    new_height = int((float(height) * float(ratio)))
+                    img = img.resize((target_width, new_height), Image.Resampling.LANCZOS)
                     img.save(image_path, optimize=True, quality=85)
+                    print(f"Image resized to {target_width}x{new_height}")
         except Exception as e:
             print(f"Resize error: {e}")
 def extract_text_from_image(image_path):
     try:
+        resize_if_needed(image_path, max_mb=1, target_width=720)
         # Use Tesseract OCR with Indonesian language
         text = pytesseract.image_to_string(Image.open(image_path), lang='ind')

entrypoint.sh DELETED Viewed

@@ -1,22 +0,0 @@
-#!/bin/bash
-set -e
-# Create necessary directories with correct permissions
-mkdir -p /home/user/.cache
-mkdir -p /home/user/huggingface
-mkdir -p /app/screenshots
-mkdir -p /app/models
-# Ensure permissions
-chown -R user:user /home/user
-chown -R user:user /app/screenshots
-chown -R user:user /app/models
-# Download tokenizer explicitly if needed
-if [ ! -d "/home/user/huggingface/models--indobenchmark--indobert-base-p1" ]; then
-    echo "Pre-downloading tokenizer..."
-    python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1', use_fast=False, cache_dir='/home/user/huggingface')"
-fi
-# Run the application
-exec python app.py

verify_environment.py ADDED Viewed

	@@ -0,0 +1,121 @@

+#!/usr/bin/env python3
+"""
+Script that verifies whether the Docker environment has everything required
+to run the gambling detection application.
+"""
+import os
+import sys
+import subprocess
+import torch
+from pathlib import Path
+import importlib.util
+def check_package(package_name):
+    """Return True if ``package_name`` can be located by the import system.
+
+    Uses ``importlib.util.find_spec`` and treats lookup errors as "not
+    installed" so that callers always get a boolean.
+    """
+    try:
+        return importlib.util.find_spec(package_name) is not None
+    except (ImportError, ValueError):
+        # find_spec raises ModuleNotFoundError when the parent of a dotted
+        # name is missing, and ValueError for an empty name or a module whose
+        # __spec__ is None; in every case the package is not usable.
+        return False
+def check_command(command):
+    """Return True if ``command`` can be started on this system.
+
+    Runs ``<command> --version`` and discards its output. The exit status is
+    ignored: only whether the executable could be launched matters.
+    """
+    try:
+        # Output is never inspected, so send it to DEVNULL instead of buffering it.
+        subprocess.run([command, '--version'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        return True
+    except OSError:
+        # Covers FileNotFoundError (not on PATH) as well as PermissionError
+        # and similar launch failures, which also mean the command is unusable.
+        return False
+def main():
+    """Print a human-readable report of the runtime environment.
+
+    Reports the Python, PyTorch and CUDA versions, whether the required
+    Python packages can be found, whether Tesseract (and its Indonesian
+    language data) and a Playwright Chromium browser are available, and
+    whether the application's working directories exist and are writable.
+    Results are printed to stdout; nothing is returned.
+    """
+    # Header
+    print("="*50)
+    print("Environment Verification for Gambling Detection App")
+    print("="*50)
+    # Check Python version
+    print(f"Python version: {sys.version}")
+    # Check CUDA
+    print(f"PyTorch version: {torch.__version__}")
+    print(f"CUDA available: {torch.cuda.is_available()}")
+    if torch.cuda.is_available():
+        print(f"CUDA version: {torch.version.cuda}")
+        print(f"GPU: {torch.cuda.get_device_name(0)}")
+    # Check dependencies
+    dependencies = ["gradio", "torch", "transformers", "pytesseract", "playwright", "PIL", "pandas"]
+    print("\nChecking required packages:")
+    for package in dependencies:
+        status = "✅ Installed" if check_package(package) else "❌ Missing"
+        print(f"  - {package}: {status}")
+    # Check external tools
+    print("\nChecking external tools:")
+    # Check Tesseract
+    try:
+        result = subprocess.run(['tesseract', '--version'],
+                              stdout=subprocess.PIPE,
+                              stderr=subprocess.PIPE,
+                              text=True)
+        print(f"  - Tesseract OCR: ✅ Installed ({result.stdout.splitlines()[0] if result.stdout else 'version unknown'})")
+    except FileNotFoundError:
+        print("  - Tesseract OCR: ❌ Missing")
+    # Check language support in Tesseract
+    try:
+        result = subprocess.run(['tesseract', '--list-langs'],
+                              stdout=subprocess.PIPE,
+                              stderr=subprocess.STDOUT,
+                              text=True)
+        # Compare whole whitespace-separated tokens: stderr is merged into this
+        # output, so a substring test would also match unrelated text such as
+        # the word "find" in an error message.
+        if 'ind' in result.stdout.split():
+            print("  - Tesseract Indonesian language: ✅ Installed")
+        else:
+            print("  - Tesseract Indonesian language: ❌ Missing")
+    except FileNotFoundError:
+        print("  - Tesseract Indonesian language: ❌ Could not check")
+    # Check Playwright
+    print("\nChecking Playwright:")
+    try:
+        playwright_installed = check_package("playwright")
+        print(f"  - Playwright package: {'✅ Installed' if playwright_installed else '❌ Missing'}")
+        # Check if browsers are installed
+        if playwright_installed:
+            try:
+                from playwright.sync_api import sync_playwright
+                with sync_playwright() as p:
+                    browser_types = []
+                    try:
+                        # Actually launching Chromium is the only reliable
+                        # proof that the browser binary is usable.
+                        browser = p.chromium.launch()
+                        browser.close()
+                        browser_types.append("Chromium")
+                    except Exception:
+                        pass
+                    print(f"  - Installed browsers: {', '.join(browser_types) if browser_types else 'None'}")
+            except Exception as e:
+                print(f"  - Error checking browsers: {e}")
+    except Exception as e:
+        print(f"  - Error checking Playwright: {e}")
+    # Check directories and permissions
+    print("\nChecking directories and permissions:")
+    directories = [
+        '/app/tokenizers',
+        '/app/models',
+        '/app/screenshots',
+        '/.cache'
+    ]
+    for directory in directories:
+        path = Path(directory)
+        if path.exists():
+            writable = os.access(path, os.W_OK)
+            print(f"  - {directory}: ✅ Exists {'(Writable)' if writable else '(Not Writable)'}")
+        else:
+            print(f"  - {directory}: ❌ Does not exist")
+    print("\nVerification complete!")
+if __name__ == "__main__":
+    main()