Spaces: azzandr/ID-gambling-website-detection

Running

App Files Community

Azzan Dwi Riski committed 19 days ago

Commit

34765d1

1 Parent(s): ea04925

update tokenizer

Browse files

Files changed (3) hide show

Dockerfile +19 -6
app.py +29 -10
entrypoint.sh +22 -0

Dockerfile CHANGED Viewed

@@ -19,10 +19,21 @@ RUN curl -sL https://deb.nodesource.com/setup_18.x | bash - && \
 # Set up working directory
 WORKDIR /app
 # Install Python dependencies
 COPY requirements.txt /app/
 RUN pip install --no-cache-dir -r requirements.txt
 # Install Playwright
 RUN pip install playwright && \
     playwright install chromium && \
@@ -31,14 +42,16 @@ RUN pip install playwright && \
 # Copy application code
 COPY . /app/
-# Create directory for screenshots
-RUN mkdir -p screenshots
-# Create directory for models
-RUN mkdir -p models
 # Make sure the app runs at port 7860 (Gradio default)
 EXPOSE 7860
-# Start the application
-CMD ["python", "app.py"]

 # Set up working directory
 WORKDIR /app
+# Create non-root user for cache permission
+RUN useradd -m -u 1000 user
+ENV HOME=/home/user
+ENV HF_HOME=/home/user/huggingface
+RUN mkdir -p $HOME/.cache $HOME/huggingface
+RUN chown -R user:user $HOME
 # Install Python dependencies
 COPY requirements.txt /app/
 RUN pip install --no-cache-dir -r requirements.txt
+# Pre-download tokenizer
+RUN pip install transformers
+RUN python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1', use_fast=False)"
 # Install Playwright
 RUN pip install playwright && \
     playwright install chromium && \
 # Copy application code
 COPY . /app/
+# Create directory for screenshots and models with proper permissions
+RUN mkdir -p screenshots models
+RUN chown -R user:user /app
+# Copy entrypoint script
+COPY entrypoint.sh /app/
+RUN chmod +x /app/entrypoint.sh
 # Make sure the app runs at port 7860 (Gradio default)
 EXPOSE 7860
+# Start the application using entrypoint script
+ENTRYPOINT ["/app/entrypoint.sh"]

app.py CHANGED Viewed

@@ -8,7 +8,7 @@ from PIL import Image
 import pytesseract
 from playwright.sync_api import sync_playwright
 import asyncio
-from transformers import BertTokenizer
 from torchvision import transforms
 from torchvision import models
 from torchvision.transforms import functional as F
@@ -24,8 +24,27 @@ from pathlib import Path
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print(f"Using device: {device}")
-# Load tokenizer
-tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
 # Image transformation
 class ResizePadToSquare:
@@ -171,24 +190,24 @@ def take_screenshot(url):
         print(f"Error taking screenshot with Playwright: {e}")
         return None
-def resize_if_needed(image_path, max_mb=1, target_width=720):
     file_size = os.path.getsize(image_path) / (1024 * 1024)  # dalam MB
     if file_size > max_mb:
         try:
             with Image.open(image_path) as img:
                 width, height = img.size
-                if width > target_width:
-                    ratio = target_width / float(width)
-                    new_height = int((float(height) * float(ratio)))
-                    img = img.resize((target_width, new_height), Image.Resampling.LANCZOS)
                     img.save(image_path, optimize=True, quality=85)
-                    print(f"Image resized to {target_width}x{new_height}")
         except Exception as e:
             print(f"Resize error: {e}")
 def extract_text_from_image(image_path):
     try:
-        resize_if_needed(image_path, max_mb=1, target_width=720)
         # Use Tesseract OCR with Indonesian language
         text = pytesseract.image_to_string(Image.open(image_path), lang='ind')

 import pytesseract
 from playwright.sync_api import sync_playwright
 import asyncio
+from transformers import AutoTokenizer
 from torchvision import transforms
 from torchvision import models
 from torchvision.transforms import functional as F
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print(f"Using device: {device}")
+# Load tokenizer with fallback
+try:
+    tokenizer = AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1', use_fast=False)
+    print("Tokenizer loaded successfully!")
+except Exception as e:
+    print(f"Error loading tokenizer: {e}")
+    print("Attempting to load with additional parameters...")
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(
+            'indobenchmark/indobert-base-p1',
+            use_fast=False,
+            local_files_only=False,
+            cache_dir='/home/user/huggingface'
+        )
+        print("Tokenizer loaded with adjusted parameters!")
+    except Exception as e2:
+        print(f"Second attempt failed: {e2}")
+        # Fallback to basic tokenizer if absolutely necessary
+        from transformers import BertTokenizer
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        print("Loaded fallback tokenizer (bert-base-uncased)")
 # Image transformation
 class ResizePadToSquare:
         print(f"Error taking screenshot with Playwright: {e}")
         return None
+def resize_if_needed(image_path, max_mb=1, target_height=720):
     file_size = os.path.getsize(image_path) / (1024 * 1024)  # dalam MB
     if file_size > max_mb:
         try:
             with Image.open(image_path) as img:
                 width, height = img.size
+                if height > target_height:
+                    ratio = target_height / float(height)
+                    new_width = int(float(width) * ratio)
+                    img = img.resize((new_width, target_height), Image.Resampling.LANCZOS)
                     img.save(image_path, optimize=True, quality=85)
+                    print(f"Image resized to {new_width}x{target_height}")
         except Exception as e:
             print(f"Resize error: {e}")
 def extract_text_from_image(image_path):
     try:
+        resize_if_needed(image_path, max_mb=1, target_height=720)
         # Use Tesseract OCR with Indonesian language
         text = pytesseract.image_to_string(Image.open(image_path), lang='ind')

entrypoint.sh ADDED Viewed

	@@ -0,0 +1,22 @@

+#!/bin/bash
+set -e
+# Create necessary directories with correct permissions
+mkdir -p /home/user/.cache
+mkdir -p /home/user/huggingface
+mkdir -p /app/screenshots
+mkdir -p /app/models
+# Ensure permissions
+chown -R user:user /home/user
+chown -R user:user /app/screenshots
+chown -R user:user /app/models
+# Download tokenizer explicitly if needed
+if [ ! -d "/home/user/huggingface/models--indobenchmark--indobert-base-p1" ]; then
+    echo "Pre-downloading tokenizer..."
+    python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1', use_fast=False, cache_dir='/home/user/huggingface')"
+fi
+# Run the application
+exec python app.py