Azzan Dwi Riski
committed on
Commit
·
34765d1
1
Parent(s):
ea04925
update tokenizer
Browse files- Dockerfile +19 -6
- app.py +29 -10
- entrypoint.sh +22 -0
Dockerfile
CHANGED
@@ -19,10 +19,21 @@ RUN curl -sL https://deb.nodesource.com/setup_18.x | bash - && \
|
|
19 |
# Set up working directory
|
20 |
WORKDIR /app
|
21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
# Install Python dependencies
|
23 |
COPY requirements.txt /app/
|
24 |
RUN pip install --no-cache-dir -r requirements.txt
|
25 |
|
|
|
|
|
|
|
|
|
26 |
# Install Playwright
|
27 |
RUN pip install playwright && \
|
28 |
playwright install chromium && \
|
@@ -31,14 +42,16 @@ RUN pip install playwright && \
|
|
31 |
# Copy application code
|
32 |
COPY . /app/
|
33 |
|
34 |
-
# Create directory for screenshots
|
35 |
-
RUN mkdir -p screenshots
|
|
|
36 |
|
37 |
-
#
|
38 |
-
|
|
|
39 |
|
40 |
# Make sure the app runs at port 7860 (Gradio default)
|
41 |
EXPOSE 7860
|
42 |
|
43 |
-
# Start the application
|
44 |
-
|
|
|
19 |
# Set up working directory
|
20 |
WORKDIR /app
|
21 |
|
22 |
+
# Create non-root user for cache permission
|
23 |
+
RUN useradd -m -u 1000 user
|
24 |
+
ENV HOME=/home/user
|
25 |
+
ENV HF_HOME=/home/user/huggingface
|
26 |
+
RUN mkdir -p $HOME/.cache $HOME/huggingface
|
27 |
+
RUN chown -R user:user $HOME
|
28 |
+
|
29 |
# Install Python dependencies
|
30 |
COPY requirements.txt /app/
|
31 |
RUN pip install --no-cache-dir -r requirements.txt
|
32 |
|
33 |
+
# Pre-download tokenizer
|
34 |
+
RUN pip install transformers
|
35 |
+
RUN python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1', use_fast=False)"
|
36 |
+
|
37 |
# Install Playwright
|
38 |
RUN pip install playwright && \
|
39 |
playwright install chromium && \
|
|
|
42 |
# Copy application code
|
43 |
COPY . /app/
|
44 |
|
45 |
+
# Create directory for screenshots and models with proper permissions
|
46 |
+
RUN mkdir -p screenshots models
|
47 |
+
RUN chown -R user:user /app
|
48 |
|
49 |
+
# Copy entrypoint script
|
50 |
+
COPY entrypoint.sh /app/
|
51 |
+
RUN chmod +x /app/entrypoint.sh
|
52 |
|
53 |
# Make sure the app runs at port 7860 (Gradio default)
|
54 |
EXPOSE 7860
|
55 |
|
56 |
+
# Start the application using entrypoint script
|
57 |
+
ENTRYPOINT ["/app/entrypoint.sh"]
|
app.py
CHANGED
@@ -8,7 +8,7 @@ from PIL import Image
|
|
8 |
import pytesseract
|
9 |
from playwright.sync_api import sync_playwright
|
10 |
import asyncio
|
11 |
-
from transformers import
|
12 |
from torchvision import transforms
|
13 |
from torchvision import models
|
14 |
from torchvision.transforms import functional as F
|
@@ -24,8 +24,27 @@ from pathlib import Path
|
|
24 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
25 |
print(f"Using device: {device}")
|
26 |
|
27 |
-
# Load tokenizer
|
28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
# Image transformation
|
31 |
class ResizePadToSquare:
|
@@ -171,24 +190,24 @@ def take_screenshot(url):
|
|
171 |
print(f"Error taking screenshot with Playwright: {e}")
|
172 |
return None
|
173 |
|
174 |
-
def resize_if_needed(image_path, max_mb=1,
|
175 |
file_size = os.path.getsize(image_path) / (1024 * 1024) # dalam MB
|
176 |
if file_size > max_mb:
|
177 |
try:
|
178 |
with Image.open(image_path) as img:
|
179 |
width, height = img.size
|
180 |
-
if
|
181 |
-
ratio =
|
182 |
-
|
183 |
-
img = img.resize((
|
184 |
img.save(image_path, optimize=True, quality=85)
|
185 |
-
print(f"Image resized to {
|
186 |
except Exception as e:
|
187 |
print(f"Resize error: {e}")
|
188 |
|
189 |
def extract_text_from_image(image_path):
|
190 |
try:
|
191 |
-
resize_if_needed(image_path, max_mb=1,
|
192 |
|
193 |
# Use Tesseract OCR with Indonesian language
|
194 |
text = pytesseract.image_to_string(Image.open(image_path), lang='ind')
|
|
|
8 |
import pytesseract
|
9 |
from playwright.sync_api import sync_playwright
|
10 |
import asyncio
|
11 |
+
from transformers import AutoTokenizer
|
12 |
from torchvision import transforms
|
13 |
from torchvision import models
|
14 |
from torchvision.transforms import functional as F
|
|
|
24 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
25 |
print(f"Using device: {device}")
|
26 |
|
27 |
+
# Load tokenizer with fallback
def _load_tokenizer():
    """Return a tokenizer, trying progressively less specific sources.

    Order of attempts:
      1. IndoBERT tokenizer with the library's default cache location.
      2. IndoBERT tokenizer with an explicit cache directory.
      3. Generic ``bert-base-uncased`` tokenizer as a last resort.

    Failures of the first two attempts are printed and swallowed; a failure
    of the last attempt propagates to the caller.
    """
    model_id = 'indobenchmark/indobert-base-p1'

    # Attempt 1: default settings.
    try:
        loaded = AutoTokenizer.from_pretrained(model_id, use_fast=False)
        print("Tokenizer loaded successfully!")
        return loaded
    except Exception as e:
        print(f"Error loading tokenizer: {e}")
        print("Attempting to load with additional parameters...")

    # Attempt 2: same model, explicit cache directory.
    try:
        loaded = AutoTokenizer.from_pretrained(
            model_id,
            use_fast=False,
            local_files_only=False,
            cache_dir='/home/user/huggingface'
        )
        print("Tokenizer loaded with adjusted parameters!")
        return loaded
    except Exception as e2:
        print(f"Second attempt failed: {e2}")

    # Attempt 3: fallback to basic tokenizer if absolutely necessary.
    # NOTE(review): this vocabulary differs from IndoBERT's, so downstream
    # model predictions are presumably unreliable when this path is taken.
    from transformers import BertTokenizer
    loaded = BertTokenizer.from_pretrained('bert-base-uncased')
    print("Loaded fallback tokenizer (bert-base-uncased)")
    return loaded


tokenizer = _load_tokenizer()
|
48 |
|
49 |
# Image transformation
|
50 |
class ResizePadToSquare:
|
|
|
190 |
print(f"Error taking screenshot with Playwright: {e}")
|
191 |
return None
|
192 |
|
193 |
+
def resize_if_needed(image_path, max_mb=1, target_height=720):
    """Shrink the image at ``image_path`` in place when the file is too large.

    The image is only touched when the file on disk is larger than
    ``max_mb`` megabytes and the image is taller than ``target_height``
    pixels. It is then scaled down to ``target_height`` with the aspect
    ratio preserved and written back to the same path.

    Args:
        image_path: Path of the image file to inspect and possibly overwrite.
        max_mb: File-size threshold in megabytes above which resizing is
            considered.
        target_height: Height in pixels of the resized image.

    Returns:
        None. Errors raised while opening, resizing, or saving the image are
        caught and printed; an error from reading the file size propagates.
    """
    file_size = os.path.getsize(image_path) / (1024 * 1024)  # in MB
    if file_size <= max_mb:
        return

    try:
        with Image.open(image_path) as img:
            width, height = img.size
            if height <= target_height:
                # Already short enough; leave the file untouched.
                return

            ratio = target_height / float(height)
            # Truncation gives 0 for very tall, narrow images, and a
            # zero-width target makes the resize fail, so clamp to 1 pixel.
            new_width = max(1, int(float(width) * ratio))
            # Keep the resized copy under its own name instead of rebinding
            # the context-managed source image.
            resized = img.resize((new_width, target_height), Image.Resampling.LANCZOS)
            resized.save(image_path, optimize=True, quality=85)
            print(f"Image resized to {new_width}x{target_height}")
    except Exception as e:
        print(f"Resize error: {e}")
|
207 |
|
208 |
def extract_text_from_image(image_path):
|
209 |
try:
|
210 |
+
resize_if_needed(image_path, max_mb=1, target_height=720)
|
211 |
|
212 |
# Use Tesseract OCR with Indonesian language
|
213 |
text = pytesseract.image_to_string(Image.open(image_path), lang='ind')
|
entrypoint.sh
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
# Container entrypoint: prepare writable cache/output directories, make sure
# the IndoBERT tokenizer is cached, then replace this shell with the app.
set -e

HF_CACHE_DIR="/home/user/huggingface"
TOKENIZER_DIR="models--indobenchmark--indobert-base-p1"

# Create necessary directories (no-op when they already exist)
mkdir -p /home/user/.cache "$HF_CACHE_DIR" /app/screenshots /app/models

# Ensure permissions. Changing ownership can fail when the container is not
# started as root; treat it as best-effort so `set -e` does not abort startup.
chown -R user:user /home/user /app/screenshots /app/models 2>/dev/null \
    || echo "Warning: could not change ownership (not running as root?)" >&2

# Download tokenizer explicitly if needed. An explicit cache_dir stores the
# snapshot directly under that directory, while the default cache used when
# HF_HOME is set lives under "$HF_HOME/hub", so look in both places before
# downloading again.
if [ ! -d "$HF_CACHE_DIR/$TOKENIZER_DIR" ] && [ ! -d "$HF_CACHE_DIR/hub/$TOKENIZER_DIR" ]; then
    echo "Pre-downloading tokenizer..."
    # Best-effort: app.py has its own loading fallbacks, so a failed
    # download (e.g. no network) must not prevent the app from starting.
    python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1', use_fast=False, cache_dir='/home/user/huggingface')" \
        || echo "Warning: tokenizer pre-download failed; app.py will retry at startup" >&2
fi

# Run the application
exec python app.py
|