Azzan Dwi Riski committed on
Commit
34765d1
·
1 Parent(s): ea04925

update tokenizer

Browse files
Files changed (3) hide show
  1. Dockerfile +19 -6
  2. app.py +29 -10
  3. entrypoint.sh +22 -0
Dockerfile CHANGED
@@ -19,10 +19,21 @@ RUN curl -sL https://deb.nodesource.com/setup_18.x | bash - && \
19
  # Set up working directory
20
  WORKDIR /app
21
 
 
 
 
 
 
 
 
22
  # Install Python dependencies
23
  COPY requirements.txt /app/
24
  RUN pip install --no-cache-dir -r requirements.txt
25
 
 
 
 
 
26
  # Install Playwright
27
  RUN pip install playwright && \
28
  playwright install chromium && \
@@ -31,14 +42,16 @@ RUN pip install playwright && \
31
  # Copy application code
32
  COPY . /app/
33
 
34
- # Create directory for screenshots
35
- RUN mkdir -p screenshots
 
36
 
37
- # Create directory for models
38
- RUN mkdir -p models
 
39
 
40
  # Make sure the app runs at port 7860 (Gradio default)
41
  EXPOSE 7860
42
 
43
- # Start the application
44
- CMD ["python", "app.py"]
 
19
  # Set up working directory
20
  WORKDIR /app
21
 
22
+ # Create non-root user for cache permission
23
+ RUN useradd -m -u 1000 user
24
+ ENV HOME=/home/user
25
+ ENV HF_HOME=/home/user/huggingface
26
+ RUN mkdir -p $HOME/.cache $HOME/huggingface
27
+ RUN chown -R user:user $HOME
28
+
29
  # Install Python dependencies
30
  COPY requirements.txt /app/
31
  RUN pip install --no-cache-dir -r requirements.txt
32
 
33
+ # Pre-download tokenizer
34
+ RUN pip install transformers
35
+ RUN python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1', use_fast=False)"
36
+
37
  # Install Playwright
38
  RUN pip install playwright && \
39
  playwright install chromium && \
 
42
  # Copy application code
43
  COPY . /app/
44
 
45
+ # Create directory for screenshots and models with proper permissions
46
+ RUN mkdir -p screenshots models
47
+ RUN chown -R user:user /app
48
 
49
+ # Copy entrypoint script
50
+ COPY entrypoint.sh /app/
51
+ RUN chmod +x /app/entrypoint.sh
52
 
53
  # Make sure the app runs at port 7860 (Gradio default)
54
  EXPOSE 7860
55
 
56
+ # Start the application using entrypoint script
57
+ ENTRYPOINT ["/app/entrypoint.sh"]
app.py CHANGED
@@ -8,7 +8,7 @@ from PIL import Image
8
  import pytesseract
9
  from playwright.sync_api import sync_playwright
10
  import asyncio
11
- from transformers import BertTokenizer
12
  from torchvision import transforms
13
  from torchvision import models
14
  from torchvision.transforms import functional as F
@@ -24,8 +24,27 @@ from pathlib import Path
24
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
25
  print(f"Using device: {device}")
26
 
27
- # Load tokenizer
28
- tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  # Image transformation
31
  class ResizePadToSquare:
@@ -171,24 +190,24 @@ def take_screenshot(url):
171
  print(f"Error taking screenshot with Playwright: {e}")
172
  return None
173
 
174
- def resize_if_needed(image_path, max_mb=1, target_width=720):
175
  file_size = os.path.getsize(image_path) / (1024 * 1024) # dalam MB
176
  if file_size > max_mb:
177
  try:
178
  with Image.open(image_path) as img:
179
  width, height = img.size
180
- if width > target_width:
181
- ratio = target_width / float(width)
182
- new_height = int((float(height) * float(ratio)))
183
- img = img.resize((target_width, new_height), Image.Resampling.LANCZOS)
184
  img.save(image_path, optimize=True, quality=85)
185
- print(f"Image resized to {target_width}x{new_height}")
186
  except Exception as e:
187
  print(f"Resize error: {e}")
188
 
189
  def extract_text_from_image(image_path):
190
  try:
191
- resize_if_needed(image_path, max_mb=1, target_width=720)
192
 
193
  # Use Tesseract OCR with Indonesian language
194
  text = pytesseract.image_to_string(Image.open(image_path), lang='ind')
 
8
  import pytesseract
9
  from playwright.sync_api import sync_playwright
10
  import asyncio
11
+ from transformers import AutoTokenizer
12
  from torchvision import transforms
13
  from torchvision import models
14
  from torchvision.transforms import functional as F
 
24
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
25
  print(f"Using device: {device}")
26
 
27
+ # Load tokenizer with fallback
28
+ try:
29
+ tokenizer = AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1', use_fast=False)
30
+ print("Tokenizer loaded successfully!")
31
+ except Exception as e:
32
+ print(f"Error loading tokenizer: {e}")
33
+ print("Attempting to load with additional parameters...")
34
+ try:
35
+ tokenizer = AutoTokenizer.from_pretrained(
36
+ 'indobenchmark/indobert-base-p1',
37
+ use_fast=False,
38
+ local_files_only=False,
39
+ cache_dir='/home/user/huggingface'
40
+ )
41
+ print("Tokenizer loaded with adjusted parameters!")
42
+ except Exception as e2:
43
+ print(f"Second attempt failed: {e2}")
44
+ # Fallback to basic tokenizer if absolutely necessary
45
+ from transformers import BertTokenizer
46
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
47
+ print("Loaded fallback tokenizer (bert-base-uncased)")
48
 
49
  # Image transformation
50
  class ResizePadToSquare:
 
190
  print(f"Error taking screenshot with Playwright: {e}")
191
  return None
192
 
193
+ def resize_if_needed(image_path, max_mb=1, target_height=720):
194
  file_size = os.path.getsize(image_path) / (1024 * 1024) # dalam MB
195
  if file_size > max_mb:
196
  try:
197
  with Image.open(image_path) as img:
198
  width, height = img.size
199
+ if height > target_height:
200
+ ratio = target_height / float(height)
201
+ new_width = int(float(width) * ratio)
202
+ img = img.resize((new_width, target_height), Image.Resampling.LANCZOS)
203
  img.save(image_path, optimize=True, quality=85)
204
+ print(f"Image resized to {new_width}x{target_height}")
205
  except Exception as e:
206
  print(f"Resize error: {e}")
207
 
208
  def extract_text_from_image(image_path):
209
  try:
210
+ resize_if_needed(image_path, max_mb=1, target_height=720)
211
 
212
  # Use Tesseract OCR with Indonesian language
213
  text = pytesseract.image_to_string(Image.open(image_path), lang='ind')
entrypoint.sh ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ set -e
3
+
4
+ # Create necessary directories with correct permissions
5
+ mkdir -p /home/user/.cache
6
+ mkdir -p /home/user/huggingface
7
+ mkdir -p /app/screenshots
8
+ mkdir -p /app/models
9
+
10
+ # Ensure permissions
11
+ chown -R user:user /home/user
12
+ chown -R user:user /app/screenshots
13
+ chown -R user:user /app/models
14
+
15
+ # Download tokenizer explicitly if needed
16
+ if [ ! -d "/home/user/huggingface/models--indobenchmark--indobert-base-p1" ]; then
17
+ echo "Pre-downloading tokenizer..."
18
+ python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1', use_fast=False, cache_dir='/home/user/huggingface')"
19
+ fi
20
+
21
+ # Run the application
22
+ exec python app.py