Azzan Dwi Riski committed on
Commit
f9bdea7
·
1 Parent(s): ac5d2c9

fix issues

Browse files
Files changed (1) hide show
  1. app.py +33 -5
app.py CHANGED
@@ -17,6 +17,8 @@ from huggingface_hub import hf_hub_download
17
  import warnings
18
  warnings.filterwarnings("ignore")
19
  from pathlib import Path
 
 
20
 
21
  # --- Setup ---
22
 
@@ -65,6 +67,8 @@ transform = transforms.Compose([
65
  std=[0.229, 0.224, 0.225]),
66
  ])
67
 
 
 
68
  # Screenshot folder
69
  SCREENSHOT_DIR = "screenshots"
70
  os.makedirs(SCREENSHOT_DIR, exist_ok=True)
@@ -74,7 +78,6 @@ pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract' # Path to tesserac
74
  print("Tesseract OCR initialized.")
75
 
76
  # --- Model ---
77
-
78
  class LateFusionModel(nn.Module):
79
  def __init__(self, image_model, text_model):
80
  super(LateFusionModel, self).__init__()
@@ -159,13 +162,30 @@ def clean_text(text):
159
  return "" # empty return to use image-only
160
  return text
161
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  def take_screenshot(url):
163
  filename = url.replace('https://', '').replace('http://', '').replace('/', '_').replace('.', '_') + '.png'
164
  filepath = os.path.join(SCREENSHOT_DIR, filename)
165
 
166
  try:
167
- print(f"Taking screenshot with Playwright for URL: {url}")
 
 
168
  with sync_playwright() as p:
 
169
  browser = p.chromium.launch()
170
  page = browser.new_page(
171
  viewport={"width": 1280, "height": 800},
@@ -173,15 +193,23 @@ def take_screenshot(url):
173
  )
174
  page.set_default_timeout(60000)
175
  page.set_extra_http_headers({"Accept-Language": "en-US,en;q=0.9"})
 
 
176
  page.goto(url, wait_until="networkidle", timeout=60000)
177
  page.wait_for_timeout(3000)
 
 
178
  page.screenshot(path=filepath)
179
  browser.close()
180
-
181
- print(f"Screenshot taken for URL: {url}")
 
182
  return filepath
 
183
  except Exception as e:
184
- print(f"Error taking screenshot with Playwright: {e}")
 
 
185
  return None
186
 
187
  def resize_if_needed(image_path, max_mb=1, target_width=720):
 
17
  import warnings
18
  warnings.filterwarnings("ignore")
19
  from pathlib import Path
20
+ import subprocess
21
+ import traceback
22
 
23
  # --- Setup ---
24
 
 
67
  std=[0.229, 0.224, 0.225]),
68
  ])
69
 
# NOTE: ensure_playwright_chromium() must not be called at this point. The
# function is defined further down in this module, so a call here raises
# NameError at import time. The startup call placed directly after the
# function definition already performs the one-time Chromium install.
72
  # Screenshot folder
73
  SCREENSHOT_DIR = "screenshots"
74
  os.makedirs(SCREENSHOT_DIR, exist_ok=True)
 
78
  print("Tesseract OCR initialized.")
79
 
80
  # --- Model ---
 
81
  class LateFusionModel(nn.Module):
82
  def __init__(self, image_model, text_model):
83
  super(LateFusionModel, self).__init__()
 
162
  return "" # empty return to use image-only
163
  return text
164
 
165
+ # Run this once at application startup (e.g. in the main file / before the model is loaded)
166
def ensure_playwright_chromium():
    """Run the Playwright CLI to install the Chromium browser build.

    Executes ``<python> -m playwright install chromium`` in a child process,
    using the interpreter that is running this module, so the install does not
    depend on a ``playwright`` executable being on PATH or belonging to a
    different environment.

    The call is best-effort: any failure is printed together with its
    traceback and is not re-raised, so the application keeps starting.
    NOTE(review): the CLI is expected to skip the download when Chromium is
    already present -- confirm against the Playwright version in use.

    Returns:
        bool: True if the install command exited with status 0, False if it
        failed or could not be started.
    """
    # Local import keeps this function self-contained; `subprocess` and
    # `traceback` are imported at module level.
    import sys

    try:
        print("Checking and installing Playwright Chromium if not present...")
        subprocess.run(
            [sys.executable, "-m", "playwright", "install", "chromium"],
            check=True,  # non-zero exit status raises CalledProcessError
        )
        print("Playwright Chromium installation completed.")
        return True
    except Exception as e:
        # Deliberately broad: this is a startup boundary and a failed browser
        # install must not prevent the rest of the app from loading.
        print("Error during Playwright Chromium installation:", e)
        traceback.print_exc()
        return False
174
+
175
+ # Make sure this is called at startup (outside the screenshot function)
176
+ ensure_playwright_chromium()
177
+
178
+ # Function that captures a screenshot of the viewport
179
  def take_screenshot(url):
180
  filename = url.replace('https://', '').replace('http://', '').replace('/', '_').replace('.', '_') + '.png'
181
  filepath = os.path.join(SCREENSHOT_DIR, filename)
182
 
183
  try:
184
+ print(f"\n=== [START SCREENSHOT] URL: {url} ===")
185
+ from playwright.sync_api import sync_playwright
186
+
187
  with sync_playwright() as p:
188
+ print("Launching Playwright Chromium...")
189
  browser = p.chromium.launch()
190
  page = browser.new_page(
191
  viewport={"width": 1280, "height": 800},
 
193
  )
194
  page.set_default_timeout(60000)
195
  page.set_extra_http_headers({"Accept-Language": "en-US,en;q=0.9"})
196
+
197
+ print("Navigating to URL...")
198
  page.goto(url, wait_until="networkidle", timeout=60000)
199
  page.wait_for_timeout(3000)
200
+
201
+ print("Taking screenshot (viewport only)...")
202
  page.screenshot(path=filepath)
203
  browser.close()
204
+ print(f"Screenshot saved to {filepath}")
205
+
206
+ print(f"=== [END SCREENSHOT] ===\n")
207
  return filepath
208
+
209
  except Exception as e:
210
+ print(f"[ERROR] Failed to take screenshot for URL: {url}")
211
+ print(f"Exception: {e}")
212
+ traceback.print_exc()
213
  return None
214
 
215
  def resize_if_needed(image_path, max_mb=1, target_width=720):