Spaces:
Runtime error
Runtime error
Niv Sardi
committed on
Commit
·
ae7097b
1
Parent(s):
4b890a6
defaults defaults defaults (and types)
Browse files
Signed-off-by: Niv Sardi <[email protected]>
- python/common/defaults.py +15 -0
- python/common/mkdir.py +6 -0
- python/entity.py +3 -1
- python/imtool.py +2 -4
- python/main.py +6 -9
- python/screenshot.py +11 -6
- python/vendor.py +2 -5
- python/web.py +6 -3
python/common/defaults.py
CHANGED
|
@@ -1 +1,16 @@
|
|
| 1 |
DATA_PATH='./data'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
DATA_PATH='./data'
|
| 2 |
+
|
| 3 |
+
LABELS_PATH=f'{DATA_PATH}/labels'
|
| 4 |
+
IMAGES_PATH=f'{DATA_PATH}/images'
|
| 5 |
+
CERTS_PATH=f'{DATA_PATH}/certs'
|
| 6 |
+
|
| 7 |
+
SQUARES_DATA_PATH=f'{DATA_PATH}/squares'
|
| 8 |
+
SQUARES_LABELS_PATH=f'{SQUARES_DATA_PATH}/labels'
|
| 9 |
+
SQUARES_IMAGES_PATH=f'{SQUARES_DATA_PATH}/images'
|
| 10 |
+
|
| 11 |
+
DEBUG_PATH=f'{DATA_PATH}/debug'
|
| 12 |
+
DEBUG_SQUARES_PATH=f'{DEBUG_PATH}/squares'
|
| 13 |
+
|
| 14 |
+
LOGOS_DATA_PATH=f'{DATA_PATH}/logos'
|
| 15 |
+
|
| 16 |
+
MAIN_CSV_PATH=f'{DATA_PATH}/entities.csv'
|
python/common/mkdir.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pathlib
|
| 2 |
+
|
| 3 |
+
def make_dirs(dirs: "list[str] | str"):
    """Create every directory in *dirs*, including missing parents.

    Directories that already exist are left untouched.

    Args:
        dirs: A list (or other iterable) of directory paths. A single path
            given as a ``str`` or ``pathlib`` path is also accepted and is
            treated as a one-element list.
    """
    # A bare string is itself iterable, one character at a time; without this
    # guard make_dirs('./data/certs') would create '.', 'd', 'a', ... instead
    # of the intended directory.
    if isinstance(dirs, (str, pathlib.PurePath)):
        dirs = [dirs]
    for p in dirs:
        pathlib.Path(p).mkdir(parents=True, exist_ok=True)
|
| 6 |
+
|
python/entity.py
CHANGED
|
@@ -2,8 +2,10 @@
|
|
| 2 |
import csv
|
| 3 |
from typing import NamedTuple
|
| 4 |
|
|
|
|
|
|
|
| 5 |
def read_entities(fn):
|
| 6 |
-
with open(
|
| 7 |
reader = csv.DictReader(csvfile)
|
| 8 |
bcos = { d['bco']:update(d, {'id': i}) for i, d in enumerate(reader)}
|
| 9 |
return bcos
|
|
|
|
| 2 |
import csv
|
| 3 |
from typing import NamedTuple
|
| 4 |
|
| 5 |
+
from common import defaults
|
| 6 |
+
|
| 7 |
def read_entities(fn):
    """Load the main entities CSV into a dict keyed by each row's 'bco' column.

    Each value is the result of ``update(row, {'id': i})`` where ``i`` is the
    zero-based row index (``update`` is defined outside this block; presumably
    it merges the two dicts -- confirm).

    Args:
        fn: Currently unused -- the file read is always
            ``defaults.MAIN_CSV_PATH``.  NOTE(review): confirm whether callers
            expect ``fn`` to be honored.

    Returns:
        dict mapping 'bco' values to the per-row results described above.
    """
    # common/defaults.py defines MAIN_CSV_PATH; there is no MAIN_DATA_PATH, so
    # the previous attribute lookup raised AttributeError on every call.
    with open(defaults.MAIN_CSV_PATH, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        bcos = { d['bco']:update(d, {'id': i}) for i, d in enumerate(reader)}
        return bcos
|
python/imtool.py
CHANGED
|
@@ -3,10 +3,10 @@
|
|
| 3 |
import os
|
| 4 |
import math
|
| 5 |
import cv2
|
| 6 |
-
import pathlib
|
| 7 |
from typing import NamedTuple
|
| 8 |
|
| 9 |
from entity import Entity
|
|
|
|
| 10 |
|
| 11 |
TILE_SIZE = 416
|
| 12 |
TILE_OVERLAP = 0.8
|
|
@@ -69,9 +69,7 @@ def crop(id, fn, logos):
|
|
| 69 |
img_out = f"./data/squares/images"
|
| 70 |
txt_out = f"./data/squares/labels"
|
| 71 |
debug_out = f"./data/debug"
|
| 72 |
-
|
| 73 |
-
pathlib.Path(img_out).mkdir(parents=True, exist_ok=True)
|
| 74 |
-
pathlib.Path(txt_out).mkdir(parents=True, exist_ok=True)
|
| 75 |
|
| 76 |
im = cv2.imread(fn)
|
| 77 |
rim = cv2.imread(fn)
|
|
|
|
| 3 |
import os
|
| 4 |
import math
|
| 5 |
import cv2
|
|
|
|
| 6 |
from typing import NamedTuple
|
| 7 |
|
| 8 |
from entity import Entity
|
| 9 |
+
from common import mkdir
|
| 10 |
|
| 11 |
TILE_SIZE = 416
|
| 12 |
TILE_OVERLAP = 0.8
|
|
|
|
| 69 |
img_out = f"./data/squares/images"
|
| 70 |
txt_out = f"./data/squares/labels"
|
| 71 |
debug_out = f"./data/debug"
|
| 72 |
+
# make_dirs is a function taking a list of paths: call it with the list
# (square brackets alone would subscript the function and raise TypeError).
mkdir.make_dirs([debug_out, img_out, txt_out])
|
|
|
|
|
|
|
| 73 |
|
| 74 |
im = cv2.imread(fn)
|
| 75 |
rim = cv2.imread(fn)
|
python/main.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
import csv
|
| 2 |
-
import pathlib
|
| 3 |
import requests
|
| 4 |
import shutil
|
| 5 |
|
|
@@ -7,18 +6,16 @@ from bs4 import BeautifulSoup
|
|
| 7 |
from progress.bar import ChargingBar
|
| 8 |
|
| 9 |
from entity import Entity
|
| 10 |
-
from common import selectors
|
| 11 |
-
from common import defaults
|
| 12 |
|
| 13 |
-
pathlib.Path(f'{defaults.DATA_PATH}/logos').mkdir(parents=True, exist_ok=True)
|
| 14 |
-
|
| 15 |
-
DATA_FILE = './data/entidades.csv'
|
| 16 |
URL = 'http://www.bcra.gob.ar/SistemasFinancierosYdePagos/Entidades_financieras.asp'
|
| 17 |
page = requests.get(URL)
|
| 18 |
soup = BeautifulSoup(page.content, 'html.parser')
|
| 19 |
|
| 20 |
options = soup.find(class_='form-control').find_all('option')
|
| 21 |
-
|
|
|
|
|
|
|
| 22 |
writer = csv.writer(csvfile)
|
| 23 |
writer.writerow(Entity.row_names())
|
| 24 |
|
|
@@ -46,11 +43,11 @@ with open(f'{DATA_FILE}.tmp', 'w', newline='') as csvfile:
|
|
| 46 |
except TypeError:
|
| 47 |
print('ERROR', a)
|
| 48 |
|
| 49 |
-
e = Entity(name, id=i, bco=bco, logo=img, url=a)
|
| 50 |
writer.writerow(e.to_row())
|
| 51 |
i+=1
|
| 52 |
bar.next()
|
| 53 |
bar.finish()
|
| 54 |
|
| 55 |
-
shutil.move(f'{
|
| 56 |
print('scrape finished')
|
|
|
|
| 1 |
import csv
|
|
|
|
| 2 |
import requests
|
| 3 |
import shutil
|
| 4 |
|
|
|
|
| 6 |
from progress.bar import ChargingBar
|
| 7 |
|
| 8 |
from entity import Entity
|
| 9 |
+
from common import selectors, defaults, mkdir
|
|
|
|
| 10 |
|
|
|
|
|
|
|
|
|
|
| 11 |
URL = 'http://www.bcra.gob.ar/SistemasFinancierosYdePagos/Entidades_financieras.asp'
|
| 12 |
page = requests.get(URL)
|
| 13 |
soup = BeautifulSoup(page.content, 'html.parser')
|
| 14 |
|
| 15 |
options = soup.find(class_='form-control').find_all('option')
|
| 16 |
+
mkdir.make_dirs([defaults.DATA_PATH])
|
| 17 |
+
|
| 18 |
+
with open(f'{defaults.MAIN_CSV_PATH}.tmp', 'w', newline='') as csvfile:
|
| 19 |
writer = csv.writer(csvfile)
|
| 20 |
writer.writerow(Entity.row_names())
|
| 21 |
|
|
|
|
| 43 |
except TypeError:
|
| 44 |
print('ERROR', a)
|
| 45 |
|
| 46 |
+
e = Entity(name, id=i, bco=bco, logo=str(img), url=str(a))
|
| 47 |
writer.writerow(e.to_row())
|
| 48 |
i+=1
|
| 49 |
bar.next()
|
| 50 |
bar.finish()
|
| 51 |
|
| 52 |
+
shutil.move(f'{defaults.MAIN_CSV_PATH}.tmp', defaults.MAIN_CSV_PATH)
|
| 53 |
print('scrape finished')
|
python/screenshot.py
CHANGED
|
@@ -9,7 +9,7 @@ from selenium.webdriver.common.by import By
|
|
| 9 |
|
| 10 |
from common import selectors
|
| 11 |
from entity import Entity
|
| 12 |
-
from common import defaults
|
| 13 |
|
| 14 |
options = webdriver.FirefoxOptions()
|
| 15 |
options.add_argument("--headless")
|
|
@@ -22,18 +22,23 @@ def coord_to_point(c):
|
|
| 22 |
|
| 23 |
driver = webdriver.Firefox(options=options)
|
| 24 |
def sc_entity(e: Entity):
|
| 25 |
-
print(e)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
driver.implicitly_wait(10)
|
| 27 |
driver.get(e.url)
|
| 28 |
-
driver.save_screenshot(f"{defaults.DATA_PATH}/{e.bco}.png")
|
| 29 |
-
driver.save_full_page_screenshot(f"{defaults.
|
| 30 |
|
| 31 |
logos = driver.find_elements(By.CSS_SELECTOR, selectors.img_logo) or []
|
| 32 |
logos.extend(driver.find_elements(By.CSS_SELECTOR, selectors.id_logo) or [])
|
| 33 |
logos.extend(driver.find_elements(By.CSS_SELECTOR, selectors.cls_logo) or [])
|
| 34 |
-
with open(f"{defaults.
|
| 35 |
for i in logos:
|
| 36 |
-
f.write(f"{e.
|
| 37 |
|
| 38 |
if __name__ == '__main__':
|
| 39 |
sc_entity(Entity.from_dict({'url': 'http://www.bbva.com.ar', 'bco': 'debug'}))
|
|
|
|
| 9 |
|
| 10 |
from common import selectors
|
| 11 |
from entity import Entity
|
| 12 |
+
from common import defaults,mkdir
|
| 13 |
|
| 14 |
options = webdriver.FirefoxOptions()
|
| 15 |
options.add_argument("--headless")
|
|
|
|
| 22 |
|
| 23 |
driver = webdriver.Firefox(options=options)
|
| 24 |
def sc_entity(e: Entity):
    """Take a full-page screenshot of an entity's site and record its logo elements.

    Ensures the images and labels directories exist, loads ``e.url`` in the
    module-level Firefox driver, saves the full-page screenshot as
    ``<IMAGES_PATH>/<bco>.full.png`` and writes ``<LABELS_PATH>/<bco>.full.txt``
    with one line per element matched by the logo selectors: the entity id
    followed by ``coord_to_point`` of the element's rect.
    """
    print(f'screenshoting: {e}')
    mkdir.make_dirs([defaults.IMAGES_PATH, defaults.LABELS_PATH])

    driver.implicitly_wait(10)
    driver.get(e.url)
    driver.save_full_page_screenshot(f"{defaults.IMAGES_PATH}/{e.bco}.full.png")

    # Gather matches for each logo selector, in the same order as before.
    logos = []
    for css in (selectors.img_logo, selectors.id_logo, selectors.cls_logo):
        logos.extend(driver.find_elements(By.CSS_SELECTOR, css) or [])

    with open(f"{defaults.LABELS_PATH}/{e.bco}.full.txt", 'w') as f:
        f.writelines(f"{e.id} {coord_to_point(i.rect)}\n" for i in logos)
|
| 42 |
|
| 43 |
if __name__ == '__main__':
|
| 44 |
sc_entity(Entity.from_dict({'url': 'http://www.bbva.com.ar', 'bco': 'debug'}))
|
python/vendor.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
-
import pathlib
|
| 3 |
import csv
|
| 4 |
import concurrent.futures
|
| 5 |
import requests
|
|
@@ -7,7 +6,7 @@ import requests
|
|
| 7 |
from progress.bar import ChargingBar
|
| 8 |
|
| 9 |
from entity import Entity
|
| 10 |
-
from common import defaults
|
| 11 |
import screenshot
|
| 12 |
import web
|
| 13 |
|
|
@@ -40,6 +39,4 @@ def from_csv(fn):
|
|
| 40 |
#exit()
|
| 41 |
|
| 42 |
if __name__ == '__main__':
|
| 43 |
-
|
| 44 |
-
pathlib.Path(f"{defaults.DATA_PATH}/logos").mkdir(parents=True, exist_ok=True)
|
| 45 |
-
from_csv(f"{defaults.DATA_PATH}/entidades.csv")
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
|
|
|
| 2 |
import csv
|
| 3 |
import concurrent.futures
|
| 4 |
import requests
|
|
|
|
| 6 |
from progress.bar import ChargingBar
|
| 7 |
|
| 8 |
from entity import Entity
|
| 9 |
+
from common import defaults,mkdir
|
| 10 |
import screenshot
|
| 11 |
import web
|
| 12 |
|
|
|
|
| 39 |
#exit()
|
| 40 |
|
| 41 |
if __name__ == '__main__':
|
| 42 |
+
from_csv(defaults.MAIN_CSV_PATH)
|
|
|
|
|
|
python/web.py
CHANGED
|
@@ -5,7 +5,7 @@ import requests
|
|
| 5 |
from bs4 import BeautifulSoup
|
| 6 |
|
| 7 |
from entity import Entity
|
| 8 |
-
from common import selectors, defaults
|
| 9 |
|
| 10 |
def get_page(e: Entity):
|
| 11 |
try:
|
|
@@ -17,9 +17,10 @@ def get_page(e: Entity):
|
|
| 17 |
|
| 18 |
def get_cert(e: Entity):
|
| 19 |
ssl_url = e.url.split("/")[2]
|
|
|
|
| 20 |
try:
|
| 21 |
cert = ssl.get_server_certificate((ssl_url, 443), ca_certs=None)
|
| 22 |
-
fn = f"{defaults.
|
| 23 |
with open(fn, 'w') as f:
|
| 24 |
f.write(cert)
|
| 25 |
except Exception as err:
|
|
@@ -39,6 +40,8 @@ def get_logos(e: Entity, page):
|
|
| 39 |
logos.extend(soup.select(selectors.id_logo))
|
| 40 |
logos.extend(soup.select(selectors.cls_logo))
|
| 41 |
|
|
|
|
|
|
|
| 42 |
i = 0
|
| 43 |
lfn = []
|
| 44 |
for l in logos:
|
|
@@ -46,7 +49,7 @@ def get_logos(e: Entity, page):
|
|
| 46 |
src = l.attrs['src']
|
| 47 |
ext = src.split('.')[-1].split('/')[-1]
|
| 48 |
if not src.startswith('http'): src = e.url + src
|
| 49 |
-
fn = f"{defaults.
|
| 50 |
lfn.append(get_img_logo(src, fn))
|
| 51 |
i+=1
|
| 52 |
return lfn
|
|
|
|
| 5 |
from bs4 import BeautifulSoup
|
| 6 |
|
| 7 |
from entity import Entity
|
| 8 |
+
from common import selectors, defaults, mkdir
|
| 9 |
|
| 10 |
def get_page(e: Entity):
|
| 11 |
try:
|
|
|
|
| 17 |
|
| 18 |
def get_cert(e: Entity):
|
| 19 |
ssl_url = e.url.split("/")[2]
|
| 20 |
+
# make_dirs expects a list of paths, so wrap the single directory in a list.
mkdir.make_dirs([defaults.CERTS_PATH])
|
| 21 |
try:
|
| 22 |
cert = ssl.get_server_certificate((ssl_url, 443), ca_certs=None)
|
| 23 |
+
fn = f"{defaults.CERTS_PATH}/{e.bco}.cert"
|
| 24 |
with open(fn, 'w') as f:
|
| 25 |
f.write(cert)
|
| 26 |
except Exception as err:
|
|
|
|
| 40 |
logos.extend(soup.select(selectors.id_logo))
|
| 41 |
logos.extend(soup.select(selectors.cls_logo))
|
| 42 |
|
| 43 |
+
# make_dirs expects a list of paths, so wrap the single directory in a list.
mkdir.make_dirs([defaults.LOGOS_DATA_PATH])
|
| 44 |
+
|
| 45 |
i = 0
|
| 46 |
lfn = []
|
| 47 |
for l in logos:
|
|
|
|
| 49 |
src = l.attrs['src']
|
| 50 |
ext = src.split('.')[-1].split('/')[-1]
|
| 51 |
if not src.startswith('http'): src = e.url + src
|
| 52 |
+
fn = f"{defaults.LOGOS_DATA_PATH}/{e.bco}.{i}.{ext}"
|
| 53 |
lfn.append(get_img_logo(src, fn))
|
| 54 |
i+=1
|
| 55 |
return lfn
|