Spaces:
Runtime error
Runtime error
Niv Sardi
committed on
Commit
·
7c115c7
1
Parent(s):
cab11aa
reorganize vendor.py and log
Browse files
Signed-off-by: Niv Sardi <[email protected]>
- crawler/vendor.py +21 -11
crawler/vendor.py
CHANGED
|
@@ -12,22 +12,20 @@ from entity import Entity
|
|
| 12 |
from common import selectors
|
| 13 |
import screenshot
|
| 14 |
|
| 15 |
-
def
|
| 16 |
-
pathlib.Path(f"./data/{e.bco}").mkdir(parents=True, exist_ok=True)
|
| 17 |
-
|
| 18 |
ssl_url = e.url.split("/")[2]
|
| 19 |
try:
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
|
|
|
|
|
|
| 24 |
|
|
|
|
|
|
|
| 25 |
logos = soup.select(selectors.logo)
|
| 26 |
-
cert = ssl.get_server_certificate((ssl_url, 443), ca_certs=None)
|
| 27 |
|
| 28 |
-
fn = f"{e.DATA_PATH}/cert"
|
| 29 |
-
with open(fn, 'w') as f:
|
| 30 |
-
f.write(cert)
|
| 31 |
i = 0
|
| 32 |
lfn = []
|
| 33 |
for l in logos:
|
|
@@ -43,6 +41,18 @@ def query_vendor_site(e: Entity):
|
|
| 43 |
shutil.copyfileobj(res.raw, f)
|
| 44 |
lfn.append(fn)
|
| 45 |
i+=1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
screenshot.sc_entity(e)
|
| 47 |
return (fn, lfn)
|
| 48 |
|
|
|
|
| 12 |
from common import selectors
|
| 13 |
import screenshot
|
| 14 |
|
| 15 |
+
def write_cert(e: Entity):
|
|
|
|
|
|
|
| 16 |
ssl_url = e.url.split("/")[2]
|
| 17 |
try:
|
| 18 |
+
cert = ssl.get_server_certificate((ssl_url, 443), ca_certs=None)
|
| 19 |
+
with open(f"{e.DATA_PATH}/cert", 'w') as f:
|
| 20 |
+
f.write(cert)
|
| 21 |
+
except Exception as err:
|
| 22 |
+
with open(f"{e.DATA_PATH}/error.log", 'w+') as f:
|
| 23 |
+
f.write(str(err))
|
| 24 |
|
| 25 |
+
def get_logos(e: Entity, page):
|
| 26 |
+
soup = BeautifulSoup(page.content, "html.parser")
|
| 27 |
logos = soup.select(selectors.logo)
|
|
|
|
| 28 |
|
|
|
|
|
|
|
|
|
|
| 29 |
i = 0
|
| 30 |
lfn = []
|
| 31 |
for l in logos:
|
|
|
|
| 41 |
shutil.copyfileobj(res.raw, f)
|
| 42 |
lfn.append(fn)
|
| 43 |
i+=1
|
| 44 |
+
|
| 45 |
+
def query_vendor_site(e: Entity):
|
| 46 |
+
pathlib.Path(f"./data/{e.bco}").mkdir(parents=True, exist_ok=True)
|
| 47 |
+
|
| 48 |
+
try:
|
| 49 |
+
page = requests.get(e.url)
|
| 50 |
+
except Exception:
|
| 51 |
+
e.url = e.url.replace('http', 'https')
|
| 52 |
+
page = requests.get(e.url)
|
| 53 |
+
|
| 54 |
+
write_cert(e)
|
| 55 |
+
get_logos(e, page)
|
| 56 |
screenshot.sc_entity(e)
|
| 57 |
return (fn, lfn)
|
| 58 |
|