Spaces:
Sleeping
Sleeping
| import urllib.request | |
| import re | |
| import json | |
| import urllib.parse | |
| from urllib.parse import urlsplit, quote | |
| from urllib.request import Request, urlopen | |
| from bs4 import BeautifulSoup | |
| from tqdm import tqdm | |
| import pandas as pd | |
# Crawl the Korean Pokémon Fandom wiki one page at a time, starting from `url`.
# For every page, record the Pokémon's name and the names found in its
# evolution table, then follow the last link of the page's first table (which
# this script treats as the link to the next Pokémon). The collected records
# are written to pokemon_evolve.json.
url = 'https://pokemon.fandom.com/ko/wiki/이상해씨_(포켓몬)'
url_info = urlsplit(url)
# Percent-encode the non-ASCII path so the URL can be sent in an HTTP request.
encoded_url = f'{url_info.scheme}://{url_info.netloc}{quote(url_info.path)}'

info = []    # one {"name": ..., "evolve": [...]} record per parsed page
errors = []  # one {"url": ..., "stage": ..., "error": ...} record per failure
target_number = 1017  # stop once the page whose index reads "No.1017" is done

# Compiled once instead of on every iteration; matches the inline style
# attribute of the table that holds the evolution line.
evolve_table_style = re.compile("^margin:auto; text-align:center;")

# One page per iteration, so the range itself caps the crawl at
# `target_number` pages even if the "No.XXXX" stop marker is never seen.
for _ in tqdm(range(target_number)):
    page_url = encoded_url
    req = Request(page_url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        # The context manager closes the HTTP response even if read() fails.
        with urlopen(req) as res:
            html = res.read()
    except OSError as exc:
        # URLError/HTTPError and socket-level failures are OSError subclasses.
        # Without this page the next link is unknown, so stop crawling but
        # keep (and later save) everything collected so far.
        errors.append(dict(url=page_url, stage='fetch', error=repr(exc)))
        tqdm.write(f'fetch failed, stopping at {page_url}: {exc!r}')
        break

    soup = BeautifulSoup(html, 'html.parser')

    number = None
    try:
        name = soup.find("div", {"class": "name-ko"}).text.strip()
        number = soup.find("div", {"class": "index"}).text.strip()
        evol_tables = soup.find("table", style=evolve_table_style)
        evolve = [e.span.text for e in evol_tables.find_all("table")]
    except AttributeError as exc:
        # A find() call returned None (element missing on this page). Skip
        # the record instead of aborting the whole crawl.
        errors.append(dict(url=page_url, stage='parse', error=repr(exc)))
        tqdm.write(f'parse failed, skipping {page_url}: {exc!r}')
    else:
        info.append(dict(name=name, evolve=evolve))

    # Check the stop marker before resolving the next link, so the final
    # page does not need a usable "next" link.
    if number == f"No.{target_number:04d}":
        break

    try:
        next_monster = soup.find("table").find_all("a")[-1]['href']
    except (AttributeError, IndexError, KeyError) as exc:
        # No table, no link in it, or a link without href: nowhere to go.
        errors.append(dict(url=page_url, stage='next-link', error=repr(exc)))
        tqdm.write(f'no next link, stopping at {page_url}: {exc!r}')
        break
    encoded_url = "https://pokemon.fandom.com" + next_monster

# ensure_ascii=False writes non-ASCII text as raw characters, so the file is
# opened as UTF-8 explicitly rather than with the platform default encoding.
with open('pokemon_evolve.json', 'w', encoding='utf-8') as f:
    json.dump(info, f, ensure_ascii=False, indent=4)