import requests
from bs4 import BeautifulSoup
import json
import time
import string
import concurrent.futures
import os

from .. import config
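# Note: because of the relative import above, this file is meant to be run as a
# module from the project root (for example `python -m <package>.<this_module>`,
# where the package and module names are placeholders for whatever this project
# actually uses), not as a standalone script.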

# --- Configuration ---
# The number of parallel threads to use for scraping fighter details.
# Increase this to scrape faster, but be mindful of rate limits.
MAX_WORKERS = 10

# The delay in seconds between each request to a fighter's detail page.
# This is a politeness measure to avoid overwhelming the server.
REQUEST_DELAY = 0.1
# --- End Configuration ---
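# Note that REQUEST_DELAY is applied by each worker after its own request, so the
# pool as a whole can still issue up to MAX_WORKERS requests per REQUEST_DELAY
# window. If the site starts refusing connections, lower MAX_WORKERS or raise
# REQUEST_DELAY rather than relying on the per-worker delay alone.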

# Currently unused in this file; per-letter list URLs are built in scrape_all_fighters.
BASE_URL = "http://ufcstats.com/statistics/fighters?page=all"


def get_soup(url):
    """Fetches and parses a URL into a BeautifulSoup object."""
    try:
        # A timeout keeps a single unresponsive request from hanging the scraper.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None


def scrape_fighter_details(fighter_url):
    """Scrapes detailed statistics for a single fighter from their page."""
    print(f" Scraping fighter details from: {fighter_url}")
    soup = get_soup(fighter_url)
    if not soup:
        return None

    details = {}
    # Career stats are usually in a list format on the fighter's page.
    # This finds all list items within the career statistics div and extracts the data.
    career_stats_div = soup.find('div', class_='b-list__info-box_style_small-width')
    if career_stats_div:
        stats_list = career_stats_div.find_all('li', class_='b-list__box-list-item')
        for item in stats_list:
            text = item.text.strip()
            if ":" in text:
                parts = text.split(":", 1)
                # Normalize the label into a snake_case key: lowercase, spaces to
                # underscores, dots removed.
                key = parts[0].strip().lower().replace(' ', '_').replace('.', '')
                value = parts[1].strip()
                details[key] = value
    return details
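

# For reference, the stats box on a fighter page typically yields entries like the
# ones below after the normalization in scrape_fighter_details. The exact labels
# depend on the live ufcstats.com markup, and the values here are made up purely
# for illustration:
#   {'height': '5\' 11"', 'weight': '170 lbs.', 'reach': '72"', 'stance': 'Orthodox',
#    'slpm': '3.50', 'str_acc': '48%', 'td_avg': '1.20', 'sub_avg': '0.5'}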


def process_fighter(fighter_data):
    """
    Worker function for the thread pool. Scrapes details for a single fighter,
    updates the dictionary, and applies a delay.
    """
    fighter_url = fighter_data['url']
    try:
        details = scrape_fighter_details(fighter_url)
        if details:
            fighter_data.update(details)
    except Exception as e:
        print(f" Could not scrape details for {fighter_url}: {e}")
    time.sleep(REQUEST_DELAY)
    return fighter_data


def scrape_all_fighters(json_path):
    """Scrapes all fighters from the a-z list pages using parallel processing."""
    # Step 1: Sequentially scrape all fighter list pages. This is fast.
    initial_fighter_list = []
    alphabet = string.ascii_lowercase

    print("--- Step 1: Collecting basic fighter info from all list pages ---")
    for char in alphabet:
        page_url = f"http://ufcstats.com/statistics/fighters?char={char}&page=all"
        print(f"Scanning page: {page_url}")
        soup = get_soup(page_url)
        if not soup:
            continue

        table = soup.find('table', class_='b-statistics__table')
        if not table:
            print(f"Could not find fighters table on page {page_url}")
            continue

        # Skip the first row of the table body, which holds no fighter data.
        fighter_rows = table.find('tbody').find_all('tr')[1:]
        if not fighter_rows:
            continue

        for row in fighter_rows:
            cols = row.find_all('td')
            if len(cols) < 11:
                continue
            fighter_link_tag = cols[0].find('a')
            if not fighter_link_tag or not fighter_link_tag.has_attr('href'):
                continue
            initial_fighter_list.append({
                'first_name': cols[0].text.strip(),
                'last_name': cols[1].text.strip(),
                'nickname': cols[2].text.strip(),
                'height': cols[3].text.strip(),
                'weight_lbs': cols[4].text.strip(),
                'reach_in': cols[5].text.strip(),
                'stance': cols[6].text.strip(),
                'wins': cols[7].text.strip(),
                'losses': cols[8].text.strip(),
                'draws': cols[9].text.strip(),
                'belt': cols[10].find('img') is not None,
                'url': fighter_link_tag['href']
            })
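    # A single entry collected in Step 1 looks roughly like this (values are made
    # up for illustration; the real ones come straight from the list-page cells):
    #   {'first_name': 'Jane', 'last_name': 'Doe', 'nickname': '', 'height': '5\' 8"',
    #    'weight_lbs': '135', 'reach_in': '68.0"', 'stance': 'Orthodox', 'wins': '10',
    #    'losses': '2', 'draws': '0', 'belt': False,
    #    'url': 'http://ufcstats.com/fighter-details/...'}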

    print(f"\n--- Step 2: Scraping details for {len(initial_fighter_list)} fighters in parallel (using up to {MAX_WORKERS} workers) ---")
    fighters_with_details = []
    total_fighters = len(initial_fighter_list)

    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        results = executor.map(process_fighter, initial_fighter_list)
        for i, fighter_data in enumerate(results):
            fighters_with_details.append(fighter_data)
            print(f"Progress: {i + 1}/{total_fighters} fighters scraped.")
            # Checkpoint every 50 fighters so a crash does not lose all progress.
            if (i + 1) % 50 == 0:
                fighters_with_details.sort(key=lambda x: (x['last_name'], x['first_name']))
                with open(json_path, 'w') as f:
                    json.dump(fighters_with_details, f, indent=4)

    fighters_with_details.sort(key=lambda x: (x['last_name'], x['first_name']))
    return fighters_with_details


if __name__ == "__main__":
    # Create the output directory up front, since scrape_all_fighters also writes
    # periodic checkpoints to the same path while it runs.
    if not os.path.exists(config.OUTPUT_DIR):
        os.makedirs(config.OUTPUT_DIR)

    all_fighters_data = scrape_all_fighters(config.FIGHTERS_JSON_PATH)

    with open(config.FIGHTERS_JSON_PATH, 'w') as f:
        json.dump(all_fighters_data, f, indent=4)

    print(f"\nScraping complete. Final data for {len(all_fighters_data)} fighters saved to {config.FIGHTERS_JSON_PATH}")