Spaces:

AlvaroMros
/

ufc-predictor

Sleeping

App Files Files Community

ufc-predictor / src /scrape /scrape_fights.py

AlvaroMros

Refactor imports to use absolute paths and clean up scripts

9678fdb about 1 month ago

raw

history blame

10.1 kB

	import requests
	from bs4 import BeautifulSoup
	import json
	import time
	import concurrent.futures
	from ..config import EVENTS_JSON_PATH

	# --- Configuration ---
	# Size of the thread pool used to scrape the fight-detail pages of one event
	# in parallel (passed as max_workers to ThreadPoolExecutor).
	# Increase this to scrape faster, but be mindful of rate limits.
	MAX_WORKERS = 10
	# Seconds each worker thread sleeps after requesting a fight's detail page,
	# whether the request succeeded or failed. The pause is per worker, so up to
	# MAX_WORKERS requests can still be issued within one delay window.
	# This is a politeness measure to avoid overwhelming the server.
	REQUEST_DELAY = 0.1
	# --- End Configuration ---

	# Listing page fetched to obtain the table of event rows and their links.
	BASE_URL = "http://ufcstats.com/statistics/events/completed?page=all"

	def get_soup(url, timeout=30):
	    """Download *url* and return its HTML parsed with BeautifulSoup.

	    Args:
	        url: Address of the page to fetch.
	        timeout: Seconds to wait for the server to connect/respond before
	            giving up. May also be a ``(connect, read)`` tuple, as accepted
	            by ``requests``. Defaults to 30.

	    Returns:
	        BeautifulSoup: The response body parsed with ``html.parser``.

	    Raises:
	        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
	            status code.
	        requests.exceptions.Timeout: If the server does not respond within
	            *timeout*.
	    """
	    # requests never times out on its own; without an explicit timeout a
	    # single stalled connection would block the calling thread forever.
	    response = requests.get(url, timeout=timeout)
	    response.raise_for_status()  # Raise an exception for bad status codes
	    return BeautifulSoup(response.text, 'html.parser')

	def scrape_fight_details(fight_url):
	    """Scrape the per-fighter statistics from a single fight page.

	    Args:
	        fight_url: URL of the fight detail page.

	    Returns:
	        dict | None: ``{"fighter_1_stats": {...}, "fighter_2_stats": {...}}``
	        mapping stat names to the stripped cell text for each fighter, or
	        ``None`` when the page is for an upcoming fight or contains no
	        stats tables.
	    """
	    print(f" Scraping fight: {fight_url}")
	    page = get_soup(fight_url)

	    # Upcoming fights are rendered with this abbreviated block and carry no stats.
	    if page.find('div', class_='b-fight-details__content-abbreviated'):
	        print(f" Upcoming fight, no details available: {fight_url}")
	        return None

	    stat_tables = page.find_all('table', class_='b-fight-details__table')
	    if not stat_tables:
	        print(f" No stats tables found on {fight_url}")
	        return None

	    first_fighter = {}
	    second_fighter = {}

	    # Column index -> stat name for the first table (overall totals).
	    totals_columns = {
	        1: 'kd', 2: 'sig_str', 3: 'sig_str_percent', 4: 'total_str',
	        5: 'td', 6: 'td_percent', 7: 'sub_att', 8: 'rev', 9: 'ctrl',
	    }
	    # Column index -> stat name for the second table (significant strikes).
	    sig_strike_columns = {
	        2: 'sig_str_head', 3: 'sig_str_body', 4: 'sig_str_leg',
	        5: 'sig_str_distance', 6: 'sig_str_clinch', 7: 'sig_str_ground',
	    }

	    # zip() stops at the shorter input, so the significant-strikes layout is
	    # only applied when a second table is actually present.
	    for table, column_names in zip(stat_tables, (totals_columns, sig_strike_columns)):
	        tbody = table.find('tbody')
	        first_row = tbody.find('tr') if tbody else None
	        if not first_row:
	            continue

	        cells = first_row.find_all('td')
	        for position, stat_name in column_names.items():
	            if position >= len(cells):
	                continue
	            # Both fighters' values live in one <td>, each in its own <p>.
	            paragraphs = cells[position].find_all('p')
	            if len(paragraphs) == 2:
	                first_fighter[stat_name] = paragraphs[0].text.strip()
	                second_fighter[stat_name] = paragraphs[1].text.strip()

	    return {"fighter_1_stats": first_fighter, "fighter_2_stats": second_fighter}

	def fetch_fight_details_worker(fight_url):
	    """
	    Thread-pool entry point: scrape one fight and then pause.

	    Any exception raised while scraping is reported on stdout and turned
	    into a ``None`` result, so one bad page does not abort the whole batch.
	    The worker sleeps for REQUEST_DELAY seconds after every attempt,
	    successful or not, to stay polite to the server.
	    """
	    try:
	        outcome = scrape_fight_details(fight_url)
	        time.sleep(REQUEST_DELAY)
	    except Exception as err:
	        print(f" Could not scrape fight details for {fight_url}: {err}")
	        # Pause after a failure too, so errors do not turn into rapid retries.
	        time.sleep(REQUEST_DELAY)
	        outcome = None
	    return outcome

	def _determine_winner(result_tags, fighter1, fighter2):
	    """Derive the fight outcome from the <p> tags of a row's W/L column.

	    Returns the winning fighter's name, ``"Draw"``, ``"NC"``, or ``None``
	    when the outcome cannot be recognised.
	    """
	    labels = [tag.text.strip().lower() for tag in result_tags]

	    if len(labels) == 1:
	        label = labels[0]
	        if 'win' in label:
	            # A single 'win' marker corresponds to the first fighter listed.
	            return fighter1
	        if 'draw' in label:
	            return "Draw"
	        if 'nc' in label:
	            return "NC"
	    elif len(labels) == 2:
	        # Defensive case: one marker per fighter.
	        if 'win' in labels[0]:
	            return fighter1
	        if 'win' in labels[1]:
	            return fighter2
	        if 'draw' in labels[0]:
	            return "Draw"
	        if 'nc' in labels[0]:
	            return "NC"
	    return None


	def scrape_event_details(event_url):
	    """Scrape one event page and the details of every fight listed on it.

	    Args:
	        event_url: URL of the event page.

	    Returns:
	        dict: ``name``, ``date``, ``location`` and ``fights``. Each fight is
	        a dict with ``fighter_1``, ``fighter_2``, ``winner``,
	        ``weight_class``, ``method``, ``round``, ``time`` and ``details``
	        (the scraped per-fighter stats, or ``None`` if unavailable).

	    Raises:
	        AttributeError, IndexError, KeyError: If the page does not have the
	            expected structure (missing title, info list, columns or
	            ``data-link`` attribute).
	    """
	    print(f"Scraping event: {event_url}")
	    soup = get_soup(event_url)
	    event_details = {}

	    # Extract event name
	    event_details['name'] = soup.find('h2', class_='b-content__title').text.strip()

	    # Extract event date and location. Split on the first colon only, so a
	    # value that itself contains ':' is kept whole instead of being truncated.
	    info_list = soup.find('ul', class_='b-list__box-list')
	    list_items = info_list.find_all('li', class_='b-list__box-list-item')
	    event_details['date'] = list_items[0].text.split(':', 1)[1].strip()
	    event_details['location'] = list_items[1].text.split(':', 1)[1].strip()

	    # Step 1: Gather base info and URLs for all fights on the event page.
	    fights_to_process = []
	    fight_table = soup.find('table', class_='b-fight-details__table')
	    if fight_table:
	        rows = fight_table.find('tbody').find_all('tr', class_='b-fight-details__table-row')
	        for row in rows:
	            cols = row.find_all('td', class_='b-fight-details__table-col')

	            fighter_tags = cols[1].find_all('p')
	            fighter1 = fighter_tags[0].text.strip()
	            fighter2 = fighter_tags[1].text.strip()

	            fights_to_process.append({
	                'fighter_1': fighter1,
	                'fighter_2': fighter2,
	                'winner': _determine_winner(cols[0].find_all('p'), fighter1, fighter2),
	                'weight_class': cols[6].text.strip(),
	                'method': ' '.join(cols[7].stripped_strings),
	                'round': cols[8].text.strip(),
	                'time': cols[9].text.strip(),
	                'url': row['data-link'],
	            })

	    # Step 2: Scrape the details for all fights in parallel.
	    completed_fights = []
	    if fights_to_process:
	        # The URL is only needed to fetch the details; drop it from the output.
	        fight_urls = [fight.pop('url') for fight in fights_to_process]

	        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
	            # map() yields results in the same order as the submitted URLs,
	            # so they can be paired with the fights positionally.
	            details_list = list(executor.map(fetch_fight_details_worker, fight_urls))

	        for fight, details in zip(fights_to_process, details_list):
	            fight['details'] = details or None
	            completed_fights.append(fight)

	    event_details['fights'] = completed_fights
	    return event_details

	def scrape_all_events(json_path):
	    """Scrape every event listed on the events page.

	    After every 10th row of the listing, the events collected so far are
	    written to *json_path* as a checkpoint (overwriting the file). No write
	    happens after the final rows unless their count is a multiple of 10, so
	    the caller is responsible for persisting the returned list.

	    Args:
	        json_path (str): File that receives the periodic checkpoint.

	    Returns:
	        list: One dict per successfully scraped event; empty if the events
	        table cannot be found.
	    """
	    soup = get_soup(BASE_URL)
	    events = []

	    table = soup.find('table', class_='b-statistics__table-events')
	    if not table:
	        print("Could not find events table on the page.")
	        return []

	    # Keep only rows that actually contain cells (skips header/spacer rows).
	    event_rows = [row for row in table.find_all('tr', class_='b-statistics__table-row') if row.find('td')]
	    total_events = len(event_rows)
	    print(f"Found {total_events} events to scrape.")

	    for position, row in enumerate(event_rows, start=1):
	        event_link_tag = row.find('a', class_='b-link b-link_style_black')
	        if not event_link_tag or not event_link_tag.has_attr('href'):
	            continue

	        event_url = event_link_tag['href']

	        # Only the scrape itself is guarded here, so that a checkpoint-write
	        # problem is not misreported as a failure to process the event.
	        try:
	            event_data = scrape_event_details(event_url)
	        except Exception as e:
	            print(f"Could not process event {event_url}. Error: {e}")
	            continue

	        if event_data:
	            events.append(event_data)

	        print(f"Progress: {position}/{total_events} events scraped.")

	        if position % 10 == 0:
	            print(f"--- Saving progress: {position} of {total_events} events saved. ---")
	            try:
	                with open(json_path, 'w', encoding='utf-8') as f:
	                    json.dump(events, f, indent=4)
	            except Exception as e:
	                # Best effort: keep scraping even if the checkpoint fails.
	                print(f"Could not save progress to {json_path}. Error: {e}")

	    return events

	def scrape_latest_events(json_path, num_events=5):
	    """
	    Scrape only the first N events listed on the events page.

	    This is useful for incremental updates to avoid re-scraping all data.

	    Args:
	        json_path (str): Currently unused. This function does not write any
	            file; the caller must persist the returned list itself.
	        num_events (int): Number of events to take from the top of the
	            listing (default: 5). Zero or a negative number selects no
	            events; ``None`` selects all of them.

	    Returns:
	        list: One dict per successfully scraped event; empty if the events
	        table cannot be found.
	    """
	    soup = get_soup(BASE_URL)
	    events = []

	    table = soup.find('table', class_='b-statistics__table-events')
	    if not table:
	        print("Could not find events table on the page.")
	        return []

	    event_rows = [row for row in table.find_all('tr', class_='b-statistics__table-row') if row.find('td')]

	    # Limit to the first N rows (the listing is expected to show the most
	    # recent events first). A negative count is clamped to zero: as a raw
	    # slice bound it would mean "all but the last N" rather than "none".
	    limit = None if num_events is None else max(num_events, 0)
	    latest_event_rows = event_rows[:limit]
	    total_events = len(latest_event_rows)
	    print(f"Found {len(event_rows)} total events. Scraping latest {total_events} events.")

	    for i, row in enumerate(latest_event_rows):
	        event_link_tag = row.find('a', class_='b-link b-link_style_black')
	        if not event_link_tag or not event_link_tag.has_attr('href'):
	            continue

	        event_url = event_link_tag['href']

	        try:
	            event_data = scrape_event_details(event_url)
	            if event_data:
	                events.append(event_data)

	            print(f"Progress: {i+1}/{total_events} latest events scraped.")
	        except Exception as e:
	            # One broken event page must not abort the remaining ones.
	            print(f"Could not process event {event_url}. Error: {e}")

	    return events