Spaces:
Running
Running
Alvaro
committed on
Commit
·
bf7e729
1
Parent(s):
32a1590
Add logistic regression model and ML preprocessing
Browse files
Introduces a new LogisticRegressionModel for fight outcome prediction, including a comprehensive ML preprocessing module (preprocess.py) for feature engineering and data preparation. Refactors model interfaces to accept fight dictionaries, updates the pipeline and main entrypoint for model selection and reporting, and improves data cleaning in the scraping and preprocessing steps. Also fixes fighter name formatting and adds pandas and scikit-learn as dependencies.
- output/ufc_fighters.csv +16 -16
- requirements.txt +3 -1
- src/predict/main.py +23 -16
- src/predict/models.py +108 -16
- src/predict/pipeline.py +2 -1
- src/predict/preprocess.py +227 -0
- src/scrape/main.py +13 -21
- src/scrape/preprocess.py +20 -11
output/ufc_fighters.csv
CHANGED
@@ -374,7 +374,7 @@ Brad,Blackburn,Bad Brad,18,13,1,False,178,170,73.0,Orthodox,"May 25, 1977",http:
|
|
374 |
Jason,Blackford,,3,3,0,False,,,,,,http://ufcstats.com/fighter-details/619d807fa54ae8f7,,,1500
|
375 |
Sherrard,Blackledge,The Thriller,5,1,0,False,180,155,75.0,Orthodox,"Aug 16, 1993",http://ufcstats.com/fighter-details/0e5c79b3594ff0ad,75,155,1500
|
376 |
Tom,Blackledge,,10,7,0,False,,205,,Orthodox,,http://ufcstats.com/fighter-details/2adb11835acd815b,,205,1477
|
377 |
-
Da
|
378 |
Chasen,Blair,Mestizo,6,3,0,False,178,155,71.0,Orthodox,"Oct 14, 1998",http://ufcstats.com/fighter-details/0f4a536507f33576,71,155,1500
|
379 |
Erin,Blanchfield,Cold Blooded,13,2,0,False,163,125,66.0,Orthodox,"May 04, 1999",http://ufcstats.com/fighter-details/669970f7feba8ecd,66,125,1622
|
380 |
David,Blanco,,2,0,0,False,170,145,,,,http://ufcstats.com/fighter-details/ebf298f8ac7e232b,,145,1500
|
@@ -831,7 +831,7 @@ Will,Currie,Drago,12,4,0,False,190,185,76.0,Switch,"Nov 12, 1998",http://ufcstat
|
|
831 |
Chris,Curtis,Action-Man,31,12,0,False,178,170,75.0,Orthodox,"Jul 15, 1987",http://ufcstats.com/fighter-details/5442f1bc4b47eaf3,75,170,1529
|
832 |
Ion,Cutelaba,The Hulk,19,11,1,False,185,205,75.0,Southpaw,"Dec 14, 1993",http://ufcstats.com/fighter-details/cd13728ae1151f46,75,205,1492
|
833 |
Gleidson,Cutis,Poney,7,4,0,False,175,155,,Orthodox,"Feb 07, 1989",http://ufcstats.com/fighter-details/44a94bbde42246e4,,155,1500
|
834 |
-
Sarah,D
|
835 |
Alex,Da Silva,Leko,21,4,0,False,173,155,73.0,Orthodox,"Feb 04, 1996",http://ufcstats.com/fighter-details/c3ded6f7155f9ea4,73,155,1466
|
836 |
Paulo,Da Silva,PH,11,1,0,False,,125,,,"Jul 03, 2002",http://ufcstats.com/fighter-details/3fe8ad7e1594537a,,125,1500
|
837 |
Radley,Da Silva,Snake Eyes,7,1,0,False,,145,,,"Jun 20, 1995",http://ufcstats.com/fighter-details/0d68c7bfdff1dc34,,145,1500
|
@@ -1925,7 +1925,7 @@ Tetsuji,Kato,,20,10,0,False,,155,,Orthodox,,http://ufcstats.com/fighter-details/
|
|
1925 |
Brad,Katona,Superman,16,5,0,False,168,135,64.0,Orthodox,"Dec 19, 1991",http://ufcstats.com/fighter-details/7b433309b0fd12aa,64,135,1471
|
1926 |
Calvin,Kattar,,23,9,0,False,180,145,72.0,Orthodox,"Mar 26, 1988",http://ufcstats.com/fighter-details/751de04455cfaac0,72,145,1530
|
1927 |
Sarah,Kaufman,,18,4,0,False,165,135,66.0,Orthodox,"Sep 20, 1985",http://ufcstats.com/fighter-details/36df8e119aec6175,66,135,1481
|
1928 |
-
Lone
|
1929 |
Yusuke,Kawaguchi,,18,12,0,False,183,255,,,"Aug 14, 1980",http://ufcstats.com/fighter-details/fa2320781bfe4f49,,255,1500
|
1930 |
Canaan,Kawaihae,,6,2,0,False,183,145,71.0,Southpaw,"Aug 26, 1997",http://ufcstats.com/fighter-details/58d42b9e920b25fc,71,145,1500
|
1931 |
Tatsuya,Kawajiri,Crusher,36,11,2,False,170,145,69.0,Orthodox,"May 08, 1978",http://ufcstats.com/fighter-details/80d918336163b80c,69,145,1509
|
@@ -2432,7 +2432,7 @@ Nick,Maximov,,8,2,0,False,183,185,76.0,Southpaw,"Dec 23, 1997",http://ufcstats.c
|
|
2432 |
Elaina,Maxwell,Beef,7,4,0,False,173,145,,Orthodox,"Dec 16, 1978",http://ufcstats.com/fighter-details/a16ce18149021139,,145,1500
|
2433 |
Jack,May,The Outlaw,9,3,0,False,203,255,,Switch,"Apr 14, 1981",http://ufcstats.com/fighter-details/2c6e81426dd7573c,,255,1462
|
2434 |
Jeremy,May,,13,10,0,False,185,185,,,"Oct 23, 1986",http://ufcstats.com/fighter-details/c32fdfe75cda5b22,,185,1500
|
2435 |
-
Don
|
2436 |
Gray,Maynard,,13,7,1,False,175,155,70.0,Orthodox,"May 09, 1979",http://ufcstats.com/fighter-details/7d96bc577e5178b2,70,155,1544
|
2437 |
Brooke,Mayo,The Bully,0,1,0,False,170,125,,,,http://ufcstats.com/fighter-details/1b41c21d947d6f2f,,125,1500
|
2438 |
Gina,Mazany,Danger,7,6,0,False,168,125,68.0,Southpaw,"Aug 19, 1988",http://ufcstats.com/fighter-details/016a8d958883167c,68,125,1429
|
@@ -2824,16 +2824,16 @@ Said,Nurmagomedov,,18,4,0,False,173,135,70.0,Orthodox,"Apr 05, 1992",http://ufcs
|
|
2824 |
Umar,Nurmagomedov,,18,1,0,False,173,135,69.0,Orthodox,"Jan 03, 1996",http://ufcstats.com/fighter-details/2b6fc1c02736833d,69,135,1605
|
2825 |
Adilet,Nurmatov,Kok-Zhal,13,2,0,False,,155,,,"Aug 02, 1997",http://ufcstats.com/fighter-details/689502703bbfe5f2,,155,1500
|
2826 |
Kennedy,Nzechukwu,African Savage,14,5,0,False,196,236,83.0,Southpaw,"Jun 13, 1992",http://ufcstats.com/fighter-details/8667caa0451d245b,83,236,1550
|
2827 |
-
Jake,O
|
2828 |
-
TJ,O
|
2829 |
-
Sean,O
|
2830 |
-
Dan,O
|
2831 |
-
Sean,O
|
2832 |
-
Sean,O
|
2833 |
-
Jeremiah,O
|
2834 |
-
Chuck,O
|
2835 |
-
Casey,O
|
2836 |
-
Brendan,O
|
2837 |
Takahiro,Oba,,5,7,1,False,173,200,,Southpaw,,http://ufcstats.com/fighter-details/7139cd2ae4bf6a29,,200,1500
|
2838 |
Nobuhiro,Obiya,,12,5,1,False,173,145,,Orthodox,"Jan 15, 1981",http://ufcstats.com/fighter-details/6e3282d57d2467a0,,145,1500
|
2839 |
Jose,Ochoa,Kalzifer,8,1,0,False,170,125,67.0,Southpaw,"Dec 31, 2000",http://ufcstats.com/fighter-details/88be62d6c1e6dadb,67,125,1502
|
@@ -2905,7 +2905,7 @@ Craig,Oxley,,0,3,0,False,175,155,,,"Feb 14, 1973",http://ufcstats.com/fighter-de
|
|
2905 |
Shungo,Oyama,,14,19,0,False,180,185,,Southpaw,"Apr 11, 1974",http://ufcstats.com/fighter-details/47b7e4e60813b7b2,,185,1500
|
2906 |
Ren,Ozaki,,6,1,2,False,170,135,68.0,Orthodox,"Dec 15, 2001",http://ufcstats.com/fighter-details/8997ee20b6a43d76,68,135,1500
|
2907 |
Alptekin,Ozkilic,The Turkish Delight,9,5,0,False,165,125,65.0,Orthodox,"Mar 27, 1986",http://ufcstats.com/fighter-details/e18a19001a3f7c7d,65,125,1463
|
2908 |
-
Raquel,Pa
|
2909 |
Nick,Pace,,8,3,0,False,170,135,68.0,Orthodox,"Apr 17, 1987",http://ufcstats.com/fighter-details/8cb76103cd8a1562,68,135,1479
|
2910 |
Angel,Pacheco,,7,3,0,False,173,135,70.0,Orthodox,"Jan 13, 1992",http://ufcstats.com/fighter-details/07797f10b9569cfc,70,135,1479
|
2911 |
Larissa,Pacheco,,10,2,0,False,170,135,,Orthodox,"Sep 07, 1994",http://ufcstats.com/fighter-details/16b89be2f5c16fba,,135,1462
|
@@ -3438,7 +3438,7 @@ Hugo,Sandoval,,2,3,0,False,,135,,,,http://ufcstats.com/fighter-details/9d51bcc28
|
|
3438 |
Joseph,Sandoval,,7,7,0,False,170,135,,Southpaw,"May 11, 1986",http://ufcstats.com/fighter-details/696002b59f09d73b,,135,1461
|
3439 |
Raul,Sandoval,,3,3,0,False,,130,,,,http://ufcstats.com/fighter-details/f9ad10f6a49e5452,,130,1500
|
3440 |
Chris,Sanford,,5,1,0,False,180,185,,Orthodox,"Mar 12, 1968",http://ufcstats.com/fighter-details/29f935654825331b,,185,1480
|
3441 |
-
Roldan,Sangcha
|
3442 |
Martin,Sano,Spartan,4,3,1,False,180,170,75.0,Orthodox,"May 30, 1991",http://ufcstats.com/fighter-details/16a64f93f6678b7b,75,170,1481
|
3443 |
Yuhi,Sano,,0,4,0,False,180,243,,Orthodox,"Feb 02, 1965",http://ufcstats.com/fighter-details/4c12aa7ca246e7a4,,243,1500
|
3444 |
Jonathan,Santa Maria,,3,4,0,False,175,125,,,,http://ufcstats.com/fighter-details/3143e5daff9e5b71,,125,1500
|
|
|
374 |
Jason,Blackford,,3,3,0,False,,,,,,http://ufcstats.com/fighter-details/619d807fa54ae8f7,,,1500
|
375 |
Sherrard,Blackledge,The Thriller,5,1,0,False,180,155,75.0,Orthodox,"Aug 16, 1993",http://ufcstats.com/fighter-details/0e5c79b3594ff0ad,75,155,1500
|
376 |
Tom,Blackledge,,10,7,0,False,,205,,Orthodox,,http://ufcstats.com/fighter-details/2adb11835acd815b,,205,1477
|
377 |
+
Da'Mon,Blackshear,The Monster,17,7,1,False,178,135,72.0,Switch,"Aug 12, 1994",http://ufcstats.com/fighter-details/da22387a0407a2dc,72,135,1500
|
378 |
Chasen,Blair,Mestizo,6,3,0,False,178,155,71.0,Orthodox,"Oct 14, 1998",http://ufcstats.com/fighter-details/0f4a536507f33576,71,155,1500
|
379 |
Erin,Blanchfield,Cold Blooded,13,2,0,False,163,125,66.0,Orthodox,"May 04, 1999",http://ufcstats.com/fighter-details/669970f7feba8ecd,66,125,1622
|
380 |
David,Blanco,,2,0,0,False,170,145,,,,http://ufcstats.com/fighter-details/ebf298f8ac7e232b,,145,1500
|
|
|
831 |
Chris,Curtis,Action-Man,31,12,0,False,178,170,75.0,Orthodox,"Jul 15, 1987",http://ufcstats.com/fighter-details/5442f1bc4b47eaf3,75,170,1529
|
832 |
Ion,Cutelaba,The Hulk,19,11,1,False,185,205,75.0,Southpaw,"Dec 14, 1993",http://ufcstats.com/fighter-details/cd13728ae1151f46,75,205,1492
|
833 |
Gleidson,Cutis,Poney,7,4,0,False,175,155,,Orthodox,"Feb 07, 1989",http://ufcstats.com/fighter-details/44a94bbde42246e4,,155,1500
|
834 |
+
Sarah,D'alelio,The Monster,11,7,0,False,170,145,,,"Dec 13, 1980",http://ufcstats.com/fighter-details/ac45450f75d14f16,,145,1500
|
835 |
Alex,Da Silva,Leko,21,4,0,False,173,155,73.0,Orthodox,"Feb 04, 1996",http://ufcstats.com/fighter-details/c3ded6f7155f9ea4,73,155,1466
|
836 |
Paulo,Da Silva,PH,11,1,0,False,,125,,,"Jul 03, 2002",http://ufcstats.com/fighter-details/3fe8ad7e1594537a,,125,1500
|
837 |
Radley,Da Silva,Snake Eyes,7,1,0,False,,145,,,"Jun 20, 1995",http://ufcstats.com/fighter-details/0d68c7bfdff1dc34,,145,1500
|
|
|
1925 |
Brad,Katona,Superman,16,5,0,False,168,135,64.0,Orthodox,"Dec 19, 1991",http://ufcstats.com/fighter-details/7b433309b0fd12aa,64,135,1471
|
1926 |
Calvin,Kattar,,23,9,0,False,180,145,72.0,Orthodox,"Mar 26, 1988",http://ufcstats.com/fighter-details/751de04455cfaac0,72,145,1530
|
1927 |
Sarah,Kaufman,,18,4,0,False,165,135,66.0,Orthodox,"Sep 20, 1985",http://ufcstats.com/fighter-details/36df8e119aec6175,66,135,1481
|
1928 |
+
Lone'er,Kavanagh,,9,0,0,False,163,125,67.0,Orthodox,"Jun 09, 1999",http://ufcstats.com/fighter-details/bb2c3c3a466224af,67,125,1500
|
1929 |
Yusuke,Kawaguchi,,18,12,0,False,183,255,,,"Aug 14, 1980",http://ufcstats.com/fighter-details/fa2320781bfe4f49,,255,1500
|
1930 |
Canaan,Kawaihae,,6,2,0,False,183,145,71.0,Southpaw,"Aug 26, 1997",http://ufcstats.com/fighter-details/58d42b9e920b25fc,71,145,1500
|
1931 |
Tatsuya,Kawajiri,Crusher,36,11,2,False,170,145,69.0,Orthodox,"May 08, 1978",http://ufcstats.com/fighter-details/80d918336163b80c,69,145,1509
|
|
|
2432 |
Elaina,Maxwell,Beef,7,4,0,False,173,145,,Orthodox,"Dec 16, 1978",http://ufcstats.com/fighter-details/a16ce18149021139,,145,1500
|
2433 |
Jack,May,The Outlaw,9,3,0,False,203,255,,Switch,"Apr 14, 1981",http://ufcstats.com/fighter-details/2c6e81426dd7573c,,255,1462
|
2434 |
Jeremy,May,,13,10,0,False,185,185,,,"Oct 23, 1986",http://ufcstats.com/fighter-details/c32fdfe75cda5b22,,185,1500
|
2435 |
+
Don'Tale,Mayes,Lord Kong,11,9,0,False,198,260,81.0,Orthodox,"Jan 16, 1992",http://ufcstats.com/fighter-details/1a9480fc288e55d7,81,260,1500
|
2436 |
Gray,Maynard,,13,7,1,False,175,155,70.0,Orthodox,"May 09, 1979",http://ufcstats.com/fighter-details/7d96bc577e5178b2,70,155,1544
|
2437 |
Brooke,Mayo,The Bully,0,1,0,False,170,125,,,,http://ufcstats.com/fighter-details/1b41c21d947d6f2f,,125,1500
|
2438 |
Gina,Mazany,Danger,7,6,0,False,168,125,68.0,Southpaw,"Aug 19, 1988",http://ufcstats.com/fighter-details/016a8d958883167c,68,125,1429
|
|
|
2824 |
Umar,Nurmagomedov,,18,1,0,False,173,135,69.0,Orthodox,"Jan 03, 1996",http://ufcstats.com/fighter-details/2b6fc1c02736833d,69,135,1605
|
2825 |
Adilet,Nurmatov,Kok-Zhal,13,2,0,False,,155,,,"Aug 02, 1997",http://ufcstats.com/fighter-details/689502703bbfe5f2,,155,1500
|
2826 |
Kennedy,Nzechukwu,African Savage,14,5,0,False,196,236,83.0,Southpaw,"Jun 13, 1992",http://ufcstats.com/fighter-details/8667caa0451d245b,83,236,1550
|
2827 |
+
Jake,O'Brien,Irish,15,4,0,False,190,205,76.0,Orthodox,"Sep 25, 1984",http://ufcstats.com/fighter-details/20bcc9966affb19c,76,205,1500
|
2828 |
+
TJ,O'Brien,The Spider,20,8,0,False,188,155,,Orthodox,"Jan 01, 1987",http://ufcstats.com/fighter-details/d25b93992f285953,,155,1500
|
2829 |
+
Sean,O'Connell,The Real OC,17,9,0,False,185,205,74.0,Orthodox,"Sep 02, 1983",http://ufcstats.com/fighter-details/cb52f9490c2dc069,74,205,1500
|
2830 |
+
Dan,O'Connor,Johnny Irish,6,6,0,False,163,115,,,"Nov 17, 1982",http://ufcstats.com/fighter-details/69ea0119f6f0dfe0,,115,1500
|
2831 |
+
Sean,O'Haire,,4,2,0,False,196,270,,Orthodox,"Feb 25, 1971",http://ufcstats.com/fighter-details/46effbd1135423c5,,270,1500
|
2832 |
+
Sean,O'Malley,Suga,18,3,0,False,180,135,72.0,Switch,"Oct 24, 1994",http://ufcstats.com/fighter-details/b50a426a33da0012,72,135,1500
|
2833 |
+
Jeremiah,O'Neal,,13,25,0,False,173,260,,,"Oct 25, 1977",http://ufcstats.com/fighter-details/338fda4ec7034c5d,,260,1500
|
2834 |
+
Chuck,O'Neil,Cold Steel,17,9,0,False,188,170,,,"Sep 22, 1985",http://ufcstats.com/fighter-details/56bc9ccb609df534,,170,1500
|
2835 |
+
Casey,O'Neill,King,10,2,0,False,168,125,69.0,Orthodox,"Oct 07, 1997",http://ufcstats.com/fighter-details/04835018f90b118c,69,125,1500
|
2836 |
+
Brendan,O'Reilly,Badger,6,3,0,False,170,170,69.0,Orthodox,"Jun 24, 1987",http://ufcstats.com/fighter-details/494b0bfdbac74502,69,170,1500
|
2837 |
Takahiro,Oba,,5,7,1,False,173,200,,Southpaw,,http://ufcstats.com/fighter-details/7139cd2ae4bf6a29,,200,1500
|
2838 |
Nobuhiro,Obiya,,12,5,1,False,173,145,,Orthodox,"Jan 15, 1981",http://ufcstats.com/fighter-details/6e3282d57d2467a0,,145,1500
|
2839 |
Jose,Ochoa,Kalzifer,8,1,0,False,170,125,67.0,Southpaw,"Dec 31, 2000",http://ufcstats.com/fighter-details/88be62d6c1e6dadb,67,125,1502
|
|
|
2905 |
Shungo,Oyama,,14,19,0,False,180,185,,Southpaw,"Apr 11, 1974",http://ufcstats.com/fighter-details/47b7e4e60813b7b2,,185,1500
|
2906 |
Ren,Ozaki,,6,1,2,False,170,135,68.0,Orthodox,"Dec 15, 2001",http://ufcstats.com/fighter-details/8997ee20b6a43d76,68,135,1500
|
2907 |
Alptekin,Ozkilic,The Turkish Delight,9,5,0,False,165,125,65.0,Orthodox,"Mar 27, 1986",http://ufcstats.com/fighter-details/e18a19001a3f7c7d,65,125,1463
|
2908 |
+
Raquel,Pa'aluhi,Rocky,6,5,0,False,170,135,,,,http://ufcstats.com/fighter-details/373be586f370d400,,135,1500
|
2909 |
Nick,Pace,,8,3,0,False,170,135,68.0,Orthodox,"Apr 17, 1987",http://ufcstats.com/fighter-details/8cb76103cd8a1562,68,135,1479
|
2910 |
Angel,Pacheco,,7,3,0,False,173,135,70.0,Orthodox,"Jan 13, 1992",http://ufcstats.com/fighter-details/07797f10b9569cfc,70,135,1479
|
2911 |
Larissa,Pacheco,,10,2,0,False,170,135,,Orthodox,"Sep 07, 1994",http://ufcstats.com/fighter-details/16b89be2f5c16fba,,135,1462
|
|
|
3438 |
Joseph,Sandoval,,7,7,0,False,170,135,,Southpaw,"May 11, 1986",http://ufcstats.com/fighter-details/696002b59f09d73b,,135,1461
|
3439 |
Raul,Sandoval,,3,3,0,False,,130,,,,http://ufcstats.com/fighter-details/f9ad10f6a49e5452,,130,1500
|
3440 |
Chris,Sanford,,5,1,0,False,180,185,,Orthodox,"Mar 12, 1968",http://ufcstats.com/fighter-details/29f935654825331b,,185,1480
|
3441 |
+
Roldan,Sangcha'an,The Executioner,4,2,0,False,163,125,,Orthodox,"Dec 04, 1990",http://ufcstats.com/fighter-details/57887765f831e228,,125,1500
|
3442 |
Martin,Sano,Spartan,4,3,1,False,180,170,75.0,Orthodox,"May 30, 1991",http://ufcstats.com/fighter-details/16a64f93f6678b7b,75,170,1481
|
3443 |
Yuhi,Sano,,0,4,0,False,180,243,,Orthodox,"Feb 02, 1965",http://ufcstats.com/fighter-details/4c12aa7ca246e7a4,,243,1500
|
3444 |
Jonathan,Santa Maria,,3,4,0,False,175,125,,,,http://ufcstats.com/fighter-details/3143e5daff9e5b71,,125,1500
|
requirements.txt
CHANGED
@@ -1,2 +1,4 @@
|
|
1 |
requests
|
2 |
-
beautifulsoup4
|
|
|
|
|
|
1 |
requests
|
2 |
+
beautifulsoup4
|
3 |
+
pandas
|
4 |
+
scikit-learn
|
src/predict/main.py
CHANGED
@@ -1,30 +1,37 @@
|
|
1 |
-
|
|
|
2 |
from .pipeline import PredictionPipeline
|
3 |
|
4 |
def main():
|
5 |
"""
|
6 |
-
|
7 |
-
|
8 |
"""
|
9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
-
#
|
12 |
-
|
13 |
-
|
14 |
-
# Add other models here to compare them, e.g.:
|
15 |
-
# logistic_model = LogisticRegressionModel()
|
16 |
-
|
17 |
-
# 2. Create a list of the models to evaluate
|
18 |
models_to_run = [
|
19 |
-
|
20 |
-
|
21 |
]
|
|
|
22 |
|
23 |
-
# 3. Initialize and run the pipeline
|
24 |
pipeline = PredictionPipeline(models=models_to_run)
|
25 |
|
26 |
-
|
27 |
-
|
|
|
|
|
|
|
28 |
|
29 |
if __name__ == '__main__':
|
30 |
main()
|
|
|
1 |
+
import argparse
|
2 |
+
from .models import EloBaselineModel, LogisticRegressionModel
|
3 |
from .pipeline import PredictionPipeline
|
4 |
|
5 |
def main():
    """
    Main entry point to run the prediction pipeline.
    You can specify which models to run and the reporting format.

    Parses the ``--report`` command-line option, builds the list of models,
    and runs them through ``PredictionPipeline``. If a required data file is
    missing, the error is written to stderr and the process exits with
    status 1 so that shells and CI jobs can detect the failure.
    """
    parser = argparse.ArgumentParser(description="UFC Fight Prediction Pipeline")
    parser.add_argument(
        '--report',
        type=str,
        default='detailed',
        choices=['detailed', 'summary'],
        help="Type of report to generate: 'detailed' (file) or 'summary' (console)."
    )
    args = parser.parse_args()

    # --- Define Models to Run ---
    # Instantiate all the models you want to evaluate here.
    models_to_run = [
        EloBaselineModel(),
        LogisticRegressionModel(),
    ]
    # --- End of Model Definition ---

    pipeline = PredictionPipeline(models=models_to_run)

    try:
        pipeline.run(detailed_report=(args.report == 'detailed'))
    except FileNotFoundError as e:
        # parser.exit() prints the message to stderr and raises SystemExit,
        # so a missing input file is no longer reported as a successful run.
        parser.exit(
            status=1,
            message=(
                f"Error: {e}\n"
                "Please ensure the required data files exist. "
                "You may need to run the scraping and ELO analysis first.\n"
            ),
        )
|
35 |
|
36 |
if __name__ == '__main__':
|
37 |
main()
|
src/predict/models.py
CHANGED
@@ -2,6 +2,10 @@ from abc import ABC, abstractmethod
|
|
2 |
import sys
|
3 |
import os
|
4 |
from ..analysis.elo import process_fights_for_elo, INITIAL_ELO
|
|
|
|
|
|
|
|
|
5 |
|
6 |
class BaseModel(ABC):
|
7 |
"""
|
@@ -18,12 +22,11 @@ class BaseModel(ABC):
|
|
18 |
pass
|
19 |
|
20 |
@abstractmethod
|
21 |
-
def predict(self,
|
22 |
"""
|
23 |
Predicts the winner of a single fight.
|
24 |
|
25 |
-
:param
|
26 |
-
:param fighter2_name: The name of the second fighter.
|
27 |
:return: The name of the predicted winning fighter.
|
28 |
"""
|
29 |
pass
|
@@ -33,24 +36,113 @@ class EloBaselineModel(BaseModel):
|
|
33 |
A baseline prediction model that predicts the winner based on the higher ELO rating.
|
34 |
"""
|
35 |
def __init__(self):
|
36 |
-
self.
|
37 |
|
38 |
def train(self, train_fights):
|
39 |
"""
|
40 |
-
|
41 |
-
|
42 |
"""
|
43 |
-
print("Training ELO
|
44 |
-
self.
|
45 |
-
|
|
|
46 |
|
47 |
-
def predict(self,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
"""
|
49 |
-
Predicts the
|
50 |
-
If a fighter has no ELO rating, the default initial ELO is used.
|
51 |
"""
|
52 |
-
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
-
|
56 |
-
return fighter1_name if elo1 > elo2 else fighter2_name
|
|
|
2 |
import sys
|
3 |
import os
|
4 |
from ..analysis.elo import process_fights_for_elo, INITIAL_ELO
|
5 |
+
import pandas as pd
|
6 |
+
from sklearn.linear_model import LogisticRegression
|
7 |
+
from ..config import FIGHTERS_CSV_PATH
|
8 |
+
from .preprocess import preprocess_for_ml, _get_fighter_history_stats, _calculate_age
|
9 |
|
10 |
class BaseModel(ABC):
|
11 |
"""
|
|
|
22 |
pass
|
23 |
|
24 |
@abstractmethod
|
25 |
+
def predict(self, fight):
|
26 |
"""
|
27 |
Predicts the winner of a single fight.
|
28 |
|
29 |
+
:param fight: A dictionary representing a single fight.
|
|
|
30 |
:return: The name of the predicted winning fighter.
|
31 |
"""
|
32 |
pass
|
|
|
36 |
A baseline prediction model that predicts the winner based on the higher ELO rating.
|
37 |
"""
|
38 |
def __init__(self):
|
39 |
+
self.fighters_df = None
|
40 |
|
41 |
def train(self, train_fights):
    """
    For the ELO baseline, 'training' simply consists of loading the fighter data
    to access their ELO scores during prediction.

    :param train_fights: The training fights; not used by this model.
    """
    print("Training EloBaselineModel: Loading fighter ELO data...")
    fighters = pd.read_csv(FIGHTERS_CSV_PATH)
    fighters['full_name'] = fighters['first_name'] + ' ' + fighters['last_name']
    # Keep the first row per name so that .loc[name, 'elo'] yields a scalar.
    fighters = fighters.drop_duplicates(subset=['full_name']).set_index('full_name')
    # Coerce ELO to numeric (bad cells become NaN) so predict() never compares
    # strings; this mirrors what LogisticRegressionModel.train does.
    if 'elo' in fighters.columns:
        fighters['elo'] = pd.to_numeric(fighters['elo'], errors='coerce')
    # Publish only the fully prepared table.
    self.fighters_df = fighters
|
50 |
|
51 |
+
def predict(self, fight):
    """
    Predicts the winner based on who has the higher ELO score.

    :param fight: A dictionary with at least 'fighter_1' and 'fighter_2' keys.
    :return: The name of the fighter with the higher ELO (fighter_2 on a tie),
             or None when either fighter's ELO is unavailable.
    :raises RuntimeError: If called before train() has loaded the fighter data.
    """
    f1_name, f2_name = fight['fighter_1'], fight['fighter_2']

    if self.fighters_df is None:
        raise RuntimeError("EloBaselineModel.predict() called before train().")

    try:
        f1_elo = self.fighters_df.loc[f1_name, 'elo']
        f2_elo = self.fighters_df.loc[f2_name, 'elo']
    except KeyError as e:
        # If a fighter isn't found, we can't make a prediction.
        # Returning None or a default is a design choice.
        print(f"Warning: Could not find ELO for fighter {e}. Skipping prediction.")
        return None

    # Any comparison against NaN is False, which would silently hand the win
    # to fighter_2; treat a missing rating like a missing fighter instead.
    if pd.isna(f1_elo) or pd.isna(f2_elo):
        print(f"Warning: Missing ELO value for {f1_name} or {f2_name}. Skipping prediction.")
        return None

    return f1_name if f1_elo > f2_elo else f2_name
|
65 |
+
|
66 |
+
class LogisticRegressionModel(BaseModel):
    """
    A model that uses logistic regression to predict fight outcomes based on differential features.
    """
    def __init__(self):
        self.model = LogisticRegression(solver='liblinear', random_state=42)
        # Fighter table indexed by full name; populated by train().
        self.fighters_df = None
        # Maps fighter name -> that fighter's training fights, oldest first.
        self.fighter_histories = {}

    def train(self, train_fights):
        """
        Trains the logistic regression model by preprocessing the training data
        and fitting the model.

        Each fight dict in ``train_fights`` is mutated: a 'date_obj' key is
        added, which the history helpers read at prediction time.

        :param train_fights: A list of fight dictionaries with 'fighter_1',
                             'fighter_2' and 'event_date' keys.
        """
        print("Training LogisticRegressionModel...")

        # 1. Prepare data for prediction-time feature generation
        fighters = pd.read_csv(FIGHTERS_CSV_PATH)
        fighters['full_name'] = fighters['first_name'] + ' ' + fighters['last_name']
        fighters = fighters.drop_duplicates(subset=['full_name']).set_index('full_name')
        for col in ['height_cm', 'reach_in', 'elo']:
            if col in fighters.columns:
                fighters[col] = pd.to_numeric(fighters[col], errors='coerce')
        self.fighters_df = fighters

        # 2. Pre-calculate fighter histories for efficient lookup during prediction.
        # One pass over the fights, rather than rescanning every fight once per
        # fighter in the table.
        histories = {}
        for fight in train_fights:
            fight['date_obj'] = pd.to_datetime(fight['event_date'])
            # A set, so a malformed fight listing the same name twice is stored once.
            for name in {fight['fighter_1'], fight['fighter_2']}:
                histories.setdefault(name, []).append(fight)
        self.fighter_histories = {
            name: sorted(fights, key=lambda x: x['date_obj'])
            for name, fights in histories.items()
        }

        # 3. Preprocess training data and fit the model
        X_train, y_train, _ = preprocess_for_ml(train_fights, FIGHTERS_CSV_PATH)
        print(f"Fitting model on {X_train.shape[0]} samples...")
        self.model.fit(X_train, y_train)
        print("Model training complete.")

    def predict(self, fight):
        """
        Predicts the outcome of a single fight by generating its feature vector.

        :param fight: A dictionary with 'fighter_1', 'fighter_2' and 'event_date' keys.
        :return: The name of the predicted winning fighter, or None when either
                 fighter is absent from the fighter table.
        :raises RuntimeError: If called before train().
        """
        if self.fighters_df is None:
            raise RuntimeError("LogisticRegressionModel.predict() called before train().")

        f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
        fight_date = pd.to_datetime(fight['event_date'])

        if f1_name not in self.fighters_df.index or f2_name not in self.fighters_df.index:
            print(f"Warning: Fighter not found in data. Skipping prediction for {f1_name} vs {f2_name}")
            return None

        # 1. Get base stats
        f1_stats, f2_stats = self.fighters_df.loc[f1_name], self.fighters_df.loc[f2_name]
        # Defensive: a duplicated index label would yield a DataFrame; keep the first row.
        if isinstance(f1_stats, pd.DataFrame):
            f1_stats = f1_stats.iloc[0]
        if isinstance(f2_stats, pd.DataFrame):
            f2_stats = f2_stats.iloc[0]

        # 2. Get historical stats
        f1_hist_stats = _get_fighter_history_stats(
            f1_name, fight_date, self.fighter_histories.get(f1_name, []), self.fighters_df
        )
        f2_hist_stats = _get_fighter_history_stats(
            f2_name, fight_date, self.fighter_histories.get(f2_name, []), self.fighters_df
        )

        # 3. Create differential features
        f1_age = _calculate_age(f1_stats.get('dob'), fight['event_date'])
        f2_age = _calculate_age(f2_stats.get('dob'), fight['event_date'])
        both_ages_known = f1_age is not None and f2_age is not None

        # NOTE(review): the keys and their order presumably have to match the
        # columns produced by preprocess_for_ml for training - confirm there.
        # NOTE(review): two missing stances (NaN) compare as "different" here;
        # kept as-is so prediction-time features are computed as before.
        features = {
            'elo_diff': f1_stats.get('elo', 1500) - f2_stats.get('elo', 1500),
            'height_diff_cm': f1_stats.get('height_cm', 0) - f2_stats.get('height_cm', 0),
            'reach_diff_in': f1_stats.get('reach_in', 0) - f2_stats.get('reach_in', 0),
            'age_diff_years': (f1_age - f2_age) if both_ages_known else 0,
            'stance_is_different': 1 if f1_stats.get('stance') != f2_stats.get('stance') else 0,
            'wins_last_5_diff': f1_hist_stats['wins_last_n'] - f2_hist_stats['wins_last_n'],
            'avg_opp_elo_last_5_diff': f1_hist_stats['avg_opp_elo_last_n'] - f2_hist_stats['avg_opp_elo_last_n'],
            'ko_percent_last_5_diff': f1_hist_stats['ko_percent_last_n'] - f2_hist_stats['ko_percent_last_n'],
            'sig_str_landed_per_min_last_5_diff': f1_hist_stats['sig_str_landed_per_min_last_n'] - f2_hist_stats['sig_str_landed_per_min_last_n'],
        }

        # Differences involving a missing (NaN) stat are neutralised to 0.
        feature_vector = pd.DataFrame([features]).fillna(0)

        # 4. Predict
        # Class '1' means a win for fighter_1.
        prediction = self.model.predict(feature_vector)[0]

        return f1_name if prediction == 1 else f2_name
|
|
src/predict/pipeline.py
CHANGED
@@ -64,7 +64,8 @@ class PredictionPipeline:
|
|
64 |
f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
|
65 |
actual_winner = fight['winner']
|
66 |
event_name = fight.get('event_name', 'Unknown Event')
|
67 |
-
|
|
|
68 |
|
69 |
is_correct = (predicted_winner == actual_winner)
|
70 |
if is_correct:
|
|
|
64 |
f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
|
65 |
actual_winner = fight['winner']
|
66 |
event_name = fight.get('event_name', 'Unknown Event')
|
67 |
+
|
68 |
+
predicted_winner = model.predict(fight)
|
69 |
|
70 |
is_correct = (predicted_winner == actual_winner)
|
71 |
if is_correct:
|
src/predict/preprocess.py
ADDED
@@ -0,0 +1,227 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import os
|
3 |
+
import sys
|
4 |
+
from datetime import datetime
|
5 |
+
from ..config import FIGHTERS_CSV_PATH
|
6 |
+
|
7 |
+
def _clean_numeric_column(series):
|
8 |
+
"""A helper to clean string columns into numbers, handling errors."""
|
9 |
+
series_str = series.astype(str)
|
10 |
+
return pd.to_numeric(series_str.str.replace(r'[^0-9.]', '', regex=True), errors='coerce')
|
11 |
+
|
12 |
+
def _calculate_age(dob_str, fight_date_str):
|
13 |
+
"""Calculates age in years from a date of birth string and fight date string."""
|
14 |
+
if pd.isna(dob_str) or not dob_str:
|
15 |
+
return None
|
16 |
+
try:
|
17 |
+
dob = datetime.strptime(dob_str, '%b %d, %Y')
|
18 |
+
fight_date = datetime.strptime(fight_date_str, '%B %d, %Y')
|
19 |
+
return (fight_date - dob).days / 365.25
|
20 |
+
except (ValueError, TypeError):
|
21 |
+
return None
|
22 |
+
|
23 |
+
def _parse_round_time_to_seconds(round_str, time_str):
|
24 |
+
"""Converts fight duration from round and time to total seconds."""
|
25 |
+
try:
|
26 |
+
rounds = int(round_str)
|
27 |
+
minutes, seconds = map(int, time_str.split(':'))
|
28 |
+
# Assuming 5-minute rounds for calculation simplicity
|
29 |
+
return ((rounds - 1) * 5 * 60) + (minutes * 60) + seconds
|
30 |
+
except (ValueError, TypeError, AttributeError):
|
31 |
+
return 0
|
32 |
+
|
33 |
+
def _parse_striking_stats(stat_str):
|
34 |
+
"""Parses striking stats string like '10 of 20' into (landed, attempted)."""
|
35 |
+
try:
|
36 |
+
landed, attempted = map(int, stat_str.split(' of '))
|
37 |
+
return landed, attempted
|
38 |
+
except (ValueError, TypeError, AttributeError):
|
39 |
+
return 0, 0
|
40 |
+
|
41 |
+
def _get_fighter_history_stats(fighter_name, current_fight_date, fighter_history, fighters_df, n=5):
|
42 |
+
"""
|
43 |
+
Calculates performance statistics for a fighter based on their last n fights.
|
44 |
+
"""
|
45 |
+
past_fights = [f for f in fighter_history if f['date_obj'] < current_fight_date]
|
46 |
+
last_n_fights = past_fights[-n:]
|
47 |
+
|
48 |
+
if not last_n_fights:
|
49 |
+
# Return a default dictionary with the correct keys for a fighter with no history
|
50 |
+
return {
|
51 |
+
'wins_last_n': 0,
|
52 |
+
'avg_opp_elo_last_n': 1500, # Assume average ELO for first opponent
|
53 |
+
'ko_percent_last_n': 0,
|
54 |
+
'sig_str_landed_per_min_last_n': 0,
|
55 |
+
}
|
56 |
+
|
57 |
+
stats = {
|
58 |
+
'wins': 0, 'ko_wins': 0, 'total_time_secs': 0,
|
59 |
+
'sig_str_landed': 0, 'opponent_elos': []
|
60 |
+
}
|
61 |
+
|
62 |
+
for fight in last_n_fights:
|
63 |
+
is_fighter_1 = (fight['fighter_1'] == fighter_name)
|
64 |
+
opponent_name = fight['fighter_2'] if is_fighter_1 else fight['fighter_1']
|
65 |
+
|
66 |
+
if fight['winner'] == fighter_name:
|
67 |
+
stats['wins'] += 1
|
68 |
+
if 'KO' in fight['method']:
|
69 |
+
stats['ko_wins'] += 1
|
70 |
+
|
71 |
+
if opponent_name in fighters_df.index:
|
72 |
+
opp_elo = fighters_df.loc[opponent_name, 'elo']
|
73 |
+
stats['opponent_elos'].append(opp_elo if pd.notna(opp_elo) else 1500)
|
74 |
+
|
75 |
+
stats['total_time_secs'] += _parse_round_time_to_seconds(fight['round'], fight['time'])
|
76 |
+
|
77 |
+
sig_str_stat = fight.get(f'f1_sig_str' if is_fighter_1 else 'f2_sig_str', '0 of 0')
|
78 |
+
landed, _ = _parse_striking_stats(sig_str_stat)
|
79 |
+
stats['sig_str_landed'] += landed
|
80 |
+
|
81 |
+
# Final calculations
|
82 |
+
avg_opp_elo = sum(stats['opponent_elos']) / len(stats['opponent_elos']) if stats['opponent_elos'] else 1500
|
83 |
+
|
84 |
+
return {
|
85 |
+
'wins_last_n': stats['wins'],
|
86 |
+
'avg_opp_elo_last_n': avg_opp_elo,
|
87 |
+
'ko_percent_last_n': (stats['ko_wins'] / stats['wins']) if stats['wins'] > 0 else 0,
|
88 |
+
'sig_str_landed_per_min_last_n': (stats['sig_str_landed'] * 60 / stats['total_time_secs']) if stats['total_time_secs'] > 0 else 0,
|
89 |
+
}
|
90 |
+
|
91 |
+
def preprocess_for_ml(fights_to_process, fighters_csv_path):
    """
    Build a feature matrix (X) and target vector (y) for binary fight-outcome classification.

    Every usable fight yields two mirrored samples: the winner-minus-loser
    feature differences labelled 1, and the negated differences labelled 0,
    so the resulting dataset is balanced by construction.

    Note:
        Each dict in ``fights_to_process`` is mutated in place: a ``'date_obj'``
        key holding the parsed ``'event_date'`` is added, because the
        history helper filters past fights on that key.

    Args:
        fights_to_process (list of dict): The list of fights to process. Each
            fight must provide 'fighter_1', 'fighter_2' and 'event_date'
            (formatted like "January 01, 2020").
        fighters_csv_path (str): Path to the CSV file with all fighter stats.

    Returns:
        tuple: ``(X, y, metadata)`` where
            X (pd.DataFrame): Feature matrix, missing values filled with 0.
            y (pd.Series): Target vector named 'winner' (1 = win, 0 = loss).
            metadata (pd.DataFrame): One row per sample with 'winner',
                'loser' and 'event_date'.

    Raises:
        FileNotFoundError: If ``fighters_csv_path`` does not exist.
    """
    if not os.path.exists(fighters_csv_path):
        raise FileNotFoundError(f"Fighters data not found at '{fighters_csv_path}'.")

    fighters_df = pd.read_csv(fighters_csv_path)

    # 1. Prepare fighters data for lookups by full name
    fighters_prepared = fighters_df.copy()
    # Empty CSV cells are read as NaN; treat them as empty strings so that a
    # fighter with only one name part still gets a usable full name instead
    # of NaN (which could never match a fight record).
    first_names = fighters_prepared['first_name'].fillna('').astype(str)
    last_names = fighters_prepared['last_name'].fillna('').astype(str)
    fighters_prepared['full_name'] = (first_names + ' ' + last_names).str.strip()

    # Handle duplicate fighter names by keeping the first entry. This also
    # guarantees a unique index, so every .loc[name] below returns a Series.
    fighters_prepared = fighters_prepared.drop_duplicates(subset=['full_name'], keep='first')
    fighters_prepared = fighters_prepared.set_index('full_name')

    for col in ['height_cm', 'reach_in', 'elo']:
        if col in fighters_prepared.columns:
            fighters_prepared[col] = _clean_numeric_column(fighters_prepared[col])

    # 2. Convert date strings to datetime objects once
    for fight in fights_to_process:
        fight['date_obj'] = datetime.strptime(fight['event_date'], '%B %d, %Y')

    # Group fights per fighter in a single pass over the chronologically
    # sorted fights, rather than rescanning every fight for every fighter.
    # sorted() is stable, so fights on the same date keep their input order.
    fighter_histories = {}
    for fight in sorted(fights_to_process, key=lambda f: f['date_obj']):
        # A set prevents a fight from being listed twice for the same name.
        for name in {fight['fighter_1'], fight['fighter_2']}:
            fighter_histories.setdefault(name, []).append(fight)

    # 3. Process fights to create features and targets
    feature_list = []
    target_list = []
    metadata_list = []

    for fight in fights_to_process:
        # Per the dataset's design, fighter_1 is always the winner.
        f1_name, f2_name = fight['fighter_1'], fight['fighter_2']

        # Fights involving a fighter without a stats row cannot be featurized.
        if f1_name not in fighters_prepared.index or f2_name not in fighters_prepared.index:
            continue

        f1_stats, f2_stats = fighters_prepared.loc[f1_name], fighters_prepared.loc[f2_name]

        # Calculate ages for both fighters
        f1_age = _calculate_age(f1_stats.get('dob'), fight['event_date'])
        f2_age = _calculate_age(f2_stats.get('dob'), fight['event_date'])

        # Get historical stats for both fighters
        f1_hist_stats = _get_fighter_history_stats(
            f1_name, fight['date_obj'], fighter_histories.get(f1_name, []), fighters_prepared
        )
        f2_hist_stats = _get_fighter_history_stats(
            f2_name, fight['date_obj'], fighter_histories.get(f2_name, []), fighters_prepared
        )

        # --- Create two training examples from each fight for a balanced dataset ---

        # 1. The "Win" case: (fighter_1 - fighter_2)
        features_win = {
            # Original diffs
            'elo_diff': f1_stats.get('elo', 1500) - f2_stats.get('elo', 1500),
            'height_diff_cm': f1_stats.get('height_cm', 0) - f2_stats.get('height_cm', 0),
            'reach_diff_in': f1_stats.get('reach_in', 0) - f2_stats.get('reach_in', 0),
            'age_diff_years': (f1_age - f2_age) if f1_age and f2_age else 0,
            'stance_is_different': 1 if f1_stats.get('stance') != f2_stats.get('stance') else 0,
            # New historical diffs
            'wins_last_5_diff': f1_hist_stats['wins_last_n'] - f2_hist_stats['wins_last_n'],
            'avg_opp_elo_last_5_diff': f1_hist_stats['avg_opp_elo_last_n'] - f2_hist_stats['avg_opp_elo_last_n'],
            'ko_percent_last_5_diff': f1_hist_stats['ko_percent_last_n'] - f2_hist_stats['ko_percent_last_n'],
            'sig_str_landed_per_min_last_5_diff': (
                f1_hist_stats['sig_str_landed_per_min_last_n']
                - f2_hist_stats['sig_str_landed_per_min_last_n']
            ),
        }
        feature_list.append(features_win)
        target_list.append(1)  # 1 represents a win

        # 2. The "Loss" case: (fighter_2 - fighter_1)
        # We invert the differences for the losing case.
        features_loss = {key: -value for key, value in features_win.items()}
        # Stance difference is symmetric; it doesn't get inverted.
        features_loss['stance_is_different'] = features_win['stance_is_different']

        feature_list.append(features_loss)
        target_list.append(0)  # 0 represents a loss

        # Add metadata for both generated samples.
        # The 'winner' and 'loser' are consistent with the original data
        # structure, so both mirrored samples carry the same record.
        for _ in range(2):
            metadata_list.append({
                'winner': f1_name, 'loser': f2_name, 'event_date': fight['event_date']
            })

    # Differences involving a missing stat come out as NaN; treat them as
    # "no difference" so the model receives a fully numeric matrix.
    X = pd.DataFrame(feature_list).fillna(0)
    y = pd.Series(target_list, name='winner')
    metadata = pd.DataFrame(metadata_list)

    print(f"Preprocessing complete. Generated {X.shape[0]} samples with {X.shape[1]} features.")
    return X, y, metadata
|
200 |
+
|
201 |
+
if __name__ == '__main__':
    # Demo entry point: load the training split through the prediction
    # pipeline, preprocess it, and print a short summary of the result.
    # The relative import means this only works when run as part of the package.
    from .pipeline import PredictionPipeline

    print("--- Running Preprocessing Example ---")

    demo_pipeline = PredictionPipeline(models=[])
    try:
        demo_pipeline._load_and_split_data()
        train_fights = demo_pipeline.train_fights
        if train_fights:
            X_train, y_train, metadata_train = preprocess_for_ml(train_fights, FIGHTERS_CSV_PATH)

            # Shapes of the three returned objects
            print("\nTraining Data Shape:")
            for label, frame in (
                ("X_train:", X_train),
                ("y_train:", y_train),
                ("metadata_train:", metadata_train),
            ):
                print(label, frame.shape)

            # Headline followed by the corresponding preview, in order
            for headline, preview in (
                ("\nLast 5 rows of X_train (showing populated historical features):", X_train.tail()),
                ("\nTarget distribution (0=Loss, 1=Win):", y_train.value_counts()),
                ("\nMetadata for last 5 rows:", metadata_train.tail()),
            ):
                print(headline)
                print(preview)

    except FileNotFoundError as e:
        # Missing input files: report the error and point at the scraper.
        print(e)
        print("Please run the scraping pipeline first ('python -m src.scrape.main').")
|
src/scrape/main.py
CHANGED
@@ -8,37 +8,29 @@ from .. import config
|
|
8 |
|
9 |
def main():
|
10 |
"""
|
11 |
-
Main
|
12 |
"""
|
13 |
# Ensure the output directory exists
|
14 |
if not os.path.exists(config.OUTPUT_DIR):
|
15 |
os.makedirs(config.OUTPUT_DIR)
|
16 |
print(f"Created directory: {config.OUTPUT_DIR}")
|
17 |
|
18 |
-
# --- Step 1: Scrape
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
json.dump(all_events_data, f, indent=4)
|
23 |
-
print(f"Scraping for events complete. Data saved to {config.EVENTS_JSON_PATH}")
|
24 |
|
25 |
-
# --- Step 2:
|
26 |
-
|
27 |
-
all_fighters_data = scrape_all_fighters()
|
28 |
-
with open(config.FIGHTERS_JSON_PATH, 'w') as f:
|
29 |
-
json.dump(all_fighters_data, f, indent=4)
|
30 |
-
print(f"Scraping for fighters complete. Data saved to {config.FIGHTERS_JSON_PATH}")
|
31 |
-
|
32 |
-
# --- Step 3: Convert JSON to CSV ---
|
33 |
-
print("\n--- Converting all JSON files to CSV ---")
|
34 |
json_to_csv(config.EVENTS_JSON_PATH, config.FIGHTS_CSV_PATH)
|
35 |
fighters_json_to_csv(config.FIGHTERS_JSON_PATH, config.FIGHTERS_CSV_PATH)
|
36 |
|
37 |
-
# --- Step
|
38 |
-
|
39 |
-
|
|
|
40 |
|
41 |
-
# --- Step
|
42 |
print("\n--- Deleting temporary JSON files ---")
|
43 |
try:
|
44 |
if os.path.exists(config.EVENTS_JSON_PATH):
|
@@ -50,7 +42,7 @@ def main():
|
|
50 |
except OSError as e:
|
51 |
print(f"Error deleting JSON files: {e}")
|
52 |
|
53 |
-
print("\n--- Pipeline Finished ---")
|
54 |
|
55 |
if __name__ == '__main__':
|
56 |
main()
|
|
|
8 |
|
9 |
def main():
|
10 |
"""
|
11 |
+
Main function to run the complete scraping and preprocessing pipeline.
|
12 |
"""
|
13 |
# Ensure the output directory exists
|
14 |
if not os.path.exists(config.OUTPUT_DIR):
|
15 |
os.makedirs(config.OUTPUT_DIR)
|
16 |
print(f"Created directory: {config.OUTPUT_DIR}")
|
17 |
|
18 |
+
# --- Step 1: Scrape all data from the website ---
|
19 |
+
# This will generate fighters.json and events.json
|
20 |
+
scrape_all_fighters()
|
21 |
+
scrape_all_events()
|
|
|
|
|
22 |
|
23 |
+
# --- Step 2: Convert the scraped JSON data to CSV format ---
|
24 |
+
# This will generate fighters.csv and fights.csv
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
json_to_csv(config.EVENTS_JSON_PATH, config.FIGHTS_CSV_PATH)
|
26 |
fighters_json_to_csv(config.FIGHTERS_JSON_PATH, config.FIGHTERS_CSV_PATH)
|
27 |
|
28 |
+
# --- Step 3: Run post-processing on the generated CSV files ---
|
29 |
+
# This cleans names, converts height, etc.
|
30 |
+
print("\n--- Running post-scraping preprocessing ---")
|
31 |
+
preprocess_fighters_csv()
|
32 |
|
33 |
+
# --- Step 4: Clean up temporary JSON files ---
|
34 |
print("\n--- Deleting temporary JSON files ---")
|
35 |
try:
|
36 |
if os.path.exists(config.EVENTS_JSON_PATH):
|
|
|
42 |
except OSError as e:
|
43 |
print(f"Error deleting JSON files: {e}")
|
44 |
|
45 |
+
print("\n\n--- Scraping and Preprocessing Pipeline Finished ---")
|
46 |
|
47 |
if __name__ == '__main__':
|
48 |
main()
|
src/scrape/preprocess.py
CHANGED
@@ -26,7 +26,7 @@ def convert_height_to_cm(height_str):
|
|
26 |
|
27 |
def preprocess_fighters_csv(file_path=config.FIGHTERS_CSV_PATH):
|
28 |
"""
|
29 |
-
Reads the fighters CSV, converts height to cm,
|
30 |
and saves the changes back to the same file.
|
31 |
"""
|
32 |
if not os.path.exists(file_path):
|
@@ -46,18 +46,24 @@ def preprocess_fighters_csv(file_path=config.FIGHTERS_CSV_PATH):
|
|
46 |
headers = reader.fieldnames
|
47 |
rows = list(reader)
|
48 |
|
49 |
-
#
|
50 |
-
|
51 |
-
|
52 |
-
return
|
53 |
-
|
54 |
# Process the rows in memory
|
55 |
for row in rows:
|
56 |
-
#
|
57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
|
59 |
-
# Update the header name
|
60 |
-
|
|
|
61 |
|
62 |
# Write the modified data back to the same file, overwriting it
|
63 |
with open(file_path, 'w', newline='', encoding='utf-8') as csv_file:
|
@@ -66,7 +72,10 @@ def preprocess_fighters_csv(file_path=config.FIGHTERS_CSV_PATH):
|
|
66 |
writer.writerows(rows)
|
67 |
|
68 |
print(f"Successfully processed file: {file_path}")
|
69 |
-
|
|
|
|
|
|
|
70 |
|
71 |
except Exception as e:
|
72 |
print(f"An error occurred: {e}")
|
|
|
26 |
|
27 |
def preprocess_fighters_csv(file_path=config.FIGHTERS_CSV_PATH):
|
28 |
"""
|
29 |
+
Reads the fighters CSV, cleans names, converts height to cm,
|
30 |
and saves the changes back to the same file.
|
31 |
"""
|
32 |
if not os.path.exists(file_path):
|
|
|
46 |
headers = reader.fieldnames
|
47 |
rows = list(reader)
|
48 |
|
49 |
+
# --- Data Cleaning and Processing ---
|
50 |
+
|
51 |
+
name_cleaned_count = 0
|
|
|
|
|
52 |
# Process the rows in memory
|
53 |
for row in rows:
|
54 |
+
# Clean fighter names (e.g., "O ftMalley" -> "O'Malley")
|
55 |
+
for col in ['first_name', 'last_name']:
|
56 |
+
if col in row and ' ft' in row[col]:
|
57 |
+
row[col] = row[col].replace(' ft', "'")
|
58 |
+
name_cleaned_count += 1
|
59 |
+
|
60 |
+
# Convert height to cm and remove the old column
|
61 |
+
if 'height' in row:
|
62 |
+
row['height_cm'] = convert_height_to_cm(row.pop('height'))
|
63 |
|
64 |
+
# Update the header name if 'height' was present
|
65 |
+
if 'height' in headers:
|
66 |
+
headers[headers.index('height')] = 'height_cm'
|
67 |
|
68 |
# Write the modified data back to the same file, overwriting it
|
69 |
with open(file_path, 'w', newline='', encoding='utf-8') as csv_file:
|
|
|
72 |
writer.writerows(rows)
|
73 |
|
74 |
print(f"Successfully processed file: {file_path}")
|
75 |
+
if name_cleaned_count > 0:
|
76 |
+
print(f"Cleaned {name_cleaned_count} instances of ' ft' in fighter names.")
|
77 |
+
if 'height_cm' in headers:
|
78 |
+
print("Converted 'height' column to centimeters and renamed it to 'height_cm'.")
|
79 |
|
80 |
except Exception as e:
|
81 |
print(f"An error occurred: {e}")
|