Alvaro committed on
Commit
bf7e729
·
1 Parent(s): 32a1590

Add logistic regression model and ML preprocessing

Browse files

Introduces a new LogisticRegressionModel for fight outcome prediction, including a comprehensive ML preprocessing module (preprocess.py) for feature engineering and data preparation. Refactors model interfaces to accept fight dictionaries, updates the pipeline and main entrypoint for model selection and reporting, and improves data cleaning in the scraping and preprocessing steps. Also fixes fighter name formatting and adds pandas and scikit-learn as dependencies.

output/ufc_fighters.csv CHANGED
@@ -374,7 +374,7 @@ Brad,Blackburn,Bad Brad,18,13,1,False,178,170,73.0,Orthodox,"May 25, 1977",http:
374
  Jason,Blackford,,3,3,0,False,,,,,,http://ufcstats.com/fighter-details/619d807fa54ae8f7,,,1500
375
  Sherrard,Blackledge,The Thriller,5,1,0,False,180,155,75.0,Orthodox,"Aug 16, 1993",http://ufcstats.com/fighter-details/0e5c79b3594ff0ad,75,155,1500
376
  Tom,Blackledge,,10,7,0,False,,205,,Orthodox,,http://ufcstats.com/fighter-details/2adb11835acd815b,,205,1477
377
- Da ftMon,Blackshear,The Monster,17,7,1,False,178,135,72.0,Switch,"Aug 12, 1994",http://ufcstats.com/fighter-details/da22387a0407a2dc,72,135,1500
378
  Chasen,Blair,Mestizo,6,3,0,False,178,155,71.0,Orthodox,"Oct 14, 1998",http://ufcstats.com/fighter-details/0f4a536507f33576,71,155,1500
379
  Erin,Blanchfield,Cold Blooded,13,2,0,False,163,125,66.0,Orthodox,"May 04, 1999",http://ufcstats.com/fighter-details/669970f7feba8ecd,66,125,1622
380
  David,Blanco,,2,0,0,False,170,145,,,,http://ufcstats.com/fighter-details/ebf298f8ac7e232b,,145,1500
@@ -831,7 +831,7 @@ Will,Currie,Drago,12,4,0,False,190,185,76.0,Switch,"Nov 12, 1998",http://ufcstat
831
  Chris,Curtis,Action-Man,31,12,0,False,178,170,75.0,Orthodox,"Jul 15, 1987",http://ufcstats.com/fighter-details/5442f1bc4b47eaf3,75,170,1529
832
  Ion,Cutelaba,The Hulk,19,11,1,False,185,205,75.0,Southpaw,"Dec 14, 1993",http://ufcstats.com/fighter-details/cd13728ae1151f46,75,205,1492
833
  Gleidson,Cutis,Poney,7,4,0,False,175,155,,Orthodox,"Feb 07, 1989",http://ufcstats.com/fighter-details/44a94bbde42246e4,,155,1500
834
- Sarah,D ftalelio,The Monster,11,7,0,False,170,145,,,"Dec 13, 1980",http://ufcstats.com/fighter-details/ac45450f75d14f16,,145,1500
835
  Alex,Da Silva,Leko,21,4,0,False,173,155,73.0,Orthodox,"Feb 04, 1996",http://ufcstats.com/fighter-details/c3ded6f7155f9ea4,73,155,1466
836
  Paulo,Da Silva,PH,11,1,0,False,,125,,,"Jul 03, 2002",http://ufcstats.com/fighter-details/3fe8ad7e1594537a,,125,1500
837
  Radley,Da Silva,Snake Eyes,7,1,0,False,,145,,,"Jun 20, 1995",http://ufcstats.com/fighter-details/0d68c7bfdff1dc34,,145,1500
@@ -1925,7 +1925,7 @@ Tetsuji,Kato,,20,10,0,False,,155,,Orthodox,,http://ufcstats.com/fighter-details/
1925
  Brad,Katona,Superman,16,5,0,False,168,135,64.0,Orthodox,"Dec 19, 1991",http://ufcstats.com/fighter-details/7b433309b0fd12aa,64,135,1471
1926
  Calvin,Kattar,,23,9,0,False,180,145,72.0,Orthodox,"Mar 26, 1988",http://ufcstats.com/fighter-details/751de04455cfaac0,72,145,1530
1927
  Sarah,Kaufman,,18,4,0,False,165,135,66.0,Orthodox,"Sep 20, 1985",http://ufcstats.com/fighter-details/36df8e119aec6175,66,135,1481
1928
- Lone fter,Kavanagh,,9,0,0,False,163,125,67.0,Orthodox,"Jun 09, 1999",http://ufcstats.com/fighter-details/bb2c3c3a466224af,67,125,1500
1929
  Yusuke,Kawaguchi,,18,12,0,False,183,255,,,"Aug 14, 1980",http://ufcstats.com/fighter-details/fa2320781bfe4f49,,255,1500
1930
  Canaan,Kawaihae,,6,2,0,False,183,145,71.0,Southpaw,"Aug 26, 1997",http://ufcstats.com/fighter-details/58d42b9e920b25fc,71,145,1500
1931
  Tatsuya,Kawajiri,Crusher,36,11,2,False,170,145,69.0,Orthodox,"May 08, 1978",http://ufcstats.com/fighter-details/80d918336163b80c,69,145,1509
@@ -2432,7 +2432,7 @@ Nick,Maximov,,8,2,0,False,183,185,76.0,Southpaw,"Dec 23, 1997",http://ufcstats.c
2432
  Elaina,Maxwell,Beef,7,4,0,False,173,145,,Orthodox,"Dec 16, 1978",http://ufcstats.com/fighter-details/a16ce18149021139,,145,1500
2433
  Jack,May,The Outlaw,9,3,0,False,203,255,,Switch,"Apr 14, 1981",http://ufcstats.com/fighter-details/2c6e81426dd7573c,,255,1462
2434
  Jeremy,May,,13,10,0,False,185,185,,,"Oct 23, 1986",http://ufcstats.com/fighter-details/c32fdfe75cda5b22,,185,1500
2435
- Don ftTale,Mayes,Lord Kong,11,9,0,False,198,260,81.0,Orthodox,"Jan 16, 1992",http://ufcstats.com/fighter-details/1a9480fc288e55d7,81,260,1500
2436
  Gray,Maynard,,13,7,1,False,175,155,70.0,Orthodox,"May 09, 1979",http://ufcstats.com/fighter-details/7d96bc577e5178b2,70,155,1544
2437
  Brooke,Mayo,The Bully,0,1,0,False,170,125,,,,http://ufcstats.com/fighter-details/1b41c21d947d6f2f,,125,1500
2438
  Gina,Mazany,Danger,7,6,0,False,168,125,68.0,Southpaw,"Aug 19, 1988",http://ufcstats.com/fighter-details/016a8d958883167c,68,125,1429
@@ -2824,16 +2824,16 @@ Said,Nurmagomedov,,18,4,0,False,173,135,70.0,Orthodox,"Apr 05, 1992",http://ufcs
2824
  Umar,Nurmagomedov,,18,1,0,False,173,135,69.0,Orthodox,"Jan 03, 1996",http://ufcstats.com/fighter-details/2b6fc1c02736833d,69,135,1605
2825
  Adilet,Nurmatov,Kok-Zhal,13,2,0,False,,155,,,"Aug 02, 1997",http://ufcstats.com/fighter-details/689502703bbfe5f2,,155,1500
2826
  Kennedy,Nzechukwu,African Savage,14,5,0,False,196,236,83.0,Southpaw,"Jun 13, 1992",http://ufcstats.com/fighter-details/8667caa0451d245b,83,236,1550
2827
- Jake,O ftBrien,Irish,15,4,0,False,190,205,76.0,Orthodox,"Sep 25, 1984",http://ufcstats.com/fighter-details/20bcc9966affb19c,76,205,1500
2828
- TJ,O ftBrien,The Spider,20,8,0,False,188,155,,Orthodox,"Jan 01, 1987",http://ufcstats.com/fighter-details/d25b93992f285953,,155,1500
2829
- Sean,O ftConnell,The Real OC,17,9,0,False,185,205,74.0,Orthodox,"Sep 02, 1983",http://ufcstats.com/fighter-details/cb52f9490c2dc069,74,205,1500
2830
- Dan,O ftConnor,Johnny Irish,6,6,0,False,163,115,,,"Nov 17, 1982",http://ufcstats.com/fighter-details/69ea0119f6f0dfe0,,115,1500
2831
- Sean,O ftHaire,,4,2,0,False,196,270,,Orthodox,"Feb 25, 1971",http://ufcstats.com/fighter-details/46effbd1135423c5,,270,1500
2832
- Sean,O ftMalley,Suga,18,3,0,False,180,135,72.0,Switch,"Oct 24, 1994",http://ufcstats.com/fighter-details/b50a426a33da0012,72,135,1500
2833
- Jeremiah,O ftNeal,,13,25,0,False,173,260,,,"Oct 25, 1977",http://ufcstats.com/fighter-details/338fda4ec7034c5d,,260,1500
2834
- Chuck,O ftNeil,Cold Steel,17,9,0,False,188,170,,,"Sep 22, 1985",http://ufcstats.com/fighter-details/56bc9ccb609df534,,170,1500
2835
- Casey,O ftNeill,King,10,2,0,False,168,125,69.0,Orthodox,"Oct 07, 1997",http://ufcstats.com/fighter-details/04835018f90b118c,69,125,1500
2836
- Brendan,O ftReilly,Badger,6,3,0,False,170,170,69.0,Orthodox,"Jun 24, 1987",http://ufcstats.com/fighter-details/494b0bfdbac74502,69,170,1500
2837
  Takahiro,Oba,,5,7,1,False,173,200,,Southpaw,,http://ufcstats.com/fighter-details/7139cd2ae4bf6a29,,200,1500
2838
  Nobuhiro,Obiya,,12,5,1,False,173,145,,Orthodox,"Jan 15, 1981",http://ufcstats.com/fighter-details/6e3282d57d2467a0,,145,1500
2839
  Jose,Ochoa,Kalzifer,8,1,0,False,170,125,67.0,Southpaw,"Dec 31, 2000",http://ufcstats.com/fighter-details/88be62d6c1e6dadb,67,125,1502
@@ -2905,7 +2905,7 @@ Craig,Oxley,,0,3,0,False,175,155,,,"Feb 14, 1973",http://ufcstats.com/fighter-de
2905
  Shungo,Oyama,,14,19,0,False,180,185,,Southpaw,"Apr 11, 1974",http://ufcstats.com/fighter-details/47b7e4e60813b7b2,,185,1500
2906
  Ren,Ozaki,,6,1,2,False,170,135,68.0,Orthodox,"Dec 15, 2001",http://ufcstats.com/fighter-details/8997ee20b6a43d76,68,135,1500
2907
  Alptekin,Ozkilic,The Turkish Delight,9,5,0,False,165,125,65.0,Orthodox,"Mar 27, 1986",http://ufcstats.com/fighter-details/e18a19001a3f7c7d,65,125,1463
2908
- Raquel,Pa ftaluhi,Rocky,6,5,0,False,170,135,,,,http://ufcstats.com/fighter-details/373be586f370d400,,135,1500
2909
  Nick,Pace,,8,3,0,False,170,135,68.0,Orthodox,"Apr 17, 1987",http://ufcstats.com/fighter-details/8cb76103cd8a1562,68,135,1479
2910
  Angel,Pacheco,,7,3,0,False,173,135,70.0,Orthodox,"Jan 13, 1992",http://ufcstats.com/fighter-details/07797f10b9569cfc,70,135,1479
2911
  Larissa,Pacheco,,10,2,0,False,170,135,,Orthodox,"Sep 07, 1994",http://ufcstats.com/fighter-details/16b89be2f5c16fba,,135,1462
@@ -3438,7 +3438,7 @@ Hugo,Sandoval,,2,3,0,False,,135,,,,http://ufcstats.com/fighter-details/9d51bcc28
3438
  Joseph,Sandoval,,7,7,0,False,170,135,,Southpaw,"May 11, 1986",http://ufcstats.com/fighter-details/696002b59f09d73b,,135,1461
3439
  Raul,Sandoval,,3,3,0,False,,130,,,,http://ufcstats.com/fighter-details/f9ad10f6a49e5452,,130,1500
3440
  Chris,Sanford,,5,1,0,False,180,185,,Orthodox,"Mar 12, 1968",http://ufcstats.com/fighter-details/29f935654825331b,,185,1480
3441
- Roldan,Sangcha ftan,The Executioner,4,2,0,False,163,125,,Orthodox,"Dec 04, 1990",http://ufcstats.com/fighter-details/57887765f831e228,,125,1500
3442
  Martin,Sano,Spartan,4,3,1,False,180,170,75.0,Orthodox,"May 30, 1991",http://ufcstats.com/fighter-details/16a64f93f6678b7b,75,170,1481
3443
  Yuhi,Sano,,0,4,0,False,180,243,,Orthodox,"Feb 02, 1965",http://ufcstats.com/fighter-details/4c12aa7ca246e7a4,,243,1500
3444
  Jonathan,Santa Maria,,3,4,0,False,175,125,,,,http://ufcstats.com/fighter-details/3143e5daff9e5b71,,125,1500
 
374
  Jason,Blackford,,3,3,0,False,,,,,,http://ufcstats.com/fighter-details/619d807fa54ae8f7,,,1500
375
  Sherrard,Blackledge,The Thriller,5,1,0,False,180,155,75.0,Orthodox,"Aug 16, 1993",http://ufcstats.com/fighter-details/0e5c79b3594ff0ad,75,155,1500
376
  Tom,Blackledge,,10,7,0,False,,205,,Orthodox,,http://ufcstats.com/fighter-details/2adb11835acd815b,,205,1477
377
+ Da'Mon,Blackshear,The Monster,17,7,1,False,178,135,72.0,Switch,"Aug 12, 1994",http://ufcstats.com/fighter-details/da22387a0407a2dc,72,135,1500
378
  Chasen,Blair,Mestizo,6,3,0,False,178,155,71.0,Orthodox,"Oct 14, 1998",http://ufcstats.com/fighter-details/0f4a536507f33576,71,155,1500
379
  Erin,Blanchfield,Cold Blooded,13,2,0,False,163,125,66.0,Orthodox,"May 04, 1999",http://ufcstats.com/fighter-details/669970f7feba8ecd,66,125,1622
380
  David,Blanco,,2,0,0,False,170,145,,,,http://ufcstats.com/fighter-details/ebf298f8ac7e232b,,145,1500
 
831
  Chris,Curtis,Action-Man,31,12,0,False,178,170,75.0,Orthodox,"Jul 15, 1987",http://ufcstats.com/fighter-details/5442f1bc4b47eaf3,75,170,1529
832
  Ion,Cutelaba,The Hulk,19,11,1,False,185,205,75.0,Southpaw,"Dec 14, 1993",http://ufcstats.com/fighter-details/cd13728ae1151f46,75,205,1492
833
  Gleidson,Cutis,Poney,7,4,0,False,175,155,,Orthodox,"Feb 07, 1989",http://ufcstats.com/fighter-details/44a94bbde42246e4,,155,1500
834
+ Sarah,D'alelio,The Monster,11,7,0,False,170,145,,,"Dec 13, 1980",http://ufcstats.com/fighter-details/ac45450f75d14f16,,145,1500
835
  Alex,Da Silva,Leko,21,4,0,False,173,155,73.0,Orthodox,"Feb 04, 1996",http://ufcstats.com/fighter-details/c3ded6f7155f9ea4,73,155,1466
836
  Paulo,Da Silva,PH,11,1,0,False,,125,,,"Jul 03, 2002",http://ufcstats.com/fighter-details/3fe8ad7e1594537a,,125,1500
837
  Radley,Da Silva,Snake Eyes,7,1,0,False,,145,,,"Jun 20, 1995",http://ufcstats.com/fighter-details/0d68c7bfdff1dc34,,145,1500
 
1925
  Brad,Katona,Superman,16,5,0,False,168,135,64.0,Orthodox,"Dec 19, 1991",http://ufcstats.com/fighter-details/7b433309b0fd12aa,64,135,1471
1926
  Calvin,Kattar,,23,9,0,False,180,145,72.0,Orthodox,"Mar 26, 1988",http://ufcstats.com/fighter-details/751de04455cfaac0,72,145,1530
1927
  Sarah,Kaufman,,18,4,0,False,165,135,66.0,Orthodox,"Sep 20, 1985",http://ufcstats.com/fighter-details/36df8e119aec6175,66,135,1481
1928
+ Lone'er,Kavanagh,,9,0,0,False,163,125,67.0,Orthodox,"Jun 09, 1999",http://ufcstats.com/fighter-details/bb2c3c3a466224af,67,125,1500
1929
  Yusuke,Kawaguchi,,18,12,0,False,183,255,,,"Aug 14, 1980",http://ufcstats.com/fighter-details/fa2320781bfe4f49,,255,1500
1930
  Canaan,Kawaihae,,6,2,0,False,183,145,71.0,Southpaw,"Aug 26, 1997",http://ufcstats.com/fighter-details/58d42b9e920b25fc,71,145,1500
1931
  Tatsuya,Kawajiri,Crusher,36,11,2,False,170,145,69.0,Orthodox,"May 08, 1978",http://ufcstats.com/fighter-details/80d918336163b80c,69,145,1509
 
2432
  Elaina,Maxwell,Beef,7,4,0,False,173,145,,Orthodox,"Dec 16, 1978",http://ufcstats.com/fighter-details/a16ce18149021139,,145,1500
2433
  Jack,May,The Outlaw,9,3,0,False,203,255,,Switch,"Apr 14, 1981",http://ufcstats.com/fighter-details/2c6e81426dd7573c,,255,1462
2434
  Jeremy,May,,13,10,0,False,185,185,,,"Oct 23, 1986",http://ufcstats.com/fighter-details/c32fdfe75cda5b22,,185,1500
2435
+ Don'Tale,Mayes,Lord Kong,11,9,0,False,198,260,81.0,Orthodox,"Jan 16, 1992",http://ufcstats.com/fighter-details/1a9480fc288e55d7,81,260,1500
2436
  Gray,Maynard,,13,7,1,False,175,155,70.0,Orthodox,"May 09, 1979",http://ufcstats.com/fighter-details/7d96bc577e5178b2,70,155,1544
2437
  Brooke,Mayo,The Bully,0,1,0,False,170,125,,,,http://ufcstats.com/fighter-details/1b41c21d947d6f2f,,125,1500
2438
  Gina,Mazany,Danger,7,6,0,False,168,125,68.0,Southpaw,"Aug 19, 1988",http://ufcstats.com/fighter-details/016a8d958883167c,68,125,1429
 
2824
  Umar,Nurmagomedov,,18,1,0,False,173,135,69.0,Orthodox,"Jan 03, 1996",http://ufcstats.com/fighter-details/2b6fc1c02736833d,69,135,1605
2825
  Adilet,Nurmatov,Kok-Zhal,13,2,0,False,,155,,,"Aug 02, 1997",http://ufcstats.com/fighter-details/689502703bbfe5f2,,155,1500
2826
  Kennedy,Nzechukwu,African Savage,14,5,0,False,196,236,83.0,Southpaw,"Jun 13, 1992",http://ufcstats.com/fighter-details/8667caa0451d245b,83,236,1550
2827
+ Jake,O'Brien,Irish,15,4,0,False,190,205,76.0,Orthodox,"Sep 25, 1984",http://ufcstats.com/fighter-details/20bcc9966affb19c,76,205,1500
2828
+ TJ,O'Brien,The Spider,20,8,0,False,188,155,,Orthodox,"Jan 01, 1987",http://ufcstats.com/fighter-details/d25b93992f285953,,155,1500
2829
+ Sean,O'Connell,The Real OC,17,9,0,False,185,205,74.0,Orthodox,"Sep 02, 1983",http://ufcstats.com/fighter-details/cb52f9490c2dc069,74,205,1500
2830
+ Dan,O'Connor,Johnny Irish,6,6,0,False,163,115,,,"Nov 17, 1982",http://ufcstats.com/fighter-details/69ea0119f6f0dfe0,,115,1500
2831
+ Sean,O'Haire,,4,2,0,False,196,270,,Orthodox,"Feb 25, 1971",http://ufcstats.com/fighter-details/46effbd1135423c5,,270,1500
2832
+ Sean,O'Malley,Suga,18,3,0,False,180,135,72.0,Switch,"Oct 24, 1994",http://ufcstats.com/fighter-details/b50a426a33da0012,72,135,1500
2833
+ Jeremiah,O'Neal,,13,25,0,False,173,260,,,"Oct 25, 1977",http://ufcstats.com/fighter-details/338fda4ec7034c5d,,260,1500
2834
+ Chuck,O'Neil,Cold Steel,17,9,0,False,188,170,,,"Sep 22, 1985",http://ufcstats.com/fighter-details/56bc9ccb609df534,,170,1500
2835
+ Casey,O'Neill,King,10,2,0,False,168,125,69.0,Orthodox,"Oct 07, 1997",http://ufcstats.com/fighter-details/04835018f90b118c,69,125,1500
2836
+ Brendan,O'Reilly,Badger,6,3,0,False,170,170,69.0,Orthodox,"Jun 24, 1987",http://ufcstats.com/fighter-details/494b0bfdbac74502,69,170,1500
2837
  Takahiro,Oba,,5,7,1,False,173,200,,Southpaw,,http://ufcstats.com/fighter-details/7139cd2ae4bf6a29,,200,1500
2838
  Nobuhiro,Obiya,,12,5,1,False,173,145,,Orthodox,"Jan 15, 1981",http://ufcstats.com/fighter-details/6e3282d57d2467a0,,145,1500
2839
  Jose,Ochoa,Kalzifer,8,1,0,False,170,125,67.0,Southpaw,"Dec 31, 2000",http://ufcstats.com/fighter-details/88be62d6c1e6dadb,67,125,1502
 
2905
  Shungo,Oyama,,14,19,0,False,180,185,,Southpaw,"Apr 11, 1974",http://ufcstats.com/fighter-details/47b7e4e60813b7b2,,185,1500
2906
  Ren,Ozaki,,6,1,2,False,170,135,68.0,Orthodox,"Dec 15, 2001",http://ufcstats.com/fighter-details/8997ee20b6a43d76,68,135,1500
2907
  Alptekin,Ozkilic,The Turkish Delight,9,5,0,False,165,125,65.0,Orthodox,"Mar 27, 1986",http://ufcstats.com/fighter-details/e18a19001a3f7c7d,65,125,1463
2908
+ Raquel,Pa'aluhi,Rocky,6,5,0,False,170,135,,,,http://ufcstats.com/fighter-details/373be586f370d400,,135,1500
2909
  Nick,Pace,,8,3,0,False,170,135,68.0,Orthodox,"Apr 17, 1987",http://ufcstats.com/fighter-details/8cb76103cd8a1562,68,135,1479
2910
  Angel,Pacheco,,7,3,0,False,173,135,70.0,Orthodox,"Jan 13, 1992",http://ufcstats.com/fighter-details/07797f10b9569cfc,70,135,1479
2911
  Larissa,Pacheco,,10,2,0,False,170,135,,Orthodox,"Sep 07, 1994",http://ufcstats.com/fighter-details/16b89be2f5c16fba,,135,1462
 
3438
  Joseph,Sandoval,,7,7,0,False,170,135,,Southpaw,"May 11, 1986",http://ufcstats.com/fighter-details/696002b59f09d73b,,135,1461
3439
  Raul,Sandoval,,3,3,0,False,,130,,,,http://ufcstats.com/fighter-details/f9ad10f6a49e5452,,130,1500
3440
  Chris,Sanford,,5,1,0,False,180,185,,Orthodox,"Mar 12, 1968",http://ufcstats.com/fighter-details/29f935654825331b,,185,1480
3441
+ Roldan,Sangcha'an,The Executioner,4,2,0,False,163,125,,Orthodox,"Dec 04, 1990",http://ufcstats.com/fighter-details/57887765f831e228,,125,1500
3442
  Martin,Sano,Spartan,4,3,1,False,180,170,75.0,Orthodox,"May 30, 1991",http://ufcstats.com/fighter-details/16a64f93f6678b7b,75,170,1481
3443
  Yuhi,Sano,,0,4,0,False,180,243,,Orthodox,"Feb 02, 1965",http://ufcstats.com/fighter-details/4c12aa7ca246e7a4,,243,1500
3444
  Jonathan,Santa Maria,,3,4,0,False,175,125,,,,http://ufcstats.com/fighter-details/3143e5daff9e5b71,,125,1500
requirements.txt CHANGED
@@ -1,2 +1,4 @@
1
  requests
2
- beautifulsoup4
 
 
 
1
  requests
2
+ beautifulsoup4
3
+ pandas
4
+ scikit-learn
src/predict/main.py CHANGED
@@ -1,30 +1,37 @@
1
- from .models import EloBaselineModel
 
2
  from .pipeline import PredictionPipeline
3
 
4
  def main():
5
  """
6
- Sets up the models and runs the prediction pipeline.
7
- This is where you can add new models to compare them.
8
  """
9
- print("--- Initializing Machine Learning Prediction Pipeline ---")
 
 
 
 
 
 
 
 
10
 
11
- # 1. Initialize the models you want to test
12
- elo_model = EloBaselineModel()
13
-
14
- # Add other models here to compare them, e.g.:
15
- # logistic_model = LogisticRegressionModel()
16
-
17
- # 2. Create a list of the models to evaluate
18
  models_to_run = [
19
- elo_model,
20
- # logistic_model
21
  ]
 
22
 
23
- # 3. Initialize and run the pipeline
24
  pipeline = PredictionPipeline(models=models_to_run)
25
 
26
- # Set detailed_report=False for a summary, or True for a full detailed report
27
- pipeline.run(detailed_report=True)
 
 
 
28
 
29
  if __name__ == '__main__':
30
  main()
 
1
+ import argparse
2
+ from .models import EloBaselineModel, LogisticRegressionModel
3
  from .pipeline import PredictionPipeline
4
 
5
def main():
    """
    Command-line entry point for the prediction pipeline.

    Reads the ``--report`` option ('detailed' or 'summary', defaulting to
    'detailed'), builds the list of models to evaluate and hands them to
    ``PredictionPipeline``. A ``FileNotFoundError`` raised while the pipeline
    runs is reported on the console instead of propagating.
    """
    report_modes = ['detailed', 'summary']
    cli = argparse.ArgumentParser(description="UFC Fight Prediction Pipeline")
    cli.add_argument(
        '--report',
        type=str,
        default='detailed',
        choices=report_modes,
        help="Type of report to generate: 'detailed' (file) or 'summary' (console)."
    )
    options = cli.parse_args()

    # Every model listed here is evaluated by the pipeline; add new ones to this list.
    candidates = [
        EloBaselineModel(),
        LogisticRegressionModel(),
    ]
    runner = PredictionPipeline(models=candidates)

    wants_detailed = options.report == 'detailed'
    try:
        runner.run(detailed_report=wants_detailed)
    except FileNotFoundError as missing:
        print(f"Error: {missing}")
        print("Please ensure the required data files exist. You may need to run the scraping and ELO analysis first.")
35
 
36
  if __name__ == '__main__':
37
  main()
src/predict/models.py CHANGED
@@ -2,6 +2,10 @@ from abc import ABC, abstractmethod
2
  import sys
3
  import os
4
  from ..analysis.elo import process_fights_for_elo, INITIAL_ELO
 
 
 
 
5
 
6
  class BaseModel(ABC):
7
  """
@@ -18,12 +22,11 @@ class BaseModel(ABC):
18
  pass
19
 
20
  @abstractmethod
21
- def predict(self, fighter1_name, fighter2_name):
22
  """
23
  Predicts the winner of a single fight.
24
 
25
- :param fighter1_name: The name of the first fighter.
26
- :param fighter2_name: The name of the second fighter.
27
  :return: The name of the predicted winning fighter.
28
  """
29
  pass
@@ -33,24 +36,113 @@ class EloBaselineModel(BaseModel):
33
  A baseline prediction model that predicts the winner based on the higher ELO rating.
34
  """
35
  def __init__(self):
36
- self.historical_elos = {}
37
 
38
  def train(self, train_fights):
39
  """
40
- Calculates the ELO ratings for all fighters based on historical data.
41
- These ratings are then stored to be used for predictions.
42
  """
43
- print("Training ELO Baseline Model...")
44
- self.historical_elos = process_fights_for_elo(train_fights)
45
- print("ELO Model training complete.")
 
46
 
47
- def predict(self, fighter1_name, fighter2_name):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  """
49
- Predicts the winner based on which fighter has the higher historical ELO.
50
- If a fighter has no ELO rating, the default initial ELO is used.
51
  """
52
- elo1 = self.historical_elos.get(fighter1_name, INITIAL_ELO)
53
- elo2 = self.historical_elos.get(fighter2_name, INITIAL_ELO)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
- # Return the name of the fighter with the higher ELO
56
- return fighter1_name if elo1 > elo2 else fighter2_name
 
2
  import sys
3
  import os
4
  from ..analysis.elo import process_fights_for_elo, INITIAL_ELO
5
+ import pandas as pd
6
+ from sklearn.linear_model import LogisticRegression
7
+ from ..config import FIGHTERS_CSV_PATH
8
+ from .preprocess import preprocess_for_ml, _get_fighter_history_stats, _calculate_age
9
 
10
  class BaseModel(ABC):
11
  """
 
22
  pass
23
 
24
  @abstractmethod
25
+ def predict(self, fight):
26
  """
27
  Predicts the winner of a single fight.
28
 
29
+ :param fight: A dictionary representing a single fight.
 
30
  :return: The name of the predicted winning fighter.
31
  """
32
  pass
 
36
  A baseline prediction model that predicts the winner based on the higher ELO rating.
37
  """
38
  def __init__(self):
39
+ self.fighters_df = None
40
 
41
  def train(self, train_fights):
42
  """
43
+ For the ELO baseline, 'training' simply consists of loading the fighter data
44
+ to access their ELO scores during prediction.
45
  """
46
+ print("Training EloBaselineModel: Loading fighter ELO data...")
47
+ self.fighters_df = pd.read_csv(FIGHTERS_CSV_PATH)
48
+ self.fighters_df['full_name'] = self.fighters_df['first_name'] + ' ' + self.fighters_df['last_name']
49
+ self.fighters_df = self.fighters_df.drop_duplicates(subset=['full_name']).set_index('full_name')
50
 
51
+ def predict(self, fight):
52
+ """Predicts the winner based on who has the higher ELO score."""
53
+ f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
54
+
55
+ try:
56
+ f1_elo = self.fighters_df.loc[f1_name, 'elo']
57
+ f2_elo = self.fighters_df.loc[f2_name, 'elo']
58
+
59
+ return f1_name if f1_elo > f2_elo else f2_name
60
+ except KeyError as e:
61
+ # If a fighter isn't found, we can't make a prediction.
62
+ # Returning None or a default is a design choice.
63
+ print(f"Warning: Could not find ELO for fighter {e}. Skipping prediction.")
64
+ return None
65
+
66
class LogisticRegressionModel(BaseModel):
    """
    A model that uses logistic regression to predict fight outcomes based on differential features.

    Every feature is built as "fighter_1 value minus fighter_2 value" (plus one
    same/different stance flag). ``train`` must be called before ``predict``.
    """

    def __init__(self):
        """Create the unfitted classifier and the empty lookup structures filled by ``train``."""
        self.model = LogisticRegression(solver='liblinear', random_state=42)
        # Fighter table indexed by full name; populated by train().
        self.fighters_df = None
        # Maps a fighter's full name to that fighter's training fights, oldest first.
        self.fighter_histories = {}

    def train(self, train_fights):
        """
        Trains the logistic regression model by preprocessing the training data
        and fitting the model.

        :param train_fights: List of fight dictionaries. Each needs
            ``fighter_1``, ``fighter_2`` and ``event_date``; as a side effect
            every dictionary gains a ``date_obj`` key.
        """
        print("Training LogisticRegressionModel...")

        # 1. Prepare data for prediction-time feature generation
        fighters = pd.read_csv(FIGHTERS_CSV_PATH)
        # An empty name cell is read as NaN and NaN + str stays NaN; fill first so
        # fighters missing a first or last name still get a usable index key.
        first = fighters['first_name'].fillna('').astype(str)
        last = fighters['last_name'].fillna('').astype(str)
        fighters['full_name'] = (first + ' ' + last).str.strip()
        fighters = fighters.drop_duplicates(subset=['full_name']).set_index('full_name')
        for col in ['height_cm', 'reach_in', 'elo']:
            if col in fighters.columns:
                fighters[col] = pd.to_numeric(fighters[col], errors='coerce')
        self.fighters_df = fighters

        # 2. Pre-calculate fighter histories for efficient lookup during prediction.
        # A single pass over the fights fills one bucket per known fighter,
        # instead of rescanning the whole fight list once per fighter.
        histories = {name: [] for name in self.fighters_df.index}
        for fight in train_fights:
            fight['date_obj'] = pd.to_datetime(fight['event_date'])
            # A set keeps a fight from being added twice to the same bucket.
            for name in {fight['fighter_1'], fight['fighter_2']}:
                bucket = histories.get(name)
                if bucket is not None:
                    bucket.append(fight)
        for bucket in histories.values():
            bucket.sort(key=lambda f: f['date_obj'])
        self.fighter_histories = histories

        # 3. Preprocess training data and fit the model
        X_train, y_train, _ = preprocess_for_ml(train_fights, FIGHTERS_CSV_PATH)
        print(f"Fitting model on {X_train.shape[0]} samples...")
        self.model.fit(X_train, y_train)
        print("Model training complete.")

    def predict(self, fight):
        """
        Predicts the outcome of a single fight by generating its feature vector.

        :param fight: A dictionary with ``fighter_1``, ``fighter_2`` and
            ``event_date``.
        :return: The predicted winner's name, or ``None`` when either fighter
            is absent from the fighter table.
        """
        f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
        fight_date = pd.to_datetime(fight['event_date'])

        if f1_name not in self.fighters_df.index or f2_name not in self.fighters_df.index:
            print(f"Warning: Fighter not found in data. Skipping prediction for {f1_name} vs {f2_name}")
            return None

        # 1. Get base stats
        f1_stats, f2_stats = self.fighters_df.loc[f1_name], self.fighters_df.loc[f2_name]
        # .loc returns a DataFrame when the index holds duplicates; keep the first row.
        if isinstance(f1_stats, pd.DataFrame):
            f1_stats = f1_stats.iloc[0]
        if isinstance(f2_stats, pd.DataFrame):
            f2_stats = f2_stats.iloc[0]

        # 2. Get historical stats
        f1_hist_stats = _get_fighter_history_stats(f1_name, fight_date, self.fighter_histories.get(f1_name, []), self.fighters_df)
        f2_hist_stats = _get_fighter_history_stats(f2_name, fight_date, self.fighter_histories.get(f2_name, []), self.fighters_df)

        # 3. Create differential features
        f1_age = _calculate_age(f1_stats.get('dob'), fight['event_date'])
        f2_age = _calculate_age(f2_stats.get('dob'), fight['event_date'])

        # NOTE(review): the key order below becomes the column order of the
        # feature vector; presumably it mirrors preprocess_for_ml -- confirm.
        features = {
            'elo_diff': f1_stats.get('elo', 1500) - f2_stats.get('elo', 1500),
            'height_diff_cm': f1_stats.get('height_cm', 0) - f2_stats.get('height_cm', 0),
            'reach_diff_in': f1_stats.get('reach_in', 0) - f2_stats.get('reach_in', 0),
            'age_diff_years': (f1_age - f2_age) if f1_age and f2_age else 0,
            # NOTE(review): two missing stances compare as different (NaN != NaN);
            # confirm this matches the training-time feature.
            'stance_is_different': 1 if f1_stats.get('stance') != f2_stats.get('stance') else 0,
            'wins_last_5_diff': f1_hist_stats['wins_last_n'] - f2_hist_stats['wins_last_n'],
            'avg_opp_elo_last_5_diff': f1_hist_stats['avg_opp_elo_last_n'] - f2_hist_stats['avg_opp_elo_last_n'],
            'ko_percent_last_5_diff': f1_hist_stats['ko_percent_last_n'] - f2_hist_stats['ko_percent_last_n'],
            'sig_str_landed_per_min_last_5_diff': f1_hist_stats['sig_str_landed_per_min_last_n'] - f2_hist_stats['sig_str_landed_per_min_last_n'],
        }

        # Differences involving a missing value are NaN; they are replaced by 0.
        feature_vector = pd.DataFrame([features]).fillna(0)

        # 4. Predict
        # model.predict returns the class label itself (not a probability);
        # label 1 is treated as a win for fighter_1.
        prediction = self.model.predict(feature_vector)[0]

        return f1_name if prediction == 1 else f2_name
 
src/predict/pipeline.py CHANGED
@@ -64,7 +64,8 @@ class PredictionPipeline:
64
  f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
65
  actual_winner = fight['winner']
66
  event_name = fight.get('event_name', 'Unknown Event')
67
- predicted_winner = model.predict(f1_name, f2_name)
 
68
 
69
  is_correct = (predicted_winner == actual_winner)
70
  if is_correct:
 
64
  f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
65
  actual_winner = fight['winner']
66
  event_name = fight.get('event_name', 'Unknown Event')
67
+
68
+ predicted_winner = model.predict(fight)
69
 
70
  is_correct = (predicted_winner == actual_winner)
71
  if is_correct:
src/predict/preprocess.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import os
3
+ import sys
4
+ from datetime import datetime
5
+ from ..config import FIGHTERS_CSV_PATH
6
+
7
+ def _clean_numeric_column(series):
8
+ """A helper to clean string columns into numbers, handling errors."""
9
+ series_str = series.astype(str)
10
+ return pd.to_numeric(series_str.str.replace(r'[^0-9.]', '', regex=True), errors='coerce')
11
+
12
+ def _calculate_age(dob_str, fight_date_str):
13
+ """Calculates age in years from a date of birth string and fight date string."""
14
+ if pd.isna(dob_str) or not dob_str:
15
+ return None
16
+ try:
17
+ dob = datetime.strptime(dob_str, '%b %d, %Y')
18
+ fight_date = datetime.strptime(fight_date_str, '%B %d, %Y')
19
+ return (fight_date - dob).days / 365.25
20
+ except (ValueError, TypeError):
21
+ return None
22
+
23
+ def _parse_round_time_to_seconds(round_str, time_str):
24
+ """Converts fight duration from round and time to total seconds."""
25
+ try:
26
+ rounds = int(round_str)
27
+ minutes, seconds = map(int, time_str.split(':'))
28
+ # Assuming 5-minute rounds for calculation simplicity
29
+ return ((rounds - 1) * 5 * 60) + (minutes * 60) + seconds
30
+ except (ValueError, TypeError, AttributeError):
31
+ return 0
32
+
33
+ def _parse_striking_stats(stat_str):
34
+ """Parses striking stats string like '10 of 20' into (landed, attempted)."""
35
+ try:
36
+ landed, attempted = map(int, stat_str.split(' of '))
37
+ return landed, attempted
38
+ except (ValueError, TypeError, AttributeError):
39
+ return 0, 0
40
+
41
+ def _get_fighter_history_stats(fighter_name, current_fight_date, fighter_history, fighters_df, n=5):
42
+ """
43
+ Calculates performance statistics for a fighter based on their last n fights.
44
+ """
45
+ past_fights = [f for f in fighter_history if f['date_obj'] < current_fight_date]
46
+ last_n_fights = past_fights[-n:]
47
+
48
+ if not last_n_fights:
49
+ # Return a default dictionary with the correct keys for a fighter with no history
50
+ return {
51
+ 'wins_last_n': 0,
52
+ 'avg_opp_elo_last_n': 1500, # Assume average ELO for first opponent
53
+ 'ko_percent_last_n': 0,
54
+ 'sig_str_landed_per_min_last_n': 0,
55
+ }
56
+
57
+ stats = {
58
+ 'wins': 0, 'ko_wins': 0, 'total_time_secs': 0,
59
+ 'sig_str_landed': 0, 'opponent_elos': []
60
+ }
61
+
62
+ for fight in last_n_fights:
63
+ is_fighter_1 = (fight['fighter_1'] == fighter_name)
64
+ opponent_name = fight['fighter_2'] if is_fighter_1 else fight['fighter_1']
65
+
66
+ if fight['winner'] == fighter_name:
67
+ stats['wins'] += 1
68
+ if 'KO' in fight['method']:
69
+ stats['ko_wins'] += 1
70
+
71
+ if opponent_name in fighters_df.index:
72
+ opp_elo = fighters_df.loc[opponent_name, 'elo']
73
+ stats['opponent_elos'].append(opp_elo if pd.notna(opp_elo) else 1500)
74
+
75
+ stats['total_time_secs'] += _parse_round_time_to_seconds(fight['round'], fight['time'])
76
+
77
+ sig_str_stat = fight.get(f'f1_sig_str' if is_fighter_1 else 'f2_sig_str', '0 of 0')
78
+ landed, _ = _parse_striking_stats(sig_str_stat)
79
+ stats['sig_str_landed'] += landed
80
+
81
+ # Final calculations
82
+ avg_opp_elo = sum(stats['opponent_elos']) / len(stats['opponent_elos']) if stats['opponent_elos'] else 1500
83
+
84
+ return {
85
+ 'wins_last_n': stats['wins'],
86
+ 'avg_opp_elo_last_n': avg_opp_elo,
87
+ 'ko_percent_last_n': (stats['ko_wins'] / stats['wins']) if stats['wins'] > 0 else 0,
88
+ 'sig_str_landed_per_min_last_n': (stats['sig_str_landed'] * 60 / stats['total_time_secs']) if stats['total_time_secs'] > 0 else 0,
89
+ }
90
+
91
def preprocess_for_ml(fights_to_process, fighters_csv_path):
    """
    Transforms raw fight and fighter data into a feature matrix (X) and target vector (y)
    suitable for a binary classification machine learning model.

    Every fight whose two fighters are both present in the fighters CSV yields two
    samples: the (fighter_1 - fighter_2) differences labelled 1 and the negated
    differences labelled 0, so the resulting target vector is balanced.

    Side effect: each dict in ``fights_to_process`` gets a ``'date_obj'`` key holding
    the parsed ``'event_date'``.

    Args:
        fights_to_process (list of dict): The list of fights to process. Each fight
            must provide 'fighter_1', 'fighter_2' and 'event_date' (formatted like
            "January 01, 2020"); the history helper reads further keys.
        fighters_csv_path (str): Path to the CSV file with all fighter stats.

    Returns:
        pd.DataFrame: Feature matrix X (missing values replaced by 0).
        pd.Series: Target vector y (1 = win, 0 = loss), named 'winner'.
        pd.DataFrame: Metadata DataFrame with 'winner', 'loser' and 'event_date',
            one row per generated sample.

    Raises:
        FileNotFoundError: If ``fighters_csv_path`` does not exist.
        ValueError: If an 'event_date' does not match the '%B %d, %Y' format.
    """
    # Local import so the block does not depend on a module-level import being present.
    from collections import defaultdict

    default_elo = 1500

    if not os.path.exists(fighters_csv_path):
        raise FileNotFoundError(f"Fighters data not found at '{fighters_csv_path}'.")

    fighters_df = pd.read_csv(fighters_csv_path)

    # 1. Prepare fighters data for merging
    fighters_prepared = fighters_df.copy()

    # Blank name parts are read as NaN, and NaN + str stays NaN. Fill them first so
    # single-name fighters keep a usable full name instead of collapsing to NaN.
    first_names = fighters_prepared['first_name'].fillna('').astype(str)
    last_names = fighters_prepared['last_name'].fillna('').astype(str)
    fighters_prepared['full_name'] = (first_names + ' ' + last_names).str.strip()
    # Rows without any name can never be matched to a fight.
    fighters_prepared = fighters_prepared[fighters_prepared['full_name'] != '']

    # Handle duplicate fighter names by keeping the first entry
    fighters_prepared = fighters_prepared.drop_duplicates(subset=['full_name'], keep='first')
    fighters_prepared = fighters_prepared.set_index('full_name')

    for col in ['height_cm', 'reach_in', 'elo']:
        if col in fighters_prepared.columns:
            fighters_prepared[col] = _clean_numeric_column(fighters_prepared[col])

    # 2. Pre-calculate fighter histories to speed up lookups
    # And convert date strings to datetime objects once
    for fight in fights_to_process:
        fight['date_obj'] = datetime.strptime(fight['event_date'], '%B %d, %Y')

    # Group fights per fighter in a single pass over the date-sorted fights instead of
    # rescanning every fight for every fighter. sorted() is stable, so each fighter's
    # history keeps the same order a per-fighter sort would produce.
    known_names = set(fighters_prepared.index)
    fighter_histories = defaultdict(list)
    for fight in sorted(fights_to_process, key=lambda f: f['date_obj']):
        # A set avoids adding the same fight twice if both corners carry the same name.
        for name in {fight['fighter_1'], fight['fighter_2']}:
            if name in known_names:
                fighter_histories[name].append(fight)

    # 3. Process fights to create features and targets
    feature_list = []
    target_list = []
    metadata_list = []

    for fight in fights_to_process:
        # Per the dataset's design, fighter_1 is always the winner.
        f1_name, f2_name = fight['fighter_1'], fight['fighter_2']

        if f1_name not in known_names or f2_name not in known_names:
            continue

        f1_stats, f2_stats = fighters_prepared.loc[f1_name], fighters_prepared.loc[f2_name]

        # Defensive: a non-unique index would make .loc return a DataFrame.
        if isinstance(f1_stats, pd.DataFrame):
            f1_stats = f1_stats.iloc[0]
        if isinstance(f2_stats, pd.DataFrame):
            f2_stats = f2_stats.iloc[0]

        # Calculate ages for both fighters
        f1_age = _calculate_age(f1_stats.get('dob'), fight['event_date'])
        f2_age = _calculate_age(f2_stats.get('dob'), fight['event_date'])

        # Get historical stats for both fighters
        f1_hist_stats = _get_fighter_history_stats(
            f1_name, fight['date_obj'], fighter_histories.get(f1_name, []), fighters_prepared
        )
        f2_hist_stats = _get_fighter_history_stats(
            f2_name, fight['date_obj'], fighter_histories.get(f2_name, []), fighters_prepared
        )

        # The .get() default only covers a missing column; a present-but-NaN ELO must
        # also fall back to the default rating, as the history helper does.
        f1_elo = f1_stats.get('elo', default_elo)
        f2_elo = f2_stats.get('elo', default_elo)
        if pd.isna(f1_elo):
            f1_elo = default_elo
        if pd.isna(f2_elo):
            f2_elo = default_elo

        # --- Create two training examples from each fight for a balanced dataset ---

        # 1. The "Win" case: (fighter_1 - fighter_2)
        features_win = {
            # Original diffs
            'elo_diff': f1_elo - f2_elo,
            'height_diff_cm': f1_stats.get('height_cm', 0) - f2_stats.get('height_cm', 0),
            'reach_diff_in': f1_stats.get('reach_in', 0) - f2_stats.get('reach_in', 0),
            'age_diff_years': (f1_age - f2_age) if f1_age and f2_age else 0,
            # NOTE(review): two missing stances (NaN) compare as different here - confirm
            # whether that is intended.
            'stance_is_different': 1 if f1_stats.get('stance') != f2_stats.get('stance') else 0,
            # New historical diffs
            'wins_last_5_diff': f1_hist_stats['wins_last_n'] - f2_hist_stats['wins_last_n'],
            'avg_opp_elo_last_5_diff': (
                f1_hist_stats['avg_opp_elo_last_n'] - f2_hist_stats['avg_opp_elo_last_n']
            ),
            'ko_percent_last_5_diff': (
                f1_hist_stats['ko_percent_last_n'] - f2_hist_stats['ko_percent_last_n']
            ),
            'sig_str_landed_per_min_last_5_diff': (
                f1_hist_stats['sig_str_landed_per_min_last_n']
                - f2_hist_stats['sig_str_landed_per_min_last_n']
            ),
        }
        feature_list.append(features_win)
        target_list.append(1)  # 1 represents a win

        # 2. The "Loss" case: (fighter_2 - fighter_1)
        # We invert the differences for the losing case.
        features_loss = {key: -value for key, value in features_win.items()}
        # Stance difference is symmetric; it doesn't get inverted.
        features_loss['stance_is_different'] = features_win['stance_is_different']

        feature_list.append(features_loss)
        target_list.append(0)  # 0 represents a loss

        # Add metadata for both generated samples
        # The 'winner' and 'loser' are consistent with the original data structure
        sample_metadata = {
            'winner': f1_name, 'loser': f2_name, 'event_date': fight['event_date']
        }
        metadata_list.append(sample_metadata)
        metadata_list.append(dict(sample_metadata))

    # Differences involving a missing measurement are NaN; treat them as "no difference".
    X = pd.DataFrame(feature_list).fillna(0)
    y = pd.Series(target_list, name='winner')
    metadata = pd.DataFrame(metadata_list)

    print(f"Preprocessing complete. Generated {X.shape[0]} samples with {X.shape[1]} features.")
    return X, y, metadata
200
+
201
if __name__ == '__main__':
    # Manual smoke run: load the training split through the prediction pipeline,
    # preprocess it, and print a short summary of the resulting frames.
    from .pipeline import PredictionPipeline

    print("--- Running Preprocessing Example ---")

    pipeline = PredictionPipeline(models=[])
    try:
        pipeline._load_and_split_data()
        if pipeline.train_fights:
            X_train, y_train, metadata_train = preprocess_for_ml(
                pipeline.train_fights, FIGHTERS_CSV_PATH
            )

            # Shapes of the three outputs, in the order they are returned.
            print("\nTraining Data Shape:")
            shape_report = (
                ("X_train:", X_train.shape),
                ("y_train:", y_train.shape),
                ("metadata_train:", metadata_train.shape),
            )
            for label, shape in shape_report:
                print(label, shape)

            # Each section is a heading followed by a lazily evaluated summary.
            sections = (
                ("\nLast 5 rows of X_train (showing populated historical features):", X_train.tail),
                ("\nTarget distribution (0=Loss, 1=Win):", y_train.value_counts),
                ("\nMetadata for last 5 rows:", metadata_train.tail),
            )
            for heading, summarize in sections:
                print(heading)
                print(summarize())

    except FileNotFoundError as err:
        print(err)
        print("Please run the scraping pipeline first ('python -m src.scrape.main').")
src/scrape/main.py CHANGED
@@ -8,37 +8,29 @@ from .. import config
8
 
9
  def main():
10
  """
11
- Main pipeline to scrape UFC data and convert it to CSV.
12
  """
13
  # Ensure the output directory exists
14
  if not os.path.exists(config.OUTPUT_DIR):
15
  os.makedirs(config.OUTPUT_DIR)
16
  print(f"Created directory: {config.OUTPUT_DIR}")
17
 
18
- # --- Step 1: Scrape Events and Fights ---
19
- print("\n--- Starting Events and Fights Scraping ---")
20
- all_events_data = scrape_all_events()
21
- with open(config.EVENTS_JSON_PATH, 'w') as f:
22
- json.dump(all_events_data, f, indent=4)
23
- print(f"Scraping for events complete. Data saved to {config.EVENTS_JSON_PATH}")
24
 
25
- # --- Step 2: Scrape Fighters ---
26
- print("\n--- Starting Fighters Scraping ---")
27
- all_fighters_data = scrape_all_fighters()
28
- with open(config.FIGHTERS_JSON_PATH, 'w') as f:
29
- json.dump(all_fighters_data, f, indent=4)
30
- print(f"Scraping for fighters complete. Data saved to {config.FIGHTERS_JSON_PATH}")
31
-
32
- # --- Step 3: Convert JSON to CSV ---
33
- print("\n--- Converting all JSON files to CSV ---")
34
  json_to_csv(config.EVENTS_JSON_PATH, config.FIGHTS_CSV_PATH)
35
  fighters_json_to_csv(config.FIGHTERS_JSON_PATH, config.FIGHTERS_CSV_PATH)
36
 
37
- # --- Step 4: Preprocess CSV data ---
38
- print("\n--- Preprocessing fighter data (converting height to cm) ---")
39
- preprocess_fighters_csv(config.FIGHTERS_CSV_PATH)
 
40
 
41
- # --- Step 5: Clean up temporary JSON files ---
42
  print("\n--- Deleting temporary JSON files ---")
43
  try:
44
  if os.path.exists(config.EVENTS_JSON_PATH):
@@ -50,7 +42,7 @@ def main():
50
  except OSError as e:
51
  print(f"Error deleting JSON files: {e}")
52
 
53
- print("\n--- Pipeline Finished ---")
54
 
55
  if __name__ == '__main__':
56
  main()
 
8
 
9
  def main():
10
  """
11
+ Main function to run the complete scraping and preprocessing pipeline.
12
  """
13
  # Ensure the output directory exists
14
  if not os.path.exists(config.OUTPUT_DIR):
15
  os.makedirs(config.OUTPUT_DIR)
16
  print(f"Created directory: {config.OUTPUT_DIR}")
17
 
18
+ # --- Step 1: Scrape all data from the website ---
19
+ # This will generate fighters.json and events.json
20
+ scrape_all_fighters()
21
+ scrape_all_events()
 
 
22
 
23
+ # --- Step 2: Convert the scraped JSON data to CSV format ---
24
+ # This will generate fighters.csv and fights.csv
 
 
 
 
 
 
 
25
  json_to_csv(config.EVENTS_JSON_PATH, config.FIGHTS_CSV_PATH)
26
  fighters_json_to_csv(config.FIGHTERS_JSON_PATH, config.FIGHTERS_CSV_PATH)
27
 
28
+ # --- Step 3: Run post-processing on the generated CSV files ---
29
+ # This cleans names, converts height, etc.
30
+ print("\n--- Running post-scraping preprocessing ---")
31
+ preprocess_fighters_csv()
32
 
33
+ # --- Step 4: Clean up temporary JSON files ---
34
  print("\n--- Deleting temporary JSON files ---")
35
  try:
36
  if os.path.exists(config.EVENTS_JSON_PATH):
 
42
  except OSError as e:
43
  print(f"Error deleting JSON files: {e}")
44
 
45
+ print("\n\n--- Scraping and Preprocessing Pipeline Finished ---")
46
 
47
  if __name__ == '__main__':
48
  main()
src/scrape/preprocess.py CHANGED
@@ -26,7 +26,7 @@ def convert_height_to_cm(height_str):
26
 
27
  def preprocess_fighters_csv(file_path=config.FIGHTERS_CSV_PATH):
28
  """
29
- Reads the fighters CSV, converts height to cm, renames the column,
30
  and saves the changes back to the same file.
31
  """
32
  if not os.path.exists(file_path):
@@ -46,18 +46,24 @@ def preprocess_fighters_csv(file_path=config.FIGHTERS_CSV_PATH):
46
  headers = reader.fieldnames
47
  rows = list(reader)
48
 
49
- # Check if there's a 'height' column to process
50
- if 'height' not in headers:
51
- print("No 'height' column found. Nothing to do.")
52
- return
53
-
54
  # Process the rows in memory
55
  for row in rows:
56
- # Create a new key for the converted height and remove the old one
57
- row['height_cm'] = convert_height_to_cm(row.pop('height', ''))
 
 
 
 
 
 
 
58
 
59
- # Update the header name
60
- headers[headers.index('height')] = 'height_cm'
 
61
 
62
  # Write the modified data back to the same file, overwriting it
63
  with open(file_path, 'w', newline='', encoding='utf-8') as csv_file:
@@ -66,7 +72,10 @@ def preprocess_fighters_csv(file_path=config.FIGHTERS_CSV_PATH):
66
  writer.writerows(rows)
67
 
68
  print(f"Successfully processed file: {file_path}")
69
- print("Converted 'height' column to centimeters and renamed it to 'height_cm'.")
 
 
 
70
 
71
  except Exception as e:
72
  print(f"An error occurred: {e}")
 
26
 
27
  def preprocess_fighters_csv(file_path=config.FIGHTERS_CSV_PATH):
28
  """
29
+ Reads the fighters CSV, cleans names, converts height to cm,
30
  and saves the changes back to the same file.
31
  """
32
  if not os.path.exists(file_path):
 
46
  headers = reader.fieldnames
47
  rows = list(reader)
48
 
49
+ # --- Data Cleaning and Processing ---
50
+
51
+ name_cleaned_count = 0
 
 
52
  # Process the rows in memory
53
  for row in rows:
54
+ # Clean fighter names (e.g., "O ftMalley" -> "O'Malley")
55
+ for col in ['first_name', 'last_name']:
56
+ if col in row and ' ft' in row[col]:
57
+ row[col] = row[col].replace(' ft', "'")
58
+ name_cleaned_count += 1
59
+
60
+ # Convert height to cm and remove the old column
61
+ if 'height' in row:
62
+ row['height_cm'] = convert_height_to_cm(row.pop('height'))
63
 
64
+ # Update the header name if 'height' was present
65
+ if 'height' in headers:
66
+ headers[headers.index('height')] = 'height_cm'
67
 
68
  # Write the modified data back to the same file, overwriting it
69
  with open(file_path, 'w', newline='', encoding='utf-8') as csv_file:
 
72
  writer.writerows(rows)
73
 
74
  print(f"Successfully processed file: {file_path}")
75
+ if name_cleaned_count > 0:
76
+ print(f"Cleaned {name_cleaned_count} instances of ' ft' in fighter names.")
77
+ if 'height_cm' in headers:
78
+ print("Converted 'height' column to centimeters and renamed it to 'height_cm'.")
79
 
80
  except Exception as e:
81
  print(f"An error occurred: {e}")