from fastapi import APIRouter
from datetime import datetime
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb

from .utils.evaluation import TextEvaluationRequest
from .utils.emissions import tracker, clean_emissions_data, get_space_info

router = APIRouter()

DESCRIPTION = "TF-IDF + XGBoost classification"
ROUTE = "/text"


@router.post(ROUTE, tags=["Text Task"], description=DESCRIPTION)
async def evaluate_text(request: TextEvaluationRequest):
    """
    Evaluate text classification for climate disinformation detection.

    Current model: TF-IDF features fed to an XGBoost classifier that
    predicts over the label space (0-7). Used as a baseline for comparison.
    """
    # Get space info
    username, space_url = get_space_info()

    # Define the label mapping
    LABEL_MAPPING = {
        "0_not_relevant": 0,
        "1_not_happening": 1,
        "2_not_human": 2,
        "3_not_bad": 3,
        "4_solutions_harmful_unnecessary": 4,
        "5_science_unreliable": 5,
        "6_proponents_biased": 6,
        "7_fossil_fuels_needed": 7,
    }

    # Load and prepare the dataset
    dataset = load_dataset(request.dataset_name)

    # Convert string labels to integers
    dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})

    # Split dataset
    train_test = dataset["train"].train_test_split(
        test_size=request.test_size, seed=request.test_seed
    )
    train_dataset = train_test["train"]
    test_dataset = train_test["test"]

    # Vectorize: fit the TF-IDF vocabulary on the training split only, then
    # apply the already-fitted transform to the held-out test split.
    tfidf_vect = TfidfVectorizer(stop_words="english")
    tfidf_train = tfidf_vect.fit_transform(train_dataset["quote"])
    tfidf_test = tfidf_vect.transform(test_dataset["quote"])

    y_train = train_dataset["label"]
    true_labels = test_dataset["label"]

    # Model
    # Tuned parameter sets from earlier search runs:
    # {'colsample_bytree': 0.7039283369765, 'gamma': 0.3317686860083553, 'learning_rate': 0.08341079006092542, 'max_depth': 5, 'n_estimators': 140, 'subsample': 0.6594650911012452}
    # {'colsample_bytree': 0.7498850106268238, 'gamma': 0.3690168082131852, 'learning_rate': 0.054839600377537934, 'max_depth': 5, 'n_estimators': 125, 'subsample': 0.6272998821416366}
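    # A minimal sketch of how parameter sets like those above could be
    # reproduced with scikit-learn's RandomizedSearchCV. The search ranges,
    # n_iter, and cv below are illustrative assumptions, not the values
    # actually used to obtain the parameters listed above.
    #
    # from scipy.stats import randint, uniform
    # from sklearn.model_selection import RandomizedSearchCV
    #
    # search = RandomizedSearchCV(
    #     xgb.XGBClassifier(),
    #     param_distributions={
    #         "max_depth": randint(3, 8),
    #         "n_estimators": randint(100, 600),
    #         "learning_rate": uniform(0.01, 0.15),
    #         "subsample": uniform(0.5, 0.4),
    #         "colsample_bytree": uniform(0.5, 0.4),
    #         "gamma": uniform(0.0, 0.5),
    #     },
    #     n_iter=25,
    #     scoring="accuracy",
    #     cv=3,
    # )
    # search.fit(tfidf_train, y_train)
    # print("Parameters:", search.best_params_)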
    # XGBClassifier infers the multi-class objective (multi:softprob) and the
    # number of classes from y_train, so predict() returns class indices
    # directly and no argmax over probabilities is needed.
    xgb_model = xgb.XGBClassifier(
        max_depth=6,
        n_estimators=500,
        colsample_bytree=0.75,
        gamma=0.35,
        learning_rate=0.06,
        subsample=0.63,
    )

    # Train on the training split; the test split stays held out for evaluation
    xgb_model.fit(tfidf_train, y_train)

    # Start tracking emissions for the inference step only
    tracker.start()
    tracker.start_task("inference")

    predictions = xgb_model.predict(tfidf_test)

    # Stop tracking emissions
    emissions_data = tracker.stop_task()

    # Calculate accuracy
    accuracy = accuracy_score(true_labels, predictions)

    # Prepare results dictionary
    results = {
        "username": username,
        "space_url": space_url,
        "submission_timestamp": datetime.now().isoformat(),
        "model_description": DESCRIPTION,
        "accuracy": float(accuracy),
        "energy_consumed_wh": emissions_data.energy_consumed * 1000,
        "emissions_gco2eq": emissions_data.emissions * 1000,
        "emissions_data": clean_emissions_data(emissions_data),
        "api_route": ROUTE,
        "dataset_config": {
            "dataset_name": request.dataset_name,
            "test_size": request.test_size,
            "test_seed": request.test_seed,
        },
    }

    return results
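
# Usage sketch (kept commented out; not part of the route). Exercises the
# endpoint locally with FastAPI's TestClient. The module path "tasks.text"
# and the dataset name are hypothetical placeholders; the request fields
# (dataset_name, test_size, test_seed) match TextEvaluationRequest as used
# in the handler above.
#
# from fastapi import FastAPI
# from fastapi.testclient import TestClient
# from tasks.text import router  # hypothetical package path for this module
#
# app = FastAPI()
# app.include_router(router)
# client = TestClient(app)
# response = client.post("/text", json={
#     "dataset_name": "username/climate-disinformation-quotes",  # hypothetical
#     "test_size": 0.2,
#     "test_seed": 42,
# })
# print(response.json()["accuracy"])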