File size: 4,277 Bytes
4d6e8c2
 
 
 
 
243d40e
296146e
9012700
4d6e8c2
 
 
296146e
4d6e8c2
 
 
70f5f26
1c33274
70f5f26
3b83e0c
 
 
 
 
b562460
3b83e0c
 
 
 
 
 
 
 
 
9bcb67c
3b83e0c
 
296146e
 
f5aa578
3b83e0c
 
 
 
 
 
 
 
 
9012700
 
1c33274
70f5f26
3b83e0c
8589546
4d6e8c2
70f5f26
 
 
 
 
4d6e8c2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70f5f26
 
 
 
 
4d6e8c2
 
 
243d40e
3b83e0c
 
 
320940c
70f5f26
 
 
 
 
4d6e8c2
 
 
 
 
 
 
 
 
 
 
 
70f5f26
4d6e8c2
 
 
 
1c33274
4d6e8c2
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
from fastapi import APIRouter
from datetime import datetime
from datasets import load_dataset
from sklearn.metrics import accuracy_score
import random
from skops.io import load
# Textpreprocessor defined in this scope


from .utils.evaluation import TextEvaluationRequest
from .utils.emissions import tracker, clean_emissions_data, get_space_info
from .utils.text_preprocessor import preprocess

# Router on which the text-task endpoint defined in this module is registered.
router = APIRouter()

# Endpoint description passed to the route decorator.
DESCRIPTION = "Random Baseline"
# URL path under which the text evaluation endpoint is mounted.
ROUTE = "/text"

# Human-readable descriptions, keyed by the `model` values the endpoint
# dispatches on.
models_description = {
    "baseline": "random baseline",
    "tfidf_xgb": "TF-IDF vectorizer and XGBoost classifier",
}


# Some code borrowed from Nonnormalizable

def baseline_model(dataset_length: int, num_labels: int = 8) -> list:
    """Return uniformly random label predictions (baseline "model").

    Args:
        dataset_length: Number of predictions to generate. Zero or a
            negative value yields an empty list.
        num_labels: Size of the label space; predictions are drawn from
            ``0 .. num_labels - 1``. Defaults to 8, i.e. labels 0-7.

    Returns:
        A list of ``dataset_length`` integers, each in ``[0, num_labels - 1]``.

    Raises:
        ValueError: Raised by ``random.randint`` when ``num_labels < 1`` and
            at least one prediction has to be drawn.
    """
    # randint is inclusive on both ends, hence the "- 1" on the upper bound.
    highest_label = num_labels - 1
    return [random.randint(0, highest_label) for _ in range(dataset_length)]

def tree_classifier(test_dataset: dict, model: str):
    """Predict labels for the dataset's quotes with a stored skops pipeline.

    Args:
        test_dataset: Dataset-like object; its "quote" entry is read and
            passed through `preprocess`.
        model: Base name (without extension) of the `.skops` file located
            under `tasks/text_models/`.

    Returns:
        The output of the loaded pipeline's `predict` on the preprocessed
        quotes.
    """
    quotes = preprocess(test_dataset["quote"])

    # Only these XGBoost types are whitelisted when deserializing the file.
    trusted_types = [
        'xgboost.core.Booster',
        'xgboost.sklearn.XGBClassifier',
    ]
    pipeline = load(f"tasks/text_models/{model}.skops", trusted=trusted_types)

    return pipeline.predict(quotes)


@router.post(ROUTE, tags=["Text Task"],
             description=DESCRIPTION)
async def evaluate_text(request: TextEvaluationRequest,
                        model: str = "tfidf_xgb"):
    """
    Evaluate text classification for climate disinformation detection.

    Supported models (the keys of ``models_description``):
    - "baseline": random predictions from the label space (0-7)
    - "tfidf_xgb": TF-IDF vectorizer and XGBoost classifier (default)

    Args:
        request: Evaluation settings; ``dataset_name``, ``test_size`` and
            ``test_seed`` are read from it.
        model: Name of the model to run inference with.

    Returns:
        A dict with the space info, submission timestamp, description of the
        model that was run, accuracy, energy/emissions figures of the
        inference task, and the dataset configuration.

    Raises:
        HTTPException: 400 if ``model`` is not a known model name; 501 if it
            is known but has no inference branch here.
    """
    # Imported locally so the handler is self-contained with respect to the
    # module's import list.
    from fastapi import HTTPException

    # Reject unknown models before any dataset loading or emissions tracking;
    # otherwise `predictions` would never be bound and the request would fail
    # with an opaque server error.
    if model not in models_description:
        raise HTTPException(
            status_code=400,
            detail=(
                f"Unknown model {model!r}; "
                f"expected one of {sorted(models_description)}"
            ),
        )

    # Get space info
    username, space_url = get_space_info()

    # Define the label mapping
    LABEL_MAPPING = {
        "0_not_relevant": 0,
        "1_not_happening": 1,
        "2_not_human": 2,
        "3_not_bad": 3,
        "4_solutions_harmful_unnecessary": 4,
        "5_science_unreliable": 5,
        "6_proponents_biased": 6,
        "7_fossil_fuels_needed": 7
    }

    # Load and prepare the dataset
    dataset = load_dataset(request.dataset_name)

    # Convert string labels to integers
    dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})

    # Split dataset
    train_test = dataset["train"].train_test_split(test_size=request.test_size, seed=request.test_seed)
    test_dataset = train_test["test"]

    # Start tracking emissions
    tracker.start()
    tracker.start_task("inference")

    #--------------------------------------------------------------------------------------------
    # MODEL INFERENCE (tracked for energy consumption and emissions)
    #--------------------------------------------------------------------------------------------
    try:
        true_labels = test_dataset["label"]

        if model == "baseline":
            predictions = baseline_model(len(true_labels))
        elif model == "tfidf_xgb":
            predictions = tree_classifier(test_dataset, model='xgb_pipeline')
        else:
            # A model was added to `models_description` without wiring an
            # inference branch here.
            raise HTTPException(
                status_code=501,
                detail=f"Model {model!r} has no inference implementation",
            )
    finally:
        # Always close the tracked task, even when inference fails, so the
        # shared tracker is not left with a dangling "inference" task.
        emissions_data = tracker.stop_task()
    #--------------------------------------------------------------------------------------------
    # MODEL INFERENCE STOPS HERE
    #--------------------------------------------------------------------------------------------

    # Calculate accuracy
    accuracy = accuracy_score(true_labels, predictions)

    # Prepare results dictionary
    results = {
        "username": username,
        "space_url": space_url,
        "submission_timestamp": datetime.now().isoformat(),
        # Describe the model that actually ran rather than a fixed label.
        "model_description": models_description[model],
        "accuracy": float(accuracy),
        "energy_consumed_wh": emissions_data.energy_consumed * 1000,
        "emissions_gco2eq": emissions_data.emissions * 1000,
        "emissions_data": clean_emissions_data(emissions_data),
        "api_route": ROUTE,
        "dataset_config": {
            "dataset_name": request.dataset_name,
            "test_size": request.test_size,
            "test_seed": request.test_seed
        }
    }

    return results