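"""FastAPI route that trains and evaluates an XGBoost text classifier
for climate disinformation detection, tracking emissions during inference."""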
from datetime import datetime

import xgboost as xgb
from datasets import load_dataset
from fastapi import APIRouter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

from .utils.evaluation import TextEvaluationRequest
from .utils.emissions import tracker, clean_emissions_data, get_space_info



router = APIRouter()

DESCRIPTION = "XGBoost classification on TF-IDF features"
ROUTE = "/text"

@router.post(ROUTE, tags=["Text Task"], 
             description=DESCRIPTION)
async def evaluate_text(request: TextEvaluationRequest):
    """
    Evaluate text classification for climate disinformation detection.
    
    Current Model: Bidirectional LSTM with Attention layer classification
    - Current Model: Bidirectional LSTM with Attention layer classification classification predictions from the label space (0-7)
    - Used as a baseline for comparison
    """
    # Get space info
    username, space_url = get_space_info()

    # Define the label mapping
    LABEL_MAPPING = {
        "0_not_relevant": 0,
        "1_not_happening": 1,
        "2_not_human": 2,
        "3_not_bad": 3,
        "4_solutions_harmful_unnecessary": 4,
        "5_science_unreliable": 5,
        "6_proponents_biased": 6,
        "7_fossil_fuels_needed": 7
    }

    # Load and prepare the dataset
    dataset = load_dataset(request.dataset_name)

    # Convert string labels to integers
    dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})

    # Split the dataset into train and test partitions
    train_test = dataset["train"].train_test_split(test_size=request.test_size, seed=request.test_seed)
    train_dataset = train_test["train"]
    test_dataset = train_test["test"]
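    # Each example provides a 'quote' text field and the integer 'label' mapped above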
    
    # Vectorize: fit TF-IDF on the training quotes only, then apply the same
    # fitted vocabulary to the test quotes
    tfidf_vect = TfidfVectorizer(stop_words='english')
    tfidf_train = tfidf_vect.fit_transform(train_dataset['quote'])
    tfidf_test = tfidf_vect.transform(test_dataset['quote'])

    y_train = train_dataset["label"]
    true_labels = test_dataset["label"]

    
            
    # Model: XGBoost trained on the TF-IDF features.
    # Hyperparameters suggested by an earlier tuning run:
    # {'colsample_bytree': 0.7498850106268238, 'gamma': 0.3690168082131852, 'learning_rate': 0.054839600377537934, 'max_depth': 5, 'n_estimators': 125, 'subsample': 0.6272998821416366}

    # Start tracking emissions
    tracker.start()
    tracker.start_task("inference")
    
    # Fit on the training split, then predict on the held-out test split
    xgb_model = xgb.XGBClassifier(max_depth=6, n_estimators=500,
                                  colsample_bytree=0.75, gamma=0.35,
                                  learning_rate=0.06, subsample=0.63,
                                  objective='multi:softprob')
    xgb_model.fit(tfidf_train, y_train)
    predictions = xgb_model.predict(tfidf_test)
   
    # Stop tracking emissions
    emissions_data = tracker.stop_task()

    # Calculate accuracy on the held-out test labels
    accuracy = accuracy_score(true_labels, predictions)

    # Prepare results dictionary
    results = {
        "username": username,
        "space_url": space_url,
        "submission_timestamp": datetime.now().isoformat(),
        "model_description": DESCRIPTION,
        "accuracy": float(accuracy),
        "energy_consumed_wh": emissions_data.energy_consumed * 1000,
        "emissions_gco2eq": emissions_data.emissions * 1000,
        "emissions_data": clean_emissions_data(emissions_data),
        "api_route": ROUTE,
        "dataset_config": {
            "dataset_name": request.dataset_name,
            "test_size": request.test_size,
            "test_seed": request.test_seed
        }
    }
    
    return results