|
import pandas as pd
|
|
import json
|
|
import numpy as np
|
|
import warnings
|
|
|
|
# Silence pandas' SettingWithCopyWarning for the whole script.
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

# Input locations and the language to evaluate.
# NOTE(review): these are absolute, machine-specific paths; adjust before
# running the script anywhere else.
GOLD_PATH = '/home/jfraile/Programs/DIPROMATS_2024/evaluation_script/gold_test.json'
file_path = '/home/jfraile/Programs/DIPROMATS_2024/evaluation_script/test_en.json'
lang = 'en'

# JSON text is UTF-8; pass the encoding explicitly so decoding does not depend
# on the platform's default locale encoding.
with open(GOLD_PATH, 'r', encoding='utf-8') as g:
    gold = json.load(g)

with open(file_path, 'r', encoding='utf-8') as f:
    test = json.load(f)
|
|
|
|
|
|
|
|
def evaluate_results(lang, gold, test):
    """Score narrative predictions against gold labels for one language.

    Args:
        lang: Language code. Gold rows are selected where ``lang`` equals it,
            prediction rows where ``language`` equals it.
        gold: Records accepted by ``pandas.DataFrame``. The columns read are
            ``id``, ``lang``, ``country`` and ``N1`` .. ``N6``. A label of
            ``'yes'`` or ``'no'`` is taken literally; any other value falls
            into a third, in-between category that only lenient scoring
            rewards.
        test: Records accepted by ``pandas.DataFrame``. The columns read are
            ``id``, ``language``, ``country`` and ``narratives`` (a container
            of codes such as ``'CH1'``). A ``tweet_id`` column is dropped when
            present.

    Returns:
        ``{'strict': {...}, 'lenient': {...}}``. Each mode maps ``CH1`` ..
        ``US6``, ``CH_micro`` .. ``US_micro`` and ``micro`` to
        ``{'scores': {'precision', 'recall', 'f1-score'}, 'raw_data': m}``,
        where ``m`` is a 2x3 count matrix: rows are predicted yes / no,
        columns are gold yes / in-between / no.

    Gold tweets without a prediction are skipped. A tweet contributes to a
    country only when both its gold and its predicted country equal that
    country; the global ``micro`` entry uses every id-matched tweet.
    """
    countries_dic = {'China': 'CH', 'Russia': 'RU', 'EU': 'EU', 'USA': 'US'}
    cols = [f'N{i}' for i in range(1, 7)]

    def normalize_labels(df):
        """Expand ``narratives`` into yes/no columns N1..N6 for each row."""
        data = df.copy()
        country_codes = [country[:2].upper() for country in data['country']]
        for number, col in enumerate(cols, start=1):
            data[col] = [
                'yes' if f"{code}{number}" in narratives else 'no'
                for code, narratives in zip(country_codes, data['narratives'])
            ]
        # 'tweet_id' is optional; ignore it when the file does not provide it.
        return data.drop(columns=['narratives', 'tweet_id'], errors='ignore')

    def convert_labels(values):
        """Encode rows of labels as 1 (yes), 2 (no) or 3 (anything else)."""
        encoded = [
            [1 if v == 'yes' else 2 if v == 'no' else 3 for v in row]
            for row in values
        ]
        # The reshape keeps a (0, 6) shape when there are no rows at all.
        return np.array(encoded, dtype=int).reshape(-1, len(cols))

    def confusion_matrix(pred_codes, real_codes):
        """Count (prediction, gold) pairs into the 2x3 raw matrix."""
        matrix = np.zeros((2, 3), dtype=int)
        for row, pred_code in enumerate((1, 2)):
            for col, real_code in enumerate((1, 3, 2)):
                matrix[row, col] = np.count_nonzero(
                    (pred_codes == pred_code) & (real_codes == real_code)
                )
        return matrix

    def compute_scores(matrix, lenient):
        """Return precision / recall / F1 for one raw matrix.

        Lenient scoring counts "predicted yes, gold in-between" as a hit;
        strict scoring counts it as a false positive.
        """
        tp, yl, fp = (int(v) for v in matrix[0])
        fn = int(matrix[1, 0])
        hits = tp + yl if lenient else tp
        predicted_positive = tp + yl + fp
        actual_positive = hits + fn
        precision = hits / predicted_positive if predicted_positive else 0.0
        recall = hits / actual_positive if actual_positive else 0.0
        total = precision + recall
        f1 = 2 * precision * recall / total if total else 0.0
        return {'precision': precision, 'recall': recall, 'f1-score': f1}

    results = {'strict': {}, 'lenient': {}}

    def record(name, matrix):
        """Store scores and raw counts for ``name`` under both modes."""
        for mode in results:
            results[mode][name] = {
                'scores': compute_scores(matrix, lenient=(mode == 'lenient')),
                'raw_data': matrix.tolist(),
            }

    # Gold rows for the requested language, last duplicate wins.
    df_gold = pd.DataFrame(gold)
    df_gold['country'] = df_gold['country'].replace('European Union', 'EU')
    df_gold = df_gold.drop_duplicates(subset=['id', 'lang'], keep='last')
    df_lang = df_gold[df_gold['lang'] == lang]

    # Prediction rows for the requested language, last duplicate wins.
    df_test = pd.DataFrame(test)
    df_test['country'] = df_test['country'].replace('European Union', 'EU')
    df_test = normalize_labels(df_test)
    df_test = df_test.drop_duplicates(subset=['id', 'language'], keep='last')
    df_test_lang = df_test[df_test['language'] == lang]

    # Pair every gold row with its prediction through one id -> position map.
    test_row_of = {
        tweet: pos for pos, tweet in enumerate(df_test_lang['id'].tolist())
    }
    gold_ids = df_lang['id'].tolist()
    gold_rows = [pos for pos, tweet in enumerate(gold_ids) if tweet in test_row_of]
    test_rows = [test_row_of[gold_ids[pos]] for pos in gold_rows]

    gold_idx = np.asarray(gold_rows, dtype=int)
    test_idx = np.asarray(test_rows, dtype=int)
    real = convert_labels(df_lang[cols].to_numpy()[gold_idx])
    pred = convert_labels(df_test_lang[cols].to_numpy()[test_idx])

    gold_countries = df_lang['country'].tolist()
    test_countries = df_test_lang['country'].tolist()
    matched_countries = [
        (gold_countries[g], test_countries[t])
        for g, t in zip(gold_rows, test_rows)
    ]

    for country, code in countries_dic.items():
        in_country = np.array(
            [g == country and t == country for g, t in matched_countries],
            dtype=bool,
        )
        country_pred = pred[in_country]
        country_real = real[in_country]
        for i in range(len(cols)):
            record(
                f'{code}{i + 1}',
                confusion_matrix(country_pred[:, i], country_real[:, i]),
            )
        # Equals the sum of the six per-narrative matrices of this country.
        record(f'{code}_micro', confusion_matrix(country_pred, country_real))

    record('micro', confusion_matrix(pred, real))
    return results
|
|
|
|
# Score the loaded predictions against the loaded gold data for `lang`
# and print the resulting metrics dictionary.
results = evaluate_results(lang=lang, gold=gold, test=test)
print(results)
|
|
|
|
|
|
"""
|
|
strict
    narrative_country (e.g. CH1)
        scores
            precision
            recall
            f1-score
        raw_data
    country_micro (e.g. CH_micro)
        scores
            precision
            recall
            f1-score
        raw_data
    micro (global micro)
        scores
            precision
            recall
            f1-score
        raw_data

lenient
    narrative_country (e.g. CH1)
        scores
            precision
            recall
            f1-score
        raw_data
    country_micro (e.g. CH_micro)
        scores
            precision
            recall
            f1-score
        raw_data
    micro (global micro)
        scores
            precision
            recall
            f1-score
        raw_data"""