In [None]:
import pickle as pk
import pandas as pd
from pathlib import Path
import numpy as np
import seaborn as sns

from rouge_score import rouge_scorer


from lexrank import LexRank
from lexrank.mappings.stopwords import STOPWORDS
import nltk 



In [None]:



# Input directory: the generation cells below glob it for "*.pk" pickle files.
path = Path("output/summaries/rsa_reranking/reviews_rsa_matrices/")
# Output directory: the generation cells below write one CSV of summaries per
# (input file, method setting) into it.
output_path = Path("output/summaries/methods_reviews/")



# Consensus score based summaries:

In [None]:
def consensus_scores_based_summaries(sample, n_consensus=3, n_dissensus=3):
    """Build a two-part summary from the extremes of the consensuality scores.

    ``sample['consensuality_scores']`` must be a pandas Series whose index
    labels are the candidate texts (they are joined as strings).

    The first part joins, with ".", the labels of the ``n_consensus`` entries
    with the lowest scores; the second part joins the labels of the
    ``n_dissensus`` entries with the highest scores. The two parts are
    separated by a blank line.
    """
    scores = sample['consensuality_scores']

    lowest_first = scores.sort_values(ascending=True)
    highest_first = scores.sort_values(ascending=False)

    consensus_part = ".".join(lowest_first.head(n_consensus).index.tolist())
    dissensus_part = ".".join(highest_first.head(n_dissensus).index.tolist())

    return "\n\n".join([consensus_part, dissensus_part])
 
 
def rsa_scores_based_summaries(sample, n_consensus=3, n_rsa_speaker=3):
    """Build a two-part summary from consensuality scores and RSA picks.

    The first part joins, with ".", the index labels of the ``n_consensus``
    lowest entries of ``sample['consensuality_scores']`` (a pandas Series).
    The second part joins the first ``n_rsa_speaker`` items of
    ``sample['best_rsa']`` (anything exposing ``.tolist()``). The two parts
    are separated by a blank line.
    """
    ranked = sample['consensuality_scores'].sort_values(ascending=True)
    consensus_texts = ranked.head(n_consensus).index.tolist()

    rsa_texts = sample['best_rsa'].tolist()[:n_rsa_speaker]

    return "\n\n".join([".".join(consensus_texts), ".".join(rsa_texts)])

# Module-level ROUGE scorer (ROUGE-1 and ROUGE-L, stemming enabled); it is read
# as a global by construct_templated_summaries.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

def lead(sample, N=10):
    """Truncate every source text to its leading '.'-separated pieces.

    The index labels of ``sample['speaker_df']`` are taken as the texts. Each
    text is split on '.', at most the first ``N`` pieces are kept and re-joined
    with '.', and the truncated texts are joined with newlines.
    """
    truncated = []
    for document in sample['speaker_df'].index.tolist():
        leading_pieces = document.split('.')[:N]
        truncated.append(".".join(leading_pieces))

    return "\n".join(truncated)

 
 

# NOTE(review): this rebinds `scorer` to a new object built with exactly the
# same arguments as the assignment a few lines earlier in this cell; one of the
# two assignments is redundant.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)


def construct_templated_summaries(data, fn, dataset=None):
    """Build one summary record per sample and return them as a DataFrame.

    Parameters
    ----------
    data : mapping
        Must provide ``'results'`` (an iterable of samples) and
        ``'metadata/reranking_model'``; ``'metadata/rsa_iterations'`` is read
        when present. Every sample must provide ``'id'`` and ``'speaker_df'``;
        the index labels of ``speaker_df`` are joined with blank lines to form
        the source text, so they are expected to be strings.
    fn : callable
        Called as ``fn(sample)``; its return value is stored as the summary.
    dataset : pandas.DataFrame, optional
        Frame indexed by sample id with a ``'gold'`` column. When given, the
        first gold value found for the sample id is stored and, if it is a
        string, scored against the summary with the module-level ``scorer``.

    Returns
    -------
    pandas.DataFrame
        One row per sample with the columns ``id``, ``summary``,
        ``metadata/reranking_model``, ``metadata/rsa_iterations`` and ``text``;
        with ``dataset`` also ``gold`` and one F-measure column per ROUGE type
        returned by the scorer.

    Raises
    ------
    KeyError
        If a sample id is not present in the index of ``dataset``.
    """
    records = []
    for sample in data['results']:
        summary = fn(sample)
        text = "\n\n".join(sample['speaker_df'].index.tolist())

        record = {
            'id': sample['id'],
            'summary': summary,
            'metadata/reranking_model': data['metadata/reranking_model'],
            # Read the iteration count from its own key (not from the
            # reranking-model key); .get keeps inputs without it working.
            'metadata/rsa_iterations': data.get('metadata/rsa_iterations'),
            'text': text,
        }

        if dataset is not None:
            # List-based .loc always yields a Series, whether the id occurs on
            # one row or on several rows of the dataset.
            gold = dataset.loc[[sample['id']], 'gold'].iloc[0]
            record['gold'] = gold

            # Missing references come back from CSV as NaN (a float), which
            # the scorer cannot tokenize, so only score real strings.
            if isinstance(gold, str):
                # RougeScorer.score takes (target, prediction). The F-measure
                # stored here is symmetric in the two arguments.
                rouges = scorer.score(gold, summary)
                record.update({rouge_type: score.fmeasure for rouge_type, score in rouges.items()})

        records.append(record)

    return pd.DataFrame.from_records(records)
 

 
 


In [None]:
def prepare_dataset(dataset_name, dataset_path="data/processed/"):
    """Load the CSV that belongs to ``dataset_name`` from ``dataset_path``.

    Known names are "amazon", "space", "yelp" and "reviews"; each maps to a
    fixed file name inside ``dataset_path``. The file is read with
    ``pandas.read_csv`` and returned unchanged.

    Raises
    ------
    ValueError
        If ``dataset_name`` is not one of the known names.
    """
    csv_by_dataset = {
        "amazon": "amazon_test.csv",
        "space": "space.csv",
        "yelp": "yelp_test.csv",
        "reviews": "test_metareviews.csv",
    }
    root = Path(dataset_path)

    if dataset_name not in csv_by_dataset:
        raise ValueError(f"Unknown dataset {dataset_name}")

    return pd.read_csv(root / csv_by_dataset[dataset_name])


In [None]:
# Scratch cell: the commented-out snippet below loaded the 'reviews' dataset
# and printed row texts grouped by id. It is not executed.
# df = prepare_dataset('reviews')

# for n, group in df.groupby('id'):
# for idx, row in group.iterrows():
# print(row['text'].replace('-----', "\n"))
# print("===========")
# break
# Bare expression: as the last line of the cell it only displays the function
# object; it has no other effect.
rsa_scores_based_summaries

In [None]:


# Generate the "Agreement" summaries: for every pickled result file under
# `path` and every n in 1..6, build summaries with
# consensus_scores_based_summaries(n_consensus=n, n_dissensus=n) and write one
# CSV per (file, n) into `output_path`.
# NOTE(review): pickle.load can execute arbitrary code; only point `path` at
# files produced by a trusted pipeline.
Path(output_path).mkdir(parents=True, exist_ok=True)

for file in path.glob("*.pk"):
    print(file)

    # The pickle and the reference dataset depend only on the file, so they are
    # loaded once per file rather than once per (n, file) pair.
    with file.open('rb') as fd:
        data = pk.load(fd)

    # The first four '-_-'-separated fields of the stem are unpacked below;
    # only dataset_name is used in this cell.
    model_name, dataset_name, decoding_config, date = str(file.stem).split('-_-')[:4]

    dataset = prepare_dataset(dataset_name, dataset_path="data/processed/")
    dataset = dataset.set_index('id')

    for n in [1, 2, 3, 4, 5, 6]:
        # n=n freezes the current value inside the lambda.
        fn = lambda sample, n=n: consensus_scores_based_summaries(sample, n_consensus=n, n_dissensus=n)

        df = construct_templated_summaries(data, fn, dataset=dataset)

        df['metadata/method'] = "Agreement"
        df['metadata/n_sentences'] = 2*n
        df['metadata/n_consensus'] = n
        df['metadata/n_dissensus'] = n

        name = file.stem + "-_-" + f"consensus_score_based_{n}.csv"
        csv_path = output_path / name

        if csv_path.exists():
            # Merge with the file on disk: columns that only exist there are
            # kept, every freshly computed column is overwritten (aligned on
            # the row index).
            df_old = pd.read_csv(csv_path)

            # to_csv below writes the row index, which read_csv returns as an
            # "Unnamed: N" column; drop those so re-runs do not pile them up.
            df_old = df_old.drop(columns=[c for c in df_old.columns if str(c).startswith("Unnamed:")])

            for col in df.columns:
                df_old[col] = df[col]

            df = df_old

        df.to_csv(csv_path)
 
 
 
 

In [None]:

# Generate the "Speaker+Agreement" summaries: for every pickled result file
# under `path` and every n in 1..6, build summaries with
# rsa_scores_based_summaries(n_consensus=n, n_rsa_speaker=n) and write one CSV
# per (file, n) into `output_path`.
# NOTE(review): pickle.load can execute arbitrary code; only point `path` at
# files produced by a trusted pipeline.
Path(output_path).mkdir(parents=True, exist_ok=True)

for file in path.glob("*.pk"):
    # The pickle and the reference dataset depend only on the file, so they are
    # loaded once per file rather than once per (n, file) pair.
    with file.open('rb') as fd:
        data = pk.load(fd)

    # The first four '-_-'-separated fields of the stem are unpacked below;
    # only dataset_name is used in this cell.
    model_name, dataset_name, decoding_config, date = str(file.stem).split('-_-')[:4]

    dataset = prepare_dataset(dataset_name, dataset_path="data/processed/")
    dataset = dataset.set_index('id')

    for n in [1, 2, 3, 4, 5, 6]:
        # n=n freezes the current value inside the lambda.
        fn = lambda sample, n=n: rsa_scores_based_summaries(sample, n_consensus=n, n_rsa_speaker=n)
        df = construct_templated_summaries(data, fn, dataset=dataset)

        df['metadata/method'] = "Speaker+Agreement"
        df['metadata/n_sentences'] = 2*n
        df['metadata/n_consensus'] = n
        # NOTE(review): for this method n is passed as n_rsa_speaker, yet the
        # column is still called n_dissensus; kept as-is so existing readers of
        # the CSVs keep working -- confirm whether a rename is wanted.
        df['metadata/n_dissensus'] = n

        name = file.stem + "-_-" + f"rsa_score_based_{n}.csv"
        csv_path = output_path / name

        if csv_path.exists():
            # Merge with the file on disk: columns that only exist there are
            # kept, every freshly computed column is overwritten (aligned on
            # the row index).
            df_old = pd.read_csv(csv_path)

            # to_csv below writes the row index, which read_csv returns as an
            # "Unnamed: N" column; drop those so re-runs do not pile them up.
            df_old = df_old.drop(columns=[c for c in df_old.columns if str(c).startswith("Unnamed:")])

            for col in df.columns:
                df_old[col] = df[col]

            df = df_old

        df.to_csv(csv_path)

In [None]:

# Generate the "Lead" baseline summaries: for every pickled result file under
# `path` and every n in 1..8, build summaries with lead(N=2*n) and write one
# CSV per (file, n) into `output_path`.
# NOTE(review): pickle.load can execute arbitrary code; only point `path` at
# files produced by a trusted pipeline.
Path(output_path).mkdir(parents=True, exist_ok=True)

for file in path.glob("*.pk"):
    # The pickle and the reference dataset depend only on the file, so they are
    # loaded once per file rather than once per (n, file) pair.
    with file.open('rb') as fd:
        data = pk.load(fd)

    # The first four '-_-'-separated fields of the stem are unpacked below;
    # only dataset_name is used in this cell.
    model_name, dataset_name, decoding_config, date = str(file.stem).split('-_-')[:4]

    dataset = prepare_dataset(dataset_name, dataset_path="data/processed/")
    dataset = dataset.set_index('id')

    for n in [1, 2, 3, 4, 5, 6, 7, 8]:
        # n=n freezes the current value inside the lambda.
        fn = lambda sample, n=n: lead(sample, N=2*n)

        df = construct_templated_summaries(data, fn, dataset=dataset)

        df['metadata/method'] = "Lead"
        df['metadata/n_sentences'] = 2*n

        name = file.stem + "-_-" + f"lead_{2*n}.csv"
        csv_path = output_path / name

        if csv_path.exists():
            # Merge with the file on disk: columns that only exist there are
            # kept, every freshly computed column is overwritten (aligned on
            # the row index).
            df_old = pd.read_csv(csv_path)

            # to_csv below writes the row index, which read_csv returns as an
            # "Unnamed: N" column; drop those so re-runs do not pile them up.
            df_old = df_old.drop(columns=[c for c in df_old.columns if str(c).startswith("Unnamed:")])

            for col in df.columns:
                df_old[col] = df[col]

            df = df_old

        df.to_csv(csv_path)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Directory scanned for "*.csv" files by the cells below.
# NOTE(review): this re-assigns output_path with the same value it was given
# near the top of the notebook.
output_path = Path("output/summaries/methods_reviews/")

In [None]:
import subprocess


# Run the external script mds/evaluate_bartbert_metrics.py once per summaries
# CSV found in `output_path`, passing the file through --summaries.
# sorted() snapshots the directory listing before any subprocess starts, so
# files created while the loop runs are not picked up, and gives a stable order.
# NOTE(review): "python" is resolved through PATH and may differ from the
# kernel's interpreter; consider sys.executable if they must match.
for file in sorted(output_path.glob("*.csv")):
    print(file)
    cmd = ["python", "mds/evaluate_bartbert_metrics.py", "--summaries", str(file)]
    completed = subprocess.run(cmd)
    # Keep going on failure (best effort), but make the failure visible instead
    # of silently ignoring the exit status.
    if completed.returncode != 0:
        print(f"WARNING: {' '.join(cmd)} exited with code {completed.returncode}")

In [None]:
# Collect every CSV in `output_path` into a single frame. The model and
# dataset names come from the first '-_-'-separated fields of the file stem,
# the method label from the last field; 'metadata/method' is overwritten with
# that label.
dfs = []
for file in output_path.glob("*.csv"):
    model_name, dataset_name, decoding_config, date = str(file.stem).split('-_-')[:4]
    method = str(file.stem).split('-_-')[-1]

    df = pd.read_csv(file)
    df['metadata/Model'] = model_name
    df['metadata/Dataset'] = dataset_name
    df['metadata/method'] = method

    # Combined label used as hue in the plots below.
    df["Method"] = f"{model_name}/{method}"

    dfs.append(df)

# Every file is read with its own 0..k row index; ignore_index gives the
# combined frame unique row labels instead of repeating them once per file.
df = pd.concat(dfs, ignore_index=True)

df

In [None]:
# Bar plot of the 'rougeL' column per dataset, one hue level per "Method"
# label (model/method).
sns.catplot(
    data=df,
    x='metadata/Dataset',
    y='rougeL',
    hue='Method',
    kind='bar',
)

In [None]:
# Bar plot of the 'rouge1' column per reranking model, one hue level per
# method and one facet row per model.
# The aggregation cell above stores the model name under 'metadata/Model'
# (capital M); no cell in this notebook creates a 'metadata/model' column, so
# the facet variable uses the capitalised name.
sns.catplot(
    data=df,
    x='metadata/reranking_model',
    y='rouge1',
    hue='metadata/method',
    row='metadata/Model',
    kind='bar',
)