# ReView/glimpse-ui: alternative_polarity/scideberta/scideberta_full_polarity.py
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from pathlib import Path
import nltk
from tqdm import tqdm
import sys, os.path
from torch.nn import functional as F
nltk.download('punkt')  # NLTK 'punkt' models are needed for sentence splitting
BASE_DIR = Path(__file__).resolve().parent.parent.parent
# Make the repository root importable so the glimpse package can be found.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
from glimpse.glimpse.data_loading.Glimpse_tokenizer import glimpse_tokenizer
# === CONFIGURATION ===
MODEL_DIR = BASE_DIR / "alternative_polarity" / "scideberta" / "scideberta_full_polarity_final_model"
DATA_DIR = BASE_DIR / "glimpse" / "data" / "processed"
OUTPUT_DIR = BASE_DIR / "data" / "polarity_scored"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
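# Expected data layout: DATA_DIR holds one CSV per year named all_reviews_{year}.csv
# with at least an "id" and a "text" column; scored output is written to OUTPUT_DIR
# as polarity_scored_reviews_{year}.csv with columns id, sentence, polarity.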
# === Load model and tokenizer ===
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
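# Note: MODEL_DIR is assumed to hold a fine-tuned binary sequence-classification
# checkpoint saved with save_pretrained() (config, tokenizer files and weights);
# from_pretrained() will fail if any of these are missing.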
# === Tokenize like GLIMPSE ===
# Superseded by the imported glimpse_tokenizer; kept for reference.
# def tokenize_sentences(text: str) -> list:
#     # same tokenization as in the original glimpse code
#     text = text.replace('-----', '\n')
#     sentences = nltk.sent_tokenize(text)
#     sentences = [sentence for sentence in sentences if sentence != ""]
#     return sentences
def predict_polarity(sentences):
    """Score a batch of sentences and return polarity values in [-1, +1]."""
    inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
    # Temperature-scaled softmax softens the class distribution before rescaling
    temperature = 2.7
    probs = F.softmax(logits / temperature, dim=-1)
    # Probability of the positive class (index 1)
    polarity_scores = probs[:, 1]
    # Rescale from [0, 1] to [-1, +1]: 0 -> -1 (very negative), 1 -> +1 (very positive)
    polarity_scores = (polarity_scores * 2) - 1
    return polarity_scores.cpu().tolist()
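# Illustrative usage (the output value below is an example only; the model treats
# class index 1 as the positive class):
#   predict_polarity(["The experiments are thorough and convincing."])
#   -> e.g. [0.42]   # one float per input sentence, in [-1, +1]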
def find_polarity(start_year=2017, end_year=2021):
    for year in range(start_year, end_year + 1):
        print(f"Processing {year}...")
        input_path = DATA_DIR / f"all_reviews_{year}.csv"
        output_path = OUTPUT_DIR / f"polarity_scored_reviews_{year}.csv"
        df = pd.read_csv(input_path)
        all_rows = []
        for _, row in tqdm(df.iterrows(), total=len(df)):
            review_id = row["id"]
            text = row["text"]
            sentences = glimpse_tokenizer(text)
            if not sentences:
                continue
            labels = predict_polarity(sentences)
            for sentence, polarity in zip(sentences, labels):
                all_rows.append({"id": review_id, "sentence": sentence, "polarity": polarity})
        output_df = pd.DataFrame(all_rows)
        output_df.to_csv(output_path, index=False)
        print(f"Saved polarity-scored data to {output_path}")
if __name__ == "__main__":
    find_polarity()