# ReView/glimpse-ui: alternative_polarity/scideberta/scideberta_full_polarity.py
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from pathlib import Path
import nltk
from tqdm import tqdm
import sys, os.path
from torch.nn import functional as F
nltk.download('punkt')  # NLTK 'punkt' models are needed for sentence splitting
BASE_DIR = Path(__file__).resolve().parent.parent.parent
# Make the repository root importable so the glimpse package can be found.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
from glimpse.glimpse.data_loading.Glimpse_tokenizer import glimpse_tokenizer
# === CONFIGURATION ===
MODEL_DIR = BASE_DIR / "alternative_polarity" / "scideberta" / "scideberta_full_polarity_final_model"
DATA_DIR = BASE_DIR / "glimpse" / "data" / "processed"
OUTPUT_DIR = BASE_DIR / "data" / "polarity_scored"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
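# Expected data layout: DATA_DIR holds one CSV per year named all_reviews_{year}.csv
# with at least an "id" and a "text" column; scored output is written to OUTPUT_DIR
# as polarity_scored_reviews_{year}.csv with columns id, sentence, polarity.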
# === Load model and tokenizer ===
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
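# Note: MODEL_DIR is assumed to hold a fine-tuned binary sequence-classification
# checkpoint saved with save_pretrained() (config, tokenizer files and weights);
# from_pretrained() will fail if any of these are missing.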
# === Tokenize like GLIMPSE ===
# Superseded by the imported glimpse_tokenizer; kept for reference.
# def tokenize_sentences(text: str) -> list:
#     # same tokenization as in the original glimpse code
#     text = text.replace('-----', '\n')
#     sentences = nltk.sent_tokenize(text)
#     sentences = [sentence for sentence in sentences if sentence != ""]
#     return sentences
def predict_polarity(sentences):
    """Score a batch of sentences and return polarity values in [-1, +1]."""
    inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
    # Temperature-scaled softmax softens the class distribution before rescaling
    temperature = 2.7
    probs = F.softmax(logits / temperature, dim=-1)
    # Probability of the positive class (index 1)
    polarity_scores = probs[:, 1]
    # Rescale from [0, 1] to [-1, +1]: 0 -> -1 (very negative), 1 -> +1 (very positive)
    polarity_scores = (polarity_scores * 2) - 1
    return polarity_scores.cpu().tolist()
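# Illustrative usage (the output value below is an example only; the model treats
# class index 1 as the positive class):
#   predict_polarity(["The experiments are thorough and convincing."])
#   -> e.g. [0.42]   # one float per input sentence, in [-1, +1]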
def find_polarity(start_year=2017, end_year=2021):
    for year in range(start_year, end_year + 1):
        print(f"Processing {year}...")
        input_path = DATA_DIR / f"all_reviews_{year}.csv"
        output_path = OUTPUT_DIR / f"polarity_scored_reviews_{year}.csv"
        df = pd.read_csv(input_path)
        all_rows = []
        for _, row in tqdm(df.iterrows(), total=len(df)):
            review_id = row["id"]
            text = row["text"]
            sentences = glimpse_tokenizer(text)
            if not sentences:
                continue
            labels = predict_polarity(sentences)
            for sentence, polarity in zip(sentences, labels):
                all_rows.append({"id": review_id, "sentence": sentence, "polarity": polarity})
        output_df = pd.DataFrame(all_rows)
        output_df.to_csv(output_path, index=False)
        print(f"Saved polarity-scored data to {output_path}")
if __name__ == "__main__":
    find_polarity()