Spaces:

Seppukku
/

nlp_project_gpt_team

Sleeping

App Files Files Community

nlp_project_gpt_team / funcs /nastya_funcs.py

Seppukku

initial commit

8fb2bb2 10 months ago

raw

history blame

5.59 kB

	import time
	import joblib
	import re
	import string
	import pymorphy3
	import torch
	from transformers import BertModel, BertTokenizer
	from torch import nn


	# Hugging Face model id shared by the tokenizer and the encoder loaded below.
	model_name = "cointegrated/rubert-tiny2"
	# Both objects are created at import time, so importing this module triggers the load.
	tokenizer = BertTokenizer.from_pretrained(model_name)

	# Pretrained encoder; MyTinyBERT stores this very object as its `bert` submodule.
	bert_model = BertModel.from_pretrained(model_name)


	class MyTinyBERT(nn.Module):
	    """Classifier built from the module-level ``bert_model`` plus a small head.

	    The encoder's parameters are frozen; only the ``linear`` head
	    (312 -> 256 -> 6) keeps trainable weights.
	    """

	    def __init__(self):
	        super().__init__()
	        # Share the already-loaded encoder and switch off gradients for
	        # every one of its parameters.
	        self.bert = bert_model
	        self.bert.requires_grad_(False)

	        head_layers = [
	            nn.Linear(312, 256),
	            nn.Sigmoid(),
	            nn.Dropout(),
	            nn.Linear(256, 6),
	        ]
	        self.linear = nn.Sequential(*head_layers)

	    def forward(self, input_ids, attention_mask=None):
	        """Return the head's raw scores for each sequence in the batch.

	        Args:
	            input_ids: Token ids forwarded to the encoder.
	            attention_mask: Optional mask forwarded to the encoder unchanged.

	        Returns:
	            The output of ``self.linear`` applied to the normalised hidden
	            state taken at sequence position 0.
	        """
	        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)

	        # Keep only position 0 of every sequence, then scale each vector to
	        # unit length (normalize() defaults: p=2, dim=1).
	        first_token_state = encoded.last_hidden_state[:, 0, :]
	        unit_state = nn.functional.normalize(first_token_state)

	        return self.linear(unit_state)


	# Path of the saved state dict for MyTinyBERT.
	# NOTE(review): "rewievs" is presumably the spelling of the file on disk — rename the file before correcting it here.
	weights_path = "models/clf_rewievs_bert.pt"

	# Build the classifier and restore its weights at import time.
	model = MyTinyBERT()
	# map_location remaps every stored tensor onto the CPU while loading.
	# NOTE(review): torch.load unpickles the file — only load trusted checkpoints; consider weights_only=True if the installed torch supports it.
	model.load_state_dict(torch.load(weights_path, map_location=torch.device('cpu')))
	model.to('cpu')
	# tokenizer = transformers.AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")


	# bert_model = transformers.AutoModel.from_pretrained("cointegrated/rubert-tiny2")
	# weights_path = "./model_weights.pt" # Replace with your .pt file path
	# bert_model.load_state_dict(torch.load('models/clf_rewievs_bert.pt', map_location=torch.device('cpu')))

	# bert_model.to('cpu')

	# Single analyzer instance created once at import and reused by lemmatize().
	morph = pymorphy3.MorphAnalyzer()

	def lemmatize(text):
	    """Replace each whitespace-separated word of ``text`` with its normal form.

	    For every word the first parse returned by the module-level ``morph``
	    analyzer is used. The words are re-joined with single spaces.
	    """
	    return " ".join(
	        morph.parse(token)[0].normal_form for token in text.split()
	    )




	# Serialized classifier and TF-IDF vectorizer used by predict_review(); both are loaded at import time.
	# NOTE(review): joblib.load unpickles its input — these files must come from a trusted source.
	logreg = joblib.load('models/logregmodel_restaurants.pkl')
	vectorizer = joblib.load('models/tfidf_vectorizer_restaurants.pkl')

	# Stop-word set: every whitespace-separated token of the file, deduplicated.
	with open("funcs/stopwords-ru.txt", encoding="utf-8") as file:
	    stop_words = set(file.read().split())


	# Russian display label for each rating class. Only keys 1-5 are defined,
	# so any other predicted value (e.g. 0) has no entry here.
	rating_dict = {
	1: "Отвратительно",
	2: "Плохо",
	3: "Удовлетворительно",
	4: "Хорошо",
	5: "Великолепно",}


	# Character ranges treated as emoji; joined below into a single character class.
	_EMOJI_RANGES = (
	    "\U0001F600-\U0001F64F",  # Emoticons
	    "\U0001F300-\U0001F5FF",  # Symbols & Pictographs
	    "\U0001F680-\U0001F6FF",  # Transport & Map Symbols
	    "\U0001F1E0-\U0001F1FF",  # Flags (iOS)
	    "\U00002700-\U000027BF",  # Dingbats
	    "\U0001F900-\U0001F9FF",  # Supplemental Symbols and Pictographs
	    "\U00002600-\U000026FF",  # Miscellaneous Symbols
	    "\U00002B50-\U00002B55",  # Miscellaneous Symbols and Pictographs
	    "\U0001FA70-\U0001FAFF",  # Symbols and Pictographs Extended-A
	    "\U0001F700-\U0001F77F",  # Alchemical Symbols
	    "\U0001F780-\U0001F7FF",  # Geometric Shapes Extended
	    "\U0001F800-\U0001F8FF",  # Supplemental Arrows-C
	    "\U0001F900-\U0001F9FF",  # Supplemental Symbols and Pictographs
	    "\U0001FA00-\U0001FA6F",  # Chess Symbols
	)
	emoji_pattern = re.compile("[" + "".join(_EMOJI_RANGES) + "]+", flags=re.UNICODE)


	def clean(text, stopwords):
	    """Normalise a review to lowercase Cyrillic words with stop words removed.

	    Args:
	        text: Raw review text.
	        stopwords: Container of words to drop (checked with ``in``).

	    Returns:
	        The surviving words joined by single spaces.

	    Steps, in order: lowercase; replace links, @mentions, #hashtags and
	    digit runs with a space; delete ASCII punctuation; replace tags,
	    guillemets/dashes and every character outside ``а-яё`` and space with
	    a space; strip emoji; drop stop words.
	    """
	    lowered = text.lower()
	    for pattern in (r"http\S+", r"@\w+", r"#\w+", r"\d+"):
	        lowered = re.sub(pattern, " ", lowered)

	    # Punctuation is deleted, not replaced, so "a,b" becomes "ab".
	    stripped = lowered.translate(str.maketrans("", "", string.punctuation))

	    # NOTE(review): "<" and ">" are part of string.punctuation and are
	    # already gone here, so the tag pattern cannot match at this point.
	    for pattern in (r"<.*?>", r"[️«»—]", r"[^а-яё ]"):
	        stripped = re.sub(pattern, " ", stripped)

	    stripped = emoji_pattern.sub(r"", stripped.lower())
	    kept = [word for word in stripped.split() if word not in stopwords]
	    return " ".join(kept)


	def predict_review(review):
	    """Rate a review with the TF-IDF vectorizer and the ``logreg`` classifier.

	    Args:
	        review: Raw review text.

	    Returns:
	        A ``(prediction, rating, elapsed_time)`` tuple: the label predicted
	        by ``logreg``, its text from ``rating_dict`` (or
	        "Ошибка предсказания" when the label has no entry), and the seconds
	        spent. The same three values are also printed.
	    """
	    started = time.time()

	    # Clean, lemmatise, then vectorise the text as a one-document batch.
	    features = vectorizer.transform([lemmatize(clean(review, stop_words))])
	    prediction = logreg.predict(features)[0]

	    # Unknown labels get an error text instead of raising.
	    rating = rating_dict.get(prediction, "Ошибка предсказания")

	    elapsed_time = time.time() - started

	    print(f"Лейбл: {prediction}")
	    print(f"Оценка отзыва: {rating}")
	    print(f"Затраченное время: {elapsed_time:.6f} seconds")
	    return prediction, rating, elapsed_time


	def preprocess_input(text):
	    """Tokenise ``text`` with the module-level tokenizer.

	    Returns the tokenizer output as PyTorch tensors, truncated to at most
	    512 tokens.
	    """
	    return tokenizer(
	        text,
	        padding=True,
	        truncation=True,
	        max_length=512,
	        return_tensors='pt',
	    )


	def predict_bert(text):
	    """Rate a review with the module-level ``model`` (MyTinyBERT).

	    Args:
	        text: Review text; it is passed to the tokenizer as-is, without
	            the cleaning and lemmatisation used by ``predict_review``.

	    Returns:
	        A ``(predicted_class, rating, elapsed_time)`` tuple: the index of
	        the largest model output, its text from ``rating_dict`` (or
	        "Ошибка предсказания" when the index has no entry), and the seconds
	        spent on tokenisation and inference.
	    """
	    start_time = time.time()

	    model.eval()  # inference mode: disables dropout in the head
	    inputs = preprocess_input(text)

	    # The model lives on the CPU, so keep every input tensor there too.
	    inputs = {k: v.to('cpu') for k, v in inputs.items()}

	    with torch.no_grad():
	        outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])

	    # The model returns raw scores directly, so argmax is taken on the tensor itself.
	    predicted_class = outputs.argmax(dim=-1).item()
	    elapsed_time = time.time() - start_time

	    # The head has 6 outputs (indices 0-5) but rating_dict only labels 1-5.
	    # Use the same fallback text as predict_review instead of raising
	    # KeyError when the index has no label.
	    rating = rating_dict.get(predicted_class, "Ошибка предсказания")

	    return predicted_class, rating, elapsed_time