Spaces:

TeePoat
/

frantics-bot

Running

App Files Files Community

WonderWaffle4 committed 24 days ago

Commit

59c6d5c

1 Parent(s): 3328eb8

GPT-2 bot with transformers and bot server

Browse files

Files changed (21) hide show

.gitignore +2 -2
main.py → bot_local.py +85 -90
bot_server.py +88 -0
models/seq2seq/attention.py +1 -0
models/seq2seq/chat_dataset.py +3 -2
models/seq2seq/constants.py +4 -0
models/seq2seq/custom_types.py +2 -7
models/seq2seq/model.py +3 -2
models/seq2seq/requirements.txt +13 -0
models/seq2seq/searchers.py +3 -2
models/seq2seq/vocab.py +4 -4
models/transformer/__init__.py +0 -0
models/transformer/constants.py +1 -0
models/transformer/custom_types.py +12 -0
models/transformer/fine_tuner.py +75 -0
models/transformer/requirements.txt +36 -0
models/transformer/telegram_data_extractor.py +98 -0
models/transformer/text_generator.py +72 -0
models/transformer/utils.py +20 -0
requirements.txt +14 -14
server.py +31 -0

.gitignore CHANGED Viewed

@@ -206,5 +206,5 @@ marimo/_static/
 marimo/_lsp/
 __marimo__/
-models/seq2seq/data/train/*
-models/seq2seq/checkpoint/*

 marimo/_lsp/
 __marimo__/
+models/*/data/*
+models/*/checkpoint/*

main.py → bot_local.py RENAMED Viewed

@@ -1,90 +1,85 @@
-import time
-from typing import Final
-import requests
-import re
-from telegram import Update
-from telegram.ext import Application, MessageHandler, filters, ContextTypes
-from typing import Optional
-import random
-import os
-from dotenv import load_dotenv
-from models.seq2seq.model import Seq2SeqChatbot
-import torch
-load_dotenv()
-TOKEN: Final = os.environ.get("TOKEN")
-BOT_USERNAME: Final = os.environ.get("BOT_USERNAME")
-CHAT_ID: Final = int(os.environ.get("CHAT_ID"))
-CHECKPOINT_PATH: Final = "models/seq2seq/checkpoint/150_checkpoint.tar"
-romantiki_gif_id = "CgACAgIAAxkBAAE4zMlojLmMwqrxG5e2rnYS2f9_PZZgVwACL2oAAjbWyUqiyR5II6u6YDYE"
-bezumtsi_gif_id = "CgACAgIAAxkBAAE4zMtojLmiH_CGW5cT7G0QVXHR7D4g6wAC53UAApkBmEmM-VxqunRc6zYE"
-last_gif_sent = 1.0
-gif_sent_cooldown = 180.0
-torch.manual_seed(0)
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-chatbot = Seq2SeqChatbot(500, 8856, 2, 2, 0.1, device)
-chatbot.load_checkpoint(CHECKPOINT_PATH)
-chatbot.eval_mode()
-def handle_response(text: str) -> Optional[str]:
-    response_chance = 0.02
-    if random.random() < response_chance:
-        return chatbot(text)
-    return None
-def edit_response(text: Optional[str]) -> Optional[str]:
-    if text is None:
-        return None
-    text = re.sub(r'\s+([,.!?;])\s+', r'\1 ', text)
-    return text
-async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE):
-    if update.message.chat_id == CHAT_ID:
-        # response: Optional[str] = ""
-        global last_gif_sent
-        if "роман" in update.message.text.lower() and \
-                time.time() - last_gif_sent >= gif_sent_cooldown:
-            await context.bot.send_animation( chat_id=update.message.chat_id, animation=romantiki_gif_id)
-            last_gif_sent = time.time()
-        elif "безу" in update.message.text.lower() and \
-                time.time() - last_gif_sent >= gif_sent_cooldown:
-            await context.bot.send_animation(chat_id=update.message.chat_id, animation=bezumtsi_gif_id)
-            last_gif_sent = time.time()
-        else:
-            text = update.message.text.replace(BOT_USERNAME, '').strip().lower()
-            response = edit_response(handle_response(text))
-            if response:
-                await context.bot.sendMessage(update.message.chat_id, response, reply_to_message_id=update.message.id)
-async def error(update: Update, context: ContextTypes.DEFAULT_TYPE):
-    print(f"{update.message.from_user.username} in {update.message.chat.type} "
-          f"chat caused error \"{context.error}\"\n"
-          f"{update}\"")
-def main() -> None:
-    """Run the bot."""
-    requests.post(f"https://api.telegram.org/bot{TOKEN}/getUpdates?offset=-1")
-    application = Application.builder().token(TOKEN).build()
-    application.add_handler(MessageHandler(filters.TEXT, handle_message))
-    application.add_error_handler(error)
-    application.run_polling(allowed_updates=Update.ALL_TYPES)
-if __name__ == '__main__':
-    print("Running main...")
-    # print(chatbot("test"))
-    main()

+import time
+from typing import Final
+import re
+from telegram import Update
+from telegram.ext import Application, MessageHandler, filters, ContextTypes
+from typing import Optional
+import random
+import os
+from dotenv import load_dotenv
+from models.seq2seq.model import Seq2SeqChatbot
+import torch
+# Read TOKEN / BOT_USERNAME / CHAT_ID from a local .env file into the environment.
+load_dotenv()
+TOKEN: Final = os.environ.get("TOKEN")
+BOT_USERNAME: Final = os.environ.get("BOT_USERNAME")
+# int() raises TypeError at import time when CHAT_ID is not set.
+CHAT_ID: Final = int(os.environ.get("CHAT_ID"))
+CHECKPOINT_PATH: Final = "models/seq2seq/checkpoint/150_checkpoint.tar"
+# Telegram file ids of the two reaction GIFs. Each trigger word has its own GIF;
+# the two ids must differ (they were accidentally identical after the rename to constants).
+ROMANTIKI_GIF_ID: Final = "CgACAgIAAxkBAAE4zMlojLmMwqrxG5e2rnYS2f9_PZZgVwACL2oAAjbWyUqiyR5II6u6YDYE"
+BEZUMTSI_GIF_ID: Final = "CgACAgIAAxkBAAE4zMtojLmiH_CGW5cT7G0QVXHR7D4g6wAC53UAApkBmEmM-VxqunRc6zYE"
+# time.time() timestamp of the last GIF sent; mutated by handle_message().
+last_gif_sent = 1.0
+# Minimum number of seconds between two GIF reactions.
+gif_sent_cooldown = 180.0
+# Probability of answering a message; 1.0 answers every message.
+response_chance =  1.0
+torch.manual_seed(0)
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# NOTE(review): the positional arguments presumably mirror the hyperparameters the
+# checkpoint was trained with - confirm against Seq2SeqChatbot.__init__.
+chatbot = Seq2SeqChatbot(500, 8856, 2, 2, 0.1, device)
+chatbot.load_checkpoint(CHECKPOINT_PATH)
+chatbot.eval_mode()
+def handle_response(text: str) -> Optional[str]:
+    """Return the chatbot's reply to ``text``, or ``None`` when the random draw skips this message."""
+    should_reply = random.random() < response_chance
+    return chatbot(text) if should_reply else None
+def edit_response(text: Optional[str]) -> Optional[str]:
+    """Tidy the spacing around punctuation in a generated reply; ``None`` is passed through."""
+    if text is not None:
+        # "a , b" -> "a, b": punctuation surrounded by whitespace keeps one trailing space only.
+        text = re.sub(r'\s+([,.!?;])\s+', r'\1 ', text)
+    return text
+async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE):
+    """React to a text message from the configured chat.
+    A message containing one of the trigger substrings gets the matching GIF, provided
+    the shared cooldown has elapsed. Any other message (or a trigger during the
+    cooldown) is cleaned up and passed to the chatbot, whose reply - if any - is sent
+    as a reply to the original message.
+    """
+    global last_gif_sent
+    message = update.message
+    # update.message is None for updates such as edited messages; ignore those
+    # instead of failing with AttributeError.
+    if message is None or message.text is None or message.chat_id != CHAT_ID:
+        return
+    lowered = message.text.lower()
+    cooldown_over = time.time() - last_gif_sent >= gif_sent_cooldown
+    if "роман" in lowered and cooldown_over:
+        await context.bot.send_animation(chat_id=message.chat_id, animation=ROMANTIKI_GIF_ID)
+        last_gif_sent = time.time()
+    elif "безу" in lowered and cooldown_over:
+        await context.bot.send_animation(chat_id=message.chat_id, animation=BEZUMTSI_GIF_ID)
+        last_gif_sent = time.time()
+    else:
+        # BOT_USERNAME comes from the environment and may be unset; replacing "" is a no-op.
+        text = message.text.replace(BOT_USERNAME or "", '').strip().lower()
+        response = edit_response(handle_response(text))
+        if response:
+            await context.bot.send_message(message.chat_id, response, reply_to_message_id=message.message_id)
+async def error(update: Update, context: ContextTypes.DEFAULT_TYPE):
+    """Print the failing update and the exception stored in ``context.error``.
+    Every attribute is looked up defensively: the update may be None or may carry no
+    message/sender, and an exception raised here would hide the original error.
+    """
+    message = getattr(update, "message", None)
+    username = getattr(getattr(message, "from_user", None), "username", None)
+    chat_type = getattr(getattr(message, "chat", None), "type", None)
+    print(f"{username} in {chat_type} "
+          f"chat caused error \"{context.error}\"\n"
+          f"{update}\"")
+def main() -> None:
+    """Build the Telegram application, register the handlers and poll for updates."""
+    builder = Application.builder()
+    application = builder.token(TOKEN).build()
+    text_handler = MessageHandler(filters.TEXT, handle_message)
+    application.add_handler(text_handler)
+    application.add_error_handler(error)
+    # drop_pending_updates=True: updates queued before start-up are not processed.
+    application.run_polling(allowed_updates=Update.ALL_TYPES, drop_pending_updates=True)
+if __name__ == '__main__':
+    print("Running main...")
+    main()

bot_server.py ADDED Viewed

	@@ -0,0 +1,88 @@

+import time
+from typing import Final
+import re
+from telegram import Update
+from telegram.ext import Application, MessageHandler, filters, ContextTypes
+from typing import Optional
+import random
+import os
+import requests
+from dotenv import load_dotenv
+import requests
+# Read TOKEN / BOT_USERNAME / CHAT_ID from a local .env file into the environment.
+load_dotenv()
+TOKEN: Final = os.environ.get("TOKEN")
+BOT_USERNAME: Final = os.environ.get("BOT_USERNAME")
+# int() raises TypeError at import time when CHAT_ID is not set.
+CHAT_ID: Final = int(os.environ.get("CHAT_ID"))
+# NOTE(review): not referenced by the code visible in this file - possibly a leftover
+# from bot_local.py; kept in case other code imports it.
+CHECKPOINT_PATH: Final = "models/seq2seq/checkpoint/150_checkpoint.tar"
+# Telegram file ids of the two reaction GIFs. Each trigger word has its own GIF;
+# the two ids must differ (they were accidentally identical).
+ROMANTIKI_GIF_ID: Final = "CgACAgIAAxkBAAE4zMlojLmMwqrxG5e2rnYS2f9_PZZgVwACL2oAAjbWyUqiyR5II6u6YDYE"
+BEZUMTSI_GIF_ID: Final = "CgACAgIAAxkBAAE4zMtojLmiH_CGW5cT7G0QVXHR7D4g6wAC53UAApkBmEmM-VxqunRc6zYE"
+# time.time() timestamp of the last GIF sent; mutated by handle_message().
+last_gif_sent = 1.0
+# Minimum number of seconds between two GIF reactions.
+gif_sent_cooldown = 180.0
+# Probability of answering a message; 1.0 answers every message.
+response_chance =  1.0
+def handle_response(author: str, content: str) -> Optional[str]:
+    """Ask the local generation server for a reply to ``content`` written by ``author``.
+    Returns the generated text, or ``None`` when the random draw skips the message or
+    the request fails (connection error, timeout, HTTP error status, malformed JSON).
+    """
+    if random.random() >= response_chance:
+        return None
+    try:
+        # Without a timeout a hung generation server would block the caller forever.
+        reply = requests.post(
+            "http://localhost:8000/generate",
+            json={"author": author, "content": content + " "},
+            timeout=60,
+        )
+        reply.raise_for_status()
+        return reply.json()["response"]
+    except (requests.RequestException, ValueError, KeyError) as exc:
+        # A failed generation should only skip this reply, not raise into the handler.
+        print(f"Generation request failed: {exc}")
+        return None
+def edit_response(text: Optional[str]) -> Optional[str]:
+    """Post-processing hook for generated replies; currently hands ``text`` back unchanged."""
+    # Punctuation clean-up is disabled for the transformer model:
+    # text = re.sub(r'\s+([,.!?;])\s+', r'\1 ', text)
+    return None if text is None else text
+async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE):
+    """React to a text message from the configured chat.
+    A message containing one of the trigger substrings gets the matching GIF, provided
+    the shared cooldown has elapsed. Any other message (or a trigger during the
+    cooldown) is sent, together with the sender's display name, to the generation
+    server; a non-empty answer is posted as a reply to the original message.
+    """
+    import asyncio
+    global last_gif_sent
+    message = update.message
+    # update.message is None for updates such as edited messages; ignore those
+    # instead of failing with AttributeError.
+    if message is None or message.text is None or message.chat_id != CHAT_ID:
+        return
+    lowered = message.text.lower()
+    cooldown_over = time.time() - last_gif_sent >= gif_sent_cooldown
+    if "роман" in lowered and cooldown_over:
+        await context.bot.send_animation(chat_id=message.chat_id, animation=ROMANTIKI_GIF_ID)
+        last_gif_sent = time.time()
+    elif "безу" in lowered and cooldown_over:
+        await context.bot.send_animation(chat_id=message.chat_id, animation=BEZUMTSI_GIF_ID)
+        last_gif_sent = time.time()
+    else:
+        author = ""
+        sender = message.from_user  # may be None when the message has no user sender
+        first_name = sender.first_name if sender else None
+        last_name = sender.last_name if sender else None
+        if first_name:
+            author += first_name
+        if last_name:
+            author += f" {last_name}"
+        # BOT_USERNAME comes from the environment and may be unset; replacing "" is a no-op.
+        content = message.text.replace(BOT_USERNAME or "", '').strip().lower()
+        # response = edit_response(handle_response(author, content))
+        # handle_response() performs a blocking HTTP request; run it in a worker thread
+        # so the event loop keeps serving other updates while the text is generated.
+        response = await asyncio.to_thread(handle_response, author, content)
+        print(response)
+        if response:
+            await context.bot.send_message(message.chat_id, response, reply_to_message_id=message.message_id)
+async def error(update: Update, context: ContextTypes.DEFAULT_TYPE):
+    """Print the failing update and the exception stored in ``context.error``.
+    Every attribute is looked up defensively: the update may be None or may carry no
+    message/sender, and an exception raised here would hide the original error.
+    """
+    message = getattr(update, "message", None)
+    username = getattr(getattr(message, "from_user", None), "username", None)
+    chat_type = getattr(getattr(message, "chat", None), "type", None)
+    print(f"{username} in {chat_type} "
+          f"chat caused error \"{context.error}\"\n"
+          f"{update}\"")
+def main() -> None:
+    """Build the Telegram application, register the handlers and poll for updates."""
+    builder = Application.builder()
+    application = builder.token(TOKEN).build()
+    text_handler = MessageHandler(filters.TEXT, handle_message)
+    application.add_handler(text_handler)
+    application.add_error_handler(error)
+    # drop_pending_updates=True: updates queued before start-up are not processed.
+    application.run_polling(allowed_updates=Update.ALL_TYPES, drop_pending_updates=True)
+if __name__ == '__main__':
+    print("Running main...")
+    main()

models/seq2seq/attention.py CHANGED Viewed

@@ -3,6 +3,7 @@ import torch.nn.functional as F
 import torch.nn as nn
 from .custom_types import Method
 class LuongAttention(nn.Module):
     def __init__(self, method: Method, hidden_size: int):
         super().__init__()

 import torch.nn as nn
 from .custom_types import Method
 class LuongAttention(nn.Module):
     def __init__(self, method: Method, hidden_size: int):
         super().__init__()

models/seq2seq/chat_dataset.py CHANGED Viewed

@@ -5,10 +5,11 @@ from collections import OrderedDict
 from .vocab import Vocab
 from .custom_types import Message, MessageId, Conversation
 from torch.nn.utils.rnn import pad_sequence
-from .custom_types import Token
 import re
 import json
 class ChatDataset(data.Dataset):
     def __init__(self, path: str, max_message_count: int = None, batch_size=5):
         super().__init__()
@@ -32,7 +33,7 @@ class ChatDataset(data.Dataset):
             batches_X.append(pad_sequence([self.vocab.sentence_indices(conversations[i+j][0] + ["<eos>"]) for j in range(self.batch_size) if i+j < len(conversations)], batch_first=True, padding_value=0))
             batches_y.append(pad_sequence([self.vocab.sentence_indices(conversations[i+j][1] + ["<eos>"]) for j in range(self.batch_size) if i+j < len(conversations)], batch_first=True, padding_value=0))
             lengths.append(torch.tensor([len(conversations[i+j][0]) for j in range(self.batch_size) if i+j < len(conversations)]))
-            mask.append(batches_y[-1] != Token.PAD_TOKEN.value)
         return batches_X, batches_y, lengths, mask
     @classmethod

 from .vocab import Vocab
 from .custom_types import Message, MessageId, Conversation
 from torch.nn.utils.rnn import pad_sequence
+from .constants import PAD_TOKEN
 import re
 import json
 class ChatDataset(data.Dataset):
     def __init__(self, path: str, max_message_count: int = None, batch_size=5):
         super().__init__()
             batches_X.append(pad_sequence([self.vocab.sentence_indices(conversations[i+j][0] + ["<eos>"]) for j in range(self.batch_size) if i+j < len(conversations)], batch_first=True, padding_value=0))
             batches_y.append(pad_sequence([self.vocab.sentence_indices(conversations[i+j][1] + ["<eos>"]) for j in range(self.batch_size) if i+j < len(conversations)], batch_first=True, padding_value=0))
             lengths.append(torch.tensor([len(conversations[i+j][0]) for j in range(self.batch_size) if i+j < len(conversations)]))
+            mask.append(batches_y[-1] != PAD_TOKEN)
         return batches_X, batches_y, lengths, mask
     @classmethod

models/seq2seq/constants.py ADDED Viewed

	@@ -0,0 +1,4 @@

+# Vocabulary indices reserved for the special tokens <pad>, <bos>, <eos> and <unk>
+# (in that order: 0, 1, 2, 3). PAD_TOKEN must stay 0 to match the padding_value
+# used when batches are padded.
+PAD_TOKEN, BOS_TOKEN, EOS_TOKEN, UNK_TOKEN = range(4)

models/seq2seq/custom_types.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from typing import List, TypedDict, NotRequired, Tuple
 from enum import Enum, auto
 MessageId = int
 MessageText = List[str]
 Conversation = Tuple[MessageText]
@@ -14,10 +15,4 @@ class Message(TypedDict):
 class Method(Enum):
     DOT = auto()
     GENERAL = auto()
-    CONCAT = auto()
-class Token(Enum):
-    PAD_TOKEN = 0
-    BOS_TOKEN = 1
-    EOS_TOKEN = 2
-    UNK_TOKEN = 3

 from typing import List, TypedDict, NotRequired, Tuple
 from enum import Enum, auto
 MessageId = int
 MessageText = List[str]
 Conversation = Tuple[MessageText]
 class Method(Enum):
     DOT = auto()
     GENERAL = auto()
+    CONCAT = auto()

models/seq2seq/model.py CHANGED Viewed

@@ -5,7 +5,8 @@ import torch.nn as nn
 import torch.optim as optim
 from .chat_dataset import ChatDataset
 from .attention import LuongAttention
-from .custom_types import Method, Token
 from .vocab import Vocab
 from .searchers import GreedySearch
 import os
@@ -108,7 +109,7 @@ class Seq2SeqChatbot(nn.Module):
                 encoder_outputs, hidden = self.encoder(x_train, x_lengths) # Output shape: (batch_size, max_len_in_batch, hidden_size)
                 hidden = hidden[:self.decoder_num_layers]
                 loss = 0
-                decoder_input = torch.LongTensor([[Token.BOS_TOKEN.value] for _ in range(y_train.shape[0])])
                 decoder_input = decoder_input.to(device)
                 use_teacher_forcing = random.random() < teacher_forcing_ratio
                 if use_teacher_forcing:

 import torch.optim as optim
 from .chat_dataset import ChatDataset
 from .attention import LuongAttention
+from .custom_types import Method
+from .constants import BOS_TOKEN
 from .vocab import Vocab
 from .searchers import GreedySearch
 import os
                 encoder_outputs, hidden = self.encoder(x_train, x_lengths) # Output shape: (batch_size, max_len_in_batch, hidden_size)
                 hidden = hidden[:self.decoder_num_layers]
                 loss = 0
+                decoder_input = torch.LongTensor([[BOS_TOKEN] for _ in range(y_train.shape[0])])
                 decoder_input = decoder_input.to(device)
                 use_teacher_forcing = random.random() < teacher_forcing_ratio
                 if use_teacher_forcing:

models/seq2seq/requirements.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+colorama==0.4.6
+filelock==3.18.0
+fsspec==2025.7.0
+Jinja2==3.1.6
+MarkupSafe==3.0.2
+mpmath==1.3.0
+networkx==3.5
+numpy==2.3.2
+setuptools==80.9.0
+sympy==1.14.0
+torch==2.7.1
+tqdm==4.67.1
+typing_extensions==4.14.1

models/seq2seq/searchers.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import torch
 import torch.nn as nn
-from .custom_types import Token
 class GreedySearch(nn.Module):
     def __init__(self, encoder, decoder, embedding, device):
@@ -13,7 +14,7 @@ class GreedySearch(nn.Module):
     def forward(self, x, input_length, max_length):
         encoder_outputs, hidden = self.encoder(x, input_length)
         decoder_hidden = hidden[:self.decoder.num_layers]
-        decoder_input = torch.ones(1, 1, device=self.device, dtype=torch.long) * Token.BOS_TOKEN.value
         all_tokens = torch.zeros([0], device=self.device, dtype=torch.long)
         all_scores = torch.zeros([0], device=self.device)

 import torch
 import torch.nn as nn
+from .constants import BOS_TOKEN
 class GreedySearch(nn.Module):
     def __init__(self, encoder, decoder, embedding, device):
     def forward(self, x, input_length, max_length):
         encoder_outputs, hidden = self.encoder(x, input_length)
         decoder_hidden = hidden[:self.decoder.num_layers]
+        decoder_input = torch.ones(1, 1, device=self.device, dtype=torch.long) * BOS_TOKEN
         all_tokens = torch.zeros([0], device=self.device, dtype=torch.long)
         all_scores = torch.zeros([0], device=self.device)

models/seq2seq/vocab.py CHANGED Viewed

@@ -1,14 +1,14 @@
 import torch
 import torch.nn as nn
 from typing import List, Dict
-from .custom_types import Token
 class Vocab(nn.Module):
     def __init__(self, messages: List[Dict]):
         super().__init__()
-        self.word2index: Dict[str, int] = {"<pad>": Token.PAD_TOKEN.value, "<bos>": Token.BOS_TOKEN.value, "<eos>": Token.EOS_TOKEN.value, "<unk>": Token.UNK_TOKEN.value}
-        self.index2word: Dict[int, str] = {Token.PAD_TOKEN.value: "<pad>", Token.BOS_TOKEN.value: "<bos>", Token.EOS_TOKEN.value: "<eos>", Token.UNK_TOKEN.value: "<unk>"}
         self.word_count: Dict[str, int] = dict()
         self.size = 4
@@ -33,7 +33,7 @@ class Vocab(nn.Module):
     def sentence_indices(self, sentence: List[str]) -> torch.LongTensor:
         indices = torch.LongTensor(len(sentence))
         for i, word in enumerate(sentence):
-            indices[i] = self.word2index[word] if word in self.word2index else Token.UNK_TOKEN.value
         return indices
     def forward(self, indices: torch.LongTensor):

 import torch
 import torch.nn as nn
 from typing import List, Dict
+from .constants import PAD_TOKEN, BOS_TOKEN, EOS_TOKEN, UNK_TOKEN
 class Vocab(nn.Module):
     def __init__(self, messages: List[Dict]):
         super().__init__()
+        self.word2index: Dict[str, int] = {"<pad>": PAD_TOKEN, "<bos>": BOS_TOKEN, "<eos>": EOS_TOKEN, "<unk>": UNK_TOKEN}
+        self.index2word: Dict[int, str] = {PAD_TOKEN: "<pad>", BOS_TOKEN: "<bos>", EOS_TOKEN: "<eos>", UNK_TOKEN: "<unk>"}
         self.word_count: Dict[str, int] = dict()
         self.size = 4
     def sentence_indices(self, sentence: List[str]) -> torch.LongTensor:
         indices = torch.LongTensor(len(sentence))
         for i, word in enumerate(sentence):
+            indices[i] = self.word2index[word] if word in self.word2index else UNK_TOKEN
         return indices
     def forward(self, indices: torch.LongTensor):

models/transformer/__init__.py ADDED Viewed

File without changes

models/transformer/constants.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ CHECKPOINT_PATH = "models/transformer/checkpoint/"

models/transformer/custom_types.py ADDED Viewed

	@@ -0,0 +1,12 @@

+from typing import List, NotRequired, Tuple, TypedDict, Union
+# Identifier of a message, taken from the "id" field of the chat export.
+MessageId = int
+# Tokenised message text; the tuple form exists so that messages can be hashed.
+MessageText = Union[List[str], Tuple[str]]
+# NOTE(review): declared as a 1-tuple, but the extractor builds
+# (question, answer, author) triples - confirm the intended shape.
+Conversation = Tuple[MessageText]
+# One normalised chat message as produced by TelegramDataExtractor.load_messages_from_json.
+# NOTE(review): the extractor also stores a "from" key (normalised author name) that
+# is not declared here; `from` is a keyword, so declaring it would require the
+# functional TypedDict syntax.
+class Message(TypedDict):
+    id: MessageId
+    text: MessageText
+    # Id of the message this one replies to; absent for non-replies.
+    reply_to_id: NotRequired[int]

models/transformer/fine_tuner.py ADDED Viewed

	@@ -0,0 +1,75 @@

+from pathlib import Path
+from .utils import modified_tokenizer
+from .telegram_data_extractor import TelegramDataExtractor
+from transformers import GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
+from datasets import load_dataset
+from .constants import CHECKPOINT_PATH
+class FineTuner:
+    """Fine-tune a GPT-2 style causal language model on an exported Telegram chat."""
+    def __init__(self,
+                 model_name="ai-forever/rugpt3small_based_on_gpt2",
+                 cache_dir="model_cache",
+                 data_path=CHECKPOINT_PATH):
+        """
+        Load the tokenizer (with the project's extra tokens) and the base model.
+        - model_name: Hugging Face model id or local path of the base model.
+        - cache_dir: download cache directory, relative to data_path.
+        - data_path: directory that receives the cache, the dataset, logs and checkpoints.
+        """
+        self.data_path = Path(data_path)
+        # Initialise the tokenizer and the model
+        self.tokenizer = modified_tokenizer(model_name, cache_dir, self.data_path)
+        self.model = GPT2LMHeadModel.from_pretrained(model_name, cache_dir=str(self.data_path / cache_dir))
+        # modified_tokenizer() registers extra tokens; grow the embedding matrix when the
+        # vocabulary no longer fits so the new token ids are valid indices. Never shrink.
+        if len(self.tokenizer) > self.model.get_input_embeddings().num_embeddings:
+            self.model.resize_token_embeddings(len(self.tokenizer))
+    def prepare_data(self, messages_path="/kaggle/input/chat-history/chat_history_small.json"):
+        """
+        Build the training file from a Telegram JSON export and return its path.
+        - messages_path: location of the export; defaults to the Kaggle input path
+          that used to be hard-coded.
+        """
+        # The dataset file is written into data_path, so make sure it exists.
+        self.data_path.mkdir(parents=True, exist_ok=True)
+        messages = TelegramDataExtractor.load_messages_from_json(messages_path)
+        dataset_path = TelegramDataExtractor.conversations_from_messages(self.data_path, self.tokenizer, messages)
+        return dataset_path
+    def fine_tune(self,
+                  dataset_path,
+                  output_name='fine_tuned_model',
+                  num_train_epochs=10,
+                  per_device_train_batch_size=8,
+                  learning_rate=5e-5,
+                  save_steps=10_000):
+        """
+        Fine-tune the model on the given dataset and save model and tokenizer.
+        - dataset_path: text file with one training example per line (the value
+          returned by prepare_data()); a falsy value falls back to
+          "train_dataset.txt" in the working directory, the previous behaviour.
+        - output_name: sub-directory of data_path for checkpoints and the final model.
+        - the remaining arguments are forwarded to TrainingArguments.
+        """
+        # Previously the argument was ignored and the file name was hard-coded.
+        data_file = str(dataset_path) if dataset_path else "train_dataset.txt"
+        dataset = load_dataset("text", data_files={"train": data_file})
+        def preprocess(example):
+            # Tokenize while preserving structure
+            return self.tokenizer(example["text"], truncation=True, max_length=300)
+        train_dataset = dataset.map(preprocess, batched=True)["train"]
+        # mlm=False selects the causal (next-token) language-modelling objective.
+        data_collator = DataCollatorForLanguageModeling(
+            tokenizer=self.tokenizer, mlm=False
+        )
+        training_args = TrainingArguments(
+            output_dir=str(self.data_path / output_name),
+            overwrite_output_dir=True,
+            num_train_epochs=num_train_epochs,
+            per_device_train_batch_size=per_device_train_batch_size,
+            # fp16=True,
+            # gradient_accumulation_steps=2,
+            save_steps=save_steps,
+            learning_rate=learning_rate,
+            torch_compile=True,
+            save_total_limit=2,
+            logging_dir=str(self.data_path / 'logs'),
+            report_to="none"
+        )
+        trainer = Trainer(
+            model=self.model,
+            args=training_args,
+            data_collator=data_collator,
+            train_dataset=train_dataset,
+        )
+        trainer.train()
+        # Save the fine-tuned model and the tokenizer
+        self.model.save_pretrained(str(self.data_path / output_name))
+        self.tokenizer.save_pretrained(str(self.data_path / output_name))

models/transformer/requirements.txt ADDED Viewed

	@@ -0,0 +1,36 @@

+aiohappyeyeballs==2.6.1
+aiohttp==3.12.15
+aiosignal==1.4.0
+attrs==25.3.0
+certifi==2025.8.3
+charset-normalizer==3.4.2
+colorama==0.4.6
+datasets==4.0.0
+dill==0.3.8
+filelock==3.18.0
+frozenlist==1.7.0
+fsspec==2025.3.0
+huggingface-hub==0.34.3
+idna==3.10
+multidict==6.6.3
+multiprocess==0.70.16
+numpy==2.3.2
+packaging==25.0
+pandas==2.3.1
+propcache==0.3.2
+pyarrow==21.0.0
+python-dateutil==2.9.0.post0
+pytz==2025.2
+PyYAML==6.0.2
+regex==2025.7.34
+requests==2.32.4
+safetensors==0.5.3
+six==1.17.0
+tokenizers==0.21.4
+tqdm==4.67.1
+transformers==4.54.1
+typing_extensions==4.14.1
+tzdata==2025.2
+urllib3==2.5.0
+xxhash==3.5.0
+yarl==1.20.1

models/transformer/telegram_data_extractor.py ADDED Viewed

	@@ -0,0 +1,98 @@

+from pathlib import Path
+from typing import Dict, OrderedDict, Tuple, Union
+from .custom_types import MessageId, Message, Conversation, MessageText
+from typing import List
+import re
+import json
+class TelegramDataExtractor:
+    """Turn a Telegram JSON chat export into a question/answer training file."""
+    @classmethod
+    def load_messages_from_json(cls, path: str, max_message_count: Union[int, None] = None) -> OrderedDict[MessageId, Message]:
+        """
+        Read the export at ``path`` and return its normalised messages keyed by id.
+        Entries whose "type" is not "message" and messages that are empty after
+        normalisation are skipped. ``max_message_count`` limits how many export
+        entries are scanned (skipped entries count as well); None or 0 means no limit.
+        """
+        messages: OrderedDict[MessageId, Message] = OrderedDict()
+        with open(path, "r", encoding="utf-8") as file:
+            chat_json = json.load(file)
+            for i, message in enumerate(chat_json["messages"]):
+                if max_message_count and i == max_message_count:
+                    break
+                if message["type"] != "message":
+                    continue
+                new_message = {
+                    "from": cls.normalize_username(message["from"]),
+                    "id": message["id"],
+                    "text": cls.normalize(message["text"])
+                }
+                if not new_message["text"]: # Check for empty message
+                    continue
+                if "reply_to_message_id" in message:
+                    new_message["reply_to_id"] = message["reply_to_message_id"]
+                messages[new_message["id"]] = new_message
+        return messages
+    @staticmethod
+    def conversations_from_messages(save_to: Path, tokenizer, messages: OrderedDict[MessageId, Message]) -> Path:
+        """
+        Pair every message with the message it answers and write the pairs to
+        ``save_to / "train_dataset.txt"``, one per line, in the form
+        ``<user> AUTHOR <says> QUESTION EOS <response> ANSWER EOS``.
+        The question is the replied-to message when it is known, otherwise the
+        previous message; AUTHOR is the author of the question. Returns the path
+        of the written file.
+        """
+        _MAX_MESSAGE_LEN = 150
+        _MAX_QA_LEN_DIFF = 20
+        def remove_duplicates_keep_order(lst: List[Conversation]) -> List[Conversation]:
+            lst = list(dict.fromkeys(lst)) # Remove duplicates and keep order
+            return [(list(x[0]), list(x[1]), x[2]) for x in lst] # Tuples are only needed for hashability
+        def remove_answers_with_only_special_symbols(lst: List[Conversation]) -> List[Conversation]:
+            return [i for i in lst if re.findall(r"[а-я]", " ".join(i[1]))]
+        def remove_long_qa(lst: List[Conversation]) -> List[Conversation]:
+            return [i for i in lst if len(i[0]) <= _MAX_MESSAGE_LEN and len(i[1]) <= _MAX_MESSAGE_LEN]
+        def remove_unbalanced_qa(lst: List[Conversation]) -> List[Conversation]:
+            return [i for i in lst if abs(len(i[0]) - len(i[1])) <= _MAX_QA_LEN_DIFF]
+        def normalize_conversations(lst: List[Conversation]) -> List[Conversation]:
+            lst = remove_duplicates_keep_order(lst)
+            lst = remove_answers_with_only_special_symbols(lst)
+            lst = remove_long_qa(lst)
+            lst = remove_unbalanced_qa(lst)
+            return lst
+        conversations: List[Conversation] = []
+        # Maps a question text to the index of its entry in `conversations`.
+        questions: Dict[MessageText, int] = dict()
+        messages_values = list(messages.values()) # TODO: try changing this cast to something more applicable
+        for i in range(len(messages) - 1): # There's no answer for last message so add -1
+            try: # Message is answer for message with `id` of `reply_to_id`
+                prev_message = messages[messages_values[i+1]["reply_to_id"]]
+            except KeyError:
+                # Not a reply, or the replied-to message was not loaded: use the preceding message.
+                prev_message = messages_values[i]
+            qa = (prev_message["text"], messages_values[i+1]["text"], prev_message["from"])
+            if qa[0] in questions: # If there are multiple answers for same message, choose the longest one
+                if len(conversations[questions[qa[0]]][1]) < len(qa[1]) and abs(len(conversations[questions[qa[0]]][1]) - len(qa[1])) <= _MAX_QA_LEN_DIFF:
+                    conversations[questions[qa[0]]] = (qa[0], qa[1], qa[2])
+                continue
+            else:
+                questions[qa[0]] = len(conversations)
+            conversations.append(qa)
+        conversations = normalize_conversations(conversations)
+        output_path = save_to / "train_dataset.txt"
+        with open(output_path, "w", encoding="utf-8") as file:
+            for conversation in conversations:
+                line = "<user> " + conversation[2] + " <says> " + " ".join(conversation[0]) + f" {tokenizer.eos_token} <response> " + " ".join(conversation[1]) + f" {tokenizer.eos_token}" + "\n"
+                file.write(line)
+        return output_path
+    @staticmethod
+    def normalize(text: Union[str, List]) -> Tuple[str, ...]:
+        """
+        Lower-case and clean ``text`` and return it as a tuple of tokens.
+        When ``text`` is a list, only its string items are kept and joined with spaces.
+        """
+        if isinstance(text, List):
+            text = " ".join([word for word in text if isinstance(word, str)])
+        text = text.lower().strip()
+        text = re.sub(r"[^а-яё.!?:\d]+", r" ", text) # Leave only russian and special characters
+        text = re.sub(r'\.(\s*\.)+', '... ', text) # Replace any sequence of 2+ dots with '...'
+        text = re.sub(r'([?!])(\s*\1)+', r'\1 ', text) # Collapse repeating ? or !
+        text = re.sub(r"([!?]|\.+)", r"\1 ", text) # Separate special symbols by whitespaces
+        text = re.sub(r"ё", r"е", text)
+        text = re.sub(r"(.*[ауспэиычвекьхъз]{6,}.*|\b[апх][аеписх]{2,3}\b|\b[ах]{2,}\b)", r" <laugh> ", text) # Laugh token for strings such as `ахах` etc.
+        text = re.sub(r"(<laugh>)(\s*\1)+", r" <laugh> ", text) # Collapse repeating <laugh> tokens
+        text = re.sub(r"\s+", r" ", text).strip() # Leave only one space between each word
+        return tuple(text.split())
+    @staticmethod
+    def normalize_username(text: str) -> str:
+        """
+        Lower-case a sender name and keep only Cyrillic/Latin letters and whitespace.
+        A missing (None or empty) name yields "" instead of raising AttributeError.
+        """
+        if not text:
+            return ""
+        text = text.lower()
+        text = re.sub(r"[^а-яa-z\s]+", "", text).strip()
+        return text

models/transformer/text_generator.py ADDED Viewed

	@@ -0,0 +1,72 @@

+from transformers import GPT2LMHeadModel
+from pathlib import Path
+from .utils import modified_tokenizer
+from .constants import CHECKPOINT_PATH
+class TextGenerator:
+    """Generate chat replies with a fine-tuned GPT-2 checkpoint."""
+    def __init__(self, model_name='fine_tuned_model', data_path=CHECKPOINT_PATH):
+        """
+        Initialise the model and the tokenizer.
+        Both are loaded from ``data_path / model_name``.
+        """
+        model_path = Path(data_path) / model_name
+        self.tokenizer = modified_tokenizer(model_path, None, data_path)
+        self.model = GPT2LMHeadModel.from_pretrained(str(model_path), device_map="auto")
+        self.model.eval()
+    def generate_text(self,
+                    author: str,
+                    input_str: str,
+                    max_length=100,
+                    num_return_sequences=1,
+                    temperature=1.0,
+                    top_k=0,
+                    top_p=1.0,
+                    do_sample=False):
+        """
+        Generate text for a prompt built from the author and the input message.
+        Parameters:
+        - author: Name placed after the <user> tag of the prompt.
+        - input_str: Message text placed after the <says> tag of the prompt.
+        - max_length: Maximum length of the generated text (added on top of the prompt length).
+        - num_return_sequences: Number of sequences to return.
+        - temperature: Controls the diversity of the output.
+        - top_k: If greater than 0, restricts sampling to the k most probable words.
+        - top_p: If less than 1.0, nucleus sampling is applied.
+        - do_sample: If True, enables random sampling for more diversity.
+        Returns a dict with "full_texts" (prompt + continuation) and
+        "generated_texts" (continuation only), one entry per returned sequence.
+        """
+        # Build the prompt
+        prompt_text = f"<user> {author} <says> {input_str} {self.tokenizer.eos_token} <response>"
+        print(prompt_text)
+        # Encode the text into the format expected by the model
+        encoded_input = self.tokenizer.encode(prompt_text, return_tensors='pt')
+        # device_map="auto" may have placed the model on an accelerator; the input ids
+        # must live on the same device as the model (no-op when both are on the CPU).
+        encoded_input = encoded_input.to(self.model.device)
+        # Generate the texts
+        outputs = self.model.generate(
+            encoded_input,
+            max_length=max_length + len(encoded_input[0]),
+            num_return_sequences=num_return_sequences,
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+            do_sample=do_sample,
+            no_repeat_ngram_size=2
+        )
+        # Decode the results
+        all_texts = [self.tokenizer.decode(output, skip_special_tokens=False) for output in outputs]
+        # Strip the prompt from the texts (by the character length of the decoded prompt)
+        prompt_length = len(self.tokenizer.decode(encoded_input[0], skip_special_tokens=False))
+        trimmed_texts = [text[prompt_length:] for text in all_texts]
+        # Return the results as a dictionary
+        return {
+            "full_texts": all_texts,
+            "generated_texts": trimmed_texts
+        }
+if __name__ == "__main__":
+    print("OK")

models/transformer/utils.py ADDED Viewed

	@@ -0,0 +1,20 @@

+from transformers import GPT2Tokenizer
+from pathlib import Path
+from .constants import CHECKPOINT_PATH
+def modified_tokenizer(model_name="ai-forever/rugpt3small_based_on_gpt2", cache_dir="model_cache", data_path=Path(CHECKPOINT_PATH)):
+    """
+    Load a GPT-2 tokenizer and register the project's extra tokens.
+    - model_name: Hugging Face model id or local path of the tokenizer.
+    - cache_dir: download cache directory relative to data_path; a falsy value
+      uses the library's default cache.
+    - data_path: base directory for the cache; a str or a Path.
+    "<user>", "<says>" and "<response>" are added as special tokens, "<laugh>" as a
+    regular token. Returns the tokenizer.
+    """
+    if cache_dir:
+        # Path() lets callers pass a plain string; `str / str` would raise TypeError.
+        tokenizer = GPT2Tokenizer.from_pretrained(model_name, cache_dir=str(Path(data_path) / cache_dir))
+    else:
+        tokenizer = GPT2Tokenizer.from_pretrained(model_name)
+    special_tokens_dict = {
+        "additional_special_tokens": [
+            "<user>",
+            "<says>",
+            "<response>"
+        ]
+    }
+    tokenizer.add_special_tokens(special_tokens_dict)
+    tokenizer.add_tokens(["<laugh>"])
+    return tokenizer

requirements.txt CHANGED Viewed

@@ -1,22 +1,22 @@
-anyio==4.9.0
-certifi==2025.7.14
-colorama==0.4.6
-filelock==3.18.0
-fsspec==2025.7.0
 h11==0.16.0
 httpcore==1.0.9
 httpx==0.28.1
 idna==3.10
-Jinja2==3.1.6
-MarkupSafe==3.0.2
-mpmath==1.3.0
-networkx==3.5
-numpy==2.3.1
 python-dotenv==1.1.1
 python-telegram-bot==22.3
-setuptools==80.9.0
 sniffio==1.3.1
-sympy==1.14.0
-torch==2.7.1
-tqdm==4.67.1
 typing_extensions==4.14.1

+anyio==4.10.0
+certifi==2025.8.3
 h11==0.16.0
 httpcore==1.0.9
 httpx==0.28.1
 idna==3.10
 python-dotenv==1.1.1
 python-telegram-bot==22.3
 sniffio==1.3.1
 typing_extensions==4.14.1
+typing-inspection==0.4.1
+annotated-types==0.7.0
+fastapi==0.116.1
+pydantic==2.11.7
+pydantic_core==2.33.2
+starlette==0.47.2
+charset-normalizer==3.4.2
+requests==2.32.4
+urllib3==2.5.0
+click==8.2.1
+colorama==0.4.6
+uvicorn==0.35.0

server.py ADDED Viewed

	@@ -0,0 +1,31 @@

+from fastapi import FastAPI
+from pydantic import BaseModel
+from models.transformer.text_generator import TextGenerator
+# Web application object; the route below is registered on it.
+app = FastAPI()
+# Created once at import time, so the checkpoint is loaded a single time and
+# shared by every request.
+generator = TextGenerator(model_name='fine_tuned_model_gpt_2')
+# Request body of POST /generate.
+class Message(BaseModel):
+    # Display name of the message author; inserted into the prompt.
+    author: str
+    # Text of the message to answer.
+    content: str
+@app.post("/generate")
+def generate_response(message: Message):
+    """Generate one reply to ``message`` and return it as ``{"response": text}``.
+    The reply is cut at the first "</s>" marker; when the marker is absent the
+    whole generated text is returned.
+    """
+    response = generator.generate_text(
+        author=message.author,
+        input_str=message.content,
+        max_length=100,
+        num_return_sequences=1,
+        do_sample=True,
+        temperature=0.8,  # Slightly lower the confidence
+        top_k=100,         # Limit sampling to the top k words
+        top_p=0.95        # Narrow the nucleus of the distribution
+    )["generated_texts"][0]
+    # str.find() returns -1 when "</s>" is missing, and slicing with [:-1] would
+    # silently drop the last character; partition() keeps the full text instead.
+    response = response.partition("</s>")[0]
+    return { "response": response }