import os
import torch
import json
import shutil
import traceback

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    default_data_collator,
    AutoConfig,
)

from log import log
from core import INTENT_MODELS


async def detect_intent(text, project_name):
    """Classify `text` with the intent model loaded for `project_name`.

    Returns an `(intent_name, confidence)` tuple.
    """
    project_model = INTENT_MODELS.get(project_name)
    if not project_model:
        raise Exception(f"No intent model has been loaded for project '{project_name}'.")
    tokenizer = project_model["tokenizer"]
    model = project_model["model"]
    label2id = project_model["label2id"]

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Move the inputs to whatever device the model lives on (CPU or GPU);
    # otherwise inference fails for a model that was trained on GPU.
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    model.eval()
    with torch.no_grad():  # inference only; no gradient bookkeeping needed
        outputs = model(**inputs)
    predicted_id = outputs.logits.argmax(dim=-1).item()
    # Reverse-map the predicted class id back to its intent name.
    detected_intent = next(k for k, v in label2id.items() if v == predicted_id)
    confidence = outputs.logits.softmax(dim=-1).max().item()
    return detected_intent, confidence


def background_training(project_name, intents, model_id, output_path, confidence_threshold):
    try:
        log(f"🔧 Intent training started (project: {project_name})")

        # Flatten the intent definitions into parallel text/label lists and
        # assign each intent name an integer class id.
        texts, labels, label2id = [], [], {}
        for idx, intent in enumerate(intents):
            label2id[intent["name"]] = idx
            for ex in intent["examples"]:
                texts.append(ex)
                labels.append(idx)

        dataset = Dataset.from_dict({"text": texts, "label": labels})

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        config = AutoConfig.from_pretrained(model_id)
        config.problem_type = "single_label_classification"
        config.num_labels = len(label2id)
        model = AutoModelForSequenceClassification.from_pretrained(model_id, config=config)

        # Tokenize every example to a fixed length of 128 tokens.
        tokenized_data = {"input_ids": [], "attention_mask": [], "label": []}
        for row in dataset:
            out = tokenizer(row["text"], truncation=True, padding="max_length", max_length=128)
            tokenized_data["input_ids"].append(out["input_ids"])
            tokenized_data["attention_mask"].append(out["attention_mask"])
            tokenized_data["label"].append(row["label"])

        tokenized = Dataset.from_dict(tokenized_data)
        tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

        # Start from a clean output directory on every run.
        if os.path.exists(output_path):
            shutil.rmtree(output_path)
        os.makedirs(output_path, exist_ok=True)

        trainer = Trainer(
            model=model,
            args=TrainingArguments(
                output_path,
                per_device_train_batch_size=4,
                num_train_epochs=3,
                logging_steps=10,
                save_strategy="no",
                report_to=[],
            ),
            train_dataset=tokenized,
            data_collator=default_data_collator,
        )
        trainer.train()

        log("🔧 Generating accuracy report...")
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        model.eval()

        # Note: this re-scores the *training* set in a single forward pass, so
        # the accuracy is optimistic and very large datasets may need batching.
        input_ids_tensor = tokenized["input_ids"].to(device)
        attention_mask_tensor = tokenized["attention_mask"].to(device)
        with torch.no_grad():
            outputs = model(input_ids=input_ids_tensor, attention_mask=attention_mask_tensor)
        predictions = outputs.logits.argmax(dim=-1).tolist()
        actuals = tokenized["label"].tolist()

        # Per-intent accuracy bookkeeping.
        id2label = {v: k for k, v in label2id.items()}
        counts, correct = {}, {}
        for pred, actual in zip(predictions, actuals):
            intent_name = id2label[actual]
            counts[intent_name] = counts.get(intent_name, 0) + 1
            if pred == actual:
                correct[intent_name] = correct.get(intent_name, 0) + 1

        for intent_name, total in counts.items():
            accuracy = correct.get(intent_name, 0) / total
            log(f"📊 Intent '{intent_name}' accuracy: {accuracy:.2f} ({total} examples)")
            if accuracy < confidence_threshold or total < 5:
                log(f"⚠️ Underperforming intent: '{intent_name}' (accuracy: {accuracy:.2f}, examples: {total})")

        # Persist the model, tokenizer, and label mapping, then hot-swap the
        # in-memory registry so detect_intent picks up the new model.
        model.save_pretrained(output_path)
        tokenizer.save_pretrained(output_path)
        with open(os.path.join(output_path, "label2id.json"), "w") as f:
            json.dump(label2id, f)

        INTENT_MODELS[project_name] = {
            "model": model,
            "tokenizer": tokenizer,
            "label2id": label2id,
        }
        log(f"✅ Intent training finished; model for '{project_name}' is loaded.")
    except Exception as e:
        log(f"❌ Intent training error: {e}")
        traceback.print_exc()