File size: 4,582 Bytes
4977ad5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import os
import torch
import json
import shutil
import re
import traceback
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, default_data_collator, AutoConfig
from log import log
from core import INTENT_MODELS

async def detect_intent(text, project_name):
    """Classify ``text`` with the intent model registered for ``project_name``.

    The project entry is looked up in ``INTENT_MODELS``; the text is tokenized,
    pushed through the model, and the highest-scoring class id is mapped back
    to its intent name through the stored ``label2id`` mapping.

    Args:
        text: Input handed to the project's tokenizer.
        project_name: Key of the project inside ``INTENT_MODELS``.

    Returns:
        A ``(detected_intent, confidence)`` tuple where ``confidence`` is the
        largest softmax probability over the model's logits.

    Raises:
        Exception: If no model is registered for ``project_name``.
        IndexError: If the predicted class id has no entry in ``label2id``.
    """
    project_model = INTENT_MODELS.get(project_name)
    if not project_model:
        raise Exception(f"'{project_name}' için intent modeli yüklenmemiş.")

    tokenizer = project_model["tokenizer"]
    model = project_model["model"]
    label2id = project_model["label2id"]

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # background_training() moves the model to CUDA when available before it
    # registers it, so the encoded tensors must follow the model's device.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        inputs = {
            name: value.to(first_param.device) if isinstance(value, torch.Tensor) else value
            for name, value in inputs.items()
        }

    # A freshly trained model is still in train mode; dropout would make the
    # prediction and its confidence random from call to call.
    if model.training:
        model.eval()

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = model(**inputs).logits

    probabilities = logits.softmax(dim=-1)
    predicted_id = logits.argmax(dim=-1).item()

    detected_intent = [k for k, v in label2id.items() if v == predicted_id][0]
    confidence = probabilities.max().item()

    return detected_intent, confidence

def _collect_training_examples(intents):
    """Flatten intent definitions into parallel text/label lists and a label map."""
    texts, labels, label2id = [], [], {}
    for intent in intents:
        # setdefault keeps ids dense and reuses the id when a name repeats, so
        # every label always has a matching entry in label2id.
        label_id = label2id.setdefault(intent["name"], len(label2id))
        for example in intent["examples"]:
            texts.append(example)
            labels.append(label_id)
    return texts, labels, label2id


def _predict_label_ids(model, input_ids, attention_mask, device, batch_size=32):
    """Return the argmax class id per example, evaluating in eval mode batch by batch."""
    # Trainer.train() leaves the model in train mode; switch dropout off so the
    # report (and the model registered afterwards) is deterministic.
    model.eval()
    predictions = []
    with torch.no_grad():
        for start in range(0, len(input_ids), batch_size):
            stop = start + batch_size
            ids = torch.tensor(input_ids[start:stop], device=device)
            mask = torch.tensor(attention_mask[start:stop], device=device)
            logits = model(input_ids=ids, attention_mask=mask).logits
            predictions.extend(logits.argmax(dim=-1).tolist())
    return predictions


def _log_intent_accuracy(predictions, actuals, label2id, confidence_threshold):
    """Log training-set accuracy per intent and warn about weak or tiny intents."""
    id2label = {label_id: name for name, label_id in label2id.items()}
    # Start every intent at zero so intents without examples are reported too.
    counts = dict.fromkeys(label2id, 0)
    correct = dict.fromkeys(label2id, 0)
    for pred, actual in zip(predictions, actuals):
        intent_name = id2label[actual]
        counts[intent_name] += 1
        if pred == actual:
            correct[intent_name] += 1

    for intent_name, total in counts.items():
        accuracy = correct[intent_name] / total if total else 0.0
        log(f"📊 Intent '{intent_name}' doğruluk: {accuracy:.2f} — {total} örnek")
        if accuracy < confidence_threshold or total < 5:
            log(f"⚠️ Yetersiz performanslı intent: '{intent_name}' — Doğruluk: {accuracy:.2f}, Örnek: {total}")


def background_training(project_name, intents, model_id, output_path, confidence_threshold):
    """Fine-tune an intent classifier for a project and register it in memory.

    Steps: build the training set from ``intents``, load tokenizer/config/model
    for ``model_id``, train for three epochs, log per-intent accuracy on the
    training examples, save model, tokenizer and ``label2id.json`` under
    ``output_path`` and store the trained objects in ``INTENT_MODELS``.

    Any exception is logged and its traceback printed; nothing is re-raised, so
    the function always returns ``None``.

    Args:
        project_name: Key under which the trained model is stored in
            ``INTENT_MODELS``.
        intents: Iterable of mappings with a ``"name"`` and an ``"examples"``
            list of training texts.
        model_id: Identifier passed to the ``from_pretrained`` loaders.
        output_path: Directory for the saved artefacts. An existing directory
            is deleted before training starts.
        confidence_threshold: Minimum per-intent training accuracy; intents
            below it (or with fewer than 5 examples) are logged as weak.
    """
    try:
        log(f"🔧 Intent eğitimi başlatıldı (proje: {project_name})")
        texts, labels, label2id = _collect_training_examples(intents)
        if not texts:
            # Fail before anything is downloaded or the previously saved model
            # directory is wiped.
            raise ValueError("Eğitim için hiç örnek bulunamadı.")

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        config = AutoConfig.from_pretrained(model_id)
        config.problem_type = "single_label_classification"
        config.num_labels = len(label2id)
        model = AutoModelForSequenceClassification.from_pretrained(model_id, config=config)

        # One batched tokenizer call; every example is padded/truncated to 128.
        encoded = tokenizer(texts, truncation=True, padding="max_length", max_length=128)
        tokenized_data = {
            "input_ids": encoded["input_ids"],
            "attention_mask": encoded["attention_mask"],
            "label": labels,
        }

        tokenized = Dataset.from_dict(tokenized_data)
        tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

        if os.path.exists(output_path):
            shutil.rmtree(output_path)
        os.makedirs(output_path, exist_ok=True)

        trainer = Trainer(
            model=model,
            args=TrainingArguments(
                output_path,
                per_device_train_batch_size=4,
                num_train_epochs=3,
                logging_steps=10,
                save_strategy="no",
                report_to=[],
            ),
            train_dataset=tokenized,
            data_collator=default_data_collator,
        )
        trainer.train()

        log("🔧 Başarı raporu üretiliyor...")
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        predictions = _predict_label_ids(
            model,
            tokenized_data["input_ids"],
            tokenized_data["attention_mask"],
            device,
        )
        _log_intent_accuracy(predictions, labels, label2id, confidence_threshold)

        model.save_pretrained(output_path)
        tokenizer.save_pretrained(output_path)
        with open(os.path.join(output_path, "label2id.json"), "w", encoding="utf-8") as f:
            json.dump(label2id, f)

        INTENT_MODELS[project_name] = {
            "model": model,
            "tokenizer": tokenizer,
            "label2id": label2id
        }
        log(f"✅ Intent eğitimi tamamlandı ve '{project_name}' modeli yüklendi.")

    except Exception as e:
        # Background-job boundary: report the failure instead of propagating it.
        log(f"❌ Intent eğitimi hatası: {e}")
        traceback.print_exc()