Spaces:

UcsTurkey
/

flare

Paused

File size: 4,582 Bytes

4977ad5

import os
import torch
import json
import shutil
import re
import traceback
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, default_data_collator, AutoConfig
from log import log
from core import INTENT_MODELS

async def detect_intent(text, project_name):
    """Classify *text* with the intent model loaded for *project_name*.

    Args:
        text: The utterance to classify (a single input; the prediction is
            reduced to one scalar label id).
        project_name: Key into ``INTENT_MODELS`` selecting the model bundle
            (``"model"``, ``"tokenizer"``, ``"label2id"``).

    Returns:
        A ``(detected_intent, confidence)`` tuple: the intent name whose id
        matches the arg-max logit, and the highest softmax probability.

    Raises:
        Exception: If no model bundle is registered for *project_name*.
        IndexError: If the predicted id has no entry in ``label2id``.
    """
    project_model = INTENT_MODELS.get(project_name)
    if not project_model:
        raise Exception(f"'{project_name}' için intent modeli yüklenmemiş.")

    tokenizer = project_model["tokenizer"]
    model = project_model["model"]
    label2id = project_model["label2id"]

    # The cached model may live on a GPU (training moves it there), so the
    # tokenized inputs must follow it to avoid a device-mismatch error.
    device = next(model.parameters()).device
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Disable dropout and gradient tracking: the model may still be in
    # training mode, and inference must be deterministic and allocation-light.
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_id = logits.argmax(dim=-1).item()
    confidence = logits.softmax(dim=-1).max().item()

    detected_intent = next(
        (name for name, label_id in label2id.items() if label_id == predicted_id),
        None,
    )
    if detected_intent is None:
        # Same exception type the previous `[...][0]` lookup produced.
        raise IndexError(f"No intent registered for predicted label id {predicted_id}")

    return detected_intent, confidence

def _collect_training_examples(intents):
    """Flatten intent definitions into parallel text/label lists and a label map."""
    texts, labels, label2id = [], [], {}
    for intent in intents:
        # setdefault keeps ids contiguous (0..num_labels-1) even if an intent
        # name appears twice; repeated names share one label.
        label_id = label2id.setdefault(intent["name"], len(label2id))
        for example in intent["examples"]:
            texts.append(example)
            labels.append(label_id)
    return texts, labels, label2id


def _predict_label_ids(model, input_ids, attention_mask, device, batch_size=32):
    """Return the arg-max label id for every row, evaluating in mini-batches."""
    predictions = []
    # Switch off dropout so the report reflects the model's real behaviour.
    model.eval()
    with torch.no_grad():
        for start in range(0, len(input_ids), batch_size):
            stop = start + batch_size
            batch_ids = torch.tensor(input_ids[start:stop]).to(device)
            batch_mask = torch.tensor(attention_mask[start:stop]).to(device)
            logits = model(input_ids=batch_ids, attention_mask=batch_mask).logits
            predictions.extend(logits.argmax(dim=-1).tolist())
    return predictions


def _log_accuracy_report(predictions, labels, label2id, confidence_threshold):
    """Log per-intent training accuracy and flag weak or under-sampled intents."""
    id2label = {label_id: name for name, label_id in label2id.items()}
    counts, correct = {}, {}
    for predicted, actual in zip(predictions, labels):
        intent_name = id2label[actual]
        counts[intent_name] = counts.get(intent_name, 0) + 1
        if predicted == actual:
            correct[intent_name] = correct.get(intent_name, 0) + 1

    for intent_name, total in counts.items():
        accuracy = correct.get(intent_name, 0) / total
        log(f"📊 Intent '{intent_name}' doğruluk: {accuracy:.2f} — {total} örnek")
        if accuracy < confidence_threshold or total < 5:
            log(f"⚠️ Yetersiz performanslı intent: '{intent_name}' — Doğruluk: {accuracy:.2f}, Örnek: {total}")


def background_training(project_name, intents, model_id, output_path, confidence_threshold):
    """Fine-tune an intent classifier for a project and register it in memory.

    Steps: build the dataset from *intents*, fine-tune *model_id* for sequence
    classification, log a per-intent accuracy report on the training data,
    save model/tokenizer/``label2id.json`` under *output_path*, and store the
    bundle in ``INTENT_MODELS[project_name]``.

    Args:
        project_name: Key under which the trained bundle is registered.
        intents: Iterable of mappings with ``"name"`` and ``"examples"`` keys.
        model_id: Identifier passed to the ``from_pretrained`` loaders.
        output_path: Directory for the saved artefacts; any existing
            directory at this path is deleted first.
        confidence_threshold: Per-intent accuracy below which a warning is
            logged (a warning is also logged for intents with < 5 examples).

    Returns:
        None. Any exception is caught, logged, and its traceback printed, so
        the function never raises.
    """
    try:
        log(f"🔧 Intent eğitimi başlatıldı (proje: {project_name})")
        texts, labels, label2id = _collect_training_examples(intents)
        if not texts:
            # Fail early with a clear message instead of an obscure
            # tokenizer/trainer error further down.
            raise ValueError("Eğitim için en az bir intent örneği gerekli.")

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        config = AutoConfig.from_pretrained(model_id)
        config.problem_type = "single_label_classification"
        config.num_labels = len(label2id)
        model = AutoModelForSequenceClassification.from_pretrained(model_id, config=config)

        # Fixed-length padding lets default_data_collator stack rows directly.
        encodings = tokenizer(texts, truncation=True, padding="max_length", max_length=128)
        input_ids = encodings["input_ids"]
        attention_mask = encodings["attention_mask"]

        tokenized = Dataset.from_dict({
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "label": labels,
        })
        tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

        if os.path.exists(output_path):
            shutil.rmtree(output_path)
        os.makedirs(output_path, exist_ok=True)

        trainer = Trainer(
            model=model,
            args=TrainingArguments(
                output_path,
                per_device_train_batch_size=4,
                num_train_epochs=3,
                logging_steps=10,
                save_strategy="no",
                report_to=[],
            ),
            train_dataset=tokenized,
            data_collator=default_data_collator,
        )
        trainer.train()

        log("🔧 Başarı raporu üretiliyor...")
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        # Evaluate from the plain Python lists in mini-batches: avoids
        # re-wrapping tensors and keeps memory bounded on large datasets.
        predictions = _predict_label_ids(model, input_ids, attention_mask, device)
        _log_accuracy_report(predictions, labels, label2id, confidence_threshold)

        model.save_pretrained(output_path)
        tokenizer.save_pretrained(output_path)
        with open(os.path.join(output_path, "label2id.json"), "w", encoding="utf-8") as f:
            json.dump(label2id, f)

        # The model is in eval mode here, so it is ready for inference.
        INTENT_MODELS[project_name] = {
            "model": model,
            "tokenizer": tokenizer,
            "label2id": label2id
        }
        log(f"✅ Intent eğitimi tamamlandı ve '{project_name}' modeli yüklendi.")

    except Exception as e:
        # Top-level boundary of a background job: report and swallow so the
        # worker running this function is not taken down.
        log(f"❌ Intent eğitimi hatası: {e}")
        traceback.print_exc()