# Backend.py
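# Requires: flask, flask-cors, torch, transformers
# (e.g. pip install flask flask-cors torch transformers)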
from flask import Flask, request, jsonify
from flask_cors import CORS
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

app = Flask(__name__)
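# Allow cross-origin requests so a separate frontend can call this API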
CORS(app)

# Registry of model keys mapped to Hugging Face model IDs
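# NOTE: only the text-generation entries below can be loaded with
# AutoModelForCausalLM; the speech, image, video, and music models
# would each need their own task-specific loaders and pipelines.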
MODEL_PATHS = {
    "gpt_oss_120b": "openai/gpt-oss-120b",
    "deepseek_v3": "deepseek-ai/DeepSeek-V3.1-Base",
    "gemini_25_pro": "afu4642tD/gemini-2.5-pro",
    "veo3": "sudip1987/Generate_videos_with_Veo3",
    "open_sora": "hpcai-tech/Open-Sora-v2",
    "usp_image": "GD-ML/USP-Image_Generation",
    "text_to_music": "sander-wood/text-to-music",
    "qwen_image": "Qwen/Qwen-Image",
    "qwen_image_diff": "Comfy-Org/Qwen-Image-DiffSynth-ControlNets",
    "coqui_tts": "sk0032/coqui-tts-model",
    "edge_tts": "sysf/Edge-TTS",
    "whisper_large": "openai/whisper-large-v3-turbo",
    "blip2_opt": "Salesforce/blip2-opt-2.7b",
    "mini_gpt4": "Vision-CAIR/MiniGPT-4",
    "glm_45": "zai-org/GLM-4.5",
    "chatglm3": "zai-org/chatglm3-6b",
    "gpt_oss_20b": "openai/gpt-oss-20b",
    "m2m100": "facebook/m2m100_1.2B",
    "tiny_marian": "onnx-community/tiny-random-MarianMTModel",
    "memory_transformer": "Grpp/memory-transformer-ru",
    "rl_memory_agent": "BytedTsinghua-SIA/RL-MemoryAgent-14B",
    "m3_agent": "ByteDance-Seed/M3-Agent-Memorization",
    "text_to_video": "ali-vilab/text-to-video-ms-1.7b"
}

def generate_single_answer(prompt, model_key):
    """Load a model on demand, generate an answer, then free its memory."""
    model_name = MODEL_PATHS[model_key]
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto" if torch.cuda.is_available() else None,
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=200)
    # Decode only the newly generated tokens so the prompt is not echoed back
    answer = tokenizer.decode(
        output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )

    # Release the model so the next one fits in memory
    del model, tokenizer, inputs, output
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return answer

@app.route("/ask", methods=["POST"])
def ask():
    data = request.get_json(silent=True) or {}
    prompt = data.get("prompt", "")
    selected_models = data.get("models", ["gpt_oss_120b", "deepseek_v3", "gemini_25_pro"])

    if not prompt:
        return jsonify({"error": "Missing 'prompt' in request body"}), 400

    # Query each selected model in turn; unknown keys are skipped
    answers = []
    for model_key in selected_models:
        if model_key in MODEL_PATHS:
            try:
                ans = generate_single_answer(prompt, model_key)
                answers.append(ans)
            except Exception as e:
                answers.append(f"[Error from {model_key}: {e}]")

    # Return one merged answer
    final_answer = " | ".join(answers)
    return jsonify({"answer": final_answer})
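
# Example request (a sketch, assuming the server is running locally on port 5000):
#   curl -X POST http://localhost:5000/ask \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Hello", "models": ["gpt_oss_20b", "chatglm3"]}'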

if __name__ == "__main__":
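    # debug=True is for local development only; disable it before exposing this host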
    app.run(host="0.0.0.0", port=5000, debug=True)