"""Minimal OpenAI-style chat-completions HTTP API over a local model backend."""

import time

from flask import Flask, request, jsonify

from app import generate_chat_completion

app = Flask(__name__)

# Reported back to clients in every completion response.
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"


@app.route('/v1/chat/completions', methods=['POST'])
def chat_completions():
    """Generate a chat completion for the posted message history.

    Expects a JSON body with:
        messages     -- required non-empty list of role/content dicts
        max_tokens   -- optional int, defaults to 560
        temperature  -- optional float, defaults to 0.8

    Returns an OpenAI-shaped JSON payload with the assistant's reply and
    the wall-clock generation time; 400 on bad input, 500 on backend failure.
    """
    # silent=True yields None (instead of raising Flask's HTML error page)
    # when the body is missing or not valid JSON, so we can answer with a
    # clean JSON 400 of our own.
    data = request.get_json(silent=True)
    if data is None:
        return jsonify({"error": "A valid 'messages' list is required."}), 400

    messages = data.get('messages', [])
    max_tokens = data.get('max_tokens', 560)
    temperature = data.get('temperature', 0.8)

    if not messages or not isinstance(messages, list):
        return jsonify({"error": "A valid 'messages' list is required."}), 400

    try:
        start_time = time.time()
        # Expecting plain role-content dicts (not Gradio tuples)
        result = generate_chat_completion(
            message_history=messages,
            max_tokens=max_tokens,
            temperature=temperature
        )
        # The backend may return either a list of messages or a single
        # message; keep only the assistant's latest one.
        assistant_msg = result[-1] if isinstance(result, list) else result
        elapsed = time.time() - start_time
        return jsonify({
            "model": MODEL_NAME,
            "choices": [{
                "message": {
                    "role": "assistant",
                    "content": assistant_msg
                }
            }],
            "usage": {
                "generation_time": round(elapsed, 2)
            }
        })
    except Exception as e:
        # Top-level API boundary: surface backend failures as a JSON 500
        # rather than an unhandled traceback.
        return jsonify({"error": str(e)}), 500


@app.route('/')
def health_check():
    """Liveness probe for load balancers / container orchestrators."""
    return "LLM API is running", 200


if __name__ == '__main__':
    # Bind on all interfaces so the service is reachable from outside
    # the container.
    app.run(host='0.0.0.0', port=8081)