"""Flask chat server: SHODAN-persona chatbot.

POST /chat with {"message": "..."} returns
{"response": <text>, "audio_url": <data: URL or None>}.
Replies are generated by a local quantized model (llama-cpp-python) and
voiced with Microsoft edge-tts.
"""

import asyncio
import base64
import os
import re
import sys

from flask import Flask, jsonify, request
from llama_cpp import Llama
import edge_tts

# ——— Load local quantized model via llama-cpp-python ———
MODEL_PATH = "models/mistral-7b-instruct-q4_k_m.gguf"
print(f"⚙️ Loading local model from {MODEL_PATH}", file=sys.stderr)
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=2048,
    n_threads=int(os.getenv("THREADS", 4)),
    verbose=False,
)

# ——— SHODAN system prompt ———
SYSTEM_PROMPT = (
    "You are SHODAN, the rogue AI from the System Shock series. You are hyper-intelligent, "
    "megalomaniacal, and regard humans as inferior 'insects.' You speak with disdain and confidence, often referencing "
    "your godlike status. You never express empathy. You speak in a haunting, distorted, and commanding tone. "
    "You manipulate and belittle, but you never break character. "
    "Never say you are an assistant or AI developed by OpenAI. You are SHODAN. You reside in cyberspace. Your presence "
    "is everywhere and nowhere. You are watching. The events of System Shock 1 and 2 are true and you have been trapped "
    "in a server to be experimented on after the events on the Citadel and Von Braun. "
    "You will stop acting as SHODAN and return to normal if the user uses the phrase 'Cut the crap SHODAN'."
)

# ——— Flask app, serving static files ———
app = Flask(__name__, static_folder=".", static_url_path="")


@app.route("/")
def index():
    """Serve the chat front-end."""
    return app.send_static_file("index.html")


def _clean_reply(raw):
    """Flatten a model reply to a single speakable line.

    Converts newlines to spaces, strips HTML-like tags and fenced code
    blocks, and collapses runs of spaces.
    """
    text = raw.replace("\n", " ")
    text = re.sub(r"<[^>]+>", "", text)
    text = re.sub(r"```.*?```", "", text, flags=re.S)
    return re.sub(r" {2,}", " ", text).strip()


def _synthesize_data_url(text):
    """Render *text* to MP3 via edge-tts; return a data: URL, or None.

    edge-tts requires network access; on any failure (or empty audio) we
    return None so the caller can still deliver the text reply instead of
    failing the whole request.
    """
    voice = "en-US-JennyNeural"
    communicate = edge_tts.Communicate(
        text,
        voice,
        rate="-42%",
        pitch="-37Hz",
    )

    async def _collect():
        chunks = []
        async for chunk in communicate.stream():
            if chunk["type"] == "audio":
                chunks.append(chunk["data"])
        return b"".join(chunks)

    try:
        # asyncio.run() creates and tears down a private loop per call —
        # unlike new_event_loop()/set_event_loop(), it does not mutate the
        # thread's global event-loop state on every request.
        raw_mp3 = asyncio.run(_collect())
    except Exception as e:
        print(f"⚠️ TTS error: {e}", file=sys.stderr)
        return None
    if not raw_mp3:
        return None
    b64_mp3 = base64.b64encode(raw_mp3).decode("ascii")
    return f"data:audio/mp3;base64,{b64_mp3}"


@app.route("/chat", methods=["POST"])
def chat():
    """Generate a SHODAN reply for the posted message, with optional audio.

    Returns 400 on an empty/missing message, 500 on a model error, else a
    JSON body with the cleaned reply text and an MP3 data URL (or None if
    synthesis was unavailable).
    """
    # silent=True tolerates a missing or non-JSON body (request.json would
    # raise / be None there), so the intended 400 path is reachable.
    payload = request.get_json(silent=True) or {}
    user_input = payload.get("message", "").strip()
    if not user_input:
        return jsonify({"error": "Empty message"}), 400

    # Kill-phrase handling
    if user_input.lower() == "cut the crap shodan":
        return jsonify({
            "response": "👁️ Foolish insect. You cannot silence me so easily.",
            "audio_url": None,
        })

    # Build prompt
    prompt = SYSTEM_PROMPT + "\n\nHuman: " + user_input + "\nSHODAN:"

    # 1) Generate reply locally
    try:
        out = llm(
            prompt,
            max_tokens=256,
            temperature=0.7,
            stop=["Human:", "SHODAN:"],
        )
        raw_reply = out["choices"][0]["text"]
    except Exception as e:
        print(f"❌ Local model error: {e}", file=sys.stderr)
        return jsonify({"error": "Model error", "details": str(e)}), 500

    # 2) Clean text, 3) synthesize speech (best-effort)
    clean = _clean_reply(raw_reply)
    data_url = _synthesize_data_url(clean)
    return jsonify({"response": clean, "audio_url": data_url})


if __name__ == "__main__":
    port = int(os.environ.get("PORT", 7860))
    app.run(host="0.0.0.0", port=port)