rkonan committed
Commit e7a5765 · 1 Parent(s): 9bc7341
app_ollama_v1 copy 2.py ADDED
@@ -0,0 +1,160 @@
+
+ import os
+ import logging
+ import streamlit as st
+
+ from huggingface_hub import hf_hub_download
+
+ # ✅ New RAG engine (Ollama)
+ from rag_model_ollama_v1 import RAGEngine
+
+ # --- Config & logs ---
+ os.environ.setdefault("NLTK_DATA", "/home/appuser/nltk_data")
+
+ logger = logging.getLogger("Streamlit")
+ logger.setLevel(logging.INFO)
+ handler = logging.StreamHandler()
+ formatter = logging.Formatter("[%(asctime)s] %(levelname)s - %(message)s")
+ handler.setFormatter(formatter)
+ if not logger.handlers:
+     logger.addHandler(handler)
+
+ st.set_page_config(page_title="Chatbot RAG (Ollama)", page_icon="🤖")
+
+ # --- ENV ---
+ ENV = os.getenv("ENV", "local")  # "local" or "space"
+ logger.info(f"ENV: {ENV}")
+
+ # --- FAISS & chunks paths ---
+ if ENV == "local":
+     # Adjust these paths to your local filesystem
+     faiss_index_path = "chatbot-models/vectordb_docling/index.faiss"
+     vectors_path = "chatbot-models/vectordb_docling/chunks.pkl"
+ else:
+     # Download from Hugging Face (private/public dataset depending on your settings)
+     faiss_index_path = hf_hub_download(
+         repo_id="rkonan/chatbot-models",
+         filename="chatbot-models/vectordb_docling/index.faiss",
+         repo_type="dataset"
+     )
+     vectors_path = hf_hub_download(
+         repo_id="rkonan/chatbot-models",
+         filename="chatbot-models/vectordb_docling/chunks.pkl",
+         repo_type="dataset"
+     )
+
+ # --- UI Sidebar ---
+ st.sidebar.header("⚙️ Paramètres")
+ default_host = os.getenv("OLLAMA_HOST", "http://localhost:11434")
+ ollama_host = st.sidebar.text_input("Ollama host", value=default_host, help="Ex: http://localhost:11434")
+
+ # Suggest models that are already installed or common ones
+ suggested_models = [
+     "qwen2.5:3b-instruct-q4_K_M",
+     "noushermes_rag",
+     "mistral",        # installed locally
+     "gemma3",         # installed locally
+     "deepseek-r1",    # installed locally (long reasoning, slower)
+     "granite3.3",     # installed locally
+     "llama3.1:8b-instruct-q4_K_M",
+     "nous-hermes2:Q4_K_M",
+ ]
+ model_name = st.sidebar.selectbox("Modèle Ollama", options=suggested_models, index=0)
+ num_threads = st.sidebar.slider("Threads (hint)", min_value=2, max_value=16, value=6, step=1)
+ temperature = st.sidebar.slider("Température", min_value=0.0, max_value=1.5, value=0.1, step=0.1)
+
+ st.title("🤖 Chatbot RAG Local (Ollama)")
+
+ # --- Engine cache ---
+ # (previous variant, kept for reference; its decorator is commented out too so it
+ # does not also wrap the active load_rag_engine below)
+ # @st.cache_resource(show_spinner=True)
+ # def load_rag_engine(_model_name: str, _host: str, _threads: int, _temp: float, _version: int = 1):
+ #     # Ollama options
+ #     ollama_opts = {
+ #         "num_thread": int(_threads),
+ #         "temperature": float(_temp),
+ #         "num_ctx": 256,
+ #         "num_batch": 16,
+ #     }
+ #
+ #     rag = RAGEngine(
+ #         model_name=_model_name,
+ #         vector_path=vectors_path,
+ #         index_path=faiss_index_path,
+ #         model_threads=_threads,
+ #         ollama_host=_host,
+ #         ollama_opts=ollama_opts
+ #     )
+ #
+ #     # Light warmup (avoids first-token latency)
+ #     try:
+ #         gen = rag._complete_stream("Bonjour", max_tokens=1)
+ #         next(gen, "")
+ #     except Exception as e:
+ #         logger.warning(f"Warmup Ollama échoué: {e}")
+ #     return rag
+
+ @st.cache_resource(show_spinner=True)
+ def load_rag_engine(_model_name: str, _host: str, _threads: int, _temp: float, _version: int = 1):
+     # Apply KEEP_ALIVE so the model stays in memory after use
+     os.environ["OLLAMA_KEEP_ALIVE"] = "15m"
+
+     ollama_opts = {
+         "num_thread": int(_threads),
+         "temperature": float(_temp),
+         "num_ctx": 512,   # same as the CLI
+         "num_batch": 16,
+     }
+
+     rag = RAGEngine(
+         model_name=_model_name,
+         vector_path=vectors_path,
+         index_path=faiss_index_path,
+         model_threads=_threads,
+         ollama_host=_host,
+         ollama_opts=ollama_opts
+     )
+
+     # Warmup close to the CLI (more than 1 token, to fill the cache)
+     try:
+         list(rag._complete_stream("Bonjour", max_tokens=8))
+     except Exception as e:
+         logger.warning(f"Warmup Ollama échoué: {e}")
+
+     return rag
+
+ rag = load_rag_engine(model_name, ollama_host, num_threads, temperature, _version=2)
+
+ # --- Simple chat ---
+ user_input = st.text_area("Posez votre question :", height=120, placeholder="Ex: Quels sont les traitements appliqués aux images ?")
+ col1, col2 = st.columns([1, 1])
+
+ if col1.button("Envoyer"):
+     if user_input.strip():
+         with st.spinner("Génération en cours..."):
+             try:
+                 response = rag.ask(user_input)
+                 st.markdown("**Réponse :**")
+                 st.success(response)
+             except Exception as e:
+                 st.error(f"Erreur pendant la génération: {e}")
+     else:
+         st.info("Saisissez une question.")
+
+ if col2.button("Envoyer (stream)"):
+     if user_input.strip():
+         with st.spinner("Génération en cours (stream)..."):
+             try:
+                 # Token-by-token display
+                 ph = st.empty()
+                 acc = ""
+                 for token in rag.ask_stream(user_input):
+                     acc += token
+                     ph.markdown(acc)
+                 st.balloons()
+             except Exception as e:
+                 st.error(f"Erreur pendant la génération (stream): {e}")
+     else:
+         st.info("Saisissez une question.")
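
A note on the cached loader above, based on Streamlit's documented caching rules rather than anything in this commit: st.cache_resource skips parameters whose names start with an underscore when it builds the cache key, so _model_name, _host, _threads, _temp and the _version=2 bump never invalidate the cached engine on their own. A minimal standalone sketch of that behavior, with illustrative names:

import streamlit as st

@st.cache_resource(show_spinner=False)
def build_engine(model_name: str, _version: int = 1):
    # `model_name` is hashed into the cache key; `_version` is ignored because
    # Streamlit excludes underscore-prefixed parameters from hashing.
    return {"model": model_name, "version_at_build": _version}

a = build_engine("qwen2.5:3b-instruct-q4_K_M", _version=1)
b = build_engine("qwen2.5:3b-instruct-q4_K_M", _version=2)
st.write(a is b)  # True: bumping _version alone does not rebuild the resource

If a rebuild is really wanted, dropping the underscore from the version parameter or calling load_rag_engine.clear() are the usual ways to force it.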
app_ollama_v1 copy.py ADDED
@@ -0,0 +1,123 @@
+ import os
+ import logging
+ import streamlit as st
+ from huggingface_hub import hf_hub_download
+
+ # ✅ New RAG engine (Ollama)
+ from rag_model_ollama_v1 import RAGEngine
+
+ # --- Config & logs ---
+ os.environ.setdefault("NLTK_DATA", "/home/appuser/nltk_data")
+ os.environ["OLLAMA_KEEP_ALIVE"] = "15m"  # keeps the model warm
+
+ logger = logging.getLogger("Streamlit")
+ logger.setLevel(logging.INFO)
+ handler = logging.StreamHandler()
+ formatter = logging.Formatter("[%(asctime)s] %(levelname)s - %(message)s")
+ handler.setFormatter(formatter)
+ if not logger.handlers:
+     logger.addHandler(handler)
+
+ st.set_page_config(page_title="Chatbot RAG (Ollama)", page_icon="🤖")
+
+ # --- ENV ---
+ ENV = os.getenv("ENV", "local")  # "local" or "space"
+ logger.info(f"ENV: {ENV}")
+
+ # --- FAISS & chunks paths ---
+ if ENV == "local":
+     faiss_index_path = "chatbot-models/vectordb_docling/index.faiss"
+     vectors_path = "chatbot-models/vectordb_docling/chunks.pkl"
+ else:
+     faiss_index_path = hf_hub_download(
+         repo_id="rkonan/chatbot-models",
+         filename="chatbot-models/vectordb_docling/index.faiss",
+         repo_type="dataset"
+     )
+     vectors_path = hf_hub_download(
+         repo_id="rkonan/chatbot-models",
+         filename="chatbot-models/vectordb_docling/chunks.pkl",
+         repo_type="dataset"
+     )
+
+ # --- UI Sidebar ---
+ st.sidebar.header("⚙️ Paramètres")
+ default_host = os.getenv("OLLAMA_HOST", "http://localhost:11434")
+ ollama_host = st.sidebar.text_input("Ollama host", value=default_host)
+
+ suggested_models = [
+     "qwen2.5:3b-instruct-q4_K_M",
+     "noushermes_rag",
+     "mistral",
+     "gemma3",
+     "deepseek-r1",
+     "granite3.3",
+     "llama3.1:8b-instruct-q4_K_M",
+     "nous-hermes2:Q4_K_M",
+ ]
+ model_name = st.sidebar.selectbox("Modèle Ollama", options=suggested_models, index=0)
+ num_threads = st.sidebar.slider("Threads", min_value=2, max_value=16, value=6, step=1)
+ temperature = st.sidebar.slider("Température", min_value=0.0, max_value=1.5, value=0.1, step=0.1)
+
+ st.title("🤖 Chatbot RAG Local (Ollama)")
+
+ # --- Engine cache ---
+ @st.cache_resource(show_spinner=True)
+ def load_rag_engine(_model_name: str, _host: str, _threads: int, _temp: float):
+     ollama_opts = {
+         "num_thread": int(_threads),
+         "temperature": float(_temp),
+         "num_ctx": 512,   # same as the CLI
+         "num_batch": 16,
+     }
+
+     rag = RAGEngine(
+         model_name=_model_name,
+         vector_path=vectors_path,
+         index_path=faiss_index_path,
+         model_threads=_threads,
+         ollama_host=_host,
+         ollama_opts=ollama_opts
+     )
+
+     # Warmup close to the CLI
+     try:
+         list(rag._complete_stream("Bonjour", max_tokens=8))
+     except Exception as e:
+         logger.warning(f"Warmup Ollama échoué: {e}")
+
+     return rag
+
+ rag = load_rag_engine(model_name, ollama_host, num_threads, temperature)
+
+ # --- Chat ---
+ user_input = st.text_area("Posez votre question :", height=120,
+                           placeholder="Ex: Quels sont les traitements appliqués aux images ?")
+ col1, col2 = st.columns([1, 1])
+
+ if col1.button("Envoyer"):
+     if user_input.strip():
+         with st.spinner("Génération en cours..."):
+             try:
+                 response = rag.ask(user_input)
+                 st.markdown("**Réponse :**")
+                 st.success(response)
+             except Exception as e:
+                 st.error(f"Erreur pendant la génération: {e}")
+     else:
+         st.info("Saisissez une question.")
+
+ if col2.button("Envoyer (stream)"):
+     if user_input.strip():
+         with st.spinner("Génération en cours (stream)..."):
+             try:
+                 ph = st.empty()
+                 acc = ""
+                 for token in rag.ask_stream(user_input):
+                     acc += token
+                     ph.markdown(acc)
+                 st.balloons()
+             except Exception as e:
+                 st.error(f"Erreur pendant la génération (stream): {e}")
+     else:
+         st.info("Saisissez une question.")
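
One caveat on the OLLAMA_KEEP_ALIVE line above: that environment variable is read by the ollama serve process when it starts, so setting it inside the Streamlit process only has an effect if the server inherits this environment (the attached log shows a server running with OLLAMA_KEEP_ALIVE:5m0s). The Ollama HTTP API also accepts a per-request keep_alive field, which works against an already running server; a small sketch under that assumption, reusing the host and model names from this commit:

import requests

resp = requests.post(
    "http://localhost:11434/api/generate",
    json={
        "model": "qwen2.5:3b-instruct-q4_K_M",
        "prompt": "Bonjour",
        "stream": False,
        "keep_alive": "15m",  # keep the model loaded for 15 minutes after this call
    },
    timeout=300,
)
resp.raise_for_status()
print(resp.json()["response"])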
app_ollama_v1.py CHANGED
@@ -1,11 +1,9 @@
-
  import os
  import logging
  import streamlit as st
-
  from huggingface_hub import hf_hub_download
 
- # ✅ New RAG engine (Ollama)
+ # ✅ New RAG engine (without ollama_opts)
  from rag_model_ollama_v1 import RAGEngine
 
  # --- Config & logs ---
@@ -22,16 +20,14 @@ if not logger.handlers:
  st.set_page_config(page_title="Chatbot RAG (Ollama)", page_icon="🤖")
 
  # --- ENV ---
- ENV = os.getenv("ENV", "local")  # "local" or "space"
+ ENV = os.getenv("ENV", "local")
  logger.info(f"ENV: {ENV}")
 
  # --- FAISS & chunks paths ---
  if ENV == "local":
-     # Adjust these paths to your local filesystem
      faiss_index_path = "chatbot-models/vectordb_docling/index.faiss"
      vectors_path = "chatbot-models/vectordb_docling/chunks.pkl"
  else:
-     # Download from Hugging Face (private/public dataset depending on your settings)
      faiss_index_path = hf_hub_download(
          repo_id="rkonan/chatbot-models",
          filename="chatbot-models/vectordb_docling/index.faiss",
@@ -45,19 +41,17 @@ else:
 
  # --- UI Sidebar ---
  st.sidebar.header("⚙️ Paramètres")
- default_host = os.getenv("OLLAMA_HOST", "http://localhost:11434")
- ollama_host = st.sidebar.text_input("Ollama host", value=default_host, help="Ex: http://localhost:11434")
-
- # Suggest models that are already installed or common ones
+ default_host = os.getenv("OLLAMA_HOST", "http://localhost:11435")
+ ollama_host = st.sidebar.text_input("Ollama host", value=default_host)
  suggested_models = [
+     "qwen2.5:3b-instruct-q4_K_M",
      "noushermes_rag",
-     "mistral",        # installed locally
-     "gemma3",         # installed locally
-     "deepseek-r1",    # installed locally (long reasoning, slower)
-     "granite3.3",     # installed locally
+     "mistral",
+     "gemma3",
+     "deepseek-r1",
+     "granite3.3",
      "llama3.1:8b-instruct-q4_K_M",
      "nous-hermes2:Q4_K_M",
-
  ]
  model_name = st.sidebar.selectbox("Modèle Ollama", options=suggested_models, index=0)
  num_threads = st.sidebar.slider("Threads (hint)", min_value=2, max_value=16, value=6, step=1)
@@ -67,54 +61,41 @@ st.title("🤖 Chatbot RAG Local (Ollama)")
 
  # --- Engine cache ---
  @st.cache_resource(show_spinner=True)
- def load_rag_engine(_model_name: str, _host: str, _threads: int, _temp: float, _version: int = 1):
-     # Ollama options
-     ollama_opts = {
-         "num_thread": int(_threads),
-         "temperature": float(_temp),
-     }
-
+ def load_rag_engine(_model_name: str, _host: str, _threads: int, _temp: float):
+     os.environ["OLLAMA_KEEP_ALIVE"] = "15m"
      rag = RAGEngine(
          model_name=_model_name,
          vector_path=vectors_path,
          index_path=faiss_index_path,
          model_threads=_threads,
-         ollama_host=_host,
-         ollama_opts=ollama_opts
+         ollama_host=_host
+         # ❌ no ollama_opts → Ollama picks the defaults
      )
-
-     # Light warmup (avoids first-token latency)
-     try:
-         gen = rag._complete_stream("Bonjour", max_tokens=1)
-         next(gen, "")
-
-     except Exception as e:
-         logger.warning(f"Warmup Ollama échoué: {e}")
      return rag
 
- rag = load_rag_engine(model_name, ollama_host, num_threads, temperature, _version=2)
+ rag = load_rag_engine(model_name, ollama_host, num_threads, temperature)
 
  # --- Simple chat ---
- user_input = st.text_area("Posez votre question :", height=120, placeholder="Ex: Quels sont les traitements appliqués aux images ?")
- col1, col2 = st.columns([1,1])
-
- if col1.button("Envoyer"):
-     if user_input.strip():
-         with st.spinner("Génération en cours..."):
-             try:
-                 response = rag.ask(user_input)
-                 st.markdown("**Réponse :**")
-                 st.success(response)
-             except Exception as e:
-                 st.error(f"Erreur pendant la génération: {e}")
-     else:
-         st.info("Saisissez une question.")
+ user_input = st.text_area("Posez votre question :", height=120,
+                           placeholder="Ex: Quels sont les traitements appliqués aux images ?")
+ col1, col2 = st.columns([1, 1])
+
+ # if col1.button("Envoyer"):
+ #     if user_input.strip():
+ #         with st.spinner("Génération en cours..."):
+ #             try:
+ #                 response = rag.ask(user_input)
+ #                 st.markdown("**Réponse :**")
+ #                 st.success(response)
+ #             except Exception as e:
+ #                 st.error(f"Erreur pendant la génération: {e}")
+ #     else:
+ #         st.info("Saisissez une question.")
 
  if col2.button("Envoyer (stream)"):
      if user_input.strip():
          with st.spinner("Génération en cours (stream)..."):
              try:
-                 # Token-by-token display
                  ph = st.empty()
                  acc = ""
                  for token in rag.ask_stream(user_input):
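
The change above removes ollama_opts, so the server-side defaults apply (the attached log shows a runner previously launched with --ctx-size 256 --batch-size 16, while the server config reports OLLAMA_CONTEXT_LENGTH:4096). If the goal is to compare the two setups, a rough timing sketch along these lines may help; the endpoint, model and option values are taken from elsewhere in this commit, and the helper name is made up:

import time
import requests

def timed_generate(options=None):
    # Single non-streaming /api/generate call; returns wall-clock time and generated token count.
    payload = {"model": "qwen2.5:3b-instruct-q4_K_M", "prompt": "Bonjour", "stream": False}
    if options is not None:
        payload["options"] = options
    t0 = time.time()
    r = requests.post("http://localhost:11434/api/generate", json=payload, timeout=300)
    r.raise_for_status()
    return round(time.time() - t0, 2), r.json().get("eval_count")

print("explicit options:", timed_generate({"num_ctx": 256, "num_batch": 16, "num_thread": 6}))
print("server defaults :", timed_generate())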
app_ollama_v1_chat.py ADDED
@@ -0,0 +1,99 @@
+ import os
+ import logging
+ import streamlit as st
+ import requests
+ import json
+
+ # --- Config & logs ---
+ os.environ.setdefault("NLTK_DATA", "/home/appuser/nltk_data")
+ logger = logging.getLogger("Streamlit")
+ logger.setLevel(logging.INFO)
+ handler = logging.StreamHandler()
+ formatter = logging.Formatter("[%(asctime)s] %(levelname)s - %(message)s")
+ handler.setFormatter(formatter)
+ if not logger.handlers:
+     logger.addHandler(handler)
+
+ st.set_page_config(page_title="Chat Ollama", page_icon="🤖")
+
+ # --- UI Sidebar ---
+ st.sidebar.header("⚙️ Paramètres")
+ default_host = os.getenv("OLLAMA_HOST", "http://localhost:11435")
+ ollama_host = st.sidebar.text_input("Ollama host", value=default_host)
+ model_name = st.sidebar.text_input("Modèle Ollama", value="qwen2.5:3b-instruct-q4_K_M")
+
+ st.title("💬 Chat Ollama (simple)")
+
+ # --- History ---
+ if "messages" not in st.session_state:
+     st.session_state["messages"] = []
+
+ user_input = st.text_area("Votre message :", height=100, placeholder="Ex: Bonjour ?")
+ col1, col2 = st.columns([1, 1])
+
+ # --- /api/chat call helper ---
+ def ollama_chat(messages, stream=False):
+     url = ollama_host.rstrip("/") + "/api/chat"
+     payload = {"model": model_name, "messages": messages, "stream": stream}
+
+     if stream:
+         # returns a generator of tokens
+         def token_gen():
+             with requests.post(url, json=payload, stream=True, timeout=300) as r:
+                 r.raise_for_status()
+                 for line in r.iter_lines(decode_unicode=True):
+                     if not line:
+                         continue
+                     data = json.loads(line)
+                     if "message" in data and not data.get("done"):
+                         yield data["message"]["content"]
+                     if data.get("done"):
+                         break
+         return token_gen()
+     else:
+         # returns the full response directly (dict)
+         r = requests.post(url, json=payload, timeout=300)
+         r.raise_for_status()
+         return r.json()
+
+ # --- Button: normal send ---
+ if col1.button("Envoyer"):
+     if user_input.strip():
+         st.session_state["messages"].append({"role": "user", "content": user_input})
+         with st.spinner("Génération en cours..."):
+             try:
+                 result = ollama_chat(st.session_state["messages"], stream=False)
+                 content = result.get("message", {}).get("content", "")
+                 st.session_state["messages"].append({"role": "assistant", "content": content})
+                 st.markdown("**Réponse :**")
+                 st.success(content)
+                 st.write(f"⏱ Temps total : {result['total_duration']/1e9:.2f}s")
+                 st.write(f"📝 Tokens prompt : {result['prompt_eval_count']}, génération : {result['eval_count']}")
+             except Exception as e:
+                 st.error(f"Erreur: {e}")
+     else:
+         st.info("Saisissez un message.")
+
+ # --- Button: streaming send ---
+ if col2.button("Envoyer (stream)"):
+     if user_input.strip():
+         st.session_state["messages"].append({"role": "user", "content": user_input})
+         with st.spinner("Génération en cours (stream)..."):
+             try:
+                 ph = st.empty()
+                 acc = ""
+                 for token in ollama_chat(st.session_state["messages"], stream=True):
+                     acc += token
+                     ph.markdown(acc)
+                 st.session_state["messages"].append({"role": "assistant", "content": acc})
+             except Exception as e:
+                 st.error(f"Erreur (stream): {e}")
+     else:
+         st.info("Saisissez un message.")
+
+ # --- Conversation history display ---
+ st.subheader("Historique de la conversation")
+ for msg in st.session_state["messages"]:
+     role = "🧑‍💻" if msg["role"] == "user" else "🤖"
+     st.markdown(f"{role} **{msg['role']}**: {msg['content']}")
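
For reference, each line that token_gen above reads from the streaming /api/chat response is a standalone JSON object: intermediate chunks carry message.content with done set to false, and the final chunk carries done set to true plus the timing counters used in the non-streaming branch (total_duration, prompt_eval_count, eval_count). A tiny self-contained sketch of that parsing logic against illustrative sample lines:

import json

sample_lines = [
    '{"model": "qwen2.5:3b-instruct-q4_K_M", "message": {"role": "assistant", "content": "Bon"}, "done": false}',
    '{"model": "qwen2.5:3b-instruct-q4_K_M", "message": {"role": "assistant", "content": "jour"}, "done": false}',
    '{"model": "qwen2.5:3b-instruct-q4_K_M", "done": true, "total_duration": 12149925014, "prompt_eval_count": 7, "eval_count": 2}',
]

acc = ""
for line in sample_lines:
    data = json.loads(line)
    if "message" in data and not data.get("done"):
        acc += data["message"]["content"]

print(acc)  # -> "Bonjour"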
log_app.txt ADDED
@@ -0,0 +1,378 @@
1
+ rkonan@rkonan-ThinkPad-T460:~$ OLLAMA_DEBUG=1 ollama serve
2
+ time=2025-08-09T22:42:53.523+02:00 level=INFO source=routes.go:1304 msg="server config" env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_CONTEXT_LENGTH:4096 OLLAMA_DEBUG:DEBUG OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_KV_CACHE_TYPE: OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/home/rkonan/.ollama/models OLLAMA_MULTIUSER_CACHE:false OLLAMA_NEW_ENGINE:false OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:1 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://* vscode-webview://* vscode-file://*] OLLAMA_SCHED_SPREAD:false ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
3
+ time=2025-08-09T22:42:53.524+02:00 level=INFO source=images.go:477 msg="total blobs: 9"
4
+ time=2025-08-09T22:42:53.525+02:00 level=INFO source=images.go:484 msg="total unused blobs removed: 0"
5
+ time=2025-08-09T22:42:53.525+02:00 level=INFO source=routes.go:1357 msg="Listening on 127.0.0.1:11434 (version 0.11.4)"
6
+ time=2025-08-09T22:42:53.525+02:00 level=DEBUG source=sched.go:106 msg="starting llm scheduler"
7
+ time=2025-08-09T22:42:53.525+02:00 level=INFO source=gpu.go:217 msg="looking for compatible GPUs"
8
+ time=2025-08-09T22:42:53.527+02:00 level=DEBUG source=gpu.go:98 msg="searching for GPU discovery libraries for NVIDIA"
9
+ time=2025-08-09T22:42:53.527+02:00 level=DEBUG source=gpu.go:501 msg="Searching for GPU library" name=libcuda.so*
10
+ time=2025-08-09T22:42:53.527+02:00 level=DEBUG source=gpu.go:525 msg="gpu library search" globs="[/usr/local/lib/ollama/libcuda.so* /home/rkonan/libcuda.so* /usr/local/cuda*/targets/*/lib/libcuda.so* /usr/lib/*-linux-gnu/nvidia/current/libcuda.so* /usr/lib/*-linux-gnu/libcuda.so* /usr/lib/wsl/lib/libcuda.so* /usr/lib/wsl/drivers/*/libcuda.so* /opt/cuda/lib*/libcuda.so* /usr/local/cuda/lib*/libcuda.so* /usr/lib*/libcuda.so* /usr/local/lib*/libcuda.so*]"
11
+ time=2025-08-09T22:42:53.539+02:00 level=DEBUG source=gpu.go:558 msg="discovered GPU libraries" paths=[]
12
+ time=2025-08-09T22:42:53.539+02:00 level=DEBUG source=gpu.go:501 msg="Searching for GPU library" name=libcudart.so*
13
+ time=2025-08-09T22:42:53.539+02:00 level=DEBUG source=gpu.go:525 msg="gpu library search" globs="[/usr/local/lib/ollama/libcudart.so* /home/rkonan/libcudart.so* /usr/local/lib/ollama/cuda_v*/libcudart.so* /usr/local/cuda/lib64/libcudart.so* /usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so* /usr/lib/x86_64-linux-gnu/libcudart.so* /usr/lib/wsl/lib/libcudart.so* /usr/lib/wsl/drivers/*/libcudart.so* /opt/cuda/lib64/libcudart.so* /usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so* /usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so* /usr/lib/aarch64-linux-gnu/libcudart.so* /usr/local/cuda/lib*/libcudart.so* /usr/lib*/libcudart.so* /usr/local/lib*/libcudart.so*]"
14
+ time=2025-08-09T22:42:53.543+02:00 level=DEBUG source=gpu.go:558 msg="discovered GPU libraries" paths=[/usr/local/lib/ollama/libcudart.so.12.8.90]
15
+ cudaSetDevice err: 35
16
+ time=2025-08-09T22:42:53.544+02:00 level=DEBUG source=gpu.go:574 msg="Unable to load cudart library /usr/local/lib/ollama/libcudart.so.12.8.90: your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama"
17
+ time=2025-08-09T22:42:53.544+02:00 level=DEBUG source=amd_linux.go:419 msg="amdgpu driver not detected /sys/module/amdgpu"
18
+ time=2025-08-09T22:42:53.544+02:00 level=INFO source=gpu.go:377 msg="no compatible GPUs were discovered"
19
+ time=2025-08-09T22:42:53.544+02:00 level=INFO source=types.go:130 msg="inference compute" id=0 library=cpu variant="" compute="" driver=0.0 name="" total="15.5 GiB" available="11.6 GiB"
20
+ time=2025-08-09T22:42:53.544+02:00 level=INFO source=routes.go:1398 msg="entering low vram mode" "total vram"="15.5 GiB" threshold="20.0 GiB"
21
+ time=2025-08-09T22:46:23.269+02:00 level=DEBUG source=gpu.go:391 msg="updating system memory data" before.total="15.5 GiB" before.free="11.6 GiB" before.free_swap="2.1 GiB" now.total="15.5 GiB" now.free="10.7 GiB" now.free_swap="2.1 GiB"
22
+ time=2025-08-09T22:46:23.269+02:00 level=DEBUG source=sched.go:183 msg="updating default concurrency" OLLAMA_MAX_LOADED_MODELS=3 gpu_count=1
23
+ time=2025-08-09T22:46:23.313+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=general.alignment default=32
24
+ time=2025-08-09T22:46:23.419+02:00 level=DEBUG source=sched.go:213 msg="cpu mode with first model, loading"
25
+ time=2025-08-09T22:46:23.420+02:00 level=DEBUG source=gpu.go:391 msg="updating system memory data" before.total="15.5 GiB" before.free="10.7 GiB" before.free_swap="2.1 GiB" now.total="15.5 GiB" now.free="10.7 GiB" now.free_swap="2.1 GiB"
26
+ time=2025-08-09T22:46:23.420+02:00 level=INFO source=server.go:135 msg="system memory" total="15.5 GiB" free="10.7 GiB" free_swap="2.1 GiB"
27
+ time=2025-08-09T22:46:23.420+02:00 level=DEBUG source=memory.go:111 msg=evaluating library=cpu gpu_count=1 available="[10.7 GiB]"
28
+ time=2025-08-09T22:46:23.420+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=qwen2.vision.block_count default=0
29
+ time=2025-08-09T22:46:23.420+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=qwen2.attention.key_length default=128
30
+ time=2025-08-09T22:46:23.420+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=qwen2.attention.value_length default=128
31
+ time=2025-08-09T22:46:23.421+02:00 level=INFO source=server.go:175 msg=offload library=cpu layers.requested=-1 layers.model=37 layers.offload=0 layers.split="" memory.available="[10.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="1.9 GiB" memory.required.partial="0 B" memory.required.kv="72.0 MiB" memory.required.allocations="[1.9 GiB]" memory.weights.total="1.8 GiB" memory.weights.repeating="1.6 GiB" memory.weights.nonrepeating="243.4 MiB" memory.graph.full="9.4 MiB" memory.graph.partial="252.8 MiB"
32
+ time=2025-08-09T22:46:23.421+02:00 level=DEBUG source=server.go:291 msg="compatible gpu libraries" compatible=[]
33
+ llama_model_loader: loaded meta data with 35 key-value pairs and 434 tensors from /home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 (version GGUF V3 (latest))
34
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
35
+ llama_model_loader: - kv 0: general.architecture str = qwen2
36
+ llama_model_loader: - kv 1: general.type str = model
37
+ llama_model_loader: - kv 2: general.name str = Qwen2.5 3B Instruct
38
+ llama_model_loader: - kv 3: general.finetune str = Instruct
39
+ llama_model_loader: - kv 4: general.basename str = Qwen2.5
40
+ llama_model_loader: - kv 5: general.size_label str = 3B
41
+ llama_model_loader: - kv 6: general.license str = other
42
+ llama_model_loader: - kv 7: general.license.name str = qwen-research
43
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen2.5-3...
44
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
45
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen2.5 3B
46
+ llama_model_loader: - kv 11: general.base_model.0.organization str = Qwen
47
+ llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen2.5-3B
48
+ llama_model_loader: - kv 13: general.tags arr[str,2] = ["chat", "text-generation"]
49
+ llama_model_loader: - kv 14: general.languages arr[str,1] = ["en"]
50
+ llama_model_loader: - kv 15: qwen2.block_count u32 = 36
51
+ llama_model_loader: - kv 16: qwen2.context_length u32 = 32768
52
+ llama_model_loader: - kv 17: qwen2.embedding_length u32 = 2048
53
+ llama_model_loader: - kv 18: qwen2.feed_forward_length u32 = 11008
54
+ llama_model_loader: - kv 19: qwen2.attention.head_count u32 = 16
55
+ llama_model_loader: - kv 20: qwen2.attention.head_count_kv u32 = 2
56
+ llama_model_loader: - kv 21: qwen2.rope.freq_base f32 = 1000000.000000
57
+ llama_model_loader: - kv 22: qwen2.attention.layer_norm_rms_epsilon f32 = 0.000001
58
+ llama_model_loader: - kv 23: general.file_type u32 = 15
59
+ llama_model_loader: - kv 24: tokenizer.ggml.model str = gpt2
60
+ llama_model_loader: - kv 25: tokenizer.ggml.pre str = qwen2
61
+ llama_model_loader: - kv 26: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
62
+ llama_model_loader: - kv 27: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
63
+ llama_model_loader: - kv 28: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
64
+ llama_model_loader: - kv 29: tokenizer.ggml.eos_token_id u32 = 151645
65
+ llama_model_loader: - kv 30: tokenizer.ggml.padding_token_id u32 = 151643
66
+ llama_model_loader: - kv 31: tokenizer.ggml.bos_token_id u32 = 151643
67
+ llama_model_loader: - kv 32: tokenizer.ggml.add_bos_token bool = false
68
+ llama_model_loader: - kv 33: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
69
+ llama_model_loader: - kv 34: general.quantization_version u32 = 2
70
+ llama_model_loader: - type f32: 181 tensors
71
+ llama_model_loader: - type q4_K: 216 tensors
72
+ llama_model_loader: - type q6_K: 37 tensors
73
+ print_info: file format = GGUF V3 (latest)
74
+ print_info: file type = Q4_K - Medium
75
+ print_info: file size = 1.79 GiB (4.99 BPW)
76
+ init_tokenizer: initializing tokenizer for type 2
77
+ load: control token: 151660 '<|fim_middle|>' is not marked as EOG
78
+ load: control token: 151659 '<|fim_prefix|>' is not marked as EOG
79
+ load: control token: 151653 '<|vision_end|>' is not marked as EOG
80
+ load: control token: 151648 '<|box_start|>' is not marked as EOG
81
+ load: control token: 151646 '<|object_ref_start|>' is not marked as EOG
82
+ load: control token: 151649 '<|box_end|>' is not marked as EOG
83
+ load: control token: 151655 '<|image_pad|>' is not marked as EOG
84
+ load: control token: 151651 '<|quad_end|>' is not marked as EOG
85
+ load: control token: 151647 '<|object_ref_end|>' is not marked as EOG
86
+ load: control token: 151652 '<|vision_start|>' is not marked as EOG
87
+ load: control token: 151654 '<|vision_pad|>' is not marked as EOG
88
+ load: control token: 151656 '<|video_pad|>' is not marked as EOG
89
+ load: control token: 151644 '<|im_start|>' is not marked as EOG
90
+ load: control token: 151661 '<|fim_suffix|>' is not marked as EOG
91
+ load: control token: 151650 '<|quad_start|>' is not marked as EOG
92
+ load: special tokens cache size = 22
93
+ load: token to piece cache size = 0.9310 MB
94
+ print_info: arch = qwen2
95
+ print_info: vocab_only = 1
96
+ print_info: model type = ?B
97
+ print_info: model params = 3.09 B
98
+ print_info: general.name = Qwen2.5 3B Instruct
99
+ print_info: vocab type = BPE
100
+ print_info: n_vocab = 151936
101
+ print_info: n_merges = 151387
102
+ print_info: BOS token = 151643 '<|endoftext|>'
103
+ print_info: EOS token = 151645 '<|im_end|>'
104
+ print_info: EOT token = 151645 '<|im_end|>'
105
+ print_info: PAD token = 151643 '<|endoftext|>'
106
+ print_info: LF token = 198 'Ċ'
107
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
108
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
109
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
110
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
111
+ print_info: FIM REP token = 151663 '<|repo_name|>'
112
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
113
+ print_info: EOG token = 151643 '<|endoftext|>'
114
+ print_info: EOG token = 151645 '<|im_end|>'
115
+ print_info: EOG token = 151662 '<|fim_pad|>'
116
+ print_info: EOG token = 151663 '<|repo_name|>'
117
+ print_info: EOG token = 151664 '<|file_sep|>'
118
+ print_info: max token length = 256
119
+ llama_model_load: vocab only - skipping tensors
120
+ time=2025-08-09T22:46:24.490+02:00 level=DEBUG source=gpu.go:695 msg="no filter required for library cpu"
121
+ time=2025-08-09T22:46:24.490+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="/usr/local/bin/ollama runner --model /home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 --ctx-size 256 --batch-size 16 --threads 6 --no-mmap --parallel 1 --port 37337"
122
+ time=2025-08-09T22:46:24.490+02:00 level=DEBUG source=server.go:439 msg=subprocess OLLAMA_DEBUG=1 PATH=/home/rkonan/miniconda3/bin:/home/rkonan/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/snap/bin:/home/rkonan/.local/bin:/home/rkonan/.local/bin OLLAMA_MAX_LOADED_MODELS=3 OLLAMA_LIBRARY_PATH=/usr/local/lib/ollama LD_LIBRARY_PATH=/usr/local/lib/ollama:/usr/local/lib/ollama
123
+ time=2025-08-09T22:46:24.493+02:00 level=INFO source=sched.go:481 msg="loaded runners" count=1
124
+ time=2025-08-09T22:46:24.493+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
125
+ time=2025-08-09T22:46:24.497+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server not responding"
126
+ time=2025-08-09T22:46:24.537+02:00 level=INFO source=runner.go:815 msg="starting go runner"
127
+ time=2025-08-09T22:46:24.537+02:00 level=DEBUG source=ggml.go:94 msg="ggml backend load all from path" path=/usr/local/lib/ollama
128
+ load_backend: loaded CPU backend from /usr/local/lib/ollama/libggml-cpu-haswell.so
129
+ time=2025-08-09T22:46:24.555+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 compiler=cgo(gcc)
130
+ time=2025-08-09T22:46:24.557+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:37337"
131
+ llama_model_loader: loaded meta data with 35 key-value pairs and 434 tensors from /home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 (version GGUF V3 (latest))
132
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
133
+ llama_model_loader: - kv 0: general.architecture str = qwen2
134
+ llama_model_loader: - kv 1: general.type str = model
135
+ llama_model_loader: - kv 2: general.name str = Qwen2.5 3B Instruct
136
+ llama_model_loader: - kv 3: general.finetune str = Instruct
137
+ llama_model_loader: - kv 4: general.basename str = Qwen2.5
138
+ llama_model_loader: - kv 5: general.size_label str = 3B
139
+ llama_model_loader: - kv 6: general.license str = other
140
+ llama_model_loader: - kv 7: general.license.name str = qwen-research
141
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen2.5-3...
142
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
143
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen2.5 3B
144
+ llama_model_loader: - kv 11: general.base_model.0.organization str = Qwen
145
+ llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen2.5-3B
146
+ llama_model_loader: - kv 13: general.tags arr[str,2] = ["chat", "text-generation"]
147
+ llama_model_loader: - kv 14: general.languages arr[str,1] = ["en"]
148
+ llama_model_loader: - kv 15: qwen2.block_count u32 = 36
149
+ llama_model_loader: - kv 16: qwen2.context_length u32 = 32768
150
+ llama_model_loader: - kv 17: qwen2.embedding_length u32 = 2048
151
+ llama_model_loader: - kv 18: qwen2.feed_forward_length u32 = 11008
152
+ llama_model_loader: - kv 19: qwen2.attention.head_count u32 = 16
153
+ llama_model_loader: - kv 20: qwen2.attention.head_count_kv u32 = 2
154
+ llama_model_loader: - kv 21: qwen2.rope.freq_base f32 = 1000000.000000
155
+ llama_model_loader: - kv 22: qwen2.attention.layer_norm_rms_epsilon f32 = 0.000001
156
+ llama_model_loader: - kv 23: general.file_type u32 = 15
157
+ llama_model_loader: - kv 24: tokenizer.ggml.model str = gpt2
158
+ llama_model_loader: - kv 25: tokenizer.ggml.pre str = qwen2
159
+ llama_model_loader: - kv 26: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
160
+ time=2025-08-09T22:46:24.755+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
161
+ llama_model_loader: - kv 27: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
162
+ llama_model_loader: - kv 28: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
163
+ llama_model_loader: - kv 29: tokenizer.ggml.eos_token_id u32 = 151645
164
+ llama_model_loader: - kv 30: tokenizer.ggml.padding_token_id u32 = 151643
165
+ llama_model_loader: - kv 31: tokenizer.ggml.bos_token_id u32 = 151643
166
+ llama_model_loader: - kv 32: tokenizer.ggml.add_bos_token bool = false
167
+ llama_model_loader: - kv 33: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
168
+ llama_model_loader: - kv 34: general.quantization_version u32 = 2
169
+ llama_model_loader: - type f32: 181 tensors
170
+ llama_model_loader: - type q4_K: 216 tensors
171
+ llama_model_loader: - type q6_K: 37 tensors
172
+ print_info: file format = GGUF V3 (latest)
173
+ print_info: file type = Q4_K - Medium
174
+ print_info: file size = 1.79 GiB (4.99 BPW)
175
+ init_tokenizer: initializing tokenizer for type 2
176
+ load: control token: 151660 '<|fim_middle|>' is not marked as EOG
177
+ load: control token: 151659 '<|fim_prefix|>' is not marked as EOG
178
+ load: control token: 151653 '<|vision_end|>' is not marked as EOG
179
+ load: control token: 151648 '<|box_start|>' is not marked as EOG
180
+ load: control token: 151646 '<|object_ref_start|>' is not marked as EOG
181
+ load: control token: 151649 '<|box_end|>' is not marked as EOG
182
+ load: control token: 151655 '<|image_pad|>' is not marked as EOG
183
+ load: control token: 151651 '<|quad_end|>' is not marked as EOG
184
+ load: control token: 151647 '<|object_ref_end|>' is not marked as EOG
185
+ load: control token: 151652 '<|vision_start|>' is not marked as EOG
186
+ load: control token: 151654 '<|vision_pad|>' is not marked as EOG
187
+ load: control token: 151656 '<|video_pad|>' is not marked as EOG
188
+ load: control token: 151644 '<|im_start|>' is not marked as EOG
189
+ load: control token: 151661 '<|fim_suffix|>' is not marked as EOG
190
+ load: control token: 151650 '<|quad_start|>' is not marked as EOG
191
+ load: special tokens cache size = 22
192
+ load: token to piece cache size = 0.9310 MB
193
+ print_info: arch = qwen2
194
+ print_info: vocab_only = 0
195
+ print_info: n_ctx_train = 32768
196
+ print_info: n_embd = 2048
197
+ print_info: n_layer = 36
198
+ print_info: n_head = 16
199
+ print_info: n_head_kv = 2
200
+ print_info: n_rot = 128
201
+ print_info: n_swa = 0
202
+ print_info: n_swa_pattern = 1
203
+ print_info: n_embd_head_k = 128
204
+ print_info: n_embd_head_v = 128
205
+ print_info: n_gqa = 8
206
+ print_info: n_embd_k_gqa = 256
207
+ print_info: n_embd_v_gqa = 256
208
+ print_info: f_norm_eps = 0.0e+00
209
+ print_info: f_norm_rms_eps = 1.0e-06
210
+ print_info: f_clamp_kqv = 0.0e+00
211
+ print_info: f_max_alibi_bias = 0.0e+00
212
+ print_info: f_logit_scale = 0.0e+00
213
+ print_info: f_attn_scale = 0.0e+00
214
+ print_info: n_ff = 11008
215
+ print_info: n_expert = 0
216
+ print_info: n_expert_used = 0
217
+ print_info: causal attn = 1
218
+ print_info: pooling type = -1
219
+ print_info: rope type = 2
220
+ print_info: rope scaling = linear
221
+ print_info: freq_base_train = 1000000.0
222
+ print_info: freq_scale_train = 1
223
+ print_info: n_ctx_orig_yarn = 32768
224
+ print_info: rope_finetuned = unknown
225
+ print_info: ssm_d_conv = 0
226
+ print_info: ssm_d_inner = 0
227
+ print_info: ssm_d_state = 0
228
+ print_info: ssm_dt_rank = 0
229
+ print_info: ssm_dt_b_c_rms = 0
230
+ print_info: model type = 3B
231
+ print_info: model params = 3.09 B
232
+ print_info: general.name = Qwen2.5 3B Instruct
233
+ print_info: vocab type = BPE
234
+ print_info: n_vocab = 151936
235
+ print_info: n_merges = 151387
236
+ print_info: BOS token = 151643 '<|endoftext|>'
237
+ print_info: EOS token = 151645 '<|im_end|>'
238
+ print_info: EOT token = 151645 '<|im_end|>'
239
+ print_info: PAD token = 151643 '<|endoftext|>'
240
+ print_info: LF token = 198 'Ċ'
241
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
242
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
243
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
244
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
245
+ print_info: FIM REP token = 151663 '<|repo_name|>'
246
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
247
+ print_info: EOG token = 151643 '<|endoftext|>'
248
+ print_info: EOG token = 151645 '<|im_end|>'
249
+ print_info: EOG token = 151662 '<|fim_pad|>'
250
+ print_info: EOG token = 151663 '<|repo_name|>'
251
+ print_info: EOG token = 151664 '<|file_sep|>'
252
+ print_info: max token length = 256
253
+ load_tensors: loading model tensors, this can take a while... (mmap = false)
254
+ load_tensors: layer 0 assigned to device CPU, is_swa = 0
255
+ load_tensors: layer 1 assigned to device CPU, is_swa = 0
256
+ load_tensors: layer 2 assigned to device CPU, is_swa = 0
257
+ load_tensors: layer 3 assigned to device CPU, is_swa = 0
258
+ load_tensors: layer 4 assigned to device CPU, is_swa = 0
259
+ load_tensors: layer 5 assigned to device CPU, is_swa = 0
260
+ load_tensors: layer 6 assigned to device CPU, is_swa = 0
261
+ load_tensors: layer 7 assigned to device CPU, is_swa = 0
262
+ load_tensors: layer 8 assigned to device CPU, is_swa = 0
263
+ load_tensors: layer 9 assigned to device CPU, is_swa = 0
264
+ load_tensors: layer 10 assigned to device CPU, is_swa = 0
265
+ load_tensors: layer 11 assigned to device CPU, is_swa = 0
266
+ load_tensors: layer 12 assigned to device CPU, is_swa = 0
267
+ load_tensors: layer 13 assigned to device CPU, is_swa = 0
268
+ load_tensors: layer 14 assigned to device CPU, is_swa = 0
269
+ load_tensors: layer 15 assigned to device CPU, is_swa = 0
270
+ load_tensors: layer 16 assigned to device CPU, is_swa = 0
271
+ load_tensors: layer 17 assigned to device CPU, is_swa = 0
272
+ load_tensors: layer 18 assigned to device CPU, is_swa = 0
273
+ load_tensors: layer 19 assigned to device CPU, is_swa = 0
274
+ load_tensors: layer 20 assigned to device CPU, is_swa = 0
275
+ load_tensors: layer 21 assigned to device CPU, is_swa = 0
276
+ load_tensors: layer 22 assigned to device CPU, is_swa = 0
277
+ load_tensors: layer 23 assigned to device CPU, is_swa = 0
278
+ load_tensors: layer 24 assigned to device CPU, is_swa = 0
279
+ load_tensors: layer 25 assigned to device CPU, is_swa = 0
280
+ load_tensors: layer 26 assigned to device CPU, is_swa = 0
281
+ load_tensors: layer 27 assigned to device CPU, is_swa = 0
282
+ load_tensors: layer 28 assigned to device CPU, is_swa = 0
283
+ load_tensors: layer 29 assigned to device CPU, is_swa = 0
284
+ load_tensors: layer 30 assigned to device CPU, is_swa = 0
285
+ load_tensors: layer 31 assigned to device CPU, is_swa = 0
286
+ load_tensors: layer 32 assigned to device CPU, is_swa = 0
287
+ load_tensors: layer 33 assigned to device CPU, is_swa = 0
288
+ load_tensors: layer 34 assigned to device CPU, is_swa = 0
289
+ load_tensors: layer 35 assigned to device CPU, is_swa = 0
290
+ load_tensors: layer 36 assigned to device CPU, is_swa = 0
291
+ load_tensors: CPU model buffer size = 1834.82 MiB
292
+ load_all_data: no device found for buffer type CPU for async uploads
293
+ time=2025-08-09T22:46:25.773+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.18"
294
+ time=2025-08-09T22:46:26.025+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.27"
295
+ time=2025-08-09T22:46:26.276+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.39"
296
+ time=2025-08-09T22:46:26.526+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.54"
297
+ time=2025-08-09T22:46:26.777+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.68"
298
+ time=2025-08-09T22:46:27.029+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.81"
299
+ time=2025-08-09T22:46:27.281+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.94"
300
+ llama_context: constructing llama_context
301
+ llama_context: n_batch is less than GGML_KQ_MASK_PAD - increasing to 64
302
+ llama_context: n_seq_max = 1
303
+ llama_context: n_ctx = 256
304
+ llama_context: n_ctx_per_seq = 256
305
+ llama_context: n_batch = 64
306
+ llama_context: n_ubatch = 64
307
+ llama_context: causal_attn = 1
308
+ llama_context: flash_attn = 0
309
+ llama_context: freq_base = 1000000.0
310
+ llama_context: freq_scale = 1
311
+ llama_context: n_ctx_per_seq (256) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
312
+ set_abort_callback: call
313
+ llama_context: CPU output buffer size = 0.59 MiB
314
+ create_memory: n_ctx = 256 (padded)
315
+ llama_kv_cache_unified: kv_size = 256, type_k = 'f16', type_v = 'f16', n_layer = 36, can_shift = 1, padding = 32
316
+ llama_kv_cache_unified: layer 0: dev = CPU
317
+ llama_kv_cache_unified: layer 1: dev = CPU
318
+ llama_kv_cache_unified: layer 2: dev = CPU
319
+ llama_kv_cache_unified: layer 3: dev = CPU
320
+ llama_kv_cache_unified: layer 4: dev = CPU
321
+ llama_kv_cache_unified: layer 5: dev = CPU
322
+ llama_kv_cache_unified: layer 6: dev = CPU
323
+ llama_kv_cache_unified: layer 7: dev = CPU
324
+ llama_kv_cache_unified: layer 8: dev = CPU
325
+ llama_kv_cache_unified: layer 9: dev = CPU
326
+ llama_kv_cache_unified: layer 10: dev = CPU
327
+ llama_kv_cache_unified: layer 11: dev = CPU
328
+ llama_kv_cache_unified: layer 12: dev = CPU
329
+ llama_kv_cache_unified: layer 13: dev = CPU
330
+ llama_kv_cache_unified: layer 14: dev = CPU
331
+ llama_kv_cache_unified: layer 15: dev = CPU
332
+ llama_kv_cache_unified: layer 16: dev = CPU
333
+ llama_kv_cache_unified: layer 17: dev = CPU
334
+ llama_kv_cache_unified: layer 18: dev = CPU
335
+ llama_kv_cache_unified: layer 19: dev = CPU
336
+ llama_kv_cache_unified: layer 20: dev = CPU
337
+ llama_kv_cache_unified: layer 21: dev = CPU
338
+ llama_kv_cache_unified: layer 22: dev = CPU
339
+ llama_kv_cache_unified: layer 23: dev = CPU
340
+ llama_kv_cache_unified: layer 24: dev = CPU
341
+ llama_kv_cache_unified: layer 25: dev = CPU
342
+ llama_kv_cache_unified: layer 26: dev = CPU
343
+ llama_kv_cache_unified: layer 27: dev = CPU
344
+ llama_kv_cache_unified: layer 28: dev = CPU
345
+ llama_kv_cache_unified: layer 29: dev = CPU
346
+ llama_kv_cache_unified: layer 30: dev = CPU
347
+ llama_kv_cache_unified: layer 31: dev = CPU
348
+ llama_kv_cache_unified: layer 32: dev = CPU
349
+ llama_kv_cache_unified: layer 33: dev = CPU
350
+ llama_kv_cache_unified: layer 34: dev = CPU
351
+ llama_kv_cache_unified: layer 35: dev = CPU
352
+ llama_kv_cache_unified: CPU KV buffer size = 9.00 MiB
353
+ llama_kv_cache_unified: KV self size = 9.00 MiB, K (f16): 4.50 MiB, V (f16): 4.50 MiB
354
+ llama_context: enumerating backends
355
+ llama_context: backend_ptrs.size() = 1
356
+ llama_context: max_nodes = 65536
357
+ llama_context: worst-case: n_tokens = 64, n_seqs = 1, n_outputs = 0
358
+ llama_context: reserving graph for n_tokens = 64, n_seqs = 1
359
+ llama_context: reserving graph for n_tokens = 1, n_seqs = 1
360
+ llama_context: reserving graph for n_tokens = 64, n_seqs = 1
361
+ llama_context: CPU compute buffer size = 37.59 MiB
362
+ llama_context: graph nodes = 1338
363
+ llama_context: graph splits = 1
364
+ time=2025-08-09T22:46:27.533+02:00 level=INFO source=server.go:637 msg="llama runner started in 3.04 seconds"
365
+ time=2025-08-09T22:46:27.533+02:00 level=DEBUG source=sched.go:493 msg="finished setting up" runner.name=registry.ollama.ai/library/qwen2.5:3b-instruct-q4_K_M runner.inference=cpu runner.devices=1 runner.size="1.9 GiB" runner.vram="0 B" runner.parallel=1 runner.pid=215351 runner.model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 runner.num_ctx=256
366
+ time=2025-08-09T22:46:27.533+02:00 level=DEBUG source=server.go:736 msg="completion request" images=0 prompt=7 format=""
367
+ time=2025-08-09T22:46:27.539+02:00 level=DEBUG source=cache.go:104 msg="loading cache slot" id=0 cache=0 prompt=1 used=0 remaining=1
368
+ time=2025-08-09T22:46:35.295+02:00 level=DEBUG source=sched.go:501 msg="context for request finished"
369
+ time=2025-08-09T22:46:35.295+02:00 level=DEBUG source=sched.go:341 msg="runner with non-zero duration has gone idle, adding timer" runner.name=registry.ollama.ai/library/qwen2.5:3b-instruct-q4_K_M runner.inference=cpu runner.devices=1 runner.size="1.9 GiB" runner.vram="0 B" runner.parallel=1 runner.pid=215351 runner.model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 runner.num_ctx=256 duration=5m0s
370
+ time=2025-08-09T22:46:35.295+02:00 level=DEBUG source=sched.go:359 msg="after processing request finished event" runner.name=registry.ollama.ai/library/qwen2.5:3b-instruct-q4_K_M runner.inference=cpu runner.devices=1 runner.size="1.9 GiB" runner.vram="0 B" runner.parallel=1 runner.pid=215351 runner.model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 runner.num_ctx=256 refCount=0
371
+ [GIN] 2025/08/09 - 22:46:35 | 200 | 12.149925014s | 127.0.0.1 | POST "/api/generate"
372
+ time=2025-08-09T22:47:21.815+02:00 level=DEBUG source=sched.go:613 msg="evaluating already loaded" model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6
373
+ time=2025-08-09T22:47:21.816+02:00 level=DEBUG source=server.go:736 msg="completion request" images=0 prompt=81 format=""
374
+ time=2025-08-09T22:47:21.829+02:00 level=DEBUG source=cache.go:104 msg="loading cache slot" id=0 cache=2 prompt=20 used=0 remaining=20
375
+ [GIN] 2025/08/09 - 22:49:13 | 200 | 1m51s | 127.0.0.1 | POST "/api/generate"
376
+ time=2025-08-09T22:49:13.057+02:00 level=DEBUG source=sched.go:432 msg="context for request finished" runner.name=registry.ollama.ai/library/qwen2.5:3b-instruct-q4_K_M runner.inference=cpu runner.devices=1 runner.size="1.9 GiB" runner.vram="0 B" runner.parallel=1 runner.pid=215351 runner.model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 runner.num_ctx=256
377
+ time=2025-08-09T22:49:13.057+02:00 level=DEBUG source=sched.go:341 msg="runner with non-zero duration has gone idle, adding timer" runner.name=registry.ollama.ai/library/qwen2.5:3b-instruct-q4_K_M runner.inference=cpu runner.devices=1 runner.size="1.9 GiB" runner.vram="0 B" runner.parallel=1 runner.pid=215351 runner.model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 runner.num_ctx=256 duration=5m0s
378
+ time=2025-08-09T22:49:13.058+02:00 level=DEBUG source=sched.go:359 msg="after processing request finished event" runner.name=registry.ollama.ai/library/qwen2.5:3b-instruct-q4_K_M runner.inference=cpu runner.devices=1 runner.size="1.9 GiB" runner.vram="0 B" runner.parallel=1 runner.pid=215351 runner.model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 runner.num_ctx=256 refCount=0
log_cli.txt ADDED
@@ -0,0 +1,370 @@
1
+ rkonan@rkonan-ThinkPad-T460:~$ OLLAMA_DEBUG=1 ollama serve
2
+ time=2025-08-09T22:41:31.741+02:00 level=INFO source=routes.go:1304 msg="server config" env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_CONTEXT_LENGTH:4096 OLLAMA_DEBUG:DEBUG OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_KV_CACHE_TYPE: OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/home/rkonan/.ollama/models OLLAMA_MULTIUSER_CACHE:false OLLAMA_NEW_ENGINE:false OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:1 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://* vscode-webview://* vscode-file://*] OLLAMA_SCHED_SPREAD:false ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
3
+ time=2025-08-09T22:41:31.743+02:00 level=INFO source=images.go:477 msg="total blobs: 9"
4
+ time=2025-08-09T22:41:31.743+02:00 level=INFO source=images.go:484 msg="total unused blobs removed: 0"
5
+ time=2025-08-09T22:41:31.743+02:00 level=INFO source=routes.go:1357 msg="Listening on 127.0.0.1:11434 (version 0.11.4)"
6
+ time=2025-08-09T22:41:31.744+02:00 level=DEBUG source=sched.go:106 msg="starting llm scheduler"
7
+ time=2025-08-09T22:41:31.744+02:00 level=INFO source=gpu.go:217 msg="looking for compatible GPUs"
8
+ time=2025-08-09T22:41:31.745+02:00 level=DEBUG source=gpu.go:98 msg="searching for GPU discovery libraries for NVIDIA"
9
+ time=2025-08-09T22:41:31.745+02:00 level=DEBUG source=gpu.go:501 msg="Searching for GPU library" name=libcuda.so*
10
+ time=2025-08-09T22:41:31.745+02:00 level=DEBUG source=gpu.go:525 msg="gpu library search" globs="[/usr/local/lib/ollama/libcuda.so* /home/rkonan/libcuda.so* /usr/local/cuda*/targets/*/lib/libcuda.so* /usr/lib/*-linux-gnu/nvidia/current/libcuda.so* /usr/lib/*-linux-gnu/libcuda.so* /usr/lib/wsl/lib/libcuda.so* /usr/lib/wsl/drivers/*/libcuda.so* /opt/cuda/lib*/libcuda.so* /usr/local/cuda/lib*/libcuda.so* /usr/lib*/libcuda.so* /usr/local/lib*/libcuda.so*]"
11
+ time=2025-08-09T22:41:31.750+02:00 level=DEBUG source=gpu.go:558 msg="discovered GPU libraries" paths=[]
12
+ time=2025-08-09T22:41:31.750+02:00 level=DEBUG source=gpu.go:501 msg="Searching for GPU library" name=libcudart.so*
13
+ time=2025-08-09T22:41:31.750+02:00 level=DEBUG source=gpu.go:525 msg="gpu library search" globs="[/usr/local/lib/ollama/libcudart.so* /home/rkonan/libcudart.so* /usr/local/lib/ollama/cuda_v*/libcudart.so* /usr/local/cuda/lib64/libcudart.so* /usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so* /usr/lib/x86_64-linux-gnu/libcudart.so* /usr/lib/wsl/lib/libcudart.so* /usr/lib/wsl/drivers/*/libcudart.so* /opt/cuda/lib64/libcudart.so* /usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so* /usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so* /usr/lib/aarch64-linux-gnu/libcudart.so* /usr/local/cuda/lib*/libcudart.so* /usr/lib*/libcudart.so* /usr/local/lib*/libcudart.so*]"
14
+ time=2025-08-09T22:41:31.753+02:00 level=DEBUG source=gpu.go:558 msg="discovered GPU libraries" paths=[/usr/local/lib/ollama/libcudart.so.12.8.90]
15
+ cudaSetDevice err: 35
16
+ time=2025-08-09T22:41:31.754+02:00 level=DEBUG source=gpu.go:574 msg="Unable to load cudart library /usr/local/lib/ollama/libcudart.so.12.8.90: your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama"
17
+ time=2025-08-09T22:41:31.754+02:00 level=DEBUG source=amd_linux.go:419 msg="amdgpu driver not detected /sys/module/amdgpu"
18
+ time=2025-08-09T22:41:31.754+02:00 level=INFO source=gpu.go:377 msg="no compatible GPUs were discovered"
19
+ time=2025-08-09T22:41:31.754+02:00 level=INFO source=types.go:130 msg="inference compute" id=0 library=cpu variant="" compute="" driver=0.0 name="" total="15.5 GiB" available="11.6 GiB"
20
+ time=2025-08-09T22:41:31.754+02:00 level=INFO source=routes.go:1398 msg="entering low vram mode" "total vram"="15.5 GiB" threshold="20.0 GiB"
21
+ [GIN] 2025/08/09 - 22:41:51 | 200 | 96.9µs | 127.0.0.1 | HEAD "/"
22
+ time=2025-08-09T22:41:51.222+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=general.alignment default=32
23
+ [GIN] 2025/08/09 - 22:41:51 | 200 | 110.417215ms | 127.0.0.1 | POST "/api/show"
24
+ time=2025-08-09T22:41:51.296+02:00 level=DEBUG source=gpu.go:391 msg="updating system memory data" before.total="15.5 GiB" before.free="11.6 GiB" before.free_swap="2.1 GiB" now.total="15.5 GiB" now.free="11.6 GiB" now.free_swap="2.1 GiB"
25
+ time=2025-08-09T22:41:51.296+02:00 level=DEBUG source=sched.go:183 msg="updating default concurrency" OLLAMA_MAX_LOADED_MODELS=3 gpu_count=1
26
+ time=2025-08-09T22:41:51.319+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=general.alignment default=32
27
+ time=2025-08-09T22:41:51.380+02:00 level=DEBUG source=sched.go:213 msg="cpu mode with first model, loading"
28
+ time=2025-08-09T22:41:51.380+02:00 level=DEBUG source=gpu.go:391 msg="updating system memory data" before.total="15.5 GiB" before.free="11.6 GiB" before.free_swap="2.1 GiB" now.total="15.5 GiB" now.free="11.6 GiB" now.free_swap="2.1 GiB"
29
+ time=2025-08-09T22:41:51.380+02:00 level=INFO source=server.go:135 msg="system memory" total="15.5 GiB" free="11.6 GiB" free_swap="2.1 GiB"
30
+ time=2025-08-09T22:41:51.380+02:00 level=DEBUG source=memory.go:111 msg=evaluating library=cpu gpu_count=1 available="[11.6 GiB]"
31
+ time=2025-08-09T22:41:51.380+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=qwen2.vision.block_count default=0
32
+ time=2025-08-09T22:41:51.380+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=qwen2.attention.key_length default=128
33
+ time=2025-08-09T22:41:51.380+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=qwen2.attention.value_length default=128
34
+ time=2025-08-09T22:41:51.381+02:00 level=INFO source=server.go:175 msg=offload library=cpu layers.requested=-1 layers.model=37 layers.offload=0 layers.split="" memory.available="[11.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="2.3 GiB" memory.required.partial="0 B" memory.required.kv="144.0 MiB" memory.required.allocations="[2.3 GiB]" memory.weights.total="1.8 GiB" memory.weights.repeating="1.6 GiB" memory.weights.nonrepeating="243.4 MiB" memory.graph.full="300.8 MiB" memory.graph.partial="544.2 MiB"
35
+ time=2025-08-09T22:41:51.381+02:00 level=DEBUG source=server.go:291 msg="compatible gpu libraries" compatible=[]
36
+ llama_model_loader: loaded meta data with 35 key-value pairs and 434 tensors from /home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 (version GGUF V3 (latest))
37
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
38
+ llama_model_loader: - kv 0: general.architecture str = qwen2
39
+ llama_model_loader: - kv 1: general.type str = model
40
+ llama_model_loader: - kv 2: general.name str = Qwen2.5 3B Instruct
41
+ llama_model_loader: - kv 3: general.finetune str = Instruct
42
+ llama_model_loader: - kv 4: general.basename str = Qwen2.5
43
+ llama_model_loader: - kv 5: general.size_label str = 3B
44
+ llama_model_loader: - kv 6: general.license str = other
45
+ llama_model_loader: - kv 7: general.license.name str = qwen-research
46
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen2.5-3...
47
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
48
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen2.5 3B
49
+ llama_model_loader: - kv 11: general.base_model.0.organization str = Qwen
50
+ llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen2.5-3B
51
+ llama_model_loader: - kv 13: general.tags arr[str,2] = ["chat", "text-generation"]
52
+ llama_model_loader: - kv 14: general.languages arr[str,1] = ["en"]
53
+ llama_model_loader: - kv 15: qwen2.block_count u32 = 36
54
+ llama_model_loader: - kv 16: qwen2.context_length u32 = 32768
55
+ llama_model_loader: - kv 17: qwen2.embedding_length u32 = 2048
56
+ llama_model_loader: - kv 18: qwen2.feed_forward_length u32 = 11008
57
+ llama_model_loader: - kv 19: qwen2.attention.head_count u32 = 16
58
+ llama_model_loader: - kv 20: qwen2.attention.head_count_kv u32 = 2
59
+ llama_model_loader: - kv 21: qwen2.rope.freq_base f32 = 1000000.000000
60
+ llama_model_loader: - kv 22: qwen2.attention.layer_norm_rms_epsilon f32 = 0.000001
61
+ llama_model_loader: - kv 23: general.file_type u32 = 15
62
+ llama_model_loader: - kv 24: tokenizer.ggml.model str = gpt2
63
+ llama_model_loader: - kv 25: tokenizer.ggml.pre str = qwen2
64
+ llama_model_loader: - kv 26: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
65
+ llama_model_loader: - kv 27: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
66
+ llama_model_loader: - kv 28: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
67
+ llama_model_loader: - kv 29: tokenizer.ggml.eos_token_id u32 = 151645
68
+ llama_model_loader: - kv 30: tokenizer.ggml.padding_token_id u32 = 151643
69
+ llama_model_loader: - kv 31: tokenizer.ggml.bos_token_id u32 = 151643
70
+ llama_model_loader: - kv 32: tokenizer.ggml.add_bos_token bool = false
71
+ llama_model_loader: - kv 33: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
72
+ llama_model_loader: - kv 34: general.quantization_version u32 = 2
73
+ llama_model_loader: - type f32: 181 tensors
74
+ llama_model_loader: - type q4_K: 216 tensors
75
+ llama_model_loader: - type q6_K: 37 tensors
76
+ print_info: file format = GGUF V3 (latest)
77
+ print_info: file type = Q4_K - Medium
78
+ print_info: file size = 1.79 GiB (4.99 BPW)
79
+ init_tokenizer: initializing tokenizer for type 2
80
+ load: control token: 151660 '<|fim_middle|>' is not marked as EOG
81
+ load: control token: 151659 '<|fim_prefix|>' is not marked as EOG
82
+ load: control token: 151653 '<|vision_end|>' is not marked as EOG
83
+ load: control token: 151648 '<|box_start|>' is not marked as EOG
84
+ load: control token: 151646 '<|object_ref_start|>' is not marked as EOG
85
+ load: control token: 151649 '<|box_end|>' is not marked as EOG
86
+ load: control token: 151655 '<|image_pad|>' is not marked as EOG
87
+ load: control token: 151651 '<|quad_end|>' is not marked as EOG
88
+ load: control token: 151647 '<|object_ref_end|>' is not marked as EOG
89
+ load: control token: 151652 '<|vision_start|>' is not marked as EOG
90
+ load: control token: 151654 '<|vision_pad|>' is not marked as EOG
91
+ load: control token: 151656 '<|video_pad|>' is not marked as EOG
92
+ load: control token: 151644 '<|im_start|>' is not marked as EOG
93
+ load: control token: 151661 '<|fim_suffix|>' is not marked as EOG
94
+ load: control token: 151650 '<|quad_start|>' is not marked as EOG
95
+ load: special tokens cache size = 22
96
+ load: token to piece cache size = 0.9310 MB
97
+ print_info: arch = qwen2
98
+ print_info: vocab_only = 1
99
+ print_info: model type = ?B
100
+ print_info: model params = 3.09 B
101
+ print_info: general.name = Qwen2.5 3B Instruct
102
+ print_info: vocab type = BPE
103
+ print_info: n_vocab = 151936
104
+ print_info: n_merges = 151387
105
+ print_info: BOS token = 151643 '<|endoftext|>'
106
+ print_info: EOS token = 151645 '<|im_end|>'
107
+ print_info: EOT token = 151645 '<|im_end|>'
108
+ print_info: PAD token = 151643 '<|endoftext|>'
109
+ print_info: LF token = 198 'Ċ'
110
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
111
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
112
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
113
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
114
+ print_info: FIM REP token = 151663 '<|repo_name|>'
115
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
116
+ print_info: EOG token = 151643 '<|endoftext|>'
117
+ print_info: EOG token = 151645 '<|im_end|>'
118
+ print_info: EOG token = 151662 '<|fim_pad|>'
119
+ print_info: EOG token = 151663 '<|repo_name|>'
120
+ print_info: EOG token = 151664 '<|file_sep|>'
121
+ print_info: max token length = 256
122
+ llama_model_load: vocab only - skipping tensors
123
+ time=2025-08-09T22:41:51.857+02:00 level=DEBUG source=gpu.go:695 msg="no filter required for library cpu"
124
+ time=2025-08-09T22:41:51.857+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="/usr/local/bin/ollama runner --model /home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 --ctx-size 4096 --batch-size 512 --threads 2 --no-mmap --parallel 1 --port 42013"
125
+ time=2025-08-09T22:41:51.857+02:00 level=DEBUG source=server.go:439 msg=subprocess OLLAMA_DEBUG=1 PATH=/home/rkonan/miniconda3/bin:/home/rkonan/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/snap/bin:/home/rkonan/.local/bin:/home/rkonan/.local/bin OLLAMA_MAX_LOADED_MODELS=3 OLLAMA_LIBRARY_PATH=/usr/local/lib/ollama LD_LIBRARY_PATH=/usr/local/lib/ollama:/usr/local/lib/ollama
126
+ time=2025-08-09T22:41:51.857+02:00 level=INFO source=sched.go:481 msg="loaded runners" count=1
127
+ time=2025-08-09T22:41:51.858+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
128
+ time=2025-08-09T22:41:51.858+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server not responding"
129
+ time=2025-08-09T22:41:51.877+02:00 level=INFO source=runner.go:815 msg="starting go runner"
130
+ time=2025-08-09T22:41:51.878+02:00 level=DEBUG source=ggml.go:94 msg="ggml backend load all from path" path=/usr/local/lib/ollama
131
+ load_backend: loaded CPU backend from /usr/local/lib/ollama/libggml-cpu-haswell.so
132
+ time=2025-08-09T22:41:51.892+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 compiler=cgo(gcc)
133
+ time=2025-08-09T22:41:51.892+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:42013"
134
+ llama_model_loader: loaded meta data with 35 key-value pairs and 434 tensors from /home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 (version GGUF V3 (latest))
135
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
136
+ llama_model_loader: - kv 0: general.architecture str = qwen2
137
+ llama_model_loader: - kv 1: general.type str = model
138
+ llama_model_loader: - kv 2: general.name str = Qwen2.5 3B Instruct
139
+ llama_model_loader: - kv 3: general.finetune str = Instruct
140
+ llama_model_loader: - kv 4: general.basename str = Qwen2.5
141
+ llama_model_loader: - kv 5: general.size_label str = 3B
142
+ llama_model_loader: - kv 6: general.license str = other
143
+ llama_model_loader: - kv 7: general.license.name str = qwen-research
144
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen2.5-3...
145
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
146
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen2.5 3B
147
+ llama_model_loader: - kv 11: general.base_model.0.organization str = Qwen
148
+ llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen2.5-3B
149
+ llama_model_loader: - kv 13: general.tags arr[str,2] = ["chat", "text-generation"]
150
+ llama_model_loader: - kv 14: general.languages arr[str,1] = ["en"]
151
+ llama_model_loader: - kv 15: qwen2.block_count u32 = 36
152
+ llama_model_loader: - kv 16: qwen2.context_length u32 = 32768
153
+ llama_model_loader: - kv 17: qwen2.embedding_length u32 = 2048
154
+ llama_model_loader: - kv 18: qwen2.feed_forward_length u32 = 11008
155
+ llama_model_loader: - kv 19: qwen2.attention.head_count u32 = 16
156
+ llama_model_loader: - kv 20: qwen2.attention.head_count_kv u32 = 2
157
+ llama_model_loader: - kv 21: qwen2.rope.freq_base f32 = 1000000.000000
158
+ llama_model_loader: - kv 22: qwen2.attention.layer_norm_rms_epsilon f32 = 0.000001
159
+ llama_model_loader: - kv 23: general.file_type u32 = 15
160
+ llama_model_loader: - kv 24: tokenizer.ggml.model str = gpt2
161
+ llama_model_loader: - kv 25: tokenizer.ggml.pre str = qwen2
162
+ llama_model_loader: - kv 26: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
163
+ llama_model_loader: - kv 27: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
164
+ llama_model_loader: - kv 28: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
165
+ llama_model_loader: - kv 29: tokenizer.ggml.eos_token_id u32 = 151645
166
+ llama_model_loader: - kv 30: tokenizer.ggml.padding_token_id u32 = 151643
167
+ llama_model_loader: - kv 31: tokenizer.ggml.bos_token_id u32 = 151643
168
+ llama_model_loader: - kv 32: tokenizer.ggml.add_bos_token bool = false
169
+ llama_model_loader: - kv 33: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
170
+ llama_model_loader: - kv 34: general.quantization_version u32 = 2
171
+ llama_model_loader: - type f32: 181 tensors
172
+ llama_model_loader: - type q4_K: 216 tensors
173
+ llama_model_loader: - type q6_K: 37 tensors
174
+ print_info: file format = GGUF V3 (latest)
175
+ print_info: file type = Q4_K - Medium
176
+ print_info: file size = 1.79 GiB (4.99 BPW)
177
+ time=2025-08-09T22:41:52.110+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
178
+ init_tokenizer: initializing tokenizer for type 2
179
+ load: control token: 151660 '<|fim_middle|>' is not marked as EOG
180
+ load: control token: 151659 '<|fim_prefix|>' is not marked as EOG
181
+ load: control token: 151653 '<|vision_end|>' is not marked as EOG
182
+ load: control token: 151648 '<|box_start|>' is not marked as EOG
183
+ load: control token: 151646 '<|object_ref_start|>' is not marked as EOG
184
+ load: control token: 151649 '<|box_end|>' is not marked as EOG
185
+ load: control token: 151655 '<|image_pad|>' is not marked as EOG
186
+ load: control token: 151651 '<|quad_end|>' is not marked as EOG
187
+ load: control token: 151647 '<|object_ref_end|>' is not marked as EOG
188
+ load: control token: 151652 '<|vision_start|>' is not marked as EOG
189
+ load: control token: 151654 '<|vision_pad|>' is not marked as EOG
190
+ load: control token: 151656 '<|video_pad|>' is not marked as EOG
191
+ load: control token: 151644 '<|im_start|>' is not marked as EOG
192
+ load: control token: 151661 '<|fim_suffix|>' is not marked as EOG
193
+ load: control token: 151650 '<|quad_start|>' is not marked as EOG
194
+ load: special tokens cache size = 22
195
+ load: token to piece cache size = 0.9310 MB
196
+ print_info: arch = qwen2
197
+ print_info: vocab_only = 0
198
+ print_info: n_ctx_train = 32768
199
+ print_info: n_embd = 2048
200
+ print_info: n_layer = 36
201
+ print_info: n_head = 16
202
+ print_info: n_head_kv = 2
203
+ print_info: n_rot = 128
204
+ print_info: n_swa = 0
205
+ print_info: n_swa_pattern = 1
206
+ print_info: n_embd_head_k = 128
207
+ print_info: n_embd_head_v = 128
208
+ print_info: n_gqa = 8
209
+ print_info: n_embd_k_gqa = 256
210
+ print_info: n_embd_v_gqa = 256
211
+ print_info: f_norm_eps = 0.0e+00
212
+ print_info: f_norm_rms_eps = 1.0e-06
213
+ print_info: f_clamp_kqv = 0.0e+00
214
+ print_info: f_max_alibi_bias = 0.0e+00
215
+ print_info: f_logit_scale = 0.0e+00
216
+ print_info: f_attn_scale = 0.0e+00
217
+ print_info: n_ff = 11008
218
+ print_info: n_expert = 0
219
+ print_info: n_expert_used = 0
220
+ print_info: causal attn = 1
221
+ print_info: pooling type = -1
222
+ print_info: rope type = 2
223
+ print_info: rope scaling = linear
224
+ print_info: freq_base_train = 1000000.0
225
+ print_info: freq_scale_train = 1
226
+ print_info: n_ctx_orig_yarn = 32768
227
+ print_info: rope_finetuned = unknown
228
+ print_info: ssm_d_conv = 0
229
+ print_info: ssm_d_inner = 0
230
+ print_info: ssm_d_state = 0
231
+ print_info: ssm_dt_rank = 0
232
+ print_info: ssm_dt_b_c_rms = 0
233
+ print_info: model type = 3B
234
+ print_info: model params = 3.09 B
235
+ print_info: general.name = Qwen2.5 3B Instruct
236
+ print_info: vocab type = BPE
237
+ print_info: n_vocab = 151936
238
+ print_info: n_merges = 151387
239
+ print_info: BOS token = 151643 '<|endoftext|>'
240
+ print_info: EOS token = 151645 '<|im_end|>'
241
+ print_info: EOT token = 151645 '<|im_end|>'
242
+ print_info: PAD token = 151643 '<|endoftext|>'
243
+ print_info: LF token = 198 'Ċ'
244
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
245
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
246
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
247
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
248
+ print_info: FIM REP token = 151663 '<|repo_name|>'
249
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
250
+ print_info: EOG token = 151643 '<|endoftext|>'
251
+ print_info: EOG token = 151645 '<|im_end|>'
252
+ print_info: EOG token = 151662 '<|fim_pad|>'
253
+ print_info: EOG token = 151663 '<|repo_name|>'
254
+ print_info: EOG token = 151664 '<|file_sep|>'
255
+ print_info: max token length = 256
256
+ load_tensors: loading model tensors, this can take a while... (mmap = false)
257
+ load_tensors: layer 0 assigned to device CPU, is_swa = 0
258
+ load_tensors: layer 1 assigned to device CPU, is_swa = 0
259
+ load_tensors: layer 2 assigned to device CPU, is_swa = 0
260
+ load_tensors: layer 3 assigned to device CPU, is_swa = 0
261
+ load_tensors: layer 4 assigned to device CPU, is_swa = 0
262
+ load_tensors: layer 5 assigned to device CPU, is_swa = 0
263
+ load_tensors: layer 6 assigned to device CPU, is_swa = 0
264
+ load_tensors: layer 7 assigned to device CPU, is_swa = 0
265
+ load_tensors: layer 8 assigned to device CPU, is_swa = 0
266
+ load_tensors: layer 9 assigned to device CPU, is_swa = 0
267
+ load_tensors: layer 10 assigned to device CPU, is_swa = 0
268
+ load_tensors: layer 11 assigned to device CPU, is_swa = 0
269
+ load_tensors: layer 12 assigned to device CPU, is_swa = 0
270
+ load_tensors: layer 13 assigned to device CPU, is_swa = 0
271
+ load_tensors: layer 14 assigned to device CPU, is_swa = 0
272
+ load_tensors: layer 15 assigned to device CPU, is_swa = 0
273
+ load_tensors: layer 16 assigned to device CPU, is_swa = 0
274
+ load_tensors: layer 17 assigned to device CPU, is_swa = 0
275
+ load_tensors: layer 18 assigned to device CPU, is_swa = 0
276
+ load_tensors: layer 19 assigned to device CPU, is_swa = 0
277
+ load_tensors: layer 20 assigned to device CPU, is_swa = 0
278
+ load_tensors: layer 21 assigned to device CPU, is_swa = 0
279
+ load_tensors: layer 22 assigned to device CPU, is_swa = 0
280
+ load_tensors: layer 23 assigned to device CPU, is_swa = 0
281
+ load_tensors: layer 24 assigned to device CPU, is_swa = 0
282
+ load_tensors: layer 25 assigned to device CPU, is_swa = 0
283
+ load_tensors: layer 26 assigned to device CPU, is_swa = 0
284
+ load_tensors: layer 27 assigned to device CPU, is_swa = 0
285
+ load_tensors: layer 28 assigned to device CPU, is_swa = 0
286
+ load_tensors: layer 29 assigned to device CPU, is_swa = 0
287
+ load_tensors: layer 30 assigned to device CPU, is_swa = 0
288
+ load_tensors: layer 31 assigned to device CPU, is_swa = 0
289
+ load_tensors: layer 32 assigned to device CPU, is_swa = 0
290
+ load_tensors: layer 33 assigned to device CPU, is_swa = 0
291
+ load_tensors: layer 34 assigned to device CPU, is_swa = 0
292
+ load_tensors: layer 35 assigned to device CPU, is_swa = 0
293
+ load_tensors: layer 36 assigned to device CPU, is_swa = 0
294
+ load_tensors: CPU model buffer size = 1834.82 MiB
295
+ load_all_data: no device found for buffer type CPU for async uploads
296
+ time=2025-08-09T22:41:52.865+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.28"
297
+ time=2025-08-09T22:41:53.116+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.50"
298
+ time=2025-08-09T22:41:53.366+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.72"
299
+ time=2025-08-09T22:41:53.617+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.97"
300
+ llama_context: constructing llama_context
301
+ llama_context: n_seq_max = 1
302
+ llama_context: n_ctx = 4096
303
+ llama_context: n_ctx_per_seq = 4096
304
+ llama_context: n_batch = 512
305
+ llama_context: n_ubatch = 512
306
+ llama_context: causal_attn = 1
307
+ llama_context: flash_attn = 0
308
+ llama_context: freq_base = 1000000.0
309
+ llama_context: freq_scale = 1
310
+ llama_context: n_ctx_per_seq (4096) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
311
+ set_abort_callback: call
312
+ llama_context: CPU output buffer size = 0.59 MiB
313
+ create_memory: n_ctx = 4096 (padded)
314
+ llama_kv_cache_unified: kv_size = 4096, type_k = 'f16', type_v = 'f16', n_layer = 36, can_shift = 1, padding = 32
315
+ llama_kv_cache_unified: layer 0: dev = CPU
316
+ llama_kv_cache_unified: layer 1: dev = CPU
317
+ llama_kv_cache_unified: layer 2: dev = CPU
318
+ llama_kv_cache_unified: layer 3: dev = CPU
319
+ llama_kv_cache_unified: layer 4: dev = CPU
320
+ llama_kv_cache_unified: layer 5: dev = CPU
321
+ llama_kv_cache_unified: layer 6: dev = CPU
322
+ llama_kv_cache_unified: layer 7: dev = CPU
323
+ llama_kv_cache_unified: layer 8: dev = CPU
324
+ llama_kv_cache_unified: layer 9: dev = CPU
325
+ llama_kv_cache_unified: layer 10: dev = CPU
326
+ llama_kv_cache_unified: layer 11: dev = CPU
327
+ llama_kv_cache_unified: layer 12: dev = CPU
328
+ llama_kv_cache_unified: layer 13: dev = CPU
329
+ llama_kv_cache_unified: layer 14: dev = CPU
330
+ llama_kv_cache_unified: layer 15: dev = CPU
331
+ llama_kv_cache_unified: layer 16: dev = CPU
332
+ llama_kv_cache_unified: layer 17: dev = CPU
333
+ llama_kv_cache_unified: layer 18: dev = CPU
334
+ llama_kv_cache_unified: layer 19: dev = CPU
335
+ llama_kv_cache_unified: layer 20: dev = CPU
336
+ llama_kv_cache_unified: layer 21: dev = CPU
337
+ llama_kv_cache_unified: layer 22: dev = CPU
338
+ llama_kv_cache_unified: layer 23: dev = CPU
339
+ llama_kv_cache_unified: layer 24: dev = CPU
340
+ llama_kv_cache_unified: layer 25: dev = CPU
341
+ llama_kv_cache_unified: layer 26: dev = CPU
342
+ llama_kv_cache_unified: layer 27: dev = CPU
343
+ llama_kv_cache_unified: layer 28: dev = CPU
344
+ llama_kv_cache_unified: layer 29: dev = CPU
345
+ llama_kv_cache_unified: layer 30: dev = CPU
346
+ llama_kv_cache_unified: layer 31: dev = CPU
347
+ llama_kv_cache_unified: layer 32: dev = CPU
348
+ llama_kv_cache_unified: layer 33: dev = CPU
349
+ llama_kv_cache_unified: layer 34: dev = CPU
350
+ llama_kv_cache_unified: layer 35: dev = CPU
351
+ llama_kv_cache_unified: CPU KV buffer size = 144.00 MiB
352
+ llama_kv_cache_unified: KV self size = 144.00 MiB, K (f16): 72.00 MiB, V (f16): 72.00 MiB
353
+ llama_context: enumerating backends
354
+ llama_context: backend_ptrs.size() = 1
355
+ llama_context: max_nodes = 65536
356
+ llama_context: worst-case: n_tokens = 512, n_seqs = 1, n_outputs = 0
357
+ llama_context: reserving graph for n_tokens = 512, n_seqs = 1
358
+ llama_context: reserving graph for n_tokens = 1, n_seqs = 1
359
+ llama_context: reserving graph for n_tokens = 512, n_seqs = 1
360
+ llama_context: CPU compute buffer size = 300.75 MiB
361
+ llama_context: graph nodes = 1338
362
+ llama_context: graph splits = 1
363
+ time=2025-08-09T22:41:53.869+02:00 level=INFO source=server.go:637 msg="llama runner started in 2.01 seconds"
364
+ time=2025-08-09T22:41:53.869+02:00 level=DEBUG source=sched.go:493 msg="finished setting up" runner.name=registry.ollama.ai/library/qwen2.5:3b-instruct-q4_K_M runner.inference=cpu runner.devices=1 runner.size="2.3 GiB" runner.vram="0 B" runner.parallel=1 runner.pid=213592 runner.model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 runner.num_ctx=4096
365
+ time=2025-08-09T22:41:53.870+02:00 level=DEBUG source=server.go:736 msg="completion request" images=0 prompt=155 format=""
366
+ time=2025-08-09T22:41:53.880+02:00 level=DEBUG source=cache.go:104 msg="loading cache slot" id=0 cache=0 prompt=31 used=0 remaining=31
367
+ [GIN] 2025/08/09 - 22:41:58 | 200 | 7.284707733s | 127.0.0.1 | POST "/api/generate"
368
+ time=2025-08-09T22:41:58.513+02:00 level=DEBUG source=sched.go:501 msg="context for request finished"
369
+ time=2025-08-09T22:41:58.513+02:00 level=DEBUG source=sched.go:341 msg="runner with non-zero duration has gone idle, adding timer" runner.name=registry.ollama.ai/library/qwen2.5:3b-instruct-q4_K_M runner.inference=cpu runner.devices=1 runner.size="2.3 GiB" runner.vram="0 B" runner.parallel=1 runner.pid=213592 runner.model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 runner.num_ctx=4096 duration=5m0s
370
+ time=2025-08-09T22:41:58.513+02:00 level=DEBUG source=sched.go:359 msg="after processing request finished event" runner.name=registry.ollama.ai/library/qwen2.5:3b-instruct-q4_K_M runner.inference=cpu runner.devices=1 runner.size="2.3 GiB" runner.vram="0 B" runner.parallel=1 runner.pid=213592 runner.model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 runner.num_ctx=4096 refCount=0
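For reference, the run above boils down to a single non-streaming completion against the local Ollama server: no compatible GPU is found, qwen2.5:3b-instruct-q4_K_M is loaded on CPU with a 4096-token context, and one POST /api/generate is answered in about 7 s. A minimal sketch of the client side of such a request is shown below; the prompt and option values are illustrative, and it only assumes an Ollama server listening on 127.0.0.1:11434 with that model already pulled.

import requests

# Minimal sketch: one non-streaming completion against the local Ollama server
# seen in the log above. The prompt is illustrative; num_ctx / num_thread mirror
# the --ctx-size 4096 / --threads 2 values reported by the runner.
resp = requests.post(
    "http://127.0.0.1:11434/api/generate",
    json={
        "model": "qwen2.5:3b-instruct-q4_K_M",
        "prompt": "Bonjour, peux-tu te présenter en une phrase ?",
        "stream": False,
        "options": {"num_ctx": 4096, "num_thread": 2},
    },
    timeout=300,
)
resp.raise_for_status()
print(resp.json()["response"])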
logs ADDED
@@ -0,0 +1,374 @@
1
+
2
+ rkonan@rkonan-ThinkPad-T460:~/chatbot-project/models$ OLLAMA_DEBUG=1 ollama serve
3
+ time=2025-08-09T19:38:55.291+02:00 level=INFO source=routes.go:1304 msg="server config" env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_CONTEXT_LENGTH:4096 OLLAMA_DEBUG:DEBUG OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_KV_CACHE_TYPE: OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/home/rkonan/.ollama/models OLLAMA_MULTIUSER_CACHE:false OLLAMA_NEW_ENGINE:false OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:1 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://* vscode-webview://* vscode-file://*] OLLAMA_SCHED_SPREAD:false ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
4
+ time=2025-08-09T19:38:55.293+02:00 level=INFO source=images.go:477 msg="total blobs: 9"
5
+ time=2025-08-09T19:38:55.295+02:00 level=INFO source=images.go:484 msg="total unused blobs removed: 0"
6
+ time=2025-08-09T19:38:55.297+02:00 level=INFO source=routes.go:1357 msg="Listening on 127.0.0.1:11434 (version 0.11.4)"
7
+ time=2025-08-09T19:38:55.298+02:00 level=DEBUG source=sched.go:106 msg="starting llm scheduler"
8
+ time=2025-08-09T19:38:55.299+02:00 level=INFO source=gpu.go:217 msg="looking for compatible GPUs"
9
+ time=2025-08-09T19:38:55.304+02:00 level=DEBUG source=gpu.go:98 msg="searching for GPU discovery libraries for NVIDIA"
10
+ time=2025-08-09T19:38:55.305+02:00 level=DEBUG source=gpu.go:501 msg="Searching for GPU library" name=libcuda.so*
11
+ time=2025-08-09T19:38:55.307+02:00 level=DEBUG source=gpu.go:525 msg="gpu library search" globs="[/usr/local/lib/ollama/libcuda.so* /home/rkonan/chatbot-project/models/libcuda.so* /usr/local/cuda*/targets/*/lib/libcuda.so* /usr/lib/*-linux-gnu/nvidia/current/libcuda.so* /usr/lib/*-linux-gnu/libcuda.so* /usr/lib/wsl/lib/libcuda.so* /usr/lib/wsl/drivers/*/libcuda.so* /opt/cuda/lib*/libcuda.so* /usr/local/cuda/lib*/libcuda.so* /usr/lib*/libcuda.so* /usr/local/lib*/libcuda.so*]"
12
+ time=2025-08-09T19:38:55.319+02:00 level=DEBUG source=gpu.go:558 msg="discovered GPU libraries" paths=[]
13
+ time=2025-08-09T19:38:55.323+02:00 level=DEBUG source=gpu.go:501 msg="Searching for GPU library" name=libcudart.so*
14
+ time=2025-08-09T19:38:55.323+02:00 level=DEBUG source=gpu.go:525 msg="gpu library search" globs="[/usr/local/lib/ollama/libcudart.so* /home/rkonan/chatbot-project/models/libcudart.so* /usr/local/lib/ollama/cuda_v*/libcudart.so* /usr/local/cuda/lib64/libcudart.so* /usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so* /usr/lib/x86_64-linux-gnu/libcudart.so* /usr/lib/wsl/lib/libcudart.so* /usr/lib/wsl/drivers/*/libcudart.so* /opt/cuda/lib64/libcudart.so* /usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so* /usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so* /usr/lib/aarch64-linux-gnu/libcudart.so* /usr/local/cuda/lib*/libcudart.so* /usr/lib*/libcudart.so* /usr/local/lib*/libcudart.so*]"
15
+ time=2025-08-09T19:38:55.334+02:00 level=DEBUG source=gpu.go:558 msg="discovered GPU libraries" paths=[/usr/local/lib/ollama/libcudart.so.12.8.90]
16
+ cudaSetDevice err: 35
17
+ time=2025-08-09T19:38:55.335+02:00 level=DEBUG source=gpu.go:574 msg="Unable to load cudart library /usr/local/lib/ollama/libcudart.so.12.8.90: your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama"
18
+ time=2025-08-09T19:38:55.335+02:00 level=DEBUG source=amd_linux.go:419 msg="amdgpu driver not detected /sys/module/amdgpu"
19
+ time=2025-08-09T19:38:55.336+02:00 level=INFO source=gpu.go:377 msg="no compatible GPUs were discovered"
20
+ time=2025-08-09T19:38:55.336+02:00 level=INFO source=types.go:130 msg="inference compute" id=0 library=cpu variant="" compute="" driver=0.0 name="" total="15.5 GiB" available="11.3 GiB"
21
+ time=2025-08-09T19:38:55.336+02:00 level=INFO source=routes.go:1398 msg="entering low vram mode" "total vram"="15.5 GiB" threshold="20.0 GiB"
22
+ [GIN] 2025/08/09 - 19:39:06 | 200 | 174.244µs | 127.0.0.1 | HEAD "/"
23
+ time=2025-08-09T19:39:06.454+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=general.alignment default=32
24
+ [GIN] 2025/08/09 - 19:39:06 | 200 | 141.99526ms | 127.0.0.1 | POST "/api/show"
25
+ time=2025-08-09T19:39:06.555+02:00 level=DEBUG source=gpu.go:391 msg="updating system memory data" before.total="15.5 GiB" before.free="11.3 GiB" before.free_swap="2.0 GiB" now.total="15.5 GiB" now.free="11.3 GiB" now.free_swap="2.0 GiB"
26
+ time=2025-08-09T19:39:06.555+02:00 level=DEBUG source=sched.go:183 msg="updating default concurrency" OLLAMA_MAX_LOADED_MODELS=3 gpu_count=1
27
+ time=2025-08-09T19:39:06.589+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=general.alignment default=32
28
+ time=2025-08-09T19:39:06.686+02:00 level=DEBUG source=sched.go:213 msg="cpu mode with first model, loading"
29
+ time=2025-08-09T19:39:06.686+02:00 level=DEBUG source=gpu.go:391 msg="updating system memory data" before.total="15.5 GiB" before.free="11.3 GiB" before.free_swap="2.0 GiB" now.total="15.5 GiB" now.free="11.3 GiB" now.free_swap="2.0 GiB"
30
+ time=2025-08-09T19:39:06.686+02:00 level=INFO source=server.go:135 msg="system memory" total="15.5 GiB" free="11.3 GiB" free_swap="2.0 GiB"
31
+ time=2025-08-09T19:39:06.686+02:00 level=DEBUG source=memory.go:111 msg=evaluating library=cpu gpu_count=1 available="[11.3 GiB]"
32
+ time=2025-08-09T19:39:06.686+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=qwen2.vision.block_count default=0
33
+ time=2025-08-09T19:39:06.687+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=qwen2.attention.key_length default=128
34
+ time=2025-08-09T19:39:06.687+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=qwen2.attention.value_length default=128
35
+ time=2025-08-09T19:39:06.687+02:00 level=INFO source=server.go:175 msg=offload library=cpu layers.requested=-1 layers.model=37 layers.offload=0 layers.split="" memory.available="[11.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="2.3 GiB" memory.required.partial="0 B" memory.required.kv="144.0 MiB" memory.required.allocations="[2.3 GiB]" memory.weights.total="1.8 GiB" memory.weights.repeating="1.6 GiB" memory.weights.nonrepeating="243.4 MiB" memory.graph.full="300.8 MiB" memory.graph.partial="544.2 MiB"
36
+ time=2025-08-09T19:39:06.688+02:00 level=DEBUG source=server.go:291 msg="compatible gpu libraries" compatible=[]
37
+ llama_model_loader: loaded meta data with 35 key-value pairs and 434 tensors from /home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 (version GGUF V3 (latest))
38
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
39
+ llama_model_loader: - kv 0: general.architecture str = qwen2
40
+ llama_model_loader: - kv 1: general.type str = model
41
+ llama_model_loader: - kv 2: general.name str = Qwen2.5 3B Instruct
42
+ llama_model_loader: - kv 3: general.finetune str = Instruct
43
+ llama_model_loader: - kv 4: general.basename str = Qwen2.5
44
+ llama_model_loader: - kv 5: general.size_label str = 3B
45
+ llama_model_loader: - kv 6: general.license str = other
46
+ llama_model_loader: - kv 7: general.license.name str = qwen-research
47
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen2.5-3...
48
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
49
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen2.5 3B
50
+ llama_model_loader: - kv 11: general.base_model.0.organization str = Qwen
51
+ llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen2.5-3B
52
+ llama_model_loader: - kv 13: general.tags arr[str,2] = ["chat", "text-generation"]
53
+ llama_model_loader: - kv 14: general.languages arr[str,1] = ["en"]
54
+ llama_model_loader: - kv 15: qwen2.block_count u32 = 36
55
+ llama_model_loader: - kv 16: qwen2.context_length u32 = 32768
56
+ llama_model_loader: - kv 17: qwen2.embedding_length u32 = 2048
57
+ llama_model_loader: - kv 18: qwen2.feed_forward_length u32 = 11008
58
+ llama_model_loader: - kv 19: qwen2.attention.head_count u32 = 16
59
+ llama_model_loader: - kv 20: qwen2.attention.head_count_kv u32 = 2
60
+ llama_model_loader: - kv 21: qwen2.rope.freq_base f32 = 1000000.000000
61
+ llama_model_loader: - kv 22: qwen2.attention.layer_norm_rms_epsilon f32 = 0.000001
62
+ llama_model_loader: - kv 23: general.file_type u32 = 15
63
+ llama_model_loader: - kv 24: tokenizer.ggml.model str = gpt2
64
+ llama_model_loader: - kv 25: tokenizer.ggml.pre str = qwen2
65
+ llama_model_loader: - kv 26: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
66
+ llama_model_loader: - kv 27: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
67
+ llama_model_loader: - kv 28: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
68
+ llama_model_loader: - kv 29: tokenizer.ggml.eos_token_id u32 = 151645
69
+ llama_model_loader: - kv 30: tokenizer.ggml.padding_token_id u32 = 151643
70
+ llama_model_loader: - kv 31: tokenizer.ggml.bos_token_id u32 = 151643
71
+ llama_model_loader: - kv 32: tokenizer.ggml.add_bos_token bool = false
72
+ llama_model_loader: - kv 33: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
73
+ llama_model_loader: - kv 34: general.quantization_version u32 = 2
74
+ llama_model_loader: - type f32: 181 tensors
75
+ llama_model_loader: - type q4_K: 216 tensors
76
+ llama_model_loader: - type q6_K: 37 tensors
77
+ print_info: file format = GGUF V3 (latest)
78
+ print_info: file type = Q4_K - Medium
79
+ print_info: file size = 1.79 GiB (4.99 BPW)
80
+ init_tokenizer: initializing tokenizer for type 2
81
+ load: control token: 151660 '<|fim_middle|>' is not marked as EOG
82
+ load: control token: 151659 '<|fim_prefix|>' is not marked as EOG
83
+ load: control token: 151653 '<|vision_end|>' is not marked as EOG
84
+ load: control token: 151648 '<|box_start|>' is not marked as EOG
85
+ load: control token: 151646 '<|object_ref_start|>' is not marked as EOG
86
+ load: control token: 151649 '<|box_end|>' is not marked as EOG
87
+ load: control token: 151655 '<|image_pad|>' is not marked as EOG
88
+ load: control token: 151651 '<|quad_end|>' is not marked as EOG
89
+ load: control token: 151647 '<|object_ref_end|>' is not marked as EOG
90
+ load: control token: 151652 '<|vision_start|>' is not marked as EOG
91
+ load: control token: 151654 '<|vision_pad|>' is not marked as EOG
92
+ load: control token: 151656 '<|video_pad|>' is not marked as EOG
93
+ load: control token: 151644 '<|im_start|>' is not marked as EOG
94
+ load: control token: 151661 '<|fim_suffix|>' is not marked as EOG
95
+ load: control token: 151650 '<|quad_start|>' is not marked as EOG
96
+ load: special tokens cache size = 22
97
+ load: token to piece cache size = 0.9310 MB
98
+ print_info: arch = qwen2
99
+ print_info: vocab_only = 1
100
+ print_info: model type = ?B
101
+ print_info: model params = 3.09 B
102
+ print_info: general.name = Qwen2.5 3B Instruct
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 151643 '<|endoftext|>'
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151643 '<|endoftext|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ llama_model_load: vocab only - skipping tensors
124
+ time=2025-08-09T19:39:07.278+02:00 level=DEBUG source=gpu.go:695 msg="no filter required for library cpu"
125
+ time=2025-08-09T19:39:07.278+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="/usr/local/bin/ollama runner --model /home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 --ctx-size 4096 --batch-size 512 --threads 2 --no-mmap --parallel 1 --port 43905"
126
+ time=2025-08-09T19:39:07.278+02:00 level=DEBUG source=server.go:439 msg=subprocess OLLAMA_DEBUG=1 PATH=/home/rkonan/miniconda3/bin:/home/rkonan/.local/bin:/home/rkonan/miniconda3/bin:/home/rkonan/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/snap/bin:/home/rkonan/.local/bin:/home/rkonan/.local/bin:/home/rkonan/.local/bin:/home/rkonan/.vscode/extensions/ms-python.debugpy-2025.10.0-linux-x64/bundled/scripts/noConfigScripts:/home/rkonan/.local/bin OLLAMA_MAX_LOADED_MODELS=3 OLLAMA_LIBRARY_PATH=/usr/local/lib/ollama LD_LIBRARY_PATH=/usr/local/lib/ollama:/usr/local/lib/ollama
127
+ time=2025-08-09T19:39:07.279+02:00 level=INFO source=sched.go:481 msg="loaded runners" count=1
128
+ time=2025-08-09T19:39:07.279+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
129
+ time=2025-08-09T19:39:07.279+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server not responding"
130
+ time=2025-08-09T19:39:07.309+02:00 level=INFO source=runner.go:815 msg="starting go runner"
131
+ time=2025-08-09T19:39:07.309+02:00 level=DEBUG source=ggml.go:94 msg="ggml backend load all from path" path=/usr/local/lib/ollama
132
+ load_backend: loaded CPU backend from /usr/local/lib/ollama/libggml-cpu-haswell.so
133
+ time=2025-08-09T19:39:07.336+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 compiler=cgo(gcc)
134
+ time=2025-08-09T19:39:07.338+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:43905"
135
+ llama_model_loader: loaded meta data with 35 key-value pairs and 434 tensors from /home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 (version GGUF V3 (latest))
136
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
137
+ llama_model_loader: - kv 0: general.architecture str = qwen2
138
+ llama_model_loader: - kv 1: general.type str = model
139
+ llama_model_loader: - kv 2: general.name str = Qwen2.5 3B Instruct
140
+ llama_model_loader: - kv 3: general.finetune str = Instruct
141
+ llama_model_loader: - kv 4: general.basename str = Qwen2.5
142
+ llama_model_loader: - kv 5: general.size_label str = 3B
143
+ llama_model_loader: - kv 6: general.license str = other
144
+ llama_model_loader: - kv 7: general.license.name str = qwen-research
145
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen2.5-3...
146
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
147
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen2.5 3B
148
+ llama_model_loader: - kv 11: general.base_model.0.organization str = Qwen
149
+ llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen2.5-3B
150
+ llama_model_loader: - kv 13: general.tags arr[str,2] = ["chat", "text-generation"]
151
+ llama_model_loader: - kv 14: general.languages arr[str,1] = ["en"]
152
+ llama_model_loader: - kv 15: qwen2.block_count u32 = 36
153
+ llama_model_loader: - kv 16: qwen2.context_length u32 = 32768
154
+ llama_model_loader: - kv 17: qwen2.embedding_length u32 = 2048
155
+ llama_model_loader: - kv 18: qwen2.feed_forward_length u32 = 11008
156
+ llama_model_loader: - kv 19: qwen2.attention.head_count u32 = 16
157
+ llama_model_loader: - kv 20: qwen2.attention.head_count_kv u32 = 2
158
+ llama_model_loader: - kv 21: qwen2.rope.freq_base f32 = 1000000.000000
159
+ llama_model_loader: - kv 22: qwen2.attention.layer_norm_rms_epsilon f32 = 0.000001
160
+ llama_model_loader: - kv 23: general.file_type u32 = 15
161
+ llama_model_loader: - kv 24: tokenizer.ggml.model str = gpt2
162
+ llama_model_loader: - kv 25: tokenizer.ggml.pre str = qwen2
163
+ llama_model_loader: - kv 26: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
164
+ llama_model_loader: - kv 27: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
165
+ time=2025-08-09T19:39:07.530+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
166
+ llama_model_loader: - kv 28: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
167
+ llama_model_loader: - kv 29: tokenizer.ggml.eos_token_id u32 = 151645
168
+ llama_model_loader: - kv 30: tokenizer.ggml.padding_token_id u32 = 151643
169
+ llama_model_loader: - kv 31: tokenizer.ggml.bos_token_id u32 = 151643
170
+ llama_model_loader: - kv 32: tokenizer.ggml.add_bos_token bool = false
171
+ llama_model_loader: - kv 33: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
172
+ llama_model_loader: - kv 34: general.quantization_version u32 = 2
173
+ llama_model_loader: - type f32: 181 tensors
174
+ llama_model_loader: - type q4_K: 216 tensors
175
+ llama_model_loader: - type q6_K: 37 tensors
176
+ print_info: file format = GGUF V3 (latest)
177
+ print_info: file type = Q4_K - Medium
178
+ print_info: file size = 1.79 GiB (4.99 BPW)
179
+ init_tokenizer: initializing tokenizer for type 2
180
+ load: control token: 151660 '<|fim_middle|>' is not marked as EOG
181
+ load: control token: 151659 '<|fim_prefix|>' is not marked as EOG
182
+ load: control token: 151653 '<|vision_end|>' is not marked as EOG
183
+ load: control token: 151648 '<|box_start|>' is not marked as EOG
184
+ load: control token: 151646 '<|object_ref_start|>' is not marked as EOG
185
+ load: control token: 151649 '<|box_end|>' is not marked as EOG
186
+ load: control token: 151655 '<|image_pad|>' is not marked as EOG
187
+ load: control token: 151651 '<|quad_end|>' is not marked as EOG
188
+ load: control token: 151647 '<|object_ref_end|>' is not marked as EOG
189
+ load: control token: 151652 '<|vision_start|>' is not marked as EOG
190
+ load: control token: 151654 '<|vision_pad|>' is not marked as EOG
191
+ load: control token: 151656 '<|video_pad|>' is not marked as EOG
192
+ load: control token: 151644 '<|im_start|>' is not marked as EOG
193
+ load: control token: 151661 '<|fim_suffix|>' is not marked as EOG
194
+ load: control token: 151650 '<|quad_start|>' is not marked as EOG
195
+ load: special tokens cache size = 22
196
+ load: token to piece cache size = 0.9310 MB
197
+ print_info: arch = qwen2
198
+ print_info: vocab_only = 0
199
+ print_info: n_ctx_train = 32768
200
+ print_info: n_embd = 2048
201
+ print_info: n_layer = 36
202
+ print_info: n_head = 16
203
+ print_info: n_head_kv = 2
204
+ print_info: n_rot = 128
205
+ print_info: n_swa = 0
206
+ print_info: n_swa_pattern = 1
207
+ print_info: n_embd_head_k = 128
208
+ print_info: n_embd_head_v = 128
209
+ print_info: n_gqa = 8
210
+ print_info: n_embd_k_gqa = 256
211
+ print_info: n_embd_v_gqa = 256
212
+ print_info: f_norm_eps = 0.0e+00
213
+ print_info: f_norm_rms_eps = 1.0e-06
214
+ print_info: f_clamp_kqv = 0.0e+00
215
+ print_info: f_max_alibi_bias = 0.0e+00
216
+ print_info: f_logit_scale = 0.0e+00
217
+ print_info: f_attn_scale = 0.0e+00
218
+ print_info: n_ff = 11008
219
+ print_info: n_expert = 0
220
+ print_info: n_expert_used = 0
221
+ print_info: causal attn = 1
222
+ print_info: pooling type = -1
223
+ print_info: rope type = 2
224
+ print_info: rope scaling = linear
225
+ print_info: freq_base_train = 1000000.0
226
+ print_info: freq_scale_train = 1
227
+ print_info: n_ctx_orig_yarn = 32768
228
+ print_info: rope_finetuned = unknown
229
+ print_info: ssm_d_conv = 0
230
+ print_info: ssm_d_inner = 0
231
+ print_info: ssm_d_state = 0
232
+ print_info: ssm_dt_rank = 0
233
+ print_info: ssm_dt_b_c_rms = 0
234
+ print_info: model type = 3B
235
+ print_info: model params = 3.09 B
236
+ print_info: general.name = Qwen2.5 3B Instruct
237
+ print_info: vocab type = BPE
238
+ print_info: n_vocab = 151936
239
+ print_info: n_merges = 151387
240
+ print_info: BOS token = 151643 '<|endoftext|>'
241
+ print_info: EOS token = 151645 '<|im_end|>'
242
+ print_info: EOT token = 151645 '<|im_end|>'
243
+ print_info: PAD token = 151643 '<|endoftext|>'
244
+ print_info: LF token = 198 'Ċ'
245
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
246
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
247
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
248
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
249
+ print_info: FIM REP token = 151663 '<|repo_name|>'
250
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
251
+ print_info: EOG token = 151643 '<|endoftext|>'
252
+ print_info: EOG token = 151645 '<|im_end|>'
253
+ print_info: EOG token = 151662 '<|fim_pad|>'
254
+ print_info: EOG token = 151663 '<|repo_name|>'
255
+ print_info: EOG token = 151664 '<|file_sep|>'
256
+ print_info: max token length = 256
257
+ load_tensors: loading model tensors, this can take a while... (mmap = false)
258
+ load_tensors: layer 0 assigned to device CPU, is_swa = 0
259
+ load_tensors: layer 1 assigned to device CPU, is_swa = 0
260
+ load_tensors: layer 2 assigned to device CPU, is_swa = 0
261
+ load_tensors: layer 3 assigned to device CPU, is_swa = 0
262
+ load_tensors: layer 4 assigned to device CPU, is_swa = 0
263
+ load_tensors: layer 5 assigned to device CPU, is_swa = 0
264
+ load_tensors: layer 6 assigned to device CPU, is_swa = 0
265
+ load_tensors: layer 7 assigned to device CPU, is_swa = 0
266
+ load_tensors: layer 8 assigned to device CPU, is_swa = 0
267
+ load_tensors: layer 9 assigned to device CPU, is_swa = 0
268
+ load_tensors: layer 10 assigned to device CPU, is_swa = 0
269
+ load_tensors: layer 11 assigned to device CPU, is_swa = 0
270
+ load_tensors: layer 12 assigned to device CPU, is_swa = 0
271
+ load_tensors: layer 13 assigned to device CPU, is_swa = 0
272
+ load_tensors: layer 14 assigned to device CPU, is_swa = 0
273
+ load_tensors: layer 15 assigned to device CPU, is_swa = 0
274
+ load_tensors: layer 16 assigned to device CPU, is_swa = 0
275
+ load_tensors: layer 17 assigned to device CPU, is_swa = 0
276
+ load_tensors: layer 18 assigned to device CPU, is_swa = 0
277
+ load_tensors: layer 19 assigned to device CPU, is_swa = 0
278
+ load_tensors: layer 20 assigned to device CPU, is_swa = 0
279
+ load_tensors: layer 21 assigned to device CPU, is_swa = 0
280
+ load_tensors: layer 22 assigned to device CPU, is_swa = 0
281
+ load_tensors: layer 23 assigned to device CPU, is_swa = 0
282
+ load_tensors: layer 24 assigned to device CPU, is_swa = 0
283
+ load_tensors: layer 25 assigned to device CPU, is_swa = 0
284
+ load_tensors: layer 26 assigned to device CPU, is_swa = 0
285
+ load_tensors: layer 27 assigned to device CPU, is_swa = 0
286
+ load_tensors: layer 28 assigned to device CPU, is_swa = 0
287
+ load_tensors: layer 29 assigned to device CPU, is_swa = 0
288
+ load_tensors: layer 30 assigned to device CPU, is_swa = 0
289
+ load_tensors: layer 31 assigned to device CPU, is_swa = 0
290
+ load_tensors: layer 32 assigned to device CPU, is_swa = 0
291
+ load_tensors: layer 33 assigned to device CPU, is_swa = 0
292
+ load_tensors: layer 34 assigned to device CPU, is_swa = 0
293
+ load_tensors: layer 35 assigned to device CPU, is_swa = 0
294
+ load_tensors: layer 36 assigned to device CPU, is_swa = 0
295
+ load_tensors: CPU model buffer size = 1834.82 MiB
296
+ load_all_data: no device found for buffer type CPU for async uploads
297
+ time=2025-08-09T19:39:08.284+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.19"
298
+ time=2025-08-09T19:39:08.538+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.32"
299
+ time=2025-08-09T19:39:08.791+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.42"
300
+ time=2025-08-09T19:39:09.043+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.52"
301
+ time=2025-08-09T19:39:09.294+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.67"
302
+ time=2025-08-09T19:39:09.545+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.86"
303
+ llama_context: constructing llama_context
304
+ llama_context: n_seq_max = 1
305
+ llama_context: n_ctx = 4096
306
+ llama_context: n_ctx_per_seq = 4096
307
+ llama_context: n_batch = 512
308
+ llama_context: n_ubatch = 512
309
+ llama_context: causal_attn = 1
310
+ llama_context: flash_attn = 0
311
+ llama_context: freq_base = 1000000.0
312
+ llama_context: freq_scale = 1
313
+ llama_context: n_ctx_per_seq (4096) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
314
+ set_abort_callback: call
315
+ llama_context: CPU output buffer size = 0.59 MiB
316
+ create_memory: n_ctx = 4096 (padded)
317
+ llama_kv_cache_unified: kv_size = 4096, type_k = 'f16', type_v = 'f16', n_layer = 36, can_shift = 1, padding = 32
318
+ llama_kv_cache_unified: layer 0: dev = CPU
319
+ llama_kv_cache_unified: layer 1: dev = CPU
320
+ llama_kv_cache_unified: layer 2: dev = CPU
321
+ llama_kv_cache_unified: layer 3: dev = CPU
322
+ llama_kv_cache_unified: layer 4: dev = CPU
323
+ llama_kv_cache_unified: layer 5: dev = CPU
324
+ llama_kv_cache_unified: layer 6: dev = CPU
325
+ llama_kv_cache_unified: layer 7: dev = CPU
326
+ llama_kv_cache_unified: layer 8: dev = CPU
327
+ llama_kv_cache_unified: layer 9: dev = CPU
328
+ llama_kv_cache_unified: layer 10: dev = CPU
329
+ llama_kv_cache_unified: layer 11: dev = CPU
330
+ llama_kv_cache_unified: layer 12: dev = CPU
331
+ llama_kv_cache_unified: layer 13: dev = CPU
332
+ llama_kv_cache_unified: layer 14: dev = CPU
333
+ llama_kv_cache_unified: layer 15: dev = CPU
334
+ llama_kv_cache_unified: layer 16: dev = CPU
335
+ llama_kv_cache_unified: layer 17: dev = CPU
336
+ llama_kv_cache_unified: layer 18: dev = CPU
337
+ llama_kv_cache_unified: layer 19: dev = CPU
338
+ llama_kv_cache_unified: layer 20: dev = CPU
339
+ llama_kv_cache_unified: layer 21: dev = CPU
340
+ llama_kv_cache_unified: layer 22: dev = CPU
341
+ llama_kv_cache_unified: layer 23: dev = CPU
342
+ llama_kv_cache_unified: layer 24: dev = CPU
343
+ llama_kv_cache_unified: layer 25: dev = CPU
344
+ llama_kv_cache_unified: layer 26: dev = CPU
345
+ llama_kv_cache_unified: layer 27: dev = CPU
346
+ llama_kv_cache_unified: layer 28: dev = CPU
347
+ llama_kv_cache_unified: layer 29: dev = CPU
348
+ llama_kv_cache_unified: layer 30: dev = CPU
349
+ llama_kv_cache_unified: layer 31: dev = CPU
350
+ llama_kv_cache_unified: layer 32: dev = CPU
351
+ llama_kv_cache_unified: layer 33: dev = CPU
352
+ llama_kv_cache_unified: layer 34: dev = CPU
353
+ llama_kv_cache_unified: layer 35: dev = CPU
354
+ time=2025-08-09T19:39:09.796+02:00 level=DEBUG source=server.go:643 msg="model load progress 1.00"
355
+ llama_kv_cache_unified: CPU KV buffer size = 144.00 MiB
356
+ llama_kv_cache_unified: KV self size = 144.00 MiB, K (f16): 72.00 MiB, V (f16): 72.00 MiB
357
+ llama_context: enumerating backends
358
+ llama_context: backend_ptrs.size() = 1
359
+ llama_context: max_nodes = 65536
360
+ llama_context: worst-case: n_tokens = 512, n_seqs = 1, n_outputs = 0
361
+ llama_context: reserving graph for n_tokens = 512, n_seqs = 1
362
+ llama_context: reserving graph for n_tokens = 1, n_seqs = 1
363
+ llama_context: reserving graph for n_tokens = 512, n_seqs = 1
364
+ llama_context: CPU compute buffer size = 300.75 MiB
365
+ llama_context: graph nodes = 1338
366
+ llama_context: graph splits = 1
367
+ time=2025-08-09T19:39:10.048+02:00 level=INFO source=server.go:637 msg="llama runner started in 2.77 seconds"
368
+ time=2025-08-09T19:39:10.048+02:00 level=DEBUG source=sched.go:493 msg="finished setting up" runner.name=registry.ollama.ai/library/qwen2.5:3b-instruct-q4_K_M runner.inference=cpu runner.devices=1 runner.size="2.3 GiB" runner.vram="0 B" runner.parallel=1 runner.pid=188113 runner.model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 runner.num_ctx=4096
369
+ time=2025-08-09T19:39:10.048+02:00 level=DEBUG source=server.go:736 msg="completion request" images=0 prompt=155 format=""
370
+ time=2025-08-09T19:39:10.053+02:00 level=DEBUG source=cache.go:104 msg="loading cache slot" id=0 cache=0 prompt=30 used=0 remaining=30
371
+ [GIN] 2025/08/09 - 19:39:18 | 200 | 12.485055582s | 127.0.0.1 | POST "/api/generate"
372
+ time=2025-08-09T19:39:18.953+02:00 level=DEBUG source=sched.go:501 msg="context for request finished"
373
+ time=2025-08-09T19:39:18.953+02:00 level=DEBUG source=sched.go:341 msg="runner with non-zero duration has gone idle, adding timer" runner.name=registry.ollama.ai/library/qwen2.5:3b-instruct-q4_K_M runner.inference=cpu runner.devices=1 runner.size="2.3 GiB" runner.vram="0 B" runner.parallel=1 runner.pid=188113 runner.model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 runner.num_ctx=4096 duration=5m0s
374
+ time=2025-08-09T19:39:18.954+02:00 level=DEBUG source=sched.go:359 msg="after processing request finished event" runner.name=registry.ollama.ai/library/qwen2.5:3b-instruct-q4_K_M runner.inference=cpu runner.devices=1 runner.size="2.3 GiB" runner.vram="0 B" runner.parallel=1 runner.pid=188113 runner.model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 runner.num_ctx=4096 refCount=0
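The timed request in the log above is a plain POST to Ollama's /api/generate endpoint (about 12.5 s on CPU for this 3B Q4 model). A minimal sketch to reproduce such a call outside the app; the model tag is the one seen in the log, the host is the default local endpoint and the prompt is illustrative:

import requests

# Model tag as seen in the log; host is the default local Ollama endpoint.
payload = {
    "model": "qwen2.5:3b-instruct-q4_K_M",
    "prompt": "Bonjour",
    "stream": False,
}
r = requests.post("http://localhost:11434/api/generate", json=payload, timeout=300)
r.raise_for_status()
print(r.json().get("response", ""))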
rag_model_ollama_v1 copy 2.py ADDED
@@ -0,0 +1,194 @@
1
+ import os
2
+ import pickle
3
+ import textwrap
4
+ import logging
5
+ from typing import List, Optional, Dict, Any, Iterable, Tuple
6
+ import requests
7
+ import faiss
8
+ from llama_index.core import VectorStoreIndex
9
+ from llama_index.core.schema import TextNode
10
+ from llama_index.vector_stores.faiss import FaissVectorStore
11
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
12
+ from sentence_transformers.util import cos_sim
13
+ import json
14
+
15
+ # === Logger configuration ===
16
+ logger = logging.getLogger("RAGEngine")
17
+ logger.setLevel(logging.INFO)
18
+ handler = logging.StreamHandler()
19
+ formatter = logging.Formatter("[%(asctime)s] %(levelname)s - %(message)s")
20
+ handler.setFormatter(formatter)
21
+ if not logger.handlers:
22
+ logger.addHandler(handler)
23
+
24
+ MAX_TOKENS = 64
25
+ DEFAULT_STOPS = ["### Réponse:", "\n\n", "###"]
26
+
27
+ class OllamaClient:
28
+ def __init__(self, model: str, host: Optional[str] = None, timeout: int = 300):
29
+ self.model = model
30
+ self.host = host or os.getenv("OLLAMA_HOST", "http://localhost:11434")
31
+ self.timeout = timeout
32
+ self._gen_url = self.host.rstrip("/") + "/api/generate"
33
+
34
+ def generate(self, prompt: str, stop: Optional[List[str]] = None,
35
+ max_tokens: Optional[int] = None, stream: bool = False,
36
+ options: Optional[Dict[str, Any]] = None, raw: bool = False) -> str | Iterable[str]:
37
+ payload: Dict[str, Any] = {
38
+ "model": self.model,
39
+ "prompt": prompt,
40
+ "stream": stream,
41
+ }
42
+ if raw:
43
+ payload["raw"] = True
44
+ if stop:
45
+ payload["stop"] = stop
46
+ if max_tokens is not None:
47
+ payload["num_predict"] = int(max_tokens)
48
+ if options:
49
+ payload["options"] = options
50
+
51
+ logger.debug(f"POST {self._gen_url} (stream={stream})")
52
+
53
+ if stream:
54
+ with requests.post(self._gen_url, json=payload, stream=True, timeout=self.timeout) as r:
55
+ r.raise_for_status()
56
+ for line in r.iter_lines(decode_unicode=True):
57
+ if not line:
58
+ continue
59
+ try:
60
+ data = json.loads(line)
61
+ except Exception:
62
+ continue
63
+ if "response" in data and not data.get("done"):
64
+ yield data["response"]
65
+ if data.get("done"):
66
+ break
67
+ return
68
+
69
+ r = requests.post(self._gen_url, json=payload, timeout=self.timeout)
70
+ r.raise_for_status()
71
+ data = r.json()
72
+ return data.get("response", "")
73
+
74
+
75
+ class RAGEngine:
76
+ def __init__(self, model_name: str, vector_path: str, index_path: str,
77
+ model_threads: int = 4, ollama_host: Optional[str] = None,
78
+ ollama_opts: Optional[Dict[str, Any]] = None):
79
+
80
+ logger.info(f"🔎 rag_model_ollama source: {__file__}")
81
+ logger.info("📦 Initialisation du moteur RAG (Ollama)...")
82
+
83
+ opts = dict(ollama_opts or {})
84
+ opts.setdefault("temperature", 0.0)
85
+ opts.setdefault("num_ctx", 512) # aligné avec CLI par défaut
86
+ opts.setdefault("num_batch", 16)
87
+ if "num_thread" not in opts and model_threads:
88
+ opts["num_thread"] = int(model_threads)
89
+
90
+ self.llm = OllamaClient(model=model_name, host=ollama_host)
91
+ self.ollama_opts = opts
92
+ self.embed_model = HuggingFaceEmbedding(model_name="intfloat/multilingual-e5-base")
93
+
94
+ logger.info(f"📂 Chargement des données vectorielles depuis {vector_path}")
95
+ with open(vector_path, "rb") as f:
96
+ chunk_texts: List[str] = pickle.load(f)
97
+ nodes = [TextNode(text=chunk) for chunk in chunk_texts]
98
+
99
+ faiss_index = faiss.read_index(index_path)
100
+ vector_store = FaissVectorStore(faiss_index=faiss_index)
101
+ self.index = VectorStoreIndex(nodes=nodes, embed_model=self.embed_model, vector_store=vector_store)
102
+
103
+ logger.info("✅ Moteur RAG (Ollama) initialisé avec succès.")
104
+
105
+ # Warmup pour charger le runner comme le CLI
106
+ try:
107
+ logger.info("⚡ Warmup du modèle Ollama...")
108
+ for _ in self._complete_stream("Bonjour", max_tokens=8, raw=False):
109
+ pass
110
+ except Exception as e:
111
+ logger.warning(f"Warmup échoué : {e}")
112
+
113
+ def _complete_stream(self, prompt: str, stop: Optional[List[str]] = None,
114
+ max_tokens: int = MAX_TOKENS, raw: bool = False):
115
+ return self.llm.generate(prompt=prompt, stop=stop, max_tokens=max_tokens,
116
+ stream=True, options=self.ollama_opts, raw=raw)
117
+
118
+ def _complete(self, prompt: str, stop: Optional[List[str]] = None,
119
+ max_tokens: int = 128, raw: bool = False) -> str:
120
+ text = self.llm.generate(prompt=prompt, stop=stop, max_tokens=max_tokens,
121
+ stream=False, options=self.ollama_opts, raw=raw)
122
+ return (text or "").strip()
123
+
124
+ def _is_greeting(self, text: str) -> bool:
125
+ s = text.lower().strip()
126
+ return s in {"bonjour", "salut", "hello", "bonsoir", "hi", "coucou", "yo"} or len(s.split()) <= 2
127
+
128
+ def _decide_mode(self, scores: List[float], tau: float = 0.32, is_greeting: bool = False) -> str:
129
+ if is_greeting:
130
+ return "llm"
131
+ top = scores[0] if scores else 0.0
132
+ return "rag" if top >= tau else "llm"
133
+
134
+ def get_adaptive_top_k(self, question: str) -> int:
135
+ q = question.lower()
136
+ if len(q.split()) <= 7:
137
+ top_k = 8
138
+ elif any(w in q for w in ["liste", "résume", "quels sont", "explique", "comment"]):
139
+ top_k = 10
140
+ else:
141
+ top_k = 8
142
+ logger.info(f"🔢 top_k déterminé automatiquement : {top_k}")
143
+ return top_k
144
+
145
+ def rerank_nodes(self, question: str, retrieved_nodes, top_k: int = 3) -> Tuple[List[float], List[TextNode]]:
146
+ logger.info(f"🔍 Re-ranking des {len(retrieved_nodes)} chunks pour : « {question} »")
147
+ q_emb = self.embed_model.get_query_embedding(question)
148
+ scored_nodes: List[Tuple[float, TextNode]] = []
149
+ for node in retrieved_nodes:
150
+ chunk_emb = self.embed_model.get_text_embedding(node.get_content())
151
+ score = cos_sim(q_emb, chunk_emb).item()
152
+ scored_nodes.append((score, node))
153
+ ranked = sorted(scored_nodes, key=lambda x: x[0], reverse=True)
154
+ top = ranked[:top_k]
155
+ return [s for s, _ in top], [n for _, n in top]
156
+
157
+ def retrieve_context(self, question: str, top_k: int = 3) -> Tuple[str, List[TextNode], List[float]]:
158
+ retriever = self.index.as_retriever(similarity_top_k=top_k)
159
+ retrieved_nodes = retriever.retrieve(question)
160
+ scores, nodes = self.rerank_nodes(question, retrieved_nodes, top_k)
161
+ context = "\n\n".join(n.get_content()[:500] for n in nodes)
162
+ return context, nodes, scores
163
+
164
+ def ask_stream(self, question: str, allow_fallback: bool = False) -> Iterable[str]:
165
+ logger.info(f"💬 [Stream] Question reçue : {question}")
166
+ is_hello = self._is_greeting(question)
167
+ context, scores = "", []
168
+ if not is_hello:
169
+ top_k = self.get_adaptive_top_k(question)
170
+ context, _, scores = self.retrieve_context(question, top_k)
171
+
172
+ mode = self._decide_mode(scores, tau=0.32, is_greeting=is_hello)
173
+ logger.info(f"🧭 Mode choisi (stream) : {mode}")
174
+
175
+ if mode == "rag":
176
+ prompt = (
177
+ "Instruction: Réponds uniquement à partir du contexte. "
178
+ "Si la réponse n'est pas déductible, réponds exactement: \"Information non présente dans le contexte.\""
179
+ f"\n\nContexte :\n{context}\n\nQuestion : {question}\nRéponse :"
180
+ )
181
+ logger.info("📡 Début streaming (RAG)...")
182
+ for token in self._complete_stream(prompt, stop=DEFAULT_STOPS, raw=False):
183
+ yield token
184
+ logger.info("📡 Fin streaming (RAG).")
185
+ return
186
+
187
+ prompt_llm = (
188
+ "Réponds brièvement et précisément en français.\n"
189
+ f"Question : {question}\nRéponse :"
190
+ )
191
+ logger.info("📡 Début streaming (LLM pur)...")
192
+ for token in self._complete_stream(prompt_llm, stop=DEFAULT_STOPS, raw=False):
193
+ yield token
194
+ logger.info("📡 Fin streaming (LLM pur).")
rag_model_ollama_v1 copy.py ADDED
@@ -0,0 +1,382 @@
1
+ import os
2
+ import json
3
+ import pickle
4
+ import textwrap
5
+ import logging
6
+ from typing import List, Optional, Dict, Any, Iterable, Tuple
7
+
8
+ import requests
9
+ import faiss
10
+ import numpy as np
11
+ from llama_index.core import VectorStoreIndex
12
+ from llama_index.core.schema import TextNode
13
+ from llama_index.vector_stores.faiss import FaissVectorStore
14
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
15
+ from sentence_transformers.util import cos_sim
16
+
17
+ # === Logger configuration ===
18
+ logger = logging.getLogger("RAGEngine")
19
+ logger.setLevel(logging.INFO)
20
+ handler = logging.StreamHandler()
21
+ formatter = logging.Formatter("[%(asctime)s] %(levelname)s - %(message)s")
22
+ handler.setFormatter(formatter)
23
+ if not logger.handlers:
24
+ logger.addHandler(handler)
25
+
26
+ MAX_TOKENS = 64 # bornage court sur CPU-only
27
+ DEFAULT_STOPS = ["</s>", "\n\n", "\nQuestion:", "Question:"]
28
+
29
+
30
+ class OllamaClient:
31
+ """
32
+ Minimal Ollama client for /api/generate (text completion) with streaming support.
33
+ """
34
+ def __init__(self, model: str, host: Optional[str] = None, timeout: int = 300):
35
+ self.model = model
36
+ self.host = host or os.getenv("OLLAMA_HOST", "http://localhost:11434")
37
+ self.timeout = timeout
38
+ self._gen_url = self.host.rstrip("/") + "/api/generate"
39
+
40
+ def generate(
41
+ self,
42
+ prompt: str,
43
+ stop: Optional[List[str]] = None,
44
+ max_tokens: Optional[int] = None,
45
+ stream: bool = False,
46
+ options: Optional[Dict[str, Any]] = None,
47
+ raw: bool = False,
48
+ ) -> str | Iterable[str]:
49
+ payload: Dict[str, Any] = {
50
+ "model": self.model,
51
+ "prompt": prompt,
52
+ "stream": stream,
53
+ }
54
+ if raw:
55
+ payload["raw"] = True # IMPORTANT: désactive le template Modelfile
56
+ if stop:
57
+ payload["stop"] = stop
58
+ if max_tokens is not None:
59
+ payload["num_predict"] = int(max_tokens) # nommage Ollama
60
+ if options:
61
+ payload["options"] = options
62
+
63
+ logger.debug(f"POST {self._gen_url} (stream={stream})")
64
+
65
+ if stream:
66
+ with requests.post(self._gen_url, json=payload, stream=True, timeout=self.timeout) as r:
67
+ r.raise_for_status()
68
+ for line in r.iter_lines(decode_unicode=True):
69
+ if not line:
70
+ continue
71
+ try:
72
+ data = json.loads(line)
73
+ except Exception:
74
+ continue
75
+ # En stream, Ollama renvoie des morceaux dans "response"
76
+ if "response" in data and not data.get("done"):
77
+ yield data["response"]
78
+ if data.get("done"):
79
+ break
80
+ return
81
+
82
+ r = requests.post(self._gen_url, json=payload, timeout=self.timeout)
83
+ r.raise_for_status()
84
+ data = r.json()
85
+ return data.get("response", "")
86
+
87
+
88
+ class RAGEngine:
89
+ def __init__(
90
+ self,
91
+ model_name: str,
92
+ vector_path: str,
93
+ index_path: str,
94
+ model_threads: int = 4,
95
+ ollama_host: Optional[str] = None,
96
+ ollama_opts: Optional[Dict[str, Any]] = None,
97
+ ):
98
+ """
99
+ Args:
100
+ model_name: e.g. "noushermes_rag"
101
+ vector_path: pickle file with chunk texts list[str]
102
+ index_path: FAISS index path
103
+ model_threads: forwarded as a hint to Ollama options
104
+ ollama_host: override OLLAMA_HOST (default http://localhost:11434)
105
+ ollama_opts: extra Ollama options (temperature, num_ctx, num_batch, num_thread)
106
+ """
107
+ logger.info(f"🔎 rag_model_ollama source: {__file__}")
108
+ logger.info("📦 Initialisation du moteur RAG (Ollama)...")
109
+
110
+ # Options Ollama (par défaut optimisées CPU)
111
+ opts = dict(ollama_opts or {})
112
+ opts.setdefault("temperature", 0.0)
113
+ opts.setdefault("num_ctx", 512)
114
+ opts.setdefault("num_batch", 16)
115
+ if "num_thread" not in opts and model_threads:
116
+ opts["num_thread"] = int(model_threads)
117
+
118
+ self.llm = OllamaClient(model=model_name, host=ollama_host)
119
+ self.ollama_opts = opts
120
+
121
+ # Embedding model pour retrieval / rerank
122
+ self.embed_model = HuggingFaceEmbedding(model_name="intfloat/multilingual-e5-base")
123
+
124
+ logger.info(f"📂 Chargement des données vectorielles depuis {vector_path}")
125
+ with open(vector_path, "rb") as f:
126
+ chunk_texts: List[str] = pickle.load(f)
127
+ nodes = [TextNode(text=chunk) for chunk in chunk_texts]
128
+
129
+ faiss_index = faiss.read_index(index_path)
130
+ vector_store = FaissVectorStore(faiss_index=faiss_index)
131
+ self.index = VectorStoreIndex(nodes=nodes, embed_model=self.embed_model, vector_store=vector_store)
132
+
133
+ logger.info("✅ Moteur RAG (Ollama) initialisé avec succès.")
134
+
135
+ # ---------------- LLM helpers (via Ollama) ----------------
136
+
137
+ def _complete(
138
+ self,
139
+ prompt: str,
140
+ stop: Optional[List[str]] = None,
141
+ max_tokens: int = MAX_TOKENS,
142
+ raw: bool = True
143
+ ) -> str:
144
+ text = self.llm.generate(
145
+ prompt=prompt,
146
+ stop=stop or DEFAULT_STOPS,
147
+ max_tokens=max_tokens,
148
+ stream=False,
149
+ options=self.ollama_opts,
150
+ raw=raw, # toujours True pour bypass Modelfile
151
+ )
152
+ # Par sécurité si un générateur se glisse quand stream=False
153
+ try:
154
+ if hasattr(text, "__iter__") and not isinstance(text, (str, bytes)):
155
+ chunks = []
156
+ for t in text:
157
+ if not isinstance(t, (str, bytes)):
158
+ continue
159
+ chunks.append(t)
160
+ text = "".join(chunks)
161
+ except Exception:
162
+ pass
163
+ return (text or "").strip()
164
+
165
+ def _complete_stream(
166
+ self,
167
+ prompt: str,
168
+ stop: Optional[List[str]] = None,
169
+ max_tokens: int = MAX_TOKENS,
170
+ raw: bool = True
171
+ ) -> Iterable[str]:
172
+ return self.llm.generate(
173
+ prompt=prompt,
174
+ stop=stop or DEFAULT_STOPS,
175
+ max_tokens=max_tokens,
176
+ stream=True,
177
+ options=self.ollama_opts,
178
+ raw=raw, # toujours True pour bypass Modelfile
179
+ )
180
+
181
+ # ---------------- Utilities ----------------
182
+
183
+ def _is_greeting(self, text: str) -> bool:
184
+ s = text.lower().strip()
185
+ return s in {"bonjour", "salut", "hello", "bonsoir", "hi", "coucou", "yo"} or len(s.split()) <= 2
186
+
187
+ def _decide_mode(self, scores: List[float], tau: float = 0.32, is_greeting: bool = False) -> str:
188
+ if is_greeting:
189
+ return "llm"
190
+ top = scores[0] if scores else 0.0
191
+ return "rag" if top >= tau else "llm"
192
+
193
+ def _stream_with_local_stops(self, tokens: Iterable[str], stops: List[str]) -> Iterable[str]:
194
+ """
195
+ Coupe localement le stream si un stop apparaît, même si le serveur ne s'arrête pas.
196
+ """
197
+ buffer = ""
198
+ for chunk in tokens:
199
+ buffer += chunk
200
+ # Check si un des stops est présent dans le buffer
201
+ hit = None
202
+ for s in stops:
203
+ idx = buffer.find(s)
204
+ if idx != -1:
205
+ hit = (s, idx)
206
+ break
207
+ if hit:
208
+ s, idx = hit
209
+ # Yield tout avant le stop, puis stoppe
210
+ yield buffer[:idx]
211
+ break
212
+ else:
213
+ # Si pas de stop, on envoie le chunk tel quel
214
+ yield chunk
215
+
216
+ # ---------------- Retrieval + (optional) rerank ----------------
217
+
218
+ def get_adaptive_top_k(self, question: str) -> int:
219
+ q = question.lower()
220
+ if len(q.split()) <= 7:
221
+ top_k = 8
222
+ elif any(w in q for w in ["liste", "résume", "quels sont", "explique", "comment"]):
223
+ top_k = 10
224
+ else:
225
+ top_k = 8
226
+ logger.info(f"🔢 top_k déterminé automatiquement : {top_k}")
227
+ return top_k
228
+
229
+ def rerank_nodes(self, question: str, retrieved_nodes, top_k: int = 3) -> Tuple[List[float], List[TextNode]]:
230
+ logger.info(f"🔍 Re-ranking des {len(retrieved_nodes)} chunks pour la question : « {question} »")
231
+ q_emb = self.embed_model.get_query_embedding(question)
232
+ scored_nodes: List[Tuple[float, TextNode]] = []
233
+
234
+ for node in retrieved_nodes:
235
+ chunk_text = node.get_content()
236
+ chunk_emb = self.embed_model.get_text_embedding(chunk_text)
237
+ score = cos_sim(q_emb, chunk_emb).item()
238
+ scored_nodes.append((score, node))
239
+
240
+ ranked = sorted(scored_nodes, key=lambda x: x[0], reverse=True)
241
+
242
+ logger.info("📊 Chunks les plus pertinents :")
243
+ for i, (score, node) in enumerate(ranked[:top_k]):
244
+ chunk_preview = textwrap.shorten(node.get_content().replace("\n", " "), width=100)
245
+ logger.info(f"#{i+1} | Score: {score:.4f} | {chunk_preview}")
246
+
247
+ top = ranked[:top_k]
248
+ scores = [s for s, _ in top]
249
+ nodes = [n for _, n in top]
250
+ return scores, nodes
251
+
252
+ def retrieve_context(self, question: str, top_k: int = 3) -> Tuple[str, List[TextNode], List[float]]:
253
+ logger.info("📥 Récupération du contexte...")
254
+ retriever = self.index.as_retriever(similarity_top_k=top_k)
255
+ retrieved_nodes = retriever.retrieve(question)
256
+ scores, nodes = self.rerank_nodes(question, retrieved_nodes, top_k)
257
+ context = "\n\n".join(n.get_content()[:500] for n in nodes)
258
+ return context, nodes, scores
259
+
260
+ # ---------------- Public API ----------------
261
+
262
+ def ask(self, question_raw: str, allow_fallback: bool = True) -> str:
263
+ logger.info(f"💬 Question reçue : {question_raw}")
264
+ is_hello = self._is_greeting(question_raw)
265
+
266
+ # retrieval (sauf salutations)
267
+ context, scores = "", []
268
+ if not is_hello:
269
+ top_k = self.get_adaptive_top_k(question_raw)
270
+ context, _, scores = self.retrieve_context(question_raw, top_k)
271
+
272
+ # router RAG vs LLM
273
+ mode = self._decide_mode(scores, tau=0.32, is_greeting=is_hello)
274
+ logger.info(f"🧭 Mode choisi : {mode}")
275
+
276
+ if mode == "rag":
277
+ prompt = (
278
+ "Instruction: Réponds uniquement à partir du contexte. "
279
+ "Si la réponse n'est pas déductible, réponds exactement: \"Information non présente dans le contexte.\""
280
+ "\n\nContexte :\n"
281
+ f"{context}\n\n"
282
+ f"Question : {question_raw}\n"
283
+ "Réponse :"
284
+ )
285
+
286
+ resp = self._complete(
287
+ prompt,
288
+ stop=DEFAULT_STOPS,
289
+ max_tokens=MAX_TOKENS,
290
+ raw=True, # ✅ bypass Modelfile/template
291
+ ).strip()
292
+
293
+ # fallback LLM‑pur si le RAG n'a rien trouvé
294
+ if allow_fallback and "Information non présente" in resp:
295
+ logger.info("↪️ Fallback LLM‑pur (hors contexte)")
296
+ prompt_llm = (
297
+ "Réponds brièvement et précisément en français.\n"
298
+ f"Question : {question_raw}\n"
299
+ "Réponse :"
300
+ )
301
+ resp = self._complete(
302
+ prompt_llm,
303
+ stop=DEFAULT_STOPS,
304
+ max_tokens=MAX_TOKENS,
305
+ raw=True
306
+ ).strip()
307
+
308
+ ellipsis = "..." if len(resp) > 120 else ""
309
+ logger.info(f"🧠 Réponse générée : {resp[:120]}{ellipsis}")
310
+ return resp
311
+
312
+ # LLM pur (salutation ou score faible)
313
+ prompt_llm = (
314
+ "Réponds brièvement et précisément en français.\n"
315
+ f"Question : {question_raw}\n"
316
+ "Réponse :"
317
+ )
318
+ resp = self._complete(
319
+ prompt_llm,
320
+ stop=DEFAULT_STOPS,
321
+ max_tokens=MAX_TOKENS,
322
+ raw=True
323
+ ).strip()
324
+ ellipsis = "..." if len(resp) > 120 else ""
325
+ logger.info(f"🧠 Réponse générée : {resp[:120]}{ellipsis}")
326
+ return resp
327
+
328
+ def ask_stream(self, question: str, allow_fallback: bool = False) -> Iterable[str]:
329
+ logger.info(f"💬 [Stream] Question reçue : {question}")
330
+ is_hello = self._is_greeting(question)
331
+
332
+ context, scores = "", []
333
+ if not is_hello:
334
+ top_k = self.get_adaptive_top_k(question)
335
+ context, _, scores = self.retrieve_context(question, top_k)
336
+
337
+ mode = self._decide_mode(scores, tau=0.32, is_greeting=is_hello)
338
+ logger.info(f"🧭 Mode choisi (stream) : {mode}")
339
+
340
+ stops = DEFAULT_STOPS
341
+
342
+ if mode == "rag":
343
+ prompt = (
344
+ "Instruction: Réponds uniquement à partir du contexte. "
345
+ "Si la réponse n'est pas déductible, réponds exactement: \"Information non présente dans le contexte.\""
346
+ "\n\nContexte :\n"
347
+ f"{context}\n\n"
348
+ f"Question : {question}\n"
349
+ "Réponse :"
350
+ )
351
+
352
+ logger.info("📡 Début du streaming de la réponse (RAG)...")
353
+ tokens = self._complete_stream(
354
+ prompt,
355
+ stop=stops,
356
+ max_tokens=MAX_TOKENS,
357
+ raw=True,
358
+ )
359
+ # Blindage local: coupe si un stop apparaît
360
+ for t in self._stream_with_local_stops(tokens, stops):
361
+ if t:
362
+ yield t
363
+ logger.info("📡 Fin du streaming de la réponse (RAG).")
364
+ return
365
+
366
+ # LLM pur en stream
367
+ prompt_llm = (
368
+ "Réponds brièvement et précisément en français.\n"
369
+ f"Question : {question}\n"
370
+ "Réponse :"
371
+ )
372
+ logger.info("📡 Début du streaming de la réponse (LLM pur)...")
373
+ tokens = self._complete_stream(
374
+ prompt_llm,
375
+ stop=stops,
376
+ max_tokens=MAX_TOKENS,
377
+ raw=True,
378
+ )
379
+ for t in self._stream_with_local_stops(tokens, stops):
380
+ if t:
381
+ yield t
382
+ logger.info("📡 Fin du streaming de la réponse (LLM pur).")
rag_model_ollama_v1 stable_lazy.py ADDED
@@ -0,0 +1,247 @@
1
+ import os
2
+ import pickle
3
+ import logging
4
+ import time
5
+ from typing import List, Optional, Dict, Any, Iterable, Tuple
6
+ import requests
7
+ import faiss
8
+ import json
9
+ from llama_index.core import VectorStoreIndex
10
+ from llama_index.core.schema import TextNode
11
+ from llama_index.vector_stores.faiss import FaissVectorStore
12
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
13
+ from sentence_transformers.util import cos_sim
14
+
15
+ # === Logger configuration ===
16
+ logger = logging.getLogger("RAGEngine")
17
+ logger.setLevel(logging.INFO)
18
+ handler = logging.StreamHandler()
19
+ formatter = logging.Formatter("[%(asctime)s] %(levelname)s - %(message)s")
20
+ handler.setFormatter(formatter)
21
+ if not logger.handlers:
22
+ logger.addHandler(handler)
23
+
24
+ MAX_TOKENS = 64
25
+ DEFAULT_STOPS = ["### Réponse:", "\n\n", "###"]
26
+
27
+ class OllamaClient:
28
+ def __init__(self, model: str, host: Optional[str] = None, timeout: int = 300):
29
+ self.model = model
30
+ self.host = host or os.getenv("OLLAMA_HOST", "http://localhost:11434")
31
+ self.timeout = timeout
32
+ self._gen_url = self.host.rstrip("/") + "/api/generate"
33
+
34
+ def generate(self, prompt: str, stop: Optional[List[str]] = None,
35
+ max_tokens: Optional[int] = None, stream: bool = False,
36
+ options: Optional[Dict[str, Any]] = None, raw: bool = False) -> str | Iterable[str]:
37
+ payload: Dict[str, Any] = {
38
+ "model": self.model,
39
+ "prompt": prompt,
40
+ "stream": stream,
41
+ }
42
+ if raw:
43
+ payload["raw"] = True
44
+ if stop:
45
+ payload["stop"] = stop
46
+ if max_tokens is not None:
47
+ payload["num_predict"] = int(max_tokens)
48
+ # ❌ AUCUNE options → laisser Ollama auto-tuner
49
+
50
+ if stream:
51
+ with requests.post(self._gen_url, json=payload, stream=True, timeout=self.timeout) as r:
52
+ r.raise_for_status()
53
+ for line in r.iter_lines(decode_unicode=True):
54
+ if not line:
55
+ continue
56
+ try:
57
+ data = json.loads(line)
58
+ except Exception:
59
+ continue
60
+ if "response" in data and not data.get("done"):
61
+ yield data["response"]
62
+ if data.get("done"):
63
+ break
64
+ return
65
+
66
+ r = requests.post(self._gen_url, json=payload, timeout=self.timeout)
67
+ r.raise_for_status()
68
+ data = r.json()
69
+ return data.get("response", "")
70
+
71
+
72
+ class RAGEngine:
73
+ def __init__(self, model_name: str, vector_path: str, index_path: str,
74
+ model_threads: int = 4, ollama_host: Optional[str] = None,
75
+ ollama_opts: Optional[Dict[str, Any]] = None):
76
+ logger.info(f"🔎 rag_model_ollama source: {__file__}")
77
+ logger.info("📦 Initialisation du moteur (lazy RAG)...")
78
+
79
+ # -- LLM prêt tout de suite
80
+ self.llm = OllamaClient(model=model_name, host=ollama_host)
81
+
82
+ # -- Chemins pour lazy load
83
+ self.vector_path = vector_path
84
+ self.index_path = index_path
85
+
86
+ # -- Objets RAG, chargés plus tard
87
+ self.embed_model: Optional[HuggingFaceEmbedding] = None
88
+ self.index: Optional[VectorStoreIndex] = None
89
+ self._loaded = False
90
+
91
+ logger.info("✅ Moteur initialisé (sans charger FAISS ni chunks).")
92
+
93
+ # ❌ Pas de warmup “génération” ici ; le premier appel LLM sera rapide.
94
+ # (Si tu veux : décommente ce mini warmup 1 token)
95
+ # try:
96
+ # list(self._complete_stream("Bonjour", max_tokens=1))
97
+ # except Exception as e:
98
+ # logger.warning(f"Warmup échoué : {e}")
99
+
100
+ # ---------- Lazy loader ----------
101
+ def _ensure_loaded(self):
102
+ if self._loaded:
103
+ return
104
+ t0 = time.perf_counter()
105
+ logger.info("⏳ Chargement lazy des données RAG (FAISS + chunks + embeddings)...")
106
+
107
+ # 1) Charger les chunks (pickle)
108
+ with open(self.vector_path, "rb") as f:
109
+ chunk_texts: List[str] = pickle.load(f)
110
+ nodes = [TextNode(text=chunk) for chunk in chunk_texts]
111
+
112
+ # 2) Charger l'index FAISS
113
+ faiss_index = faiss.read_index(self.index_path)
114
+ vector_store = FaissVectorStore(faiss_index=faiss_index)
115
+
116
+ # 3) Embedding model
117
+ self.embed_model = HuggingFaceEmbedding(model_name="intfloat/multilingual-e5-base")
118
+
119
+ # 4) Construire l'index LlamaIndex
120
+ self.index = VectorStoreIndex(nodes=nodes, embed_model=self.embed_model, vector_store=vector_store)
121
+
122
+ self._loaded = True
123
+ logger.info(f"✅ RAG chargé en {time.perf_counter() - t0:.2f}s (lazy).")
124
+
125
+ # ---------- Génération ----------
126
+ def _complete_stream(self, prompt: str, stop: Optional[List[str]] = None,
127
+ max_tokens: int = MAX_TOKENS, raw: bool = False):
128
+ return self.llm.generate(prompt=prompt, stop=stop, max_tokens=max_tokens,
129
+ stream=True, raw=raw)
130
+
131
+ def _complete(self, prompt: str, stop: Optional[List[str]] = None,
132
+ max_tokens: int = 128, raw: bool = False) -> str:
133
+ text = self.llm.generate(prompt=prompt, stop=stop, max_tokens=max_tokens,
134
+ stream=False, raw=raw)
135
+ return (text or "").strip()
136
+
137
+ # ---------- Heuristiques légères ----------
138
+ def _is_greeting(self, text: str) -> bool:
139
+ s = text.lower().strip()
140
+ return s in {"bonjour", "salut", "hello", "bonsoir", "hi", "coucou", "yo"} or len(s.split()) <= 2
141
+
142
+ def _should_use_rag_fast(self, question: str) -> bool:
143
+ """Heuristique avant de charger RAG : éviter de charger pour une question triviale."""
144
+ q = question.lower()
145
+ # Mots-clés “doc”, “procédure”, etc.
146
+ keywords = ("document", "docs", "procédure", "politique", "policy", "manuel", "guide", "pdf", "selon", "dans le contexte")
147
+ if any(k in q for k in keywords):
148
+ return True
149
+ # Longueur : si question courte, reste LLM
150
+ if len(q.split()) <= 7:
151
+ return False
152
+ # Par défaut, pour les questions moyennes/longues → on utilisera RAG
153
+ return True
154
+
155
+ def _decide_mode(self, scores: List[float], tau: float = 0.32, is_greeting: bool = False) -> str:
156
+ if is_greeting:
157
+ return "llm"
158
+ top = scores[0] if scores else 0.0
159
+ return "rag" if top >= tau else "llm"
160
+
161
+ # ---------- Récupération ----------
162
+ def get_adaptive_top_k(self, question: str) -> int:
163
+ q = question.lower()
164
+ if len(q.split()) <= 7:
165
+ top_k = 8
166
+ elif any(w in q for w in ["liste", "résume", "quels sont", "explique", "comment"]):
167
+ top_k = 10
168
+ else:
169
+ top_k = 8
170
+ logger.info(f"🔢 top_k déterminé automatiquement : {top_k}")
171
+ return top_k
172
+
173
+ def rerank_nodes(self, question: str, retrieved_nodes, top_k: int = 3) -> Tuple[List[float], List[TextNode]]:
174
+ logger.info(f"🔍 Re-ranking des {len(retrieved_nodes)} chunks pour : « {question} »")
175
+ q_emb = self.embed_model.get_query_embedding(question)
176
+ scored_nodes: List[Tuple[float, TextNode]] = []
177
+ for node in retrieved_nodes:
178
+ chunk_emb = self.embed_model.get_text_embedding(node.get_content())
179
+ score = cos_sim(q_emb, chunk_emb).item()
180
+ scored_nodes.append((score, node))
181
+ ranked = sorted(scored_nodes, key=lambda x: x[0], reverse=True)
182
+ top = ranked[:top_k]
183
+ return [s for s, _ in top], [n for _, n in top]
184
+
185
+ def retrieve_context(self, question: str, top_k: int = 3) -> Tuple[str, List[TextNode], List[float]]:
186
+ self._ensure_loaded()
187
+ retriever = self.index.as_retriever(similarity_top_k=top_k)
188
+ retrieved_nodes = retriever.retrieve(question)
189
+ scores, nodes = self.rerank_nodes(question, retrieved_nodes, top_k)
190
+ context = "\n\n".join(n.get_content()[:500] for n in nodes)
191
+ return context, nodes, scores
192
+
193
+ # ---------- API publique ----------
194
+ def ask(self, question: str, allow_fallback: bool = False) -> str:
195
+ logger.info(f"💬 [Non-stream] Question reçue : {question}")
196
+ is_hello = self._is_greeting(question)
197
+
198
+ # ⚡ Heuristique avant de charger RAG
199
+ if not is_hello and (self._loaded or self._should_use_rag_fast(question)):
200
+ top_k = self.get_adaptive_top_k(question)
201
+ context, _, scores = self.retrieve_context(question, top_k)
202
+ mode = self._decide_mode(scores, tau=0.32, is_greeting=is_hello)
203
+ if mode == "rag":
204
+ prompt = (
205
+ "Instruction: Réponds uniquement à partir du contexte. "
206
+ "Si la réponse n'est pas déductible, réponds exactement: \"Information non présente dans le contexte.\""
207
+ f"\n\nContexte :\n{context}\n\nQuestion : {question}\nRéponse :"
208
+ )
209
+ return self._complete(prompt, stop=DEFAULT_STOPS, raw=False)
210
+
211
+ # LLM pur
212
+ prompt_llm = (
213
+ "Réponds brièvement et précisément en français.\n"
214
+ f"Question : {question}\nRéponse :"
215
+ )
216
+ return self._complete(prompt_llm, stop=DEFAULT_STOPS, raw=False)
217
+
218
+ def ask_stream(self, question: str, allow_fallback: bool = False) -> Iterable[str]:
219
+ logger.info(f"💬 [Stream] Question reçue : {question}")
220
+ is_hello = self._is_greeting(question)
221
+
222
+ # ⚡ Heuristique avant de charger RAG
223
+ if not is_hello and (self._loaded or self._should_use_rag_fast(question)):
224
+ top_k = self.get_adaptive_top_k(question)
225
+ context, _, scores = self.retrieve_context(question, top_k)
226
+ mode = self._decide_mode(scores, tau=0.32, is_greeting=is_hello)
227
+ if mode == "rag":
228
+ prompt = (
229
+ "Instruction: Réponds uniquement à partir du contexte. "
230
+ "Si la réponse n'est pas déductible, réponds exactement: \"Information non présente dans le contexte.\""
231
+ f"\n\nContexte :\n{context}\n\nQuestion : {question}\nRéponse :"
232
+ )
233
+ logger.info("📡 Début streaming (RAG)...")
234
+ for token in self._complete_stream(prompt, stop=DEFAULT_STOPS, raw=False):
235
+ yield token
236
+ logger.info("📡 Fin streaming (RAG).")
237
+ return
238
+
239
+ # LLM pur
240
+ prompt_llm = (
241
+ "Réponds brièvement et précisément en français.\n"
242
+ f"Question : {question}\nRéponse :"
243
+ )
244
+ logger.info("📡 Début streaming (LLM pur)...")
245
+ for token in self._complete_stream(prompt_llm, stop=DEFAULT_STOPS, raw=False):
246
+ yield token
247
+ logger.info("📡 Fin streaming (LLM pur).")
rag_model_ollama_v1.py CHANGED
@@ -1,20 +1,17 @@
1
-
2
  import os
3
  import pickle
4
- import textwrap
5
  import logging
6
- from typing import List, Optional, Dict, Any, Iterable
7
-
8
  import requests
9
  import faiss
10
- import numpy as np
11
  from llama_index.core import VectorStoreIndex
12
  from llama_index.core.schema import TextNode
13
  from llama_index.vector_stores.faiss import FaissVectorStore
14
  from llama_index.embeddings.huggingface import HuggingFaceEmbedding
15
  from sentence_transformers.util import cos_sim
16
 
17
-
18
  # === Logger configuration ===
19
  logger = logging.getLogger("RAGEngine")
20
  logger.setLevel(logging.INFO)
@@ -24,42 +21,33 @@ handler.setFormatter(formatter)
24
  if not logger.handlers:
25
  logger.addHandler(handler)
26
 
27
- MAX_TOKENS = 512
28
-
29
 
 
30
  class OllamaClient:
31
- """
32
- Minimal Ollama client for /api/generate (text completion) with streaming support.
33
- Docs: https://github.com/ollama/ollama/blob/main/docs/api.md#generate-a-completion
34
- """
35
  def __init__(self, model: str, host: Optional[str] = None, timeout: int = 300):
36
  self.model = model
37
- self.host = host or os.getenv("OLLAMA_HOST", "http://localhost:11434")
 
38
  self.timeout = timeout
39
  self._gen_url = self.host.rstrip("/") + "/api/generate"
40
 
41
- def generate(
42
- self,
43
- prompt: str,
44
- stop: Optional[List[str]] = None,
45
- max_tokens: Optional[int] = None,
46
- stream: bool = False,
47
- options: Optional[Dict[str, Any]] = None,
48
- ) -> str | Iterable[str]:
49
- payload = {
50
  "model": self.model,
51
  "prompt": prompt,
52
  "stream": stream,
53
  }
 
 
54
  if stop:
55
  payload["stop"] = stop
56
  if max_tokens is not None:
57
- # Ollama uses "num_predict" for max new tokens
58
  payload["num_predict"] = int(max_tokens)
59
- if options:
60
- payload["options"] = options
61
-
62
- logger.debug(f"POST {self._gen_url} (stream={stream})")
63
 
64
  if stream:
65
  with requests.post(self._gen_url, json=payload, stream=True, timeout=self.timeout) as r:
@@ -70,133 +58,125 @@ class OllamaClient:
70
  try:
71
  data = json.loads(line)
72
  except Exception:
73
- # In case a broken line appears
74
  continue
75
- if "response" in data and data.get("done") is not True:
76
  yield data["response"]
77
  if data.get("done"):
78
  break
79
  return
80
 
81
- # Non-streaming
82
  r = requests.post(self._gen_url, json=payload, timeout=self.timeout)
83
  r.raise_for_status()
84
  data = r.json()
85
  return data.get("response", "")
86
 
87
-
88
- # Lazy import json to keep top clean
89
- import json
90
-
91
-
92
  class RAGEngine:
93
- def __init__(
94
- self,
95
- model_name: str,
96
- vector_path: str,
97
- index_path: str,
98
- model_threads: int = 4,
99
- ollama_host: Optional[str] = None,
100
- ollama_opts: Optional[Dict[str, Any]] = None,
101
- ):
102
- """
103
- Args:
104
- model_name: e.g. "nous-hermes2:Q4_K_M" or "llama3.1:8b-instruct-q4_K_M"
105
- vector_path: pickle file with chunk texts list[str]
106
- index_path: FAISS index path
107
- model_threads: forwarded to Ollama via options.n_threads (if supported by the model)
108
- ollama_host: override OLLAMA_HOST (default http://localhost:11434)
109
- ollama_opts: extra Ollama options (e.g., temperature, top_p, num_gpu, num_thread)
110
- """
111
  logger.info(f"🔎 rag_model_ollama source: {__file__}")
112
- logger.info("📦 Initialisation du moteur RAG (Ollama)...")
113
- # Build options
114
- opts = dict(ollama_opts or {})
115
- # Common low-latency defaults; user can override via ollama_opts
116
- opts.setdefault("temperature", 0.1)
117
- # Try to pass thread hint if supported by the backend
118
- if "num_thread" not in opts and model_threads:
119
- opts["num_thread"] = int(model_threads)
120
 
 
121
  self.llm = OllamaClient(model=model_name, host=ollama_host)
122
- self.ollama_opts = opts
123
 
124
- self.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
 
 
 
 
 
 
 
125
 
126
- logger.info(f"📂 Chargement des données vectorielles depuis {vector_path}")
127
- with open(vector_path, "rb") as f:
128
- chunk_texts = pickle.load(f)
 
 
 
 
 
 
 
 
 
129
  nodes = [TextNode(text=chunk) for chunk in chunk_texts]
130
 
131
- faiss_index = faiss.read_index(index_path)
 
132
  vector_store = FaissVectorStore(faiss_index=faiss_index)
133
- self.index = VectorStoreIndex(nodes=nodes, embed_model=self.embed_model, vector_store=vector_store)
134
 
135
- logger.info("✅ Moteur RAG (Ollama) initialisé avec succès.")
 
136
 
137
- # ---------------- LLM helpers (via Ollama) ----------------
 
138
 
139
- def _complete(self, prompt: str, stop: Optional[List[str]] = None, max_tokens: int = 128) -> str:
140
- text = self.llm.generate(
141
- prompt=prompt,
142
- stop=stop,
143
- max_tokens=max_tokens,
144
- stream=False,
145
- options=self.ollama_opts,
146
- )
147
- # Some Ollama setups may stream even when stream=False. Coerce generators to string.
148
- try:
149
- if hasattr(text, "__iter__") and not isinstance(text, (str, bytes)):
150
- chunks = []
151
- for t in text:
152
- if not isinstance(t, (str, bytes)):
153
- continue
154
- chunks.append(t)
155
- text = "".join(chunks)
156
- except Exception:
157
- pass
158
- return (text or "").strip()
159
 
160
- def _complete_stream(self, prompt: str, stop: Optional[List[str]] = None, max_tokens: int = MAX_TOKENS):
161
- return self.llm.generate(
162
- prompt=prompt,
163
- stop=stop,
164
- max_tokens=max_tokens,
165
- stream=True,
166
- options=self.ollama_opts,
167
- )
168
 
169
- # ---------------- Reformulation ----------------
 
 
 
 
170
 
171
- def reformulate_question(self, question: str) -> str:
172
- logger.info("🔁 Reformulation de la question (sans contexte)...")
173
- prompt = f"""Tu es un assistant expert chargé de clarifier des questions floues.
 
174
 
175
- Transforme la question suivante en une question claire, explicite et complète, sans ajouter d'informations extérieures.
 
 
 
 
 
 
 
 
 
176
 
177
- Question floue : {question}
178
- Question reformulée :"""
179
- reformulated = self._complete(prompt, stop=["\n"], max_tokens=128)
180
- logger.info(f"📝 Reformulée : {reformulated}")
181
- return reformulated
182
 
183
- def reformulate_with_context(self, question: str, context_sample: str) -> str:
184
- logger.info("🔁 Reformulation de la question avec contexte...")
185
- prompt = f"""Tu es un assistant expert en machine learning. Ton rôle est de reformuler les questions utilisateur en tenant compte du contexte ci-dessous, extrait d’un rapport technique sur un projet de reconnaissance de maladies de plantes.
186
 
187
- Ta mission est de transformer une question vague ou floue en une question précise et adaptée au contenu du rapport. Ne donne pas une interprétation hors sujet. Ne reformule pas en termes de produits commerciaux.
 
 
 
 
 
 
 
 
188
 
189
- Contexte :
190
- {context_sample}
 
191
 
192
- Question initiale : {question}
193
- Question reformulée :"""
194
- reformulated = self._complete(prompt, stop=["\n"], max_tokens=128)
195
- logger.info(f"📝 Reformulée avec contexte : {reformulated}")
196
- return reformulated
197
 
198
- # ---------------- Retrieval ----------------
 
 
 
 
199
 
 
200
  def get_adaptive_top_k(self, question: str) -> int:
201
  q = question.lower()
202
  if len(q.split()) <= 7:
@@ -208,78 +188,79 @@ Question reformulée :"""
208
  logger.info(f"🔢 top_k déterminé automatiquement : {top_k}")
209
  return top_k
210
 
211
- def rerank_nodes(self, question: str, retrieved_nodes, top_k: int = 3):
212
- logger.info(f"🔍 Re-ranking des {len(retrieved_nodes)} chunks pour la question : « {question} »")
 
213
  q_emb = self.embed_model.get_query_embedding(question)
214
- scored_nodes = []
215
-
216
  for node in retrieved_nodes:
217
- chunk_text = node.get_content()
218
- chunk_emb = self.embed_model.get_text_embedding(chunk_text)
219
  score = cos_sim(q_emb, chunk_emb).item()
220
  scored_nodes.append((score, node))
 
 
 
221
 
222
- ranked_nodes = sorted(scored_nodes, key=lambda x: x[0], reverse=True)
223
-
224
- logger.info("📊 Chunks les plus pertinents :")
225
- for i, (score, node) in enumerate(ranked_nodes[:top_k]):
226
- chunk_preview = textwrap.shorten(node.get_content().replace("\n", " "), width=100)
227
- logger.info(f"#{i+1} | Score: {score:.4f} | {chunk_preview}")
228
-
229
- return [n for _, n in ranked_nodes[:top_k]]
230
-
231
- def retrieve_context(self, question: str, top_k: int = 3):
232
- logger.info(f"📥 Récupération du contexte...")
233
- retriever = self.index.as_retriever(similarity_top_k=top_k)
234
  retrieved_nodes = retriever.retrieve(question)
235
- reranked_nodes = self.rerank_nodes(question, retrieved_nodes, top_k)
236
- context = "\n\n".join(n.get_content()[:500] for n in reranked_nodes)
237
- return context, reranked_nodes
238
-
239
- # ---------------- Public API ----------------
240
-
241
- def ask(self, question_raw: str) -> str:
242
- logger.info(f"💬 Question reçue : {question_raw}")
243
- if len(question_raw.split()) <= 100:
244
- context_sample, _ = self.retrieve_context(question_raw, top_k=3)
245
- reformulated = self.reformulate_with_context(question_raw, context_sample)
246
- else:
247
- reformulated = self.reformulate_question(question_raw)
248
-
249
- logger.info(f"📝 Question reformulée : {reformulated}")
250
- top_k = self.get_adaptive_top_k(reformulated)
251
- context, _ = self.retrieve_context(reformulated, top_k)
252
-
253
- prompt = f"""### Instruction: En te basant uniquement sur le contexte ci-dessous, réponds à la question de manière précise et en français.
254
-
255
- Si la réponse ne peut pas être déduite du contexte, indique : "Information non présente dans le contexte."
256
-
257
- Contexte :
258
- {context}
259
-
260
- Question : {reformulated}
261
- ### Réponse:"""
262
-
263
- response = self._complete(prompt, stop=["### Instruction:"], max_tokens=MAX_TOKENS)
264
- response = response.strip().split("###")[0]
265
- logger.info(f"🧠 Réponse générée : {response[:120]}{{'...' if len(response) > 120 else ''}}")
266
- return response
267
 
268
- def ask_stream(self, question: str):
269
  logger.info(f"💬 [Stream] Question reçue : {question}")
270
- top_k = self.get_adaptive_top_k(question)
271
- context, _ = self.retrieve_context(question, top_k)
272
-
273
- prompt = f"""### Instruction: En te basant uniquement sur le contexte ci-dessous, réponds à la question de manière précise et en français.
274
-
275
- Si la réponse ne peut pas être déduite du contexte, indique : "Information non présente dans le contexte."
276
-
277
- Contexte :
278
- {context}
279
-
280
- Question : {question}
281
- ### Réponse:"""
282
-
283
- logger.info("📡 Début du streaming de la réponse...")
284
- for token in self._complete_stream(prompt, stop=["### Instruction:"], max_tokens=MAX_TOKENS):
 
 
 
 
 
 
 
 
 
 
 
285
  yield token
 
 
 
1
  import os
2
  import pickle
 
3
  import logging
4
+ import time
5
+ from typing import List, Optional, Dict, Any, Iterable, Tuple
6
  import requests
7
  import faiss
8
+ import json
9
  from llama_index.core import VectorStoreIndex
10
  from llama_index.core.schema import TextNode
11
  from llama_index.vector_stores.faiss import FaissVectorStore
12
  from llama_index.embeddings.huggingface import HuggingFaceEmbedding
13
  from sentence_transformers.util import cos_sim
14
 
 
15
  # === Logger configuration ===
16
  logger = logging.getLogger("RAGEngine")
17
  logger.setLevel(logging.INFO)
 
21
  if not logger.handlers:
22
  logger.addHandler(handler)
23
 
24
+ MAX_TOKENS = 64
25
+ DEFAULT_STOPS = ["### Réponse:", "\n\n", "###"]
26
 
27
+ # ---------- Client Ollama (use /api/generate, no options) ----------
28
  class OllamaClient:
 
 
 
 
29
  def __init__(self, model: str, host: Optional[str] = None, timeout: int = 300):
30
  self.model = model
31
+ self.host = host or os.getenv("OLLAMA_HOST", "http://localhost:11435") #mode proxy
32
+ #self.host = host or os.getenv("OLLAMA_HOST", "http://localhost:11434")
33
  self.timeout = timeout
34
  self._gen_url = self.host.rstrip("/") + "/api/generate"
35
 
36
+ def generate(self, prompt: str, stop: Optional[List[str]] = None,
37
+ max_tokens: Optional[int] = None, stream: bool = False,
38
+ options: Optional[Dict[str, Any]] = None, raw: bool = False) -> str | Iterable[str]:
39
+ payload: Dict[str, Any] = {
 
 
 
 
 
40
  "model": self.model,
41
  "prompt": prompt,
42
  "stream": stream,
43
  }
44
+ if raw:
45
+ payload["raw"] = True
46
  if stop:
47
  payload["stop"] = stop
48
  if max_tokens is not None:
 
49
  payload["num_predict"] = int(max_tokens)
50
+ # ❌ aucune "options" pour laisser Ollama auto-tuner
 
 
 
51
 
52
  if stream:
53
  with requests.post(self._gen_url, json=payload, stream=True, timeout=self.timeout) as r:
 
58
  try:
59
  data = json.loads(line)
60
  except Exception:
 
61
  continue
62
+ if "response" in data and not data.get("done"):
63
  yield data["response"]
64
  if data.get("done"):
65
  break
66
  return
67
 
 
68
  r = requests.post(self._gen_url, json=payload, timeout=self.timeout)
69
  r.raise_for_status()
70
  data = r.json()
71
  return data.get("response", "")
72
 
73
+ # ---------- RAG Engine (lazy load + heuristique GK) ----------
 
 
 
 
74
  class RAGEngine:
75
+ def __init__(self, model_name: str, vector_path: str, index_path: str,
76
+ model_threads: int = 4, ollama_host: Optional[str] = None,
77
+ ollama_opts: Optional[Dict[str, Any]] = None):
78
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  logger.info(f"🔎 rag_model_ollama source: {__file__}")
80
+ logger.info("📦 Initialisation du moteur (lazy RAG)...")
 
 
 
 
 
 
 
81
 
82
+ # LLM prêt immédiatement
83
  self.llm = OllamaClient(model=model_name, host=ollama_host)
 
84
 
85
+ # chemins pour chargement différé
86
+ self.vector_path = vector_path
87
+ self.index_path = index_path
88
+
89
+ # objets RAG paresseux
90
+ self.embed_model: Optional[HuggingFaceEmbedding] = None
91
+ self.index: Optional[VectorStoreIndex] = None
92
+ self._loaded = False
93
 
94
+ logger.info(" Moteur initialisé (sans charger FAISS ni chunks).")
95
+
96
+ # ---------- lazy loader ----------
97
+ def _ensure_loaded(self):
98
+ if self._loaded:
99
+ return
100
+ t0 = time.perf_counter()
101
+ logger.info("⏳ Chargement lazy des données RAG (FAISS + chunks + embeddings)...")
102
+
103
+ # 1) chunks
104
+ with open(self.vector_path, "rb") as f:
105
+ chunk_texts: List[str] = pickle.load(f)
106
  nodes = [TextNode(text=chunk) for chunk in chunk_texts]
107
 
108
+ # 2) index FAISS
109
+ faiss_index = faiss.read_index(self.index_path)
110
  vector_store = FaissVectorStore(faiss_index=faiss_index)
 
111
 
112
+ # 3) modèle d'embedding
113
+ self.embed_model = HuggingFaceEmbedding(model_name="intfloat/multilingual-e5-base")
114
 
115
+ # 4) index LlamaIndex
116
+ self.index = VectorStoreIndex(nodes=nodes, embed_model=self.embed_model, vector_store=vector_store)
117
 
118
+ self._loaded = True
119
+ logger.info(f"✅ RAG chargé en {time.perf_counter() - t0:.2f}s (lazy).")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
+ # ---------- génération ----------
122
+ def _complete_stream(self, prompt: str, stop: Optional[List[str]] = None,
123
+ max_tokens: int = MAX_TOKENS, raw: bool = False):
124
+ return self.llm.generate(prompt=prompt, stop=stop, max_tokens=max_tokens,
125
+ stream=True, raw=raw)
 
 
 
126
 
127
+ def _complete(self, prompt: str, stop: Optional[List[str]] = None,
128
+ max_tokens: int = 128, raw: bool = False) -> str:
129
+ text = self.llm.generate(prompt=prompt, stop=stop, max_tokens=max_tokens,
130
+ stream=False, raw=raw)
131
+ return (text or "").strip()
132
 
133
+ # ---------- heuristiques ----------
134
+ def _is_greeting(self, text: str) -> bool:
135
+ s = text.lower().strip()
136
+ return s in {"bonjour", "salut", "hello", "bonsoir", "hi", "coucou", "yo"} or len(s.split()) <= 2
137
 
138
+ def _looks_general_knowledge(self, q: str) -> bool:
139
+ q = q.lower().strip()
140
+ gk_keywords = (
141
+ "capitale", "date de naissance", "qui est", "qu'est-ce", "definition",
142
+ "définition", "histoire", "pays", "ville", "math", "science", "sport"
143
+ )
144
+ if len(q.split()) <= 9:
145
+ if any(k in q for k in gk_keywords) or q.startswith(("quelle est", "qui est", "qu'est-ce", "c'est quoi")):
146
+ return True
147
+ return False
148
 
149
+ def _should_use_rag_fast(self, question: str) -> bool:
150
+ """N'active RAG que si on détecte des indices 'docs' / longueur significative."""
151
+ q = question.lower()
 
 
152
 
153
+ # 1) GK pas de RAG
154
+ if self._looks_general_knowledge(q):
155
+ return False
156
 
157
+ # 2) indices RAG
158
+ doc_keywords = (
159
+ "document", "docs", "procédure", "politique", "policy",
160
+ "manuel", "guide", "pdf", "docling", "selon", "dans le contexte",
161
+ "page", "section", "chapitre", "référence", "références", "conformément",
162
+ "note technique", "spécification", "spec", "architecture", "adr"
163
+ )
164
+ if any(k in q for k in doc_keywords):
165
+ return True
166
 
167
+ # 3) question longue → probable RAG
168
+ if len(q.split()) >= 14:
169
+ return True
170
 
171
+ return False
 
 
 
 
172
 
173
+ def _decide_mode(self, scores: List[float], tau: float = 0.32, is_greeting: bool = False) -> str:
174
+ if is_greeting:
175
+ return "llm"
176
+ top = scores[0] if scores else 0.0
177
+ return "rag" if top >= tau else "llm"
178
 
179
+ # ---------- retrieval ----------
180
  def get_adaptive_top_k(self, question: str) -> int:
181
  q = question.lower()
182
  if len(q.split()) <= 7:
 
188
  logger.info(f"🔢 top_k déterminé automatiquement : {top_k}")
189
  return top_k
190
 
191
+ def rerank_nodes(self, question: str, retrieved_nodes, top_k: int = 3) -> Tuple[List[float], List[TextNode]]:
192
+ assert self.embed_model is not None
193
+ logger.info(f"🔍 Re-ranking des {len(retrieved_nodes)} chunks pour : « {question} »")
194
  q_emb = self.embed_model.get_query_embedding(question)
195
+ scored_nodes: List[Tuple[float, TextNode]] = []
 
196
  for node in retrieved_nodes:
197
+ chunk_emb = self.embed_model.get_text_embedding(node.get_content())
 
198
  score = cos_sim(q_emb, chunk_emb).item()
199
  scored_nodes.append((score, node))
200
+ ranked = sorted(scored_nodes, key=lambda x: x[0], reverse=True)
201
+ top = ranked[:top_k]
202
+ return [s for s, _ in top], [n for _, n in top]
203
 
204
+ def retrieve_context(self, question: str, top_k: int = 3) -> Tuple[str, List[TextNode], List[float]]:
205
+ self._ensure_loaded()
206
+ retriever = self.index.as_retriever(similarity_top_k=top_k) # type: ignore
 
 
 
 
 
 
 
 
 
207
  retrieved_nodes = retriever.retrieve(question)
208
+ scores, nodes = self.rerank_nodes(question, retrieved_nodes, top_k)
209
+ context = "\n\n".join(n.get_content()[:500] for n in nodes)
210
+ return context, nodes, scores
211
+
212
+ # ---------- API publique ----------
213
+ def ask(self, question: str, allow_fallback: bool = False) -> str:
214
+ logger.info(f"💬 [Non-stream] Question reçue : {question}")
215
+ is_hello = self._is_greeting(question)
216
+
217
+ use_rag = (self._loaded and not is_hello) or (not self._loaded and self._should_use_rag_fast(question))
218
+ if use_rag:
219
+ top_k = self.get_adaptive_top_k(question)
220
+ context, _, scores = self.retrieve_context(question, top_k)
221
+ mode = self._decide_mode(scores, tau=0.32, is_greeting=is_hello)
222
+ if mode == "rag":
223
+ prompt = (
224
+ "Instruction: Réponds uniquement à partir du contexte. "
225
+ "Si la réponse n'est pas déductible, réponds exactement: \"Information non présente dans le contexte.\""
226
+ f"\n\nContexte :\n{context}\n\nQuestion : {question}\nRéponse :"
227
+ )
228
+ return self._complete(prompt, stop=DEFAULT_STOPS, raw=False)
229
+
230
+ # LLM pur
231
+ prompt_llm = (
232
+ "Réponds brièvement et précisément en français.\n"
233
+ f"Question : {question}\nRéponse :"
234
+ )
235
+ return self._complete(prompt_llm, stop=DEFAULT_STOPS, raw=False)
 
 
 
 
236
 
237
+ def ask_stream(self, question: str, allow_fallback: bool = False) -> Iterable[str]:
238
  logger.info(f"💬 [Stream] Question reçue : {question}")
239
+ is_hello = self._is_greeting(question)
240
+
241
+ use_rag = (self._loaded and not is_hello) or (not self._loaded and self._should_use_rag_fast(question))
242
+ if use_rag:
243
+ top_k = self.get_adaptive_top_k(question)
244
+ context, _, scores = self.retrieve_context(question, top_k)
245
+ mode = self._decide_mode(scores, tau=0.32, is_greeting=is_hello)
246
+ if mode == "rag":
247
+ prompt = (
248
+ "Instruction: Réponds uniquement à partir du contexte. "
249
+ "Si la réponse n'est pas déductible, réponds exactement: \"Information non présente dans le contexte.\""
250
+ f"\n\nContexte :\n{context}\n\nQuestion : {question}\nRéponse :"
251
+ )
252
+ logger.info("📡 Début streaming (RAG)...")
253
+ for token in self._complete_stream(prompt, stop=DEFAULT_STOPS, raw=False):
254
+ yield token
255
+ logger.info("📡 Fin streaming (RAG).")
256
+ return
257
+
258
+ # LLM pur
259
+ prompt_llm = (
260
+ "Réponds brièvement et précisément en français.\n"
261
+ f"Question : {question}\nRéponse :"
262
+ )
263
+ logger.info("📡 Début streaming (LLM pur)...")
264
+ for token in self._complete_stream(prompt_llm, stop=DEFAULT_STOPS, raw=False):
265
  yield token
266
+ logger.info("📡 Fin streaming (LLM pur).")
rag_model_ollama_v1_ok_full_load.py ADDED
@@ -0,0 +1,211 @@
1
+ import os
2
+ import pickle
3
+ import logging
4
+ from typing import List, Optional, Dict, Any, Iterable, Tuple
5
+ import requests
6
+ import faiss
7
+ import json
8
+ from llama_index.core import VectorStoreIndex
9
+ from llama_index.core.schema import TextNode
10
+ from llama_index.vector_stores.faiss import FaissVectorStore
11
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
12
+ from sentence_transformers.util import cos_sim
13
+
14
+ # === Logger configuration ===
15
+ logger = logging.getLogger("RAGEngine")
16
+ logger.setLevel(logging.INFO)
17
+ handler = logging.StreamHandler()
18
+ formatter = logging.Formatter("[%(asctime)s] %(levelname)s - %(message)s")
19
+ handler.setFormatter(formatter)
20
+ if not logger.handlers:
21
+ logger.addHandler(handler)
22
+
23
+ MAX_TOKENS = 64
24
+ DEFAULT_STOPS = ["### Réponse:", "\n\n", "###"]
25
+
26
+ class OllamaClient:
27
+ def __init__(self, model: str, host: Optional[str] = None, timeout: int = 300):
28
+ self.model = model
29
+ self.host = host or os.getenv("OLLAMA_HOST", "http://localhost:11434")
30
+ self.timeout = timeout
31
+ self._gen_url = self.host.rstrip("/") + "/api/generate"
32
+
33
+ def generate(self, prompt: str, stop: Optional[List[str]] = None,
34
+ max_tokens: Optional[int] = None, stream: bool = False,
35
+ options: Optional[Dict[str, Any]] = None, raw: bool = False) -> str | Iterable[str]:
36
+ payload: Dict[str, Any] = {
37
+ "model": self.model,
38
+ "prompt": prompt,
39
+ "stream": stream,
40
+ }
41
+ if raw:
42
+ payload["raw"] = True
43
+ if stop:
44
+ payload["stop"] = stop
45
+ if max_tokens is not None:
46
+ payload["num_predict"] = int(max_tokens)
47
+ # ❌ Pas d'options envoyées pour laisser Ollama choisir ses defaults
48
+
49
+ logger.debug(f"POST {self._gen_url} (stream={stream})")
50
+
51
+        if stream:
+            def _iter_stream():
+                # Stream NDJSON lines from Ollama and yield incremental text chunks.
+                with requests.post(self._gen_url, json=payload, stream=True, timeout=self.timeout) as r:
+                    r.raise_for_status()
+                    for line in r.iter_lines(decode_unicode=True):
+                        if not line:
+                            continue
+                        try:
+                            data = json.loads(line)
+                        except Exception:
+                            continue
+                        if "response" in data and not data.get("done"):
+                            yield data["response"]
+                        if data.get("done"):
+                            break
+            # Returning an inner generator keeps generate() an ordinary function,
+            # so the non-streaming branch below really returns a str.
+            return _iter_stream()
66
+
67
+ r = requests.post(self._gen_url, json=payload, timeout=self.timeout)
68
+ r.raise_for_status()
69
+ data = r.json()
70
+ return data.get("response", "")
71
+
72
+
73
+ class RAGEngine:
74
+ def __init__(self, model_name: str, vector_path: str, index_path: str,
75
+ model_threads: int = 4, ollama_host: Optional[str] = None,
76
+ ollama_opts: Optional[Dict[str, Any]] = None):
77
+
78
+ logger.info(f"🔎 rag_model_ollama source: {__file__}")
79
+ logger.info("📦 Initialisation du moteur RAG (Ollama)...")
80
+
81
+ # ❌ No Ollama options stored on the engine
82
+ self.llm = OllamaClient(model=model_name, host=ollama_host)
83
+ self.embed_model = HuggingFaceEmbedding(model_name="intfloat/multilingual-e5-base")
84
+
85
+ logger.info(f"📂 Chargement des données vectorielles depuis {vector_path}")
86
+ with open(vector_path, "rb") as f:
87
+ chunk_texts: List[str] = pickle.load(f)
88
+ nodes = [TextNode(text=chunk) for chunk in chunk_texts]
89
+
90
+ faiss_index = faiss.read_index(index_path)
91
+ vector_store = FaissVectorStore(faiss_index=faiss_index)
92
+ self.index = VectorStoreIndex(nodes=nodes, embed_model=self.embed_model, vector_store=vector_store)
93
+
94
+ logger.info("✅ Moteur RAG (Ollama) initialisé avec succès.")
95
+
96
+ # Warmup so the model is loaded before the first real request
97
+ try:
98
+ logger.info("⚡ Warmup du modèle Ollama...")
99
+ for _ in self._complete_stream("Bonjour", max_tokens=8, raw=False):
100
+ pass
101
+ except Exception as e:
102
+ logger.warning(f"Warmup échoué : {e}")
103
+
104
+ def _complete_stream(self, prompt: str, stop: Optional[List[str]] = None,
105
+ max_tokens: int = MAX_TOKENS, raw: bool = False):
106
+ return self.llm.generate(prompt=prompt, stop=stop, max_tokens=max_tokens,
107
+ stream=True, raw=raw)
108
+
109
+ def _complete(self, prompt: str, stop: Optional[List[str]] = None,
110
+ max_tokens: int = 128, raw: bool = False) -> str:
111
+ text = self.llm.generate(prompt=prompt, stop=stop, max_tokens=max_tokens,
112
+ stream=False, raw=raw)
113
+ return (text or "").strip()
114
+
115
+ def _is_greeting(self, text: str) -> bool:
116
+ s = text.lower().strip()
117
+ return s in {"bonjour", "salut", "hello", "bonsoir", "hi", "coucou", "yo"} or len(s.split()) <= 2
118
+
119
+ def _decide_mode(self, scores: List[float], tau: float = 0.32, is_greeting: bool = False) -> str:
120
+ if is_greeting:
121
+ return "llm"
122
+ top = scores[0] if scores else 0.0
123
+ return "rag" if top >= tau else "llm"
124
+
125
+ def get_adaptive_top_k(self, question: str) -> int:
126
+ q = question.lower()
127
+ if len(q.split()) <= 7:
128
+ top_k = 8
129
+ elif any(w in q for w in ["liste", "résume", "quels sont", "explique", "comment"]):
130
+ top_k = 10
131
+ else:
132
+ top_k = 8
133
+ logger.info(f"🔢 top_k déterminé automatiquement : {top_k}")
134
+ return top_k
135
+
136
+ def rerank_nodes(self, question: str, retrieved_nodes, top_k: int = 3) -> Tuple[List[float], List[TextNode]]:
137
+ logger.info(f"🔍 Re-ranking des {len(retrieved_nodes)} chunks pour : « {question} »")
138
+ q_emb = self.embed_model.get_query_embedding(question)
139
+ scored_nodes: List[Tuple[float, TextNode]] = []
140
+ for node in retrieved_nodes:
141
+ chunk_emb = self.embed_model.get_text_embedding(node.get_content())
142
+ score = cos_sim(q_emb, chunk_emb).item()
143
+ scored_nodes.append((score, node))
144
+ ranked = sorted(scored_nodes, key=lambda x: x[0], reverse=True)
145
+ top = ranked[:top_k]
146
+ return [s for s, _ in top], [n for _, n in top]
147
+
148
+ def retrieve_context(self, question: str, top_k: int = 3) -> Tuple[str, List[TextNode], List[float]]:
149
+ retriever = self.index.as_retriever(similarity_top_k=top_k)
150
+ retrieved_nodes = retriever.retrieve(question)
151
+ scores, nodes = self.rerank_nodes(question, retrieved_nodes, top_k)
152
+ context = "\n\n".join(n.get_content()[:500] for n in nodes)
153
+ return context, nodes, scores
154
+
155
+ def ask(self, question: str, allow_fallback: bool = False) -> str:
156
+ """Génération non-stream"""
157
+ logger.info(f"💬 [Non-stream] Question reçue : {question}")
158
+ is_hello = self._is_greeting(question)
159
+ context, scores = "", []
160
+ if not is_hello:
161
+ top_k = self.get_adaptive_top_k(question)
162
+ context, _, scores = self.retrieve_context(question, top_k)
163
+
164
+ mode = self._decide_mode(scores, tau=0.32, is_greeting=is_hello)
165
+ logger.info(f"🧭 Mode choisi (non-stream) : {mode}")
166
+
167
+ if mode == "rag":
168
+ prompt = (
169
+ "Instruction: Réponds uniquement à partir du contexte. "
170
+ "Si la réponse n'est pas déductible, réponds exactement: \"Information non présente dans le contexte.\""
171
+ f"\n\nContexte :\n{context}\n\nQuestion : {question}\nRéponse :"
172
+ )
173
+ return self._complete(prompt, stop=DEFAULT_STOPS, raw=False)
174
+
175
+ prompt_llm = (
176
+ "Réponds brièvement et précisément en français.\n"
177
+ f"Question : {question}\nRéponse :"
178
+ )
179
+ return self._complete(prompt_llm, stop=DEFAULT_STOPS, raw=False)
180
+
181
+ def ask_stream(self, question: str, allow_fallback: bool = False) -> Iterable[str]:
182
+ logger.info(f"💬 [Stream] Question reçue : {question}")
183
+ is_hello = self._is_greeting(question)
184
+ context, scores = "", []
185
+ if not is_hello:
186
+ top_k = self.get_adaptive_top_k(question)
187
+ context, _, scores = self.retrieve_context(question, top_k)
188
+
189
+ mode = self._decide_mode(scores, tau=0.32, is_greeting=is_hello)
190
+ logger.info(f"🧭 Mode choisi (stream) : {mode}")
191
+
192
+ if mode == "rag":
193
+ prompt = (
194
+ "Instruction: Réponds uniquement à partir du contexte. "
195
+ "Si la réponse n'est pas déductible, réponds exactement: \"Information non présente dans le contexte.\""
196
+ f"\n\nContexte :\n{context}\n\nQuestion : {question}\nRéponse :"
197
+ )
198
+ logger.info("📡 Début streaming (RAG)...")
199
+ for token in self._complete_stream(prompt, stop=DEFAULT_STOPS, raw=False):
200
+ yield token
201
+ logger.info("📡 Fin streaming (RAG).")
202
+ return
203
+
204
+ prompt_llm = (
205
+ "Réponds brièvement et précisément en français.\n"
206
+ f"Question : {question}\nRéponse :"
207
+ )
208
+ logger.info("📡 Début streaming (LLM pur)...")
209
+ for token in self._complete_stream(prompt_llm, stop=DEFAULT_STOPS, raw=False):
210
+ yield token
211
+ logger.info("📡 Fin streaming (LLM pur).")
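For reference, a minimal consumption sketch of the engine above, assuming the module is importable as rag_model_ollama_v1_ok_full_load, an Ollama server is reachable on the default host, and the two artefact paths (placeholders here) point to an existing FAISS index and its pickled chunk list:

from rag_model_ollama_v1_ok_full_load import RAGEngine

engine = RAGEngine(
    model_name="qwen2.5:3b-instruct-q4_K_M",    # example tag; any model already pulled into Ollama
    vector_path="vectordb_docling/chunks.pkl",  # placeholder path
    index_path="vectordb_docling/index.faiss",  # placeholder path
)

# Non-streaming: ask() returns the full answer as a string.
print(engine.ask("Quels sont les objectifs du projet ?"))

# Streaming: ask_stream() yields tokens as they arrive.
for token in engine.ask_stream("Quels sont les objectifs du projet ?"):
    print(token, end="", flush=True)
print()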
rag_model_ollama_v1_ok_llm.py ADDED
@@ -0,0 +1,185 @@
1
+ import os
2
+ import pickle
3
+ import logging
4
+ from typing import List, Optional, Dict, Any, Iterable, Tuple
5
+ import requests
6
+ import faiss
7
+ import json
8
+ from llama_index.core import VectorStoreIndex
9
+ from llama_index.core.schema import TextNode
10
+ from llama_index.vector_stores.faiss import FaissVectorStore
11
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
12
+ from sentence_transformers.util import cos_sim
13
+
14
+ # === Logger configuration ===
15
+ logger = logging.getLogger("RAGEngine")
16
+ logger.setLevel(logging.INFO)
17
+ handler = logging.StreamHandler()
18
+ formatter = logging.Formatter("[%(asctime)s] %(levelname)s - %(message)s")
19
+ handler.setFormatter(formatter)
20
+ if not logger.handlers:
21
+ logger.addHandler(handler)
22
+
23
+ MAX_TOKENS = 64
24
+ DEFAULT_STOPS = ["### Réponse:", "\n\n", "###"]
25
+
26
+ class OllamaClient:
27
+ def __init__(self, model: str, host: Optional[str] = None, timeout: int = 300):
28
+ self.model = model
29
+ self.host = host or os.getenv("OLLAMA_HOST", "http://localhost:11434")
30
+ self.timeout = timeout
31
+ self._gen_url = self.host.rstrip("/") + "/api/generate"
32
+
33
+ def generate(self, prompt: str, stop: Optional[List[str]] = None,
34
+ max_tokens: Optional[int] = None, stream: bool = False,
35
+ options: Optional[Dict[str, Any]] = None, raw: bool = False) -> str | Iterable[str]:
36
+ payload: Dict[str, Any] = {
37
+ "model": self.model,
38
+ "prompt": prompt,
39
+ "stream": stream,
40
+ }
41
+ if raw:
42
+ payload["raw"] = True
43
+ if stop:
44
+ payload["stop"] = stop
45
+ if max_tokens is not None:
46
+ payload["num_predict"] = int(max_tokens)
47
+ # ❌ No extra options sent, so Ollama keeps its own defaults
48
+
49
+ logger.debug(f"POST {self._gen_url} (stream={stream})")
50
+
51
+        if stream:
+            def _iter_stream():
+                # Stream NDJSON lines from Ollama and yield incremental text chunks.
+                with requests.post(self._gen_url, json=payload, stream=True, timeout=self.timeout) as r:
+                    r.raise_for_status()
+                    for line in r.iter_lines(decode_unicode=True):
+                        if not line:
+                            continue
+                        try:
+                            data = json.loads(line)
+                        except Exception:
+                            continue
+                        if "response" in data and not data.get("done"):
+                            yield data["response"]
+                        if data.get("done"):
+                            break
+            # Returning an inner generator keeps generate() an ordinary function,
+            # so the non-streaming branch below really returns a str.
+            return _iter_stream()
66
+
67
+ r = requests.post(self._gen_url, json=payload, timeout=self.timeout)
68
+ r.raise_for_status()
69
+ data = r.json()
70
+ return data.get("response", "")
71
+
72
+
73
+ class RAGEngine:
74
+ def __init__(self, model_name: str, vector_path: str, index_path: str,
75
+ model_threads: int = 4, ollama_host: Optional[str] = None,
76
+ ollama_opts: Optional[Dict[str, Any]] = None):
77
+
78
+ logger.info(f"🔎 rag_model_ollama source: {__file__}")
79
+ logger.info("📦 Initialisation du moteur RAG (Ollama)...")
80
+
81
+ # ❌ No Ollama options stored on the engine
82
+ self.llm = OllamaClient(model=model_name, host=ollama_host)
83
+ self.embed_model = HuggingFaceEmbedding(model_name="intfloat/multilingual-e5-base")
84
+
85
+ logger.info(f"📂 Chargement des données vectorielles depuis {vector_path}")
86
+ with open(vector_path, "rb") as f:
87
+ chunk_texts: List[str] = pickle.load(f)
88
+ nodes = [TextNode(text=chunk) for chunk in chunk_texts]
89
+
90
+ faiss_index = faiss.read_index(index_path)
91
+ vector_store = FaissVectorStore(faiss_index=faiss_index)
92
+ self.index = VectorStoreIndex(nodes=nodes, embed_model=self.embed_model, vector_store=vector_store)
93
+
94
+ logger.info("✅ Moteur RAG (Ollama) initialisé avec succès.")
95
+
96
+ # Warmup so the model is loaded before the first real request
97
+ try:
98
+ logger.info("⚡ Warmup du modèle Ollama...")
99
+ for _ in self._complete_stream("Bonjour", max_tokens=8, raw=False):
100
+ pass
101
+ except Exception as e:
102
+ logger.warning(f"Warmup échoué : {e}")
103
+
104
+ def _complete_stream(self, prompt: str, stop: Optional[List[str]] = None,
105
+ max_tokens: int = MAX_TOKENS, raw: bool = False):
106
+ return self.llm.generate(prompt=prompt, stop=stop, max_tokens=max_tokens,
107
+ stream=True, raw=raw)
108
+
109
+ def _complete(self, prompt: str, stop: Optional[List[str]] = None,
110
+ max_tokens: int = 128, raw: bool = False) -> str:
111
+ text = self.llm.generate(prompt=prompt, stop=stop, max_tokens=max_tokens,
112
+ stream=False, raw=raw)
113
+ return (text or "").strip()
114
+
115
+ def _is_greeting(self, text: str) -> bool:
116
+ s = text.lower().strip()
117
+ return s in {"bonjour", "salut", "hello", "bonsoir", "hi", "coucou", "yo"} or len(s.split()) <= 2
118
+
119
+ def _decide_mode(self, scores: List[float], tau: float = 0.32, is_greeting: bool = False) -> str:
120
+ if is_greeting:
121
+ return "llm"
122
+ top = scores[0] if scores else 0.0
123
+ return "rag" if top >= tau else "llm"
124
+
125
+ def get_adaptive_top_k(self, question: str) -> int:
126
+ q = question.lower()
127
+ if len(q.split()) <= 7:
128
+ top_k = 8
129
+ elif any(w in q for w in ["liste", "résume", "quels sont", "explique", "comment"]):
130
+ top_k = 10
131
+ else:
132
+ top_k = 8
133
+ logger.info(f"🔢 top_k déterminé automatiquement : {top_k}")
134
+ return top_k
135
+
136
+ def rerank_nodes(self, question: str, retrieved_nodes, top_k: int = 3) -> Tuple[List[float], List[TextNode]]:
137
+ logger.info(f"🔍 Re-ranking des {len(retrieved_nodes)} chunks pour : « {question} »")
138
+ q_emb = self.embed_model.get_query_embedding(question)
139
+ scored_nodes: List[Tuple[float, TextNode]] = []
140
+ for node in retrieved_nodes:
141
+ chunk_emb = self.embed_model.get_text_embedding(node.get_content())
142
+ score = cos_sim(q_emb, chunk_emb).item()
143
+ scored_nodes.append((score, node))
144
+ ranked = sorted(scored_nodes, key=lambda x: x[0], reverse=True)
145
+ top = ranked[:top_k]
146
+ return [s for s, _ in top], [n for _, n in top]
147
+
148
+ def retrieve_context(self, question: str, top_k: int = 3) -> Tuple[str, List[TextNode], List[float]]:
149
+ retriever = self.index.as_retriever(similarity_top_k=top_k)
150
+ retrieved_nodes = retriever.retrieve(question)
151
+ scores, nodes = self.rerank_nodes(question, retrieved_nodes, top_k)
152
+ context = "\n\n".join(n.get_content()[:500] for n in nodes)
153
+ return context, nodes, scores
154
+
155
+ def ask_stream(self, question: str, allow_fallback: bool = False) -> Iterable[str]:
156
+ logger.info(f"💬 [Stream] Question reçue : {question}")
157
+ is_hello = self._is_greeting(question)
158
+ context, scores = "", []
159
+ if not is_hello:
160
+ top_k = self.get_adaptive_top_k(question)
161
+ context, _, scores = self.retrieve_context(question, top_k)
162
+
163
+ mode = self._decide_mode(scores, tau=0.32, is_greeting=is_hello)
164
+ logger.info(f"🧭 Mode choisi (stream) : {mode}")
165
+
166
+ if mode == "rag":
167
+ prompt = (
168
+ "Instruction: Réponds uniquement à partir du contexte. "
169
+ "Si la réponse n'est pas déductible, réponds exactement: \"Information non présente dans le contexte.\""
170
+ f"\n\nContexte :\n{context}\n\nQuestion : {question}\nRéponse :"
171
+ )
172
+ logger.info("📡 Début streaming (RAG)...")
173
+ for token in self._complete_stream(prompt, stop=DEFAULT_STOPS, raw=False):
174
+ yield token
175
+ logger.info("📡 Fin streaming (RAG).")
176
+ return
177
+
178
+ prompt_llm = (
179
+ "Réponds brièvement et précisément en français.\n"
180
+ f"Question : {question}\nRéponse :"
181
+ )
182
+ logger.info("📡 Début streaming (LLM pur)...")
183
+ for token in self._complete_stream(prompt_llm, stop=DEFAULT_STOPS, raw=False):
184
+ yield token
185
+ logger.info("📡 Fin streaming (LLM pur).")
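The rerank_nodes step above re-embeds every retrieved chunk and orders it by cosine similarity to the query. A minimal sketch of that step in isolation, using the same embedding model as the engine; the chunk texts are illustrative:

from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from sentence_transformers.util import cos_sim

embed_model = HuggingFaceEmbedding(model_name="intfloat/multilingual-e5-base")

question = "Quel modèle d'embedding est utilisé ?"
chunks = [
    "Le projet s'appuie sur le modèle d'embedding intfloat/multilingual-e5-base.",
    "La réunion de lancement a eu lieu en mars.",
]

q_emb = embed_model.get_query_embedding(question)
scored = []
for chunk in chunks:
    c_emb = embed_model.get_text_embedding(chunk)
    scored.append((cos_sim(q_emb, c_emb).item(), chunk))

# Highest cosine similarity first, as in rerank_nodes above.
for score, chunk in sorted(scored, key=lambda x: x[0], reverse=True):
    print(f"{score:.3f}  {chunk}")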
rag_model_ollama_v2.py ADDED
@@ -0,0 +1,302 @@
1
+
2
+ import os
3
+ import pickle
4
+ import textwrap
5
+ import logging
6
+ from typing import List, Optional, Dict, Any, Iterable
7
+
8
+ import requests
9
+ import faiss
10
+ import numpy as np
11
+ from llama_index.core import VectorStoreIndex
12
+ from llama_index.core.schema import TextNode
13
+ from llama_index.vector_stores.faiss import FaissVectorStore
14
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
15
+ from sentence_transformers.util import cos_sim
16
+
17
+
18
+ # === Logger configuration ===
19
+ logger = logging.getLogger("RAGEngine")
20
+ logger.setLevel(logging.INFO)
21
+ handler = logging.StreamHandler()
22
+ formatter = logging.Formatter("[%(asctime)s] %(levelname)s - %(message)s")
23
+ handler.setFormatter(formatter)
24
+ if not logger.handlers:
25
+ logger.addHandler(handler)
26
+
27
+ #MAX_TOKENS = 512
28
+ MAX_TOKENS = 64
29
+
30
+
31
+ class OllamaClient:
32
+ """
33
+ Minimal Ollama client for /api/generate (text completion) with streaming support.
34
+ Docs: https://github.com/ollama/ollama/blob/main/docs/api.md#generate-a-completion
35
+ """
36
+ def __init__(self, model: str, host: Optional[str] = None, timeout: int = 300):
37
+ self.model = model
38
+ self.host = host or os.getenv("OLLAMA_HOST", "http://localhost:11434")
39
+ self.timeout = timeout
40
+ self._gen_url = self.host.rstrip("/") + "/api/generate"
41
+
42
+ def generate(
43
+ self,
44
+ prompt: str,
45
+ stop: Optional[List[str]] = None,
46
+ max_tokens: Optional[int] = None,
47
+ stream: bool = False,
48
+ options: Optional[Dict[str, Any]] = None,
49
+ raw: bool = False
50
+ ) -> str | Iterable[str]:
51
+ payload = {
52
+ "model": self.model,
53
+ "prompt": prompt,
54
+ "stream": stream,
55
+ }
56
+ if raw:
57
+ payload["raw"]=True
58
+ if stop:
59
+ payload["stop"] = stop
60
+ if max_tokens is not None:
61
+ # Ollama uses "num_predict" for max new tokens
62
+ payload["num_predict"] = int(max_tokens)
63
+ if options:
64
+ payload["options"] = options
65
+
66
+ logger.debug(f"POST {self._gen_url} (stream={stream})")
67
+
68
+        if stream:
+            def _iter_stream():
+                # Stream NDJSON lines from Ollama and yield incremental text chunks.
+                with requests.post(self._gen_url, json=payload, stream=True, timeout=self.timeout) as r:
+                    r.raise_for_status()
+                    for line in r.iter_lines(decode_unicode=True):
+                        if not line:
+                            continue
+                        try:
+                            data = json.loads(line)
+                        except Exception:
+                            # In case a broken line appears
+                            continue
+                        if "response" in data and data.get("done") is not True:
+                            yield data["response"]
+                        if data.get("done"):
+                            break
+            # Returning an inner generator keeps generate() an ordinary function,
+            # so the non-streaming branch below really returns a str.
+            return _iter_stream()
84
+
85
+ # Non-streaming
86
+ r = requests.post(self._gen_url, json=payload, timeout=self.timeout)
87
+ r.raise_for_status()
88
+ data = r.json()
89
+ return data.get("response", "")
90
+
91
+
92
+ # Module-level import placed after the class to keep the header short; it still
+ # runs at import time, before OllamaClient.generate ever needs json.loads.
93
+ import json
94
+
95
+
96
+ class RAGEngine:
97
+ def __init__(
98
+ self,
99
+ model_name: str,
100
+ vector_path: str,
101
+ index_path: str,
102
+ model_threads: int = 4,
103
+ ollama_host: Optional[str] = None,
104
+ ollama_opts: Optional[Dict[str, Any]] = None,
105
+ ):
106
+ """
107
+ Args:
108
+ model_name: e.g. "nous-hermes2:Q4_K_M" or "llama3.1:8b-instruct-q4_K_M"
109
+ vector_path: pickle file with chunk texts list[str]
110
+ index_path: FAISS index path
111
+ model_threads: forwarded to Ollama via options.n_threads (if supported by the model)
112
+ ollama_host: override OLLAMA_HOST (default http://localhost:11434)
113
+ ollama_opts: extra Ollama options (e.g., temperature, top_p, num_gpu, num_thread)
114
+ """
115
+ logger.info(f"🔎 rag_model_ollama source: {__file__}")
116
+ logger.info("📦 Initialisation du moteur RAG (Ollama)...")
117
+ # Build options
118
+ opts = dict(ollama_opts or {})
119
+ # Common low-latency defaults; user can override via ollama_opts
120
+ opts.setdefault("temperature", 0.1)
121
+ # Try to pass thread hint if supported by the backend
122
+ if "num_thread" not in opts and model_threads:
123
+ opts["num_thread"] = int(model_threads)
124
+
125
+ self.llm = OllamaClient(model=model_name, host=ollama_host)
126
+ self.ollama_opts = opts
127
+
128
+ #self.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
129
+
130
+ self.embed_model = HuggingFaceEmbedding(model_name="intfloat/multilingual-e5-base")
131
+ logger.info(f"📂 Chargement des données vectorielles depuis {vector_path}")
132
+ with open(vector_path, "rb") as f:
133
+ chunk_texts = pickle.load(f)
134
+ nodes = [TextNode(text=chunk) for chunk in chunk_texts]
135
+
136
+ faiss_index = faiss.read_index(index_path)
137
+ vector_store = FaissVectorStore(faiss_index=faiss_index)
138
+ self.index = VectorStoreIndex(nodes=nodes, embed_model=self.embed_model, vector_store=vector_store)
139
+
140
+ logger.info("✅ Moteur RAG (Ollama) initialisé avec succès.")
141
+
142
+ # ---------------- LLM helpers (via Ollama) ----------------
143
+
144
+ def _complete(self, prompt: str, stop: Optional[List[str]] = None, max_tokens: int = 128,raw:bool=True) -> str:
145
+ text = self.llm.generate(
146
+ prompt=prompt,
147
+ stop=stop,
148
+ max_tokens=max_tokens,
149
+ stream=False,
150
+ options=self.ollama_opts,
151
+ raw=raw
152
+ )
153
+ # Defensive: if generate() ever returns a generator instead of a str, coerce it to a string.
154
+ try:
155
+ if hasattr(text, "__iter__") and not isinstance(text, (str, bytes)):
156
+ chunks = []
157
+ for t in text:
158
+ if not isinstance(t, (str, bytes)):
159
+ continue
160
+ chunks.append(t)
161
+ text = "".join(chunks)
162
+ except Exception:
163
+ pass
164
+ return (text or "").strip()
165
+
166
+ def _complete_stream(self, prompt: str, stop: Optional[List[str]] = None, max_tokens: int = MAX_TOKENS,raw : bool =True):
167
+ return self.llm.generate(
168
+ prompt=prompt,
169
+ stop=stop,
170
+ max_tokens=max_tokens,
171
+ stream=True,
172
+ options=self.ollama_opts,
173
+ raw=raw
174
+ )
175
+
176
+ # ---------------- Reformulation ----------------
177
+
178
+ def reformulate_question(self, question: str) -> str:
179
+ logger.info("🔁 Reformulation de la question (sans contexte)...")
180
+ prompt = f"""Tu es un assistant expert chargé de clarifier des questions floues.
181
+
182
+ Transforme la question suivante en une question claire, explicite et complète, sans ajouter d'informations extérieures.
183
+
184
+ Question floue : {question}
185
+ Question reformulée :"""
186
+ reformulated = self._complete(prompt, stop=["### Réponse:", "\n\n", "###"], max_tokens=128)
187
+ logger.info(f"📝 Reformulée : {reformulated}")
188
+ return reformulated.strip().split("###")[0]
189
+
190
+ def reformulate_with_context(self, question: str, context_sample: str) -> str:
191
+ logger.info("🔁 Reformulation de la question avec contexte...")
192
+ prompt = f"""Tu es un assistant expert en machine learning. Ton rôle est de reformuler les questions utilisateur en tenant compte du contexte ci-dessous, extrait d’un rapport technique sur un projet de reconnaissance de maladies de plantes.
193
+
194
+ Ta mission est de transformer une question vague ou floue en une question précise et adaptée au contenu du rapport. Ne donne pas une interprétation hors sujet. Ne reformule pas en termes de produits commerciaux.
195
+
196
+ Contexte :
197
+ {context_sample}
198
+
199
+ Question initiale : {question}
200
+ Question reformulée :"""
201
+ reformulated = self._complete(prompt, stop=["### Réponse:", "\n\n", "###"], max_tokens=128)
202
+ logger.info(f"📝 Reformulée avec contexte : {reformulated}")
203
+ return reformulated
204
+
205
+ # ---------------- Retrieval ----------------
206
+
207
+ def get_adaptive_top_k(self, question: str) -> int:
208
+ q = question.lower()
209
+ if len(q.split()) <= 7:
210
+ top_k = 8
211
+ elif any(w in q for w in ["liste", "résume", "quels sont", "explique", "comment"]):
212
+ top_k = 10
213
+ else:
214
+ top_k = 8
215
+ logger.info(f"🔢 top_k déterminé automatiquement : {top_k}")
216
+ return top_k
217
+
218
+ def rerank_nodes(self, question: str, retrieved_nodes, top_k: int = 3):
219
+ logger.info(f"🔍 Re-ranking des {len(retrieved_nodes)} chunks pour la question : « {question} »")
220
+ q_emb = self.embed_model.get_query_embedding(question)
221
+ scored_nodes = []
222
+
223
+ for node in retrieved_nodes:
224
+ chunk_text = node.get_content()
225
+ chunk_emb = self.embed_model.get_text_embedding(chunk_text)
226
+ score = cos_sim(q_emb, chunk_emb).item()
227
+ scored_nodes.append((score, node))
228
+
229
+ ranked_nodes = sorted(scored_nodes, key=lambda x: x[0], reverse=True)
230
+
231
+ logger.info("📊 Chunks les plus pertinents :")
232
+ for i, (score, node) in enumerate(ranked_nodes[:top_k]):
233
+ chunk_preview = textwrap.shorten(node.get_content().replace("\n", " "), width=100)
234
+ logger.info(f"#{i+1} | Score: {score:.4f} | {chunk_preview}")
235
+
236
+ return [n for _, n in ranked_nodes[:top_k]]
237
+
238
+ def retrieve_context(self, question: str, top_k: int = 3):
239
+ logger.info(f"📥 Récupération du contexte...")
240
+ retriever = self.index.as_retriever(similarity_top_k=top_k)
241
+ retrieved_nodes = retriever.retrieve(question)
242
+ reranked_nodes = self.rerank_nodes(question, retrieved_nodes, top_k)
243
+ context = "\n\n".join(n.get_content()[:500] for n in reranked_nodes)
244
+ return context, reranked_nodes
245
+
246
+ # ---------------- Public API ----------------
247
+
248
+ def ask(self, question_raw: str) -> str:
249
+ logger.info(f"💬 Question reçue : {question_raw}")
250
+ context=""
251
+ reformulate=False
252
+ if reformulate :
253
+ if len(question_raw.split()) <= 2:
254
+ context_sample, _ = self.retrieve_context(question_raw, top_k=3)
255
+ reformulated = self.reformulate_with_context(question_raw, context_sample)
256
+ else:
257
+ reformulated = self.reformulate_question(question_raw)
258
+
259
+ logger.info(f"📝 Question reformulée : {reformulated}")
260
+ top_k = self.get_adaptive_top_k(reformulated)
261
+ context, _ = self.retrieve_context(reformulated, top_k)
262
+ else:
263
+ reformulated=question_raw
264
+
265
+
266
+ prompt = f"""### Instruction: En te basant uniquement sur le contexte ci-dessous, réponds à la question de manière précise et en français.
267
+
268
+ Si la réponse ne peut pas être déduite du contexte, indique : "Information non présente dans le contexte."
269
+
270
+ Contexte :
271
+ {context}
272
+
273
+ Question : {reformulated}
274
+ ### Réponse:"""
275
+
276
+ response = self._complete(prompt, stop=["### Réponse:", "\n\n", "###"], max_tokens=MAX_TOKENS)
277
+ response = response.strip().split("###")[0]
278
+ ellipsis = "..." if len(response) > 120 else ""
279
+ logger.info(f"🧠 Réponse générée : {response[:120]}{ellipsis}")
280
+ return response
281
+
282
+ def ask_stream(self, question: str):
283
+ logger.info(f"💬 [Stream] Question reçue : {question}")
284
+ top_k = self.get_adaptive_top_k(question)
285
+ context, _ = self.retrieve_context(question, top_k)
286
+ context = ""  # TODO: test override that discards the retrieved context; remove before real use
287
+
288
+ prompt = f"""### Instruction: En te basant uniquement sur le contexte ci-dessous, réponds à la question de manière précise et en français.
289
+
290
+ Si la réponse ne peut pas être déduite du contexte, indique : "Information non présente dans le contexte."
291
+
292
+ Contexte :
293
+ {context}
294
+
295
+ Question : {question}
296
+ ### Réponse:"""
297
+
298
+ logger.info("📡 Début du streaming de la réponse...")
299
+ for token in self._complete_stream(prompt, stop=["### Réponse:", "\n\n", "###"], max_tokens=MAX_TOKENS,raw=False):
300
+ yield token
301
+
302
+ logger.info("📡 Fin du streaming de la réponse...")
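As a quick smoke test of the client above, both completion modes can be exercised directly, assuming an Ollama server is reachable on the default host and the model tag below (an example, not taken from this commit) has already been pulled:

from rag_model_ollama_v2 import OllamaClient

client = OllamaClient(model="qwen2.5:3b-instruct-q4_K_M")  # example model tag

# Non-streaming: generate() returns the full completion as a string.
print(client.generate("Réponds en un mot : quelle est la capitale de la France ?", max_tokens=16))

# Streaming: generate(stream=True) yields incremental text chunks.
for chunk in client.generate("Explique le RAG en une phrase.", max_tokens=64, stream=True):
    print(chunk, end="", flush=True)
print()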