rkonan committed
Commit e7a5765 · 1 Parent(s): 9bc7341
app_ollama_v1 copy 2.py ADDED
@@ -0,0 +1,160 @@
+
+ import os
+ import logging
+ import streamlit as st
+
+ from huggingface_hub import hf_hub_download
+
+ # ✅ New RAG engine (Ollama)
+ from rag_model_ollama_v1 import RAGEngine
+
+ # --- Config & logs ---
+ os.environ.setdefault("NLTK_DATA", "/home/appuser/nltk_data")
+
+ logger = logging.getLogger("Streamlit")
+ logger.setLevel(logging.INFO)
+ handler = logging.StreamHandler()
+ formatter = logging.Formatter("[%(asctime)s] %(levelname)s - %(message)s")
+ handler.setFormatter(formatter)
+ if not logger.handlers:
+     logger.addHandler(handler)
+
+ st.set_page_config(page_title="Chatbot RAG (Ollama)", page_icon="🤖")
+
+ # --- ENV ---
+ ENV = os.getenv("ENV", "local")  # "local" or "space"
+ logger.info(f"ENV: {ENV}")
+
+ # --- FAISS & chunks paths ---
+ if ENV == "local":
+     # Adjust these paths to your local filesystem
+     faiss_index_path = "chatbot-models/vectordb_docling/index.faiss"
+     vectors_path = "chatbot-models/vectordb_docling/chunks.pkl"
+ else:
+     # Download from Hugging Face (private/public dataset depending on your settings)
+     faiss_index_path = hf_hub_download(
+         repo_id="rkonan/chatbot-models",
+         filename="chatbot-models/vectordb_docling/index.faiss",
+         repo_type="dataset"
+     )
+     vectors_path = hf_hub_download(
+         repo_id="rkonan/chatbot-models",
+         filename="chatbot-models/vectordb_docling/chunks.pkl",
+         repo_type="dataset"
+     )
+
+ # --- UI Sidebar ---
+ st.sidebar.header("⚙️ Paramètres")
+ default_host = os.getenv("OLLAMA_HOST", "http://localhost:11434")
+ ollama_host = st.sidebar.text_input("Ollama host", value=default_host, help="Ex: http://localhost:11434")
+
+ # Suggest models that are already installed or common ones
+ suggested_models = [
+     "qwen2.5:3b-instruct-q4_K_M",
+     "noushermes_rag",
+     "mistral",        # installed locally
+     "gemma3",         # installed locally
+     "deepseek-r1",    # installed locally (long reasoning, slower)
+     "granite3.3",     # installed locally
+     "llama3.1:8b-instruct-q4_K_M",
+     "nous-hermes2:Q4_K_M",
+ ]
+ model_name = st.sidebar.selectbox("Modèle Ollama", options=suggested_models, index=0)
+ num_threads = st.sidebar.slider("Threads (hint)", min_value=2, max_value=16, value=6, step=1)
+ temperature = st.sidebar.slider("Température", min_value=0.0, max_value=1.5, value=0.1, step=0.1)
+
+ st.title("🤖 Chatbot RAG Local (Ollama)")
+
+ # --- Engine cache ---
+ # (previous variant, kept for reference; its decorator is commented out too so it
+ # does not also wrap the active load_rag_engine below)
+ # @st.cache_resource(show_spinner=True)
+ # def load_rag_engine(_model_name: str, _host: str, _threads: int, _temp: float, _version: int = 1):
+ #     # Ollama options
+ #     ollama_opts = {
+ #         "num_thread": int(_threads),
+ #         "temperature": float(_temp),
+ #         "num_ctx": 256,
+ #         "num_batch": 16,
+ #     }
+ #
+ #     rag = RAGEngine(
+ #         model_name=_model_name,
+ #         vector_path=vectors_path,
+ #         index_path=faiss_index_path,
+ #         model_threads=_threads,
+ #         ollama_host=_host,
+ #         ollama_opts=ollama_opts
+ #     )
+ #
+ #     # Light warmup (avoids first-token latency)
+ #     try:
+ #         gen = rag._complete_stream("Bonjour", max_tokens=1)
+ #         next(gen, "")
+ #     except Exception as e:
+ #         logger.warning(f"Warmup Ollama échoué: {e}")
+ #     return rag
+
+ @st.cache_resource(show_spinner=True)
+ def load_rag_engine(_model_name: str, _host: str, _threads: int, _temp: float, _version: int = 1):
+     # Apply KEEP_ALIVE so the model stays in memory after use
+     os.environ["OLLAMA_KEEP_ALIVE"] = "15m"
+
+     ollama_opts = {
+         "num_thread": int(_threads),
+         "temperature": float(_temp),
+         "num_ctx": 512,   # same as the CLI
+         "num_batch": 16,
+     }
+
+     rag = RAGEngine(
+         model_name=_model_name,
+         vector_path=vectors_path,
+         index_path=faiss_index_path,
+         model_threads=_threads,
+         ollama_host=_host,
+         ollama_opts=ollama_opts
+     )
+
+     # Warmup close to the CLI (more than 1 token, to fill the cache)
+     try:
+         list(rag._complete_stream("Bonjour", max_tokens=8))
+     except Exception as e:
+         logger.warning(f"Warmup Ollama échoué: {e}")
+
+     return rag
+
+ rag = load_rag_engine(model_name, ollama_host, num_threads, temperature, _version=2)
+
+ # --- Simple chat ---
+ user_input = st.text_area("Posez votre question :", height=120, placeholder="Ex: Quels sont les traitements appliqués aux images ?")
+ col1, col2 = st.columns([1, 1])
+
+ if col1.button("Envoyer"):
+     if user_input.strip():
+         with st.spinner("Génération en cours..."):
+             try:
+                 response = rag.ask(user_input)
+                 st.markdown("**Réponse :**")
+                 st.success(response)
+             except Exception as e:
+                 st.error(f"Erreur pendant la génération: {e}")
+     else:
+         st.info("Saisissez une question.")
+
+ if col2.button("Envoyer (stream)"):
+     if user_input.strip():
+         with st.spinner("Génération en cours (stream)..."):
+             try:
+                 # Token-by-token display
+                 ph = st.empty()
+                 acc = ""
+                 for token in rag.ask_stream(user_input):
+                     acc += token
+                     ph.markdown(acc)
+                 st.balloons()
+             except Exception as e:
+                 st.error(f"Erreur pendant la génération (stream): {e}")
+     else:
+         st.info("Saisissez une question.")
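
A note on the cached loader above, based on Streamlit's documented caching rules rather than anything in this commit: st.cache_resource skips parameters whose names start with an underscore when it builds the cache key, so _model_name, _host, _threads, _temp and the _version=2 bump never invalidate the cached engine on their own. A minimal standalone sketch of that behavior, with illustrative names:

import streamlit as st

@st.cache_resource(show_spinner=False)
def build_engine(model_name: str, _version: int = 1):
    # `model_name` is hashed into the cache key; `_version` is ignored because
    # Streamlit excludes underscore-prefixed parameters from hashing.
    return {"model": model_name, "version_at_build": _version}

a = build_engine("qwen2.5:3b-instruct-q4_K_M", _version=1)
b = build_engine("qwen2.5:3b-instruct-q4_K_M", _version=2)
st.write(a is b)  # True: bumping _version alone does not rebuild the resource

If a rebuild is really wanted, dropping the underscore from the version parameter or calling load_rag_engine.clear() are the usual ways to force it.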
app_ollama_v1 copy.py ADDED
@@ -0,0 +1,123 @@
+ import os
+ import logging
+ import streamlit as st
+ from huggingface_hub import hf_hub_download
+
+ # ✅ New RAG engine (Ollama)
+ from rag_model_ollama_v1 import RAGEngine
+
+ # --- Config & logs ---
+ os.environ.setdefault("NLTK_DATA", "/home/appuser/nltk_data")
+ os.environ["OLLAMA_KEEP_ALIVE"] = "15m"  # keeps the model warm
+
+ logger = logging.getLogger("Streamlit")
+ logger.setLevel(logging.INFO)
+ handler = logging.StreamHandler()
+ formatter = logging.Formatter("[%(asctime)s] %(levelname)s - %(message)s")
+ handler.setFormatter(formatter)
+ if not logger.handlers:
+     logger.addHandler(handler)
+
+ st.set_page_config(page_title="Chatbot RAG (Ollama)", page_icon="🤖")
+
+ # --- ENV ---
+ ENV = os.getenv("ENV", "local")  # "local" or "space"
+ logger.info(f"ENV: {ENV}")
+
+ # --- FAISS & chunks paths ---
+ if ENV == "local":
+     faiss_index_path = "chatbot-models/vectordb_docling/index.faiss"
+     vectors_path = "chatbot-models/vectordb_docling/chunks.pkl"
+ else:
+     faiss_index_path = hf_hub_download(
+         repo_id="rkonan/chatbot-models",
+         filename="chatbot-models/vectordb_docling/index.faiss",
+         repo_type="dataset"
+     )
+     vectors_path = hf_hub_download(
+         repo_id="rkonan/chatbot-models",
+         filename="chatbot-models/vectordb_docling/chunks.pkl",
+         repo_type="dataset"
+     )
+
+ # --- UI Sidebar ---
+ st.sidebar.header("⚙️ Paramètres")
+ default_host = os.getenv("OLLAMA_HOST", "http://localhost:11434")
+ ollama_host = st.sidebar.text_input("Ollama host", value=default_host)
+
+ suggested_models = [
+     "qwen2.5:3b-instruct-q4_K_M",
+     "noushermes_rag",
+     "mistral",
+     "gemma3",
+     "deepseek-r1",
+     "granite3.3",
+     "llama3.1:8b-instruct-q4_K_M",
+     "nous-hermes2:Q4_K_M",
+ ]
+ model_name = st.sidebar.selectbox("Modèle Ollama", options=suggested_models, index=0)
+ num_threads = st.sidebar.slider("Threads", min_value=2, max_value=16, value=6, step=1)
+ temperature = st.sidebar.slider("Température", min_value=0.0, max_value=1.5, value=0.1, step=0.1)
+
+ st.title("🤖 Chatbot RAG Local (Ollama)")
+
+ # --- Engine cache ---
+ @st.cache_resource(show_spinner=True)
+ def load_rag_engine(_model_name: str, _host: str, _threads: int, _temp: float):
+     ollama_opts = {
+         "num_thread": int(_threads),
+         "temperature": float(_temp),
+         "num_ctx": 512,   # same as the CLI
+         "num_batch": 16,
+     }
+
+     rag = RAGEngine(
+         model_name=_model_name,
+         vector_path=vectors_path,
+         index_path=faiss_index_path,
+         model_threads=_threads,
+         ollama_host=_host,
+         ollama_opts=ollama_opts
+     )
+
+     # Warmup close to the CLI
+     try:
+         list(rag._complete_stream("Bonjour", max_tokens=8))
+     except Exception as e:
+         logger.warning(f"Warmup Ollama échoué: {e}")
+
+     return rag
+
+ rag = load_rag_engine(model_name, ollama_host, num_threads, temperature)
+
+ # --- Chat ---
+ user_input = st.text_area("Posez votre question :", height=120,
+                           placeholder="Ex: Quels sont les traitements appliqués aux images ?")
+ col1, col2 = st.columns([1, 1])
+
+ if col1.button("Envoyer"):
+     if user_input.strip():
+         with st.spinner("Génération en cours..."):
+             try:
+                 response = rag.ask(user_input)
+                 st.markdown("**Réponse :**")
+                 st.success(response)
+             except Exception as e:
+                 st.error(f"Erreur pendant la génération: {e}")
+     else:
+         st.info("Saisissez une question.")
+
+ if col2.button("Envoyer (stream)"):
+     if user_input.strip():
+         with st.spinner("Génération en cours (stream)..."):
+             try:
+                 ph = st.empty()
+                 acc = ""
+                 for token in rag.ask_stream(user_input):
+                     acc += token
+                     ph.markdown(acc)
+                 st.balloons()
+             except Exception as e:
+                 st.error(f"Erreur pendant la génération (stream): {e}")
+     else:
+         st.info("Saisissez une question.")
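
One caveat on the OLLAMA_KEEP_ALIVE line above: that environment variable is read by the ollama serve process when it starts, so setting it inside the Streamlit process only has an effect if the server inherits this environment (the attached log shows a server running with OLLAMA_KEEP_ALIVE:5m0s). The Ollama HTTP API also accepts a per-request keep_alive field, which works against an already running server; a small sketch under that assumption, reusing the host and model names from this commit:

import requests

resp = requests.post(
    "http://localhost:11434/api/generate",
    json={
        "model": "qwen2.5:3b-instruct-q4_K_M",
        "prompt": "Bonjour",
        "stream": False,
        "keep_alive": "15m",  # keep the model loaded for 15 minutes after this call
    },
    timeout=300,
)
resp.raise_for_status()
print(resp.json()["response"])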
app_ollama_v1.py CHANGED
@@ -1,11 +1,9 @@
-
  import os
  import logging
  import streamlit as st
-
  from huggingface_hub import hf_hub_download
 
- # ✅ New RAG engine (Ollama)
+ # ✅ New RAG engine (without ollama_opts)
  from rag_model_ollama_v1 import RAGEngine
 
  # --- Config & logs ---
@@ -22,16 +20,14 @@ if not logger.handlers:
  st.set_page_config(page_title="Chatbot RAG (Ollama)", page_icon="🤖")
 
  # --- ENV ---
- ENV = os.getenv("ENV", "local")  # "local" or "space"
+ ENV = os.getenv("ENV", "local")
  logger.info(f"ENV: {ENV}")
 
  # --- FAISS & chunks paths ---
  if ENV == "local":
-     # Adjust these paths to your local filesystem
      faiss_index_path = "chatbot-models/vectordb_docling/index.faiss"
      vectors_path = "chatbot-models/vectordb_docling/chunks.pkl"
  else:
-     # Download from Hugging Face (private/public dataset depending on your settings)
      faiss_index_path = hf_hub_download(
          repo_id="rkonan/chatbot-models",
          filename="chatbot-models/vectordb_docling/index.faiss",
@@ -45,19 +41,17 @@ else:
 
  # --- UI Sidebar ---
  st.sidebar.header("⚙️ Paramètres")
- default_host = os.getenv("OLLAMA_HOST", "http://localhost:11434")
- ollama_host = st.sidebar.text_input("Ollama host", value=default_host, help="Ex: http://localhost:11434")
-
- # Suggest models that are already installed or common ones
+ default_host = os.getenv("OLLAMA_HOST", "http://localhost:11435")
+ ollama_host = st.sidebar.text_input("Ollama host", value=default_host)
  suggested_models = [
+     "qwen2.5:3b-instruct-q4_K_M",
      "noushermes_rag",
-     "mistral",        # installed locally
-     "gemma3",         # installed locally
-     "deepseek-r1",    # installed locally (long reasoning, slower)
-     "granite3.3",     # installed locally
+     "mistral",
+     "gemma3",
+     "deepseek-r1",
+     "granite3.3",
      "llama3.1:8b-instruct-q4_K_M",
      "nous-hermes2:Q4_K_M",
-
  ]
  model_name = st.sidebar.selectbox("Modèle Ollama", options=suggested_models, index=0)
  num_threads = st.sidebar.slider("Threads (hint)", min_value=2, max_value=16, value=6, step=1)
@@ -67,54 +61,41 @@ st.title("🤖 Chatbot RAG Local (Ollama)")
 
  # --- Engine cache ---
  @st.cache_resource(show_spinner=True)
- def load_rag_engine(_model_name: str, _host: str, _threads: int, _temp: float, _version: int = 1):
-     # Ollama options
-     ollama_opts = {
-         "num_thread": int(_threads),
-         "temperature": float(_temp),
-     }
-
+ def load_rag_engine(_model_name: str, _host: str, _threads: int, _temp: float):
+     os.environ["OLLAMA_KEEP_ALIVE"] = "15m"
      rag = RAGEngine(
          model_name=_model_name,
          vector_path=vectors_path,
          index_path=faiss_index_path,
          model_threads=_threads,
-         ollama_host=_host,
-         ollama_opts=ollama_opts
+         ollama_host=_host
+         # ❌ no ollama_opts → Ollama picks the defaults
      )
-
-     # Light warmup (avoids first-token latency)
-     try:
-         gen = rag._complete_stream("Bonjour", max_tokens=1)
-         next(gen, "")
-
-     except Exception as e:
-         logger.warning(f"Warmup Ollama échoué: {e}")
      return rag
 
- rag = load_rag_engine(model_name, ollama_host, num_threads, temperature, _version=2)
+ rag = load_rag_engine(model_name, ollama_host, num_threads, temperature)
 
  # --- Simple chat ---
- user_input = st.text_area("Posez votre question :", height=120, placeholder="Ex: Quels sont les traitements appliqués aux images ?")
- col1, col2 = st.columns([1,1])
-
- if col1.button("Envoyer"):
-     if user_input.strip():
-         with st.spinner("Génération en cours..."):
-             try:
-                 response = rag.ask(user_input)
-                 st.markdown("**Réponse :**")
-                 st.success(response)
-             except Exception as e:
-                 st.error(f"Erreur pendant la génération: {e}")
-     else:
-         st.info("Saisissez une question.")
+ user_input = st.text_area("Posez votre question :", height=120,
+                           placeholder="Ex: Quels sont les traitements appliqués aux images ?")
+ col1, col2 = st.columns([1, 1])
+
+ # if col1.button("Envoyer"):
+ #     if user_input.strip():
+ #         with st.spinner("Génération en cours..."):
+ #             try:
+ #                 response = rag.ask(user_input)
+ #                 st.markdown("**Réponse :**")
+ #                 st.success(response)
+ #             except Exception as e:
+ #                 st.error(f"Erreur pendant la génération: {e}")
+ #     else:
+ #         st.info("Saisissez une question.")
 
  if col2.button("Envoyer (stream)"):
      if user_input.strip():
          with st.spinner("Génération en cours (stream)..."):
              try:
-                 # Token-by-token display
                  ph = st.empty()
                  acc = ""
                  for token in rag.ask_stream(user_input):
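
The change above removes ollama_opts, so the server-side defaults apply (the attached log shows a runner previously launched with --ctx-size 256 --batch-size 16, while the server config reports OLLAMA_CONTEXT_LENGTH:4096). If the goal is to compare the two setups, a rough timing sketch along these lines may help; the endpoint, model and option values are taken from elsewhere in this commit, and the helper name is made up:

import time
import requests

def timed_generate(options=None):
    # Single non-streaming /api/generate call; returns wall-clock time and generated token count.
    payload = {"model": "qwen2.5:3b-instruct-q4_K_M", "prompt": "Bonjour", "stream": False}
    if options is not None:
        payload["options"] = options
    t0 = time.time()
    r = requests.post("http://localhost:11434/api/generate", json=payload, timeout=300)
    r.raise_for_status()
    return round(time.time() - t0, 2), r.json().get("eval_count")

print("explicit options:", timed_generate({"num_ctx": 256, "num_batch": 16, "num_thread": 6}))
print("server defaults :", timed_generate())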
app_ollama_v1_chat.py ADDED
@@ -0,0 +1,99 @@
+ import os
+ import logging
+ import streamlit as st
+ import requests
+ import json
+
+ # --- Config & logs ---
+ os.environ.setdefault("NLTK_DATA", "/home/appuser/nltk_data")
+ logger = logging.getLogger("Streamlit")
+ logger.setLevel(logging.INFO)
+ handler = logging.StreamHandler()
+ formatter = logging.Formatter("[%(asctime)s] %(levelname)s - %(message)s")
+ handler.setFormatter(formatter)
+ if not logger.handlers:
+     logger.addHandler(handler)
+
+ st.set_page_config(page_title="Chat Ollama", page_icon="🤖")
+
+ # --- UI Sidebar ---
+ st.sidebar.header("⚙️ Paramètres")
+ default_host = os.getenv("OLLAMA_HOST", "http://localhost:11435")
+ ollama_host = st.sidebar.text_input("Ollama host", value=default_host)
+ model_name = st.sidebar.text_input("Modèle Ollama", value="qwen2.5:3b-instruct-q4_K_M")
+
+ st.title("💬 Chat Ollama (simple)")
+
+ # --- History ---
+ if "messages" not in st.session_state:
+     st.session_state["messages"] = []
+
+ user_input = st.text_area("Votre message :", height=100, placeholder="Ex: Bonjour ?")
+ col1, col2 = st.columns([1, 1])
+
+ # --- /api/chat call helper ---
+ def ollama_chat(messages, stream=False):
+     url = ollama_host.rstrip("/") + "/api/chat"
+     payload = {"model": model_name, "messages": messages, "stream": stream}
+
+     if stream:
+         # returns a generator of tokens
+         def token_gen():
+             with requests.post(url, json=payload, stream=True, timeout=300) as r:
+                 r.raise_for_status()
+                 for line in r.iter_lines(decode_unicode=True):
+                     if not line:
+                         continue
+                     data = json.loads(line)
+                     if "message" in data and not data.get("done"):
+                         yield data["message"]["content"]
+                     if data.get("done"):
+                         break
+         return token_gen()
+     else:
+         # returns the full response directly (dict)
+         r = requests.post(url, json=payload, timeout=300)
+         r.raise_for_status()
+         return r.json()
+
+ # --- Button: normal send ---
+ if col1.button("Envoyer"):
+     if user_input.strip():
+         st.session_state["messages"].append({"role": "user", "content": user_input})
+         with st.spinner("Génération en cours..."):
+             try:
+                 result = ollama_chat(st.session_state["messages"], stream=False)
+                 content = result.get("message", {}).get("content", "")
+                 st.session_state["messages"].append({"role": "assistant", "content": content})
+                 st.markdown("**Réponse :**")
+                 st.success(content)
+                 st.write(f"⏱ Temps total : {result['total_duration']/1e9:.2f}s")
+                 st.write(f"📝 Tokens prompt : {result['prompt_eval_count']}, génération : {result['eval_count']}")
+             except Exception as e:
+                 st.error(f"Erreur: {e}")
+     else:
+         st.info("Saisissez un message.")
+
+ # --- Button: streaming send ---
+ if col2.button("Envoyer (stream)"):
+     if user_input.strip():
+         st.session_state["messages"].append({"role": "user", "content": user_input})
+         with st.spinner("Génération en cours (stream)..."):
+             try:
+                 ph = st.empty()
+                 acc = ""
+                 for token in ollama_chat(st.session_state["messages"], stream=True):
+                     acc += token
+                     ph.markdown(acc)
+                 st.session_state["messages"].append({"role": "assistant", "content": acc})
+             except Exception as e:
+                 st.error(f"Erreur (stream): {e}")
+     else:
+         st.info("Saisissez un message.")
+
+ # --- Conversation history display ---
+ st.subheader("Historique de la conversation")
+ for msg in st.session_state["messages"]:
+     role = "🧑‍💻" if msg["role"] == "user" else "🤖"
+     st.markdown(f"{role} **{msg['role']}**: {msg['content']}")
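
For reference, each line that token_gen above reads from the streaming /api/chat response is a standalone JSON object: intermediate chunks carry message.content with done set to false, and the final chunk carries done set to true plus the timing counters used in the non-streaming branch (total_duration, prompt_eval_count, eval_count). A tiny self-contained sketch of that parsing logic against illustrative sample lines:

import json

sample_lines = [
    '{"model": "qwen2.5:3b-instruct-q4_K_M", "message": {"role": "assistant", "content": "Bon"}, "done": false}',
    '{"model": "qwen2.5:3b-instruct-q4_K_M", "message": {"role": "assistant", "content": "jour"}, "done": false}',
    '{"model": "qwen2.5:3b-instruct-q4_K_M", "done": true, "total_duration": 12149925014, "prompt_eval_count": 7, "eval_count": 2}',
]

acc = ""
for line in sample_lines:
    data = json.loads(line)
    if "message" in data and not data.get("done"):
        acc += data["message"]["content"]

print(acc)  # -> "Bonjour"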
log_app.txt ADDED
@@ -0,0 +1,378 @@
1
+ rkonan@rkonan-ThinkPad-T460:~$ OLLAMA_DEBUG=1 ollama serve
2
+ time=2025-08-09T22:42:53.523+02:00 level=INFO source=routes.go:1304 msg="server config" env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_CONTEXT_LENGTH:4096 OLLAMA_DEBUG:DEBUG OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_KV_CACHE_TYPE: OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/home/rkonan/.ollama/models OLLAMA_MULTIUSER_CACHE:false OLLAMA_NEW_ENGINE:false OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:1 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://* vscode-webview://* vscode-file://*] OLLAMA_SCHED_SPREAD:false ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
3
+ time=2025-08-09T22:42:53.524+02:00 level=INFO source=images.go:477 msg="total blobs: 9"
4
+ time=2025-08-09T22:42:53.525+02:00 level=INFO source=images.go:484 msg="total unused blobs removed: 0"
5
+ time=2025-08-09T22:42:53.525+02:00 level=INFO source=routes.go:1357 msg="Listening on 127.0.0.1:11434 (version 0.11.4)"
6
+ time=2025-08-09T22:42:53.525+02:00 level=DEBUG source=sched.go:106 msg="starting llm scheduler"
7
+ time=2025-08-09T22:42:53.525+02:00 level=INFO source=gpu.go:217 msg="looking for compatible GPUs"
8
+ time=2025-08-09T22:42:53.527+02:00 level=DEBUG source=gpu.go:98 msg="searching for GPU discovery libraries for NVIDIA"
9
+ time=2025-08-09T22:42:53.527+02:00 level=DEBUG source=gpu.go:501 msg="Searching for GPU library" name=libcuda.so*
10
+ time=2025-08-09T22:42:53.527+02:00 level=DEBUG source=gpu.go:525 msg="gpu library search" globs="[/usr/local/lib/ollama/libcuda.so* /home/rkonan/libcuda.so* /usr/local/cuda*/targets/*/lib/libcuda.so* /usr/lib/*-linux-gnu/nvidia/current/libcuda.so* /usr/lib/*-linux-gnu/libcuda.so* /usr/lib/wsl/lib/libcuda.so* /usr/lib/wsl/drivers/*/libcuda.so* /opt/cuda/lib*/libcuda.so* /usr/local/cuda/lib*/libcuda.so* /usr/lib*/libcuda.so* /usr/local/lib*/libcuda.so*]"
11
+ time=2025-08-09T22:42:53.539+02:00 level=DEBUG source=gpu.go:558 msg="discovered GPU libraries" paths=[]
12
+ time=2025-08-09T22:42:53.539+02:00 level=DEBUG source=gpu.go:501 msg="Searching for GPU library" name=libcudart.so*
13
+ time=2025-08-09T22:42:53.539+02:00 level=DEBUG source=gpu.go:525 msg="gpu library search" globs="[/usr/local/lib/ollama/libcudart.so* /home/rkonan/libcudart.so* /usr/local/lib/ollama/cuda_v*/libcudart.so* /usr/local/cuda/lib64/libcudart.so* /usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so* /usr/lib/x86_64-linux-gnu/libcudart.so* /usr/lib/wsl/lib/libcudart.so* /usr/lib/wsl/drivers/*/libcudart.so* /opt/cuda/lib64/libcudart.so* /usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so* /usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so* /usr/lib/aarch64-linux-gnu/libcudart.so* /usr/local/cuda/lib*/libcudart.so* /usr/lib*/libcudart.so* /usr/local/lib*/libcudart.so*]"
14
+ time=2025-08-09T22:42:53.543+02:00 level=DEBUG source=gpu.go:558 msg="discovered GPU libraries" paths=[/usr/local/lib/ollama/libcudart.so.12.8.90]
15
+ cudaSetDevice err: 35
16
+ time=2025-08-09T22:42:53.544+02:00 level=DEBUG source=gpu.go:574 msg="Unable to load cudart library /usr/local/lib/ollama/libcudart.so.12.8.90: your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama"
17
+ time=2025-08-09T22:42:53.544+02:00 level=DEBUG source=amd_linux.go:419 msg="amdgpu driver not detected /sys/module/amdgpu"
18
+ time=2025-08-09T22:42:53.544+02:00 level=INFO source=gpu.go:377 msg="no compatible GPUs were discovered"
19
+ time=2025-08-09T22:42:53.544+02:00 level=INFO source=types.go:130 msg="inference compute" id=0 library=cpu variant="" compute="" driver=0.0 name="" total="15.5 GiB" available="11.6 GiB"
20
+ time=2025-08-09T22:42:53.544+02:00 level=INFO source=routes.go:1398 msg="entering low vram mode" "total vram"="15.5 GiB" threshold="20.0 GiB"
21
+ time=2025-08-09T22:46:23.269+02:00 level=DEBUG source=gpu.go:391 msg="updating system memory data" before.total="15.5 GiB" before.free="11.6 GiB" before.free_swap="2.1 GiB" now.total="15.5 GiB" now.free="10.7 GiB" now.free_swap="2.1 GiB"
22
+ time=2025-08-09T22:46:23.269+02:00 level=DEBUG source=sched.go:183 msg="updating default concurrency" OLLAMA_MAX_LOADED_MODELS=3 gpu_count=1
23
+ time=2025-08-09T22:46:23.313+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=general.alignment default=32
24
+ time=2025-08-09T22:46:23.419+02:00 level=DEBUG source=sched.go:213 msg="cpu mode with first model, loading"
25
+ time=2025-08-09T22:46:23.420+02:00 level=DEBUG source=gpu.go:391 msg="updating system memory data" before.total="15.5 GiB" before.free="10.7 GiB" before.free_swap="2.1 GiB" now.total="15.5 GiB" now.free="10.7 GiB" now.free_swap="2.1 GiB"
26
+ time=2025-08-09T22:46:23.420+02:00 level=INFO source=server.go:135 msg="system memory" total="15.5 GiB" free="10.7 GiB" free_swap="2.1 GiB"
27
+ time=2025-08-09T22:46:23.420+02:00 level=DEBUG source=memory.go:111 msg=evaluating library=cpu gpu_count=1 available="[10.7 GiB]"
28
+ time=2025-08-09T22:46:23.420+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=qwen2.vision.block_count default=0
29
+ time=2025-08-09T22:46:23.420+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=qwen2.attention.key_length default=128
30
+ time=2025-08-09T22:46:23.420+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=qwen2.attention.value_length default=128
31
+ time=2025-08-09T22:46:23.421+02:00 level=INFO source=server.go:175 msg=offload library=cpu layers.requested=-1 layers.model=37 layers.offload=0 layers.split="" memory.available="[10.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="1.9 GiB" memory.required.partial="0 B" memory.required.kv="72.0 MiB" memory.required.allocations="[1.9 GiB]" memory.weights.total="1.8 GiB" memory.weights.repeating="1.6 GiB" memory.weights.nonrepeating="243.4 MiB" memory.graph.full="9.4 MiB" memory.graph.partial="252.8 MiB"
32
+ time=2025-08-09T22:46:23.421+02:00 level=DEBUG source=server.go:291 msg="compatible gpu libraries" compatible=[]
33
+ llama_model_loader: loaded meta data with 35 key-value pairs and 434 tensors from /home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 (version GGUF V3 (latest))
34
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
35
+ llama_model_loader: - kv 0: general.architecture str = qwen2
36
+ llama_model_loader: - kv 1: general.type str = model
37
+ llama_model_loader: - kv 2: general.name str = Qwen2.5 3B Instruct
38
+ llama_model_loader: - kv 3: general.finetune str = Instruct
39
+ llama_model_loader: - kv 4: general.basename str = Qwen2.5
40
+ llama_model_loader: - kv 5: general.size_label str = 3B
41
+ llama_model_loader: - kv 6: general.license str = other
42
+ llama_model_loader: - kv 7: general.license.name str = qwen-research
43
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen2.5-3...
44
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
45
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen2.5 3B
46
+ llama_model_loader: - kv 11: general.base_model.0.organization str = Qwen
47
+ llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen2.5-3B
48
+ llama_model_loader: - kv 13: general.tags arr[str,2] = ["chat", "text-generation"]
49
+ llama_model_loader: - kv 14: general.languages arr[str,1] = ["en"]
50
+ llama_model_loader: - kv 15: qwen2.block_count u32 = 36
51
+ llama_model_loader: - kv 16: qwen2.context_length u32 = 32768
52
+ llama_model_loader: - kv 17: qwen2.embedding_length u32 = 2048
53
+ llama_model_loader: - kv 18: qwen2.feed_forward_length u32 = 11008
54
+ llama_model_loader: - kv 19: qwen2.attention.head_count u32 = 16
55
+ llama_model_loader: - kv 20: qwen2.attention.head_count_kv u32 = 2
56
+ llama_model_loader: - kv 21: qwen2.rope.freq_base f32 = 1000000.000000
57
+ llama_model_loader: - kv 22: qwen2.attention.layer_norm_rms_epsilon f32 = 0.000001
58
+ llama_model_loader: - kv 23: general.file_type u32 = 15
59
+ llama_model_loader: - kv 24: tokenizer.ggml.model str = gpt2
60
+ llama_model_loader: - kv 25: tokenizer.ggml.pre str = qwen2
61
+ llama_model_loader: - kv 26: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
62
+ llama_model_loader: - kv 27: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
63
+ llama_model_loader: - kv 28: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
64
+ llama_model_loader: - kv 29: tokenizer.ggml.eos_token_id u32 = 151645
65
+ llama_model_loader: - kv 30: tokenizer.ggml.padding_token_id u32 = 151643
66
+ llama_model_loader: - kv 31: tokenizer.ggml.bos_token_id u32 = 151643
67
+ llama_model_loader: - kv 32: tokenizer.ggml.add_bos_token bool = false
68
+ llama_model_loader: - kv 33: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
69
+ llama_model_loader: - kv 34: general.quantization_version u32 = 2
70
+ llama_model_loader: - type f32: 181 tensors
71
+ llama_model_loader: - type q4_K: 216 tensors
72
+ llama_model_loader: - type q6_K: 37 tensors
73
+ print_info: file format = GGUF V3 (latest)
74
+ print_info: file type = Q4_K - Medium
75
+ print_info: file size = 1.79 GiB (4.99 BPW)
76
+ init_tokenizer: initializing tokenizer for type 2
77
+ load: control token: 151660 '<|fim_middle|>' is not marked as EOG
78
+ load: control token: 151659 '<|fim_prefix|>' is not marked as EOG
79
+ load: control token: 151653 '<|vision_end|>' is not marked as EOG
80
+ load: control token: 151648 '<|box_start|>' is not marked as EOG
81
+ load: control token: 151646 '<|object_ref_start|>' is not marked as EOG
82
+ load: control token: 151649 '<|box_end|>' is not marked as EOG
83
+ load: control token: 151655 '<|image_pad|>' is not marked as EOG
84
+ load: control token: 151651 '<|quad_end|>' is not marked as EOG
85
+ load: control token: 151647 '<|object_ref_end|>' is not marked as EOG
86
+ load: control token: 151652 '<|vision_start|>' is not marked as EOG
87
+ load: control token: 151654 '<|vision_pad|>' is not marked as EOG
88
+ load: control token: 151656 '<|video_pad|>' is not marked as EOG
89
+ load: control token: 151644 '<|im_start|>' is not marked as EOG
90
+ load: control token: 151661 '<|fim_suffix|>' is not marked as EOG
91
+ load: control token: 151650 '<|quad_start|>' is not marked as EOG
92
+ load: special tokens cache size = 22
93
+ load: token to piece cache size = 0.9310 MB
94
+ print_info: arch = qwen2
95
+ print_info: vocab_only = 1
96
+ print_info: model type = ?B
97
+ print_info: model params = 3.09 B
98
+ print_info: general.name = Qwen2.5 3B Instruct
99
+ print_info: vocab type = BPE
100
+ print_info: n_vocab = 151936
101
+ print_info: n_merges = 151387
102
+ print_info: BOS token = 151643 '<|endoftext|>'
103
+ print_info: EOS token = 151645 '<|im_end|>'
104
+ print_info: EOT token = 151645 '<|im_end|>'
105
+ print_info: PAD token = 151643 '<|endoftext|>'
106
+ print_info: LF token = 198 'Ċ'
107
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
108
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
109
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
110
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
111
+ print_info: FIM REP token = 151663 '<|repo_name|>'
112
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
113
+ print_info: EOG token = 151643 '<|endoftext|>'
114
+ print_info: EOG token = 151645 '<|im_end|>'
115
+ print_info: EOG token = 151662 '<|fim_pad|>'
116
+ print_info: EOG token = 151663 '<|repo_name|>'
117
+ print_info: EOG token = 151664 '<|file_sep|>'
118
+ print_info: max token length = 256
119
+ llama_model_load: vocab only - skipping tensors
120
+ time=2025-08-09T22:46:24.490+02:00 level=DEBUG source=gpu.go:695 msg="no filter required for library cpu"
121
+ time=2025-08-09T22:46:24.490+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="/usr/local/bin/ollama runner --model /home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 --ctx-size 256 --batch-size 16 --threads 6 --no-mmap --parallel 1 --port 37337"
122
+ time=2025-08-09T22:46:24.490+02:00 level=DEBUG source=server.go:439 msg=subprocess OLLAMA_DEBUG=1 PATH=/home/rkonan/miniconda3/bin:/home/rkonan/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/snap/bin:/home/rkonan/.local/bin:/home/rkonan/.local/bin OLLAMA_MAX_LOADED_MODELS=3 OLLAMA_LIBRARY_PATH=/usr/local/lib/ollama LD_LIBRARY_PATH=/usr/local/lib/ollama:/usr/local/lib/ollama
123
+ time=2025-08-09T22:46:24.493+02:00 level=INFO source=sched.go:481 msg="loaded runners" count=1
124
+ time=2025-08-09T22:46:24.493+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
125
+ time=2025-08-09T22:46:24.497+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server not responding"
126
+ time=2025-08-09T22:46:24.537+02:00 level=INFO source=runner.go:815 msg="starting go runner"
127
+ time=2025-08-09T22:46:24.537+02:00 level=DEBUG source=ggml.go:94 msg="ggml backend load all from path" path=/usr/local/lib/ollama
128
+ load_backend: loaded CPU backend from /usr/local/lib/ollama/libggml-cpu-haswell.so
129
+ time=2025-08-09T22:46:24.555+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 compiler=cgo(gcc)
130
+ time=2025-08-09T22:46:24.557+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:37337"
131
+ llama_model_loader: loaded meta data with 35 key-value pairs and 434 tensors from /home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 (version GGUF V3 (latest))
132
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
133
+ llama_model_loader: - kv 0: general.architecture str = qwen2
134
+ llama_model_loader: - kv 1: general.type str = model
135
+ llama_model_loader: - kv 2: general.name str = Qwen2.5 3B Instruct
136
+ llama_model_loader: - kv 3: general.finetune str = Instruct
137
+ llama_model_loader: - kv 4: general.basename str = Qwen2.5
138
+ llama_model_loader: - kv 5: general.size_label str = 3B
139
+ llama_model_loader: - kv 6: general.license str = other
140
+ llama_model_loader: - kv 7: general.license.name str = qwen-research
141
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen2.5-3...
142
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
143
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen2.5 3B
144
+ llama_model_loader: - kv 11: general.base_model.0.organization str = Qwen
145
+ llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen2.5-3B
146
+ llama_model_loader: - kv 13: general.tags arr[str,2] = ["chat", "text-generation"]
147
+ llama_model_loader: - kv 14: general.languages arr[str,1] = ["en"]
148
+ llama_model_loader: - kv 15: qwen2.block_count u32 = 36
149
+ llama_model_loader: - kv 16: qwen2.context_length u32 = 32768
150
+ llama_model_loader: - kv 17: qwen2.embedding_length u32 = 2048
151
+ llama_model_loader: - kv 18: qwen2.feed_forward_length u32 = 11008
152
+ llama_model_loader: - kv 19: qwen2.attention.head_count u32 = 16
153
+ llama_model_loader: - kv 20: qwen2.attention.head_count_kv u32 = 2
154
+ llama_model_loader: - kv 21: qwen2.rope.freq_base f32 = 1000000.000000
155
+ llama_model_loader: - kv 22: qwen2.attention.layer_norm_rms_epsilon f32 = 0.000001
156
+ llama_model_loader: - kv 23: general.file_type u32 = 15
157
+ llama_model_loader: - kv 24: tokenizer.ggml.model str = gpt2
158
+ llama_model_loader: - kv 25: tokenizer.ggml.pre str = qwen2
159
+ llama_model_loader: - kv 26: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
160
+ time=2025-08-09T22:46:24.755+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
161
+ llama_model_loader: - kv 27: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
162
+ llama_model_loader: - kv 28: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
163
+ llama_model_loader: - kv 29: tokenizer.ggml.eos_token_id u32 = 151645
164
+ llama_model_loader: - kv 30: tokenizer.ggml.padding_token_id u32 = 151643
165
+ llama_model_loader: - kv 31: tokenizer.ggml.bos_token_id u32 = 151643
166
+ llama_model_loader: - kv 32: tokenizer.ggml.add_bos_token bool = false
167
+ llama_model_loader: - kv 33: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
168
+ llama_model_loader: - kv 34: general.quantization_version u32 = 2
169
+ llama_model_loader: - type f32: 181 tensors
170
+ llama_model_loader: - type q4_K: 216 tensors
171
+ llama_model_loader: - type q6_K: 37 tensors
172
+ print_info: file format = GGUF V3 (latest)
173
+ print_info: file type = Q4_K - Medium
174
+ print_info: file size = 1.79 GiB (4.99 BPW)
175
+ init_tokenizer: initializing tokenizer for type 2
176
+ load: control token: 151660 '<|fim_middle|>' is not marked as EOG
177
+ load: control token: 151659 '<|fim_prefix|>' is not marked as EOG
178
+ load: control token: 151653 '<|vision_end|>' is not marked as EOG
179
+ load: control token: 151648 '<|box_start|>' is not marked as EOG
180
+ load: control token: 151646 '<|object_ref_start|>' is not marked as EOG
181
+ load: control token: 151649 '<|box_end|>' is not marked as EOG
182
+ load: control token: 151655 '<|image_pad|>' is not marked as EOG
183
+ load: control token: 151651 '<|quad_end|>' is not marked as EOG
184
+ load: control token: 151647 '<|object_ref_end|>' is not marked as EOG
185
+ load: control token: 151652 '<|vision_start|>' is not marked as EOG
186
+ load: control token: 151654 '<|vision_pad|>' is not marked as EOG
187
+ load: control token: 151656 '<|video_pad|>' is not marked as EOG
188
+ load: control token: 151644 '<|im_start|>' is not marked as EOG
189
+ load: control token: 151661 '<|fim_suffix|>' is not marked as EOG
190
+ load: control token: 151650 '<|quad_start|>' is not marked as EOG
191
+ load: special tokens cache size = 22
192
+ load: token to piece cache size = 0.9310 MB
193
+ print_info: arch = qwen2
194
+ print_info: vocab_only = 0
195
+ print_info: n_ctx_train = 32768
196
+ print_info: n_embd = 2048
197
+ print_info: n_layer = 36
198
+ print_info: n_head = 16
199
+ print_info: n_head_kv = 2
200
+ print_info: n_rot = 128
201
+ print_info: n_swa = 0
202
+ print_info: n_swa_pattern = 1
203
+ print_info: n_embd_head_k = 128
204
+ print_info: n_embd_head_v = 128
205
+ print_info: n_gqa = 8
206
+ print_info: n_embd_k_gqa = 256
207
+ print_info: n_embd_v_gqa = 256
208
+ print_info: f_norm_eps = 0.0e+00
209
+ print_info: f_norm_rms_eps = 1.0e-06
210
+ print_info: f_clamp_kqv = 0.0e+00
211
+ print_info: f_max_alibi_bias = 0.0e+00
212
+ print_info: f_logit_scale = 0.0e+00
213
+ print_info: f_attn_scale = 0.0e+00
214
+ print_info: n_ff = 11008
215
+ print_info: n_expert = 0
216
+ print_info: n_expert_used = 0
217
+ print_info: causal attn = 1
218
+ print_info: pooling type = -1
219
+ print_info: rope type = 2
220
+ print_info: rope scaling = linear
221
+ print_info: freq_base_train = 1000000.0
222
+ print_info: freq_scale_train = 1
223
+ print_info: n_ctx_orig_yarn = 32768
224
+ print_info: rope_finetuned = unknown
225
+ print_info: ssm_d_conv = 0
226
+ print_info: ssm_d_inner = 0
227
+ print_info: ssm_d_state = 0
228
+ print_info: ssm_dt_rank = 0
229
+ print_info: ssm_dt_b_c_rms = 0
230
+ print_info: model type = 3B
231
+ print_info: model params = 3.09 B
232
+ print_info: general.name = Qwen2.5 3B Instruct
233
+ print_info: vocab type = BPE
234
+ print_info: n_vocab = 151936
235
+ print_info: n_merges = 151387
236
+ print_info: BOS token = 151643 '<|endoftext|>'
237
+ print_info: EOS token = 151645 '<|im_end|>'
238
+ print_info: EOT token = 151645 '<|im_end|>'
239
+ print_info: PAD token = 151643 '<|endoftext|>'
240
+ print_info: LF token = 198 'Ċ'
241
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
242
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
243
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
244
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
245
+ print_info: FIM REP token = 151663 '<|repo_name|>'
246
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
247
+ print_info: EOG token = 151643 '<|endoftext|>'
248
+ print_info: EOG token = 151645 '<|im_end|>'
249
+ print_info: EOG token = 151662 '<|fim_pad|>'
250
+ print_info: EOG token = 151663 '<|repo_name|>'
251
+ print_info: EOG token = 151664 '<|file_sep|>'
252
+ print_info: max token length = 256
253
+ load_tensors: loading model tensors, this can take a while... (mmap = false)
254
+ load_tensors: layer 0 assigned to device CPU, is_swa = 0
255
+ load_tensors: layer 1 assigned to device CPU, is_swa = 0
256
+ load_tensors: layer 2 assigned to device CPU, is_swa = 0
257
+ load_tensors: layer 3 assigned to device CPU, is_swa = 0
258
+ load_tensors: layer 4 assigned to device CPU, is_swa = 0
259
+ load_tensors: layer 5 assigned to device CPU, is_swa = 0
260
+ load_tensors: layer 6 assigned to device CPU, is_swa = 0
261
+ load_tensors: layer 7 assigned to device CPU, is_swa = 0
262
+ load_tensors: layer 8 assigned to device CPU, is_swa = 0
263
+ load_tensors: layer 9 assigned to device CPU, is_swa = 0
264
+ load_tensors: layer 10 assigned to device CPU, is_swa = 0
265
+ load_tensors: layer 11 assigned to device CPU, is_swa = 0
266
+ load_tensors: layer 12 assigned to device CPU, is_swa = 0
267
+ load_tensors: layer 13 assigned to device CPU, is_swa = 0
268
+ load_tensors: layer 14 assigned to device CPU, is_swa = 0
269
+ load_tensors: layer 15 assigned to device CPU, is_swa = 0
270
+ load_tensors: layer 16 assigned to device CPU, is_swa = 0
271
+ load_tensors: layer 17 assigned to device CPU, is_swa = 0
272
+ load_tensors: layer 18 assigned to device CPU, is_swa = 0
273
+ load_tensors: layer 19 assigned to device CPU, is_swa = 0
274
+ load_tensors: layer 20 assigned to device CPU, is_swa = 0
275
+ load_tensors: layer 21 assigned to device CPU, is_swa = 0
276
+ load_tensors: layer 22 assigned to device CPU, is_swa = 0
277
+ load_tensors: layer 23 assigned to device CPU, is_swa = 0
278
+ load_tensors: layer 24 assigned to device CPU, is_swa = 0
279
+ load_tensors: layer 25 assigned to device CPU, is_swa = 0
280
+ load_tensors: layer 26 assigned to device CPU, is_swa = 0
281
+ load_tensors: layer 27 assigned to device CPU, is_swa = 0
282
+ load_tensors: layer 28 assigned to device CPU, is_swa = 0
283
+ load_tensors: layer 29 assigned to device CPU, is_swa = 0
284
+ load_tensors: layer 30 assigned to device CPU, is_swa = 0
285
+ load_tensors: layer 31 assigned to device CPU, is_swa = 0
286
+ load_tensors: layer 32 assigned to device CPU, is_swa = 0
287
+ load_tensors: layer 33 assigned to device CPU, is_swa = 0
288
+ load_tensors: layer 34 assigned to device CPU, is_swa = 0
289
+ load_tensors: layer 35 assigned to device CPU, is_swa = 0
290
+ load_tensors: layer 36 assigned to device CPU, is_swa = 0
291
+ load_tensors: CPU model buffer size = 1834.82 MiB
292
+ load_all_data: no device found for buffer type CPU for async uploads
293
+ time=2025-08-09T22:46:25.773+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.18"
294
+ time=2025-08-09T22:46:26.025+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.27"
295
+ time=2025-08-09T22:46:26.276+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.39"
296
+ time=2025-08-09T22:46:26.526+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.54"
297
+ time=2025-08-09T22:46:26.777+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.68"
298
+ time=2025-08-09T22:46:27.029+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.81"
299
+ time=2025-08-09T22:46:27.281+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.94"
300
+ llama_context: constructing llama_context
301
+ llama_context: n_batch is less than GGML_KQ_MASK_PAD - increasing to 64
302
+ llama_context: n_seq_max = 1
303
+ llama_context: n_ctx = 256
304
+ llama_context: n_ctx_per_seq = 256
305
+ llama_context: n_batch = 64
306
+ llama_context: n_ubatch = 64
307
+ llama_context: causal_attn = 1
308
+ llama_context: flash_attn = 0
309
+ llama_context: freq_base = 1000000.0
310
+ llama_context: freq_scale = 1
311
+ llama_context: n_ctx_per_seq (256) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
312
+ set_abort_callback: call
313
+ llama_context: CPU output buffer size = 0.59 MiB
314
+ create_memory: n_ctx = 256 (padded)
315
+ llama_kv_cache_unified: kv_size = 256, type_k = 'f16', type_v = 'f16', n_layer = 36, can_shift = 1, padding = 32
316
+ llama_kv_cache_unified: layer 0: dev = CPU
317
+ llama_kv_cache_unified: layer 1: dev = CPU
318
+ llama_kv_cache_unified: layer 2: dev = CPU
319
+ llama_kv_cache_unified: layer 3: dev = CPU
320
+ llama_kv_cache_unified: layer 4: dev = CPU
321
+ llama_kv_cache_unified: layer 5: dev = CPU
322
+ llama_kv_cache_unified: layer 6: dev = CPU
323
+ llama_kv_cache_unified: layer 7: dev = CPU
324
+ llama_kv_cache_unified: layer 8: dev = CPU
325
+ llama_kv_cache_unified: layer 9: dev = CPU
326
+ llama_kv_cache_unified: layer 10: dev = CPU
327
+ llama_kv_cache_unified: layer 11: dev = CPU
328
+ llama_kv_cache_unified: layer 12: dev = CPU
329
+ llama_kv_cache_unified: layer 13: dev = CPU
330
+ llama_kv_cache_unified: layer 14: dev = CPU
331
+ llama_kv_cache_unified: layer 15: dev = CPU
332
+ llama_kv_cache_unified: layer 16: dev = CPU
333
+ llama_kv_cache_unified: layer 17: dev = CPU
334
+ llama_kv_cache_unified: layer 18: dev = CPU
335
+ llama_kv_cache_unified: layer 19: dev = CPU
336
+ llama_kv_cache_unified: layer 20: dev = CPU
337
+ llama_kv_cache_unified: layer 21: dev = CPU
338
+ llama_kv_cache_unified: layer 22: dev = CPU
339
+ llama_kv_cache_unified: layer 23: dev = CPU
340
+ llama_kv_cache_unified: layer 24: dev = CPU
341
+ llama_kv_cache_unified: layer 25: dev = CPU
342
+ llama_kv_cache_unified: layer 26: dev = CPU
343
+ llama_kv_cache_unified: layer 27: dev = CPU
344
+ llama_kv_cache_unified: layer 28: dev = CPU
345
+ llama_kv_cache_unified: layer 29: dev = CPU
346
+ llama_kv_cache_unified: layer 30: dev = CPU
347
+ llama_kv_cache_unified: layer 31: dev = CPU
348
+ llama_kv_cache_unified: layer 32: dev = CPU
349
+ llama_kv_cache_unified: layer 33: dev = CPU
350
+ llama_kv_cache_unified: layer 34: dev = CPU
351
+ llama_kv_cache_unified: layer 35: dev = CPU
352
+ llama_kv_cache_unified: CPU KV buffer size = 9.00 MiB
353
+ llama_kv_cache_unified: KV self size = 9.00 MiB, K (f16): 4.50 MiB, V (f16): 4.50 MiB
354
+ llama_context: enumerating backends
355
+ llama_context: backend_ptrs.size() = 1
356
+ llama_context: max_nodes = 65536
357
+ llama_context: worst-case: n_tokens = 64, n_seqs = 1, n_outputs = 0
358
+ llama_context: reserving graph for n_tokens = 64, n_seqs = 1
359
+ llama_context: reserving graph for n_tokens = 1, n_seqs = 1
360
+ llama_context: reserving graph for n_tokens = 64, n_seqs = 1
361
+ llama_context: CPU compute buffer size = 37.59 MiB
362
+ llama_context: graph nodes = 1338
363
+ llama_context: graph splits = 1
364
+ time=2025-08-09T22:46:27.533+02:00 level=INFO source=server.go:637 msg="llama runner started in 3.04 seconds"
365
+ time=2025-08-09T22:46:27.533+02:00 level=DEBUG source=sched.go:493 msg="finished setting up" runner.name=registry.ollama.ai/library/qwen2.5:3b-instruct-q4_K_M runner.inference=cpu runner.devices=1 runner.size="1.9 GiB" runner.vram="0 B" runner.parallel=1 runner.pid=215351 runner.model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 runner.num_ctx=256
366
+ time=2025-08-09T22:46:27.533+02:00 level=DEBUG source=server.go:736 msg="completion request" images=0 prompt=7 format=""
367
+ time=2025-08-09T22:46:27.539+02:00 level=DEBUG source=cache.go:104 msg="loading cache slot" id=0 cache=0 prompt=1 used=0 remaining=1
368
+ time=2025-08-09T22:46:35.295+02:00 level=DEBUG source=sched.go:501 msg="context for request finished"
369
+ time=2025-08-09T22:46:35.295+02:00 level=DEBUG source=sched.go:341 msg="runner with non-zero duration has gone idle, adding timer" runner.name=registry.ollama.ai/library/qwen2.5:3b-instruct-q4_K_M runner.inference=cpu runner.devices=1 runner.size="1.9 GiB" runner.vram="0 B" runner.parallel=1 runner.pid=215351 runner.model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 runner.num_ctx=256 duration=5m0s
370
+ time=2025-08-09T22:46:35.295+02:00 level=DEBUG source=sched.go:359 msg="after processing request finished event" runner.name=registry.ollama.ai/library/qwen2.5:3b-instruct-q4_K_M runner.inference=cpu runner.devices=1 runner.size="1.9 GiB" runner.vram="0 B" runner.parallel=1 runner.pid=215351 runner.model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 runner.num_ctx=256 refCount=0
371
+ [GIN] 2025/08/09 - 22:46:35 | 200 | 12.149925014s | 127.0.0.1 | POST "/api/generate"
372
+ time=2025-08-09T22:47:21.815+02:00 level=DEBUG source=sched.go:613 msg="evaluating already loaded" model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6
373
+ time=2025-08-09T22:47:21.816+02:00 level=DEBUG source=server.go:736 msg="completion request" images=0 prompt=81 format=""
374
+ time=2025-08-09T22:47:21.829+02:00 level=DEBUG source=cache.go:104 msg="loading cache slot" id=0 cache=2 prompt=20 used=0 remaining=20
375
+ [GIN] 2025/08/09 - 22:49:13 | 200 | 1m51s | 127.0.0.1 | POST "/api/generate"
376
+ time=2025-08-09T22:49:13.057+02:00 level=DEBUG source=sched.go:432 msg="context for request finished" runner.name=registry.ollama.ai/library/qwen2.5:3b-instruct-q4_K_M runner.inference=cpu runner.devices=1 runner.size="1.9 GiB" runner.vram="0 B" runner.parallel=1 runner.pid=215351 runner.model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 runner.num_ctx=256
377
+ time=2025-08-09T22:49:13.057+02:00 level=DEBUG source=sched.go:341 msg="runner with non-zero duration has gone idle, adding timer" runner.name=registry.ollama.ai/library/qwen2.5:3b-instruct-q4_K_M runner.inference=cpu runner.devices=1 runner.size="1.9 GiB" runner.vram="0 B" runner.parallel=1 runner.pid=215351 runner.model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 runner.num_ctx=256 duration=5m0s
378
+ time=2025-08-09T22:49:13.058+02:00 level=DEBUG source=sched.go:359 msg="after processing request finished event" runner.name=registry.ollama.ai/library/qwen2.5:3b-instruct-q4_K_M runner.inference=cpu runner.devices=1 runner.size="1.9 GiB" runner.vram="0 B" runner.parallel=1 runner.pid=215351 runner.model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 runner.num_ctx=256 refCount=0
log_cli.txt ADDED
@@ -0,0 +1,370 @@
1
+ rkonan@rkonan-ThinkPad-T460:~$ OLLAMA_DEBUG=1 ollama serve
2
+ time=2025-08-09T22:41:31.741+02:00 level=INFO source=routes.go:1304 msg="server config" env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_CONTEXT_LENGTH:4096 OLLAMA_DEBUG:DEBUG OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_KV_CACHE_TYPE: OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/home/rkonan/.ollama/models OLLAMA_MULTIUSER_CACHE:false OLLAMA_NEW_ENGINE:false OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:1 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://* vscode-webview://* vscode-file://*] OLLAMA_SCHED_SPREAD:false ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
3
+ time=2025-08-09T22:41:31.743+02:00 level=INFO source=images.go:477 msg="total blobs: 9"
4
+ time=2025-08-09T22:41:31.743+02:00 level=INFO source=images.go:484 msg="total unused blobs removed: 0"
5
+ time=2025-08-09T22:41:31.743+02:00 level=INFO source=routes.go:1357 msg="Listening on 127.0.0.1:11434 (version 0.11.4)"
6
+ time=2025-08-09T22:41:31.744+02:00 level=DEBUG source=sched.go:106 msg="starting llm scheduler"
7
+ time=2025-08-09T22:41:31.744+02:00 level=INFO source=gpu.go:217 msg="looking for compatible GPUs"
8
+ time=2025-08-09T22:41:31.745+02:00 level=DEBUG source=gpu.go:98 msg="searching for GPU discovery libraries for NVIDIA"
9
+ time=2025-08-09T22:41:31.745+02:00 level=DEBUG source=gpu.go:501 msg="Searching for GPU library" name=libcuda.so*
10
+ time=2025-08-09T22:41:31.745+02:00 level=DEBUG source=gpu.go:525 msg="gpu library search" globs="[/usr/local/lib/ollama/libcuda.so* /home/rkonan/libcuda.so* /usr/local/cuda*/targets/*/lib/libcuda.so* /usr/lib/*-linux-gnu/nvidia/current/libcuda.so* /usr/lib/*-linux-gnu/libcuda.so* /usr/lib/wsl/lib/libcuda.so* /usr/lib/wsl/drivers/*/libcuda.so* /opt/cuda/lib*/libcuda.so* /usr/local/cuda/lib*/libcuda.so* /usr/lib*/libcuda.so* /usr/local/lib*/libcuda.so*]"
11
+ time=2025-08-09T22:41:31.750+02:00 level=DEBUG source=gpu.go:558 msg="discovered GPU libraries" paths=[]
12
+ time=2025-08-09T22:41:31.750+02:00 level=DEBUG source=gpu.go:501 msg="Searching for GPU library" name=libcudart.so*
13
+ time=2025-08-09T22:41:31.750+02:00 level=DEBUG source=gpu.go:525 msg="gpu library search" globs="[/usr/local/lib/ollama/libcudart.so* /home/rkonan/libcudart.so* /usr/local/lib/ollama/cuda_v*/libcudart.so* /usr/local/cuda/lib64/libcudart.so* /usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so* /usr/lib/x86_64-linux-gnu/libcudart.so* /usr/lib/wsl/lib/libcudart.so* /usr/lib/wsl/drivers/*/libcudart.so* /opt/cuda/lib64/libcudart.so* /usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so* /usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so* /usr/lib/aarch64-linux-gnu/libcudart.so* /usr/local/cuda/lib*/libcudart.so* /usr/lib*/libcudart.so* /usr/local/lib*/libcudart.so*]"
14
+ time=2025-08-09T22:41:31.753+02:00 level=DEBUG source=gpu.go:558 msg="discovered GPU libraries" paths=[/usr/local/lib/ollama/libcudart.so.12.8.90]
15
+ cudaSetDevice err: 35
16
+ time=2025-08-09T22:41:31.754+02:00 level=DEBUG source=gpu.go:574 msg="Unable to load cudart library /usr/local/lib/ollama/libcudart.so.12.8.90: your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama"
17
+ time=2025-08-09T22:41:31.754+02:00 level=DEBUG source=amd_linux.go:419 msg="amdgpu driver not detected /sys/module/amdgpu"
18
+ time=2025-08-09T22:41:31.754+02:00 level=INFO source=gpu.go:377 msg="no compatible GPUs were discovered"
19
+ time=2025-08-09T22:41:31.754+02:00 level=INFO source=types.go:130 msg="inference compute" id=0 library=cpu variant="" compute="" driver=0.0 name="" total="15.5 GiB" available="11.6 GiB"
20
+ time=2025-08-09T22:41:31.754+02:00 level=INFO source=routes.go:1398 msg="entering low vram mode" "total vram"="15.5 GiB" threshold="20.0 GiB"
21
+ [GIN] 2025/08/09 - 22:41:51 | 200 | 96.9µs | 127.0.0.1 | HEAD "/"
22
+ time=2025-08-09T22:41:51.222+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=general.alignment default=32
23
+ [GIN] 2025/08/09 - 22:41:51 | 200 | 110.417215ms | 127.0.0.1 | POST "/api/show"
24
+ time=2025-08-09T22:41:51.296+02:00 level=DEBUG source=gpu.go:391 msg="updating system memory data" before.total="15.5 GiB" before.free="11.6 GiB" before.free_swap="2.1 GiB" now.total="15.5 GiB" now.free="11.6 GiB" now.free_swap="2.1 GiB"
25
+ time=2025-08-09T22:41:51.296+02:00 level=DEBUG source=sched.go:183 msg="updating default concurrency" OLLAMA_MAX_LOADED_MODELS=3 gpu_count=1
26
+ time=2025-08-09T22:41:51.319+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=general.alignment default=32
27
+ time=2025-08-09T22:41:51.380+02:00 level=DEBUG source=sched.go:213 msg="cpu mode with first model, loading"
28
+ time=2025-08-09T22:41:51.380+02:00 level=DEBUG source=gpu.go:391 msg="updating system memory data" before.total="15.5 GiB" before.free="11.6 GiB" before.free_swap="2.1 GiB" now.total="15.5 GiB" now.free="11.6 GiB" now.free_swap="2.1 GiB"
29
+ time=2025-08-09T22:41:51.380+02:00 level=INFO source=server.go:135 msg="system memory" total="15.5 GiB" free="11.6 GiB" free_swap="2.1 GiB"
30
+ time=2025-08-09T22:41:51.380+02:00 level=DEBUG source=memory.go:111 msg=evaluating library=cpu gpu_count=1 available="[11.6 GiB]"
31
+ time=2025-08-09T22:41:51.380+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=qwen2.vision.block_count default=0
32
+ time=2025-08-09T22:41:51.380+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=qwen2.attention.key_length default=128
33
+ time=2025-08-09T22:41:51.380+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=qwen2.attention.value_length default=128
34
+ time=2025-08-09T22:41:51.381+02:00 level=INFO source=server.go:175 msg=offload library=cpu layers.requested=-1 layers.model=37 layers.offload=0 layers.split="" memory.available="[11.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="2.3 GiB" memory.required.partial="0 B" memory.required.kv="144.0 MiB" memory.required.allocations="[2.3 GiB]" memory.weights.total="1.8 GiB" memory.weights.repeating="1.6 GiB" memory.weights.nonrepeating="243.4 MiB" memory.graph.full="300.8 MiB" memory.graph.partial="544.2 MiB"
35
+ time=2025-08-09T22:41:51.381+02:00 level=DEBUG source=server.go:291 msg="compatible gpu libraries" compatible=[]
36
+ llama_model_loader: loaded meta data with 35 key-value pairs and 434 tensors from /home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 (version GGUF V3 (latest))
37
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
38
+ llama_model_loader: - kv 0: general.architecture str = qwen2
39
+ llama_model_loader: - kv 1: general.type str = model
40
+ llama_model_loader: - kv 2: general.name str = Qwen2.5 3B Instruct
41
+ llama_model_loader: - kv 3: general.finetune str = Instruct
42
+ llama_model_loader: - kv 4: general.basename str = Qwen2.5
43
+ llama_model_loader: - kv 5: general.size_label str = 3B
44
+ llama_model_loader: - kv 6: general.license str = other
45
+ llama_model_loader: - kv 7: general.license.name str = qwen-research
46
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen2.5-3...
47
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
48
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen2.5 3B
49
+ llama_model_loader: - kv 11: general.base_model.0.organization str = Qwen
50
+ llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen2.5-3B
51
+ llama_model_loader: - kv 13: general.tags arr[str,2] = ["chat", "text-generation"]
52
+ llama_model_loader: - kv 14: general.languages arr[str,1] = ["en"]
53
+ llama_model_loader: - kv 15: qwen2.block_count u32 = 36
54
+ llama_model_loader: - kv 16: qwen2.context_length u32 = 32768
55
+ llama_model_loader: - kv 17: qwen2.embedding_length u32 = 2048
56
+ llama_model_loader: - kv 18: qwen2.feed_forward_length u32 = 11008
57
+ llama_model_loader: - kv 19: qwen2.attention.head_count u32 = 16
58
+ llama_model_loader: - kv 20: qwen2.attention.head_count_kv u32 = 2
59
+ llama_model_loader: - kv 21: qwen2.rope.freq_base f32 = 1000000.000000
60
+ llama_model_loader: - kv 22: qwen2.attention.layer_norm_rms_epsilon f32 = 0.000001
61
+ llama_model_loader: - kv 23: general.file_type u32 = 15
62
+ llama_model_loader: - kv 24: tokenizer.ggml.model str = gpt2
63
+ llama_model_loader: - kv 25: tokenizer.ggml.pre str = qwen2
64
+ llama_model_loader: - kv 26: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
65
+ llama_model_loader: - kv 27: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
66
+ llama_model_loader: - kv 28: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
67
+ llama_model_loader: - kv 29: tokenizer.ggml.eos_token_id u32 = 151645
68
+ llama_model_loader: - kv 30: tokenizer.ggml.padding_token_id u32 = 151643
69
+ llama_model_loader: - kv 31: tokenizer.ggml.bos_token_id u32 = 151643
70
+ llama_model_loader: - kv 32: tokenizer.ggml.add_bos_token bool = false
71
+ llama_model_loader: - kv 33: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
72
+ llama_model_loader: - kv 34: general.quantization_version u32 = 2
73
+ llama_model_loader: - type f32: 181 tensors
74
+ llama_model_loader: - type q4_K: 216 tensors
75
+ llama_model_loader: - type q6_K: 37 tensors
76
+ print_info: file format = GGUF V3 (latest)
77
+ print_info: file type = Q4_K - Medium
78
+ print_info: file size = 1.79 GiB (4.99 BPW)
79
+ init_tokenizer: initializing tokenizer for type 2
80
+ load: control token: 151660 '<|fim_middle|>' is not marked as EOG
81
+ load: control token: 151659 '<|fim_prefix|>' is not marked as EOG
82
+ load: control token: 151653 '<|vision_end|>' is not marked as EOG
83
+ load: control token: 151648 '<|box_start|>' is not marked as EOG
84
+ load: control token: 151646 '<|object_ref_start|>' is not marked as EOG
85
+ load: control token: 151649 '<|box_end|>' is not marked as EOG
86
+ load: control token: 151655 '<|image_pad|>' is not marked as EOG
87
+ load: control token: 151651 '<|quad_end|>' is not marked as EOG
88
+ load: control token: 151647 '<|object_ref_end|>' is not marked as EOG
89
+ load: control token: 151652 '<|vision_start|>' is not marked as EOG
90
+ load: control token: 151654 '<|vision_pad|>' is not marked as EOG
91
+ load: control token: 151656 '<|video_pad|>' is not marked as EOG
92
+ load: control token: 151644 '<|im_start|>' is not marked as EOG
93
+ load: control token: 151661 '<|fim_suffix|>' is not marked as EOG
94
+ load: control token: 151650 '<|quad_start|>' is not marked as EOG
95
+ load: special tokens cache size = 22
96
+ load: token to piece cache size = 0.9310 MB
97
+ print_info: arch = qwen2
98
+ print_info: vocab_only = 1
99
+ print_info: model type = ?B
100
+ print_info: model params = 3.09 B
101
+ print_info: general.name = Qwen2.5 3B Instruct
102
+ print_info: vocab type = BPE
103
+ print_info: n_vocab = 151936
104
+ print_info: n_merges = 151387
105
+ print_info: BOS token = 151643 '<|endoftext|>'
106
+ print_info: EOS token = 151645 '<|im_end|>'
107
+ print_info: EOT token = 151645 '<|im_end|>'
108
+ print_info: PAD token = 151643 '<|endoftext|>'
109
+ print_info: LF token = 198 'Ċ'
110
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
111
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
112
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
113
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
114
+ print_info: FIM REP token = 151663 '<|repo_name|>'
115
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
116
+ print_info: EOG token = 151643 '<|endoftext|>'
117
+ print_info: EOG token = 151645 '<|im_end|>'
118
+ print_info: EOG token = 151662 '<|fim_pad|>'
119
+ print_info: EOG token = 151663 '<|repo_name|>'
120
+ print_info: EOG token = 151664 '<|file_sep|>'
121
+ print_info: max token length = 256
122
+ llama_model_load: vocab only - skipping tensors
123
+ time=2025-08-09T22:41:51.857+02:00 level=DEBUG source=gpu.go:695 msg="no filter required for library cpu"
124
+ time=2025-08-09T22:41:51.857+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="/usr/local/bin/ollama runner --model /home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 --ctx-size 4096 --batch-size 512 --threads 2 --no-mmap --parallel 1 --port 42013"
125
+ time=2025-08-09T22:41:51.857+02:00 level=DEBUG source=server.go:439 msg=subprocess OLLAMA_DEBUG=1 PATH=/home/rkonan/miniconda3/bin:/home/rkonan/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/snap/bin:/home/rkonan/.local/bin:/home/rkonan/.local/bin OLLAMA_MAX_LOADED_MODELS=3 OLLAMA_LIBRARY_PATH=/usr/local/lib/ollama LD_LIBRARY_PATH=/usr/local/lib/ollama:/usr/local/lib/ollama
126
+ time=2025-08-09T22:41:51.857+02:00 level=INFO source=sched.go:481 msg="loaded runners" count=1
127
+ time=2025-08-09T22:41:51.858+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
128
+ time=2025-08-09T22:41:51.858+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server not responding"
129
+ time=2025-08-09T22:41:51.877+02:00 level=INFO source=runner.go:815 msg="starting go runner"
130
+ time=2025-08-09T22:41:51.878+02:00 level=DEBUG source=ggml.go:94 msg="ggml backend load all from path" path=/usr/local/lib/ollama
131
+ load_backend: loaded CPU backend from /usr/local/lib/ollama/libggml-cpu-haswell.so
132
+ time=2025-08-09T22:41:51.892+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 compiler=cgo(gcc)
133
+ time=2025-08-09T22:41:51.892+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:42013"
134
+ llama_model_loader: loaded meta data with 35 key-value pairs and 434 tensors from /home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 (version GGUF V3 (latest))
135
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
136
+ llama_model_loader: - kv 0: general.architecture str = qwen2
137
+ llama_model_loader: - kv 1: general.type str = model
138
+ llama_model_loader: - kv 2: general.name str = Qwen2.5 3B Instruct
139
+ llama_model_loader: - kv 3: general.finetune str = Instruct
140
+ llama_model_loader: - kv 4: general.basename str = Qwen2.5
141
+ llama_model_loader: - kv 5: general.size_label str = 3B
142
+ llama_model_loader: - kv 6: general.license str = other
143
+ llama_model_loader: - kv 7: general.license.name str = qwen-research
144
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen2.5-3...
145
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
146
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen2.5 3B
147
+ llama_model_loader: - kv 11: general.base_model.0.organization str = Qwen
148
+ llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen2.5-3B
149
+ llama_model_loader: - kv 13: general.tags arr[str,2] = ["chat", "text-generation"]
150
+ llama_model_loader: - kv 14: general.languages arr[str,1] = ["en"]
151
+ llama_model_loader: - kv 15: qwen2.block_count u32 = 36
152
+ llama_model_loader: - kv 16: qwen2.context_length u32 = 32768
153
+ llama_model_loader: - kv 17: qwen2.embedding_length u32 = 2048
154
+ llama_model_loader: - kv 18: qwen2.feed_forward_length u32 = 11008
155
+ llama_model_loader: - kv 19: qwen2.attention.head_count u32 = 16
156
+ llama_model_loader: - kv 20: qwen2.attention.head_count_kv u32 = 2
157
+ llama_model_loader: - kv 21: qwen2.rope.freq_base f32 = 1000000.000000
158
+ llama_model_loader: - kv 22: qwen2.attention.layer_norm_rms_epsilon f32 = 0.000001
159
+ llama_model_loader: - kv 23: general.file_type u32 = 15
160
+ llama_model_loader: - kv 24: tokenizer.ggml.model str = gpt2
161
+ llama_model_loader: - kv 25: tokenizer.ggml.pre str = qwen2
162
+ llama_model_loader: - kv 26: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
163
+ llama_model_loader: - kv 27: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
164
+ llama_model_loader: - kv 28: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
165
+ llama_model_loader: - kv 29: tokenizer.ggml.eos_token_id u32 = 151645
166
+ llama_model_loader: - kv 30: tokenizer.ggml.padding_token_id u32 = 151643
167
+ llama_model_loader: - kv 31: tokenizer.ggml.bos_token_id u32 = 151643
168
+ llama_model_loader: - kv 32: tokenizer.ggml.add_bos_token bool = false
169
+ llama_model_loader: - kv 33: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
170
+ llama_model_loader: - kv 34: general.quantization_version u32 = 2
171
+ llama_model_loader: - type f32: 181 tensors
172
+ llama_model_loader: - type q4_K: 216 tensors
173
+ llama_model_loader: - type q6_K: 37 tensors
174
+ print_info: file format = GGUF V3 (latest)
175
+ print_info: file type = Q4_K - Medium
176
+ print_info: file size = 1.79 GiB (4.99 BPW)
177
+ time=2025-08-09T22:41:52.110+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
178
+ init_tokenizer: initializing tokenizer for type 2
179
+ load: control token: 151660 '<|fim_middle|>' is not marked as EOG
180
+ load: control token: 151659 '<|fim_prefix|>' is not marked as EOG
181
+ load: control token: 151653 '<|vision_end|>' is not marked as EOG
182
+ load: control token: 151648 '<|box_start|>' is not marked as EOG
183
+ load: control token: 151646 '<|object_ref_start|>' is not marked as EOG
184
+ load: control token: 151649 '<|box_end|>' is not marked as EOG
185
+ load: control token: 151655 '<|image_pad|>' is not marked as EOG
186
+ load: control token: 151651 '<|quad_end|>' is not marked as EOG
187
+ load: control token: 151647 '<|object_ref_end|>' is not marked as EOG
188
+ load: control token: 151652 '<|vision_start|>' is not marked as EOG
189
+ load: control token: 151654 '<|vision_pad|>' is not marked as EOG
190
+ load: control token: 151656 '<|video_pad|>' is not marked as EOG
191
+ load: control token: 151644 '<|im_start|>' is not marked as EOG
192
+ load: control token: 151661 '<|fim_suffix|>' is not marked as EOG
193
+ load: control token: 151650 '<|quad_start|>' is not marked as EOG
194
+ load: special tokens cache size = 22
195
+ load: token to piece cache size = 0.9310 MB
196
+ print_info: arch = qwen2
197
+ print_info: vocab_only = 0
198
+ print_info: n_ctx_train = 32768
199
+ print_info: n_embd = 2048
200
+ print_info: n_layer = 36
201
+ print_info: n_head = 16
202
+ print_info: n_head_kv = 2
203
+ print_info: n_rot = 128
204
+ print_info: n_swa = 0
205
+ print_info: n_swa_pattern = 1
206
+ print_info: n_embd_head_k = 128
207
+ print_info: n_embd_head_v = 128
208
+ print_info: n_gqa = 8
209
+ print_info: n_embd_k_gqa = 256
210
+ print_info: n_embd_v_gqa = 256
211
+ print_info: f_norm_eps = 0.0e+00
212
+ print_info: f_norm_rms_eps = 1.0e-06
213
+ print_info: f_clamp_kqv = 0.0e+00
214
+ print_info: f_max_alibi_bias = 0.0e+00
215
+ print_info: f_logit_scale = 0.0e+00
216
+ print_info: f_attn_scale = 0.0e+00
217
+ print_info: n_ff = 11008
218
+ print_info: n_expert = 0
219
+ print_info: n_expert_used = 0
220
+ print_info: causal attn = 1
221
+ print_info: pooling type = -1
222
+ print_info: rope type = 2
223
+ print_info: rope scaling = linear
224
+ print_info: freq_base_train = 1000000.0
225
+ print_info: freq_scale_train = 1
226
+ print_info: n_ctx_orig_yarn = 32768
227
+ print_info: rope_finetuned = unknown
228
+ print_info: ssm_d_conv = 0
229
+ print_info: ssm_d_inner = 0
230
+ print_info: ssm_d_state = 0
231
+ print_info: ssm_dt_rank = 0
232
+ print_info: ssm_dt_b_c_rms = 0
233
+ print_info: model type = 3B
234
+ print_info: model params = 3.09 B
235
+ print_info: general.name = Qwen2.5 3B Instruct
236
+ print_info: vocab type = BPE
237
+ print_info: n_vocab = 151936
238
+ print_info: n_merges = 151387
239
+ print_info: BOS token = 151643 '<|endoftext|>'
240
+ print_info: EOS token = 151645 '<|im_end|>'
241
+ print_info: EOT token = 151645 '<|im_end|>'
242
+ print_info: PAD token = 151643 '<|endoftext|>'
243
+ print_info: LF token = 198 'Ċ'
244
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
245
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
246
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
247
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
248
+ print_info: FIM REP token = 151663 '<|repo_name|>'
249
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
250
+ print_info: EOG token = 151643 '<|endoftext|>'
251
+ print_info: EOG token = 151645 '<|im_end|>'
252
+ print_info: EOG token = 151662 '<|fim_pad|>'
253
+ print_info: EOG token = 151663 '<|repo_name|>'
254
+ print_info: EOG token = 151664 '<|file_sep|>'
255
+ print_info: max token length = 256
256
+ load_tensors: loading model tensors, this can take a while... (mmap = false)
257
+ load_tensors: layer 0 assigned to device CPU, is_swa = 0
258
+ load_tensors: layer 1 assigned to device CPU, is_swa = 0
259
+ load_tensors: layer 2 assigned to device CPU, is_swa = 0
260
+ load_tensors: layer 3 assigned to device CPU, is_swa = 0
261
+ load_tensors: layer 4 assigned to device CPU, is_swa = 0
262
+ load_tensors: layer 5 assigned to device CPU, is_swa = 0
263
+ load_tensors: layer 6 assigned to device CPU, is_swa = 0
264
+ load_tensors: layer 7 assigned to device CPU, is_swa = 0
265
+ load_tensors: layer 8 assigned to device CPU, is_swa = 0
266
+ load_tensors: layer 9 assigned to device CPU, is_swa = 0
267
+ load_tensors: layer 10 assigned to device CPU, is_swa = 0
268
+ load_tensors: layer 11 assigned to device CPU, is_swa = 0
269
+ load_tensors: layer 12 assigned to device CPU, is_swa = 0
270
+ load_tensors: layer 13 assigned to device CPU, is_swa = 0
271
+ load_tensors: layer 14 assigned to device CPU, is_swa = 0
272
+ load_tensors: layer 15 assigned to device CPU, is_swa = 0
273
+ load_tensors: layer 16 assigned to device CPU, is_swa = 0
274
+ load_tensors: layer 17 assigned to device CPU, is_swa = 0
275
+ load_tensors: layer 18 assigned to device CPU, is_swa = 0
276
+ load_tensors: layer 19 assigned to device CPU, is_swa = 0
277
+ load_tensors: layer 20 assigned to device CPU, is_swa = 0
278
+ load_tensors: layer 21 assigned to device CPU, is_swa = 0
279
+ load_tensors: layer 22 assigned to device CPU, is_swa = 0
280
+ load_tensors: layer 23 assigned to device CPU, is_swa = 0
281
+ load_tensors: layer 24 assigned to device CPU, is_swa = 0
282
+ load_tensors: layer 25 assigned to device CPU, is_swa = 0
283
+ load_tensors: layer 26 assigned to device CPU, is_swa = 0
284
+ load_tensors: layer 27 assigned to device CPU, is_swa = 0
285
+ load_tensors: layer 28 assigned to device CPU, is_swa = 0
286
+ load_tensors: layer 29 assigned to device CPU, is_swa = 0
287
+ load_tensors: layer 30 assigned to device CPU, is_swa = 0
288
+ load_tensors: layer 31 assigned to device CPU, is_swa = 0
289
+ load_tensors: layer 32 assigned to device CPU, is_swa = 0
290
+ load_tensors: layer 33 assigned to device CPU, is_swa = 0
291
+ load_tensors: layer 34 assigned to device CPU, is_swa = 0
292
+ load_tensors: layer 35 assigned to device CPU, is_swa = 0
293
+ load_tensors: layer 36 assigned to device CPU, is_swa = 0
294
+ load_tensors: CPU model buffer size = 1834.82 MiB
295
+ load_all_data: no device found for buffer type CPU for async uploads
296
+ time=2025-08-09T22:41:52.865+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.28"
297
+ time=2025-08-09T22:41:53.116+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.50"
298
+ time=2025-08-09T22:41:53.366+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.72"
299
+ time=2025-08-09T22:41:53.617+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.97"
300
+ llama_context: constructing llama_context
301
+ llama_context: n_seq_max = 1
302
+ llama_context: n_ctx = 4096
303
+ llama_context: n_ctx_per_seq = 4096
304
+ llama_context: n_batch = 512
305
+ llama_context: n_ubatch = 512
306
+ llama_context: causal_attn = 1
307
+ llama_context: flash_attn = 0
308
+ llama_context: freq_base = 1000000.0
309
+ llama_context: freq_scale = 1
310
+ llama_context: n_ctx_per_seq (4096) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
311
+ set_abort_callback: call
312
+ llama_context: CPU output buffer size = 0.59 MiB
313
+ create_memory: n_ctx = 4096 (padded)
314
+ llama_kv_cache_unified: kv_size = 4096, type_k = 'f16', type_v = 'f16', n_layer = 36, can_shift = 1, padding = 32
315
+ llama_kv_cache_unified: layer 0: dev = CPU
316
+ llama_kv_cache_unified: layer 1: dev = CPU
317
+ llama_kv_cache_unified: layer 2: dev = CPU
318
+ llama_kv_cache_unified: layer 3: dev = CPU
319
+ llama_kv_cache_unified: layer 4: dev = CPU
320
+ llama_kv_cache_unified: layer 5: dev = CPU
321
+ llama_kv_cache_unified: layer 6: dev = CPU
322
+ llama_kv_cache_unified: layer 7: dev = CPU
323
+ llama_kv_cache_unified: layer 8: dev = CPU
324
+ llama_kv_cache_unified: layer 9: dev = CPU
325
+ llama_kv_cache_unified: layer 10: dev = CPU
326
+ llama_kv_cache_unified: layer 11: dev = CPU
327
+ llama_kv_cache_unified: layer 12: dev = CPU
328
+ llama_kv_cache_unified: layer 13: dev = CPU
329
+ llama_kv_cache_unified: layer 14: dev = CPU
330
+ llama_kv_cache_unified: layer 15: dev = CPU
331
+ llama_kv_cache_unified: layer 16: dev = CPU
332
+ llama_kv_cache_unified: layer 17: dev = CPU
333
+ llama_kv_cache_unified: layer 18: dev = CPU
334
+ llama_kv_cache_unified: layer 19: dev = CPU
335
+ llama_kv_cache_unified: layer 20: dev = CPU
336
+ llama_kv_cache_unified: layer 21: dev = CPU
337
+ llama_kv_cache_unified: layer 22: dev = CPU
338
+ llama_kv_cache_unified: layer 23: dev = CPU
339
+ llama_kv_cache_unified: layer 24: dev = CPU
340
+ llama_kv_cache_unified: layer 25: dev = CPU
341
+ llama_kv_cache_unified: layer 26: dev = CPU
342
+ llama_kv_cache_unified: layer 27: dev = CPU
343
+ llama_kv_cache_unified: layer 28: dev = CPU
344
+ llama_kv_cache_unified: layer 29: dev = CPU
345
+ llama_kv_cache_unified: layer 30: dev = CPU
346
+ llama_kv_cache_unified: layer 31: dev = CPU
347
+ llama_kv_cache_unified: layer 32: dev = CPU
348
+ llama_kv_cache_unified: layer 33: dev = CPU
349
+ llama_kv_cache_unified: layer 34: dev = CPU
350
+ llama_kv_cache_unified: layer 35: dev = CPU
351
+ llama_kv_cache_unified: CPU KV buffer size = 144.00 MiB
352
+ llama_kv_cache_unified: KV self size = 144.00 MiB, K (f16): 72.00 MiB, V (f16): 72.00 MiB
353
+ llama_context: enumerating backends
354
+ llama_context: backend_ptrs.size() = 1
355
+ llama_context: max_nodes = 65536
356
+ llama_context: worst-case: n_tokens = 512, n_seqs = 1, n_outputs = 0
357
+ llama_context: reserving graph for n_tokens = 512, n_seqs = 1
358
+ llama_context: reserving graph for n_tokens = 1, n_seqs = 1
359
+ llama_context: reserving graph for n_tokens = 512, n_seqs = 1
360
+ llama_context: CPU compute buffer size = 300.75 MiB
361
+ llama_context: graph nodes = 1338
362
+ llama_context: graph splits = 1
363
+ time=2025-08-09T22:41:53.869+02:00 level=INFO source=server.go:637 msg="llama runner started in 2.01 seconds"
364
+ time=2025-08-09T22:41:53.869+02:00 level=DEBUG source=sched.go:493 msg="finished setting up" runner.name=registry.ollama.ai/library/qwen2.5:3b-instruct-q4_K_M runner.inference=cpu runner.devices=1 runner.size="2.3 GiB" runner.vram="0 B" runner.parallel=1 runner.pid=213592 runner.model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 runner.num_ctx=4096
365
+ time=2025-08-09T22:41:53.870+02:00 level=DEBUG source=server.go:736 msg="completion request" images=0 prompt=155 format=""
366
+ time=2025-08-09T22:41:53.880+02:00 level=DEBUG source=cache.go:104 msg="loading cache slot" id=0 cache=0 prompt=31 used=0 remaining=31
367
+ [GIN] 2025/08/09 - 22:41:58 | 200 | 7.284707733s | 127.0.0.1 | POST "/api/generate"
368
+ time=2025-08-09T22:41:58.513+02:00 level=DEBUG source=sched.go:501 msg="context for request finished"
369
+ time=2025-08-09T22:41:58.513+02:00 level=DEBUG source=sched.go:341 msg="runner with non-zero duration has gone idle, adding timer" runner.name=registry.ollama.ai/library/qwen2.5:3b-instruct-q4_K_M runner.inference=cpu runner.devices=1 runner.size="2.3 GiB" runner.vram="0 B" runner.parallel=1 runner.pid=213592 runner.model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 runner.num_ctx=4096 duration=5m0s
370
+ time=2025-08-09T22:41:58.513+02:00 level=DEBUG source=sched.go:359 msg="after processing request finished event" runner.name=registry.ollama.ai/library/qwen2.5:3b-instruct-q4_K_M runner.inference=cpu runner.devices=1 runner.size="2.3 GiB" runner.vram="0 B" runner.parallel=1 runner.pid=213592 runner.model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 runner.num_ctx=4096 refCount=0
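For reference, the run above boils down to a single non-streaming completion against the local Ollama server: no compatible GPU is found, qwen2.5:3b-instruct-q4_K_M is loaded on CPU with a 4096-token context, and one POST /api/generate is answered in about 7 s. A minimal sketch of the client side of such a request is shown below; the prompt and option values are illustrative, and it only assumes an Ollama server listening on 127.0.0.1:11434 with that model already pulled.

import requests

# Minimal sketch: one non-streaming completion against the local Ollama server
# seen in the log above. The prompt is illustrative; num_ctx / num_thread mirror
# the --ctx-size 4096 / --threads 2 values reported by the runner.
resp = requests.post(
    "http://127.0.0.1:11434/api/generate",
    json={
        "model": "qwen2.5:3b-instruct-q4_K_M",
        "prompt": "Bonjour, peux-tu te présenter en une phrase ?",
        "stream": False,
        "options": {"num_ctx": 4096, "num_thread": 2},
    },
    timeout=300,
)
resp.raise_for_status()
print(resp.json()["response"])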
logs ADDED
@@ -0,0 +1,374 @@
1
+
2
+ rkonan@rkonan-ThinkPad-T460:~/chatbot-project/models$ OLLAMA_DEBUG=1 ollama serve
3
+ time=2025-08-09T19:38:55.291+02:00 level=INFO source=routes.go:1304 msg="server config" env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_CONTEXT_LENGTH:4096 OLLAMA_DEBUG:DEBUG OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_KV_CACHE_TYPE: OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/home/rkonan/.ollama/models OLLAMA_MULTIUSER_CACHE:false OLLAMA_NEW_ENGINE:false OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:1 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://* vscode-webview://* vscode-file://*] OLLAMA_SCHED_SPREAD:false ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
4
+ time=2025-08-09T19:38:55.293+02:00 level=INFO source=images.go:477 msg="total blobs: 9"
5
+ time=2025-08-09T19:38:55.295+02:00 level=INFO source=images.go:484 msg="total unused blobs removed: 0"
6
+ time=2025-08-09T19:38:55.297+02:00 level=INFO source=routes.go:1357 msg="Listening on 127.0.0.1:11434 (version 0.11.4)"
7
+ time=2025-08-09T19:38:55.298+02:00 level=DEBUG source=sched.go:106 msg="starting llm scheduler"
8
+ time=2025-08-09T19:38:55.299+02:00 level=INFO source=gpu.go:217 msg="looking for compatible GPUs"
9
+ time=2025-08-09T19:38:55.304+02:00 level=DEBUG source=gpu.go:98 msg="searching for GPU discovery libraries for NVIDIA"
10
+ time=2025-08-09T19:38:55.305+02:00 level=DEBUG source=gpu.go:501 msg="Searching for GPU library" name=libcuda.so*
11
+ time=2025-08-09T19:38:55.307+02:00 level=DEBUG source=gpu.go:525 msg="gpu library search" globs="[/usr/local/lib/ollama/libcuda.so* /home/rkonan/chatbot-project/models/libcuda.so* /usr/local/cuda*/targets/*/lib/libcuda.so* /usr/lib/*-linux-gnu/nvidia/current/libcuda.so* /usr/lib/*-linux-gnu/libcuda.so* /usr/lib/wsl/lib/libcuda.so* /usr/lib/wsl/drivers/*/libcuda.so* /opt/cuda/lib*/libcuda.so* /usr/local/cuda/lib*/libcuda.so* /usr/lib*/libcuda.so* /usr/local/lib*/libcuda.so*]"
12
+ time=2025-08-09T19:38:55.319+02:00 level=DEBUG source=gpu.go:558 msg="discovered GPU libraries" paths=[]
13
+ time=2025-08-09T19:38:55.323+02:00 level=DEBUG source=gpu.go:501 msg="Searching for GPU library" name=libcudart.so*
14
+ time=2025-08-09T19:38:55.323+02:00 level=DEBUG source=gpu.go:525 msg="gpu library search" globs="[/usr/local/lib/ollama/libcudart.so* /home/rkonan/chatbot-project/models/libcudart.so* /usr/local/lib/ollama/cuda_v*/libcudart.so* /usr/local/cuda/lib64/libcudart.so* /usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so* /usr/lib/x86_64-linux-gnu/libcudart.so* /usr/lib/wsl/lib/libcudart.so* /usr/lib/wsl/drivers/*/libcudart.so* /opt/cuda/lib64/libcudart.so* /usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so* /usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so* /usr/lib/aarch64-linux-gnu/libcudart.so* /usr/local/cuda/lib*/libcudart.so* /usr/lib*/libcudart.so* /usr/local/lib*/libcudart.so*]"
15
+ time=2025-08-09T19:38:55.334+02:00 level=DEBUG source=gpu.go:558 msg="discovered GPU libraries" paths=[/usr/local/lib/ollama/libcudart.so.12.8.90]
16
+ cudaSetDevice err: 35
17
+ time=2025-08-09T19:38:55.335+02:00 level=DEBUG source=gpu.go:574 msg="Unable to load cudart library /usr/local/lib/ollama/libcudart.so.12.8.90: your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama"
18
+ time=2025-08-09T19:38:55.335+02:00 level=DEBUG source=amd_linux.go:419 msg="amdgpu driver not detected /sys/module/amdgpu"
19
+ time=2025-08-09T19:38:55.336+02:00 level=INFO source=gpu.go:377 msg="no compatible GPUs were discovered"
20
+ time=2025-08-09T19:38:55.336+02:00 level=INFO source=types.go:130 msg="inference compute" id=0 library=cpu variant="" compute="" driver=0.0 name="" total="15.5 GiB" available="11.3 GiB"
21
+ time=2025-08-09T19:38:55.336+02:00 level=INFO source=routes.go:1398 msg="entering low vram mode" "total vram"="15.5 GiB" threshold="20.0 GiB"
22
+ [GIN] 2025/08/09 - 19:39:06 | 200 | 174.244µs | 127.0.0.1 | HEAD "/"
23
+ time=2025-08-09T19:39:06.454+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=general.alignment default=32
24
+ [GIN] 2025/08/09 - 19:39:06 | 200 | 141.99526ms | 127.0.0.1 | POST "/api/show"
25
+ time=2025-08-09T19:39:06.555+02:00 level=DEBUG source=gpu.go:391 msg="updating system memory data" before.total="15.5 GiB" before.free="11.3 GiB" before.free_swap="2.0 GiB" now.total="15.5 GiB" now.free="11.3 GiB" now.free_swap="2.0 GiB"
26
+ time=2025-08-09T19:39:06.555+02:00 level=DEBUG source=sched.go:183 msg="updating default concurrency" OLLAMA_MAX_LOADED_MODELS=3 gpu_count=1
27
+ time=2025-08-09T19:39:06.589+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=general.alignment default=32
28
+ time=2025-08-09T19:39:06.686+02:00 level=DEBUG source=sched.go:213 msg="cpu mode with first model, loading"
29
+ time=2025-08-09T19:39:06.686+02:00 level=DEBUG source=gpu.go:391 msg="updating system memory data" before.total="15.5 GiB" before.free="11.3 GiB" before.free_swap="2.0 GiB" now.total="15.5 GiB" now.free="11.3 GiB" now.free_swap="2.0 GiB"
30
+ time=2025-08-09T19:39:06.686+02:00 level=INFO source=server.go:135 msg="system memory" total="15.5 GiB" free="11.3 GiB" free_swap="2.0 GiB"
31
+ time=2025-08-09T19:39:06.686+02:00 level=DEBUG source=memory.go:111 msg=evaluating library=cpu gpu_count=1 available="[11.3 GiB]"
32
+ time=2025-08-09T19:39:06.686+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=qwen2.vision.block_count default=0
33
+ time=2025-08-09T19:39:06.687+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=qwen2.attention.key_length default=128
34
+ time=2025-08-09T19:39:06.687+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=qwen2.attention.value_length default=128
35
+ time=2025-08-09T19:39:06.687+02:00 level=INFO source=server.go:175 msg=offload library=cpu layers.requested=-1 layers.model=37 layers.offload=0 layers.split="" memory.available="[11.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="2.3 GiB" memory.required.partial="0 B" memory.required.kv="144.0 MiB" memory.required.allocations="[2.3 GiB]" memory.weights.total="1.8 GiB" memory.weights.repeating="1.6 GiB" memory.weights.nonrepeating="243.4 MiB" memory.graph.full="300.8 MiB" memory.graph.partial="544.2 MiB"
36
+ time=2025-08-09T19:39:06.688+02:00 level=DEBUG source=server.go:291 msg="compatible gpu libraries" compatible=[]
37
+ llama_model_loader: loaded meta data with 35 key-value pairs and 434 tensors from /home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 (version GGUF V3 (latest))
38
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
39
+ llama_model_loader: - kv 0: general.architecture str = qwen2
40
+ llama_model_loader: - kv 1: general.type str = model
41
+ llama_model_loader: - kv 2: general.name str = Qwen2.5 3B Instruct
42
+ llama_model_loader: - kv 3: general.finetune str = Instruct
43
+ llama_model_loader: - kv 4: general.basename str = Qwen2.5
44
+ llama_model_loader: - kv 5: general.size_label str = 3B
45
+ llama_model_loader: - kv 6: general.license str = other
46
+ llama_model_loader: - kv 7: general.license.name str = qwen-research
47
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen2.5-3...
48
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
49
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen2.5 3B
50
+ llama_model_loader: - kv 11: general.base_model.0.organization str = Qwen
51
+ llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen2.5-3B
52
+ llama_model_loader: - kv 13: general.tags arr[str,2] = ["chat", "text-generation"]
53
+ llama_model_loader: - kv 14: general.languages arr[str,1] = ["en"]
54
+ llama_model_loader: - kv 15: qwen2.block_count u32 = 36
55
+ llama_model_loader: - kv 16: qwen2.context_length u32 = 32768
56
+ llama_model_loader: - kv 17: qwen2.embedding_length u32 = 2048
57
+ llama_model_loader: - kv 18: qwen2.feed_forward_length u32 = 11008
58
+ llama_model_loader: - kv 19: qwen2.attention.head_count u32 = 16
59
+ llama_model_loader: - kv 20: qwen2.attention.head_count_kv u32 = 2
60
+ llama_model_loader: - kv 21: qwen2.rope.freq_base f32 = 1000000.000000
61
+ llama_model_loader: - kv 22: qwen2.attention.layer_norm_rms_epsilon f32 = 0.000001
62
+ llama_model_loader: - kv 23: general.file_type u32 = 15
63
+ llama_model_loader: - kv 24: tokenizer.ggml.model str = gpt2
64
+ llama_model_loader: - kv 25: tokenizer.ggml.pre str = qwen2
65
+ llama_model_loader: - kv 26: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
66
+ llama_model_loader: - kv 27: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
67
+ llama_model_loader: - kv 28: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
68
+ llama_model_loader: - kv 29: tokenizer.ggml.eos_token_id u32 = 151645
69
+ llama_model_loader: - kv 30: tokenizer.ggml.padding_token_id u32 = 151643
70
+ llama_model_loader: - kv 31: tokenizer.ggml.bos_token_id u32 = 151643
71
+ llama_model_loader: - kv 32: tokenizer.ggml.add_bos_token bool = false
72
+ llama_model_loader: - kv 33: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
73
+ llama_model_loader: - kv 34: general.quantization_version u32 = 2
74
+ llama_model_loader: - type f32: 181 tensors
75
+ llama_model_loader: - type q4_K: 216 tensors
76
+ llama_model_loader: - type q6_K: 37 tensors
77
+ print_info: file format = GGUF V3 (latest)
78
+ print_info: file type = Q4_K - Medium
79
+ print_info: file size = 1.79 GiB (4.99 BPW)
80
+ init_tokenizer: initializing tokenizer for type 2
81
+ load: control token: 151660 '<|fim_middle|>' is not marked as EOG
82
+ load: control token: 151659 '<|fim_prefix|>' is not marked as EOG
83
+ load: control token: 151653 '<|vision_end|>' is not marked as EOG
84
+ load: control token: 151648 '<|box_start|>' is not marked as EOG
85
+ load: control token: 151646 '<|object_ref_start|>' is not marked as EOG
86
+ load: control token: 151649 '<|box_end|>' is not marked as EOG
87
+ load: control token: 151655 '<|image_pad|>' is not marked as EOG
88
+ load: control token: 151651 '<|quad_end|>' is not marked as EOG
89
+ load: control token: 151647 '<|object_ref_end|>' is not marked as EOG
90
+ load: control token: 151652 '<|vision_start|>' is not marked as EOG
91
+ load: control token: 151654 '<|vision_pad|>' is not marked as EOG
92
+ load: control token: 151656 '<|video_pad|>' is not marked as EOG
93
+ load: control token: 151644 '<|im_start|>' is not marked as EOG
94
+ load: control token: 151661 '<|fim_suffix|>' is not marked as EOG
95
+ load: control token: 151650 '<|quad_start|>' is not marked as EOG
96
+ load: special tokens cache size = 22
97
+ load: token to piece cache size = 0.9310 MB
98
+ print_info: arch = qwen2
99
+ print_info: vocab_only = 1
100
+ print_info: model type = ?B
101
+ print_info: model params = 3.09 B
102
+ print_info: general.name = Qwen2.5 3B Instruct
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 151643 '<|endoftext|>'
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151643 '<|endoftext|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ llama_model_load: vocab only - skipping tensors
124
+ time=2025-08-09T19:39:07.278+02:00 level=DEBUG source=gpu.go:695 msg="no filter required for library cpu"
125
+ time=2025-08-09T19:39:07.278+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="/usr/local/bin/ollama runner --model /home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 --ctx-size 4096 --batch-size 512 --threads 2 --no-mmap --parallel 1 --port 43905"
126
+ time=2025-08-09T19:39:07.278+02:00 level=DEBUG source=server.go:439 msg=subprocess OLLAMA_DEBUG=1 PATH=/home/rkonan/miniconda3/bin:/home/rkonan/.local/bin:/home/rkonan/miniconda3/bin:/home/rkonan/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/snap/bin:/home/rkonan/.local/bin:/home/rkonan/.local/bin:/home/rkonan/.local/bin:/home/rkonan/.vscode/extensions/ms-python.debugpy-2025.10.0-linux-x64/bundled/scripts/noConfigScripts:/home/rkonan/.local/bin OLLAMA_MAX_LOADED_MODELS=3 OLLAMA_LIBRARY_PATH=/usr/local/lib/ollama LD_LIBRARY_PATH=/usr/local/lib/ollama:/usr/local/lib/ollama
127
+ time=2025-08-09T19:39:07.279+02:00 level=INFO source=sched.go:481 msg="loaded runners" count=1
128
+ time=2025-08-09T19:39:07.279+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
129
+ time=2025-08-09T19:39:07.279+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server not responding"
130
+ time=2025-08-09T19:39:07.309+02:00 level=INFO source=runner.go:815 msg="starting go runner"
131
+ time=2025-08-09T19:39:07.309+02:00 level=DEBUG source=ggml.go:94 msg="ggml backend load all from path" path=/usr/local/lib/ollama
132
+ load_backend: loaded CPU backend from /usr/local/lib/ollama/libggml-cpu-haswell.so
133
+ time=2025-08-09T19:39:07.336+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 compiler=cgo(gcc)
134
+ time=2025-08-09T19:39:07.338+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:43905"
135
+ llama_model_loader: loaded meta data with 35 key-value pairs and 434 tensors from /home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 (version GGUF V3 (latest))
136
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
137
+ llama_model_loader: - kv 0: general.architecture str = qwen2
138
+ llama_model_loader: - kv 1: general.type str = model
139
+ llama_model_loader: - kv 2: general.name str = Qwen2.5 3B Instruct
140
+ llama_model_loader: - kv 3: general.finetune str = Instruct
141
+ llama_model_loader: - kv 4: general.basename str = Qwen2.5
142
+ llama_model_loader: - kv 5: general.size_label str = 3B
143
+ llama_model_loader: - kv 6: general.license str = other
144
+ llama_model_loader: - kv 7: general.license.name str = qwen-research
145
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen2.5-3...
146
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
147
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen2.5 3B
148
+ llama_model_loader: - kv 11: general.base_model.0.organization str = Qwen
149
+ llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen2.5-3B
150
+ llama_model_loader: - kv 13: general.tags arr[str,2] = ["chat", "text-generation"]
151
+ llama_model_loader: - kv 14: general.languages arr[str,1] = ["en"]
152
+ llama_model_loader: - kv 15: qwen2.block_count u32 = 36
153
+ llama_model_loader: - kv 16: qwen2.context_length u32 = 32768
154
+ llama_model_loader: - kv 17: qwen2.embedding_length u32 = 2048
155
+ llama_model_loader: - kv 18: qwen2.feed_forward_length u32 = 11008
156
+ llama_model_loader: - kv 19: qwen2.attention.head_count u32 = 16
157
+ llama_model_loader: - kv 20: qwen2.attention.head_count_kv u32 = 2
158
+ llama_model_loader: - kv 21: qwen2.rope.freq_base f32 = 1000000.000000
159
+ llama_model_loader: - kv 22: qwen2.attention.layer_norm_rms_epsilon f32 = 0.000001
160
+ llama_model_loader: - kv 23: general.file_type u32 = 15
161
+ llama_model_loader: - kv 24: tokenizer.ggml.model str = gpt2
162
+ llama_model_loader: - kv 25: tokenizer.ggml.pre str = qwen2
163
+ llama_model_loader: - kv 26: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
164
+ llama_model_loader: - kv 27: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
165
+ time=2025-08-09T19:39:07.530+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
166
+ llama_model_loader: - kv 28: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
167
+ llama_model_loader: - kv 29: tokenizer.ggml.eos_token_id u32 = 151645
168
+ llama_model_loader: - kv 30: tokenizer.ggml.padding_token_id u32 = 151643
169
+ llama_model_loader: - kv 31: tokenizer.ggml.bos_token_id u32 = 151643
170
+ llama_model_loader: - kv 32: tokenizer.ggml.add_bos_token bool = false
171
+ llama_model_loader: - kv 33: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
172
+ llama_model_loader: - kv 34: general.quantization_version u32 = 2
173
+ llama_model_loader: - type f32: 181 tensors
174
+ llama_model_loader: - type q4_K: 216 tensors
175
+ llama_model_loader: - type q6_K: 37 tensors
176
+ print_info: file format = GGUF V3 (latest)
177
+ print_info: file type = Q4_K - Medium
178
+ print_info: file size = 1.79 GiB (4.99 BPW)
179
+ init_tokenizer: initializing tokenizer for type 2
180
+ load: control token: 151660 '<|fim_middle|>' is not marked as EOG
181
+ load: control token: 151659 '<|fim_prefix|>' is not marked as EOG
182
+ load: control token: 151653 '<|vision_end|>' is not marked as EOG
183
+ load: control token: 151648 '<|box_start|>' is not marked as EOG
184
+ load: control token: 151646 '<|object_ref_start|>' is not marked as EOG
185
+ load: control token: 151649 '<|box_end|>' is not marked as EOG
186
+ load: control token: 151655 '<|image_pad|>' is not marked as EOG
187
+ load: control token: 151651 '<|quad_end|>' is not marked as EOG
188
+ load: control token: 151647 '<|object_ref_end|>' is not marked as EOG
189
+ load: control token: 151652 '<|vision_start|>' is not marked as EOG
190
+ load: control token: 151654 '<|vision_pad|>' is not marked as EOG
191
+ load: control token: 151656 '<|video_pad|>' is not marked as EOG
192
+ load: control token: 151644 '<|im_start|>' is not marked as EOG
193
+ load: control token: 151661 '<|fim_suffix|>' is not marked as EOG
194
+ load: control token: 151650 '<|quad_start|>' is not marked as EOG
195
+ load: special tokens cache size = 22
196
+ load: token to piece cache size = 0.9310 MB
197
+ print_info: arch = qwen2
198
+ print_info: vocab_only = 0
199
+ print_info: n_ctx_train = 32768
200
+ print_info: n_embd = 2048
201
+ print_info: n_layer = 36
202
+ print_info: n_head = 16
203
+ print_info: n_head_kv = 2
204
+ print_info: n_rot = 128
205
+ print_info: n_swa = 0
206
+ print_info: n_swa_pattern = 1
207
+ print_info: n_embd_head_k = 128
208
+ print_info: n_embd_head_v = 128
209
+ print_info: n_gqa = 8
210
+ print_info: n_embd_k_gqa = 256
211
+ print_info: n_embd_v_gqa = 256
212
+ print_info: f_norm_eps = 0.0e+00
213
+ print_info: f_norm_rms_eps = 1.0e-06
214
+ print_info: f_clamp_kqv = 0.0e+00
215
+ print_info: f_max_alibi_bias = 0.0e+00
216
+ print_info: f_logit_scale = 0.0e+00
217
+ print_info: f_attn_scale = 0.0e+00
218
+ print_info: n_ff = 11008
219
+ print_info: n_expert = 0
220
+ print_info: n_expert_used = 0
221
+ print_info: causal attn = 1
222
+ print_info: pooling type = -1
223
+ print_info: rope type = 2
224
+ print_info: rope scaling = linear
225
+ print_info: freq_base_train = 1000000.0
226
+ print_info: freq_scale_train = 1
227
+ print_info: n_ctx_orig_yarn = 32768
228
+ print_info: rope_finetuned = unknown
229
+ print_info: ssm_d_conv = 0
230
+ print_info: ssm_d_inner = 0
231
+ print_info: ssm_d_state = 0
232
+ print_info: ssm_dt_rank = 0
233
+ print_info: ssm_dt_b_c_rms = 0
234
+ print_info: model type = 3B
235
+ print_info: model params = 3.09 B
236
+ print_info: general.name = Qwen2.5 3B Instruct
237
+ print_info: vocab type = BPE
238
+ print_info: n_vocab = 151936
239
+ print_info: n_merges = 151387
240
+ print_info: BOS token = 151643 '<|endoftext|>'
241
+ print_info: EOS token = 151645 '<|im_end|>'
242
+ print_info: EOT token = 151645 '<|im_end|>'
243
+ print_info: PAD token = 151643 '<|endoftext|>'
244
+ print_info: LF token = 198 'Ċ'
245
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
246
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
247
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
248
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
249
+ print_info: FIM REP token = 151663 '<|repo_name|>'
250
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
251
+ print_info: EOG token = 151643 '<|endoftext|>'
252
+ print_info: EOG token = 151645 '<|im_end|>'
253
+ print_info: EOG token = 151662 '<|fim_pad|>'
254
+ print_info: EOG token = 151663 '<|repo_name|>'
255
+ print_info: EOG token = 151664 '<|file_sep|>'
256
+ print_info: max token length = 256
257
+ load_tensors: loading model tensors, this can take a while... (mmap = false)
258
+ load_tensors: layer 0 assigned to device CPU, is_swa = 0
259
+ load_tensors: layer 1 assigned to device CPU, is_swa = 0
260
+ load_tensors: layer 2 assigned to device CPU, is_swa = 0
261
+ load_tensors: layer 3 assigned to device CPU, is_swa = 0
262
+ load_tensors: layer 4 assigned to device CPU, is_swa = 0
263
+ load_tensors: layer 5 assigned to device CPU, is_swa = 0
264
+ load_tensors: layer 6 assigned to device CPU, is_swa = 0
265
+ load_tensors: layer 7 assigned to device CPU, is_swa = 0
266
+ load_tensors: layer 8 assigned to device CPU, is_swa = 0
267
+ load_tensors: layer 9 assigned to device CPU, is_swa = 0
268
+ load_tensors: layer 10 assigned to device CPU, is_swa = 0
269
+ load_tensors: layer 11 assigned to device CPU, is_swa = 0
270
+ load_tensors: layer 12 assigned to device CPU, is_swa = 0
271
+ load_tensors: layer 13 assigned to device CPU, is_swa = 0
272
+ load_tensors: layer 14 assigned to device CPU, is_swa = 0
273
+ load_tensors: layer 15 assigned to device CPU, is_swa = 0
274
+ load_tensors: layer 16 assigned to device CPU, is_swa = 0
275
+ load_tensors: layer 17 assigned to device CPU, is_swa = 0
276
+ load_tensors: layer 18 assigned to device CPU, is_swa = 0
277
+ load_tensors: layer 19 assigned to device CPU, is_swa = 0
278
+ load_tensors: layer 20 assigned to device CPU, is_swa = 0
279
+ load_tensors: layer 21 assigned to device CPU, is_swa = 0
280
+ load_tensors: layer 22 assigned to device CPU, is_swa = 0
281
+ load_tensors: layer 23 assigned to device CPU, is_swa = 0
282
+ load_tensors: layer 24 assigned to device CPU, is_swa = 0
283
+ load_tensors: layer 25 assigned to device CPU, is_swa = 0
284
+ load_tensors: layer 26 assigned to device CPU, is_swa = 0
285
+ load_tensors: layer 27 assigned to device CPU, is_swa = 0
286
+ load_tensors: layer 28 assigned to device CPU, is_swa = 0
287
+ load_tensors: layer 29 assigned to device CPU, is_swa = 0
288
+ load_tensors: layer 30 assigned to device CPU, is_swa = 0
289
+ load_tensors: layer 31 assigned to device CPU, is_swa = 0
290
+ load_tensors: layer 32 assigned to device CPU, is_swa = 0
291
+ load_tensors: layer 33 assigned to device CPU, is_swa = 0
292
+ load_tensors: layer 34 assigned to device CPU, is_swa = 0
293
+ load_tensors: layer 35 assigned to device CPU, is_swa = 0
294
+ load_tensors: layer 36 assigned to device CPU, is_swa = 0
295
+ load_tensors: CPU model buffer size = 1834.82 MiB
296
+ load_all_data: no device found for buffer type CPU for async uploads
297
+ time=2025-08-09T19:39:08.284+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.19"
298
+ time=2025-08-09T19:39:08.538+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.32"
299
+ time=2025-08-09T19:39:08.791+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.42"
300
+ time=2025-08-09T19:39:09.043+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.52"
301
+ time=2025-08-09T19:39:09.294+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.67"
302
+ time=2025-08-09T19:39:09.545+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.86"
303
+ llama_context: constructing llama_context
304
+ llama_context: n_seq_max = 1
305
+ llama_context: n_ctx = 4096
306
+ llama_context: n_ctx_per_seq = 4096
307
+ llama_context: n_batch = 512
308
+ llama_context: n_ubatch = 512
309
+ llama_context: causal_attn = 1
310
+ llama_context: flash_attn = 0
311
+ llama_context: freq_base = 1000000.0
312
+ llama_context: freq_scale = 1
313
+ llama_context: n_ctx_per_seq (4096) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
314
+ set_abort_callback: call
315
+ llama_context: CPU output buffer size = 0.59 MiB
316
+ create_memory: n_ctx = 4096 (padded)
317
+ llama_kv_cache_unified: kv_size = 4096, type_k = 'f16', type_v = 'f16', n_layer = 36, can_shift = 1, padding = 32
318
+ llama_kv_cache_unified: layer 0: dev = CPU
319
+ llama_kv_cache_unified: layer 1: dev = CPU
320
+ llama_kv_cache_unified: layer 2: dev = CPU
321
+ llama_kv_cache_unified: layer 3: dev = CPU
322
+ llama_kv_cache_unified: layer 4: dev = CPU
323
+ llama_kv_cache_unified: layer 5: dev = CPU
324
+ llama_kv_cache_unified: layer 6: dev = CPU
325
+ llama_kv_cache_unified: layer 7: dev = CPU
326
+ llama_kv_cache_unified: layer 8: dev = CPU
327
+ llama_kv_cache_unified: layer 9: dev = CPU
328
+ llama_kv_cache_unified: layer 10: dev = CPU
329
+ llama_kv_cache_unified: layer 11: dev = CPU
330
+ llama_kv_cache_unified: layer 12: dev = CPU
331
+ llama_kv_cache_unified: layer 13: dev = CPU
332
+ llama_kv_cache_unified: layer 14: dev = CPU
333
+ llama_kv_cache_unified: layer 15: dev = CPU
334
+ llama_kv_cache_unified: layer 16: dev = CPU
335
+ llama_kv_cache_unified: layer 17: dev = CPU
336
+ llama_kv_cache_unified: layer 18: dev = CPU
337
+ llama_kv_cache_unified: layer 19: dev = CPU
338
+ llama_kv_cache_unified: layer 20: dev = CPU
339
+ llama_kv_cache_unified: layer 21: dev = CPU
340
+ llama_kv_cache_unified: layer 22: dev = CPU
341
+ llama_kv_cache_unified: layer 23: dev = CPU
342
+ llama_kv_cache_unified: layer 24: dev = CPU
343
+ llama_kv_cache_unified: layer 25: dev = CPU
344
+ llama_kv_cache_unified: layer 26: dev = CPU
345
+ llama_kv_cache_unified: layer 27: dev = CPU
346
+ llama_kv_cache_unified: layer 28: dev = CPU
347
+ llama_kv_cache_unified: layer 29: dev = CPU
348
+ llama_kv_cache_unified: layer 30: dev = CPU
349
+ llama_kv_cache_unified: layer 31: dev = CPU
350
+ llama_kv_cache_unified: layer 32: dev = CPU
351
+ llama_kv_cache_unified: layer 33: dev = CPU
352
+ llama_kv_cache_unified: layer 34: dev = CPU
353
+ llama_kv_cache_unified: layer 35: dev = CPU
354
+ time=2025-08-09T19:39:09.796+02:00 level=DEBUG source=server.go:643 msg="model load progress 1.00"
355
+ llama_kv_cache_unified: CPU KV buffer size = 144.00 MiB
356
+ llama_kv_cache_unified: KV self size = 144.00 MiB, K (f16): 72.00 MiB, V (f16): 72.00 MiB
357
+ llama_context: enumerating backends
358
+ llama_context: backend_ptrs.size() = 1
359
+ llama_context: max_nodes = 65536
360
+ llama_context: worst-case: n_tokens = 512, n_seqs = 1, n_outputs = 0
361
+ llama_context: reserving graph for n_tokens = 512, n_seqs = 1
362
+ llama_context: reserving graph for n_tokens = 1, n_seqs = 1
363
+ llama_context: reserving graph for n_tokens = 512, n_seqs = 1
364
+ llama_context: CPU compute buffer size = 300.75 MiB
365
+ llama_context: graph nodes = 1338
366
+ llama_context: graph splits = 1
367
+ time=2025-08-09T19:39:10.048+02:00 level=INFO source=server.go:637 msg="llama runner started in 2.77 seconds"
368
+ time=2025-08-09T19:39:10.048+02:00 level=DEBUG source=sched.go:493 msg="finished setting up" runner.name=registry.ollama.ai/library/qwen2.5:3b-instruct-q4_K_M runner.inference=cpu runner.devices=1 runner.size="2.3 GiB" runner.vram="0 B" runner.parallel=1 runner.pid=188113 runner.model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 runner.num_ctx=4096
369
+ time=2025-08-09T19:39:10.048+02:00 level=DEBUG source=server.go:736 msg="completion request" images=0 prompt=155 format=""
370
+ time=2025-08-09T19:39:10.053+02:00 level=DEBUG source=cache.go:104 msg="loading cache slot" id=0 cache=0 prompt=30 used=0 remaining=30
371
+ [GIN] 2025/08/09 - 19:39:18 | 200 | 12.485055582s | 127.0.0.1 | POST "/api/generate"
372
+ time=2025-08-09T19:39:18.953+02:00 level=DEBUG source=sched.go:501 msg="context for request finished"
373
+ time=2025-08-09T19:39:18.953+02:00 level=DEBUG source=sched.go:341 msg="runner with non-zero duration has gone idle, adding timer" runner.name=registry.ollama.ai/library/qwen2.5:3b-instruct-q4_K_M runner.inference=cpu runner.devices=1 runner.size="2.3 GiB" runner.vram="0 B" runner.parallel=1 runner.pid=188113 runner.model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 runner.num_ctx=4096 duration=5m0s
374
+ time=2025-08-09T19:39:18.954+02:00 level=DEBUG source=sched.go:359 msg="after processing request finished event" runner.name=registry.ollama.ai/library/qwen2.5:3b-instruct-q4_K_M runner.inference=cpu runner.devices=1 runner.size="2.3 GiB" runner.vram="0 B" runner.parallel=1 runner.pid=188113 runner.model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 runner.num_ctx=4096 refCount=0
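The timed request in the log above is a plain POST to Ollama's /api/generate endpoint (about 12.5 s on CPU for this 3B Q4 model). A minimal sketch to reproduce such a call outside the app; the model tag is the one seen in the log, the host is the default local endpoint and the prompt is illustrative:

import requests

# Model tag as seen in the log; host is the default local Ollama endpoint.
payload = {
    "model": "qwen2.5:3b-instruct-q4_K_M",
    "prompt": "Bonjour",
    "stream": False,
}
r = requests.post("http://localhost:11434/api/generate", json=payload, timeout=300)
r.raise_for_status()
print(r.json().get("response", ""))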
rag_model_ollama_v1 copy 2.py ADDED
@@ -0,0 +1,194 @@
1
+ import os
2
+ import pickle
3
+ import textwrap
4
+ import logging
5
+ from typing import List, Optional, Dict, Any, Iterable, Tuple
6
+ import requests
7
+ import faiss
8
+ from llama_index.core import VectorStoreIndex
9
+ from llama_index.core.schema import TextNode
10
+ from llama_index.vector_stores.faiss import FaissVectorStore
11
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
12
+ from sentence_transformers.util import cos_sim
13
+ import json
14
+
15
+ # === Logger configuration ===
16
+ logger = logging.getLogger("RAGEngine")
17
+ logger.setLevel(logging.INFO)
18
+ handler = logging.StreamHandler()
19
+ formatter = logging.Formatter("[%(asctime)s] %(levelname)s - %(message)s")
20
+ handler.setFormatter(formatter)
21
+ if not logger.handlers:
22
+ logger.addHandler(handler)
23
+
24
+ MAX_TOKENS = 64
25
+ DEFAULT_STOPS = ["### Réponse:", "\n\n", "###"]
26
+
27
+ class OllamaClient:
28
+ def __init__(self, model: str, host: Optional[str] = None, timeout: int = 300):
29
+ self.model = model
30
+ self.host = host or os.getenv("OLLAMA_HOST", "http://localhost:11434")
31
+ self.timeout = timeout
32
+ self._gen_url = self.host.rstrip("/") + "/api/generate"
33
+
34
+ def generate(self, prompt: str, stop: Optional[List[str]] = None,
35
+ max_tokens: Optional[int] = None, stream: bool = False,
36
+ options: Optional[Dict[str, Any]] = None, raw: bool = False) -> str | Iterable[str]:
37
+ payload: Dict[str, Any] = {
38
+ "model": self.model,
39
+ "prompt": prompt,
40
+ "stream": stream,
41
+ }
42
+ if raw:
43
+ payload["raw"] = True
44
+ if stop:
45
+ payload["stop"] = stop
46
+ if max_tokens is not None:
47
+ payload["num_predict"] = int(max_tokens)
48
+ if options:
49
+ payload["options"] = options
50
+
51
+ logger.debug(f"POST {self._gen_url} (stream={stream})")
52
+
53
+ if stream:
54
+ with requests.post(self._gen_url, json=payload, stream=True, timeout=self.timeout) as r:
55
+ r.raise_for_status()
56
+ for line in r.iter_lines(decode_unicode=True):
57
+ if not line:
58
+ continue
59
+ try:
60
+ data = json.loads(line)
61
+ except Exception:
62
+ continue
63
+ if "response" in data and not data.get("done"):
64
+ yield data["response"]
65
+ if data.get("done"):
66
+ break
67
+ return
68
+
69
+ r = requests.post(self._gen_url, json=payload, timeout=self.timeout)
70
+ r.raise_for_status()
71
+ data = r.json()
72
+ return data.get("response", "")
73
+
74
+
75
+ class RAGEngine:
76
+ def __init__(self, model_name: str, vector_path: str, index_path: str,
77
+ model_threads: int = 4, ollama_host: Optional[str] = None,
78
+ ollama_opts: Optional[Dict[str, Any]] = None):
79
+
80
+ logger.info(f"🔎 rag_model_ollama source: {__file__}")
81
+ logger.info("📦 Initialisation du moteur RAG (Ollama)...")
82
+
83
+ opts = dict(ollama_opts or {})
84
+ opts.setdefault("temperature", 0.0)
85
+ opts.setdefault("num_ctx", 512) # aligné avec CLI par défaut
86
+ opts.setdefault("num_batch", 16)
87
+ if "num_thread" not in opts and model_threads:
88
+ opts["num_thread"] = int(model_threads)
89
+
90
+ self.llm = OllamaClient(model=model_name, host=ollama_host)
91
+ self.ollama_opts = opts
92
+ self.embed_model = HuggingFaceEmbedding(model_name="intfloat/multilingual-e5-base")
93
+
94
+ logger.info(f"📂 Chargement des données vectorielles depuis {vector_path}")
95
+ with open(vector_path, "rb") as f:
96
+ chunk_texts: List[str] = pickle.load(f)
97
+ nodes = [TextNode(text=chunk) for chunk in chunk_texts]
98
+
99
+ faiss_index = faiss.read_index(index_path)
100
+ vector_store = FaissVectorStore(faiss_index=faiss_index)
101
+ self.index = VectorStoreIndex(nodes=nodes, embed_model=self.embed_model, vector_store=vector_store)
102
+
103
+ logger.info("✅ Moteur RAG (Ollama) initialisé avec succès.")
104
+
105
+ # Warmup pour charger le runner comme le CLI
106
+ try:
107
+ logger.info("⚡ Warmup du modèle Ollama...")
108
+ for _ in self._complete_stream("Bonjour", max_tokens=8, raw=False):
109
+ pass
110
+ except Exception as e:
111
+ logger.warning(f"Warmup échoué : {e}")
112
+
113
+ def _complete_stream(self, prompt: str, stop: Optional[List[str]] = None,
114
+ max_tokens: int = MAX_TOKENS, raw: bool = False):
115
+ return self.llm.generate(prompt=prompt, stop=stop, max_tokens=max_tokens,
116
+ stream=True, options=self.ollama_opts, raw=raw)
117
+
118
+ def _complete(self, prompt: str, stop: Optional[List[str]] = None,
119
+ max_tokens: int = 128, raw: bool = False) -> str:
120
+ text = self.llm.generate(prompt=prompt, stop=stop, max_tokens=max_tokens,
121
+ stream=False, options=self.ollama_opts, raw=raw)
122
+ return (text or "").strip()
123
+
124
+ def _is_greeting(self, text: str) -> bool:
125
+ s = text.lower().strip()
126
+ return s in {"bonjour", "salut", "hello", "bonsoir", "hi", "coucou", "yo"} or len(s.split()) <= 2
127
+
128
+ def _decide_mode(self, scores: List[float], tau: float = 0.32, is_greeting: bool = False) -> str:
129
+ if is_greeting:
130
+ return "llm"
131
+ top = scores[0] if scores else 0.0
132
+ return "rag" if top >= tau else "llm"
133
+
134
+ def get_adaptive_top_k(self, question: str) -> int:
135
+ q = question.lower()
136
+ if len(q.split()) <= 7:
137
+ top_k = 8
138
+ elif any(w in q for w in ["liste", "résume", "quels sont", "explique", "comment"]):
139
+ top_k = 10
140
+ else:
141
+ top_k = 8
142
+ logger.info(f"🔢 top_k déterminé automatiquement : {top_k}")
143
+ return top_k
144
+
145
+ def rerank_nodes(self, question: str, retrieved_nodes, top_k: int = 3) -> Tuple[List[float], List[TextNode]]:
146
+ logger.info(f"🔍 Re-ranking des {len(retrieved_nodes)} chunks pour : « {question} »")
147
+ q_emb = self.embed_model.get_query_embedding(question)
148
+ scored_nodes: List[Tuple[float, TextNode]] = []
149
+ for node in retrieved_nodes:
150
+ chunk_emb = self.embed_model.get_text_embedding(node.get_content())
151
+ score = cos_sim(q_emb, chunk_emb).item()
152
+ scored_nodes.append((score, node))
153
+ ranked = sorted(scored_nodes, key=lambda x: x[0], reverse=True)
154
+ top = ranked[:top_k]
155
+ return [s for s, _ in top], [n for _, n in top]
156
+
157
+ def retrieve_context(self, question: str, top_k: int = 3) -> Tuple[str, List[TextNode], List[float]]:
158
+ retriever = self.index.as_retriever(similarity_top_k=top_k)
159
+ retrieved_nodes = retriever.retrieve(question)
160
+ scores, nodes = self.rerank_nodes(question, retrieved_nodes, top_k)
161
+ context = "\n\n".join(n.get_content()[:500] for n in nodes)
162
+ return context, nodes, scores
163
+
164
+ def ask_stream(self, question: str, allow_fallback: bool = False) -> Iterable[str]:
165
+ logger.info(f"💬 [Stream] Question reçue : {question}")
166
+ is_hello = self._is_greeting(question)
167
+ context, scores = "", []
168
+ if not is_hello:
169
+ top_k = self.get_adaptive_top_k(question)
170
+ context, _, scores = self.retrieve_context(question, top_k)
171
+
172
+ mode = self._decide_mode(scores, tau=0.32, is_greeting=is_hello)
173
+ logger.info(f"🧭 Mode choisi (stream) : {mode}")
174
+
175
+ if mode == "rag":
176
+ prompt = (
177
+ "Instruction: Réponds uniquement à partir du contexte. "
178
+ "Si la réponse n'est pas déductible, réponds exactement: \"Information non présente dans le contexte.\""
179
+ f"\n\nContexte :\n{context}\n\nQuestion : {question}\nRéponse :"
180
+ )
181
+ logger.info("📡 Début streaming (RAG)...")
182
+ for token in self._complete_stream(prompt, stop=DEFAULT_STOPS, raw=False):
183
+ yield token
184
+ logger.info("📡 Fin streaming (RAG).")
185
+ return
186
+
187
+ prompt_llm = (
188
+ "Réponds brièvement et précisément en français.\n"
189
+ f"Question : {question}\nRéponse :"
190
+ )
191
+ logger.info("📡 Début streaming (LLM pur)...")
192
+ for token in self._complete_stream(prompt_llm, stop=DEFAULT_STOPS, raw=False):
193
+ yield token
194
+ logger.info("📡 Fin streaming (LLM pur).")
rag_model_ollama_v1 copy.py ADDED
@@ -0,0 +1,382 @@
1
+ import os
2
+ import json
3
+ import pickle
4
+ import textwrap
5
+ import logging
6
+ from typing import List, Optional, Dict, Any, Iterable, Tuple
7
+
8
+ import requests
9
+ import faiss
10
+ import numpy as np
11
+ from llama_index.core import VectorStoreIndex
12
+ from llama_index.core.schema import TextNode
13
+ from llama_index.vector_stores.faiss import FaissVectorStore
14
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
15
+ from sentence_transformers.util import cos_sim
16
+
17
+ # === Logger configuration ===
18
+ logger = logging.getLogger("RAGEngine")
19
+ logger.setLevel(logging.INFO)
20
+ handler = logging.StreamHandler()
21
+ formatter = logging.Formatter("[%(asctime)s] %(levelname)s - %(message)s")
22
+ handler.setFormatter(formatter)
23
+ if not logger.handlers:
24
+ logger.addHandler(handler)
25
+
26
+ MAX_TOKENS = 64 # bornage court sur CPU-only
27
+ DEFAULT_STOPS = ["</s>", "\n\n", "\nQuestion:", "Question:"]
28
+
29
+
30
+ class OllamaClient:
31
+ """
32
+ Minimal Ollama client for /api/generate (text completion) with streaming support.
33
+ """
34
+ def __init__(self, model: str, host: Optional[str] = None, timeout: int = 300):
35
+ self.model = model
36
+ self.host = host or os.getenv("OLLAMA_HOST", "http://localhost:11434")
37
+ self.timeout = timeout
38
+ self._gen_url = self.host.rstrip("/") + "/api/generate"
39
+
40
+ def generate(
41
+ self,
42
+ prompt: str,
43
+ stop: Optional[List[str]] = None,
44
+ max_tokens: Optional[int] = None,
45
+ stream: bool = False,
46
+ options: Optional[Dict[str, Any]] = None,
47
+ raw: bool = False,
48
+ ) -> str | Iterable[str]:
49
+ payload: Dict[str, Any] = {
50
+ "model": self.model,
51
+ "prompt": prompt,
52
+ "stream": stream,
53
+ }
54
+ if raw:
55
+ payload["raw"] = True # IMPORTANT: désactive le template Modelfile
56
+ if stop:
57
+ payload["stop"] = stop
58
+ if max_tokens is not None:
59
+ payload["num_predict"] = int(max_tokens) # nommage Ollama
60
+ if options:
61
+ payload["options"] = options
62
+
63
+ logger.debug(f"POST {self._gen_url} (stream={stream})")
64
+
65
+ if stream:
66
+ with requests.post(self._gen_url, json=payload, stream=True, timeout=self.timeout) as r:
67
+ r.raise_for_status()
68
+ for line in r.iter_lines(decode_unicode=True):
69
+ if not line:
70
+ continue
71
+ try:
72
+ data = json.loads(line)
73
+ except Exception:
74
+ continue
75
+ # En stream, Ollama renvoie des morceaux dans "response"
76
+ if "response" in data and not data.get("done"):
77
+ yield data["response"]
78
+ if data.get("done"):
79
+ break
80
+ return
81
+
82
+ r = requests.post(self._gen_url, json=payload, timeout=self.timeout)
83
+ r.raise_for_status()
84
+ data = r.json()
85
+ return data.get("response", "")
86
+
87
+
88
+ class RAGEngine:
89
+ def __init__(
90
+ self,
91
+ model_name: str,
92
+ vector_path: str,
93
+ index_path: str,
94
+ model_threads: int = 4,
95
+ ollama_host: Optional[str] = None,
96
+ ollama_opts: Optional[Dict[str, Any]] = None,
97
+ ):
98
+ """
99
+ Args:
100
+ model_name: e.g. "noushermes_rag"
101
+ vector_path: pickle file with chunk texts list[str]
102
+ index_path: FAISS index path
103
+ model_threads: forwarded as a hint to Ollama options
104
+ ollama_host: override OLLAMA_HOST (default http://localhost:11434)
105
+ ollama_opts: extra Ollama options (temperature, num_ctx, num_batch, num_thread)
106
+ """
107
+ logger.info(f"🔎 rag_model_ollama source: {__file__}")
108
+ logger.info("📦 Initialisation du moteur RAG (Ollama)...")
109
+
110
+ # Options Ollama (par défaut optimisées CPU)
111
+ opts = dict(ollama_opts or {})
112
+ opts.setdefault("temperature", 0.0)
113
+ opts.setdefault("num_ctx", 512)
114
+ opts.setdefault("num_batch", 16)
115
+ if "num_thread" not in opts and model_threads:
116
+ opts["num_thread"] = int(model_threads)
117
+
118
+ self.llm = OllamaClient(model=model_name, host=ollama_host)
119
+ self.ollama_opts = opts
120
+
121
+ # Embedding model pour retrieval / rerank
122
+ self.embed_model = HuggingFaceEmbedding(model_name="intfloat/multilingual-e5-base")
123
+
124
+ logger.info(f"📂 Chargement des données vectorielles depuis {vector_path}")
125
+ with open(vector_path, "rb") as f:
126
+ chunk_texts: List[str] = pickle.load(f)
127
+ nodes = [TextNode(text=chunk) for chunk in chunk_texts]
128
+
129
+ faiss_index = faiss.read_index(index_path)
130
+ vector_store = FaissVectorStore(faiss_index=faiss_index)
131
+ self.index = VectorStoreIndex(nodes=nodes, embed_model=self.embed_model, vector_store=vector_store)
132
+
133
+ logger.info("✅ Moteur RAG (Ollama) initialisé avec succès.")
134
+
135
+ # ---------------- LLM helpers (via Ollama) ----------------
136
+
137
+ def _complete(
138
+ self,
139
+ prompt: str,
140
+ stop: Optional[List[str]] = None,
141
+ max_tokens: int = MAX_TOKENS,
142
+ raw: bool = True
143
+ ) -> str:
144
+ text = self.llm.generate(
145
+ prompt=prompt,
146
+ stop=stop or DEFAULT_STOPS,
147
+ max_tokens=max_tokens,
148
+ stream=False,
149
+ options=self.ollama_opts,
150
+ raw=raw, # toujours True pour bypass Modelfile
151
+ )
152
+ # Par sécurité si un générateur se glisse quand stream=False
153
+ try:
154
+ if hasattr(text, "__iter__") and not isinstance(text, (str, bytes)):
155
+ chunks = []
156
+ for t in text:
157
+ if not isinstance(t, (str, bytes)):
158
+ continue
159
+ chunks.append(t)
160
+ text = "".join(chunks)
161
+ except Exception:
162
+ pass
163
+ return (text or "").strip()
164
+
165
+ def _complete_stream(
166
+ self,
167
+ prompt: str,
168
+ stop: Optional[List[str]] = None,
169
+ max_tokens: int = MAX_TOKENS,
170
+ raw: bool = True
171
+ ) -> Iterable[str]:
172
+ return self.llm.generate(
173
+ prompt=prompt,
174
+ stop=stop or DEFAULT_STOPS,
175
+ max_tokens=max_tokens,
176
+ stream=True,
177
+ options=self.ollama_opts,
178
+ raw=raw, # toujours True pour bypass Modelfile
179
+ )
180
+
181
+ # ---------------- Utilities ----------------
182
+
183
+ def _is_greeting(self, text: str) -> bool:
184
+ s = text.lower().strip()
185
+ return s in {"bonjour", "salut", "hello", "bonsoir", "hi", "coucou", "yo"} or len(s.split()) <= 2
186
+
187
+ def _decide_mode(self, scores: List[float], tau: float = 0.32, is_greeting: bool = False) -> str:
188
+ if is_greeting:
189
+ return "llm"
190
+ top = scores[0] if scores else 0.0
191
+ return "rag" if top >= tau else "llm"
192
+
193
+ def _stream_with_local_stops(self, tokens: Iterable[str], stops: List[str]) -> Iterable[str]:
194
+ """
195
+ Coupe localement le stream si un stop apparaît, même si le serveur ne s'arrête pas.
196
+ """
197
+ buffer = ""
198
+ for chunk in tokens:
199
+ buffer += chunk
200
+ # Check si un des stops est présent dans le buffer
201
+ hit = None
202
+ for s in stops:
203
+ idx = buffer.find(s)
204
+ if idx != -1:
205
+ hit = (s, idx)
206
+ break
207
+ if hit:
208
+ s, idx = hit
209
+ # Yield tout avant le stop, puis stoppe
210
+ yield buffer[:idx]
211
+ break
212
+ else:
213
+ # Si pas de stop, on envoie le chunk tel quel
214
+ yield chunk
215
+
216
+ # ---------------- Retrieval + (optional) rerank ----------------
217
+
218
+ def get_adaptive_top_k(self, question: str) -> int:
219
+ q = question.lower()
220
+ if len(q.split()) <= 7:
221
+ top_k = 8
222
+ elif any(w in q for w in ["liste", "résume", "quels sont", "explique", "comment"]):
223
+ top_k = 10
224
+ else:
225
+ top_k = 8
226
+ logger.info(f"🔢 top_k déterminé automatiquement : {top_k}")
227
+ return top_k
228
+
229
+ def rerank_nodes(self, question: str, retrieved_nodes, top_k: int = 3) -> Tuple[List[float], List[TextNode]]:
230
+ logger.info(f"🔍 Re-ranking des {len(retrieved_nodes)} chunks pour la question : « {question} »")
231
+ q_emb = self.embed_model.get_query_embedding(question)
232
+ scored_nodes: List[Tuple[float, TextNode]] = []
233
+
234
+ for node in retrieved_nodes:
235
+ chunk_text = node.get_content()
236
+ chunk_emb = self.embed_model.get_text_embedding(chunk_text)
237
+ score = cos_sim(q_emb, chunk_emb).item()
238
+ scored_nodes.append((score, node))
239
+
240
+ ranked = sorted(scored_nodes, key=lambda x: x[0], reverse=True)
241
+
242
+ logger.info("📊 Chunks les plus pertinents :")
243
+ for i, (score, node) in enumerate(ranked[:top_k]):
244
+ chunk_preview = textwrap.shorten(node.get_content().replace("\n", " "), width=100)
245
+ logger.info(f"#{i+1} | Score: {score:.4f} | {chunk_preview}")
246
+
247
+ top = ranked[:top_k]
248
+ scores = [s for s, _ in top]
249
+ nodes = [n for _, n in top]
250
+ return scores, nodes
251
+
252
+ def retrieve_context(self, question: str, top_k: int = 3) -> Tuple[str, List[TextNode], List[float]]:
253
+ logger.info("📥 Récupération du contexte...")
254
+ retriever = self.index.as_retriever(similarity_top_k=top_k)
255
+ retrieved_nodes = retriever.retrieve(question)
256
+ scores, nodes = self.rerank_nodes(question, retrieved_nodes, top_k)
257
+ context = "\n\n".join(n.get_content()[:500] for n in nodes)
258
+ return context, nodes, scores
259
+
260
+ # ---------------- Public API ----------------
261
+
262
+ def ask(self, question_raw: str, allow_fallback: bool = True) -> str:
263
+ logger.info(f"💬 Question reçue : {question_raw}")
264
+ is_hello = self._is_greeting(question_raw)
265
+
266
+ # retrieval (sauf salutations)
267
+ context, scores = "", []
268
+ if not is_hello:
269
+ top_k = self.get_adaptive_top_k(question_raw)
270
+ context, _, scores = self.retrieve_context(question_raw, top_k)
271
+
272
+ # router RAG vs LLM
273
+ mode = self._decide_mode(scores, tau=0.32, is_greeting=is_hello)
274
+ logger.info(f"🧭 Mode choisi : {mode}")
275
+
276
+ if mode == "rag":
277
+ prompt = (
278
+ "Instruction: Réponds uniquement à partir du contexte. "
279
+ "Si la réponse n'est pas déductible, réponds exactement: \"Information non présente dans le contexte.\""
280
+ "\n\nContexte :\n"
281
+ f"{context}\n\n"
282
+ f"Question : {question_raw}\n"
283
+ "Réponse :"
284
+ )
285
+
286
+ resp = self._complete(
287
+ prompt,
288
+ stop=DEFAULT_STOPS,
289
+ max_tokens=MAX_TOKENS,
290
+ raw=True, # ✅ bypass Modelfile/template
291
+ ).strip()
292
+
293
+ # fallback LLM‑pur si le RAG n'a rien trouvé
294
+ if allow_fallback and "Information non présente" in resp:
295
+ logger.info("↪️ Fallback LLM‑pur (hors contexte)")
296
+ prompt_llm = (
297
+ "Réponds brièvement et précisément en français.\n"
298
+ f"Question : {question_raw}\n"
299
+ "Réponse :"
300
+ )
301
+ resp = self._complete(
302
+ prompt_llm,
303
+ stop=DEFAULT_STOPS,
304
+ max_tokens=MAX_TOKENS,
305
+ raw=True
306
+ ).strip()
307
+
308
+ ellipsis = "..." if len(resp) > 120 else ""
309
+ logger.info(f"🧠 Réponse générée : {resp[:120]}{ellipsis}")
310
+ return resp
311
+
312
+ # LLM pur (salutation ou score faible)
313
+ prompt_llm = (
314
+ "Réponds brièvement et précisément en français.\n"
315
+ f"Question : {question_raw}\n"
316
+ "Réponse :"
317
+ )
318
+ resp = self._complete(
319
+ prompt_llm,
320
+ stop=DEFAULT_STOPS,
321
+ max_tokens=MAX_TOKENS,
322
+ raw=True
323
+ ).strip()
324
+ ellipsis = "..." if len(resp) > 120 else ""
325
+ logger.info(f"🧠 Réponse générée : {resp[:120]}{ellipsis}")
326
+ return resp
327
+
328
+ def ask_stream(self, question: str, allow_fallback: bool = False) -> Iterable[str]:
329
+ logger.info(f"💬 [Stream] Question reçue : {question}")
330
+ is_hello = self._is_greeting(question)
331
+
332
+ context, scores = "", []
333
+ if not is_hello:
334
+ top_k = self.get_adaptive_top_k(question)
335
+ context, _, scores = self.retrieve_context(question, top_k)
336
+
337
+ mode = self._decide_mode(scores, tau=0.32, is_greeting=is_hello)
338
+ logger.info(f"🧭 Mode choisi (stream) : {mode}")
339
+
340
+ stops = DEFAULT_STOPS
341
+
342
+ if mode == "rag":
343
+ prompt = (
344
+ "Instruction: Réponds uniquement à partir du contexte. "
345
+ "Si la réponse n'est pas déductible, réponds exactement: \"Information non présente dans le contexte.\""
346
+ "\n\nContexte :\n"
347
+ f"{context}\n\n"
348
+ f"Question : {question}\n"
349
+ "Réponse :"
350
+ )
351
+
352
+ logger.info("📡 Début du streaming de la réponse (RAG)...")
353
+ tokens = self._complete_stream(
354
+ prompt,
355
+ stop=stops,
356
+ max_tokens=MAX_TOKENS,
357
+ raw=True,
358
+ )
359
+ # Blindage local: coupe si un stop apparaît
360
+ for t in self._stream_with_local_stops(tokens, stops):
361
+ if t:
362
+ yield t
363
+ logger.info("📡 Fin du streaming de la réponse (RAG).")
364
+ return
365
+
366
+ # LLM pur en stream
367
+ prompt_llm = (
368
+ "Réponds brièvement et précisément en français.\n"
369
+ f"Question : {question}\n"
370
+ "Réponse :"
371
+ )
372
+ logger.info("📡 Début du streaming de la réponse (LLM pur)...")
373
+ tokens = self._complete_stream(
374
+ prompt_llm,
375
+ stop=stops,
376
+ max_tokens=MAX_TOKENS,
377
+ raw=True,
378
+ )
379
+ for t in self._stream_with_local_stops(tokens, stops):
380
+ if t:
381
+ yield t
382
+ logger.info("📡 Fin du streaming de la réponse (LLM pur).")
rag_model_ollama_v1 stable_lazy.py ADDED
@@ -0,0 +1,247 @@
1
+ import os
2
+ import pickle
3
+ import logging
4
+ import time
5
+ from typing import List, Optional, Dict, Any, Iterable, Tuple
6
+ import requests
7
+ import faiss
8
+ import json
9
+ from llama_index.core import VectorStoreIndex
10
+ from llama_index.core.schema import TextNode
11
+ from llama_index.vector_stores.faiss import FaissVectorStore
12
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
13
+ from sentence_transformers.util import cos_sim
14
+
15
+ # === Logger configuration ===
16
+ logger = logging.getLogger("RAGEngine")
17
+ logger.setLevel(logging.INFO)
18
+ handler = logging.StreamHandler()
19
+ formatter = logging.Formatter("[%(asctime)s] %(levelname)s - %(message)s")
20
+ handler.setFormatter(formatter)
21
+ if not logger.handlers:
22
+ logger.addHandler(handler)
23
+
24
+ MAX_TOKENS = 64
25
+ DEFAULT_STOPS = ["### Réponse:", "\n\n", "###"]
26
+
27
+ class OllamaClient:
28
+ def __init__(self, model: str, host: Optional[str] = None, timeout: int = 300):
29
+ self.model = model
30
+ self.host = host or os.getenv("OLLAMA_HOST", "http://localhost:11434")
31
+ self.timeout = timeout
32
+ self._gen_url = self.host.rstrip("/") + "/api/generate"
33
+
34
+ def generate(self, prompt: str, stop: Optional[List[str]] = None,
35
+ max_tokens: Optional[int] = None, stream: bool = False,
36
+ options: Optional[Dict[str, Any]] = None, raw: bool = False) -> str | Iterable[str]:
37
+ payload: Dict[str, Any] = {
38
+ "model": self.model,
39
+ "prompt": prompt,
40
+ "stream": stream,
41
+ }
42
+ if raw:
43
+ payload["raw"] = True
44
+ if stop:
45
+ payload["stop"] = stop
46
+ if max_tokens is not None:
47
+ payload["num_predict"] = int(max_tokens)
48
+ # ❌ AUCUNE options → laisser Ollama auto-tuner
49
+
50
+ if stream:
51
+ with requests.post(self._gen_url, json=payload, stream=True, timeout=self.timeout) as r:
52
+ r.raise_for_status()
53
+ for line in r.iter_lines(decode_unicode=True):
54
+ if not line:
55
+ continue
56
+ try:
57
+ data = json.loads(line)
58
+ except Exception:
59
+ continue
60
+ if "response" in data and not data.get("done"):
61
+ yield data["response"]
62
+ if data.get("done"):
63
+ break
64
+ return
65
+
66
+ r = requests.post(self._gen_url, json=payload, timeout=self.timeout)
67
+ r.raise_for_status()
68
+ data = r.json()
69
+ return data.get("response", "")
70
+
71
+
72
+ class RAGEngine:
73
+ def __init__(self, model_name: str, vector_path: str, index_path: str,
74
+ model_threads: int = 4, ollama_host: Optional[str] = None,
75
+ ollama_opts: Optional[Dict[str, Any]] = None):
76
+ logger.info(f"🔎 rag_model_ollama source: {__file__}")
77
+ logger.info("📦 Initialisation du moteur (lazy RAG)...")
78
+
79
+ # -- LLM prêt tout de suite
80
+ self.llm = OllamaClient(model=model_name, host=ollama_host)
81
+
82
+ # -- Chemins pour lazy load
83
+ self.vector_path = vector_path
84
+ self.index_path = index_path
85
+
86
+ # -- Objets RAG, chargés plus tard
87
+ self.embed_model: Optional[HuggingFaceEmbedding] = None
88
+ self.index: Optional[VectorStoreIndex] = None
89
+ self._loaded = False
90
+
91
+ logger.info("✅ Moteur initialisé (sans charger FAISS ni chunks).")
92
+
93
+ # ❌ Pas de warmup “génération” ici ; le premier appel LLM sera rapide.
94
+ # (Si tu veux : décommente ce mini warmup 1 token)
95
+ # try:
96
+ # list(self._complete_stream("Bonjour", max_tokens=1))
97
+ # except Exception as e:
98
+ # logger.warning(f"Warmup échoué : {e}")
99
+
100
+ # ---------- Lazy loader ----------
101
+ def _ensure_loaded(self):
102
+ if self._loaded:
103
+ return
104
+ t0 = time.perf_counter()
105
+ logger.info("⏳ Chargement lazy des données RAG (FAISS + chunks + embeddings)...")
106
+
107
+ # 1) Charger les chunks (pickle)
108
+ with open(self.vector_path, "rb") as f:
109
+ chunk_texts: List[str] = pickle.load(f)
110
+ nodes = [TextNode(text=chunk) for chunk in chunk_texts]
111
+
112
+ # 2) Charger l'index FAISS
113
+ faiss_index = faiss.read_index(self.index_path)
114
+ vector_store = FaissVectorStore(faiss_index=faiss_index)
115
+
116
+ # 3) Embedding model
117
+ self.embed_model = HuggingFaceEmbedding(model_name="intfloat/multilingual-e5-base")
118
+
119
+ # 4) Construire l'index LlamaIndex
120
+ self.index = VectorStoreIndex(nodes=nodes, embed_model=self.embed_model, vector_store=vector_store)
121
+
122
+ self._loaded = True
123
+ logger.info(f"✅ RAG chargé en {time.perf_counter() - t0:.2f}s (lazy).")
124
+
125
+ # ---------- Génération ----------
126
+ def _complete_stream(self, prompt: str, stop: Optional[List[str]] = None,
127
+ max_tokens: int = MAX_TOKENS, raw: bool = False):
128
+ return self.llm.generate(prompt=prompt, stop=stop, max_tokens=max_tokens,
129
+ stream=True, raw=raw)
130
+
131
+ def _complete(self, prompt: str, stop: Optional[List[str]] = None,
132
+ max_tokens: int = 128, raw: bool = False) -> str:
133
+ text = self.llm.generate(prompt=prompt, stop=stop, max_tokens=max_tokens,
134
+ stream=False, raw=raw)
135
+ return (text or "").strip()
136
+
137
+ # ---------- Heuristiques légères ----------
138
+ def _is_greeting(self, text: str) -> bool:
139
+ s = text.lower().strip()
140
+ return s in {"bonjour", "salut", "hello", "bonsoir", "hi", "coucou", "yo"} or len(s.split()) <= 2
141
+
142
+ def _should_use_rag_fast(self, question: str) -> bool:
143
+ """Heuristique avant de charger RAG : éviter de charger pour une question triviale."""
144
+ q = question.lower()
145
+ # Mots-clés “doc”, “procédure”, etc.
146
+ keywords = ("document", "docs", "procédure", "politique", "policy", "manuel", "guide", "pdf", "selon", "dans le contexte")
147
+ if any(k in q for k in keywords):
148
+ return True
149
+ # Longueur : si question courte, reste LLM
150
+ if len(q.split()) <= 7:
151
+ return False
152
+ # Par défaut, pour les questions moyennes/longues → on utilisera RAG
153
+ return True
154
+
155
+ def _decide_mode(self, scores: List[float], tau: float = 0.32, is_greeting: bool = False) -> str:
156
+ if is_greeting:
157
+ return "llm"
158
+ top = scores[0] if scores else 0.0
159
+ return "rag" if top >= tau else "llm"
160
+
161
+ # ---------- Récupération ----------
162
+ def get_adaptive_top_k(self, question: str) -> int:
163
+ q = question.lower()
164
+ if len(q.split()) <= 7:
165
+ top_k = 8
166
+ elif any(w in q for w in ["liste", "résume", "quels sont", "explique", "comment"]):
167
+ top_k = 10
168
+ else:
169
+ top_k = 8
170
+ logger.info(f"🔢 top_k déterminé automatiquement : {top_k}")
171
+ return top_k
172
+
173
+ def rerank_nodes(self, question: str, retrieved_nodes, top_k: int = 3) -> Tuple[List[float], List[TextNode]]:
174
+ logger.info(f"🔍 Re-ranking des {len(retrieved_nodes)} chunks pour : « {question} »")
175
+ q_emb = self.embed_model.get_query_embedding(question)
176
+ scored_nodes: List[Tuple[float, TextNode]] = []
177
+ for node in retrieved_nodes:
178
+ chunk_emb = self.embed_model.get_text_embedding(node.get_content())
179
+ score = cos_sim(q_emb, chunk_emb).item()
180
+ scored_nodes.append((score, node))
181
+ ranked = sorted(scored_nodes, key=lambda x: x[0], reverse=True)
182
+ top = ranked[:top_k]
183
+ return [s for s, _ in top], [n for _, n in top]
184
+
185
+ def retrieve_context(self, question: str, top_k: int = 3) -> Tuple[str, List[TextNode], List[float]]:
186
+ self._ensure_loaded()
187
+ retriever = self.index.as_retriever(similarity_top_k=top_k)
188
+ retrieved_nodes = retriever.retrieve(question)
189
+ scores, nodes = self.rerank_nodes(question, retrieved_nodes, top_k)
190
+ context = "\n\n".join(n.get_content()[:500] for n in nodes)
191
+ return context, nodes, scores
192
+
193
+ # ---------- API publique ----------
194
+ def ask(self, question: str, allow_fallback: bool = False) -> str:
195
+ logger.info(f"💬 [Non-stream] Question reçue : {question}")
196
+ is_hello = self._is_greeting(question)
197
+
198
+ # ⚡ Heuristique avant de charger RAG
199
+ if not is_hello and (self._loaded or self._should_use_rag_fast(question)):
200
+ top_k = self.get_adaptive_top_k(question)
201
+ context, _, scores = self.retrieve_context(question, top_k)
202
+ mode = self._decide_mode(scores, tau=0.32, is_greeting=is_hello)
203
+ if mode == "rag":
204
+ prompt = (
205
+ "Instruction: Réponds uniquement à partir du contexte. "
206
+ "Si la réponse n'est pas déductible, réponds exactement: \"Information non présente dans le contexte.\""
207
+ f"\n\nContexte :\n{context}\n\nQuestion : {question}\nRéponse :"
208
+ )
209
+ return self._complete(prompt, stop=DEFAULT_STOPS, raw=False)
210
+
211
+ # LLM pur
212
+ prompt_llm = (
213
+ "Réponds brièvement et précisément en français.\n"
214
+ f"Question : {question}\nRéponse :"
215
+ )
216
+ return self._complete(prompt_llm, stop=DEFAULT_STOPS, raw=False)
217
+
218
+ def ask_stream(self, question: str, allow_fallback: bool = False) -> Iterable[str]:
219
+ logger.info(f"💬 [Stream] Question reçue : {question}")
220
+ is_hello = self._is_greeting(question)
221
+
222
+ # ⚡ Heuristique avant de charger RAG
223
+ if not is_hello and (self._loaded or self._should_use_rag_fast(question)):
224
+ top_k = self.get_adaptive_top_k(question)
225
+ context, _, scores = self.retrieve_context(question, top_k)
226
+ mode = self._decide_mode(scores, tau=0.32, is_greeting=is_hello)
227
+ if mode == "rag":
228
+ prompt = (
229
+ "Instruction: Réponds uniquement à partir du contexte. "
230
+ "Si la réponse n'est pas déductible, réponds exactement: \"Information non présente dans le contexte.\""
231
+ f"\n\nContexte :\n{context}\n\nQuestion : {question}\nRéponse :"
232
+ )
233
+ logger.info("📡 Début streaming (RAG)...")
234
+ for token in self._complete_stream(prompt, stop=DEFAULT_STOPS, raw=False):
235
+ yield token
236
+ logger.info("📡 Fin streaming (RAG).")
237
+ return
238
+
239
+ # LLM pur
240
+ prompt_llm = (
241
+ "Réponds brièvement et précisément en français.\n"
242
+ f"Question : {question}\nRéponse :"
243
+ )
244
+ logger.info("📡 Début streaming (LLM pur)...")
245
+ for token in self._complete_stream(prompt_llm, stop=DEFAULT_STOPS, raw=False):
246
+ yield token
247
+ logger.info("📡 Fin streaming (LLM pur).")
rag_model_ollama_v1.py CHANGED
@@ -1,20 +1,17 @@
1
-
2
  import os
3
  import pickle
4
- import textwrap
5
  import logging
6
- from typing import List, Optional, Dict, Any, Iterable
7
-
8
  import requests
9
  import faiss
10
- import numpy as np
11
  from llama_index.core import VectorStoreIndex
12
  from llama_index.core.schema import TextNode
13
  from llama_index.vector_stores.faiss import FaissVectorStore
14
  from llama_index.embeddings.huggingface import HuggingFaceEmbedding
15
  from sentence_transformers.util import cos_sim
16
 
17
-
18
  # === Logger configuration ===
19
  logger = logging.getLogger("RAGEngine")
20
  logger.setLevel(logging.INFO)
@@ -24,42 +21,33 @@ handler.setFormatter(formatter)
24
  if not logger.handlers:
25
  logger.addHandler(handler)
26
 
27
- MAX_TOKENS = 512
28
-
29
 
 
30
  class OllamaClient:
31
- """
32
- Minimal Ollama client for /api/generate (text completion) with streaming support.
33
- Docs: https://github.com/ollama/ollama/blob/main/docs/api.md#generate-a-completion
34
- """
35
  def __init__(self, model: str, host: Optional[str] = None, timeout: int = 300):
36
  self.model = model
37
- self.host = host or os.getenv("OLLAMA_HOST", "http://localhost:11434")
 
38
  self.timeout = timeout
39
  self._gen_url = self.host.rstrip("/") + "/api/generate"
40
 
41
- def generate(
42
- self,
43
- prompt: str,
44
- stop: Optional[List[str]] = None,
45
- max_tokens: Optional[int] = None,
46
- stream: bool = False,
47
- options: Optional[Dict[str, Any]] = None,
48
- ) -> str | Iterable[str]:
49
- payload = {
50
  "model": self.model,
51
  "prompt": prompt,
52
  "stream": stream,
53
  }
 
 
54
  if stop:
55
  payload["stop"] = stop
56
  if max_tokens is not None:
57
- # Ollama uses "num_predict" for max new tokens
58
  payload["num_predict"] = int(max_tokens)
59
- if options:
60
- payload["options"] = options
61
-
62
- logger.debug(f"POST {self._gen_url} (stream={stream})")
63
 
64
  if stream:
65
  with requests.post(self._gen_url, json=payload, stream=True, timeout=self.timeout) as r:
@@ -70,133 +58,125 @@ class OllamaClient:
70
  try:
71
  data = json.loads(line)
72
  except Exception:
73
- # In case a broken line appears
74
  continue
75
- if "response" in data and data.get("done") is not True:
76
  yield data["response"]
77
  if data.get("done"):
78
  break
79
  return
80
 
81
- # Non-streaming
82
  r = requests.post(self._gen_url, json=payload, timeout=self.timeout)
83
  r.raise_for_status()
84
  data = r.json()
85
  return data.get("response", "")
86
 
87
-
88
- # Lazy import json to keep top clean
89
- import json
90
-
91
-
92
  class RAGEngine:
93
- def __init__(
94
- self,
95
- model_name: str,
96
- vector_path: str,
97
- index_path: str,
98
- model_threads: int = 4,
99
- ollama_host: Optional[str] = None,
100
- ollama_opts: Optional[Dict[str, Any]] = None,
101
- ):
102
- """
103
- Args:
104
- model_name: e.g. "nous-hermes2:Q4_K_M" or "llama3.1:8b-instruct-q4_K_M"
105
- vector_path: pickle file with chunk texts list[str]
106
- index_path: FAISS index path
107
- model_threads: forwarded to Ollama via options.n_threads (if supported by the model)
108
- ollama_host: override OLLAMA_HOST (default http://localhost:11434)
109
- ollama_opts: extra Ollama options (e.g., temperature, top_p, num_gpu, num_thread)
110
- """
111
  logger.info(f"🔎 rag_model_ollama source: {__file__}")
112
- logger.info("📦 Initialisation du moteur RAG (Ollama)...")
113
- # Build options
114
- opts = dict(ollama_opts or {})
115
- # Common low-latency defaults; user can override via ollama_opts
116
- opts.setdefault("temperature", 0.1)
117
- # Try to pass thread hint if supported by the backend
118
- if "num_thread" not in opts and model_threads:
119
- opts["num_thread"] = int(model_threads)
120
 
 
121
  self.llm = OllamaClient(model=model_name, host=ollama_host)
122
- self.ollama_opts = opts
123
 
124
- self.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
 
 
 
 
 
 
 
125
 
126
- logger.info(f"📂 Chargement des données vectorielles depuis {vector_path}")
127
- with open(vector_path, "rb") as f:
128
- chunk_texts = pickle.load(f)
 
 
 
 
 
 
 
 
 
129
  nodes = [TextNode(text=chunk) for chunk in chunk_texts]
130
 
131
- faiss_index = faiss.read_index(index_path)
 
132
  vector_store = FaissVectorStore(faiss_index=faiss_index)
133
- self.index = VectorStoreIndex(nodes=nodes, embed_model=self.embed_model, vector_store=vector_store)
134
 
135
- logger.info("✅ Moteur RAG (Ollama) initialisé avec succès.")
 
136
 
137
- # ---------------- LLM helpers (via Ollama) ----------------
 
138
 
139
- def _complete(self, prompt: str, stop: Optional[List[str]] = None, max_tokens: int = 128) -> str:
140
- text = self.llm.generate(
141
- prompt=prompt,
142
- stop=stop,
143
- max_tokens=max_tokens,
144
- stream=False,
145
- options=self.ollama_opts,
146
- )
147
- # Some Ollama setups may stream even when stream=False. Coerce generators to string.
148
- try:
149
- if hasattr(text, "__iter__") and not isinstance(text, (str, bytes)):
150
- chunks = []
151
- for t in text:
152
- if not isinstance(t, (str, bytes)):
153
- continue
154
- chunks.append(t)
155
- text = "".join(chunks)
156
- except Exception:
157
- pass
158
- return (text or "").strip()
159
 
160
- def _complete_stream(self, prompt: str, stop: Optional[List[str]] = None, max_tokens: int = MAX_TOKENS):
161
- return self.llm.generate(
162
- prompt=prompt,
163
- stop=stop,
164
- max_tokens=max_tokens,
165
- stream=True,
166
- options=self.ollama_opts,
167
- )
168
 
169
- # ---------------- Reformulation ----------------
 
 
 
 
170
 
171
- def reformulate_question(self, question: str) -> str:
172
- logger.info("🔁 Reformulation de la question (sans contexte)...")
173
- prompt = f"""Tu es un assistant expert chargé de clarifier des questions floues.
 
174
 
175
- Transforme la question suivante en une question claire, explicite et complète, sans ajouter d'informations extérieures.
 
 
 
 
 
 
 
 
 
176
 
177
- Question floue : {question}
178
- Question reformulée :"""
179
- reformulated = self._complete(prompt, stop=["\n"], max_tokens=128)
180
- logger.info(f"📝 Reformulée : {reformulated}")
181
- return reformulated
182
 
183
- def reformulate_with_context(self, question: str, context_sample: str) -> str:
184
- logger.info("🔁 Reformulation de la question avec contexte...")
185
- prompt = f"""Tu es un assistant expert en machine learning. Ton rôle est de reformuler les questions utilisateur en tenant compte du contexte ci-dessous, extrait d’un rapport technique sur un projet de reconnaissance de maladies de plantes.
186
 
187
- Ta mission est de transformer une question vague ou floue en une question précise et adaptée au contenu du rapport. Ne donne pas une interprétation hors sujet. Ne reformule pas en termes de produits commerciaux.
 
 
 
 
 
 
 
 
188
 
189
- Contexte :
190
- {context_sample}
 
191
 
192
- Question initiale : {question}
193
- Question reformulée :"""
194
- reformulated = self._complete(prompt, stop=["\n"], max_tokens=128)
195
- logger.info(f"📝 Reformulée avec contexte : {reformulated}")
196
- return reformulated
197
 
198
- # ---------------- Retrieval ----------------
 
 
 
 
199
 
 
200
  def get_adaptive_top_k(self, question: str) -> int:
201
  q = question.lower()
202
  if len(q.split()) <= 7:
@@ -208,78 +188,79 @@ Question reformulée :"""
208
  logger.info(f"🔢 top_k déterminé automatiquement : {top_k}")
209
  return top_k
210
 
211
- def rerank_nodes(self, question: str, retrieved_nodes, top_k: int = 3):
212
- logger.info(f"🔍 Re-ranking des {len(retrieved_nodes)} chunks pour la question : « {question} »")
 
213
  q_emb = self.embed_model.get_query_embedding(question)
214
- scored_nodes = []
215
-
216
  for node in retrieved_nodes:
217
- chunk_text = node.get_content()
218
- chunk_emb = self.embed_model.get_text_embedding(chunk_text)
219
  score = cos_sim(q_emb, chunk_emb).item()
220
  scored_nodes.append((score, node))
 
 
 
221
 
222
- ranked_nodes = sorted(scored_nodes, key=lambda x: x[0], reverse=True)
223
-
224
- logger.info("📊 Chunks les plus pertinents :")
225
- for i, (score, node) in enumerate(ranked_nodes[:top_k]):
226
- chunk_preview = textwrap.shorten(node.get_content().replace("\n", " "), width=100)
227
- logger.info(f"#{i+1} | Score: {score:.4f} | {chunk_preview}")
228
-
229
- return [n for _, n in ranked_nodes[:top_k]]
230
-
231
- def retrieve_context(self, question: str, top_k: int = 3):
232
- logger.info(f"📥 Récupération du contexte...")
233
- retriever = self.index.as_retriever(similarity_top_k=top_k)
234
  retrieved_nodes = retriever.retrieve(question)
235
- reranked_nodes = self.rerank_nodes(question, retrieved_nodes, top_k)
236
- context = "\n\n".join(n.get_content()[:500] for n in reranked_nodes)
237
- return context, reranked_nodes
238
-
239
- # ---------------- Public API ----------------
240
-
241
- def ask(self, question_raw: str) -> str:
242
- logger.info(f"💬 Question reçue : {question_raw}")
243
- if len(question_raw.split()) <= 100:
244
- context_sample, _ = self.retrieve_context(question_raw, top_k=3)
245
- reformulated = self.reformulate_with_context(question_raw, context_sample)
246
- else:
247
- reformulated = self.reformulate_question(question_raw)
248
-
249
- logger.info(f"📝 Question reformulée : {reformulated}")
250
- top_k = self.get_adaptive_top_k(reformulated)
251
- context, _ = self.retrieve_context(reformulated, top_k)
252
-
253
- prompt = f"""### Instruction: En te basant uniquement sur le contexte ci-dessous, réponds à la question de manière précise et en français.
254
-
255
- Si la réponse ne peut pas être déduite du contexte, indique : "Information non présente dans le contexte."
256
-
257
- Contexte :
258
- {context}
259
-
260
- Question : {reformulated}
261
- ### Réponse:"""
262
-
263
- response = self._complete(prompt, stop=["### Instruction:"], max_tokens=MAX_TOKENS)
264
- response = response.strip().split("###")[0]
265
- logger.info(f"🧠 Réponse générée : {response[:120]}{{'...' if len(response) > 120 else ''}}")
266
- return response
267
 
268
- def ask_stream(self, question: str):
269
  logger.info(f"💬 [Stream] Question reçue : {question}")
270
- top_k = self.get_adaptive_top_k(question)
271
- context, _ = self.retrieve_context(question, top_k)
272
-
273
- prompt = f"""### Instruction: En te basant uniquement sur le contexte ci-dessous, réponds à la question de manière précise et en français.
274
-
275
- Si la réponse ne peut pas être déduite du contexte, indique : "Information non présente dans le contexte."
276
-
277
- Contexte :
278
- {context}
279
-
280
- Question : {question}
281
- ### Réponse:"""
282
-
283
- logger.info("📡 Début du streaming de la réponse...")
284
- for token in self._complete_stream(prompt, stop=["### Instruction:"], max_tokens=MAX_TOKENS):
 
 
 
 
 
 
 
 
 
 
 
285
  yield token
 
 
 
1
  import os
2
  import pickle
 
3
  import logging
4
+ import time
5
+ from typing import List, Optional, Dict, Any, Iterable, Tuple
6
  import requests
7
  import faiss
8
+ import json
9
  from llama_index.core import VectorStoreIndex
10
  from llama_index.core.schema import TextNode
11
  from llama_index.vector_stores.faiss import FaissVectorStore
12
  from llama_index.embeddings.huggingface import HuggingFaceEmbedding
13
  from sentence_transformers.util import cos_sim
14
 
 
15
  # === Logger configuration ===
16
  logger = logging.getLogger("RAGEngine")
17
  logger.setLevel(logging.INFO)
 
21
  if not logger.handlers:
22
  logger.addHandler(handler)
23
 
24
+ MAX_TOKENS = 64
25
+ DEFAULT_STOPS = ["### Réponse:", "\n\n", "###"]
26
 
27
+ # ---------- Client Ollama (use /api/generate, no options) ----------
28
  class OllamaClient:
 
 
 
 
29
  def __init__(self, model: str, host: Optional[str] = None, timeout: int = 300):
30
  self.model = model
31
+ self.host = host or os.getenv("OLLAMA_HOST", "http://localhost:11435") #mode proxy
32
+ #self.host = host or os.getenv("OLLAMA_HOST", "http://localhost:11434")
33
  self.timeout = timeout
34
  self._gen_url = self.host.rstrip("/") + "/api/generate"
35
 
36
+ def generate(self, prompt: str, stop: Optional[List[str]] = None,
37
+ max_tokens: Optional[int] = None, stream: bool = False,
38
+ options: Optional[Dict[str, Any]] = None, raw: bool = False) -> str | Iterable[str]:
39
+ payload: Dict[str, Any] = {
 
 
 
 
 
40
  "model": self.model,
41
  "prompt": prompt,
42
  "stream": stream,
43
  }
44
+ if raw:
45
+ payload["raw"] = True
46
  if stop:
47
  payload["stop"] = stop
48
  if max_tokens is not None:
 
49
  payload["num_predict"] = int(max_tokens)
50
+ # ❌ aucune "options" pour laisser Ollama auto-tuner
 
 
 
51
 
52
  if stream:
53
  with requests.post(self._gen_url, json=payload, stream=True, timeout=self.timeout) as r:
 
58
  try:
59
  data = json.loads(line)
60
  except Exception:
 
61
  continue
62
+ if "response" in data and not data.get("done"):
63
  yield data["response"]
64
  if data.get("done"):
65
  break
66
  return
67
 
 
68
  r = requests.post(self._gen_url, json=payload, timeout=self.timeout)
69
  r.raise_for_status()
70
  data = r.json()
71
  return data.get("response", "")
72
 
73
+ # ---------- RAG Engine (lazy load + heuristique GK) ----------
 
 
 
 
74
  class RAGEngine:
75
+ def __init__(self, model_name: str, vector_path: str, index_path: str,
76
+ model_threads: int = 4, ollama_host: Optional[str] = None,
77
+ ollama_opts: Optional[Dict[str, Any]] = None):
78
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  logger.info(f"🔎 rag_model_ollama source: {__file__}")
80
+ logger.info("📦 Initialisation du moteur (lazy RAG)...")
 
 
 
 
 
 
 
81
 
82
+ # LLM prêt immédiatement
83
  self.llm = OllamaClient(model=model_name, host=ollama_host)
 
84
 
85
+ # chemins pour chargement différé
86
+ self.vector_path = vector_path
87
+ self.index_path = index_path
88
+
89
+ # objets RAG paresseux
90
+ self.embed_model: Optional[HuggingFaceEmbedding] = None
91
+ self.index: Optional[VectorStoreIndex] = None
92
+ self._loaded = False
93
 
94
+ logger.info(" Moteur initialisé (sans charger FAISS ni chunks).")
95
+
96
+ # ---------- lazy loader ----------
97
+ def _ensure_loaded(self):
98
+ if self._loaded:
99
+ return
100
+ t0 = time.perf_counter()
101
+ logger.info("⏳ Chargement lazy des données RAG (FAISS + chunks + embeddings)...")
102
+
103
+ # 1) chunks
104
+ with open(self.vector_path, "rb") as f:
105
+ chunk_texts: List[str] = pickle.load(f)
106
  nodes = [TextNode(text=chunk) for chunk in chunk_texts]
107
 
108
+ # 2) index FAISS
109
+ faiss_index = faiss.read_index(self.index_path)
110
  vector_store = FaissVectorStore(faiss_index=faiss_index)
 
111
 
112
+ # 3) modèle d'embedding
113
+ self.embed_model = HuggingFaceEmbedding(model_name="intfloat/multilingual-e5-base")
114
 
115
+ # 4) index LlamaIndex
116
+ self.index = VectorStoreIndex(nodes=nodes, embed_model=self.embed_model, vector_store=vector_store)
117
 
118
+ self._loaded = True
119
+ logger.info(f"✅ RAG chargé en {time.perf_counter() - t0:.2f}s (lazy).")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
+ # ---------- génération ----------
122
+ def _complete_stream(self, prompt: str, stop: Optional[List[str]] = None,
123
+ max_tokens: int = MAX_TOKENS, raw: bool = False):
124
+ return self.llm.generate(prompt=prompt, stop=stop, max_tokens=max_tokens,
125
+ stream=True, raw=raw)
 
 
 
126
 
127
+ def _complete(self, prompt: str, stop: Optional[List[str]] = None,
128
+ max_tokens: int = 128, raw: bool = False) -> str:
129
+ text = self.llm.generate(prompt=prompt, stop=stop, max_tokens=max_tokens,
130
+ stream=False, raw=raw)
131
+ return (text or "").strip()
132
 
133
+ # ---------- heuristiques ----------
134
+ def _is_greeting(self, text: str) -> bool:
135
+ s = text.lower().strip()
136
+ return s in {"bonjour", "salut", "hello", "bonsoir", "hi", "coucou", "yo"} or len(s.split()) <= 2
137
 
138
+ def _looks_general_knowledge(self, q: str) -> bool:
139
+ q = q.lower().strip()
140
+ gk_keywords = (
141
+ "capitale", "date de naissance", "qui est", "qu'est-ce", "definition",
142
+ "définition", "histoire", "pays", "ville", "math", "science", "sport"
143
+ )
144
+ if len(q.split()) <= 9:
145
+ if any(k in q for k in gk_keywords) or q.startswith(("quelle est", "qui est", "qu'est-ce", "c'est quoi")):
146
+ return True
147
+ return False
148
 
149
+ def _should_use_rag_fast(self, question: str) -> bool:
150
+ """N'active RAG que si on détecte des indices 'docs' / longueur significative."""
151
+ q = question.lower()
 
 
152
 
153
+ # 1) GK pas de RAG
154
+ if self._looks_general_knowledge(q):
155
+ return False
156
 
157
+ # 2) indices RAG
158
+ doc_keywords = (
159
+ "document", "docs", "procédure", "politique", "policy",
160
+ "manuel", "guide", "pdf", "docling", "selon", "dans le contexte",
161
+ "page", "section", "chapitre", "référence", "références", "conformément",
162
+ "note technique", "spécification", "spec", "architecture", "adr"
163
+ )
164
+ if any(k in q for k in doc_keywords):
165
+ return True
166
 
167
+ # 3) question longue → probable RAG
168
+ if len(q.split()) >= 14:
169
+ return True
170
 
171
+ return False
 
 
 
 
172
 
173
+ def _decide_mode(self, scores: List[float], tau: float = 0.32, is_greeting: bool = False) -> str:
174
+ if is_greeting:
175
+ return "llm"
176
+ top = scores[0] if scores else 0.0
177
+ return "rag" if top >= tau else "llm"
178
 
179
+ # ---------- retrieval ----------
180
  def get_adaptive_top_k(self, question: str) -> int:
181
  q = question.lower()
182
  if len(q.split()) <= 7:
 
188
  logger.info(f"🔢 top_k déterminé automatiquement : {top_k}")
189
  return top_k
190
 
191
+ def rerank_nodes(self, question: str, retrieved_nodes, top_k: int = 3) -> Tuple[List[float], List[TextNode]]:
192
+ assert self.embed_model is not None
193
+ logger.info(f"🔍 Re-ranking des {len(retrieved_nodes)} chunks pour : « {question} »")
194
  q_emb = self.embed_model.get_query_embedding(question)
195
+ scored_nodes: List[Tuple[float, TextNode]] = []
 
196
  for node in retrieved_nodes:
197
+ chunk_emb = self.embed_model.get_text_embedding(node.get_content())
 
198
  score = cos_sim(q_emb, chunk_emb).item()
199
  scored_nodes.append((score, node))
200
+ ranked = sorted(scored_nodes, key=lambda x: x[0], reverse=True)
201
+ top = ranked[:top_k]
202
+ return [s for s, _ in top], [n for _, n in top]
203
 
204
+ def retrieve_context(self, question: str, top_k: int = 3) -> Tuple[str, List[TextNode], List[float]]:
205
+ self._ensure_loaded()
206
+ retriever = self.index.as_retriever(similarity_top_k=top_k) # type: ignore
 
 
 
 
 
 
 
 
 
207
  retrieved_nodes = retriever.retrieve(question)
208
+ scores, nodes = self.rerank_nodes(question, retrieved_nodes, top_k)
209
+ context = "\n\n".join(n.get_content()[:500] for n in nodes)
210
+ return context, nodes, scores
211
+
212
+ # ---------- API publique ----------
213
+ def ask(self, question: str, allow_fallback: bool = False) -> str:
214
+ logger.info(f"💬 [Non-stream] Question reçue : {question}")
215
+ is_hello = self._is_greeting(question)
216
+
217
+ use_rag = (self._loaded and not is_hello) or (not self._loaded and self._should_use_rag_fast(question))
218
+ if use_rag:
219
+ top_k = self.get_adaptive_top_k(question)
220
+ context, _, scores = self.retrieve_context(question, top_k)
221
+ mode = self._decide_mode(scores, tau=0.32, is_greeting=is_hello)
222
+ if mode == "rag":
223
+ prompt = (
224
+ "Instruction: Réponds uniquement à partir du contexte. "
225
+ "Si la réponse n'est pas déductible, réponds exactement: \"Information non présente dans le contexte.\""
226
+ f"\n\nContexte :\n{context}\n\nQuestion : {question}\nRéponse :"
227
+ )
228
+ return self._complete(prompt, stop=DEFAULT_STOPS, raw=False)
229
+
230
+ # LLM pur
231
+ prompt_llm = (
232
+ "Réponds brièvement et précisément en français.\n"
233
+ f"Question : {question}\nRéponse :"
234
+ )
235
+ return self._complete(prompt_llm, stop=DEFAULT_STOPS, raw=False)
 
 
 
 
236
 
237
+ def ask_stream(self, question: str, allow_fallback: bool = False) -> Iterable[str]:
238
  logger.info(f"💬 [Stream] Question reçue : {question}")
239
+ is_hello = self._is_greeting(question)
240
+
241
+ use_rag = (self._loaded and not is_hello) or (not self._loaded and self._should_use_rag_fast(question))
242
+ if use_rag:
243
+ top_k = self.get_adaptive_top_k(question)
244
+ context, _, scores = self.retrieve_context(question, top_k)
245
+ mode = self._decide_mode(scores, tau=0.32, is_greeting=is_hello)
246
+ if mode == "rag":
247
+ prompt = (
248
+ "Instruction: Réponds uniquement à partir du contexte. "
249
+ "Si la réponse n'est pas déductible, réponds exactement: \"Information non présente dans le contexte.\""
250
+ f"\n\nContexte :\n{context}\n\nQuestion : {question}\nRéponse :"
251
+ )
252
+ logger.info("📡 Début streaming (RAG)...")
253
+ for token in self._complete_stream(prompt, stop=DEFAULT_STOPS, raw=False):
254
+ yield token
255
+ logger.info("📡 Fin streaming (RAG).")
256
+ return
257
+
258
+ # LLM pur
259
+ prompt_llm = (
260
+ "Réponds brièvement et précisément en français.\n"
261
+ f"Question : {question}\nRéponse :"
262
+ )
263
+ logger.info("📡 Début streaming (LLM pur)...")
264
+ for token in self._complete_stream(prompt_llm, stop=DEFAULT_STOPS, raw=False):
265
  yield token
266
+ logger.info("📡 Fin streaming (LLM pur).")
rag_model_ollama_v1_ok_full_load.py ADDED
@@ -0,0 +1,211 @@
1
+ import os
2
+ import pickle
3
+ import logging
4
+ from typing import List, Optional, Dict, Any, Iterable, Tuple
5
+ import requests
6
+ import faiss
7
+ import json
8
+ from llama_index.core import VectorStoreIndex
9
+ from llama_index.core.schema import TextNode
10
+ from llama_index.vector_stores.faiss import FaissVectorStore
11
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
12
+ from sentence_transformers.util import cos_sim
13
+
14
+ # === Logger configuration ===
15
+ logger = logging.getLogger("RAGEngine")
16
+ logger.setLevel(logging.INFO)
17
+ handler = logging.StreamHandler()
18
+ formatter = logging.Formatter("[%(asctime)s] %(levelname)s - %(message)s")
19
+ handler.setFormatter(formatter)
20
+ if not logger.handlers:
21
+ logger.addHandler(handler)
22
+
23
+ MAX_TOKENS = 64
24
+ DEFAULT_STOPS = ["### Réponse:", "\n\n", "###"]
25
+
26
+ class OllamaClient:
27
+ def __init__(self, model: str, host: Optional[str] = None, timeout: int = 300):
28
+ self.model = model
29
+ self.host = host or os.getenv("OLLAMA_HOST", "http://localhost:11434")
30
+ self.timeout = timeout
31
+ self._gen_url = self.host.rstrip("/") + "/api/generate"
32
+
33
+ def generate(self, prompt: str, stop: Optional[List[str]] = None,
34
+ max_tokens: Optional[int] = None, stream: bool = False,
35
+ options: Optional[Dict[str, Any]] = None, raw: bool = False) -> str | Iterable[str]:
36
+ payload: Dict[str, Any] = {
37
+ "model": self.model,
38
+ "prompt": prompt,
39
+ "stream": stream,
40
+ }
41
+ if raw:
42
+ payload["raw"] = True
43
+ if stop:
44
+ payload["stop"] = stop
45
+ if max_tokens is not None:
46
+ payload["num_predict"] = int(max_tokens)
47
+ # ❌ Pas d'options envoyées pour laisser Ollama choisir ses defaults
48
+
49
+ logger.debug(f"POST {self._gen_url} (stream={stream})")
50
+
51
+        if stream:
+            def _iter_stream():
+                # Stream NDJSON lines from Ollama and yield incremental text chunks.
+                with requests.post(self._gen_url, json=payload, stream=True, timeout=self.timeout) as r:
+                    r.raise_for_status()
+                    for line in r.iter_lines(decode_unicode=True):
+                        if not line:
+                            continue
+                        try:
+                            data = json.loads(line)
+                        except Exception:
+                            continue
+                        if "response" in data and not data.get("done"):
+                            yield data["response"]
+                        if data.get("done"):
+                            break
+            # Returning an inner generator keeps generate() an ordinary function,
+            # so the non-streaming branch below really returns a str.
+            return _iter_stream()
66
+
67
+ r = requests.post(self._gen_url, json=payload, timeout=self.timeout)
68
+ r.raise_for_status()
69
+ data = r.json()
70
+ return data.get("response", "")
71
+
72
+
73
+ class RAGEngine:
74
+ def __init__(self, model_name: str, vector_path: str, index_path: str,
75
+ model_threads: int = 4, ollama_host: Optional[str] = None,
76
+ ollama_opts: Optional[Dict[str, Any]] = None):
77
+
78
+ logger.info(f"🔎 rag_model_ollama source: {__file__}")
79
+ logger.info("📦 Initialisation du moteur RAG (Ollama)...")
80
+
81
+ # ❌ No Ollama options stored on the engine
82
+ self.llm = OllamaClient(model=model_name, host=ollama_host)
83
+ self.embed_model = HuggingFaceEmbedding(model_name="intfloat/multilingual-e5-base")
84
+
85
+ logger.info(f"📂 Chargement des données vectorielles depuis {vector_path}")
86
+ with open(vector_path, "rb") as f:
87
+ chunk_texts: List[str] = pickle.load(f)
88
+ nodes = [TextNode(text=chunk) for chunk in chunk_texts]
89
+
90
+ faiss_index = faiss.read_index(index_path)
91
+ vector_store = FaissVectorStore(faiss_index=faiss_index)
92
+ self.index = VectorStoreIndex(nodes=nodes, embed_model=self.embed_model, vector_store=vector_store)
93
+
94
+ logger.info("✅ Moteur RAG (Ollama) initialisé avec succès.")
95
+
96
+ # Warmup so the model is loaded before the first real request
97
+ try:
98
+ logger.info("⚡ Warmup du modèle Ollama...")
99
+ for _ in self._complete_stream("Bonjour", max_tokens=8, raw=False):
100
+ pass
101
+ except Exception as e:
102
+ logger.warning(f"Warmup échoué : {e}")
103
+
104
+ def _complete_stream(self, prompt: str, stop: Optional[List[str]] = None,
105
+ max_tokens: int = MAX_TOKENS, raw: bool = False):
106
+ return self.llm.generate(prompt=prompt, stop=stop, max_tokens=max_tokens,
107
+ stream=True, raw=raw)
108
+
109
+ def _complete(self, prompt: str, stop: Optional[List[str]] = None,
110
+ max_tokens: int = 128, raw: bool = False) -> str:
111
+ text = self.llm.generate(prompt=prompt, stop=stop, max_tokens=max_tokens,
112
+ stream=False, raw=raw)
113
+ return (text or "").strip()
114
+
115
+ def _is_greeting(self, text: str) -> bool:
116
+ s = text.lower().strip()
117
+ return s in {"bonjour", "salut", "hello", "bonsoir", "hi", "coucou", "yo"} or len(s.split()) <= 2
118
+
119
+ def _decide_mode(self, scores: List[float], tau: float = 0.32, is_greeting: bool = False) -> str:
120
+ if is_greeting:
121
+ return "llm"
122
+ top = scores[0] if scores else 0.0
123
+ return "rag" if top >= tau else "llm"
124
+
125
+ def get_adaptive_top_k(self, question: str) -> int:
126
+ q = question.lower()
127
+ if len(q.split()) <= 7:
128
+ top_k = 8
129
+ elif any(w in q for w in ["liste", "résume", "quels sont", "explique", "comment"]):
130
+ top_k = 10
131
+ else:
132
+ top_k = 8
133
+ logger.info(f"🔢 top_k déterminé automatiquement : {top_k}")
134
+ return top_k
135
+
136
+ def rerank_nodes(self, question: str, retrieved_nodes, top_k: int = 3) -> Tuple[List[float], List[TextNode]]:
137
+ logger.info(f"🔍 Re-ranking des {len(retrieved_nodes)} chunks pour : « {question} »")
138
+ q_emb = self.embed_model.get_query_embedding(question)
139
+ scored_nodes: List[Tuple[float, TextNode]] = []
140
+ for node in retrieved_nodes:
141
+ chunk_emb = self.embed_model.get_text_embedding(node.get_content())
142
+ score = cos_sim(q_emb, chunk_emb).item()
143
+ scored_nodes.append((score, node))
144
+ ranked = sorted(scored_nodes, key=lambda x: x[0], reverse=True)
145
+ top = ranked[:top_k]
146
+ return [s for s, _ in top], [n for _, n in top]
147
+
148
+ def retrieve_context(self, question: str, top_k: int = 3) -> Tuple[str, List[TextNode], List[float]]:
149
+ retriever = self.index.as_retriever(similarity_top_k=top_k)
150
+ retrieved_nodes = retriever.retrieve(question)
151
+ scores, nodes = self.rerank_nodes(question, retrieved_nodes, top_k)
152
+ context = "\n\n".join(n.get_content()[:500] for n in nodes)
153
+ return context, nodes, scores
154
+
155
+ def ask(self, question: str, allow_fallback: bool = False) -> str:
156
+ """Génération non-stream"""
157
+ logger.info(f"💬 [Non-stream] Question reçue : {question}")
158
+ is_hello = self._is_greeting(question)
159
+ context, scores = "", []
160
+ if not is_hello:
161
+ top_k = self.get_adaptive_top_k(question)
162
+ context, _, scores = self.retrieve_context(question, top_k)
163
+
164
+ mode = self._decide_mode(scores, tau=0.32, is_greeting=is_hello)
165
+ logger.info(f"🧭 Mode choisi (non-stream) : {mode}")
166
+
167
+ if mode == "rag":
168
+ prompt = (
169
+ "Instruction: Réponds uniquement à partir du contexte. "
170
+ "Si la réponse n'est pas déductible, réponds exactement: \"Information non présente dans le contexte.\""
171
+ f"\n\nContexte :\n{context}\n\nQuestion : {question}\nRéponse :"
172
+ )
173
+ return self._complete(prompt, stop=DEFAULT_STOPS, raw=False)
174
+
175
+ prompt_llm = (
176
+ "Réponds brièvement et précisément en français.\n"
177
+ f"Question : {question}\nRéponse :"
178
+ )
179
+ return self._complete(prompt_llm, stop=DEFAULT_STOPS, raw=False)
180
+
181
+ def ask_stream(self, question: str, allow_fallback: bool = False) -> Iterable[str]:
182
+ logger.info(f"💬 [Stream] Question reçue : {question}")
183
+ is_hello = self._is_greeting(question)
184
+ context, scores = "", []
185
+ if not is_hello:
186
+ top_k = self.get_adaptive_top_k(question)
187
+ context, _, scores = self.retrieve_context(question, top_k)
188
+
189
+ mode = self._decide_mode(scores, tau=0.32, is_greeting=is_hello)
190
+ logger.info(f"🧭 Mode choisi (stream) : {mode}")
191
+
192
+ if mode == "rag":
193
+ prompt = (
194
+ "Instruction: Réponds uniquement à partir du contexte. "
195
+ "Si la réponse n'est pas déductible, réponds exactement: \"Information non présente dans le contexte.\""
196
+ f"\n\nContexte :\n{context}\n\nQuestion : {question}\nRéponse :"
197
+ )
198
+ logger.info("📡 Début streaming (RAG)...")
199
+ for token in self._complete_stream(prompt, stop=DEFAULT_STOPS, raw=False):
200
+ yield token
201
+ logger.info("📡 Fin streaming (RAG).")
202
+ return
203
+
204
+ prompt_llm = (
205
+ "Réponds brièvement et précisément en français.\n"
206
+ f"Question : {question}\nRéponse :"
207
+ )
208
+ logger.info("📡 Début streaming (LLM pur)...")
209
+ for token in self._complete_stream(prompt_llm, stop=DEFAULT_STOPS, raw=False):
210
+ yield token
211
+ logger.info("📡 Fin streaming (LLM pur).")
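For reference, a minimal consumption sketch of the engine above, assuming the module is importable as rag_model_ollama_v1_ok_full_load, an Ollama server is reachable on the default host, and the two artefact paths (placeholders here) point to an existing FAISS index and its pickled chunk list:

from rag_model_ollama_v1_ok_full_load import RAGEngine

engine = RAGEngine(
    model_name="qwen2.5:3b-instruct-q4_K_M",    # example tag; any model already pulled into Ollama
    vector_path="vectordb_docling/chunks.pkl",  # placeholder path
    index_path="vectordb_docling/index.faiss",  # placeholder path
)

# Non-streaming: ask() returns the full answer as a string.
print(engine.ask("Quels sont les objectifs du projet ?"))

# Streaming: ask_stream() yields tokens as they arrive.
for token in engine.ask_stream("Quels sont les objectifs du projet ?"):
    print(token, end="", flush=True)
print()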
rag_model_ollama_v1_ok_llm.py ADDED
@@ -0,0 +1,185 @@
1
+ import os
2
+ import pickle
3
+ import logging
4
+ from typing import List, Optional, Dict, Any, Iterable, Tuple
5
+ import requests
6
+ import faiss
7
+ import json
8
+ from llama_index.core import VectorStoreIndex
9
+ from llama_index.core.schema import TextNode
10
+ from llama_index.vector_stores.faiss import FaissVectorStore
11
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
12
+ from sentence_transformers.util import cos_sim
13
+
14
+ # === Logger configuration ===
15
+ logger = logging.getLogger("RAGEngine")
16
+ logger.setLevel(logging.INFO)
17
+ handler = logging.StreamHandler()
18
+ formatter = logging.Formatter("[%(asctime)s] %(levelname)s - %(message)s")
19
+ handler.setFormatter(formatter)
20
+ if not logger.handlers:
21
+ logger.addHandler(handler)
22
+
23
+ MAX_TOKENS = 64
24
+ DEFAULT_STOPS = ["### Réponse:", "\n\n", "###"]
25
+
26
+ class OllamaClient:
27
+ def __init__(self, model: str, host: Optional[str] = None, timeout: int = 300):
28
+ self.model = model
29
+ self.host = host or os.getenv("OLLAMA_HOST", "http://localhost:11434")
30
+ self.timeout = timeout
31
+ self._gen_url = self.host.rstrip("/") + "/api/generate"
32
+
33
+ def generate(self, prompt: str, stop: Optional[List[str]] = None,
34
+ max_tokens: Optional[int] = None, stream: bool = False,
35
+ options: Optional[Dict[str, Any]] = None, raw: bool = False) -> str | Iterable[str]:
36
+ payload: Dict[str, Any] = {
37
+ "model": self.model,
38
+ "prompt": prompt,
39
+ "stream": stream,
40
+ }
41
+ if raw:
42
+ payload["raw"] = True
43
+ if stop:
44
+ payload["stop"] = stop
45
+ if max_tokens is not None:
46
+ payload["num_predict"] = int(max_tokens)
47
+ # ❌ No extra options sent, so Ollama keeps its own defaults
48
+
49
+ logger.debug(f"POST {self._gen_url} (stream={stream})")
50
+
51
+        if stream:
+            def _iter_stream():
+                # Stream NDJSON lines from Ollama and yield incremental text chunks.
+                with requests.post(self._gen_url, json=payload, stream=True, timeout=self.timeout) as r:
+                    r.raise_for_status()
+                    for line in r.iter_lines(decode_unicode=True):
+                        if not line:
+                            continue
+                        try:
+                            data = json.loads(line)
+                        except Exception:
+                            continue
+                        if "response" in data and not data.get("done"):
+                            yield data["response"]
+                        if data.get("done"):
+                            break
+            # Returning an inner generator keeps generate() an ordinary function,
+            # so the non-streaming branch below really returns a str.
+            return _iter_stream()
66
+
67
+ r = requests.post(self._gen_url, json=payload, timeout=self.timeout)
68
+ r.raise_for_status()
69
+ data = r.json()
70
+ return data.get("response", "")
71
+
72
+
73
+ class RAGEngine:
74
+ def __init__(self, model_name: str, vector_path: str, index_path: str,
75
+ model_threads: int = 4, ollama_host: Optional[str] = None,
76
+ ollama_opts: Optional[Dict[str, Any]] = None):
77
+
78
+ logger.info(f"🔎 rag_model_ollama source: {__file__}")
79
+ logger.info("📦 Initialisation du moteur RAG (Ollama)...")
80
+
81
+ # ❌ No Ollama options stored on the engine
82
+ self.llm = OllamaClient(model=model_name, host=ollama_host)
83
+ self.embed_model = HuggingFaceEmbedding(model_name="intfloat/multilingual-e5-base")
84
+
85
+ logger.info(f"📂 Chargement des données vectorielles depuis {vector_path}")
86
+ with open(vector_path, "rb") as f:
87
+ chunk_texts: List[str] = pickle.load(f)
88
+ nodes = [TextNode(text=chunk) for chunk in chunk_texts]
89
+
90
+ faiss_index = faiss.read_index(index_path)
91
+ vector_store = FaissVectorStore(faiss_index=faiss_index)
92
+ self.index = VectorStoreIndex(nodes=nodes, embed_model=self.embed_model, vector_store=vector_store)
93
+
94
+ logger.info("✅ Moteur RAG (Ollama) initialisé avec succès.")
95
+
96
+ # Warmup so the model is loaded before the first real request
97
+ try:
98
+ logger.info("⚡ Warmup du modèle Ollama...")
99
+ for _ in self._complete_stream("Bonjour", max_tokens=8, raw=False):
100
+ pass
101
+ except Exception as e:
102
+ logger.warning(f"Warmup échoué : {e}")
103
+
104
+ def _complete_stream(self, prompt: str, stop: Optional[List[str]] = None,
105
+ max_tokens: int = MAX_TOKENS, raw: bool = False):
106
+ return self.llm.generate(prompt=prompt, stop=stop, max_tokens=max_tokens,
107
+ stream=True, raw=raw)
108
+
109
+ def _complete(self, prompt: str, stop: Optional[List[str]] = None,
110
+ max_tokens: int = 128, raw: bool = False) -> str:
111
+ text = self.llm.generate(prompt=prompt, stop=stop, max_tokens=max_tokens,
112
+ stream=False, raw=raw)
113
+ return (text or "").strip()
114
+
115
+ def _is_greeting(self, text: str) -> bool:
116
+ s = text.lower().strip()
117
+ return s in {"bonjour", "salut", "hello", "bonsoir", "hi", "coucou", "yo"} or len(s.split()) <= 2
118
+
119
+ def _decide_mode(self, scores: List[float], tau: float = 0.32, is_greeting: bool = False) -> str:
120
+ if is_greeting:
121
+ return "llm"
122
+ top = scores[0] if scores else 0.0
123
+ return "rag" if top >= tau else "llm"
124
+
125
+ def get_adaptive_top_k(self, question: str) -> int:
126
+ q = question.lower()
127
+ if len(q.split()) <= 7:
128
+ top_k = 8
129
+ elif any(w in q for w in ["liste", "résume", "quels sont", "explique", "comment"]):
130
+ top_k = 10
131
+ else:
132
+ top_k = 8
133
+ logger.info(f"🔢 top_k déterminé automatiquement : {top_k}")
134
+ return top_k
135
+
136
+ def rerank_nodes(self, question: str, retrieved_nodes, top_k: int = 3) -> Tuple[List[float], List[TextNode]]:
137
+ logger.info(f"🔍 Re-ranking des {len(retrieved_nodes)} chunks pour : « {question} »")
138
+ q_emb = self.embed_model.get_query_embedding(question)
139
+ scored_nodes: List[Tuple[float, TextNode]] = []
140
+ for node in retrieved_nodes:
141
+ chunk_emb = self.embed_model.get_text_embedding(node.get_content())
142
+ score = cos_sim(q_emb, chunk_emb).item()
143
+ scored_nodes.append((score, node))
144
+ ranked = sorted(scored_nodes, key=lambda x: x[0], reverse=True)
145
+ top = ranked[:top_k]
146
+ return [s for s, _ in top], [n for _, n in top]
147
+
148
+ def retrieve_context(self, question: str, top_k: int = 3) -> Tuple[str, List[TextNode], List[float]]:
149
+ retriever = self.index.as_retriever(similarity_top_k=top_k)
150
+ retrieved_nodes = retriever.retrieve(question)
151
+ scores, nodes = self.rerank_nodes(question, retrieved_nodes, top_k)
152
+ context = "\n\n".join(n.get_content()[:500] for n in nodes)
153
+ return context, nodes, scores
154
+
155
+ def ask_stream(self, question: str, allow_fallback: bool = False) -> Iterable[str]:
156
+ logger.info(f"💬 [Stream] Question reçue : {question}")
157
+ is_hello = self._is_greeting(question)
158
+ context, scores = "", []
159
+ if not is_hello:
160
+ top_k = self.get_adaptive_top_k(question)
161
+ context, _, scores = self.retrieve_context(question, top_k)
162
+
163
+ mode = self._decide_mode(scores, tau=0.32, is_greeting=is_hello)
164
+ logger.info(f"🧭 Mode choisi (stream) : {mode}")
165
+
166
+ if mode == "rag":
167
+ prompt = (
168
+ "Instruction: Réponds uniquement à partir du contexte. "
169
+ "Si la réponse n'est pas déductible, réponds exactement: \"Information non présente dans le contexte.\""
170
+ f"\n\nContexte :\n{context}\n\nQuestion : {question}\nRéponse :"
171
+ )
172
+ logger.info("📡 Début streaming (RAG)...")
173
+ for token in self._complete_stream(prompt, stop=DEFAULT_STOPS, raw=False):
174
+ yield token
175
+ logger.info("📡 Fin streaming (RAG).")
176
+ return
177
+
178
+ prompt_llm = (
179
+ "Réponds brièvement et précisément en français.\n"
180
+ f"Question : {question}\nRéponse :"
181
+ )
182
+ logger.info("📡 Début streaming (LLM pur)...")
183
+ for token in self._complete_stream(prompt_llm, stop=DEFAULT_STOPS, raw=False):
184
+ yield token
185
+ logger.info("📡 Fin streaming (LLM pur).")
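The rerank_nodes step above re-embeds every retrieved chunk and orders it by cosine similarity to the query. A minimal sketch of that step in isolation, using the same embedding model as the engine; the chunk texts are illustrative:

from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from sentence_transformers.util import cos_sim

embed_model = HuggingFaceEmbedding(model_name="intfloat/multilingual-e5-base")

question = "Quel modèle d'embedding est utilisé ?"
chunks = [
    "Le projet s'appuie sur le modèle d'embedding intfloat/multilingual-e5-base.",
    "La réunion de lancement a eu lieu en mars.",
]

q_emb = embed_model.get_query_embedding(question)
scored = []
for chunk in chunks:
    c_emb = embed_model.get_text_embedding(chunk)
    scored.append((cos_sim(q_emb, c_emb).item(), chunk))

# Highest cosine similarity first, as in rerank_nodes above.
for score, chunk in sorted(scored, key=lambda x: x[0], reverse=True):
    print(f"{score:.3f}  {chunk}")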
rag_model_ollama_v2.py ADDED
@@ -0,0 +1,302 @@
1
+
2
+ import os
3
+ import pickle
4
+ import textwrap
5
+ import logging
6
+ from typing import List, Optional, Dict, Any, Iterable
7
+
8
+ import requests
9
+ import faiss
10
+ import numpy as np
11
+ from llama_index.core import VectorStoreIndex
12
+ from llama_index.core.schema import TextNode
13
+ from llama_index.vector_stores.faiss import FaissVectorStore
14
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
15
+ from sentence_transformers.util import cos_sim
16
+
17
+
18
+ # === Logger configuration ===
19
+ logger = logging.getLogger("RAGEngine")
20
+ logger.setLevel(logging.INFO)
21
+ handler = logging.StreamHandler()
22
+ formatter = logging.Formatter("[%(asctime)s] %(levelname)s - %(message)s")
23
+ handler.setFormatter(formatter)
24
+ if not logger.handlers:
25
+ logger.addHandler(handler)
26
+
27
+ #MAX_TOKENS = 512
28
+ MAX_TOKENS = 64
29
+
30
+
31
+ class OllamaClient:
32
+ """
33
+ Minimal Ollama client for /api/generate (text completion) with streaming support.
34
+ Docs: https://github.com/ollama/ollama/blob/main/docs/api.md#generate-a-completion
35
+ """
36
+ def __init__(self, model: str, host: Optional[str] = None, timeout: int = 300):
37
+ self.model = model
38
+ self.host = host or os.getenv("OLLAMA_HOST", "http://localhost:11434")
39
+ self.timeout = timeout
40
+ self._gen_url = self.host.rstrip("/") + "/api/generate"
41
+
42
+ def generate(
43
+ self,
44
+ prompt: str,
45
+ stop: Optional[List[str]] = None,
46
+ max_tokens: Optional[int] = None,
47
+ stream: bool = False,
48
+ options: Optional[Dict[str, Any]] = None,
49
+ raw: bool = False
50
+ ) -> str | Iterable[str]:
51
+ payload = {
52
+ "model": self.model,
53
+ "prompt": prompt,
54
+ "stream": stream,
55
+ }
56
+ if raw:
57
+ payload["raw"]=True
58
+ if stop:
59
+ payload["stop"] = stop
60
+ if max_tokens is not None:
61
+ # Ollama uses "num_predict" for max new tokens
62
+ payload["num_predict"] = int(max_tokens)
63
+ if options:
64
+ payload["options"] = options
65
+
66
+ logger.debug(f"POST {self._gen_url} (stream={stream})")
67
+
68
+        if stream:
+            def _iter_stream():
+                # Stream NDJSON lines from Ollama and yield incremental text chunks.
+                with requests.post(self._gen_url, json=payload, stream=True, timeout=self.timeout) as r:
+                    r.raise_for_status()
+                    for line in r.iter_lines(decode_unicode=True):
+                        if not line:
+                            continue
+                        try:
+                            data = json.loads(line)
+                        except Exception:
+                            # In case a broken line appears
+                            continue
+                        if "response" in data and data.get("done") is not True:
+                            yield data["response"]
+                        if data.get("done"):
+                            break
+            # Returning an inner generator keeps generate() an ordinary function,
+            # so the non-streaming branch below really returns a str.
+            return _iter_stream()
84
+
85
+ # Non-streaming
86
+ r = requests.post(self._gen_url, json=payload, timeout=self.timeout)
87
+ r.raise_for_status()
88
+ data = r.json()
89
+ return data.get("response", "")
90
+
91
+
92
+ # Module-level import placed after the class to keep the header short; it still
+ # runs at import time, before OllamaClient.generate ever needs json.loads.
93
+ import json
94
+
95
+
96
+ class RAGEngine:
97
+ def __init__(
98
+ self,
99
+ model_name: str,
100
+ vector_path: str,
101
+ index_path: str,
102
+ model_threads: int = 4,
103
+ ollama_host: Optional[str] = None,
104
+ ollama_opts: Optional[Dict[str, Any]] = None,
105
+ ):
106
+ """
107
+ Args:
108
+ model_name: e.g. "nous-hermes2:Q4_K_M" or "llama3.1:8b-instruct-q4_K_M"
109
+ vector_path: pickle file with chunk texts list[str]
110
+ index_path: FAISS index path
111
+ model_threads: forwarded to Ollama via options.n_threads (if supported by the model)
112
+ ollama_host: override OLLAMA_HOST (default http://localhost:11434)
113
+ ollama_opts: extra Ollama options (e.g., temperature, top_p, num_gpu, num_thread)
114
+ """
115
+ logger.info(f"🔎 rag_model_ollama source: {__file__}")
116
+ logger.info("📦 Initialisation du moteur RAG (Ollama)...")
117
+ # Build options
118
+ opts = dict(ollama_opts or {})
119
+ # Common low-latency defaults; user can override via ollama_opts
120
+ opts.setdefault("temperature", 0.1)
121
+ # Try to pass thread hint if supported by the backend
122
+ if "num_thread" not in opts and model_threads:
123
+ opts["num_thread"] = int(model_threads)
124
+
125
+ self.llm = OllamaClient(model=model_name, host=ollama_host)
126
+ self.ollama_opts = opts
127
+
128
+ #self.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
129
+
130
+ self.embed_model = HuggingFaceEmbedding(model_name="intfloat/multilingual-e5-base")
131
+ logger.info(f"📂 Chargement des données vectorielles depuis {vector_path}")
132
+ with open(vector_path, "rb") as f:
133
+ chunk_texts = pickle.load(f)
134
+ nodes = [TextNode(text=chunk) for chunk in chunk_texts]
135
+
136
+ faiss_index = faiss.read_index(index_path)
137
+ vector_store = FaissVectorStore(faiss_index=faiss_index)
138
+ self.index = VectorStoreIndex(nodes=nodes, embed_model=self.embed_model, vector_store=vector_store)
139
+
140
+ logger.info("✅ Moteur RAG (Ollama) initialisé avec succès.")
141
+
142
+ # ---------------- LLM helpers (via Ollama) ----------------
143
+
144
+ def _complete(self, prompt: str, stop: Optional[List[str]] = None, max_tokens: int = 128,raw:bool=True) -> str:
145
+ text = self.llm.generate(
146
+ prompt=prompt,
147
+ stop=stop,
148
+ max_tokens=max_tokens,
149
+ stream=False,
150
+ options=self.ollama_opts,
151
+ raw=raw
152
+ )
153
+ # Defensive: if generate() ever returns a generator instead of a str, coerce it to a string.
154
+ try:
155
+ if hasattr(text, "__iter__") and not isinstance(text, (str, bytes)):
156
+ chunks = []
157
+ for t in text:
158
+ if not isinstance(t, (str, bytes)):
159
+ continue
160
+ chunks.append(t)
161
+ text = "".join(chunks)
162
+ except Exception:
163
+ pass
164
+ return (text or "").strip()
165
+
166
+ def _complete_stream(self, prompt: str, stop: Optional[List[str]] = None, max_tokens: int = MAX_TOKENS,raw : bool =True):
167
+ return self.llm.generate(
168
+ prompt=prompt,
169
+ stop=stop,
170
+ max_tokens=max_tokens,
171
+ stream=True,
172
+ options=self.ollama_opts,
173
+ raw=raw
174
+ )
175
+
176
+ # ---------------- Reformulation ----------------
177
+
178
+ def reformulate_question(self, question: str) -> str:
179
+ logger.info("🔁 Reformulation de la question (sans contexte)...")
180
+ prompt = f"""Tu es un assistant expert chargé de clarifier des questions floues.
181
+
182
+ Transforme la question suivante en une question claire, explicite et complète, sans ajouter d'informations extérieures.
183
+
184
+ Question floue : {question}
185
+ Question reformulée :"""
186
+ reformulated = self._complete(prompt, stop=["### Réponse:", "\n\n", "###"], max_tokens=128)
187
+ logger.info(f"📝 Reformulée : {reformulated}")
188
+ return reformulated.strip().split("###")[0]
189
+
190
+ def reformulate_with_context(self, question: str, context_sample: str) -> str:
191
+ logger.info("🔁 Reformulation de la question avec contexte...")
192
+ prompt = f"""Tu es un assistant expert en machine learning. Ton rôle est de reformuler les questions utilisateur en tenant compte du contexte ci-dessous, extrait d’un rapport technique sur un projet de reconnaissance de maladies de plantes.
193
+
194
+ Ta mission est de transformer une question vague ou floue en une question précise et adaptée au contenu du rapport. Ne donne pas une interprétation hors sujet. Ne reformule pas en termes de produits commerciaux.
195
+
196
+ Contexte :
197
+ {context_sample}
198
+
199
+ Question initiale : {question}
200
+ Question reformulée :"""
201
+ reformulated = self._complete(prompt, stop=["### Réponse:", "\n\n", "###"], max_tokens=128)
202
+ logger.info(f"📝 Reformulée avec contexte : {reformulated}")
203
+ return reformulated
204
+
205
+ # ---------------- Retrieval ----------------
206
+
207
+ def get_adaptive_top_k(self, question: str) -> int:
208
+ q = question.lower()
209
+ if len(q.split()) <= 7:
210
+ top_k = 8
211
+ elif any(w in q for w in ["liste", "résume", "quels sont", "explique", "comment"]):
212
+ top_k = 10
213
+ else:
214
+ top_k = 8
215
+ logger.info(f"🔢 top_k déterminé automatiquement : {top_k}")
216
+ return top_k
217
+
218
+ def rerank_nodes(self, question: str, retrieved_nodes, top_k: int = 3):
219
+ logger.info(f"🔍 Re-ranking des {len(retrieved_nodes)} chunks pour la question : « {question} »")
220
+ q_emb = self.embed_model.get_query_embedding(question)
221
+ scored_nodes = []
222
+
223
+ for node in retrieved_nodes:
224
+ chunk_text = node.get_content()
225
+ chunk_emb = self.embed_model.get_text_embedding(chunk_text)
226
+ score = cos_sim(q_emb, chunk_emb).item()
227
+ scored_nodes.append((score, node))
228
+
229
+ ranked_nodes = sorted(scored_nodes, key=lambda x: x[0], reverse=True)
230
+
231
+ logger.info("📊 Chunks les plus pertinents :")
232
+ for i, (score, node) in enumerate(ranked_nodes[:top_k]):
233
+ chunk_preview = textwrap.shorten(node.get_content().replace("\n", " "), width=100)
234
+ logger.info(f"#{i+1} | Score: {score:.4f} | {chunk_preview}")
235
+
236
+ return [n for _, n in ranked_nodes[:top_k]]
237
+
238
+ def retrieve_context(self, question: str, top_k: int = 3):
239
+ logger.info(f"📥 Récupération du contexte...")
240
+ retriever = self.index.as_retriever(similarity_top_k=top_k)
241
+ retrieved_nodes = retriever.retrieve(question)
242
+ reranked_nodes = self.rerank_nodes(question, retrieved_nodes, top_k)
243
+ context = "\n\n".join(n.get_content()[:500] for n in reranked_nodes)
244
+ return context, reranked_nodes
245
+
246
+ # ---------------- Public API ----------------
247
+
248
+ def ask(self, question_raw: str) -> str:
249
+ logger.info(f"💬 Question reçue : {question_raw}")
250
+ context=""
251
+ reformulate=False
252
+ if reformulate :
253
+ if len(question_raw.split()) <= 2:
254
+ context_sample, _ = self.retrieve_context(question_raw, top_k=3)
255
+ reformulated = self.reformulate_with_context(question_raw, context_sample)
256
+ else:
257
+ reformulated = self.reformulate_question(question_raw)
258
+
259
+ logger.info(f"📝 Question reformulée : {reformulated}")
260
+ top_k = self.get_adaptive_top_k(reformulated)
261
+ context, _ = self.retrieve_context(reformulated, top_k)
262
+ else:
263
+ reformulated=question_raw
264
+
265
+
266
+ prompt = f"""### Instruction: En te basant uniquement sur le contexte ci-dessous, réponds à la question de manière précise et en français.
267
+
268
+ Si la réponse ne peut pas être déduite du contexte, indique : "Information non présente dans le contexte."
269
+
270
+ Contexte :
271
+ {context}
272
+
273
+ Question : {reformulated}
274
+ ### Réponse:"""
275
+
276
+ response = self._complete(prompt, stop=["### Réponse:", "\n\n", "###"], max_tokens=MAX_TOKENS)
277
+ response = response.strip().split("###")[0]
278
+ ellipsis = "..." if len(response) > 120 else ""
279
+ logger.info(f"🧠 Réponse générée : {response[:120]}{ellipsis}")
280
+ return response
281
+
282
+ def ask_stream(self, question: str):
283
+ logger.info(f"💬 [Stream] Question reçue : {question}")
284
+ top_k = self.get_adaptive_top_k(question)
285
+ context, _ = self.retrieve_context(question, top_k)
286
+ context = ""  # TODO: test override that discards the retrieved context; remove before real use
287
+
288
+ prompt = f"""### Instruction: En te basant uniquement sur le contexte ci-dessous, réponds à la question de manière précise et en français.
289
+
290
+ Si la réponse ne peut pas être déduite du contexte, indique : "Information non présente dans le contexte."
291
+
292
+ Contexte :
293
+ {context}
294
+
295
+ Question : {question}
296
+ ### Réponse:"""
297
+
298
+ logger.info("📡 Début du streaming de la réponse...")
299
+ for token in self._complete_stream(prompt, stop=["### Réponse:", "\n\n", "###"], max_tokens=MAX_TOKENS,raw=False):
300
+ yield token
301
+
302
+ logger.info("📡 Fin du streaming de la réponse...")
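As a quick smoke test of the client above, both completion modes can be exercised directly, assuming an Ollama server is reachable on the default host and the model tag below (an example, not taken from this commit) has already been pulled:

from rag_model_ollama_v2 import OllamaClient

client = OllamaClient(model="qwen2.5:3b-instruct-q4_K_M")  # example model tag

# Non-streaming: generate() returns the full completion as a string.
print(client.generate("Réponds en un mot : quelle est la capitale de la France ?", max_tokens=16))

# Streaming: generate(stream=True) yields incremental text chunks.
for chunk in client.generate("Explique le RAG en une phrase.", max_tokens=64, stream=True):
    print(chunk, end="", flush=True)
print()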