nouvelle version (new version)
app_ollama.py → app_ollama_v1.py
RENAMED
@@ -6,7 +6,7 @@ import streamlit as st
 from huggingface_hub import hf_hub_download
 
 # ✅ Nouveau moteur RAG (Ollama)
-from rag_model_ollama import RAGEngine
+from rag_model_ollama_v1 import RAGEngine
 
 # --- Config & logs ---
 os.environ.setdefault("NLTK_DATA", "/home/appuser/nltk_data")
@@ -50,12 +50,14 @@ ollama_host = st.sidebar.text_input("Ollama host", value=default_host, help="Ex:
 
 # Propose des modèles déjà présents ou courants
 suggested_models = [
+    "noushermes_rag",
     "mistral",       # présent chez toi
     "gemma3",        # présent chez toi
     "deepseek-r1",   # présent chez toi (raisonnement long, plus lent)
     "granite3.3",    # présent chez toi
     "llama3.1:8b-instruct-q4_K_M",
     "nous-hermes2:Q4_K_M",
+
 ]
 model_name = st.sidebar.selectbox("Modèle Ollama", options=suggested_models, index=0)
 num_threads = st.sidebar.slider("Threads (hint)", min_value=2, max_value=16, value=6, step=1)
@@ -65,7 +67,7 @@ st.title("🤖 Chatbot RAG Local (Ollama)")
 
 # --- Cache du moteur ---
 @st.cache_resource(show_spinner=True)
-def load_rag_engine(_model_name: str, _host: str, _threads: int, _temp: float):
+def load_rag_engine(_model_name: str, _host: str, _threads: int, _temp: float, _version: int = 1):
     # Options pour Ollama
     ollama_opts = {
         "num_thread": int(_threads),
@@ -83,12 +85,14 @@ def load_rag_engine(_model_name: str, _host: str, _threads: int, _temp: float):
 
     # Warmup léger (évite la latence au 1er token)
     try:
-
+        gen = rag._complete_stream("Bonjour", max_tokens=1)
+        next(gen, "")
+
     except Exception as e:
         logger.warning(f"Warmup Ollama échoué: {e}")
     return rag
 
-rag = load_rag_engine(model_name, ollama_host, num_threads, temperature)
+rag = load_rag_engine(model_name, ollama_host, num_threads, temperature, _version=2)
 
 # --- Chat simple ---
 user_input = st.text_area("Posez votre question :", height=120, placeholder="Ex: Quels sont les traitements appliqués aux images ?")
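One detail worth spelling out about the _version argument added to load_rag_engine above: st.cache_resource derives its cache key from the function's arguments, but any parameter whose name starts with an underscore is excluded from hashing. A minimal, self-contained sketch of that behaviour (the DummyEngine class and load_engine function are illustrative placeholders, not code from this Space):

import streamlit as st

class DummyEngine:
    """Illustrative stand-in for a RAG engine (hypothetical)."""
    def __init__(self, model_name: str, host: str):
        self.model_name = model_name
        self.host = host

@st.cache_resource(show_spinner=True)
def load_engine(model_name: str, _host: str, version: int = 1) -> DummyEngine:
    # Streamlit hashes `model_name` and `version` into the cache key.
    # `_host` starts with an underscore, so it is NOT hashed: changing it
    # alone reuses the cached engine. Bumping `version` forces a rebuild.
    return DummyEngine(model_name, _host)

engine = load_engine("mistral", "http://localhost:11434", version=2)

Under that convention, a version bump only invalidates the cached engine when the parameter is hashed, i.e. when its name does not carry the leading underscore.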
rag_model_ollama.py → rag_model_ollama_v1.py
RENAMED
@@ -32,7 +32,7 @@ class OllamaClient:
     Minimal Ollama client for /api/generate (text completion) with streaming support.
     Docs: https://github.com/ollama/ollama/blob/main/docs/api.md#generate-a-completion
     """
-    def __init__(self, model: str, host: Optional[str] = None, timeout: int =
+    def __init__(self, model: str, host: Optional[str] = None, timeout: int = 300):
        self.model = model
        self.host = host or os.getenv("OLLAMA_HOST", "http://localhost:11434")
        self.timeout = timeout
@@ -108,6 +108,7 @@ class RAGEngine:
        ollama_host: override OLLAMA_HOST (default http://localhost:11434)
        ollama_opts: extra Ollama options (e.g., temperature, top_p, num_gpu, num_thread)
        """
+        logger.info(f"🔎 rag_model_ollama source: {__file__}")
        logger.info("📦 Initialisation du moteur RAG (Ollama)...")
        # Build options
        opts = dict(ollama_opts or {})
@@ -281,4 +282,4 @@ Question : {question}
 
        logger.info("📡 Début du streaming de la réponse...")
        for token in self._complete_stream(prompt, stop=["### Instruction:"], max_tokens=MAX_TOKENS):
-
+            yield token
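Both files hinge on _complete_stream, which (per the docstring above) talks to Ollama's /api/generate endpoint and yields tokens as they arrive; the warmup in app_ollama_v1.py simply pulls one token from that generator so the model is loaded before the first real question. A hedged sketch of that call pattern with plain requests, assuming a local Ollama server with the mistral model pulled (the complete_stream helper below is illustrative, not the Space's actual client):

import json
import requests

def complete_stream(host: str, model: str, prompt: str, max_tokens: int = 256,
                    stop=None, timeout: int = 300):
    # Stream newline-delimited JSON chunks from /api/generate and yield the text pieces.
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": True,
        "options": {"num_predict": max_tokens, "stop": stop or []},
    }
    with requests.post(f"{host}/api/generate", json=payload, stream=True, timeout=timeout) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines():
            if not line:
                continue
            chunk = json.loads(line)
            if chunk.get("done"):
                break
            yield chunk.get("response", "")

# Warmup as in app_ollama_v1.py: consume a single token so the model is resident
# before the first user query reaches the UI.
gen = complete_stream("http://localhost:11434", "mistral", "Bonjour", max_tokens=1)
print(next(gen, ""))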