nouvelle version (new version)
app_ollama.py → app_ollama_v1.py
RENAMED
@@ -6,7 +6,7 @@ import streamlit as st
 from huggingface_hub import hf_hub_download
 
 # ✅ Nouveau moteur RAG (Ollama)
-from rag_model_ollama import RAGEngine
+from rag_model_ollama_v1 import RAGEngine
 
 # --- Config & logs ---
 os.environ.setdefault("NLTK_DATA", "/home/appuser/nltk_data")
@@ -50,12 +50,14 @@ ollama_host = st.sidebar.text_input("Ollama host", value=default_host, help="Ex:
 
 # Propose des modèles déjà présents ou courants
 suggested_models = [
+    "noushermes_rag",
     "mistral",       # présent chez toi
     "gemma3",        # présent chez toi
     "deepseek-r1",   # présent chez toi (raisonnement long, plus lent)
     "granite3.3",    # présent chez toi
     "llama3.1:8b-instruct-q4_K_M",
     "nous-hermes2:Q4_K_M",
+
 ]
 model_name = st.sidebar.selectbox("Modèle Ollama", options=suggested_models, index=0)
 num_threads = st.sidebar.slider("Threads (hint)", min_value=2, max_value=16, value=6, step=1)
@@ -65,7 +67,7 @@ st.title("🤖 Chatbot RAG Local (Ollama)")
 
 # --- Cache du moteur ---
 @st.cache_resource(show_spinner=True)
-def load_rag_engine(_model_name: str, _host: str, _threads: int, _temp: float):
+def load_rag_engine(_model_name: str, _host: str, _threads: int, _temp: float, _version: int = 1):
     # Options pour Ollama
     ollama_opts = {
         "num_thread": int(_threads),
@@ -83,12 +85,14 @@ def load_rag_engine(_model_name: str, _host: str, _threads: int, _temp: float):
 
     # Warmup léger (évite la latence au 1er token)
     try:
-
+        gen = rag._complete_stream("Bonjour", max_tokens=1)
+        next(gen, "")
+
     except Exception as e:
         logger.warning(f"Warmup Ollama échoué: {e}")
     return rag
 
-rag = load_rag_engine(model_name, ollama_host, num_threads, temperature)
+rag = load_rag_engine(model_name, ollama_host, num_threads, temperature, _version=2)
 
 # --- Chat simple ---
 user_input = st.text_area("Posez votre question :", height=120, placeholder="Ex: Quels sont les traitements appliqués aux images ?")
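One detail worth spelling out about the _version argument added to load_rag_engine above: st.cache_resource derives its cache key from the function's arguments, but any parameter whose name starts with an underscore is excluded from hashing. A minimal, self-contained sketch of that behaviour (the DummyEngine class and load_engine function are illustrative placeholders, not code from this Space):

import streamlit as st

class DummyEngine:
    """Illustrative stand-in for a RAG engine (hypothetical)."""
    def __init__(self, model_name: str, host: str):
        self.model_name = model_name
        self.host = host

@st.cache_resource(show_spinner=True)
def load_engine(model_name: str, _host: str, version: int = 1) -> DummyEngine:
    # Streamlit hashes `model_name` and `version` into the cache key.
    # `_host` starts with an underscore, so it is NOT hashed: changing it
    # alone reuses the cached engine. Bumping `version` forces a rebuild.
    return DummyEngine(model_name, _host)

engine = load_engine("mistral", "http://localhost:11434", version=2)

Under that convention, a version bump only invalidates the cached engine when the parameter is hashed, i.e. when its name does not carry the leading underscore.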
rag_model_ollama.py → rag_model_ollama_v1.py
RENAMED
@@ -32,7 +32,7 @@ class OllamaClient:
     Minimal Ollama client for /api/generate (text completion) with streaming support.
     Docs: https://github.com/ollama/ollama/blob/main/docs/api.md#generate-a-completion
     """
-    def __init__(self, model: str, host: Optional[str] = None, timeout: int =
+    def __init__(self, model: str, host: Optional[str] = None, timeout: int = 300):
        self.model = model
        self.host = host or os.getenv("OLLAMA_HOST", "http://localhost:11434")
        self.timeout = timeout
@@ -108,6 +108,7 @@ class RAGEngine:
        ollama_host: override OLLAMA_HOST (default http://localhost:11434)
        ollama_opts: extra Ollama options (e.g., temperature, top_p, num_gpu, num_thread)
        """
+        logger.info(f"🔎 rag_model_ollama source: {__file__}")
        logger.info("📦 Initialisation du moteur RAG (Ollama)...")
        # Build options
        opts = dict(ollama_opts or {})
@@ -281,4 +282,4 @@ Question : {question}
 
        logger.info("📡 Début du streaming de la réponse...")
        for token in self._complete_stream(prompt, stop=["### Instruction:"], max_tokens=MAX_TOKENS):
-
+            yield token
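Both files hinge on _complete_stream, which (per the docstring above) talks to Ollama's /api/generate endpoint and yields tokens as they arrive; the warmup in app_ollama_v1.py simply pulls one token from that generator so the model is loaded before the first real question. A hedged sketch of that call pattern with plain requests, assuming a local Ollama server with the mistral model pulled (the complete_stream helper below is illustrative, not the Space's actual client):

import json
import requests

def complete_stream(host: str, model: str, prompt: str, max_tokens: int = 256,
                    stop=None, timeout: int = 300):
    # Stream newline-delimited JSON chunks from /api/generate and yield the text pieces.
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": True,
        "options": {"num_predict": max_tokens, "stop": stop or []},
    }
    with requests.post(f"{host}/api/generate", json=payload, stream=True, timeout=timeout) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines():
            if not line:
                continue
            chunk = json.loads(line)
            if chunk.get("done"):
                break
            yield chunk.get("response", "")

# Warmup as in app_ollama_v1.py: consume a single token so the model is resident
# before the first user query reaches the UI.
gen = complete_stream("http://localhost:11434", "mistral", "Bonjour", max_tokens=1)
print(next(gen, ""))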