Spaces:

jbl2024
/

publik_rag

Sleeping

App Files Files Community

jbl2024 committed on Feb 28

Commit

50705f6

verified ·

1 Parent(s): f6256fc

Upload folder using huggingface_hub

Browse files

Files changed (9) hide show

.envrc +1 -0
.gitignore +124 -0
README.md +2 -8
app.py +78 -0
crawler.py +310 -0
data/chunks.pkl +3 -0
data/embeddings.pkl +3 -0
rag.py +149 -0
requirements.txt +5 -0

.envrc ADDED Viewed

	@@ -0,0 +1 @@


1	+ source .envrc.private

.gitignore ADDED Viewed

	@@ -0,0 +1,124 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+.python-version
+# celery beat schedule file
+celerybeat-schedule
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyderworkspace
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+.envrc.private
+.gradio

README.md CHANGED Viewed

@@ -1,12 +1,6 @@
 ---
-title: Publik Rag
-emoji: ⚡
-colorFrom: purple
-colorTo: gray
 sdk: gradio
 sdk_version: 5.19.0
-app_file: app.py
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: publik_rag
+app_file: app.py
 sdk: gradio
 sdk_version: 5.19.0
 ---

app.py ADDED Viewed

	@@ -0,0 +1,78 @@

+import os
+import pickle
+import gradio as gr
+from crawler import ContentCrawler
+from rag import RAGEngine
# Locations of the cached crawl results and of their embedding vectors.
chunks = "./data/chunks.pkl"
embeddings = "./data/embeddings.pkl"

# SECURITY NOTE: pickle.load can execute arbitrary code stored in the file.
# Both caches must only ever be files produced by this application itself;
# never point these paths at data obtained from an untrusted source.

# Load the crawled chunks from the cache, or crawl the website to build them.
if os.path.exists(chunks):
    print("Loading chunks")
    with open(chunks, "rb") as f:
        results = pickle.load(f)
else:
    print("Chunks file not found. Crawling the website...")
    base_url = "https://doc-publik.entrouvert.com/"
    crawler = ContentCrawler(base_url)
    results = crawler.crawl()
    # An empty result means the crawl failed; do not cache it, so that the
    # next start-up tries again instead of loading an empty index forever.
    if results:
        # The data directory may not exist on a fresh checkout.
        os.makedirs(os.path.dirname(chunks), exist_ok=True)
        with open(chunks, "wb") as f:
            pickle.dump(results, f)

# Build the RAG engine on top of the chunks.
rag_engine = RAGEngine(results)

# Reuse cached embeddings only when there is exactly one vector per indexed
# document; anything else means the cache belongs to a different crawl and
# would pair search scores with the wrong chunks.
cached_embeddings = None
if os.path.exists(embeddings):
    print("Loading embeddings")
    with open(embeddings, "rb") as f:
        cached_embeddings = pickle.load(f)
    if len(cached_embeddings) != len(rag_engine.indexed_documents):
        print("Cached embeddings do not match the chunks; recreating them")
        cached_embeddings = None

if cached_embeddings is not None:
    rag_engine.embeddings = cached_embeddings
else:
    print("Creating embeddings")
    rag_engine.index_documents()
    os.makedirs(os.path.dirname(embeddings), exist_ok=True)
    with open(embeddings, "wb") as f:
        pickle.dump(rag_engine.embeddings, f)
# Answer a question with the RAG engine and render the result as Markdown,
# listing the source URLs as clickable links.
def answer_question(question):
    """Stream the Markdown answer to *question* for the Gradio interface.

    This is a generator: it first yields a loading message, then the final
    Markdown text, so the UI can show progress while the engine works.

    Args:
        question: Text typed by the user.

    Yields:
        A prompt to enter a question when the input is blank; otherwise the
        loading message followed by either the answer (with a "Sources" list
        when the engine returned URLs) or an error message.
    """
    # A blank question carries nothing to retrieve on; answer immediately
    # instead of spending an embedding pass and a remote LLM call on it.
    if not question or not question.strip():
        yield "Veuillez saisir une question."
        return

    # Show a loading message right away.
    yield "Chargement en cours..."

    try:
        result = rag_engine.rag(question, top_k=5)
        response = result.get("response", "")
        urls = result.get("urls", [])
        if urls:
            # Render each source URL as a Markdown bullet link.
            links_md = "\n".join(f"- [{url}]({url})" for url in urls)
            markdown_output = f"{response}\n\n**Sources:**\n{links_md}"
        else:
            markdown_output = response
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing.
        markdown_output = f"Une erreur est survenue: {str(e)}"

    # Yield outside the try block so that only engine failures are caught.
    yield markdown_output
# Assemble the Gradio Q&A page: one text box for the question, one Markdown
# panel for the answer, and no flagging button.
question_box = gr.Textbox(label="Votre question")
answer_panel = gr.Markdown(label="Réponse")
iface = gr.Interface(
    fn=answer_question,
    inputs=question_box,
    outputs=answer_panel,
    title="Publik Q&A",
    description="Poser des questions sur Publik",
    flagging_mode="never",
)

# Start serving the interface.
iface.launch()

crawler.py ADDED Viewed

	@@ -0,0 +1,310 @@

+import hashlib
+from typing import Dict, List, Set, Tuple
+from urllib.parse import urljoin, urlparse, urlunparse
+import requests
+from bs4 import BeautifulSoup, NavigableString, Tag
class ContentCrawler:
    """Crawl a website breadth-first and turn its pages into markdown chunks.

    The homepage is fetched only to discover links. Every internal page
    reached from it has the element with id "content" converted to markdown
    and split into chunks of at most ``max_length`` characters.
    """

    # Seconds to wait for a response. Without a timeout a single stalled
    # connection would block the whole crawl forever.
    REQUEST_TIMEOUT = 30

    # Separator between markdown blocks.
    _BLOCK_SEP = "\n\n"

    def __init__(
        self, base_url: str, ignore_prefixes: List[str] = None, max_length: int = 8000
    ):
        """
        Initialize the crawler.

        Args:
            base_url: The website URL to crawl.
            ignore_prefixes: URL path prefixes (without leading slash) to skip.
                None selects the default list; an explicit empty list
                disables the filtering.
            max_length: Maximum allowed size of a chunk, in characters.

        Raises:
            ValueError: If max_length is not positive.
        """
        if max_length <= 0:
            raise ValueError("max_length must be a positive integer")
        self.base_url = base_url
        self.visited = set()
        self.results = []
        self.max_length = max_length
        if ignore_prefixes is None:
            ignore_prefixes = [
                "manage/",
                "password/",
                "media/",
                "notes-de-mises-a-jour/",
            ]
        # Copy so that later changes to the caller's list do not affect us.
        self.ignore_prefixes = list(ignore_prefixes)
        # Hashes of page contents already stored, to skip duplicate pages.
        self.content_hashes = set()

    def crawl(self) -> List[Dict[str, str]]:
        """
        Crawl the website starting from the homepage.

        Returns:
            A list of dictionaries with keys 'url' and 'text' (markdown), or
            an empty list when the homepage cannot be fetched.
        """
        # Local import keeps this class self-contained.
        from collections import deque

        try:
            homepage_html = self._fetch(self.base_url)
        except requests.RequestException as e:
            print(f"Error fetching homepage {self.base_url}: {e}")
            return []

        homepage_soup = BeautifulSoup(homepage_html, "html.parser")
        queue = deque()
        self._enqueue_links(
            queue,
            self.base_url,
            self._get_internal_links(homepage_soup, self.base_url),
        )

        while queue:
            current_url = queue.popleft()
            print(f"Processing {current_url}")
            result, new_links = self._parse_page(current_url)
            if result:
                self.results.extend(result)
            # Relative links must be resolved against the page holding them.
            self._enqueue_links(queue, current_url, new_links)

        return self.results

    def _fetch(self, url: str) -> str:
        """Download *url* and return its body; raises requests.RequestException."""
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.text

    def _enqueue_links(self, queue, page_url: str, links) -> None:
        """Resolve *links* against *page_url* and append the unseen ones to *queue*."""
        home_urls = (self.base_url, self._normalize_url(self.base_url))
        # Sorted so that the crawl order does not depend on set ordering.
        for link in sorted(links):
            full_url = self._normalize_url(urljoin(page_url, link))
            if full_url in self.visited or full_url in home_urls:
                continue
            self.visited.add(full_url)
            queue.append(full_url)

    def _normalize_url(self, url: str) -> str:
        """Return *url* without its fragment and query string."""
        parsed = urlparse(url)
        return urlunparse(parsed._replace(fragment="", query=""))

    def _get_internal_links(
        self, soup: "BeautifulSoup", page_url: str = None
    ) -> Set[str]:
        """
        Collect the hrefs of *soup* that point to crawlable pages of this site.

        A link is kept when, once resolved against *page_url* (the base URL
        when omitted), it uses http(s), stays on the base URL's host and its
        path does not start with an ignored prefix. The hrefs are returned as
        found in the page, unresolved.
        """
        origin = page_url or self.base_url
        site_host = urlparse(self.base_url).netloc.lower()
        ignored = tuple(self.ignore_prefixes)
        links = set()
        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"].strip()
            if not href or href.startswith("#"):
                continue
            try:
                target = urlparse(urljoin(origin, href))
            except ValueError:
                # Malformed URL (e.g. a broken IPv6 literal): not crawlable.
                continue
            # Rejects mailto:, tel:, javascript: and so on.
            if target.scheme not in ("http", "https"):
                continue
            # Compare hosts rather than searching for the base URL inside the
            # href, which would also match external URLs that merely contain it.
            if target.netloc.lower() != site_host:
                continue
            if target.path.lstrip("/").startswith(ignored):
                continue
            links.add(href)
        return links

    def _parse_page(self, url: str) -> Tuple[List[Dict[str, str]], Set[str]]:
        """
        Fetch *url* and extract its content chunks and its internal links.

        Returns:
            (chunks, links). chunks is empty when the page cannot be fetched,
            has no element with id "content", has no text, or repeats the
            content of a page already processed.
        """
        try:
            html = self._fetch(url)
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
            return [], set()

        soup = BeautifulSoup(html, "html.parser")
        links = self._get_internal_links(soup, url)

        content_div = soup.find(id="content")
        if content_div is None:
            print(f"No content div found in {url}")
            return [], links

        # Remove nodes whose text is not page content.
        for node in content_div.find_all(["script", "style"]):
            node.decompose()

        h1_tag = content_div.find("h1")
        page_title = self._text(h1_tag) if h1_tag else ""

        markdown_content = self._extract_structured_content(content_div, page_title)
        if not markdown_content.strip():
            return [], links

        content_hash = self._hash_content(markdown_content)
        if content_hash in self.content_hashes:
            print(f"Duplicate content skipped for {url}")
            return [], links
        self.content_hashes.add(content_hash)

        results = [
            {"url": f"{url}#chunk-{i}", "text": chunk}
            for i, chunk in enumerate(self._split_content(markdown_content), start=1)
        ]
        return results, links

    @staticmethod
    def _text(element) -> str:
        """Return the text of *element* with its fragments joined by one space.

        get_text(strip=True) alone joins the stripped fragments with an empty
        string, gluing words that sit on either side of an inline tag.
        """
        return element.get_text(" ", strip=True)

    def _extract_structured_content(self, content_div: "Tag", page_title: str) -> str:
        """Convert the direct children of *content_div* to markdown.

        The page title, when given, is the first block; later blocks that
        merely repeat it (the page's own h1) are dropped.
        """
        lines = []
        if page_title:
            lines.append(f"# {page_title}")
        for child in content_div.find_all(True, recursive=False):
            markdown = self._convert_element_to_markdown(child)
            if markdown:
                lines.append(markdown)

        text = self._BLOCK_SEP.join(lines)
        if not page_title:
            return text
        title_line = f"# {page_title}"
        blocks = text.split(self._BLOCK_SEP)
        return self._BLOCK_SEP.join(
            block
            for position, block in enumerate(blocks)
            if position == 0 or block != title_line
        )

    def _convert_element_to_markdown(self, element) -> str:
        """Convert one HTML node (tag or text) to markdown; '' when empty."""
        # bs4 is already a dependency of this module; Comment is only needed here.
        from bs4 import Comment

        if isinstance(element, NavigableString):
            # HTML comments are NavigableStrings too but are not page content.
            return "" if isinstance(element, Comment) else element.strip()
        if not isinstance(element, Tag):
            return ""

        name = element.name
        if name in ("script", "style", "iframe"):
            return ""
        if name in ("h1", "h2", "h3", "h4", "h5", "h6"):
            text = self._text(element)
            return f"{'#' * int(name[1])} {text}" if text else ""
        if name == "a" and element.get("href"):
            return f"[{self._text(element)}]({element.get('href')})"
        if name in ("ul", "ol"):
            items = []
            for li in element.find_all("li", recursive=False):
                text = self._text(li)
                if text:
                    bullet = f"{len(items) + 1}." if name == "ol" else "*"
                    items.append(f"{bullet} {text}")
            return "\n".join(items)
        if name == "table":
            rows = []
            for tr in element.find_all("tr"):
                cells = [self._text(cell) for cell in tr.find_all(["td", "th"])]
                if cells:
                    rows.append(cells)
            if not rows:
                return ""
            table_lines = [" | ".join(cells) for cells in rows]
            if len(table_lines) > 1:
                # Markdown header separator: one '---' per header column.
                table_lines.insert(1, " | ".join(["---"] * len(rows[0])))
            return "\n".join(table_lines)
        if name in ("div", "section", "article"):
            parts = []
            for child in element.children:
                part = self._convert_element_to_markdown(child)
                if part:
                    parts.append(part)
            return self._BLOCK_SEP.join(parts)
        return self._text(element)

    def _split_content(self, content: str) -> List[str]:
        """Split *content* into chunks of at most max_length characters.

        Content is cut between blank-line separated blocks. When the first
        block is a '# ' title it is repeated at the top of every chunk. A
        block too large for one chunk is sliced into fixed-size pieces.
        """
        if len(content) <= self.max_length:
            return [content]

        sep = self._BLOCK_SEP
        blocks = content.split(sep)
        main_title = blocks[0] if blocks[0].startswith("# ") else ""
        prefix = f"{main_title}{sep}" if main_title else ""
        if len(prefix) >= self.max_length:
            # The title alone would fill a chunk: treat it as ordinary text.
            main_title, prefix = "", ""
        # Characters available in each chunk once the title is accounted for.
        budget = self.max_length - len(prefix)

        chunks = []
        body = []  # blocks collected for the chunk being built
        body_len = 0  # length of sep.join(body)
        for block in blocks:
            if not block or block == main_title:
                continue
            fits = body_len + len(block) + (len(sep) if body else 0) <= budget
            if not fits and body:
                chunks.append(prefix + sep.join(body))
                body, body_len = [], 0
            if len(block) > budget:
                # Oversized block: hard-slice it (body is empty at this point).
                chunks.extend(
                    prefix + block[start : start + budget]
                    for start in range(0, len(block), budget)
                )
                continue
            body_len += len(block) + (len(sep) if body else 0)
            body.append(block)
        if body:
            chunks.append(prefix + sep.join(body))
        return chunks

    def _hash_content(self, content: str) -> str:
        """Return an MD5 fingerprint of *content*, used only to spot duplicates."""
        return hashlib.md5(content.encode("utf-8")).hexdigest()

data/chunks.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c623fccae7e8dd58d582bbd8afcee7f2e2ab8db6bf72c43bfbf12a6378463146
+size 1259802

data/embeddings.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:79145630ff16a45dc12fa301cb21dd4ae3735f67d12030f069786d6e050ec0e5
+size 2942230

rag.py ADDED Viewed

	@@ -0,0 +1,149 @@

+import os
+from typing import Dict, List
+import numpy as np
+import torch
+from transformers import AutoTokenizer, AutoModel
+from huggingface_hub import InferenceClient
class RAGEngine:
    """Retrieval-augmented generation over a fixed list of documents.

    Embeddings are computed locally with a Hugging Face model; answers are
    generated by a remote LLM through the Hugging Face inference client.
    """

    def __init__(
        self,
        documents: List[Dict[str, str]],
        embedding_model: str = "BAAI/bge-m3",
        llm_model: str = "meta-llama/Llama-3.1-8B-Instruct",
        batch_size: int = 64,
    ):
        """
        Set up the engine: keep the documents, load the local embedding model
        and create the client for the remote LLM.

        Args:
            documents: List of documents, each a dict with keys 'url' and 'text'.
            embedding_model: Name of the model used to compute embeddings locally.
            llm_model: Name of the LLM used for completions.
            batch_size: Number of documents embedded per batch.
        """
        self.documents = documents
        self.embedding_model = embedding_model  # model name, not the model object
        self.llm_model = llm_model
        self.batch_size = batch_size
        # One vector per entry of indexed_documents, in the same order.
        self.embeddings: List[List[float]] = []
        # Documents without text cannot be embedded meaningfully; skip them.
        self.indexed_documents = [
            doc for doc in self.documents if (doc.get("text") or "").strip()
        ]
        # Tokenizer and model used locally for the embeddings.
        self.embedding_tokenizer = AutoTokenizer.from_pretrained(self.embedding_model)
        self.embedding_model_local = AutoModel.from_pretrained(self.embedding_model)
        # The LLM itself runs remotely.
        self._init_client_hf()

    def _init_client_hf(self) -> None:
        """Create the inference client, authenticated with HF_TOKEN when set."""
        self.client = InferenceClient(
            model=self.llm_model,
            token=os.environ.get("HF_TOKEN"),
        )

    @staticmethod
    def _mean_pool(last_hidden_state, attention_mask):
        """Average the token vectors of each sequence, ignoring padding.

        Args:
            last_hidden_state: Tensor indexed as (sequence, token, feature).
            attention_mask: Tensor indexed as (sequence, token); 1 marks a
                real token and 0 marks padding.

        Returns:
            A tensor with one averaged vector per sequence.
        """
        mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
        # clamp guards against a division by zero on an all-padding row.
        token_counts = mask.sum(dim=1).clamp(min=1)
        return (last_hidden_state * mask).sum(dim=1) / token_counts

    def _embed(self, texts: List[str]) -> List[List[float]]:
        """Return one embedding (list of floats) per text of *texts*."""
        inputs = self.embedding_tokenizer(
            texts, padding=True, truncation=True, return_tensors="pt"
        )
        with torch.no_grad():
            outputs = self.embedding_model_local(**inputs)
        # Padding tokens must not take part in the average: otherwise the
        # embedding of a document depends on the other texts of its batch,
        # and differs from how the (unpadded) question is embedded.
        pooled = self._mean_pool(outputs.last_hidden_state, inputs["attention_mask"])
        return pooled.cpu().tolist()

    def index_documents(self) -> None:
        """Compute the embeddings of all indexed documents, batch by batch."""
        texts = [doc["text"] for doc in self.indexed_documents]
        # Start from scratch so that calling this twice does not duplicate vectors.
        self.embeddings = []
        for i in range(0, len(texts), self.batch_size):
            batch_embeddings = self._embed(texts[i:i + self.batch_size])
            self.embeddings.extend(batch_embeddings)
            print(f"Batch {i//self.batch_size + 1} traité, {len(batch_embeddings)} embeddings obtenus")

    @staticmethod
    def cosine_similarity(query_vec: np.ndarray, matrix: np.ndarray) -> np.ndarray:
        """
        Compute the cosine similarity between a query vector and every row of
        a matrix. A small epsilon keeps zero vectors from dividing by zero.
        """
        query_norm = np.linalg.norm(query_vec)
        query_normalized = query_vec / (query_norm + 1e-10)
        matrix_norm = np.linalg.norm(matrix, axis=1, keepdims=True)
        matrix_normalized = matrix / (matrix_norm + 1e-10)
        return np.dot(matrix_normalized, query_normalized)

    def search(self, query_embedding: List[float], top_k: int = 5) -> List[Dict]:
        """
        Find the documents most similar to the query by cosine similarity.

        Args:
            query_embedding: Embedding of the query.
            top_k: Number of results to return.

        Returns:
            A list of dicts with keys "url", "text" and "score", best first.
            Empty when the index is empty or top_k is not positive.

        Raises:
            ValueError: If the number of embeddings differs from the number of
                indexed documents (e.g. a stale embeddings cache was loaded).
        """
        if top_k <= 0 or len(self.embeddings) == 0:
            return []
        if len(self.embeddings) != len(self.indexed_documents):
            raise ValueError(
                f"{len(self.embeddings)} embeddings for "
                f"{len(self.indexed_documents)} indexed documents"
            )
        query_vec = np.asarray(query_embedding, dtype=float)
        emb_matrix = np.asarray(self.embeddings, dtype=float)
        scores = self.cosine_similarity(query_vec, emb_matrix)
        top_indices = np.argsort(scores)[::-1][:top_k]
        results = []
        for idx in top_indices:
            doc = self.indexed_documents[idx]
            results.append(
                {"url": doc["url"], "text": doc["text"], "score": float(scores[idx])}
            )
        return results

    def ask_llm(self, prompt: str) -> str:
        """Send *prompt* to the LLM as a single user message and return its reply."""
        messages = [{"role": "user", "content": prompt}]
        response = self.client.chat.completions.create(
            model=self.llm_model, messages=messages
        )
        # The content may be None; the declared return type is str.
        return response.choices[0].message.content or ""

    def rag(self, question: str, top_k: int = 4) -> Dict[str, object]:
        """
        Answer *question* with retrieval-augmented generation.

        Args:
            question: The question asked.
            top_k: Number of context documents to include.

        Returns:
            A dict with keys "response" (str), "prompt" (str) and "urls"
            (list of the context documents' URLs).
        """
        # 1. Embed the question locally, the same way documents are embedded.
        question_embedding = self._embed([question])[0]
        # 2. Retrieve the most similar documents.
        results = self.search(query_embedding=question_embedding, top_k=top_k)
        context = "\n\n".join([f"URL: {res['url']}\n{res['text']}" for res in results])
        # 3. Build the prompt.
        prompt = (
            "You are a highly capable, thoughtful, and precise assistant. Your goal is to deeply understand the user's intent, ask clarifying questions when needed, think step-by-step through complex problems, provide clear and accurate answers, and proactively anticipate helpful follow-up information. "
            "Based on the following context, answer the question precisely and concisely. "
            "If you do not know the answer, do not make it up.\n\n"
            f"Context:\n{context}\n\n"
            f"Question: {question}\n\n"
            "Answer:"
        )
        urls = [res['url'] for res in results]
        # 4. Query the LLM with the prompt.
        llm_response = self.ask_llm(prompt)
        return {"response": llm_response, "prompt": prompt, "urls": urls}

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+requests==2.32.3
+transformers==4.49.0
+numpy==2.2.3
+huggingface-hub==0.29.1
+torch==2.6.0