import time
import gradio as gr
from datasets import load_dataset
import pandas as pd
from sentence_transformers import SentenceTransformer, SparseEncoder
from sentence_transformers.quantization import quantize_embeddings
import faiss
from usearch.index import Index
import torch
import numpy as np
from collections import defaultdict
from scipy import stats



# Load titles, texts and pre-computed sparse embeddings
wikipedia_dataset = load_dataset("CATIE-AQ/wikipedia_fr_2022_250K", split="train", num_proc=4).select_columns(["title", "text", "wiki_id", "sparse_emb"])


# A function to make the titles of Wikipedia articles clickable in the final dataframe, so that you can consult the article on the website
def add_link(example):
    example["title"] = f'[{example["title"]}](https://fr.wikipedia.org/wiki?curid={example["wiki_id"]})'
    return example
wikipedia_dataset = wikipedia_dataset.map(add_link)
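# After the map, each title is rendered as a markdown link, e.g. "Paris" becomes
# "[Paris](https://fr.wikipedia.org/wiki?curid=<wiki_id>)".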


# Load the int8 and binary indices for dense retrieval
int8_view = Index.restore("wikipedia_fr_2022_250K_int8_usearch.index", view=True)
binary_index: faiss.IndexBinaryFlat = faiss.read_index_binary("wikipedia_fr_2022_250K_ubinary_faiss.index")
binary_ivf: faiss.IndexBinaryIVF = faiss.read_index_binary("wikipedia_fr_2022_250K_ubinary_ivf_faiss.index")
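# Together these support a two-stage dense pipeline: the binary FAISS indices retrieve
# candidates quickly from 1-bit embeddings, and the int8 usearch view then reloads only
# those candidates so they can be rescored against the full query embedding (see search()).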


# Load models
dense_model = SentenceTransformer("OrdalieTech/Solon-embeddings-large-0.1")
sparse_model = SparseEncoder("CATIE-AQ/SPLADE_camembert-base_STS")
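# The dense model maps a text to a single dense vector, while the SPLADE sparse model
# produces a vocabulary-sized sparse vector whose non-zero entries act as weighted terms.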


# Fusion methods
def reciprocal_rank_fusion(dense_results, sparse_results, k=20):
    """
    Perform Reciprocal Rank Fusion to combine dense and sparse retrieval results
    
    Args:
        dense_results: List of (doc_id, score) from dense retrieval
        sparse_results: List of (doc_id, score) from sparse retrieval  
        k: RRF parameter (default 20)
    
    Returns:
        List of (doc_id, rrf_score) sorted by RRF score
    """
    rrf_scores = defaultdict(float)
    
    # Add scores from dense retrieval
    for rank, (doc_id, _) in enumerate(dense_results, 1):
        rrf_scores[doc_id] += 1 / (k + rank)
        
    # Add scores from sparse retrieval
    for rank, (doc_id, _) in enumerate(sparse_results, 1):
        rrf_scores[doc_id] += 1 / (k + rank)
        
    # Sort by RRF score
    sorted_results = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)
    return sorted_results
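
# Illustrative example: with k=20, a document ranked 1st by the dense model and 3rd by the
# sparse model gets an RRF score of 1/(20+1) + 1/(20+3) ≈ 0.0911.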

    
def normalized_score_fusion(dense_results, sparse_results, dense_weight=0.5, sparse_weight=0.5):
    """
    Perform Normalized Score Fusion (NSF) with z-score normalization
    
    Args:
        dense_results: List of (doc_id, score) from dense retrieval
        sparse_results: List of (doc_id, score) from sparse retrieval
        dense_weight: Weight for dense scores (default 0.5)
        sparse_weight: Weight for sparse scores (default 0.5)
    
    Returns:
        List of (doc_id, normalized_score) sorted by normalized score
    """
    # Extract scores for normalization
    dense_scores = np.array([score for _, score in dense_results])
    sparse_scores = np.array([score for _, score in sparse_results])
    
    # Z-score normalization (handle edge cases)
    if len(dense_scores) > 1 and np.std(dense_scores) > 1e-10:
        dense_scores_norm = stats.zscore(dense_scores)
    else:
        dense_scores_norm = np.zeros_like(dense_scores)
    
    if len(sparse_scores) > 1 and np.std(sparse_scores) > 1e-10:
        sparse_scores_norm = stats.zscore(sparse_scores)
    else:
        sparse_scores_norm = np.zeros_like(sparse_scores)
    
    # Create dictionaries for normalized scores
    dense_norm_dict = {doc_id: score for (doc_id, _), score in zip(dense_results, dense_scores_norm)}
    sparse_norm_dict = {doc_id: score for (doc_id, _), score in zip(sparse_results, sparse_scores_norm)}
    
    # Combine all unique document IDs
    all_doc_ids = set()
    all_doc_ids.update(doc_id for doc_id, _ in dense_results)
    all_doc_ids.update(doc_id for doc_id, _ in sparse_results)
    
    # Calculate weighted normalized scores
    nsf_scores = {}
    for doc_id in all_doc_ids:
        dense_norm_score = dense_norm_dict.get(doc_id, 0.0)
        sparse_norm_score = sparse_norm_dict.get(doc_id, 0.0)
        
        # Weighted combination of normalized scores
        nsf_scores[doc_id] = (dense_weight * dense_norm_score + 
                              sparse_weight * sparse_norm_score)
    
    # Sort by NSF score
    sorted_results = sorted(nsf_scores.items(), key=lambda x: x[1], reverse=True)
    return sorted_results
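
# Illustrative example: dense scores [0.8, 0.6, 0.4] z-normalize to roughly [1.22, 0.0, -1.22];
# each document's final score is then dense_weight * z_dense + sparse_weight * z_sparse,
# with 0.0 used when a document is missing from one of the result lists.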


# Search part
## Currently, to speed things up, only the non-zero values of each sparse embedding are kept for the sparse search
## The sparse search part still needs optimising (about 25-30 s per query at the moment...)
## Tom probably has some tips on this point
sparse_index = {}
for i, sparse_emb in enumerate(wikipedia_dataset["sparse_emb"]):
    # Convert the sparse embedding to a NumPy array
    sparse_emb = np.array(sparse_emb)

    # Only store the non-zero values and their indices
    non_zero_indices = np.nonzero(sparse_emb)[0]
    non_zero_values = sparse_emb[non_zero_indices]
    sparse_index[i] = (non_zero_indices, non_zero_values)
    

def sparse_search(query, top_k=20):
    """
    Perform sparse retrieval using SPLADE representations with a dictionary-based sparse index.
    """
    # Encode the query, the output is a sparse torch.Tensor
    query_sparse_vector = sparse_model.encode(query, convert_to_numpy=False)
    
    # Convert the sparse tensor to a dense format before using np.nonzero
    query_sparse_vector = query_sparse_vector.to_dense().numpy()
    
    # Get non-zero values and indices for the query
    query_non_zero_indices = np.nonzero(query_sparse_vector)[0]
    query_non_zero_values = query_sparse_vector[query_non_zero_indices]
    
    # Compute dot product similarity efficiently
    scores = defaultdict(float)
    for doc_id, (doc_indices, doc_values) in sparse_index.items():
        # Find the intersection of non-zero indices
        common_indices = np.intersect1d(query_non_zero_indices, doc_indices)
        # Compute dot product for common indices only
        for idx in common_indices:
            query_val = query_non_zero_values[np.where(query_non_zero_indices == idx)[0][0]]
            doc_val = doc_values[np.where(doc_indices == idx)[0][0]]
            scores[doc_id] += query_val * doc_val
            
    # Sort and get top_k
    sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    return sorted_scores[:top_k]
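
# Hedged optimization sketch (not part of the original pipeline): the Python loop above can
# usually be replaced by a single sparse matrix-vector product with SciPy. The helpers below,
# build_sparse_doc_matrix and sparse_search_vectorized, are hypothetical; they are defined
# here only as a possible speed-up and are not wired into the app.
from scipy.sparse import csr_matrix

def build_sparse_doc_matrix(vocab_size):
    # Stack every document's (indices, values) pair into one CSR matrix of shape (n_docs, vocab_size).
    rows, cols, vals = [], [], []
    for doc_id, (indices, values) in sparse_index.items():
        rows.extend([doc_id] * len(indices))
        cols.extend(indices.tolist())
        vals.extend(values.tolist())
    return csr_matrix((vals, (rows, cols)), shape=(len(sparse_index), vocab_size))

def sparse_search_vectorized(query, doc_matrix, top_k=20):
    # Encode and densify the query exactly as in sparse_search above.
    query_vector = sparse_model.encode(query, convert_to_numpy=False).to_dense().cpu().numpy()
    scores = doc_matrix @ query_vector  # one sparse mat-vec over all documents
    top_ids = np.argsort(-scores)[:top_k]
    return [(int(i), float(scores[i])) for i in top_ids]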


# Final search combining dense and sparse results
def search(query, top_k: int = 20, rescore_multiplier: int = 1, use_approx: bool = False, 
           fusion_method: str = "rrf", rrf_k: int = 20, dense_weight: float = 0.5, sparse_weight: float = 0.5):
    total_start_time = time.time()
    
    # 1. Dense retrieval pipeline (binary candidate search, then int8 rescoring)
    start_time = time.time()
    query_embedding = dense_model.encode(query, prompt="query: ")
    embed_time = time.time() - start_time

    start_time = time.time()
    query_embedding_ubinary = quantize_embeddings(query_embedding.reshape(1, -1), "ubinary")
    quantize_time = time.time() - start_time

    index = binary_ivf if use_approx else binary_index
    start_time = time.time()
    _scores, binary_ids = index.search(query_embedding_ubinary, top_k * rescore_multiplier * 2)  # Get more for fusion
    binary_ids = binary_ids[0]
    dense_search_time = time.time() - start_time

    start_time = time.time()
    int8_embeddings = int8_view[binary_ids].astype(int)
    load_time = time.time() - start_time

    start_time = time.time()
    scores = query_embedding @ int8_embeddings.T
    rescore_time = time.time() - start_time

    # Prepare dense results
    dense_results = [(binary_ids[i], scores[i]) for i in range(len(binary_ids))]
    dense_results.sort(key=lambda x: x[1], reverse=True)
    
    timing_info = {
        "Temps pour créer l'embedding de la requête (dense)": f"{embed_time:.4f} s",
        "Temps pour la quantification": f"{quantize_time:.4f} s",
        "Temps pour effectuer la recherche dense": f"{dense_search_time:.4f} s",
        "Temps de chargement": f"{load_time:.4f} s",
        "Temps de rescorage": f"{rescore_time:.4f} s",
    }

    
    if fusion_method != "dense_only":
        # 2. Sparse retrieval pipeline
        start_time = time.time()
        sparse_results = sparse_search(query, top_k * rescore_multiplier * 2)
        sparse_search_time = time.time() - start_time
        
        # 3. Apply selected fusion method
        start_time = time.time()
        if fusion_method == "rrf":
            fusion_results = reciprocal_rank_fusion(dense_results, sparse_results, k=rrf_k)
            fusion_method_name = f"RRF (k={rrf_k})"
        elif fusion_method == "nsf":
            fusion_results = normalized_score_fusion(dense_results, sparse_results, 
                                                     dense_weight=dense_weight, sparse_weight=sparse_weight)
            fusion_method_name = f"NSF Z-score (dense={dense_weight:.1f}, sparse={sparse_weight:.1f})"
        else:
            fusion_results = dense_results  # fallback
            fusion_method_name = "Dense uniquement (fallback)"
        
        fusion_time = time.time() - start_time
        
        # Use fusion results
        final_results = fusion_results[:top_k * rescore_multiplier]
        final_doc_ids = [doc_id for doc_id, _ in final_results]
        final_scores = [score for _, score in final_results]
        
        timing_info.update({
            # TODO: also report the time needed to encode the query with the sparse model
            # TODO: quantize the sparse embeddings as is done for the dense ones?
            "Temps pour la recherche sparse": f"{sparse_search_time:.4f} s",
            "Temps pour la fusion": f"{fusion_time:.4f} s",
        })
        timing_info["Méthode de fusion utilisée"] = fusion_method_name
    else:
        # Use only dense results
        final_doc_ids = [doc_id for doc_id, _ in dense_results[:top_k * rescore_multiplier]]
        final_scores = [score for _, score in dense_results[:top_k * rescore_multiplier]]
        timing_info["Méthode de fusion utilisée"] = "Dense uniquement"
    
    # 4. Prepare final results
    start_time = time.time()
    try:
        top_k_titles, top_k_texts = zip(*[(wikipedia_dataset[int(idx)]["title"], wikipedia_dataset[int(idx)]["text"]) 
                                             for idx in final_doc_ids[:top_k]])
        
        # Create DataFrame with results
        df = pd.DataFrame({
            "Score_paragraphe": [round(float(score), 4) for score in final_scores[:top_k]], 
            "Titre": top_k_titles, 
            "Texte": top_k_texts
        })
        
        # Calculate article scores (sum of paragraph scores)
        score_sum = df.groupby('Titre')['Score_paragraphe'].sum().reset_index()
        df = pd.merge(df, score_sum, on='Titre', how='left')
        df.rename(columns={'Score_paragraphe_y': 'Score_article', 'Score_paragraphe_x': 'Score_paragraphe'}, inplace=True)
        df = df[["Score_article", "Score_paragraphe", "Titre", "Texte"]]
        df = df.sort_values('Score_article', ascending=False)
        
    except Exception as e:
        print(f"Error creating results DataFrame: {e}")
        df = pd.DataFrame({"Error": [f"No results found or error processing results: {e}"]})
    
    sort_time = time.time() - start_time
    total_time = time.time() - total_start_time
    
    timing_info.update({
        "Temps pour afficher les résultats": f"{sort_time:.4f} s",
        "Temps total": f"{total_time:.4f} s",
    })

    return df, timing_info
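
# Example usage (illustrative): results_df, timings = search("château de Versailles", top_k=10, fusion_method="rrf")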


with gr.Blocks(title="Requêter Wikipedia avec Fusion Hybride 🔍") as demo:
    
    gr.Markdown(
        """
## Requêter Wikipedia en temps réel 🔍
Ce démonstrateur permet de requêter un corpus composé des 250K paragraphes les plus consultés du Wikipédia francophone.    
Les résultats sont renvoyés en temps réel via un pipeline tournant sur un CPU 🚀  
Nous nous sommes grandement inspirés du Space [quantized-retrieval](https://huggingface.co/spaces/sentence-transformers/quantized-retrieval) conçu par [Tom Aarsen](https://huggingface.co/tomaarsen) 🤗    
Si vous voulez en savoir plus sur le processus complet derrière ce démonstrateur, n'hésitez pas à déplier les liens ci-dessous.

<details><summary>1. Détails sur les données</summary>
Le corpus utilisé correspond aux 250 000 premières lignes du jeu de données <a href="https://hf.co/datasets/Cohere/wikipedia-22-12-fr-embeddings"><i>wikipedia-22-12-fr-embeddings</i></a> mis en ligne par Cohere.  
Comme son nom l'indique, il s'agit d'un jeu de données datant de décembre 2022. Cette information est à prendre en compte lorsque vous effectuez votre requête.  
De même, il s'agit ici d'un sous-ensemble du jeu de données total, à savoir les 250 000 paragraphes les plus consultés à cette date-là.  
Ainsi, si vous effectuez une recherche pointue sur un sujet peu consulté, ce démonstrateur ne renverra probablement rien de pertinent.  
À noter également que Cohere a effectué un prétraitement sur les données, ce qui a conduit à la suppression de dates par exemple.  
Ce jeu de données n'est donc pas optimal. L'idée était de pouvoir proposer quelque chose en peu de temps. 
Dans un deuxième temps, ce démonstrateur sera étendu à l'ensemble du jeu de données <i>wikipedia-22-12-fr-embeddings</i> (soit 13M de paragraphes).
Il n'est pas exclu d'utiliser ensuite une version plus récente de Wikipédia (on peut penser par exemple à <a href="https://hf.co/datasets/wikimedia/wikipedia"><i>wikimedia/wikipedia</i></a>).
</details>

<details><summary>2. Détails sur le pipeline</summary>
À écrire quand ça sera terminé.
</details>
"""
    )
    
    with gr.Row():
        with gr.Column(scale=75):
            query = gr.Textbox(
                label="Requêter le Wikipédia francophone",
                placeholder="Saisissez une requête pour rechercher des textes pertinents dans Wikipédia.",
            )
        with gr.Column(scale=25):
            use_approx = gr.Radio(
                choices=[("Exacte", False), ("Approximative", True)],
                value=True,
                label="Type de recherche dense",
            )

    with gr.Row():
        with gr.Column(scale=2):
            top_k = gr.Slider(
                minimum=3,
                maximum=40,
                step=1,
                value=15,
                label="Nombre de documents à retourner",
            )
        with gr.Column(scale=2):
            rescore_multiplier = gr.Slider(
                minimum=1,
                maximum=10,
                step=1,
                value=1,
                label="Coefficient de rescorage",
                info="Augmente le nombre de candidats avant fusion",
            )
    
    with gr.Row():
        with gr.Column(scale=3):
            fusion_method = gr.Radio(
                choices=[
                    ("Dense uniquement", "dense_only"),
                    ("Fusion RRF", "rrf"),  
                    ("Fusion NSF Z-score", "nsf")
                ],
                value="rrf",
                label="Méthode de fusion",
                info="Choisissez comment combiner les résultats des modèles dense et sparse"
            )
        with gr.Column(scale=2):
            rrf_k = gr.Slider(
                minimum=1,
                maximum=200,
                step=1,
                value=60,
                label="Paramètre k pour RRF",
                info="Plus k est élevé, moins les rangs ont d'importance",
                visible=True
            )
    
    with gr.Row():
        with gr.Column(scale=2):
            dense_weight = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                step=0.1,
                value=0.5,
                label="Poids Dense (NSF)",
                info="Importance des résultats du modèle dense",
                visible=False
            )
        with gr.Column(scale=2):
            sparse_weight = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                step=0.1,
                value=0.5,
                label="Poids Sparse (NSF)",
                info="Importance des résultats du modèle sparse",
                visible=False
            )
    
    # Show or hide the fusion parameters depending on the selected fusion method
    fusion_method.change(
        fn=lambda method: (
            gr.update(visible=(method == "rrf")),  # rrf_k
            gr.update(visible=(method == "nsf")),  # dense_weight  
            gr.update(visible=(method == "nsf"))    # sparse_weight
        ),
        inputs=[fusion_method],
        outputs=[rrf_k, dense_weight, sparse_weight]
    )

    search_button = gr.Button(value="Rechercher", variant="primary")

    output = gr.Dataframe(headers=["Score_article", "Score_paragraphe", "Titre", "Texte"], datatype="markdown")
    json = gr.JSON(label="Informations de performance")

    query.submit(search, inputs=[query, top_k, rescore_multiplier, use_approx, fusion_method, rrf_k, dense_weight, sparse_weight], outputs=[output, json])
    search_button.click(search, inputs=[query, top_k, rescore_multiplier, use_approx, fusion_method, rrf_k, dense_weight, sparse_weight], outputs=[output, json])
    
demo.queue()
demo.launch()