om4r932 committed · Commit f7db7af · 1 Parent(s): 02153e0

First version

Files changed (6)
  1. Dockerfile +17 -0
  2. app.py +181 -0
  3. classes.py +840 -0
  4. index.html +367 -0
  5. requirements.txt +10 -0
  6. schemas.py +6 -0
Dockerfile ADDED
@@ -0,0 +1,17 @@
+ FROM python:3.11.3
+
+ RUN apt-get update && \
+     apt-get install -y libreoffice libreoffice-writer libreoffice-calc libreoffice-impress && \
+     apt-get clean && rm -rf /var/lib/apt/lists/*
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ WORKDIR /app
+
+ COPY --chown=user ./requirements.txt requirements.txt
+ RUN pip install --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host files.pythonhosted.org --no-cache-dir --upgrade -r requirements.txt
+
+ COPY --chown=user . /app
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,181 @@
+ import requests, re, threading, time, warnings
+ from dotenv import load_dotenv
+ from fastapi import FastAPI, Request, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import FileResponse, StreamingResponse
+ from bs4 import BeautifulSoup
+
+ from huggingface_hub import configure_http_backend
+
+ from schemas import *
+ from classes import *
+
+ def backend_factory() -> requests.Session:
+     # Hugging Face Hub HTTP backend with TLS verification disabled
+     session = requests.Session()
+     session.verify = False
+     return session
+
+ configure_http_backend(backend_factory=backend_factory)
+ warnings.filterwarnings("ignore")
+ load_dotenv()
+
+ meetings_mapping = {
+     "SA": [
+         "TSG_SA",
+         "WG1_Serv",
+         "WG2_Arch",
+         "WG3_Security",
+         "WG4_CODEC",
+         "WG5_TM",
+         "WG6_MissionCritical"
+     ],
+     "CT": [
+         "TSG_CT",
+         "WG1_mm-cc-sm_ex-CN1",
+         "WG2_capability_ex-T2",
+         "WG3_interworking_ex-CN3",
+         "WG4_protocollars_ex-CN4",
+         "WG5_osa_ex-CN5",
+         "WG6_Smartcard_Ex-T3"
+     ],
+     "RAN": [
+         "TSG_RAN",
+         "WG1_RL1",
+         "WG2_RL2",
+         "WG3_Iu",
+         "WG4_Radio",
+         "WG5_Test_ex-T1",
+         "WG6_legacyRAN"
+     ]
+ }
+
+ tdoc_indexer = TDocIndexer()
+ spec_3gpp_indexer = Spec3GPPIndexer()
+ spec_etsi_indexer = SpecETSIIndexer()
+
+ app = FastAPI()
+ app.add_middleware(CORSMiddleware, allow_credentials=True, allow_headers=["*"], allow_origins=["*"])
+
+ @app.get('/')
+ def main():
+     return FileResponse("index.html")
+
+ def get_folder_name(working_group: str):
+     if working_group.endswith("P"):
+         if working_group.startswith("S"):
+             return ("SA", 0)
+         if working_group.startswith("C"):
+             return ("CT", 0)
+         if working_group.startswith("R"):
+             return ("RAN", 0)
+     m = re.match(r"([A-Z]+)(\d+)", working_group)
+     if m:
+         code, num = m.groups()
+         return (code, int(num))
+     else:
+         raise ValueError("Unexpected working group format")
+
+ @app.get("/get_meetings/{working_group}")
+ def get_meetings(working_group: str):
+     category, wg_number = get_folder_name(working_group)
+     folder = meetings_mapping[category][wg_number]
+     url = f"https://www.3gpp.org/ftp/{meetings_mapping[category][0]}/{folder}"
+     response = requests.get(url, verify=False)
+     soup = BeautifulSoup(response.text, "html.parser")
+     return {"url": url, "meetings": [item.get_text() for item in soup.select("tr td a") if item.get_text().startswith("TSG") or item.get_text().startswith("CT")]}
+
+ @app.post("/index_tdocs/working_group")
+ def index_tdocs_wg_progress(req: IndexTDoc):
+     if not req.wg:
+         raise HTTPException(status_code=400, detail="Working Group not defined!")
+     category, wg_number = get_folder_name(req.wg)
+     folder = meetings_mapping[category][wg_number]
+     url = f"https://www.3gpp.org/ftp/{meetings_mapping[category][0]}"
+
+     def generate_events():
+         tdoc_indexer.processed_count = 0  # Reset progress
+         tdoc_indexer.total_count = 0
+         # Index in a background thread so progress can be streamed while it runs
+         t = threading.Thread(target=tdoc_indexer.process_workgroup, args=(folder, url))
+         t.start()
+         while t.is_alive() or tdoc_indexer.processed_count < tdoc_indexer.total_count:
+             yield f"data: {tdoc_indexer.processed_count}/{tdoc_indexer.total_count}\n\n"
+             time.sleep(0.2)
+         # Report the end of the indexing run
+         yield f"data: {tdoc_indexer.total_count}/{tdoc_indexer.total_count}\n\n"
+
+     return StreamingResponse(generate_events(), media_type="text/event-stream")
+
+ @app.post("/index_tdocs/meeting")
+ def index_tdocs_meeting_progress(req: IndexTDoc):
+     if not req.wg:
+         raise HTTPException(status_code=400, detail="Working Group not defined!")
+     if not req.meetings:
+         raise HTTPException(status_code=400, detail="Meetings not defined!")
+
+     category, wg_number = get_folder_name(req.wg)
+     folder = meetings_mapping[category][wg_number]
+     url = f"https://www.3gpp.org/ftp/{meetings_mapping[category][0]}/{folder}"
+
+     def generate_events():
+         tdoc_indexer.processed_count = 0
+         tdoc_indexer.total_count = len(req.meetings)
+         for i, meet in enumerate(req.meetings):
+             tdoc_indexer.process_meeting(meet, url)
+             yield f"data: {i+1}/{tdoc_indexer.total_count}\n\n"
+         tdoc_indexer.save_indexer()
+
+     return StreamingResponse(generate_events(), media_type="text/event-stream")
+
+ @app.post("/index_tdocs/all")
+ def index_all_tdocs_progress():
+     def generate_events():
+         tdoc_indexer.processed_count = 0
+         tdoc_indexer.total_count = 0
+         # Start the indexing run in the background (the method updates the counters)
+         t = threading.Thread(target=tdoc_indexer.index_all_tdocs)
+         t.start()
+         while t.is_alive() or tdoc_indexer.processed_count < tdoc_indexer.total_count:
+             yield f"data: {tdoc_indexer.processed_count}/{tdoc_indexer.total_count}\n\n"
+             time.sleep(0.2)
+         yield f"data: {tdoc_indexer.total_count}/{tdoc_indexer.total_count}\n\n"
+     return StreamingResponse(generate_events(), media_type="text/event-stream")
+
+ @app.post("/index_specs/3gpp")
+ def index_3gpp_specs_progress():
+     def generate_events():
+         spec_3gpp_indexer.processed_count = 0
+         spec_3gpp_indexer.total_count = 0
+
+         def worker():
+             spec_3gpp_indexer.run()
+             spec_3gpp_indexer.save()
+             spec_3gpp_indexer.create_bm25_index()
+
+         t = threading.Thread(target=worker)
+         t.start()
+         while t.is_alive() or spec_3gpp_indexer.processed_count < spec_3gpp_indexer.total_count:
+             yield f"data: {spec_3gpp_indexer.processed_count}/{spec_3gpp_indexer.total_count}\n\n"
+             time.sleep(0.5)
+         yield f"data: {spec_3gpp_indexer.total_count}/{spec_3gpp_indexer.total_count}\n\n"
+     return StreamingResponse(generate_events(), media_type="text/event-stream")
+
+ @app.post("/index_specs/etsi")
+ def index_etsi_specs_progress():
+     def generate_events():
+         spec_etsi_indexer.processed_count = 0
+         spec_etsi_indexer.total_count = 0
+
+         def worker():
+             spec_etsi_indexer.run()
+             spec_etsi_indexer.save()
+             spec_etsi_indexer.create_bm25_index()
+
+         t = threading.Thread(target=worker)
+         t.start()
+         while t.is_alive() or spec_etsi_indexer.processed_count < spec_etsi_indexer.total_count:
+             yield f"data: {spec_etsi_indexer.processed_count}/{spec_etsi_indexer.total_count}\n\n"
+             time.sleep(0.5)
+         yield f"data: {spec_etsi_indexer.total_count}/{spec_etsi_indexer.total_count}\n\n"
+     return StreamingResponse(generate_events(), media_type="text/event-stream")
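
Each of the indexing endpoints above replies with a text/event-stream of "data: processed/total" events. A minimal client sketch for following such a stream, assuming the app runs on localhost:7860 (the port exposed by the Dockerfile); the meeting folder name is hypothetical:

import requests

def follow_progress(path, payload=None):
    # Read the SSE stream line by line and print each progress event as it arrives
    with requests.post(f"http://localhost:7860{path}", json=payload, stream=True) as resp:
        for line in resp.iter_lines(decode_unicode=True):
            if line and line.startswith("data: "):
                print(line[len("data: "):])

# Example: re-index one (hypothetical) SA1 meeting folder
follow_progress("/index_tdocs/meeting", {"wg": "SA1", "meetings": ["TSGS1_100"]})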
classes.py ADDED
@@ -0,0 +1,840 @@
+ import shutil
+ import bm25s
+ from bm25s.hf import BM25HF
+ import threading, re, time, concurrent.futures, requests, os, hashlib, traceback, io, zipfile, subprocess, tempfile, json, fitz
+ import pandas as pd
+ import numpy as np
+
+ from bs4 import BeautifulSoup
+ from datasets import load_dataset, Dataset
+ from datasets.data_files import EmptyDatasetError
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ class TDocIndexer:
+     def __init__(self, max_workers=33):
+         self.indexer_length = 0
+         self.dataset = "OrganizedProgrammers/3GPPTDocLocation"
+
+         self.indexer = self.load_indexer()
+         self.main_ftp_url = "https://3gpp.org/ftp"
+         self.valid_doc_pattern = re.compile(r'^(S[1-6P]|C[1-6P]|R[1-6P])-\d+', flags=re.IGNORECASE)
+         self.max_workers = max_workers
+
+         self.print_lock = threading.Lock()
+         self.indexer_lock = threading.Lock()
+
+         self.total_indexed = 0
+         self.processed_count = 0
+         self.total_count = 0
+
+     def load_indexer(self):
+         self.indexer_length = 0
+         all_docs = {}
+         tdoc_locations = load_dataset(self.dataset)
+         tdoc_locations = tdoc_locations["train"].to_list()
+         for doc in tdoc_locations:
+             self.indexer_length += 1
+             all_docs[doc["doc_id"]] = doc["url"]
+
+         return all_docs
+
+     def save_indexer(self):
+         """Save the updated index"""
+         data = []
+         for doc_id, url in self.indexer.items():
+             data.append({"doc_id": doc_id, "url": url})
+
+         dataset = Dataset.from_list(data)
+         dataset.push_to_hub(self.dataset, token=os.environ["HF"])
+         self.indexer = self.load_indexer()
+
+     def get_docs_from_url(self, url):
+         try:
+             response = requests.get(url, verify=False, timeout=10)
+             soup = BeautifulSoup(response.text, "html.parser")
+             return [item.get_text() for item in soup.select("tr td a")]
+         except Exception as e:
+             with self.print_lock:
+                 print(f"Error while accessing {url}: {e}")
+             return []
+
+     def is_valid_document_pattern(self, filename):
+         return bool(self.valid_doc_pattern.match(filename))
+
+     def is_zip_file(self, filename):
+         return filename.lower().endswith('.zip')
+
+     def extract_doc_id(self, filename):
+         if self.is_valid_document_pattern(filename):
+             match = self.valid_doc_pattern.match(filename)
+             if match:
+                 # Return the full identifier (such as S1-12345)
+                 full_id = filename.split('.')[0]  # Drop the extension if present
+                 return full_id.split('_')[0]  # Drop any suffix after an underscore
+         return None
+
+     def process_zip_files(self, files_list, base_url, workshop=False):
+         """Scan a list of files and index the valid ZIP archives"""
+         indexed_count = 0
+
+         for file in files_list:
+             if file in ['./', '../', 'ZIP/', 'zip/']:
+                 continue
+
+             # Check that the file is a ZIP archive matching the document pattern
+             if self.is_zip_file(file) and (self.is_valid_document_pattern(file) or workshop):
+                 file_url = f"{base_url}/{file}"
+
+                 # Extract the document ID
+                 doc_id = self.extract_doc_id(file)
+                 if doc_id is None:
+                     doc_id = file.split('.')[0]
+                 if doc_id:
+                     # Skip the file if it is already indexed at the same URL
+                     with self.indexer_lock:
+                         if doc_id in self.indexer and self.indexer[doc_id] == file_url:
+                             continue
+
+                         # Add or update the index entry
+                         self.indexer[doc_id] = file_url
+                         indexed_count += 1
+                         self.total_indexed += 1
+
+         return indexed_count
+
+     def process_meeting(self, meeting, wg_url, workshop=False):
+         """Process a single meeting folder (runs inside the thread pool)"""
+         try:
+             if meeting in ['./', '../']:
+                 return 0
+
+             meeting_url = f"{wg_url}/{meeting}"
+
+             with self.print_lock:
+                 print(f"Checking meeting: {meeting}")
+
+             # Inspect the contents of the meeting folder
+             meeting_contents = self.get_docs_from_url(meeting_url)
+
+             key = None
+             if "docs" in [x.lower() for x in meeting_contents]:
+                 key = "docs"
+             elif "tdocs" in [x.lower() for x in meeting_contents]:
+                 key = "tdocs"
+             elif "tdoc" in [x.lower() for x in meeting_contents]:
+                 key = "tdoc"
+
+             if key is not None:
+                 docs_url = f"{meeting_url}/{key}"
+
+                 with self.print_lock:
+                     print(f"Checking the documents in {docs_url}")
+
+                 # List the files in the Docs folder
+                 docs_files = self.get_docs_from_url(docs_url)
+
+                 # 1. Index the ZIP files sitting directly in the Docs folder
+                 docs_indexed_count = self.process_zip_files(docs_files, docs_url, workshop)
+
+                 if docs_indexed_count > 0:
+                     with self.print_lock:
+                         print(f"{docs_indexed_count} files found")
+
+                 # 2. Check the ZIP subfolder if it exists
+                 if "zip" in [x.lower() for x in docs_files]:
+                     zip_url = f"{docs_url}/zip"
+
+                     with self.print_lock:
+                         print(f"Checking the ./zip folder: {zip_url}")
+
+                     # List the files in the ZIP subfolder
+                     zip_files = self.get_docs_from_url(zip_url)
+
+                     # Index the ZIP files in the ZIP subfolder
+                     zip_indexed_count = self.process_zip_files(zip_files, zip_url, workshop)
+
+                     if zip_indexed_count > 0:
+                         with self.print_lock:
+                             print(f"{zip_indexed_count} files found")
+
+             # Update the progress counter
+             with self.indexer_lock:
+                 self.processed_count += 1
+
+             # Display the progress
+             with self.print_lock:
+                 progress = (self.processed_count / self.total_count) * 100 if self.total_count > 0 else 0
+                 print(f"\rProgress: {self.processed_count}/{self.total_count} meetings processed ({progress:.1f}%)")
+
+             return 1  # Meeting processed successfully
+
+         except Exception as e:
+             with self.print_lock:
+                 print(f"\nError while processing meeting {meeting}: {str(e)}")
+             return 0
+
+     def process_workgroup(self, wg, main_url):
+         """Process a working group, indexing its meetings in parallel"""
+         if wg in ['./', '../']:
+             return
+
+         wg_url = f"{main_url}/{wg}"
+
+         with self.print_lock:
+             print(f"Checking working group: {wg}")
+
+         # Fetch the meeting folders
+         meeting_folders = self.get_docs_from_url(wg_url)
+
+         # Add them to the total counter
+         self.total_count += len([m for m in meeting_folders if m not in ['./', '../']])
+
+         # Use a ThreadPoolExecutor to process the meetings in parallel
+         with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+             futures = [executor.submit(self.process_meeting, meeting, wg_url)
+                        for meeting in meeting_folders if meeting not in ['./', '../']]
+
+             # Wait until every task has finished
+             concurrent.futures.wait(futures)
+
+     def index_all_tdocs(self):
+         """Index every ZIP document in the 3GPP FTP tree, using multithreading"""
+         print("Starting the full 3GPP TDoc indexing run")
+
+         start_time = time.time()
+         docs_count_before = self.indexer_length
+
+         # Main TSG groups
+         main_groups = ["tsg_sa", "tsg_ct", "tsg_ran"]  # Add others if needed
+
+         for main_tsg in main_groups:
+             print(f"Indexing {main_tsg.upper()}...")
+
+             main_url = f"{self.main_ftp_url}/{main_tsg}"
+
+             # Fetch the working groups
+             workgroups = self.get_docs_from_url(main_url)
+
+             # Process each working group sequentially
+             # (the meetings inside each one are processed in parallel)
+             for wg in workgroups:
+                 self.process_workgroup(wg, main_url)
+
+         docs_count_after = len(self.indexer)
+         new_docs_count = abs(docs_count_after - docs_count_before)
+
+         print(f"Indexing finished in {time.time() - start_time:.2f} seconds")
+         print(f"New ZIP documents indexed: {new_docs_count}")
+         print(f"Total documents in the index: {docs_count_after}")
+
+         return self.indexer
+
+     def index_all_workshops(self):
+         print("Starting the 3GPP workshop ZIP indexing run...")
+         start_time = time.time()
+         docs_count_before = len(self.indexer)
+
+         print("\nIndexing the 'workshop' folder")
+         main_url = f"{self.main_ftp_url}/workshop"
+
+         # Fetch the meeting folders
+         meeting_folders = self.get_docs_from_url(main_url)
+
+         # Add them to the total counter
+         self.total_count += len([m for m in meeting_folders if m not in ['./', '../']])
+
+         # Use a ThreadPoolExecutor to process the meetings in parallel
+         with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+             futures = [executor.submit(self.process_meeting, meeting, main_url, workshop=True)
+                        for meeting in meeting_folders if meeting not in ['./', '../']]
+             concurrent.futures.wait(futures)
+
+         docs_count_after = len(self.indexer)
+         new_docs_count = docs_count_after - docs_count_before
+
+         print(f"\nIndexing finished in {time.time() - start_time:.2f} seconds")
+         print(f"New ZIP documents indexed: {new_docs_count}")
+         print(f"Total documents in the index: {docs_count_after}")
+
+         return self.indexer
+
+ class Spec3GPPIndexer:
+     def __init__(self, max_workers=16):
+         self.spec_contents = load_dataset("OrganizedProgrammers/3GPPSpecContent")["train"].to_list()
+         self.documents_by_spec_num = self._make_doc_index(self.spec_contents)
+         self.indexed_specifications = {}
+         self.specifications_passed = set()
+         self.processed_count = 0
+         self.total_count = 0
+
+         self.DICT_LOCK = threading.Lock()
+         self.DOCUMENT_LOCK = threading.Lock()
+         self.STOP_EVENT = threading.Event()
+         self.max_workers = max_workers
+         self.LIBREOFFICE_SEMAPHORE = threading.Semaphore(self.max_workers)
+
+     def _make_doc_index(self, specs):
+         doc_index = {}
+         for section in specs:
+             if section["doc_id"] not in doc_index:
+                 doc_index[section["doc_id"]] = {"content": {section["section"]: section["content"]}, "hash": section["hash"]}
+             else:
+                 doc_index[section["doc_id"]]["content"][section["section"]] = section["content"]
+         return doc_index
+
+     @staticmethod
+     def version_to_code(version_str):
+         chars = "0123456789abcdefghijklmnopqrstuvwxyz"
+         parts = version_str.split('.')
+         if len(parts) != 3:
+             return None
+         try:
+             x, y, z = [int(p) for p in parts]
+         except ValueError:
+             return None
+         if x < 36 and y < 36 and z < 36:
+             return f"{chars[x]}{chars[y]}{chars[z]}"
+         else:
+             return f"{str(x).zfill(2)}{str(y).zfill(2)}{str(z).zfill(2)}"
+
+     @staticmethod
+     def hasher(specification, version_code):
+         return hashlib.md5(f"{specification}{version_code}".encode()).hexdigest()
+
+     @staticmethod
+     def get_scope(content):
+         for title, text in content.items():
+             if title.lower().endswith("scope"):
+                 return text
+         return ""
+
+     def get_text(self, specification, version_code):
+         if self.STOP_EVENT.is_set():
+             return []
+
+         doc_id = specification
+         series = doc_id.split(".")[0]
+         url = f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{doc_id}/{doc_id.replace('.', '')}-{version_code}.zip"
+
+         try:
+             response = requests.get(url, verify=False)
+             if response.status_code != 200:
+                 return []
+
+             zip_bytes = io.BytesIO(response.content)
+             with zipfile.ZipFile(zip_bytes) as zip_file:
+                 # Keep only the .doc and .docx files
+                 docx_files = [f for f in zip_file.namelist() if f.lower().endswith(('.doc', '.docx'))]
+                 if not docx_files:
+                     return []
+
+                 full_text = []
+
+                 for doc_file in docx_files:
+                     with tempfile.TemporaryDirectory() as tmpdir:
+                         extracted_path = os.path.join(tmpdir, os.path.basename(doc_file))
+                         with open(extracted_path, 'wb') as f:
+                             f.write(zip_file.read(doc_file))
+
+                         # Dedicated temporary LibreOffice profile
+                         profile_dir = tempfile.mkdtemp(prefix="libreoffice_profile_")
+
+                         try:
+                             with self.LIBREOFFICE_SEMAPHORE:
+                                 cmd = [
+                                     'soffice',
+                                     '--headless',
+                                     f'-env:UserInstallation=file://{profile_dir}',
+                                     '--convert-to', 'txt:Text',
+                                     '--outdir', tmpdir,
+                                     extracted_path
+                                 ]
+                                 subprocess.run(cmd, check=True, timeout=60*5, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+                                 txt_file = os.path.splitext(extracted_path)[0] + '.txt'
+                                 if os.path.exists(txt_file):
+                                     with open(txt_file, 'r', encoding='utf-8', errors='ignore') as ftxt:
+                                         full_text.extend(ftxt.readlines())
+                         finally:
+                             shutil.rmtree(profile_dir, ignore_errors=True)
+
+                 return full_text
+
+         except Exception as e:
+             print(f"Error getting text for {specification} v{version_code}: {e}")
+             return []
+
+     def get_spec_content(self, specification, version_code):
+         if self.STOP_EVENT.is_set():
+             return {}
+
+         text = self.get_text(specification, version_code)
+         if not text:
+             return {}
+
+         chapters = []
+         chapter_regex = re.compile(r"^(\d+[a-z]?(?:\.\d+)*)\t[A-Z0-9][\ \S]+[^\.]$")
+         for i, line in enumerate(text):
+             if chapter_regex.fullmatch(line):
+                 chapters.append((i, line))
+
+         document = {}
+         for i in range(len(chapters)):
+             start_index, chapter_title = chapters[i]
+             end_index = chapters[i+1][0] if i+1 < len(chapters) else len(text)
+             content_lines = text[start_index + 1:end_index]
+             document[chapter_title.replace("\t", " ")] = "\n".join(content_lines)
+
+         return document
+
+     def fetch_spec_table(self):
+         response = requests.get(
+             'https://www.3gpp.org/dynareport?code=status-report.htm',
+             headers={"User-Agent": 'Mozilla/5.0'},
+             verify=False
+         )
+         dfs = pd.read_html(io.StringIO(response.text))
+         for x in range(len(dfs)):
+             dfs[x] = dfs[x].replace({np.nan: None})
+         columns_needed = [0, 1, 2, 3, 4]
+         extracted_dfs = [df.iloc[:, columns_needed] for df in dfs]
+         columns = [x.replace("\xa0", "_") for x in extracted_dfs[0].columns]
+         specifications = []
+         for df in extracted_dfs:
+             for index, row in df.iterrows():
+                 doc = row.to_list()
+                 doc_dict = dict(zip(columns, doc))
+                 specifications.append(doc_dict)
+         return specifications
+
+     def process_specification(self, spec):
+         if self.STOP_EVENT.is_set():
+             return
+         try:
+             doc_id = str(spec['spec_num'])
+             version_code = self.version_to_code(str(spec['vers']))
+             if not version_code:
+                 with self.DICT_LOCK:
+                     self.processed_count += 1
+                 return
+
+             document = None
+             already_indexed = False
+             with self.DOCUMENT_LOCK:
+                 doc_in_cache = doc_id in self.documents_by_spec_num and \
+                     self.documents_by_spec_num[doc_id]["hash"] == self.hasher(doc_id, version_code)
+
+             if doc_in_cache and doc_id not in self.specifications_passed:
+                 document = self.documents_by_spec_num[doc_id]
+                 self.specifications_passed.add(doc_id)
+                 already_indexed = True
+             elif doc_id not in self.specifications_passed:
+                 doc_content = self.get_spec_content(doc_id, version_code)
+                 if doc_content:
+                     document = {"content": doc_content, "hash": self.hasher(doc_id, version_code)}
+                     with self.DOCUMENT_LOCK:
+                         self.documents_by_spec_num[doc_id] = document
+                     self.specifications_passed.add(doc_id)
+                     already_indexed = False
+
+             if document:
+                 url = f"https://www.3gpp.org/ftp/Specs/archive/{doc_id.split('.')[0]}_series/{doc_id}/{doc_id.replace('.', '')}-{version_code}.zip"
+                 metadata = {
+                     "id": doc_id,
+                     "title": spec.get("title", ""),
+                     "type": spec.get("type", ""),
+                     "version": str(spec.get("vers", "")),
+                     "working_group": spec.get("WG", ""),
+                     "url": url,
+                     "scope": self.get_scope(document["content"])
+                 }
+                 key = f"{doc_id}+-+{spec.get('title', '')}+-+{spec.get('type', '')}+-+{spec.get('vers', '')}+-+{spec.get('WG', '')}"
+                 with self.DICT_LOCK:
+                     self.indexed_specifications[key] = metadata
+
+             with self.DICT_LOCK:
+                 self.processed_count += 1
+                 status = "already indexed" if already_indexed else "indexed now"
+                 print(f"Spec {doc_id} ({spec.get('title', '')}): {status} - Progress {self.processed_count}/{self.total_count}")
+
+         except Exception as e:
+             traceback.print_exc()
+             print(f"Error processing spec {spec.get('spec_num')} v{spec.get('vers')}: {e}")
+             with self.DICT_LOCK:
+                 self.processed_count += 1
+                 print(f"Progress: {self.processed_count}/{self.total_count} specs processed")
+
+     def get_document(self, spec_id: str, spec_title: str):
+         text = [f"{spec_id} - {spec_title}\n"]
+         for section in self.spec_contents:
+             if spec_id == section["doc_id"]:
+                 text.extend([f"{section['section']}\n\n{section['content']}"])
+         return text
+
+     def create_bm25_index(self):
+         dataset_metadata = self.indexed_specifications.values()
+         unique_specs = set()
+         corpus_json = []
+
+         for specification in dataset_metadata:
+             if specification['id'] in unique_specs: continue
+             for section in self.spec_contents:
+                 if specification['id'] == section['doc_id']:
+                     corpus_json.append({"text": f"{section['section']}\n{section['content']}", "metadata": {
+                         "id": specification['id'],
+                         "title": specification['title'],
+                         "section_title": section['section'],
+                         "version": specification['version'],
+                         "type": specification['type'],
+                         "working_group": specification['working_group'],
+                         "url": specification['url'],
+                         "scope": specification['scope']
+                     }})
+             unique_specs.add(specification['id'])  # Mark the spec as seen so duplicate metadata entries are skipped
+
+         corpus_text = [doc["text"] for doc in corpus_json]
+         corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")
+
+         print("Indexing BM25")
+         retriever = BM25HF(corpus=corpus_json)
+         retriever.index(corpus_tokens)
+
+         retriever.save_to_hub("OrganizedProgrammers/3GPPBM25IndexSections", token=os.environ.get("HF"))
+
+         unique_specs = set()
+         corpus_json = []
+
+         for specification in dataset_metadata:
+             if specification['id'] in unique_specs: continue
+             text_list = self.get_document(specification['id'], specification['title'])
+             text = "\n".join(text_list)
+             if len(text_list) == 1: continue
+             corpus_json.append({"text": text, "metadata": specification})
+             unique_specs.add(specification['id'])
+
+         corpus_text = [doc["text"] for doc in corpus_json]
+         corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")
+
+         print("Indexing BM25")
+         retriever = BM25HF(corpus=corpus_json)
+         retriever.index(corpus_tokens)
+
+         retriever.save_to_hub("OrganizedProgrammers/3GPPBM25IndexSingle", token=os.environ.get("HF"))
+
+     def run(self):
+         print("Fetching specification tables from 3GPP...")
+         specifications = self.fetch_spec_table()
+         self.total_count = len(specifications)
+         print(f"Processing {self.total_count} specs with {self.max_workers} threads...")
+         with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+             futures = [executor.submit(self.process_specification, spec) for spec in specifications]
+             for f in concurrent.futures.as_completed(futures):
+                 if self.STOP_EVENT.is_set():
+                     break
+         print("All specs processed.")
+
+     # Save (identical to the original script)
+     def save(self):
+         print("Saving indexed data...")
+         flat_metadata = [metadata for metadata in self.indexed_specifications.values()]
+         flat_docs = []
+         print("Flattening doc contents")
+         for doc_id, data in self.documents_by_spec_num.items():
+             for title, content in data["content"].items():
+                 flat_docs.append({"hash": data["hash"], "doc_id": doc_id, "section": title, "content": content})
+         print("Creating datasets ...")
+         push_spec_content = Dataset.from_list(flat_docs)
+         push_spec_metadata = Dataset.from_list(flat_metadata)
+         # Token handling assumed set in environment
+         print("Pushing ...")
+         push_spec_content.push_to_hub("OrganizedProgrammers/3GPPSpecContent", token=os.environ["HF"])
+         push_spec_metadata.push_to_hub("OrganizedProgrammers/3GPPSpecMetadata", token=os.environ["HF"])
+
+         self.spec_contents = load_dataset("OrganizedProgrammers/3GPPSpecContent")["train"].to_list()
+         self.documents_by_spec_num = self._make_doc_index(self.spec_contents)
+         print("Save finished.")
+
+ class SpecETSIIndexer:
+     def __init__(self, max_workers=16):
+         self.session = requests.Session()
+         self.session.verify = False
+
+         self.spec_contents = load_dataset("OrganizedProgrammers/ETSISpecContent")["train"].to_list()
+         self.documents_by_spec_num = self._make_doc_index(self.spec_contents)
+         self.indexed_specifications = {}
+         self.specifications_passed = set()
+         self.processed_count = 0
+         self.total_count = 0
+
+         self.DICT_LOCK = threading.Lock()
+         self.DOCUMENT_LOCK = threading.Lock()
+         self.STOP_EVENT = threading.Event()
+         self.max_workers = max_workers
+
+         self.df = self._fetch_spec_table()
+
+     def _make_doc_index(self, specs):
+         doc_index = {}
+         for section in specs:
+             if section["doc_id"] not in doc_index:
+                 doc_index[section["doc_id"]] = {"content": {section["section"]: section["content"]}, "hash": section["hash"]}
+             else:
+                 doc_index[section["doc_id"]]["content"][section["section"]] = section["content"]
+         return doc_index
+
+     def _fetch_spec_table(self):
+         # Log in, then fetch the TS/TR CSV metadata
+         print("Logging in to the ETSI portal...")
+         self.session.post(
+             "https://portal.etsi.org/ETSIPages/LoginEOL.ashx",
+             verify=False,
+             headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ..."},
+             data=json.dumps({"username": os.environ.get("EOL_USER"), "password": os.environ.get("EOL_PASSWORD")}),
+         )
+
+         print("Fetching the TS/TR metadata...")
+         url_ts = "https://www.etsi.org/?option=com_standardssearch&view=data&format=csv&includeScope=1&page=1&search=&title=1&etsiNumber=1&content=0&version=0&onApproval=0&published=1&withdrawn=0&historical=0&isCurrent=1&superseded=0&harmonized=0&keyword=&TB=&stdType=TS&frequency=&mandate=&collection=&sort=1"
+         url_tr = url_ts.replace("stdType=TS", "stdType=TR")
+         data_ts = self.session.get(url_ts, verify=False).content
+         data_tr = self.session.get(url_tr, verify=False).content
+         df_ts = pd.read_csv(io.StringIO(data_ts.decode('utf-8')), sep=";", skiprows=1, index_col=False)
+         df_tr = pd.read_csv(io.StringIO(data_tr.decode('utf-8')), sep=";", skiprows=1, index_col=False)
+
+         backup_ts = df_ts["ETSI deliverable"]
+         backup_tr = df_tr["ETSI deliverable"]
+         df_ts["ETSI deliverable"] = df_ts["ETSI deliverable"].str.extract(r"\s*ETSI TS (\d+ \d+(?:-\d+(?:-\d+)?)?)")
+         df_tr["ETSI deliverable"] = df_tr["ETSI deliverable"].str.extract(r"\s*ETSI TR (\d+ \d+(?:-\d+(?:-\d+)?)?)")
+         version1 = backup_ts.str.extract(r"\s*ETSI TS \d+ \d+(?:-\d+(?:-\d+)?)? V(\d+\.\d+\.\d+)")
+         version2 = backup_tr.str.extract(r"\s*ETSI TR \d+ \d+(?:-\d+(?:-\d+)?)? V(\d+\.\d+\.\d+)")
+         df_ts["Version"] = version1[0]
+         df_tr["Version"] = version2[0]
+
+         def ver_tuple(v):
+             return tuple(map(int, v.split(".")))
+         df_ts["temp"] = df_ts["Version"].apply(ver_tuple)
+         df_tr["temp"] = df_tr["Version"].apply(ver_tuple)
+         df_ts["Type"] = "TS"
+         df_tr["Type"] = "TR"
+         df = pd.concat([df_ts, df_tr])
+         unique_df = df.loc[df.groupby("ETSI deliverable")["temp"].idxmax()]
+         unique_df = unique_df.drop(columns="temp")
+         unique_df = unique_df[(~unique_df["title"].str.contains("3GPP", case=True, na=False))]
+         df = df.drop(columns="temp")
+         df = df[(~df["title"].str.contains("3GPP", case=True, na=False))]
+         return df
+
+     @staticmethod
+     def hasher(specification: str, version: str):
+         return hashlib.md5(f"{specification}{version}".encode()).hexdigest()
+
+     @staticmethod
+     def get_scope(content):
+         for title, text in content.items():
+             if title.lower().endswith("scope"):
+                 return text
+         return ""
+
+     def get_document(self, spec_id: str, spec_title: str):
+         text = [f"{spec_id} - {spec_title}\n"]
+         for section in self.spec_contents:
+             if spec_id == section["doc_id"]:
+                 text.extend([f"{section['section']}\n\n{section['content']}"])
+         return text
+
+     def get_text(self, specification: str):
+         if self.STOP_EVENT.is_set():
+             return None, []
+         print(f"\n[INFO] Trying to fetch specification {specification}", flush=True)
+         try:
+             # Find the row holding the right PDF link
+             row = self.df[self.df["ETSI deliverable"] == specification]
+             if row.empty:
+                 print(f"[WARN] Specification {specification} is missing from the table")
+                 return None, []
+
+             pdf_link = row.iloc[0]["PDF link"]
+             response = self.session.get(
+                 pdf_link,
+                 headers={"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...'}
+             )
+             if response.status_code != 200:
+                 print(f"[ERROR] Failed to download the PDF for {specification}.")
+                 return None, []
+             pdf = fitz.open(stream=response.content, filetype="pdf")
+             return pdf, pdf.get_toc()
+         except Exception as e:
+             print(f"[ERROR] get_text failed for {specification}: {e}", flush=True)
+             return None, []
+
+     def get_spec_content(self, specification: str):
+         def extract_sections(text, titles):
+             sections = {}
+             sorted_titles = sorted(titles, key=lambda t: text.find(t))
+             for i, title in enumerate(sorted_titles):
+                 start = text.find(title)
+                 if i + 1 < len(sorted_titles):
+                     end = text.find(sorted_titles[i + 1])
+                     sections[re.sub(r"\s+", " ", title)] = re.sub(r"\s+", " ", text[start:end].replace(title, "").strip().rstrip())
+                 else:
+                     sections[re.sub(r"\s+", " ", title)] = re.sub(r"\s+", " ", text[start:].replace(title, "").strip().rstrip())
+             return sections
+
+         if self.STOP_EVENT.is_set():
+             return {}
+         print(f"[INFO] Extracting the content of {specification}", flush=True)
+         pdf, doc_toc = self.get_text(specification)
+         text = []
+         if not pdf or not doc_toc:
+             print("[ERROR] No text or table of contents found!")
+             return {}
+         # Start from the first page actually referenced by the table of contents
+         first_page = 0
+         for level, title, page in doc_toc:
+             first_page = page - 1
+             break
+         for page in pdf[first_page:]:
+             text.append("\n".join([line.strip() for line in page.get_text().splitlines()]))
+         text = "\n".join(text)
+         if not text or not doc_toc or self.STOP_EVENT.is_set():
+             print("[ERROR] No text/table of contents retrieved!")
+             return {}
+         titles = []
+         for level, title, page in doc_toc:
+             if self.STOP_EVENT.is_set():
+                 return {}
+             if title and title[0].isnumeric() and '\n'.join(title.strip().split(" ", 1)) in text:
+                 titles.append('\n'.join(title.strip().split(" ", 1)))
+         return extract_sections(text, titles)
+
+     def process_specification(self, spec):
+         if self.STOP_EVENT.is_set():
+             return
+         try:
+             version = spec.get('Version')
+             doc_id = str(spec.get("ETSI deliverable"))
+             if not version:
+                 with self.DICT_LOCK:
+                     self.processed_count += 1
+                 return
+             document = None
+             already_indexed = False
+
+             with self.DOCUMENT_LOCK:
+                 if (doc_id in self.documents_by_spec_num
+                         and self.documents_by_spec_num[doc_id]["hash"] == self.hasher(doc_id, version)
+                         and doc_id not in self.specifications_passed):
+                     document = self.documents_by_spec_num[doc_id]
+                     self.specifications_passed.add(doc_id)
+                     already_indexed = True
+                 elif doc_id in self.specifications_passed:
+                     document = self.documents_by_spec_num[doc_id]
+                     already_indexed = True
+                 else:
+                     document_content = self.get_spec_content(doc_id)
+                     if document_content:
+                         self.documents_by_spec_num[doc_id] = {"content": document_content, "hash": self.hasher(doc_id, version)}
+                         document = {"content": document_content, "hash": self.hasher(doc_id, version)}
+                         self.specifications_passed.add(doc_id)
+                         already_indexed = False
+
+             if document:
+                 string_key = f"{doc_id}+-+{spec['title']}+-+{spec['Type']}+-+{spec['Version']}"
+                 metadata = {
+                     "id": str(doc_id),
+                     "title": spec["title"],
+                     "type": spec["Type"],
+                     "version": version,
+                     "url": spec["PDF link"],
+                     "scope": "" if not document else self.get_scope(document["content"])
+                 }
+                 with self.DICT_LOCK:
+                     self.indexed_specifications[string_key] = metadata
+
+             with self.DICT_LOCK:
+                 self.processed_count += 1
+                 status = "already indexed" if already_indexed else "indexed now"
+                 print(f"Spec {doc_id} ({spec.get('title', '')}): {status} - Progress {self.processed_count}/{self.total_count}")
+
+         except Exception as e:
+             traceback.print_exc()
+             print(f"\n[ERROR] Failed to process {spec.get('ETSI deliverable')} {spec.get('Version')}: {e}", flush=True)
+             with self.DICT_LOCK:
+                 self.processed_count += 1
+                 print(f"Progress: {self.processed_count}/{self.total_count} specs processed")
+
+     def run(self):
+         print("Starting the ETSI indexing run...")
+         specifications = self.df.to_dict(orient="records")
+         self.total_count = len(specifications)
+         print(f"Processing {self.total_count} specs with {self.max_workers} threads...\n")
+
+         with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+             futures = [executor.submit(self.process_specification, spec) for spec in specifications]
+             for f in concurrent.futures.as_completed(futures):
+                 if self.STOP_EVENT.is_set():
+                     break
+
+         print(f"\nAll {self.processed_count}/{self.total_count} specs processed.")
+
+     def save(self):
+         print("\nSaving...", flush=True)
+         flat_metadata = [metadata for metadata in self.indexed_specifications.values()]
+         flat_docs = []
+         for doc_id, data in self.documents_by_spec_num.items():
+             for title, content in data["content"].items():
+                 flat_docs.append({"hash": data["hash"], "doc_id": doc_id, "section": title, "content": content})
+         push_spec_content = Dataset.from_list(flat_docs)
+         push_spec_metadata = Dataset.from_list(flat_metadata)
+         push_spec_content.push_to_hub("OrganizedProgrammers/ETSISpecContent", token=os.environ["HF"])
+         push_spec_metadata.push_to_hub("OrganizedProgrammers/ETSISpecMetadata", token=os.environ["HF"])
+
+         self.spec_contents = load_dataset("OrganizedProgrammers/ETSISpecContent")["train"].to_list()
+         self.documents_by_spec_num = self._make_doc_index(self.spec_contents)
+         print("Save finished.")
+
+     def create_bm25_index(self):
+         dataset_metadata = self.indexed_specifications.values()
+         unique_specs = set()
+         corpus_json = []
+
+         for specification in dataset_metadata:
+             if specification['id'] in unique_specs: continue
+             for section in self.spec_contents:
+                 if specification['id'] == section['doc_id']:
+                     corpus_json.append({"text": f"{section['section']}\n{section['content']}", "metadata": {
+                         "id": specification['id'],
+                         "title": specification['title'],
+                         "section_title": section['section'],
+                         "version": specification['version'],
+                         "type": specification['type'],
+                         "url": specification['url'],
+                         "scope": specification['scope']
+                     }})
+             unique_specs.add(specification['id'])  # Mark the spec as seen so duplicate metadata entries are skipped
+
+         corpus_text = [doc["text"] for doc in corpus_json]
+         corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")
+
+         print("Indexing BM25")
+         retriever = BM25HF(corpus=corpus_json)
+         retriever.index(corpus_tokens)
+
+         retriever.save_to_hub("OrganizedProgrammers/ETSIBM25IndexSections", token=os.environ.get("HF"))
+
+         unique_specs = set()
+         corpus_json = []
+
+         for specification in dataset_metadata:
+             if specification['id'] in unique_specs: continue
+             text_list = self.get_document(specification['id'], specification['title'])
+             text = "\n".join(text_list)
+             if len(text_list) == 1: continue
+             corpus_json.append({"text": text, "metadata": specification})
+             unique_specs.add(specification['id'])
+
+         corpus_text = [doc["text"] for doc in corpus_json]
+         corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")
+
+         print("Indexing BM25")
+         retriever = BM25HF(corpus=corpus_json)
+         retriever.index(corpus_tokens)
+
+         retriever.save_to_hub("OrganizedProgrammers/ETSIBM25IndexSingle", token=os.environ.get("HF"))
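
The three indexer classes can also be driven outside FastAPI, e.g. from a one-off script. A minimal sketch (assumes the `HF` token is set in the environment, since `save_indexer` pushes to the Hub; the worker count here is an arbitrary choice):

from classes import TDocIndexer

indexer = TDocIndexer(max_workers=8)  # smaller pool for a quick run
indexer.index_all_workshops()         # crawl only the /ftp/workshop tree
indexer.save_indexer()                # push the doc_id -> url table to the Hub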
index.html ADDED
@@ -0,0 +1,367 @@
+ <!DOCTYPE html>
+ <html lang="en">
+ <head>
+   <meta charset="UTF-8" />
+   <meta name="viewport" content="width=device-width, initial-scale=1" />
+   <title>3GPP/ETSI Document Indexer Main Menu</title>
+   <style>
+     body {
+       font-family: "Montserrat", sans-serif;
+       background: #fafafa;
+       margin: 24px;
+       color: #1f2937;
+     }
+     h1 {
+       font-size: 1.8rem;
+       margin-bottom: 24px;
+     }
+     .row {
+       display: flex;
+       gap: 24px;
+       margin-bottom: 24px;
+     }
+     .column {
+       flex: 1;
+       display: flex;
+       flex-direction: column;
+       gap: 12px;
+     }
+     button {
+       background-color: #6c63ff;
+       color: white;
+       font-weight: 600;
+       font-size: 1rem;
+       padding: 10px 14px;
+       border: none;
+       border-radius: 0.6em;
+       cursor: pointer;
+       box-shadow: 0 2px 8px rgb(31 41 55 / 8%);
+       transition: background-color 0.2s ease;
+     }
+     button:hover {
+       background-color: #5753d6;
+     }
+     button:disabled {
+       cursor: default;
+       background-color: #778191;
+     }
+     select {
+       padding: 10px 14px;
+       border-radius: 0.6em;
+       border: none;
+       box-shadow: 0 2px 8px rgb(31 41 55 / 8%);
+       font-size: 1rem;
+       color: #374151;
+       background: #f3f4f6;
+       appearance: none;
+       cursor: pointer;
+     }
+     select:focus {
+       outline: none;
+       box-shadow: 0 0 0 2px #6c63ff;
+       background: white;
+     }
+     select:hover {
+       background: #e5e7eb;
+     }
+     select:disabled {
+       cursor: default;
+     }
+
+     .dropdown-content {
+       position: absolute; /* or fixed, if preferred */
+       z-index: 9999; /* high value to make sure the panel sits on top */
+       background-color: white; /* keeps the panel clearly visible */
+       border: 1px solid #ccc;
+       border-radius: 0.6em;
+       box-shadow: 0 2px 8px rgb(31 41 55 / 8%);
+       padding: 10px;
+       max-height: 55vh;
+       overflow-y: auto;
+     }
+
+     #dropbtn {
+       background: #f3f4f6;
+       color: #374151;
+       font-size: 1rem;
+       font-family: "Montserrat", sans-serif; /* same font as the body */
+       padding: 10px 14px;
+       border-radius: 0.6em;
+       font-weight: normal;
+       border: none;
+       box-shadow: 0 2px 8px rgb(31 41 55 / 8%);
+       cursor: pointer;
+       width: 100%;
+       text-align: left;
+       appearance: none; /* remove the native button styling */
+       user-select: none;
+       transition: background-color 0.2s ease;
+       display: inline-block;
+     }
+
+     #dropbtn:hover {
+       background: #e5e7eb;
+     }
+
+     #dropbtn:disabled {
+       cursor: default;
+     }
+
+     #dropbtn:focus {
+       outline: none;
+       box-shadow: 0 0 0 2px #6c63ff;
+       background: white;
+     }
+
+     option {
+       background: white;
+     }
+     textarea {
+       width: 100%;
+       min-height: 450px;
+       border-radius: 0.6em;
+       border: none;
+       box-shadow: 0 2px 6px rgb(31 41 55 / 12%);
+       padding: 12px;
+       font-family: monospace, monospace;
+       font-size: 0.95rem;
+       color: #1f2937;
+       resize: vertical;
+       background: white;
+     }
+     textarea[readonly] {
+       background: #e5e7eb;
+       cursor: default;
+     }
+   </style>
+ </head>
+ <body>
+
+ <h1>📄 3GPP/ETSI Document/Specification Indexer Main Menu</h1>
+
+ <div class="row" id="r1">
+   <div class="column">
+     <button id="tdocs-btn">Re-index TDocs</button>
+     <button id="spec-3gpp-btn">Re-index 3GPP Specifications</button>
+   </div>
+   <div class="column">
+     <select id="tdocs-wg-option" aria-label="TDocs Working Group options">
+       <option value="ALL" selected>Index all working groups</option>
+       <option value="SA0">SP</option>
+       <option value="SA1">SA1</option>
+       <option value="SA2">SA2</option>
+       <option value="SA3">SA3</option>
+       <option value="SA4">SA4</option>
+       <option value="SA5">SA5</option>
+       <option value="SA6">SA6</option>
+       <option value="CT0">CP</option>
+       <option value="CT1">CT1</option>
+       <option value="CT2">CT2</option>
+       <option value="CT3">CT3</option>
+       <option value="CT4">CT4</option>
+       <option value="CT5">CT5</option>
+       <option value="CT6">CT6</option>
+       <option value="RAN0">RP</option>
+       <option value="RAN1">RAN1</option>
+       <option value="RAN2">RAN2</option>
+       <option value="RAN3">RAN3</option>
+       <option value="RAN4">RAN4</option>
+       <option value="RAN5">RAN5</option>
+       <option value="RAN6">RAN6</option>
+     </select>
+   </div>
+   <div class="column">
+     <div class="dropdown">
+       <button id="dropbtn" disabled="disabled">Index all meetings</button>
+       <div id="dropdownContent" class="dropdown-content" style="display:none;">
+         <label style="display:none;"><input type="checkbox" checked value="ALL">Index all meetings</label>
+       </div>
+     </div>
+     <button id="spec-etsi-btn">Re-index ETSI Specifications</button>
+   </div>
+ </div>
+
+ <textarea id="output" readonly placeholder="Output..." aria-label="Output console"></textarea>
+
+ <script type="module">
+   const output = document.getElementById('output');
+   let selectedMeetings = [];
+   let currentURL = null;
+
+   function toggleDropdown() {
+     const dropdown = document.getElementById("dropdownContent");
+     dropdown.style.display = (dropdown.style.display === "none") ? "block" : "none";
+   }
+
+   document.getElementById('dropbtn').addEventListener('click', () => { toggleDropdown() })
+   document.addEventListener('mousedown', (e) => {
+     if (document.getElementById("dropdownContent").style.display == "block" && e.target.className != "dropdown-content" && e.target.tagName != "INPUT" && e.target.tagName != "LABEL") {
+       document.getElementById("dropdownContent").style.display = "none";
+     }
+   })
+
+   function logMessage(msg, reset) {
+     if (reset) {
+       output.value = "";  // clear the console instead of logging the message twice
+     }
+     output.value += msg + '\n';
+     output.scrollTop = output.scrollHeight;
+   }
+
+   document.getElementById('tdocs-wg-option').addEventListener('change', async (e) => {
+     let wg = e.target.value;
+     const dropdownContent = document.getElementById('dropdownContent');
+     const dropbtn = document.getElementById('dropbtn');
+
+     if (wg != "ALL") {
+       dropdownContent.innerHTML = '<label style="display:none;"><input type="checkbox" checked value="ALL">Index all meetings</label>';
+       const response = await fetch(`/get_meetings/${wg}`, { method: "GET" });
+       const responseJson = await response.json();
+       const meetings = responseJson.meetings;
+       currentURL = responseJson.url;
+
+       for (const meet of meetings) {
+         const label = document.createElement('label');
+         const checkbox = document.createElement('input');
+         checkbox.type = "checkbox";
+         checkbox.value = meet;
+         label.appendChild(checkbox);
+         label.appendChild(document.createTextNode(meet));
+         dropdownContent.appendChild(label);
+         dropdownContent.appendChild(document.createElement('br'));
+       }
+       dropbtn.removeAttribute('disabled');
+
+       // once the checkboxes are created, attach a change listener to each one
+       initCheckboxListeners();
+       // set the initial state
+       updateDropbtnLabel();
+     } else {
+       dropdownContent.innerHTML = '<label style="display:none;"><input type="checkbox" checked value="ALL">Index all meetings</label>';
+       dropbtn.setAttribute('disabled', 'true');
+       dropbtn.textContent = "Index all meetings";
+     }
+   });
+
+   function disableButtons() {
+     document.getElementById("spec-3gpp-btn").setAttribute('disabled', 'disabled')
+     document.getElementById("spec-etsi-btn").setAttribute('disabled', 'disabled')
+     document.getElementById("tdocs-btn").setAttribute('disabled', 'disabled')
+   }
+
+   function enableButtons() {
+     document.getElementById("spec-3gpp-btn").removeAttribute('disabled')
+     document.getElementById("spec-etsi-btn").removeAttribute('disabled')
+     document.getElementById("tdocs-btn").removeAttribute('disabled')
+   }
+
+   function initCheckboxListeners() {
+     const dropdownContent = document.getElementById('dropdownContent');
+     const dropbtn = document.getElementById('dropbtn');
+
+     function updateState() {
+       const checkboxes = dropdownContent.querySelectorAll('input[type="checkbox"]');
+       const allCheckbox = dropdownContent.querySelector('input[value="ALL"]');
+       const checkedBoxes = Array.from(checkboxes).filter(cb => cb.checked && cb !== allCheckbox);
+
+       if (checkedBoxes.length === 0) {
+         allCheckbox.checked = true;
+         dropbtn.textContent = "Index all meetings";
+         selectedMeetings = ["ALL"];
+       } else {
+         if (allCheckbox.checked) {
+           allCheckbox.checked = false; // untick ALL when specific meetings are ticked
+         }
+         if (checkedBoxes.length === 1) {
+           dropbtn.textContent = checkedBoxes[0].value;
+         } else {
+           dropbtn.textContent = `${checkedBoxes.length} meetings selected`;
+         }
+         selectedMeetings = checkedBoxes.map(cb => cb.value);
+       }
+
+       console.log(selectedMeetings);
+       console.log(currentURL);
+     }
+
+     const checkboxes = dropdownContent.querySelectorAll('input[type="checkbox"]');
+     checkboxes.forEach(cb => cb.addEventListener('change', updateState));
+
+     updateState(); // initial update
+   }
+
+   function updateDropbtnLabel() {
+     const dropdownContent = document.getElementById('dropdownContent');
+     const checkboxes = dropdownContent.querySelectorAll('input[type="checkbox"]');
+     const allCheckbox = dropdownContent.querySelector('input[value="ALL"]');
+     const dropbtn = document.getElementById('dropbtn');
+     const checkedBoxes = Array.from(checkboxes).filter(cb => cb.checked && cb !== allCheckbox);
+
+     if (checkedBoxes.length === 0) {
+       allCheckbox.checked = true;
+       dropbtn.textContent = "Index all meetings";
+     } else if (checkedBoxes.length === 1) {
+       allCheckbox.checked = false;
+       dropbtn.textContent = checkedBoxes[0].value;
+     } else {
+       allCheckbox.checked = false;
+       dropbtn.textContent = `${checkedBoxes.length} meetings selected`;
+     }
+   }
+
+   document.getElementById('tdocs-btn').addEventListener('click', () => {
+     disableButtons()
+     logMessage(`Started re-indexing TDocs`);
+     if (currentURL) {
+       if (!selectedMeetings.includes("ALL")) {
+         fetch("/index_tdocs/meeting", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ wg: document.getElementById("tdocs-wg-option").value, meetings: selectedMeetings }) })
+           .then(resp => resp.text())
+           .then(data => {
+             logMessage(`${data}`)
+             enableButtons()
+           })
+       } else {
+         fetch("/index_tdocs/working_group", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ wg: document.getElementById("tdocs-wg-option").value }) })
+           .then(resp => resp.text())
+           .then(data => {
+             logMessage(`${data}`)
+             enableButtons()
+           })
+       }
+     } else {
+       fetch("/index_tdocs/all", { method: "POST", headers: { "Content-Type": "application/json" } })
+         .then(resp => resp.text())
+         .then(data => {
+           logMessage(`${data}`)
+           enableButtons()
+         })
+     }
+   });
+
+   document.getElementById('spec-3gpp-btn').addEventListener('click', () => {
+     disableButtons()
+     logMessage(`Started re-indexing 3GPP Specifications`);
+     fetch("/index_specs/3gpp", { method: "POST", headers: { "Content-Type": "application/json" } })
+       .then(resp => resp.text())
+       .then(data => {
+         logMessage(`${data}`)
+         enableButtons()
+       })
+   });
+
+   document.getElementById('spec-etsi-btn').addEventListener('click', () => {
+     logMessage('Started re-indexing ETSI Specifications');
+     disableButtons()
+     fetch("/index_specs/etsi", { method: "POST", headers: { "Content-Type": "application/json" } })
+       .then(resp => resp.text())
+       .then(data => {
+         logMessage(`${data}`)
+         enableButtons()
+       })
+   });
+ </script>
+
+ </body>
+ </html>
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ requests
+ python-dotenv
+ fastapi
+ uvicorn[standard]
+ beautifulsoup4
+ huggingface_hub
+ PyMuPDF
+ bm25s[full]
+ pydantic
+ datasets
schemas.py ADDED
@@ -0,0 +1,6 @@
+ from pydantic import BaseModel
+ from typing import List, Optional
+
+ class IndexTDoc(BaseModel):
+     wg: Optional[str] = None
+     meetings: Optional[List[str]] = None
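
IndexTDoc is the request body shared by /index_tdocs/working_group and /index_tdocs/meeting; both fields are optional so the working-group variant can omit meetings. A small usage sketch (the meeting folder name is hypothetical):

from schemas import IndexTDoc

body = IndexTDoc(wg="SA2", meetings=["TSGS2_160"])
# .dict() on pydantic v1; use .model_dump() on pydantic v2
print(body.dict())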