Lucas ARRIESSE committed on
Commit 5ef0f8d · 1 Parent(s): 035141c

Migrate API modules to api routers

Files changed (5)
  1. api/docs.py +438 -0
  2. api/requirements.py +35 -0
  3. app.py +12 -482
  4. dependencies.py +42 -0
  5. static/js/script.js +5 -5
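
For context on the migration: with FastAPI, each feature module can declare its own APIRouter and the application then includes that router under a URL prefix, which is why the endpoints move under /docs/... and /requirements/... and the fetch URLs in static/js/script.js change accordingly. A minimal sketch of the pattern follows; the /example prefix and /ping route are illustrative only and not part of this commit.

from fastapi import APIRouter, FastAPI

# A feature module declares its routes on its own router
# (in this commit: api/docs.py and api/requirements.py).
router = APIRouter()


@router.get("/ping")
def ping():
    # Illustrative route; served at /example/ping once the router is included below.
    return {"status": "ok"}


# The application only wires the module in under a prefix.
app = FastAPI()
app.include_router(router, prefix="/example")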
api/docs.py CHANGED
@@ -1,4 +1,442 @@
+ import asyncio
+ from typing import Literal
  from fastapi.routing import APIRouter
+ import logging
+ import string
+ import io
+ import traceback
+ import zipfile
+ import json
+ import os
+ from pydantic import BaseModel
+ import requests
+ import subprocess
+ import pandas as pd
+ import re
+ from lxml import etree
+ from nltk.tokenize import word_tokenize
+ from bs4 import BeautifulSoup
+ from nltk.corpus import stopwords
+ from nltk.stem import WordNetLemmatizer
+ from fastapi import Depends, BackgroundTasks, HTTPException, Request
+ from dependencies import get_llm_router
+ from fastapi.responses import StreamingResponse
+ from litellm.router import Router
+
+ from schemas import DataRequest, DataResponse, DocRequirements, DownloadRequest, MeetingsRequest, MeetingsResponse, RequirementsRequest, RequirementsResponse

  # API router for requirement extraction from docs / doc list retrieval / download
  router = APIRouter()
+
+ # ==================================================== Utilities =================================================================
+
+ lemmatizer = WordNetLemmatizer()
+
+ NSMAP = {
+     'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
+     'v': 'urn:schemas-microsoft-com:vml'
+ }
+
+
+ def lemma(text: str):
+     stop_words = set(stopwords.words('english'))
+     txt = text.translate(str.maketrans('', '', string.punctuation)).strip()
+     tokens = [token for token in word_tokenize(
+         txt.lower()) if token not in stop_words]
+     return [lemmatizer.lemmatize(token) for token in tokens]
+
+
+ def get_docx_archive(url: str) -> zipfile.ZipFile:
+     """Fetches the docx from the URL and returns it as a ZipFile object"""
+     if not url.endswith("zip"):
+         raise ValueError("URL doit pointer vers un fichier ZIP")
+     doc_id = os.path.splitext(os.path.basename(url))[0]
+     resp = requests.get(url, verify=False, headers={
+         "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+     })
+     resp.raise_for_status()
+
+     with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
+         for file_name in zf.namelist():
+             if file_name.endswith(".docx"):
+                 docx_bytes = zf.read(file_name)
+                 return zipfile.ZipFile(io.BytesIO(docx_bytes))
+             elif file_name.endswith(".doc"):
+                 input_path = f"/tmp/{doc_id}.doc"
+                 output_path = f"/tmp/{doc_id}.docx"
+                 docx_bytes = zf.read(file_name)
+
+                 with open(input_path, "wb") as f:
+                     f.write(docx_bytes)
+
+                 subprocess.run([
+                     "libreoffice",
+                     "--headless",
+                     "--convert-to", "docx",
+                     "--outdir", "/tmp",
+                     input_path
+                 ], check=True)
+
+                 with open(output_path, "rb") as f:
+                     docx_bytes = f.read()
+
+                 os.remove(input_path)
+                 os.remove(output_path)
+
+                 return zipfile.ZipFile(io.BytesIO(docx_bytes))
+
+     raise ValueError("Aucun fichier docx/doc trouvé dans l'archive")
+
+
+ def parse_document_xml(docx_zip: zipfile.ZipFile) -> etree._ElementTree:
+     """Parses the main document.xml"""
+     xml_bytes = docx_zip.read('word/document.xml')
+     parser = etree.XMLParser(remove_blank_text=True)
+     return etree.fromstring(xml_bytes, parser=parser)
+
+
+ def clean_document_xml(root: etree._Element) -> None:
+     """Cleans the XML by modifying the tree in place"""
+     # Remove <w:del> tags and their content
+     for del_elem in root.xpath('//w:del', namespaces=NSMAP):
+         parent = del_elem.getparent()
+         if parent is not None:
+             parent.remove(del_elem)
+
+     # Unwrap <w:ins> tags
+     for ins_elem in root.xpath('//w:ins', namespaces=NSMAP):
+         parent = ins_elem.getparent()
+         index = parent.index(ins_elem)
+         for child in ins_elem.iterchildren():
+             parent.insert(index, child)
+             index += 1
+         parent.remove(ins_elem)
+
+     # Clean up comment markers
+     for tag in ['w:commentRangeStart', 'w:commentRangeEnd', 'w:commentReference']:
+         for elem in root.xpath(f'//{tag}', namespaces=NSMAP):
+             parent = elem.getparent()
+             if parent is not None:
+                 parent.remove(elem)
+
+
+ def create_modified_docx(original_zip: zipfile.ZipFile, modified_root: etree._Element) -> bytes:
+     """Creates a new docx with the modified XML"""
+     output = io.BytesIO()
+
+     with zipfile.ZipFile(output, 'w', compression=zipfile.ZIP_DEFLATED) as new_zip:
+         # Copy all unmodified files
+         for file in original_zip.infolist():
+             if file.filename != 'word/document.xml':
+                 new_zip.writestr(file, original_zip.read(file.filename))
+
+         # Add the modified document.xml
+         xml_str = etree.tostring(
+             modified_root,
+             xml_declaration=True,
+             encoding='UTF-8',
+             pretty_print=True
+         )
+         new_zip.writestr('word/document.xml', xml_str)
+
+     output.seek(0)
+     return output.getvalue()
+
+
+ def docx_to_txt(doc_id: str, url: str):
+     docx_zip = get_docx_archive(url)
+     root = parse_document_xml(docx_zip)
+     clean_document_xml(root)
+     modified_bytes = create_modified_docx(docx_zip, root)
+
+     input_path = f"/tmp/{doc_id}_cleaned.docx"
+     output_path = f"/tmp/{doc_id}_cleaned.txt"
+     with open(input_path, "wb") as f:
+         f.write(modified_bytes)
+
+     subprocess.run([
+         "libreoffice",
+         "--headless",
+         "--convert-to", "txt",
+         "--outdir", "/tmp",
+         input_path
+     ], check=True)
+
+     with open(output_path, "r", encoding="utf-8") as f:
+         txt_data = [line.strip() for line in f if line.strip()]
+
+     os.remove(input_path)
+     os.remove(output_path)
+     return txt_data
+
+
+ # ============================================= Doc routes =========================================================
+
+ @router.post("/get_meetings", response_model=MeetingsResponse)
+ def get_meetings(req: MeetingsRequest):
+     working_group = req.working_group
+     tsg = re.sub(r"\d+", "", working_group)
+     wg_number = re.search(r"\d", working_group).group(0)
+
+     logging.debug(tsg, wg_number)
+     url = "https://www.3gpp.org/ftp/tsg_" + tsg
+     logging.debug(url)
+
+     resp = requests.get(url, verify=False)
+     soup = BeautifulSoup(resp.text, "html.parser")
+
+     meeting_folders = []
+     all_meetings = []
+     wg_folders = [item.get_text() for item in soup.select("tr td a")]
+     selected_folder = None
+     for folder in wg_folders:
+         if "wg" + str(wg_number) in folder.lower():
+             selected_folder = folder
+             break
+
+     url += "/" + selected_folder
+     logging.debug(url)
+
+     if selected_folder:
+         resp = requests.get(url, verify=False)
+         soup = BeautifulSoup(resp.text, "html.parser")
+         meeting_folders = [item.get_text() for item in soup.select("tr td a") if item.get_text(
+         ).startswith("TSG") or (item.get_text().startswith("CT") and "-" in item.get_text())]
+         all_meetings = [working_group + "#" + meeting.split("_", 1)[1].replace("_", " ").replace(
+             "-", " ") if meeting.startswith('TSG') else meeting.replace("-", "#") for meeting in meeting_folders]
+
+     return MeetingsResponse(meetings=dict(zip(all_meetings, meeting_folders)))
+
+ # ============================================================================================================================================
+
+
+ @router.post("/get_dataframe", response_model=DataResponse)
+ def get_change_request_dataframe(req: DataRequest):
+     working_group = req.working_group
+     tsg = re.sub(r"\d+", "", working_group)
+     wg_number = re.search(r"\d", working_group).group(0)
+     url = "https://www.3gpp.org/ftp/tsg_" + tsg
+     logging.info("Fetching TDocs dataframe")
+
+     resp = requests.get(url, verify=False)
+     soup = BeautifulSoup(resp.text, "html.parser")
+     wg_folders = [item.get_text() for item in soup.select("tr td a")]
+     selected_folder = None
+     for folder in wg_folders:
+         if "wg" + str(wg_number) in folder.lower():
+             selected_folder = folder
+             break
+
+     url += "/" + selected_folder + "/" + req.meeting + "/docs"
+     resp = requests.get(url, verify=False)
+     soup = BeautifulSoup(resp.text, "html.parser")
+     files = [item.get_text() for item in soup.select("tr td a")
+              if item.get_text().endswith(".xlsx")]
+
+     if files == []:
+         raise HTTPException(status_code=404, detail="No XLSX has been found")
+
+     def gen_url(tdoc: str):
+         return f"{url}/{tdoc}.zip"
+
+     df = pd.read_excel(str(url + "/" + files[0]).replace("#", "%23"))
+     filtered_df = df[(((df["Type"] == "CR") & ((df["CR category"] == "B") | (df["CR category"] == "C"))) | (df["Type"] == "pCR")) & ~(
+         df["Uploaded"].isna())][["TDoc", "Title", "CR category", "Source", "Type", "Agenda item", "Agenda item description", "TDoc Status"]]
+     filtered_df["URL"] = filtered_df["TDoc"].apply(gen_url)
+
+     df = filtered_df.fillna("")
+     return DataResponse(data=df[["TDoc", "Title", "Type", "TDoc Status", "Agenda item description", "URL"]].to_dict(orient="records"))
+
+ # ==================================================================================================================================
+
+
+ @router.post("/download_tdocs")
+ def download_tdocs(req: DownloadRequest):
+     """Downloads the specified TDocs and zips them into a single archive"""
+     documents = req.documents
+
+     logging.info(f"Downloading TDocs: {documents}")
+
+     def process_document(doc: str):
+         doc_id = doc
+         url = requests.post(
+             'https://organizedprogrammers-3gppdocfinder.hf.space/find',
+             headers={"Content-Type": "application/json"},
+             data=json.dumps({"doc_id": doc_id}),
+             verify=False
+         )
+         logging.info(
+             f"Retrieving URL for doc {doc_id} returned http status {url.status_code}")
+         url = url.json()['url']
+         logging.debug(f"Doc URL for {doc_id} is {url}")
+
+         try:
+             txt = "\n".join(docx_to_txt(doc_id, url))
+         except Exception as e:
+             txt = f"Document {doc_id} text extraction failed: {e}"
+         return doc_id, txt.encode("utf-8")
+
+     # PERF: use asyncio?
+     def process_batch(batch):
+         results = {}
+         for doc in batch:
+             try:
+                 doc_id, file_bytes = process_document(doc)
+                 results[doc_id] = file_bytes
+             except Exception as e:
+                 traceback.print_exception(e)
+                 results[doc] = b"Erreur"
+         return results
+
+     documents_bytes = process_batch(documents)
+
+     zip_buffer = io.BytesIO()
+     with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
+         for doc_id, txt_data in documents_bytes.items():
+             zip_file.writestr(f'{doc_id}.txt', txt_data)
+
+     zip_buffer.seek(0)
+     return StreamingResponse(
+         zip_buffer,
+         media_type="application/zip"
+     )
+
+
+ @router.post("/generate_requirements", response_model=RequirementsResponse)
+ async def gen_reqs(req: RequirementsRequest, background_tasks: BackgroundTasks, llm_router: Router = Depends(get_llm_router)):
+     """Extracts requirements from the specified TDocs using an LLM"""
+
+     documents = req.documents
+     n_docs = len(documents)
+
+     logging.info("Generating requirements for documents: {}".format(
+         [doc.document for doc in documents]))
+
+     def prompt(doc_id, full):
+         return f"Here's the document whose ID is {doc_id} : {full}\n\nExtract all requirements and group them by context, returning a list of objects where each object includes a document ID, a concise description of the context where the requirements apply (not a chapter title or copied text), and a list of associated requirements; always return the result as a list, even if only one context is found. Remove the errors"
+
+     async def process_document(doc):
+         doc_id = doc.document
+         url = doc.url
+         try:
+             full = "\n".join(docx_to_txt(doc_id, url))
+         except Exception as e:
+             logging.error(f"Failed to process doc {doc_id}", e)
+             return RequirementsResponse(requirements=[DocRequirements(document=doc_id, context="Error LLM", requirements=[])]).requirements
+
+         try:
+             resp_ai = await llm_router.acompletion(
+                 model="gemini-v2",
+                 messages=[
+                     {"role": "user", "content": prompt(doc_id, full)}],
+                 response_format=RequirementsResponse
+             )
+
+             return RequirementsResponse.model_validate_json(resp_ai.choices[0].message.content).requirements
+
+         except Exception as e:
+             logging.error(
+                 f"Failed to process document {doc_id}", e, stack_info=True)
+             return RequirementsResponse(requirements=[DocRequirements(document=doc_id, context="Error LLM", requirements=[])]).requirements
+
+     async def process_batch(batch):
+         results = await asyncio.gather(*(process_document(doc) for doc in batch))
+         return [item for sublist in results for item in sublist]
+
+     all_requirements = []
+
+     if n_docs <= 30:
+         batch_results = await process_batch(documents)
+         all_requirements.extend(batch_results)
+     else:
+         batch_size = 30
+         batches = [documents[i:i + batch_size]
+                    for i in range(0, n_docs, batch_size)]
+
+         for i, batch in enumerate(batches):
+             batch_results = await process_batch(batch)
+             all_requirements.extend(batch_results)
+
+             if i < len(batches) - 1:
+                 background_tasks.add_task(asyncio.sleep, 60)
+     return RequirementsResponse(requirements=all_requirements)
+
+ # ======================================================================================================================================================================================
+
+
+ class ProgressUpdate(BaseModel):
+     """Defines the structure of a single SSE message."""
+     status: Literal["progress", "complete"]
+     data: dict
+     total_docs: int
+     processed_docs: int
+
+
+ @router.post("/generate_requirements/sse")
+ async def gen_reqs(req: RequirementsRequest, con: Request, llm_router: Router = Depends(get_llm_router)):
+     """Extracts requirements from the specified TDocs using an LLM and returns SSE events about the progress of ongoing operations"""
+
+     documents = req.documents
+     n_docs = len(documents)
+
+     logging.info("Generating requirements for documents: {}".format(
+         [doc.document for doc in documents]))
+
+     # limit max concurrency of LLM requests to prevent a huge pile of errors because of small rate limits
+     concurrency_sema = asyncio.Semaphore(4)
+
+     def prompt(doc_id, full):
+         return f"Here's the document whose ID is {doc_id} : {full}\n\nExtract all requirements and group them by context, returning a list of objects where each object includes a document ID, a concise description of the context where the requirements apply (not a chapter title or copied text), and a list of associated requirements; always return the result as a list, even if only one context is found. Remove the errors"
+
+     async def _process_document(doc) -> list[DocRequirements]:
+         doc_id = doc.document
+         url = doc.url
+
+         # convert the docx to txt for use
+         try:
+             full = "\n".join(docx_to_txt(doc_id, url))
+         except Exception as e:
+             logging.error(
+                 f"Failed to process document {doc_id}", e, stack_info=True)
+             return [DocRequirements(document=doc_id, context="Error LLM", requirements=[])]
+
+         try:
+             await concurrency_sema.acquire()
+
+             model_used = "gemini-v2"
+             resp_ai = await llm_router.acompletion(
+                 model=model_used,
+                 messages=[
+                     {"role": "user", "content": prompt(doc_id, full)}],
+                 response_format=RequirementsResponse
+             )
+             return RequirementsResponse.model_validate_json(resp_ai.choices[0].message.content).requirements
+         except Exception as e:
+             return [DocRequirements(document=doc_id, context="Error LLM", requirements=[])]
+         finally:
+             concurrency_sema.release()
+
+     # futures for all processed documents
+     process_futures = [_process_document(doc) for doc in documents]
+
+     # lambda to print progress
+     def progress_update(x): return f"data: {x.model_dump_json()}\n\n"
+
+     # async generator that generates the SSE events for progress
+     async def _stream_generator(docs: list[asyncio.Future]):
+         items = []
+         n_processed = 0
+
+         yield progress_update(ProgressUpdate(status="progress", data={}, total_docs=n_docs, processed_docs=0))
+
+         for doc in asyncio.as_completed(docs):
+             result = await doc
+             items.extend(result)
+             n_processed += 1
+             yield progress_update(ProgressUpdate(status="progress", data={}, total_docs=n_docs, processed_docs=n_processed))
+
+         final_response = RequirementsResponse(requirements=items)
+
+         yield progress_update(ProgressUpdate(status="complete", data=final_response.model_dump(), total_docs=n_docs, processed_docs=n_processed))
+
+     return StreamingResponse(_stream_generator(process_futures), media_type="text/event-stream")
api/requirements.py ADDED
@@ -0,0 +1,35 @@
+ from fastapi import APIRouter, Depends, HTTPException
+ from litellm.router import Router
+ from dependencies import get_llm_router
+ from schemas import ReqSearchLLMResponse, ReqSearchRequest, ReqSearchResponse
+
+ # Router for all requirements
+ router = APIRouter()
+
+
+ @router.post("/get_reqs_from_query", response_model=ReqSearchResponse)
+ def find_requirements_from_problem_description(req: ReqSearchRequest, llm_router: Router = Depends(get_llm_router)):
+     """Finds the requirements that address a given problem description from an extracted list"""
+
+     requirements = req.requirements
+     query = req.query
+
+     requirements_text = "\n".join(
+         [f"[Selection ID: {r.req_id} | Document: {r.document} | Context: {r.context} | Requirement: {r.requirement}]" for r in requirements])
+     print("Called the LLM")
+     resp_ai = llm_router.completion(
+         model="gemini-v2",
+         messages=[{"role": "user", "content": f"Given all the requirements : \n {requirements_text} \n and the problem description \"{query}\", return a list of 'Selection ID' for the most relevant corresponding requirements that reference or best cover the problem. If none of the requirements covers the problem, simply return an empty list"}],
+         response_format=ReqSearchLLMResponse
+     )
+     print("Answered")
+     print(resp_ai.choices[0].message.content)
+
+     out_llm = ReqSearchLLMResponse.model_validate_json(
+         resp_ai.choices[0].message.content).selected
+
+     if max(out_llm) > len(requirements) - 1:
+         raise HTTPException(
+             status_code=500, detail="LLM error : Generated a wrong index, please try again.")
+
+     return ReqSearchResponse(requirements=[requirements[i] for i in out_llm])
app.py CHANGED
@@ -1,31 +1,20 @@
  import asyncio
  import logging
+ from dotenv import load_dotenv
+ from typing import Literal
  import nltk
- import string
  import warnings
- import io
- import traceback
- import zipfile
- import json
  import os
- import requests
- import subprocess
- import pandas as pd
- import re
- from lxml import etree
- from typing import Literal
- from dotenv import load_dotenv
- from nltk.tokenize import word_tokenize
- from bs4 import BeautifulSoup
- from nltk.corpus import stopwords
- from nltk.stem import WordNetLemmatizer
- from fastapi import FastAPI, BackgroundTasks, HTTPException, Request
+ from fastapi import Depends, FastAPI, BackgroundTasks, HTTPException, Request
  from fastapi.staticfiles import StaticFiles
+ from dependencies import get_llm_router, init_dependencies
+ import api.docs
+ import api.requirements
+ from api.docs import docx_to_txt
  from schemas import *
  from fastapi.middleware.cors import CORSMiddleware
  from fastapi.responses import FileResponse, StreamingResponse
  from litellm.router import Router
- from aiolimiter import AsyncLimiter

  load_dotenv()

@@ -36,6 +25,9 @@ logging.basicConfig(
      datefmt='%Y-%m-%d %H:%M:%S'
  )

+ # Initialize global dependencies
+ init_dependencies()
+
  # Download required packages for NLTK
  nltk.download('stopwords')
  nltk.download('punkt_tab')
@@ -47,470 +39,8 @@ app = FastAPI(title="Requirements Extractor")
  app.add_middleware(CORSMiddleware, allow_credentials=True, allow_headers=[
      "*"], allow_methods=["*"], allow_origins=["*"])

- llm_router = Router(model_list=[
-     {
-         "model_name": "gemini-v1",
-         "litellm_params":
-         {
-             "model": "gemini/gemini-2.0-flash",
-             "api_key": os.environ.get("GEMINI"),
-             "max_retries": 5,
-             "rpm": 15,
-             "allowed_fails": 1,
-             "cooldown": 30,
-         }
-     },
-     {
-         "model_name": "gemini-v2",
-         "litellm_params":
-         {
-             "model": "gemini/gemini-2.5-flash",
-             "api_key": os.environ.get("GEMINI"),
-             "max_retries": 5,
-             "rpm": 10,
-             "allowed_fails": 1,
-             "cooldown": 30,
-         }
-     }], fallbacks=[{"gemini-v2": ["gemini-v1"]}], num_retries=10, retry_after=30)
-
- lemmatizer = WordNetLemmatizer()
-
- NSMAP = {
-     'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
-     'v': 'urn:schemas-microsoft-com:vml'
- }
-
-
- def lemma(text: str):
-     stop_words = set(stopwords.words('english'))
-     txt = text.translate(str.maketrans('', '', string.punctuation)).strip()
-     tokens = [token for token in word_tokenize(
-         txt.lower()) if token not in stop_words]
-     return [lemmatizer.lemmatize(token) for token in tokens]
-
-
- def get_docx_archive(url: str) -> zipfile.ZipFile:
-     """Fetches the docx from the URL and returns it as a ZipFile object"""
-     if not url.endswith("zip"):
-         raise ValueError("URL doit pointer vers un fichier ZIP")
-     doc_id = os.path.splitext(os.path.basename(url))[0]
-     resp = requests.get(url, verify=False, headers={
-         "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-     })
-     resp.raise_for_status()
-
-     with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
-         for file_name in zf.namelist():
-             if file_name.endswith(".docx"):
-                 docx_bytes = zf.read(file_name)
-                 return zipfile.ZipFile(io.BytesIO(docx_bytes))
-             elif file_name.endswith(".doc"):
-                 input_path = f"/tmp/{doc_id}.doc"
-                 output_path = f"/tmp/{doc_id}.docx"
-                 docx_bytes = zf.read(file_name)
-
-                 with open(input_path, "wb") as f:
-                     f.write(docx_bytes)
-
-                 subprocess.run([
-                     "libreoffice",
-                     "--headless",
-                     "--convert-to", "docx",
-                     "--outdir", "/tmp",
-                     input_path
-                 ], check=True)
-
-                 with open(output_path, "rb") as f:
-                     docx_bytes = f.read()
-
-                 os.remove(input_path)
-                 os.remove(output_path)
-
-                 return zipfile.ZipFile(io.BytesIO(docx_bytes))
-
-     raise ValueError("Aucun fichier docx/doc trouvé dans l'archive")
-
-
- def parse_document_xml(docx_zip: zipfile.ZipFile) -> etree._ElementTree:
-     """Parses the main document.xml"""
-     xml_bytes = docx_zip.read('word/document.xml')
-     parser = etree.XMLParser(remove_blank_text=True)
-     return etree.fromstring(xml_bytes, parser=parser)
-
-
- def clean_document_xml(root: etree._Element) -> None:
-     """Cleans the XML by modifying the tree in place"""
-     # Remove <w:del> tags and their content
-     for del_elem in root.xpath('//w:del', namespaces=NSMAP):
-         parent = del_elem.getparent()
-         if parent is not None:
-             parent.remove(del_elem)
-
-     # Unwrap <w:ins> tags
-     for ins_elem in root.xpath('//w:ins', namespaces=NSMAP):
-         parent = ins_elem.getparent()
-         index = parent.index(ins_elem)
-         for child in ins_elem.iterchildren():
-             parent.insert(index, child)
-             index += 1
-         parent.remove(ins_elem)
-
-     # Clean up comment markers
-     for tag in ['w:commentRangeStart', 'w:commentRangeEnd', 'w:commentReference']:
-         for elem in root.xpath(f'//{tag}', namespaces=NSMAP):
-             parent = elem.getparent()
-             if parent is not None:
-                 parent.remove(elem)
-
-
- def create_modified_docx(original_zip: zipfile.ZipFile, modified_root: etree._Element) -> bytes:
-     """Creates a new docx with the modified XML"""
-     output = io.BytesIO()
-
-     with zipfile.ZipFile(output, 'w', compression=zipfile.ZIP_DEFLATED) as new_zip:
-         # Copy all unmodified files
-         for file in original_zip.infolist():
-             if file.filename != 'word/document.xml':
-                 new_zip.writestr(file, original_zip.read(file.filename))
-
-         # Add the modified document.xml
-         xml_str = etree.tostring(
-             modified_root,
-             xml_declaration=True,
-             encoding='UTF-8',
-             pretty_print=True
-         )
-         new_zip.writestr('word/document.xml', xml_str)
-
-     output.seek(0)
-     return output.getvalue()
-
-
- def docx_to_txt(doc_id: str, url: str):
-     docx_zip = get_docx_archive(url)
-     root = parse_document_xml(docx_zip)
-     clean_document_xml(root)
-     modified_bytes = create_modified_docx(docx_zip, root)
-
-     input_path = f"/tmp/{doc_id}_cleaned.docx"
-     output_path = f"/tmp/{doc_id}_cleaned.txt"
-     with open(input_path, "wb") as f:
-         f.write(modified_bytes)
-
-     subprocess.run([
-         "libreoffice",
-         "--headless",
-         "--convert-to", "txt",
-         "--outdir", "/tmp",
-         input_path
-     ], check=True)
-
-     with open(output_path, "r", encoding="utf-8") as f:
-         txt_data = [line.strip() for line in f if line.strip()]
-
-     os.remove(input_path)
-     os.remove(output_path)
-     return txt_data
-
-
- # ============================================= Doc routes =========================================================
-
- @app.post("/get_meetings", response_model=MeetingsResponse)
- def get_meetings(req: MeetingsRequest):
-     working_group = req.working_group
-     tsg = re.sub(r"\d+", "", working_group)
-     wg_number = re.search(r"\d", working_group).group(0)
-
-     logging.debug(tsg, wg_number)
-     url = "https://www.3gpp.org/ftp/tsg_" + tsg
-     logging.debug(url)
-
-     resp = requests.get(url, verify=False)
-     soup = BeautifulSoup(resp.text, "html.parser")
-
-     meeting_folders = []
-     all_meetings = []
-     wg_folders = [item.get_text() for item in soup.select("tr td a")]
-     selected_folder = None
-     for folder in wg_folders:
-         if "wg" + str(wg_number) in folder.lower():
-             selected_folder = folder
-             break
-
-     url += "/" + selected_folder
-     logging.debug(url)
-
-     if selected_folder:
-         resp = requests.get(url, verify=False)
-         soup = BeautifulSoup(resp.text, "html.parser")
-         meeting_folders = [item.get_text() for item in soup.select("tr td a") if item.get_text(
-         ).startswith("TSG") or (item.get_text().startswith("CT") and "-" in item.get_text())]
-         all_meetings = [working_group + "#" + meeting.split("_", 1)[1].replace("_", " ").replace(
-             "-", " ") if meeting.startswith('TSG') else meeting.replace("-", "#") for meeting in meeting_folders]
-
-     return MeetingsResponse(meetings=dict(zip(all_meetings, meeting_folders)))
-
- # ============================================================================================================================================
-
-
- @app.post("/get_dataframe", response_model=DataResponse)
- def get_change_request_dataframe(req: DataRequest):
-     working_group = req.working_group
-     tsg = re.sub(r"\d+", "", working_group)
-     wg_number = re.search(r"\d", working_group).group(0)
-     url = "https://www.3gpp.org/ftp/tsg_" + tsg
-     logging.info("Fetching TDocs dataframe")
-
-     resp = requests.get(url, verify=False)
-     soup = BeautifulSoup(resp.text, "html.parser")
-     wg_folders = [item.get_text() for item in soup.select("tr td a")]
-     selected_folder = None
-     for folder in wg_folders:
-         if "wg" + str(wg_number) in folder.lower():
-             selected_folder = folder
-             break
-
-     url += "/" + selected_folder + "/" + req.meeting + "/docs"
-     resp = requests.get(url, verify=False)
-     soup = BeautifulSoup(resp.text, "html.parser")
-     files = [item.get_text() for item in soup.select("tr td a")
-              if item.get_text().endswith(".xlsx")]
-
-     if files == []:
-         raise HTTPException(status_code=404, detail="No XLSX has been found")
-
-     def gen_url(tdoc: str):
-         return f"{url}/{tdoc}.zip"
-
-     df = pd.read_excel(str(url + "/" + files[0]).replace("#", "%23"))
-     filtered_df = df[(((df["Type"] == "CR") & ((df["CR category"] == "B") | (df["CR category"] == "C"))) | (df["Type"] == "pCR")) & ~(
-         df["Uploaded"].isna())][["TDoc", "Title", "CR category", "Source", "Type", "Agenda item", "Agenda item description", "TDoc Status"]]
-     filtered_df["URL"] = filtered_df["TDoc"].apply(gen_url)
-
-     df = filtered_df.fillna("")
-     return DataResponse(data=df[["TDoc", "Title", "Type", "TDoc Status", "Agenda item description", "URL"]].to_dict(orient="records"))
-
- # ==================================================================================================================================
-
-
- @app.post("/download_tdocs")
- def download_tdocs(req: DownloadRequest):
-     """Downloads the specified TDocs and zips them into a single archive"""
-     documents = req.documents
-
-     logging.info(f"Downloading TDocs: {documents}")
-
-     def process_document(doc: str):
-         doc_id = doc
-         url = requests.post(
-             'https://organizedprogrammers-3gppdocfinder.hf.space/find',
-             headers={"Content-Type": "application/json"},
-             data=json.dumps({"doc_id": doc_id}),
-             verify=False
-         )
-         logging.info(
-             f"Retrieving URL for doc {doc_id} returned http status {url.status_code}")
-         url = url.json()['url']
-         logging.debug(f"Doc URL for {doc_id} is {url}")
-
-         try:
-             txt = "\n".join(docx_to_txt(doc_id, url))
-         except Exception as e:
-             txt = f"Document {doc_id} text extraction failed: {e}"
-         return doc_id, txt.encode("utf-8")
-
-     # PERF: use asyncio?
-     def process_batch(batch):
-         results = {}
-         for doc in batch:
-             try:
-                 doc_id, file_bytes = process_document(doc)
-                 results[doc_id] = file_bytes
-             except Exception as e:
-                 traceback.print_exception(e)
-                 results[doc] = b"Erreur"
-         return results
-
-     documents_bytes = process_batch(documents)
-
-     zip_buffer = io.BytesIO()
-     with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
-         for doc_id, txt_data in documents_bytes.items():
-             zip_file.writestr(f'{doc_id}.txt', txt_data)
-
-     zip_buffer.seek(0)
-     return StreamingResponse(
-         zip_buffer,
-         media_type="application/zip"
-     )
-
- # ========================================================================================================================
-
-
- @app.post("/generate_requirements", response_model=RequirementsResponse)
- async def gen_reqs(req: RequirementsRequest, background_tasks: BackgroundTasks):
-     """Extracts requirements from the specified TDocs using an LLM"""
-
-     documents = req.documents
-     n_docs = len(documents)
-
-     logging.info("Generating requirements for documents: {}".format(
-         [doc.document for doc in documents]))
-
-     def prompt(doc_id, full):
-         return f"Here's the document whose ID is {doc_id} : {full}\n\nExtract all requirements and group them by context, returning a list of objects where each object includes a document ID, a concise description of the context where the requirements apply (not a chapter title or copied text), and a list of associated requirements; always return the result as a list, even if only one context is found. Remove the errors"
-
-     async def process_document(doc):
-         doc_id = doc.document
-         url = doc.url
-         try:
-             full = "\n".join(docx_to_txt(doc_id, url))
-         except Exception as e:
-             traceback.print_exception(e)
-             return RequirementsResponse(requirements=[DocRequirements(document=doc_id, context="Error LLM", requirements=[])]).requirements
-
-         try:
-             resp_ai = await llm_router.acompletion(
-                 model="gemini-v2",
-                 messages=[
-                     {"role": "user", "content": prompt(doc_id, full)}],
-                 response_format=RequirementsResponse
-             )
-
-             return RequirementsResponse.model_validate_json(resp_ai.choices[0].message.content).requirements
-
-         except Exception as e:
-             logging.error(
-                 f"Failed to process document {doc_id}", e, stack_info=True)
-             return RequirementsResponse(requirements=[DocRequirements(document=doc_id, context="Error LLM", requirements=[])]).requirements
-
-     async def process_batch(batch):
-         results = await asyncio.gather(*(process_document(doc) for doc in batch))
-         return [item for sublist in results for item in sublist]
-
-     all_requirements = []
-
-     if n_docs <= 30:
-         batch_results = await process_batch(documents)
-         all_requirements.extend(batch_results)
-     else:
-         batch_size = 30
-         batches = [documents[i:i + batch_size]
-                    for i in range(0, n_docs, batch_size)]
-
-         for i, batch in enumerate(batches):
-             batch_results = await process_batch(batch)
-             all_requirements.extend(batch_results)
-
-             if i < len(batches) - 1:
-                 background_tasks.add_task(asyncio.sleep, 60)
-     return RequirementsResponse(requirements=all_requirements)
-
- # ======================================================================================================================================================================================
-
-
- class ProgressUpdate(BaseModel):
-     """Defines the structure of a single SSE message."""
-     status: Literal["progress", "complete"]
-     data: dict
-     total_docs: int
-     processed_docs: int
-
-
- @app.post("/generate_requirements/sse")
- async def gen_reqs(req: RequirementsRequest, con: Request):
-     """Extracts requirements from the specified TDocs using an LLM and returns SSE events about the progress of ongoing operations"""
-
-     documents = req.documents
-     n_docs = len(documents)
-
-     logging.info("Generating requirements for documents: {}".format(
-         [doc.document for doc in documents]))
-
-     # limit max concurrency of LLM requests to prevent a huge pile of errors because of small rate limits
-     concurrency_sema = asyncio.Semaphore(4)
-
-     def prompt(doc_id, full):
-         return f"Here's the document whose ID is {doc_id} : {full}\n\nExtract all requirements and group them by context, returning a list of objects where each object includes a document ID, a concise description of the context where the requirements apply (not a chapter title or copied text), and a list of associated requirements; always return the result as a list, even if only one context is found. Remove the errors"
-
-     async def _process_document(doc) -> list[DocRequirements]:
-         doc_id = doc.document
-         url = doc.url
-
-         # convert the docx to txt for use
-         try:
-             full = "\n".join(docx_to_txt(doc_id, url))
-         except Exception as e:
-             traceback.print_exception(e)
-             return [DocRequirements(document=doc_id, context="Error LLM", requirements=[])]
-
-         try:
-             await concurrency_sema.acquire()
-
-             model_used = "gemini-v2"
-             resp_ai = await llm_router.acompletion(
-                 model=model_used,
-                 messages=[
-                     {"role": "user", "content": prompt(doc_id, full)}],
-                 response_format=RequirementsResponse
-             )
-             return RequirementsResponse.model_validate_json(resp_ai.choices[0].message.content).requirements
-         except Exception as e:
-             return [DocRequirements(document=doc_id, context="Error LLM", requirements=[])]
-         finally:
-             concurrency_sema.release()
-
-     # futures for all processed documents
-     process_futures = [_process_document(doc) for doc in documents]
-
-     # lambda to print progress
-     def progress_update(x): return f"data: {x.model_dump_json()}\n\n"
-
-     # async generator that generates the SSE events for progress
-     async def _stream_generator(docs: list[asyncio.Future]):
-         items = []
-         n_processed = 0
-
-         yield progress_update(ProgressUpdate(status="progress", data={}, total_docs=n_docs, processed_docs=0))
-
-         for doc in asyncio.as_completed(docs):
-             result = await doc
-             items.extend(result)
-             n_processed += 1
-             yield progress_update(ProgressUpdate(status="progress", data={}, total_docs=n_docs, processed_docs=n_processed))
-
-         final_response = RequirementsResponse(requirements=items)
-
-         yield progress_update(ProgressUpdate(status="complete", data=final_response.model_dump(), total_docs=n_docs, processed_docs=n_processed))
-
-     return StreamingResponse(_stream_generator(process_futures), media_type="text/event-stream")
  # =======================================================================================================================================================================================

-
- @app.post("/get_reqs_from_query", response_model=ReqSearchResponse)
- def find_requirements_from_problem_description(req: ReqSearchRequest):
-     requirements = req.requirements
-     query = req.query
-
-     requirements_text = "\n".join(
-         [f"[Selection ID: {r.req_id} | Document: {r.document} | Context: {r.context} | Requirement: {r.requirement}]" for r in requirements])
-     print("Called the LLM")
-     resp_ai = llm_router.completion(
-         model="gemini-v2",
-         messages=[{"role": "user", "content": f"Given all the requirements : \n {requirements_text} \n and the problem description \"{query}\", return a list of 'Selection ID' for the most relevant corresponding requirements that reference or best cover the problem. If none of the requirements covers the problem, simply return an empty list"}],
-         response_format=ReqSearchLLMResponse
-     )
-     print("Answered")
-     print(resp_ai.choices[0].message.content)
-
-     out_llm = ReqSearchLLMResponse.model_validate_json(
-         resp_ai.choices[0].message.content).selected
-
-     if max(out_llm) > len(requirements) - 1:
-         raise HTTPException(
-             status_code=500, detail="LLM error : Generated a wrong index, please try again.")
-
-     return ReqSearchResponse(requirements=[requirements[i] for i in out_llm])
-
-
+ app.include_router(api.docs.router, prefix="/docs")
+ app.include_router(api.requirements.router, prefix="/requirements")
  app.mount("/", StaticFiles(directory="static", html=True), name="static")
dependencies.py ADDED
@@ -0,0 +1,42 @@
+ import os
+ from litellm.router import Router
+
+ # Declare all global app dependencies here
+ # - Setup your dependency global inside init_dependencies()
+ # - Create a get_xxxx_() function to retrieve the dependency inside the FastAPI router
+
+
+ def init_dependencies():
+     """Initialize the application global dependencies"""
+
+     global llm_router
+     llm_router = Router(model_list=[
+         {
+             "model_name": "gemini-v1",
+             "litellm_params":
+             {
+                 "model": "gemini/gemini-2.0-flash",
+                 "api_key": os.environ.get("GEMINI"),
+                 "max_retries": 5,
+                 "rpm": 15,
+                 "allowed_fails": 1,
+                 "cooldown": 30,
+             }
+         },
+         {
+             "model_name": "gemini-v2",
+             "litellm_params":
+             {
+                 "model": "gemini/gemini-2.5-flash",
+                 "api_key": os.environ.get("GEMINI"),
+                 "max_retries": 5,
+                 "rpm": 10,
+                 "allowed_fails": 1,
+                 "cooldown": 30,
+             }
+         }], fallbacks=[{"gemini-v2": ["gemini-v1"]}], num_retries=10, retry_after=30)
+
+
+ def get_llm_router() -> Router:
+     """Retrieves the LLM router"""
+     return llm_router
static/js/script.js CHANGED
@@ -32,7 +32,7 @@ async function getMeetings() {
      toggleElementsEnabled(['get-meetings-btn'], false);

      try {
-         const response = await fetch('/get_meetings', {
+         const response = await fetch('/docs/get_meetings', {
              method: 'POST',
              headers: { 'Content-Type': 'application/json' },
              body: JSON.stringify({ working_group: workingGroup })
@@ -63,7 +63,7 @@ async function getTDocs() {
      toggleElementsEnabled(['get-tdocs-btn'], false);

      try {
-         const response = await fetch('/get_dataframe', {
+         const response = await fetch('/docs/get_dataframe', {
              method: 'POST',
              headers: { 'Content-Type': 'application/json' },
              body: JSON.stringify({ working_group: workingGroup, meeting: meeting })
@@ -238,7 +238,7 @@ async function downloadTDocs() {
          // Transform to the required format: [{tdoc_id: url}, ...]

          const documents = selectedData.map(obj => obj.document)
-         const response = await fetch('/download_tdocs', {
+         const response = await fetch('/docs/download_tdocs', {
              method: 'POST',
              headers: { 'Content-Type': 'application/json' },
              body: JSON.stringify({ documents: documents })
@@ -322,7 +322,7 @@ async function extractRequirements() {
      toggleElementsEnabled(['extract-requirements-btn'], false);

      try {
-         const response = await postWithSSE('/generate_requirements/sse', { documents: selectedData }, {
+         const response = await postWithSSE('/docs/generate_requirements/sse', { documents: selectedData }, {
              onMessage: (msg) => {
                  console.log("SSE message:");
                  console.log(msg);
@@ -663,7 +663,7 @@ async function searchRequirements() {

      try {
          // Prepare the requirements for the search
-         const response = await fetch('/get_reqs_from_query', {
+         const response = await fetch('/requirements/get_reqs_from_query', {
              method: 'POST',
              headers: { 'Content-Type': 'application/json' },
              body: JSON.stringify({