Lucas ARRIESSE committed on
Commit a83bff5 · 1 Parent(s): f6da275

Fix TDoc download

Files changed (2)
  1. api/docs.py +67 -38
  2. dependencies.py +1 -0
api/docs.py CHANGED
@@ -1,5 +1,5 @@
 import asyncio
-from typing import Literal
+from typing import Dict, List, Literal, Tuple
 from fastapi.routing import APIRouter
 import logging
 import string
@@ -19,7 +19,7 @@ from bs4 import BeautifulSoup
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
 from fastapi import Depends, BackgroundTasks, HTTPException, Request
-from dependencies import get_llm_router
+from dependencies import DOC_FINDER_BASE_URL, get_http_client, get_llm_router
 from fastapi.responses import StreamingResponse
 from litellm.router import Router
@@ -253,52 +253,81 @@ def get_change_request_dataframe(req: DataRequest):
 @router.post("/download_tdocs")
 def download_tdocs(req: DownloadRequest):
     """Download the specified TDocs and zips them in a single archive"""
-    documents = req.documents
-
-    logging.info(f"Downloading TDocs: {documents}")
-
-    def process_document(doc: str):
-        doc_id = doc
-        url = requests.post(
-            'https://organizedprogrammers-3gppdocfinder.hf.space/find',
-            headers={"Content-Type": "application/json"},
-            data=json.dumps({"doc_id": doc_id}),
-            verify=False
-        )
-        logging.info(
-            f"Retrieving URL for doc {doc_id} returned http status {url.status_code}")
-        url = url.json()['url']
-        logging.debug(f"Doc URL for {doc_id} is {url}")
-
+
+    # Document IDs to download
+    document_ids = req.documents
+
+    logging.info(f"Downloading TDocs: {document_ids}")
+
+    # Retrieve all doc URLs to download
+    doc_urls_req = requests.post(DOC_FINDER_BASE_URL + "find/batch",
+                                 headers={
+                                     "Content-Type": "application/json"
+                                 },
+                                 data=json.dumps({
+                                     "doc_ids": document_ids
+                                 }),
+                                 verify=False)
+
+    doc_urls_req.raise_for_status()
+    doc_urls = doc_urls_req.json()
+
+    # early check to bail out if no doc is available.
+    if len(doc_urls["results"]) == 0:
+        logging.warning(
+            f"Got no URL results for docs {document_ids}. 3GPP index may not be up to date")
+
+        raise HTTPException(
+            status_code=501, detail=f"Got no URL results for docs {document_ids}. 3GPP index may not be up to date")
+
+    documents_content: Dict[str, bytes] = {}
+    failed_documents: List[str] = []
+
+    def _process_single_document(doc_id: str, doc_url: str) -> Tuple[bool, bytes]:
+        """Attempts to convert a document to text and returns success status and content."""
         try:
-            txt = "\n".join(docx_to_txt(doc_id, url))
+            text_lines = docx_to_txt(doc_id, doc_url)
+            content_bytes = "\n".join(text_lines).encode("utf-8")
+            return True, content_bytes
         except Exception as e:
-            txt = f"Document {doc_id} text extraction failed: {e}"
-        return doc_id, txt.encode("utf-8")
-
-    # PERF: use asyncio?
-    def process_batch(batch):
-        results = {}
-        for doc in batch:
-            try:
-                doc_id, file_bytes = process_document(doc)
-                results[doc_id] = file_bytes
-            except Exception as e:
-                traceback.print_exception(e)
-                results[doc] = b"Erreur"
-        return results
-
-    documents_bytes = process_batch(documents)
+            logging.warning(
+                f"Failed to process document '{doc_id}' from URL '{doc_url}': {e}")
+            error_message = f"Document '{doc_id}' text extraction failed: {e}".encode(
+                "utf-8")
+            return False, error_message
+
+    for doc_id, doc_url in doc_urls["results"].items():
+        success, content = _process_single_document(doc_id, doc_url)
+        documents_content[doc_id] = content
+        if not success:
+            failed_documents.append(doc_id)
+
+    # sanity check to ensure all requested documents are accounted for, adding error messages for any missing ones
+    for requested_doc_id in document_ids:
+        if requested_doc_id not in documents_content:
+            error_msg = (
+                f"Failed to retrieve or process document '{requested_doc_id}'. "
+                "The 3GPP index may not be up to date, or the document might be unavailable."
+            ).encode("utf-8")
+
+            documents_content[requested_doc_id] = error_msg
+            logging.warning(
+                f"Document '{requested_doc_id}' was requested but not found or processed.")
+            if requested_doc_id not in failed_documents:
+                failed_documents.append(requested_doc_id)
 
     zip_buffer = io.BytesIO()
     with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
-        for doc_id, txt_data in documents_bytes.items():
-            zip_file.writestr(f'{doc_id}.txt', txt_data)
+        for doc_id, content_bytes in documents_content.items():
+            safe_filename = f"{doc_id}.txt"
+            zip_file.writestr(safe_filename, content_bytes)
 
     zip_buffer.seek(0)
+
     return StreamingResponse(
         zip_buffer,
-        media_type="application/zip"
+        media_type="application/zip",
+        headers={"Content-Disposition": "attachment; filename=tdocs.zip"}
    )
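For context, the rewritten handler relies on a single batch lookup against the doc-finder service instead of one request per document. A minimal sketch of that call, mirroring the request made in download_tdocs above (the TDoc ID is a hypothetical example, and the response shape is inferred from how the handler consumes it):

# Sketch of the batch URL lookup assumed by download_tdocs.
import json
import requests

DOC_FINDER_BASE_URL = "https://organizedprogrammers-docfinder.hf.space/"

resp = requests.post(
    DOC_FINDER_BASE_URL + "find/batch",
    headers={"Content-Type": "application/json"},
    data=json.dumps({"doc_ids": ["S2-2401234"]}),  # hypothetical TDoc ID
    verify=False,  # mirrors the handler, which disables TLS verification
)
resp.raise_for_status()

# Inferred response shape: {"results": {doc_id: download_url, ...}}
for doc_id, url in resp.json()["results"].items():
    print(doc_id, "->", url)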
dependencies.py CHANGED
@@ -9,6 +9,7 @@ from jinja2 import Environment, StrictUndefined, FileSystemLoader
 
 
 INSIGHT_FINDER_BASE_URL = "https://organizedprogrammers-insight-finder.hf.space/"
+DOC_FINDER_BASE_URL = "https://organizedprogrammers-docfinder.hf.space/"
 
 def init_dependencies():
     """Initialize the application global dependencies"""