Lucas ARRIESSE committed on
Commit
eb8cbe5
·
1 Parent(s): 6607a5c

Enable downloading multiple doc types

Browse files
Files changed (1) hide show
  1. api/docs.py +49 -27
api/docs.py CHANGED
@@ -37,6 +37,7 @@ NSMAP = {
37
  # Unfortunately needs to be kept to 1, as libreoffice isn't built to support parallel instances
38
  CONVERSION_MUTEX = asyncio.Semaphore(1)
39
 
 
40
  async def convert_file(contents: io.BytesIO, filename: str, input_ext: str, output_ext: str, filter: str = None) -> io.BytesIO:
41
  """
42
  Converts the given file bytes using Libreoffice headless to the specified file type.
@@ -100,8 +101,8 @@ async def convert_file(contents: io.BytesIO, filename: str, input_ext: str, outp
100
  return out_bytes
101
 
102
 
103
- async def get_doc_archive(url: str, client: AsyncClient) -> zipfile.ZipFile:
104
- """Récupère le docx depuis l'URL et le retourne comme objet ZipFile"""
105
 
106
  if not url.endswith("zip"):
107
  raise ValueError("URL doit pointer vers un fichier ZIP")
@@ -115,23 +116,24 @@ async def get_doc_archive(url: str, client: AsyncClient) -> zipfile.ZipFile:
115
 
116
  with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
117
  # there should be a single file per archive
118
- for file_name in zf.namelist():
119
- if file_name.endswith(".docx"):
120
- docx_bytes = zf.read(file_name)
121
- return zipfile.ZipFile(io.BytesIO(docx_bytes))
122
- elif file_name.endswith(".doc"):
123
- in_bytes = io.BytesIO(zf.read(file_name))
124
- docx_bytes = await convert_file(in_bytes, doc_id, "doc", "docx")
125
- return zipfile.ZipFile(docx_bytes)
126
- elif file_name.endswith(".pptx"):
127
- in_bytes = io.BytesIO(zf.read())
128
 
129
- raise ValueError("Aucun fichier docx/doc trouvé dans l'archive")
130
 
131
 
132
  def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
133
  """
134
- Applique les révisions des .docx avant de retourner le contenu
 
 
 
135
  """
136
 
137
  try:
@@ -189,19 +191,38 @@ def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
189
 
190
 
191
  async def doc_to_txt(doc_id: str, url: str, client: AsyncClient) -> str:
192
- # Grab the document archive
193
- docx_zip = await get_doc_archive(url, client)
194
-
195
- # Apply the docx revisions before converting the document to text
196
- modified_bytes = apply_docx_revisions(docx_zip)
197
-
198
- final_bytes = await convert_file(
199
- modified_bytes, f"{doc_id}", "docx", "txt")
200
-
201
- final_bytes_text = str(final_bytes.read(), encoding="utf-8")
202
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  txt_data = [line.strip()
204
- for line in final_bytes_text.splitlines() if line.strip()]
205
 
206
  return txt_data
207
 
@@ -315,15 +336,16 @@ async def download_tdocs(req: DocDownloadRequest, http_client: AsyncClient = Dep
315
  f"Failed to process document '{doc_id}' from URL '{doc_url}': {e}")
316
  error_message = f"Document '{doc_id}' text extraction failed: {e}".encode(
317
  "utf-8")
318
- return {"doc_id": doc_id, "content": error_message}
319
 
320
  convert_tasks = await asyncio.gather(*[_process_single_document(doc.document, doc.url) for doc in req.documents], return_exceptions=False)
321
 
322
  zip_buffer = io.BytesIO()
323
  with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
324
  for task in convert_tasks:
 
325
  doc_id = task["doc_id"]
326
- safe_filename = f"{doc_id}.txt"
327
  zip_file.writestr(safe_filename, task["content"])
328
 
329
  zip_buffer.seek(0)
 
37
  # Unfortunately needs to be kept to 1, as libreoffice isn't built to support parallel instances
38
  CONVERSION_MUTEX = asyncio.Semaphore(1)
39
 
40
+
41
  async def convert_file(contents: io.BytesIO, filename: str, input_ext: str, output_ext: str, filter: str = None) -> io.BytesIO:
42
  """
43
  Converts the given file bytes using Libreoffice headless to the specified file type.
 
101
  return out_bytes
102
 
103
 
104
+ async def get_doc_archive(url: str, client: AsyncClient) -> tuple[str, str, io.BytesIO]:
105
+ """Récupère le docx depuis l'URL et le retourne un tuple (nom, extension, contenu)"""
106
 
107
  if not url.endswith("zip"):
108
  raise ValueError("URL doit pointer vers un fichier ZIP")
 
116
 
117
  with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
118
  # there should be a single file per archive
119
+ for entry in zf.infolist():
120
+ if entry.is_dir():
121
+ continue
122
+
123
+ file_name = entry.filename
124
+ root, ext = os.path.splitext(file_name)
125
+ doc_bytes = zf.read(file_name)
126
+ return (root, ext.lower(), io.BytesIO(doc_bytes))
 
 
127
 
128
+ raise ValueError("Aucun fichier trouvé dans l'archive")
129
 
130
 
131
  def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
132
  """
133
+ Applique les révisions des .docx avant de retourner le contenu.
134
+
135
+ Args:
136
+ docx_zip: Le document word sous forme de zip
137
  """
138
 
139
  try:
 
191
 
192
 
193
  async def doc_to_txt(doc_id: str, url: str, client: AsyncClient) -> str:
194
+ """
195
+ Télécharge le TDoc spécifié et le convertit en texte.
196
+ """
 
 
 
 
 
 
 
197
 
198
+ # Grab the document archive
199
+ filename, ext, bytes = await get_doc_archive(url, client)
200
+
201
+ final_bytes: io.BytesIO = None
202
+ if ext == ".doc":
203
+ logging.debug(f"Converting {filename} .doc --> .docx")
204
+ docx_bytes = await convert_file(bytes, doc_id, "doc", "docx")
205
+ logging.debug(f"Converting {filename} .docx --> .txt")
206
+ final_bytes = await convert_file(docx_bytes, f"{doc_id}", "docx", "txt")
207
+ elif ext == ".docx":
208
+ logging.debug(f"Updating .docx revisions")
209
+ applied_revision = apply_docx_revisions(zipfile.ZipFile(bytes))
210
+ logging.debug(f"Converting {filename} .docx --> .txt")
211
+ final_bytes = await convert_file(applied_revision, f"{doc_id}", "docx", "txt")
212
+ elif ext == ".pdf":
213
+ logging.debug(f"Converting {filename} .pdf --> .txt")
214
+ final_bytes = await convert_file(bytes, doc_id, "pdf", "txt")
215
+ elif ext == ".pptx":
216
+ logging.debug(f"Converting {filename} .pptx --> .pdf")
217
+ pdf_bytes = await convert_file(bytes, doc_id, "pptx", "pdf")
218
+ logging.debug(f"Converting {filename} .pdf --> .txt")
219
+ final_bytes = await convert_file(pdf_bytes, doc_id, "pdf", "txt")
220
+ else:
221
+ raise Exception(f"Unsupported file type: {ext}, filename: {filename}")
222
+
223
+ text_from_bytes = str(final_bytes.read(), encoding="utf-8")
224
  txt_data = [line.strip()
225
+ for line in text_from_bytes.splitlines() if line.strip()]
226
 
227
  return txt_data
228
 
 
336
  f"Failed to process document '{doc_id}' from URL '{doc_url}': {e}")
337
  error_message = f"Document '{doc_id}' text extraction failed: {e}".encode(
338
  "utf-8")
339
+ return {"doc_id": doc_id, "content": error_message, "failed": True}
340
 
341
  convert_tasks = await asyncio.gather(*[_process_single_document(doc.document, doc.url) for doc in req.documents], return_exceptions=False)
342
 
343
  zip_buffer = io.BytesIO()
344
  with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
345
  for task in convert_tasks:
346
+ failed = "failed" in task
347
  doc_id = task["doc_id"]
348
+ safe_filename = f"failed_{doc_id}.txt" if failed else f"{doc_id}.txt"
349
  zip_file.writestr(safe_filename, task["content"])
350
 
351
  zip_buffer.seek(0)