Lucas ARRIESSE
committed on
Commit
·
eb8cbe5
1
Parent(s):
6607a5c
Enable downloading multiple doc types
Browse files- api/docs.py +49 -27
api/docs.py
CHANGED
@@ -37,6 +37,7 @@ NSMAP = {
|
|
37 |
# Unfortunately needs to be kept to 1, as libreoffice isn't built to support parallel instances
|
38 |
CONVERSION_MUTEX = asyncio.Semaphore(1)
|
39 |
|
|
|
40 |
async def convert_file(contents: io.BytesIO, filename: str, input_ext: str, output_ext: str, filter: str = None) -> io.BytesIO:
|
41 |
"""
|
42 |
Converts the given file bytes using Libreoffice headless to the specified file type.
|
@@ -100,8 +101,8 @@ async def convert_file(contents: io.BytesIO, filename: str, input_ext: str, outp
|
|
100 |
return out_bytes
|
101 |
|
102 |
|
103 |
-
async def get_doc_archive(url: str, client: AsyncClient) ->
|
104 |
-
"""Récupère le docx depuis l'URL et le retourne
|
105 |
|
106 |
if not url.endswith("zip"):
|
107 |
raise ValueError("URL doit pointer vers un fichier ZIP")
|
@@ -115,23 +116,24 @@ async def get_doc_archive(url: str, client: AsyncClient) -> zipfile.ZipFile:
|
|
115 |
|
116 |
with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
|
117 |
# there should be a single file per archive
|
118 |
-
for
|
119 |
-
if
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
elif file_name.endswith(".pptx"):
|
127 |
-
in_bytes = io.BytesIO(zf.read())
|
128 |
|
129 |
-
raise ValueError("Aucun fichier
|
130 |
|
131 |
|
132 |
def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
|
133 |
"""
|
134 |
-
Applique les révisions des .docx avant de retourner le contenu
|
|
|
|
|
|
|
135 |
"""
|
136 |
|
137 |
try:
|
@@ -189,19 +191,38 @@ def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
|
|
189 |
|
190 |
|
191 |
async def doc_to_txt(doc_id: str, url: str, client: AsyncClient) -> str:
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
# Apply the docx_revisions if the
|
196 |
-
modified_bytes = apply_docx_revisions(docx_zip)
|
197 |
-
|
198 |
-
final_bytes = await convert_file(
|
199 |
-
modified_bytes, f"{doc_id}", "docx", "txt")
|
200 |
-
|
201 |
-
final_bytes_text = str(final_bytes.read(), encoding="utf-8")
|
202 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
203 |
txt_data = [line.strip()
|
204 |
-
for line in
|
205 |
|
206 |
return txt_data
|
207 |
|
@@ -315,15 +336,16 @@ async def download_tdocs(req: DocDownloadRequest, http_client: AsyncClient = Dep
|
|
315 |
f"Failed to process document '{doc_id}' from URL '{doc_url}': {e}")
|
316 |
error_message = f"Document '{doc_id}' text extraction failed: {e}".encode(
|
317 |
"utf-8")
|
318 |
-
return {"doc_id": doc_id, "content": error_message}
|
319 |
|
320 |
convert_tasks = await asyncio.gather(*[_process_single_document(doc.document, doc.url) for doc in req.documents], return_exceptions=False)
|
321 |
|
322 |
zip_buffer = io.BytesIO()
|
323 |
with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
|
324 |
for task in convert_tasks:
|
|
|
325 |
doc_id = task["doc_id"]
|
326 |
-
safe_filename = f"{doc_id}.txt"
|
327 |
zip_file.writestr(safe_filename, task["content"])
|
328 |
|
329 |
zip_buffer.seek(0)
|
|
|
37 |
# Unfortunately needs to be kept to 1, as libreoffice isn't built to support parallel instances
|
38 |
CONVERSION_MUTEX = asyncio.Semaphore(1)
|
39 |
|
40 |
+
|
41 |
async def convert_file(contents: io.BytesIO, filename: str, input_ext: str, output_ext: str, filter: str = None) -> io.BytesIO:
|
42 |
"""
|
43 |
Converts the given file bytes using Libreoffice headless to the specified file type.
|
|
|
101 |
return out_bytes
|
102 |
|
103 |
|
104 |
+
async def get_doc_archive(url: str, client: AsyncClient) -> tuple[str, str, io.BytesIO]:
|
105 |
+
"""Récupère le document depuis l'URL et retourne un tuple (nom, extension, contenu)"""
|
106 |
|
107 |
if not url.endswith("zip"):
|
108 |
raise ValueError("URL doit pointer vers un fichier ZIP")
|
|
|
116 |
|
117 |
with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
|
118 |
# there should be a single file per archive
|
119 |
+
for entry in zf.infolist():
|
120 |
+
if entry.is_dir():
|
121 |
+
continue
|
122 |
+
|
123 |
+
file_name = entry.filename
|
124 |
+
root, ext = os.path.splitext(file_name)
|
125 |
+
doc_bytes = zf.read(file_name)
|
126 |
+
return (root, ext.lower(), io.BytesIO(doc_bytes))
|
|
|
|
|
127 |
|
128 |
+
raise ValueError("Aucun fichier trouvé dans l'archive")
|
129 |
|
130 |
|
131 |
def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
|
132 |
"""
|
133 |
+
Applique les révisions des .docx avant de retourner le contenu.
|
134 |
+
|
135 |
+
Args:
|
136 |
+
docx_zip: Le document word sous forme de zip
|
137 |
"""
|
138 |
|
139 |
try:
|
|
|
191 |
|
192 |
|
193 |
async def doc_to_txt(doc_id: str, url: str, client: AsyncClient) -> str:
|
194 |
+
"""
|
195 |
+
Télécharge le TDoc spécifié et le convertit en texte.
|
196 |
+
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
197 |
|
198 |
+
# Grab the document archive
|
199 |
+
filename, ext, bytes = await get_doc_archive(url, client)
|
200 |
+
|
201 |
+
final_bytes: io.BytesIO = None
|
202 |
+
if ext == ".doc":
|
203 |
+
logging.debug(f"Converting {filename} .doc --> .docx")
|
204 |
+
docx_bytes = await convert_file(bytes, doc_id, "doc", "docx")
|
205 |
+
logging.debug(f"Converting {filename} .docx --> .txt")
|
206 |
+
final_bytes = await convert_file(docx_bytes, f"{doc_id}", "docx", "txt")
|
207 |
+
elif ext == ".docx":
|
208 |
+
logging.debug(f"Updating .docx revisions")
|
209 |
+
applied_revision = apply_docx_revisions(zipfile.ZipFile(bytes))
|
210 |
+
logging.debug(f"Converting {filename} .docx --> .txt")
|
211 |
+
final_bytes = await convert_file(applied_revision, f"{doc_id}", "docx", "txt")
|
212 |
+
elif ext == ".pdf":
|
213 |
+
logging.debug(f"Converting {filename} .pdf --> .txt")
|
214 |
+
final_bytes = await convert_file(bytes, doc_id, "pdf", "txt")
|
215 |
+
elif ext == ".pptx":
|
216 |
+
logging.debug(f"Converting {filename} .pptx --> .pdf")
|
217 |
+
pdf_bytes = await convert_file(bytes, doc_id, "pptx", "pdf")
|
218 |
+
logging.debug(f"Converting {filename} .pdf --> .txt")
|
219 |
+
final_bytes = await convert_file(pdf_bytes, doc_id, "pdf", "txt")
|
220 |
+
else:
|
221 |
+
raise Exception(f"Unsupported file type: {ext}, filename: {filename}")
|
222 |
+
|
223 |
+
text_from_bytes = str(final_bytes.read(), encoding="utf-8")
|
224 |
txt_data = [line.strip()
|
225 |
+
for line in text_from_bytes.splitlines() if line.strip()]
|
226 |
|
227 |
return txt_data
|
228 |
|
|
|
336 |
f"Failed to process document '{doc_id}' from URL '{doc_url}': {e}")
|
337 |
error_message = f"Document '{doc_id}' text extraction failed: {e}".encode(
|
338 |
"utf-8")
|
339 |
+
return {"doc_id": doc_id, "content": error_message, "failed": True}
|
340 |
|
341 |
convert_tasks = await asyncio.gather(*[_process_single_document(doc.document, doc.url) for doc in req.documents], return_exceptions=False)
|
342 |
|
343 |
zip_buffer = io.BytesIO()
|
344 |
with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
|
345 |
for task in convert_tasks:
|
346 |
+
failed = "failed" in task
|
347 |
doc_id = task["doc_id"]
|
348 |
+
safe_filename = f"failed_{doc_id}.txt" if failed else f"{doc_id}.txt"
|
349 |
zip_file.writestr(safe_filename, task["content"])
|
350 |
|
351 |
zip_buffer.seek(0)
|