Spaces:

OrganizedProgrammers
/

Reqxtract-v2

Running

App Files Files Community

Lucas ARRIESSE committed 8 days ago

Commit

eb8cbe5

1 Parent(s): 6607a5c

Enable downloading multiple doc types

Browse files

Files changed (1) hide show

api/docs.py +49 -27

api/docs.py CHANGED Viewed

@@ -37,6 +37,7 @@ NSMAP = {
 # Unfortunately needs to be kept to 1, as libreoffice isn't built to support parallel instances
 CONVERSION_MUTEX = asyncio.Semaphore(1)
 async def convert_file(contents: io.BytesIO, filename: str, input_ext: str, output_ext: str, filter: str = None) -> io.BytesIO:
     """
     Converts the given file bytes using Libreoffice headless to the specified file type.
@@ -100,8 +101,8 @@ async def convert_file(contents: io.BytesIO, filename: str, input_ext: str, outp
         return out_bytes
-async def get_doc_archive(url: str, client: AsyncClient) -> zipfile.ZipFile:
-    """Récupère le docx depuis l'URL et le retourne comme objet ZipFile"""
     if not url.endswith("zip"):
         raise ValueError("URL doit pointer vers un fichier ZIP")
@@ -115,23 +116,24 @@ async def get_doc_archive(url: str, client: AsyncClient) -> zipfile.ZipFile:
     with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
         # there should be a single file per file
-        for file_name in zf.namelist():
-            if file_name.endswith(".docx"):
-                docx_bytes = zf.read(file_name)
-                return zipfile.ZipFile(io.BytesIO(docx_bytes))
-            elif file_name.endswith(".doc"):
-                in_bytes = io.BytesIO(zf.read(file_name))
-                docx_bytes = await convert_file(in_bytes, doc_id, "doc", "docx")
-                return zipfile.ZipFile(docx_bytes)
-            elif file_name.endswith(".pptx"):
-                in_bytes = io.BytesIO(zf.read())
-    raise ValueError("Aucun fichier docx/doc trouvé dans l'archive")
 def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
     """
-    Applique les révisions des .docx avant de retourner le contenu
     """
     try:
@@ -189,19 +191,38 @@ def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
 async def doc_to_txt(doc_id: str, url: str, client: AsyncClient) -> str:
-    # Grab the document archive
-    docx_zip = await get_doc_archive(url, client)
-    # Apply the docx_revisions if the
-    modified_bytes = apply_docx_revisions(docx_zip)
-    final_bytes = await convert_file(
-        modified_bytes, f"{doc_id}", "docx", "txt")
-    final_bytes_text = str(final_bytes.read(), encoding="utf-8")
     txt_data = [line.strip()
-                for line in final_bytes_text.splitlines() if line.strip()]
     return txt_data
@@ -315,15 +336,16 @@ async def download_tdocs(req: DocDownloadRequest, http_client: AsyncClient = Dep
                 f"Failed to process document '{doc_id}' from URL '{doc_url}': {e}")
             error_message = f"Document '{doc_id}' text extraction failed: {e}".encode(
                 "utf-8")
-            return {"doc_id": doc_id, "content": error_message}
     convert_tasks = await asyncio.gather(*[_process_single_document(doc.document, doc.url) for doc in req.documents], return_exceptions=False)
     zip_buffer = io.BytesIO()
     with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
         for task in convert_tasks:
             doc_id = task["doc_id"]
-            safe_filename = f"{doc_id}.txt"
             zip_file.writestr(safe_filename, task["content"])
     zip_buffer.seek(0)

 # Unfortunately needs to be kept to 1, as libreoffice isn't built to support parallel instances
 CONVERSION_MUTEX = asyncio.Semaphore(1)
 async def convert_file(contents: io.BytesIO, filename: str, input_ext: str, output_ext: str, filter: str = None) -> io.BytesIO:
     """
     Converts the given file bytes using Libreoffice headless to the specified file type.
         return out_bytes
+async def get_doc_archive(url: str, client: AsyncClient) -> tuple[str, str, io.BytesIO]:
+    """Récupère le docx depuis l'URL et le retourne un tuple (nom, extension, contenu)"""
     if not url.endswith("zip"):
         raise ValueError("URL doit pointer vers un fichier ZIP")
     with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
         # there should be a single file per file
+        for entry in zf.infolist():
+            if entry.is_dir():
+                continue
+            file_name = entry.filename
+            root, ext = os.path.splitext(file_name)
+            doc_bytes = zf.read(file_name)
+            return (root, ext.lower(), io.BytesIO(doc_bytes))
+    raise ValueError("Aucun fichier trouvé dans l'archive")
 def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
     """
+    Applique les révisions des .docx avant de retourner le contenu.
+    Args:
+        docx_zip: Le document word sous forme de zip
     """
     try:
 async def doc_to_txt(doc_id: str, url: str, client: AsyncClient) -> str:
+    """
+    Télécharge le TDoc spécifié et le convertit en texte.
+    """
+    # Grab the document archive
+    filename, ext, bytes = await get_doc_archive(url, client)
+    final_bytes: io.BytesIO = None
+    if ext == ".doc":
+        logging.debug(f"Converting {filename} .doc --> .docx")
+        docx_bytes = await convert_file(bytes, doc_id, "doc", "docx")
+        logging.debug(f"Converting {filename} .docx --> .txt")
+        final_bytes = await convert_file(docx_bytes, f"{doc_id}", "docx", "txt")
+    elif ext == ".docx":
+        logging.debug(f"Updating .docx revisions")
+        applied_revision = apply_docx_revisions(zipfile.ZipFile(bytes))
+        logging.debug(f"Converting {filename} .docx --> .txt")
+        final_bytes = await convert_file(applied_revision, f"{doc_id}", "docx", "txt")
+    elif ext == ".pdf":
+        logging.debug(f"Converting {filename} .pdf --> .txt")
+        final_bytes = await convert_file(bytes, doc_id, "pdf", "txt")
+    elif ext == ".pptx":
+        logging.debug(f"Converting {filename} .pptx --> .pdf")
+        pdf_bytes = await convert_file(bytes, doc_id, "pptx", "pdf")
+        logging.debug(f"Converting {filename} .pdf --> .txt")
+        final_bytes = await convert_file(pdf_bytes, doc_id, "pdf", "txt")
+    else:
+        raise Exception(f"Unsupported file type: {ext}, filename: {filename}")
+    text_from_bytes = str(final_bytes.read(), encoding="utf-8")
     txt_data = [line.strip()
+                for line in text_from_bytes.splitlines() if line.strip()]
     return txt_data
                 f"Failed to process document '{doc_id}' from URL '{doc_url}': {e}")
             error_message = f"Document '{doc_id}' text extraction failed: {e}".encode(
                 "utf-8")
+            return {"doc_id": doc_id, "content": error_message, "failed": True}
     convert_tasks = await asyncio.gather(*[_process_single_document(doc.document, doc.url) for doc in req.documents], return_exceptions=False)
     zip_buffer = io.BytesIO()
     with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
         for task in convert_tasks:
+            failed = "failed" in task
             doc_id = task["doc_id"]
+            safe_filename = f"failed_{doc_id}.txt" if failed else f"{doc_id}.txt"
             zip_file.writestr(safe_filename, task["content"])
     zip_buffer.seek(0)