Spaces:

Artemis-IA
/

docling_converter

Running

App Files Files Community

Artemis-IA committed on Jan 25

Commit

1c4b493

verified ·

1 Parent(s): 69b16ea

Update app.py

Browse files

Files changed (1) hide show

app.py +73 -119

app.py CHANGED Viewed

@@ -1,15 +1,9 @@
-import aiofiles
-import uvicorn
 import zipfile
-from fastapi import FastAPI, File, Query, UploadFile, HTTPException
-from fastapi.responses import FileResponse, JSONResponse
-from fastapi.middleware.cors import CORSMiddleware
-from enum import Enum
 from pathlib import Path
 from typing import List
 from PyPDF2 import PdfReader
-from easyocr import Reader
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionStatus
@@ -26,17 +20,6 @@ from docling.datamodel.pipeline_options import (
     OcrMacOptions,
 )
-# Définition de l'application FastAPI
-app = FastAPI()
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
 # Répertoires de sortie
 OUTPUT_DIR = Path("output")
 OUTPUT_DIR.mkdir(exist_ok=True)
@@ -47,26 +30,13 @@ FIGURES_DIR.mkdir(exist_ok=True)
 TABLES_DIR = OUTPUT_DIR / "tables"
 TABLES_DIR.mkdir(exist_ok=True)
-class ExportFormat(str, Enum):
-    json = "json"
-    yaml = "yaml"
-    md = "md"
-    multimodal = "multimodal"
-class OcrEngine(str, Enum):
-    easyocr = "easyocr"
-    tesseract_cli = "tesseract_cli"
-    tesserocr = "tesserocr"
-    rapidocr = "rapidocr"
-    ocrmac = "ocrmac"
 # Vérification de validité des fichiers PDF
 def is_valid_pdf(file_path):
     try:
         PdfReader(file_path)
         return True
     except Exception as e:
-        print(f"Fichier non valide ou corrompu : {file_path} - Erreur : {e}")
         return False
 # Fonction pour configurer le convertisseur de documents
@@ -75,9 +45,9 @@ def create_document_converter(
     export_figures: bool,
     export_tables: bool,
     accelerator: str,
-    ocr_engine: OcrEngine,
     table_mode: str,
-    ocr_languages: List[str]
 ) -> DocumentConverter:
     accelerator_options = AcceleratorOptions(
         num_threads=8,
@@ -89,16 +59,16 @@ def create_document_converter(
         do_cell_matching=True,
     )
-    # OCR avec EasyOCR ou autres moteurs
-    if ocr_engine == OcrEngine.easyocr:
         ocr_options = EasyOcrOptions(lang=ocr_languages)
-    elif ocr_engine == OcrEngine.tesseract_cli:
         ocr_options = TesseractCliOcrOptions(lang=ocr_languages)
-    elif ocr_engine == OcrEngine.tesserocr:
         ocr_options = TesseractOcrOptions(lang=ocr_languages)
-    elif ocr_engine == OcrEngine.rapidocr:
         ocr_options = RapidOcrOptions(lang=ocr_languages)
-    elif ocr_engine == OcrEngine.ocrmac:
         ocr_options = OcrMacOptions(lang=ocr_languages)
     else:
         raise ValueError(f"Moteur OCR non pris en charge : {ocr_engine}")
@@ -112,6 +82,7 @@ def create_document_converter(
         table_structure_options=table_structure_options,
         ocr_options=ocr_options,
     )
     return DocumentConverter(
         allowed_formats=[
             InputFormat.PDF,
@@ -120,84 +91,67 @@ def create_document_converter(
             InputFormat.HTML,
             InputFormat.IMAGE,
         ],
-        format_options={
-            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
-        },
     )
-# Endpoint pour traiter les fichiers
-@app.post("/process_files/")
-async def process_files(
-    files: List[UploadFile] = File(...),
-    use_ocr: bool = Query(False),
-    export_figures: bool = Query(True),
-    export_tables: bool = Query(True),
-    export_formats: List[ExportFormat] = Query(default=[ExportFormat.md]),
-    accelerator: str = Query("cpu"),
-    ocr_engine: OcrEngine = Query(OcrEngine.easyocr),
-    table_mode: str = Query(TableFormerMode.ACCURATE),
-    ocr_languages: List[str] = Query(default=["eng"]),
-):
-    input_paths = []
-    for f in files:
-        file_path = OUTPUT_DIR / f.filename
-        async with aiofiles.open(file_path, 'wb') as out_file:
-            content = await f.read()
-            await out_file.write(content)
-        print(f"Fichier reçu : {file_path} (taille : {file_path.stat().st_size} octets)")
-        if not is_valid_pdf(file_path):
-            return JSONResponse(
-                content={"error": f"Le fichier {file_path.name} n'est pas un PDF valide."}, status_code=400
-            )
-        input_paths.append(file_path)
-    converter = create_document_converter(
-        use_ocr,
-        export_figures,
-        export_tables,
-        accelerator,
-        ocr_engine,
-        table_mode,
-        ocr_languages,
-    )
-    conv_results = list(converter.convert_all(input_paths, raises_on_error=False))
-    success_count, failure_count = 0, 0
-    generated_files = []
-    for conv_res in conv_results:
-        if conv_res.status == ConversionStatus.SUCCESS:
-            print(f"Conversion réussie pour : {conv_res.input.file}")
-            success_count += 1
-            for export_format in export_formats:
-                if export_format == ExportFormat.md:
-                    output_file = OUTPUT_DIR / f"{conv_res.input.file.stem}.md"
-                    if conv_res.document.pages:
-                        with open(output_file, "w") as f:
-                            f.write("## Exemple de contenu Markdown généré\n")
-                        generated_files.append(output_file)
-                    else:
-                        print(f"Aucune donnée trouvée dans le document converti : {conv_res.input.file}")
-        else:
-            print(f"Échec de la conversion pour : {conv_res.input.file} - Statut : {conv_res.status}")
-            failure_count += 1
-    # Création du fichier ZIP
-    zip_filename = OUTPUT_DIR / "exported_files.zip"
-    with zipfile.ZipFile(zip_filename, "w") as zipf:
-        for file in generated_files:
-            zipf.write(file, file.name)
-    return {"success_count": success_count, "failure_count": failure_count, "zip_path": str(zip_filename)}
-@app.get("/download/{filename}")
-def download_file(filename: str):
-    file_path = OUTPUT_DIR / filename
-    if file_path.exists():
-        return FileResponse(path=file_path, filename=filename)
     else:
-        raise HTTPException(status_code=404, detail="Fichier non trouvé.")
-# Démarrer le serveur
-if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=8000)

+import os
 import zipfile
 from pathlib import Path
 from typing import List
 from PyPDF2 import PdfReader
+import streamlit as st
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionStatus
     OcrMacOptions,
 )
 # Répertoires de sortie
 OUTPUT_DIR = Path("output")
 OUTPUT_DIR.mkdir(exist_ok=True)
 TABLES_DIR = OUTPUT_DIR / "tables"
 TABLES_DIR.mkdir(exist_ok=True)
 # Vérification de validité des fichiers PDF
 def is_valid_pdf(file_path):
     """Report whether ``file_path`` can be opened by ``PdfReader``.

     Returns ``True`` when constructing the reader succeeds. Any exception
     raised while doing so is shown through ``st.error`` and results in
     ``False`` instead of propagating to the caller.
     """
     try:
         PdfReader(file_path)
     except Exception as exc:
         st.error(f"Fichier non valide ou corrompu : {file_path} - Erreur : {exc}")
         return False
     else:
         return True
 # Fonction pour configurer le convertisseur de documents
     export_figures: bool,
     export_tables: bool,
     accelerator: str,
+    ocr_engine: str,
     table_mode: str,
+    ocr_languages: List[str],
 ) -> DocumentConverter:
     accelerator_options = AcceleratorOptions(
         num_threads=8,
         do_cell_matching=True,
     )
+    # OCR avec EasyOCR
+    if ocr_engine == "easyocr":
         ocr_options = EasyOcrOptions(lang=ocr_languages)
+    elif ocr_engine == "tesseract_cli":
         ocr_options = TesseractCliOcrOptions(lang=ocr_languages)
+    elif ocr_engine == "tesserocr":
         ocr_options = TesseractOcrOptions(lang=ocr_languages)
+    elif ocr_engine == "rapidocr":
         ocr_options = RapidOcrOptions(lang=ocr_languages)
+    elif ocr_engine == "ocrmac":
         ocr_options = OcrMacOptions(lang=ocr_languages)
     else:
         raise ValueError(f"Moteur OCR non pris en charge : {ocr_engine}")
         table_structure_options=table_structure_options,
         ocr_options=ocr_options,
     )
     return DocumentConverter(
         allowed_formats=[
             InputFormat.PDF,
             InputFormat.HTML,
             InputFormat.IMAGE,
         ],
+        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)},
     )
# Streamlit interface: collect the conversion options, run the document
# converter on the uploaded PDFs and offer the generated Markdown files as a
# single ZIP download.
st.title("Conversion de documents PDF avec OCR")
st.subheader("Téléchargez un PDF pour commencer le traitement")

uploaded_files = st.file_uploader(
    "Sélectionnez vos fichiers PDF", accept_multiple_files=True, type=["pdf"]
)
use_ocr = st.checkbox("Activer l'OCR ?", value=True)
export_figures = st.checkbox("Exporter les figures ?", value=True)
export_tables = st.checkbox("Exporter les tableaux ?", value=True)
ocr_engine = st.selectbox(
    "Moteur OCR", ["easyocr", "tesseract_cli", "tesserocr", "rapidocr", "ocrmac"]
)

# The field's own example ("eng, fra") has a space after the comma, so each
# code is stripped and blank entries are dropped; an empty field falls back to
# the widget's default value.
raw_languages = st.text_input("Langues OCR (ex : eng, fra)", "eng")
ocr_languages = [code.strip() for code in raw_languages.split(",") if code.strip()]
if not ocr_languages:
    ocr_languages = ["eng"]

table_mode = st.selectbox("Mode des tableaux", ["ACCURATE", "FAST"])

if st.button("Convertir"):
    if not uploaded_files:
        st.error("Veuillez télécharger au moins un fichier PDF.")
    else:
        # Persist every upload under OUTPUT_DIR and keep only the valid PDFs.
        input_paths = []
        for uploaded_file in uploaded_files:
            # Keep only the final path component so directory parts in the
            # client-supplied name cannot redirect the write elsewhere.
            file_path = OUTPUT_DIR / Path(uploaded_file.name).name
            # getvalue() returns the whole buffer regardless of the current
            # read position of the uploaded stream.
            file_path.write_bytes(uploaded_file.getvalue())
            st.write(f"Fichier reçu : {file_path} (taille : {os.path.getsize(file_path)} octets)")
            if not is_valid_pdf(file_path):
                st.error(f"Le fichier {file_path.name} n'est pas un PDF valide.")
                continue
            input_paths.append(file_path)

        # Each rejected file has already been reported above; without at least
        # one valid input there is nothing to convert or to package.
        if input_paths:
            # Configure the document converter
            converter = create_document_converter(
                use_ocr,
                export_figures,
                export_tables,
                accelerator="cpu",
                ocr_engine=ocr_engine,
                table_mode=table_mode,
                ocr_languages=ocr_languages,
            )

            # Convert the files and write one Markdown file per success
            generated_files = []
            for conv_res in converter.convert_all(input_paths, raises_on_error=False):
                if conv_res.status == ConversionStatus.SUCCESS:
                    st.success(f"Conversion réussie pour : {conv_res.input.file}")
                    output_file = OUTPUT_DIR / f"{conv_res.input.file.stem}.md"
                    # NOTE(review): this writes a fixed placeholder heading, not
                    # the converted document's content — presumably it should
                    # be conv_res.document.export_to_markdown(); confirm.
                    # Explicit UTF-8 because the text contains non-ASCII
                    # characters and the platform default encoding may differ.
                    with open(output_file, "w", encoding="utf-8") as f:
                        f.write("## Exemple de contenu Markdown généré\n")
                    generated_files.append(output_file)
                else:
                    st.error(f"Échec de la conversion pour : {conv_res.input.file} - Statut : {conv_res.status}")

            # Build the ZIP archive
            zip_filename = OUTPUT_DIR / "exported_files.zip"
            with zipfile.ZipFile(zip_filename, "w") as zipf:
                for file in generated_files:
                    zipf.write(file, file.name)

            st.success("Conversion terminée !")
            # read_bytes() opens and closes the archive itself, so no file
            # handle is left open after the bytes are handed to the widget.
            st.download_button(
                "Télécharger le ZIP",
                data=zip_filename.read_bytes(),
                file_name="exported_files.zip",
            )