Spaces:

Rivalcoder
/

pdf-text-api

Sleeping

App Files Files Community

Rivalcoder committed on 9 days ago

Commit

45a9a23

1 Parent(s): 09c633c

Add application file

Browse files

Files changed (3) hide show

Dockerfile +9 -1
app.py +30 -2
requirements.txt +2 -0

Dockerfile CHANGED Viewed

@@ -1,16 +1,24 @@
 FROM python:3.9
-RUN apt-get update && apt-get install -y tesseract-ocr
 RUN useradd -m -u 1000 user
 USER user
 ENV PATH="/home/user/.local/bin:$PATH"
 WORKDIR /app
 COPY --chown=user ./requirements.txt requirements.txt
 RUN pip install --no-cache-dir --upgrade -r requirements.txt
 COPY --chown=user . /app
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

 FROM python:3.9
+# Install system dependencies (Tesseract for OCR)
+RUN apt-get update && apt-get install -y \
+    tesseract-ocr \
+    && rm -rf /var/lib/apt/lists/*
+# Create and switch to a non-root user
 RUN useradd -m -u 1000 user
 USER user
 ENV PATH="/home/user/.local/bin:$PATH"
+# Set working directory
 WORKDIR /app
+# Copy and install Python dependencies
 COPY --chown=user ./requirements.txt requirements.txt
 RUN pip install --no-cache-dir --upgrade -r requirements.txt
+# Copy application code
 COPY --chown=user . /app
+# Start the app with Uvicorn
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

app.py CHANGED Viewed

@@ -5,6 +5,9 @@ import pytesseract
 from PIL import Image
 import io
 app = FastAPI()
@@ -30,8 +33,9 @@ async def home():
             <p>This API allows you to upload PDFs and extract text — including optional OCR for images.</p>
             <h2>Available endpoints:</h2>
             <ul>
-                <li><b>POST /extract-text</b> - Extract text from PDF pages.</li>
-                <li><b>POST /extract-text-ocr</b> - Extract text including OCR from images inside PDFs.</li>
             </ul>
             <p>Use a tool like <a href="https://www.postman.com/" target="_blank">Postman</a> or write your own client to send PDF files to the endpoints.</p>
         </div>
@@ -85,3 +89,27 @@ async def extract_text_ocr(file: UploadFile = File(...)):
     except Exception as e:
         return {"error": str(e)}

 from PIL import Image
 import io
+from pdfminer.high_level import extract_pages
+from pdfminer.layout import LTTextContainer
 app = FastAPI()
             <p>This API allows you to upload PDFs and extract text — including optional OCR for images.</p>
             <h2>Available endpoints:</h2>
             <ul>
+                <li><b>POST /extract-text</b> - Extract plain text from PDF pages.</li>
+                <li><b>POST /extract-text-ocr</b> - Extract text including OCR from image-based PDFs.</li>
+                <li><b>POST /extract-text-structured</b> - Extract structured text using pdfminer.</li>
             </ul>
             <p>Use a tool like <a href="https://www.postman.com/" target="_blank">Postman</a> or write your own client to send PDF files to the endpoints.</p>
         </div>
     except Exception as e:
         return {"error": str(e)}
+@app.post("/extract-text-structured")
+async def extract_text_structured(file: UploadFile = File(...)):
+    try:
+        contents = await file.read()
+        # Save to temp file to use with extract_pages
+        import tempfile
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
+            tmp_file.write(contents)
+            temp_pdf_path = tmp_file.name
+        structured_text = ""
+        for i, page_layout in enumerate(extract_pages(temp_pdf_path)):
+            structured_text += f"\n\n--- Page {i + 1} ---\n\n"
+            for element in page_layout:
+                if isinstance(element, LTTextContainer):
+                    structured_text += element.get_text()
+        return {"filename": file.filename, "text": structured_text}
+    except Exception as e:
+        return {"error": str(e)}

requirements.txt CHANGED Viewed

@@ -4,3 +4,5 @@ PyMuPDF
 python-multipart
 pytesseract
 Pillow

 python-multipart
 pytesseract
 Pillow
+pdfminer.six