Spaces:

Rivalcoder
/

pdf-text-api

Sleeping

Rivalcoder committed 13 days ago

Commit

705ae48

1 Parent(s): 21cf7d6

Add application file

Files changed (3) hide show

Dockerfile CHANGED Viewed

@@ -1,20 +1,16 @@
-# Use Python base image
 FROM python:3.9
-# Create a user and set up environment
 RUN useradd -m -u 1000 user
 USER user
 ENV PATH="/home/user/.local/bin:$PATH"
-# Set working directory
 WORKDIR /app
-# Install dependencies
 COPY --chown=user ./requirements.txt requirements.txt
 RUN pip install --no-cache-dir --upgrade -r requirements.txt
-# Copy source code
 COPY --chown=user . /app
-# Run the API server
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

 FROM python:3.9
+# Install the Tesseract binary that pytesseract invokes. Skip recommended
+# packages and remove the apt package lists in the same layer so they do not
+# end up in the final image.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends tesseract-ocr \
+    && rm -rf /var/lib/apt/lists/*
+# Run as an unprivileged user (uid 1000) rather than root.
 RUN useradd -m -u 1000 user
 USER user
+# pip installs console scripts (e.g. uvicorn) into the user's local bin directory.
 ENV PATH="/home/user/.local/bin:$PATH"
 WORKDIR /app
+# Copy requirements before the sources so the dependency layer stays cached
+# when only application code changes.
 COPY --chown=user ./requirements.txt requirements.txt
 RUN pip install --no-cache-dir --upgrade -r requirements.txt
 COPY --chown=user . /app
+# Serve the FastAPI app on all interfaces, port 7860.
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

app.py CHANGED Viewed

@@ -1,6 +1,8 @@
 from fastapi import FastAPI, File, UploadFile
 import fitz  # PyMuPDF
-import uvicorn
 app = FastAPI()
@@ -15,6 +17,36 @@ async def extract_text(file: UploadFile = File(...)):
             extracted_text += f"\n\n--- Page {i + 1} ---\n\n" + page.get_text()
         return {"filename": file.filename, "text": extracted_text}
     except Exception as e:
         return {"error": str(e)}

 from fastapi import FastAPI, File, UploadFile
 import fitz  # PyMuPDF
+import pytesseract
+from PIL import Image
+import io
 app = FastAPI()
             extracted_text += f"\n\n--- Page {i + 1} ---\n\n" + page.get_text()
         return {"filename": file.filename, "text": extracted_text}
+    except Exception as e:
+        return {"error": str(e)}
+@app.post("/extract-text-ocr")
+async def extract_text_ocr(file: UploadFile = File(...)):
+    try:
+        contents = await file.read()
+        doc = fitz.open(stream=contents, filetype="pdf")
+        full_text = ""
+        for i in range(len(doc)):
+            page = doc.load_page(i)
+            # Normal text
+            text = page.get_text()
+            # Render page to an image
+            pix = page.get_pixmap()
+            img = Image.open(io.BytesIO(pix.tobytes()))
+            # OCR text
+            ocr_text = pytesseract.image_to_string(img)
+            full_text += f"\n\n--- Page {i + 1} ---\n\n"
+            full_text += text + "\n"
+            full_text += "[OCR Text]\n" + ocr_text
+        return {"filename": file.filename, "text": full_text}
     except Exception as e:
         return {"error": str(e)}

requirements.txt CHANGED Viewed

@@ -1,3 +1,6 @@
 fastapi
 uvicorn
 PyMuPDF

 fastapi
 uvicorn
 PyMuPDF
+python-multipart
+pytesseract
+Pillow