Spaces:
Sleeping
Sleeping
Rivalcoder
committed on
Commit
·
45a9a23
1
Parent(s):
09c633c
Add application file
Browse files- Dockerfile +9 -1
- app.py +30 -2
- requirements.txt +2 -0
Dockerfile
CHANGED
@@ -1,16 +1,24 @@
|
|
1 |
FROM python:3.9
|
2 |
|
3 |
-
|
|
|
|
|
|
|
4 |
|
|
|
5 |
RUN useradd -m -u 1000 user
|
6 |
USER user
|
7 |
ENV PATH="/home/user/.local/bin:$PATH"
|
8 |
|
|
|
9 |
WORKDIR /app
|
10 |
|
|
|
11 |
COPY --chown=user ./requirements.txt requirements.txt
|
12 |
RUN pip install --no-cache-dir --upgrade -r requirements.txt
|
13 |
|
|
|
14 |
COPY --chown=user . /app
|
15 |
|
|
|
16 |
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
|
|
1 |
FROM python:3.9
|
2 |
|
3 |
+
# Install system dependencies (Tesseract for OCR)
|
4 |
+
RUN apt-get update && apt-get install -y \
|
5 |
+
tesseract-ocr \
|
6 |
+
&& rm -rf /var/lib/apt/lists/*
|
7 |
|
8 |
+
# Create and switch to a non-root user
|
9 |
RUN useradd -m -u 1000 user
|
10 |
USER user
|
11 |
ENV PATH="/home/user/.local/bin:$PATH"
|
12 |
|
13 |
+
# Set working directory
|
14 |
WORKDIR /app
|
15 |
|
16 |
+
# Copy and install Python dependencies
|
17 |
COPY --chown=user ./requirements.txt requirements.txt
|
18 |
RUN pip install --no-cache-dir --upgrade -r requirements.txt
|
19 |
|
20 |
+
# Copy application code
|
21 |
COPY --chown=user . /app
|
22 |
|
23 |
+
# Start the app with Uvicorn
|
24 |
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
app.py
CHANGED
@@ -5,6 +5,9 @@ import pytesseract
|
|
5 |
from PIL import Image
|
6 |
import io
|
7 |
|
|
|
|
|
|
|
8 |
app = FastAPI()
|
9 |
|
10 |
|
@@ -30,8 +33,9 @@ async def home():
|
|
30 |
<p>This API allows you to upload PDFs and extract text — including optional OCR for images.</p>
|
31 |
<h2>Available endpoints:</h2>
|
32 |
<ul>
|
33 |
-
<li><b>POST /extract-text</b> - Extract text from PDF pages.</li>
|
34 |
-
<li><b>POST /extract-text-ocr</b> - Extract text including OCR from
|
|
|
35 |
</ul>
|
36 |
<p>Use a tool like <a href="https://www.postman.com/" target="_blank">Postman</a> or write your own client to send PDF files to the endpoints.</p>
|
37 |
</div>
|
@@ -85,3 +89,27 @@ async def extract_text_ocr(file: UploadFile = File(...)):
|
|
85 |
|
86 |
except Exception as e:
|
87 |
return {"error": str(e)}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
from PIL import Image
|
6 |
import io
|
7 |
|
8 |
+
from pdfminer.high_level import extract_pages
|
9 |
+
from pdfminer.layout import LTTextContainer
|
10 |
+
|
11 |
app = FastAPI()
|
12 |
|
13 |
|
|
|
33 |
<p>This API allows you to upload PDFs and extract text — including optional OCR for images.</p>
|
34 |
<h2>Available endpoints:</h2>
|
35 |
<ul>
|
36 |
+
<li><b>POST /extract-text</b> - Extract plain text from PDF pages.</li>
|
37 |
+
<li><b>POST /extract-text-ocr</b> - Extract text including OCR from image-based PDFs.</li>
|
38 |
+
<li><b>POST /extract-text-structured</b> - Extract structured text using pdfminer.</li>
|
39 |
</ul>
|
40 |
<p>Use a tool like <a href="https://www.postman.com/" target="_blank">Postman</a> or write your own client to send PDF files to the endpoints.</p>
|
41 |
</div>
|
|
|
89 |
|
90 |
except Exception as e:
|
91 |
return {"error": str(e)}
|
92 |
+
|
93 |
+
|
94 |
+
@app.post("/extract-text-structured")
async def extract_text_structured(file: UploadFile = File(...)):
    """Extract text from an uploaded PDF, page by page, using pdfminer.

    The upload is written to a temporary ``.pdf`` file, which is passed to
    ``extract_pages`` by path. For every page a ``--- Page N ---`` header is
    emitted, followed by the text of each ``LTTextContainer`` element found
    in that page's layout.

    Args:
        file: The uploaded PDF file.

    Returns:
        ``{"filename": ..., "text": ...}`` on success, or
        ``{"error": <message>}`` if any step raises.
    """
    # Local imports keep this block self-contained.
    import os
    import tempfile

    temp_pdf_path = None
    try:
        contents = await file.read()

        # Save to a temp file so extract_pages can be given a path.
        # delete=False keeps the file after the handle is closed; it is
        # removed explicitly in the ``finally`` clause below.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            tmp_file.write(contents)
            temp_pdf_path = tmp_file.name

        # Collect fragments and join once instead of repeated string +=.
        parts = []
        for i, page_layout in enumerate(extract_pages(temp_pdf_path)):
            parts.append(f"\n\n--- Page {i + 1} ---\n\n")
            parts.extend(
                element.get_text()
                for element in page_layout
                if isinstance(element, LTTextContainer)
            )

        return {"filename": file.filename, "text": "".join(parts)}

    except Exception as e:
        return {"error": str(e)}

    finally:
        # Previously the temp file was never deleted, leaking one file per
        # request. Cleanup is best-effort: a failure to remove it must not
        # mask the response being returned.
        if temp_pdf_path is not None:
            try:
                os.remove(temp_pdf_path)
            except OSError:
                pass
|
requirements.txt
CHANGED
@@ -4,3 +4,5 @@ PyMuPDF
|
|
4 |
python-multipart
|
5 |
pytesseract
|
6 |
Pillow
|
|
|
|
|
|
4 |
python-multipart
|
5 |
pytesseract
|
6 |
Pillow
|
7 |
+
pdfminer.six
|
8 |
+
|