Spaces:
Sleeping
Sleeping
Rivalcoder
committed on
Commit
·
705ae48
1
Parent(s):
21cf7d6
Add application file
Browse files
- Dockerfile +2 -6
- app.py +33 -1
- requirements.txt +3 -0
Dockerfile
CHANGED
@@ -1,20 +1,16 @@
|
|
1 |
-
# Use Python base image
|
2 |
FROM python:3.9
|
3 |
|
4 |
-
|
|
|
5 |
RUN useradd -m -u 1000 user
|
6 |
USER user
|
7 |
ENV PATH="/home/user/.local/bin:$PATH"
|
8 |
|
9 |
-
# Set working directory
|
10 |
WORKDIR /app
|
11 |
|
12 |
-
# Install dependencies
|
13 |
COPY --chown=user ./requirements.txt requirements.txt
|
14 |
RUN pip install --no-cache-dir --upgrade -r requirements.txt
|
15 |
|
16 |
-
# Copy source code
|
17 |
COPY --chown=user . /app
|
18 |
|
19 |
-
# Run the API server
|
20 |
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
|
|
|
|
1 |
FROM python:3.9

# Install the system Tesseract OCR package (app.py imports pytesseract).
# The apt package lists are removed in the same layer so they are not
# baked into the final image.
RUN apt-get update \
    && apt-get install -y tesseract-ocr \
    && rm -rf /var/lib/apt/lists/*

# Create and switch to a non-root user (UID 1000); put that user's local
# bin directory on PATH.
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

# Set working directory
WORKDIR /app

# Install dependencies before copying the rest of the source so this layer
# can be reused when only application code changes.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Copy source code
COPY --chown=user . /app

# Run the API server
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
app.py
CHANGED
@@ -1,6 +1,8 @@
|
|
1 |
from fastapi import FastAPI, File, UploadFile
|
2 |
import fitz # PyMuPDF
|
3 |
-
import
|
|
|
|
|
4 |
|
5 |
app = FastAPI()
|
6 |
|
@@ -15,6 +17,36 @@ async def extract_text(file: UploadFile = File(...)):
|
|
15 |
extracted_text += f"\n\n--- Page {i + 1} ---\n\n" + page.get_text()
|
16 |
|
17 |
return {"filename": file.filename, "text": extracted_text}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
except Exception as e:
|
20 |
return {"error": str(e)}
|
|
|
1 |
from fastapi import FastAPI, File, UploadFile
|
2 |
import fitz # PyMuPDF
|
3 |
+
import pytesseract
|
4 |
+
from PIL import Image
|
5 |
+
import io
|
6 |
|
7 |
app = FastAPI()
|
8 |
|
|
|
17 |
extracted_text += f"\n\n--- Page {i + 1} ---\n\n" + page.get_text()
|
18 |
|
19 |
return {"filename": file.filename, "text": extracted_text}
|
20 |
+
except Exception as e:
|
21 |
+
return {"error": str(e)}
|
22 |
+
|
23 |
+
|
24 |
+
@app.post("/extract-text-ocr")
async def extract_text_ocr(file: UploadFile = File(...)):
    """Extract embedded text and OCR text from every page of an uploaded PDF.

    For each page the embedded text layer is read with ``page.get_text()``,
    then the page is rendered to an image and passed to
    ``pytesseract.image_to_string``. Both results are appended under a
    ``--- Page N ---`` header, with the OCR output following an
    ``[OCR Text]`` marker.

    Args:
        file: The uploaded file; its bytes are opened as a PDF.

    Returns:
        ``{"filename": ..., "text": ...}`` on success, or
        ``{"error": <message>}`` if any step raises.
    """
    try:
        contents = await file.read()

        sections = []
        # Open the document as a context manager so it is closed even when
        # rendering or OCR raises part-way through.
        with fitz.open(stream=contents, filetype="pdf") as doc:
            for i, page in enumerate(doc):
                # Normal text
                text = page.get_text()

                # Render page to an image
                # NOTE(review): rendered at get_pixmap()'s default resolution;
                # a higher DPI may improve OCR accuracy - confirm before changing.
                pix = page.get_pixmap()
                with Image.open(io.BytesIO(pix.tobytes())) as img:
                    # OCR text
                    ocr_text = pytesseract.image_to_string(img)

                sections.append(
                    f"\n\n--- Page {i + 1} ---\n\n"
                    f"{text}\n"
                    f"[OCR Text]\n{ocr_text}"
                )

        return {"filename": file.filename, "text": "".join(sections)}

    except Exception as e:
        # Failures are reported in the response body rather than raised.
        return {"error": str(e)}
|
requirements.txt
CHANGED
@@ -1,3 +1,6 @@
|
|
1 |
fastapi
|
2 |
uvicorn
|
3 |
PyMuPDF
|
|
|
|
|
|
|
|
1 |
fastapi
|
2 |
uvicorn
|
3 |
PyMuPDF
|
4 |
+
python-multipart
|
5 |
+
pytesseract
|
6 |
+
Pillow
|