Rivalcoder committed on
Commit
45a9a23
·
1 Parent(s): 09c633c

Add application file

Browse files
Files changed (3) hide show
  1. Dockerfile +9 -1
  2. app.py +30 -2
  3. requirements.txt +2 -0
Dockerfile CHANGED
@@ -1,16 +1,24 @@
1
  FROM python:3.9
2
 
3
- RUN apt-get update && apt-get install -y tesseract-ocr
 
 
 
4
 
 
5
  RUN useradd -m -u 1000 user
6
  USER user
7
  ENV PATH="/home/user/.local/bin:$PATH"
8
 
 
9
  WORKDIR /app
10
 
 
11
  COPY --chown=user ./requirements.txt requirements.txt
12
  RUN pip install --no-cache-dir --upgrade -r requirements.txt
13
 
 
14
  COPY --chown=user . /app
15
 
 
16
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
FROM python:3.9

# Install system dependencies (Tesseract for OCR).
# --no-install-recommends keeps optional "recommended" packages out of the
# image. The English language data is listed explicitly so OCR keeps working
# however the distribution declares tesseract's dependencies.
# The apt package lists are removed in the same layer to keep it small.
RUN apt-get update && apt-get install -y --no-install-recommends \
        tesseract-ocr \
        tesseract-ocr-eng \
    && rm -rf /var/lib/apt/lists/*

# Create and switch to a non-root user
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

# Set working directory
WORKDIR /app

# Copy and install Python dependencies first, so this layer is rebuilt only
# when requirements.txt changes
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Copy application code
COPY --chown=user . /app

# Start the app with Uvicorn
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py CHANGED
@@ -5,6 +5,9 @@ import pytesseract
5
  from PIL import Image
6
  import io
7
 
 
 
 
8
  app = FastAPI()
9
 
10
 
@@ -30,8 +33,9 @@ async def home():
30
  <p>This API allows you to upload PDFs and extract text — including optional OCR for images.</p>
31
  <h2>Available endpoints:</h2>
32
  <ul>
33
- <li><b>POST /extract-text</b> - Extract text from PDF pages.</li>
34
- <li><b>POST /extract-text-ocr</b> - Extract text including OCR from images inside PDFs.</li>
 
35
  </ul>
36
  <p>Use a tool like <a href="https://www.postman.com/" target="_blank">Postman</a> or write your own client to send PDF files to the endpoints.</p>
37
  </div>
@@ -85,3 +89,27 @@ async def extract_text_ocr(file: UploadFile = File(...)):
85
 
86
  except Exception as e:
87
  return {"error": str(e)}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  from PIL import Image
6
  import io
7
 
8
+ from pdfminer.high_level import extract_pages
9
+ from pdfminer.layout import LTTextContainer
10
+
11
  app = FastAPI()
12
 
13
 
 
33
  <p>This API allows you to upload PDFs and extract text — including optional OCR for images.</p>
34
  <h2>Available endpoints:</h2>
35
  <ul>
36
+ <li><b>POST /extract-text</b> - Extract plain text from PDF pages.</li>
37
+ <li><b>POST /extract-text-ocr</b> - Extract text including OCR from image-based PDFs.</li>
38
+ <li><b>POST /extract-text-structured</b> - Extract structured text using pdfminer.</li>
39
  </ul>
40
  <p>Use a tool like <a href="https://www.postman.com/" target="_blank">Postman</a> or write your own client to send PDF files to the endpoints.</p>
41
  </div>
 
89
 
90
  except Exception as e:
91
  return {"error": str(e)}
92
+
93
+
94
@app.post("/extract-text-structured")
async def extract_text_structured(file: UploadFile = File(...)):
    """Extract text from an uploaded PDF page by page using pdfminer.

    The upload is written to a temporary ``.pdf`` file, which is passed to
    ``extract_pages``. For each page a ``--- Page N ---`` header is emitted,
    followed by the text of every ``LTTextContainer`` element on that page.

    Args:
        file: The uploaded PDF.

    Returns:
        ``{"filename": ..., "text": ...}`` on success, or
        ``{"error": "<message>"}`` if any step raises.
    """
    # Function-scope imports, in keeping with the original local
    # ``import tempfile``.
    import os
    import tempfile

    temp_pdf_path = None
    try:
        contents = await file.read()

        # Save to a temp file to use with extract_pages. delete=False lets the
        # file outlive the ``with`` block; it is removed in ``finally`` below.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            tmp_file.write(contents)
            temp_pdf_path = tmp_file.name

        # Collect pieces and join once instead of repeated string +=.
        parts = []
        for i, page_layout in enumerate(extract_pages(temp_pdf_path)):
            parts.append(f"\n\n--- Page {i + 1} ---\n\n")
            parts.extend(
                element.get_text()
                for element in page_layout
                if isinstance(element, LTTextContainer)
            )

        return {"filename": file.filename, "text": "".join(parts)}

    except Exception as e:
        return {"error": str(e)}

    finally:
        # Without this, every request would leave its temp PDF on disk.
        # Cleanup is best-effort: a failed delete must not mask the response.
        if temp_pdf_path is not None:
            try:
                os.remove(temp_pdf_path)
            except OSError:
                pass
requirements.txt CHANGED
@@ -4,3 +4,5 @@ PyMuPDF
4
  python-multipart
5
  pytesseract
6
  Pillow
 
 
 
4
  python-multipart
5
  pytesseract
6
  Pillow
7
+ pdfminer.six
8
+