Rivalcoder committed on
Commit
705ae48
·
1 Parent(s): 21cf7d6

Add application file

Browse files
Files changed (3) hide show
  1. Dockerfile +2 -6
  2. app.py +33 -1
  3. requirements.txt +3 -0
Dockerfile CHANGED
@@ -1,20 +1,16 @@
1
- # Use Python base image
2
  FROM python:3.9
3
 
4
- # Create a user and set up environment
 
5
  RUN useradd -m -u 1000 user
6
  USER user
7
  ENV PATH="/home/user/.local/bin:$PATH"
8
 
9
- # Set working directory
10
  WORKDIR /app
11
 
12
- # Install dependencies
13
  COPY --chown=user ./requirements.txt requirements.txt
14
  RUN pip install --no-cache-dir --upgrade -r requirements.txt
15
 
16
- # Copy source code
17
  COPY --chown=user . /app
18
 
19
- # Run the API server
20
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
# Base image: official CPython 3.9.
FROM python:3.9

# System dependency: the Tesseract OCR binary that pytesseract shells out to.
# The apt package index is removed in the same layer so it does not end up
# in the final image.
RUN apt-get update \
    && apt-get install -y tesseract-ocr \
    && rm -rf /var/lib/apt/lists/*

# Create an unprivileged user (UID 1000) and run everything below as that user.
RUN useradd -m -u 1000 user
USER user
# Put the user's local bin directory first on PATH so executables installed
# there (e.g. by pip for this user) are found.
ENV PATH="/home/user/.local/bin:$PATH"

# Working directory for the application.
WORKDIR /app

# Install Python dependencies first so this layer is cached independently of
# source-code changes.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Copy the application source.
COPY --chown=user . /app

# Start the API server on all interfaces, port 7860.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py CHANGED
@@ -1,6 +1,8 @@
1
  from fastapi import FastAPI, File, UploadFile
2
  import fitz # PyMuPDF
3
- import uvicorn
 
 
4
 
5
  app = FastAPI()
6
 
@@ -15,6 +17,36 @@ async def extract_text(file: UploadFile = File(...)):
15
  extracted_text += f"\n\n--- Page {i + 1} ---\n\n" + page.get_text()
16
 
17
  return {"filename": file.filename, "text": extracted_text}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  except Exception as e:
20
  return {"error": str(e)}
 
1
  from fastapi import FastAPI, File, UploadFile
2
  import fitz # PyMuPDF
3
+ import pytesseract
4
+ from PIL import Image
5
+ import io
6
 
7
  app = FastAPI()
8
 
 
17
  extracted_text += f"\n\n--- Page {i + 1} ---\n\n" + page.get_text()
18
 
19
  return {"filename": file.filename, "text": extracted_text}
20
+ except Exception as e:
21
+ return {"error": str(e)}
22
+
23
+
@app.post("/extract-text-ocr")
async def extract_text_ocr(file: UploadFile = File(...)):
    """Extract embedded text and OCR text from every page of an uploaded PDF.

    For each page the embedded text layer is read with PyMuPDF, then the page
    is rendered to an image and run through Tesseract. Both results are
    concatenated under a ``--- Page N ---`` header.

    Args:
        file: The uploaded PDF.

    Returns:
        ``{"filename": ..., "text": ...}`` on success, or
        ``{"error": "<message>"}`` if anything fails while reading, parsing,
        rendering or running OCR.
    """
    try:
        contents = await file.read()

        page_chunks = []
        # The context manager closes the document even if a page fails midway.
        with fitz.open(stream=contents, filetype="pdf") as doc:
            for page_number, page in enumerate(doc, start=1):
                # Text layer embedded in the PDF (empty for pure scans).
                text = page.get_text()

                # Render the page and hand the PNG bytes to PIL for OCR.
                # NOTE(review): the default render resolution is used; a
                # higher resolution may improve OCR accuracy - confirm.
                pix = page.get_pixmap()
                with Image.open(io.BytesIO(pix.tobytes("png"))) as img:
                    ocr_text = pytesseract.image_to_string(img)

                page_chunks.append(
                    f"\n\n--- Page {page_number} ---\n\n"
                    f"{text}\n"
                    f"[OCR Text]\n{ocr_text}"
                )

        return {"filename": file.filename, "text": "".join(page_chunks)}

    except Exception as e:
        # Best-effort endpoint: failures are reported in the response body
        # rather than raised, matching the existing response contract.
        return {"error": str(e)}
requirements.txt CHANGED
@@ -1,3 +1,6 @@
1
  fastapi
2
  uvicorn
3
  PyMuPDF
 
 
 
 
1
  fastapi
2
  uvicorn
3
  PyMuPDF
4
+ python-multipart
5
+ pytesseract
6
+ Pillow