Shouvik committed on
Commit
1b470fb
·
1 Parent(s): 8193074

pushing the changes..

Browse files
Files changed (3) hide show
  1. Dockerfile +23 -0
  2. app.py +113 -0
  3. requirements.txt +7 -0
Dockerfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Hugging Face Spaces: secure Dockerfile for a FastAPI app.

FROM python:3.10-slim

# Create an unprivileged account (UID 1000) and run all later steps as it
# rather than as root (important for HF Spaces security).
RUN useradd --create-home --uid 1000 user
USER user
# Make the user's ~/.local/bin visible on PATH.
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Copy only the requirements file first and install it as the non-root user.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Copy the remaining application files, owned by the non-root user.
COPY --chown=user . /app

# Port the FastAPI app listens on (7860 is the default for Spaces).
EXPOSE 7860

# Launch the FastAPI application object `app` from app.py with uvicorn.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from io import BytesIO
4
+ from PIL import Image
5
+ import torch
6
+ from fastapi import FastAPI, File, UploadFile, Form
7
+ from fastapi.responses import JSONResponse
8
+ from huggingface_hub import hf_hub_download
9
+ from transformers import (
10
+ AutoProcessor,
11
+ LayoutLMv3Model,
12
+ T5ForConditionalGeneration,
13
+ AutoTokenizer
14
+ )
15
+
16
app = FastAPI()

# ── 1) CONFIG & CHECKPOINT ────────────────────────────────────────────────
# Hub repository and file name of the fine-tuned checkpoint.
HF_REPO = "shouvik27/LayoutLMv3_T5"
CKPT_NAME = "pytorch_model.bin"

# Fetch the checkpoint file from the Hub and load it onto the CPU.
ckpt_path = hf_hub_download(repo_id=HF_REPO, filename=CKPT_NAME)
# SECURITY: the checkpoint is downloaded data and torch.load unpickles it.
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all that the state dicts read below should need.
ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=True)

# ── 2) BUILD MODELS ───────────────────────────────────────────────────────
# OCR is disabled: words and boxes are supplied by the caller at inference.
processor = AutoProcessor.from_pretrained(
    "microsoft/layoutlmv3-base", apply_ocr=False
)

# NOTE(review): strict=False silently ignores missing/unexpected keys, so a
# mismatched checkpoint would load without any error — confirm this is
# intentional.
layout_model = LayoutLMv3Model.from_pretrained("microsoft/layoutlmv3-base")
layout_model.load_state_dict(ckpt["layout_model"], strict=False)
layout_model.eval().to("cpu")

t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")
t5_model.load_state_dict(ckpt["t5_model"], strict=False)
t5_model.eval().to("cpu")

tokenizer = AutoTokenizer.from_pretrained("t5-small")

# Projection from 768-dim features to the T5 model dimension. Its weights are
# loaded strictly, so the layer layout must match the checkpoint exactly.
proj_state = ckpt["projection"]
projection = torch.nn.Sequential(
    torch.nn.Linear(768, t5_model.config.d_model),
    torch.nn.LayerNorm(t5_model.config.d_model),
    torch.nn.GELU(),
)
projection.load_state_dict(proj_state)
projection.eval().to("cpu")

# Make sure generation has a decoder start token. Compare against None
# explicitly: token id 0 is a valid id and must not be treated as "missing".
if t5_model.config.decoder_start_token_id is None:
    start_token_id = tokenizer.bos_token_id
    if start_token_id is None:
        start_token_id = tokenizer.pad_token_id
    t5_model.config.decoder_start_token_id = start_token_id
if t5_model.config.bos_token_id is None:
    t5_model.config.bos_token_id = t5_model.config.decoder_start_token_id
52
+
53
+ # ── 3) INFERENCE ─────────────────────────────────────────────────────────
54
def infer_from_files(image_file: UploadFile, json_file: UploadFile):
    """Run LayoutLMv3 -> projection -> T5 generation for one uploaded image.

    The NDJSON upload is scanned for the first object whose "img_name" equals
    the base name of the uploaded image; that object's "src_word_list" and
    "src_wordbox_list" are fed to the processor together with the image.

    Args:
        image_file: Uploaded image; its filename selects the NDJSON entry.
        json_file: Uploaded NDJSON file, one JSON object per line.

    Returns:
        {"result": <decoded text>} on success, or {"error": <message>} when
        the inputs cannot be used (no matching entry, malformed NDJSON line,
        missing keys in the entry, or unreadable image).
    """
    image_bytes = image_file.file.read()
    # os.path.basename(None) raises TypeError, so guard a missing filename.
    img_name = os.path.basename(image_file.filename or "")

    try:
        entry = _find_entry(json_file.file, img_name)
    except ValueError as exc:
        return {"error": str(exc)}
    if entry is None:
        return {"error": f"No JSON entry for: {img_name}"}

    try:
        words = entry["src_word_list"]
        boxes = entry["src_wordbox_list"]
    except KeyError as exc:
        return {"error": f"JSON entry for {img_name} is missing key: {exc}"}

    try:
        img = Image.open(BytesIO(image_bytes)).convert("RGB")
    except OSError as exc:
        # Pillow reports unreadable/unsupported image data as an OSError.
        return {"error": f"Cannot read image {img_name}: {exc}"}

    enc = processor(
        [img],
        [words],
        boxes=[boxes],
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    pixel_values = enc.pixel_values.to("cpu")
    input_ids = enc.input_ids.to("cpu")
    attention_mask = enc.attention_mask.to("cpu")
    bbox = enc.bbox.to("cpu")

    with torch.no_grad():
        out = layout_model(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            bbox=bbox,
        )
        # Keep only the first seq_len positions so the features line up with
        # attention_mask (presumably the text tokens; the encoder output may
        # be longer than input_ids — TODO confirm).
        seq_len = input_ids.size(1)
        text_feats = out.last_hidden_state[:, :seq_len, :]
        proj_feats = projection(text_feats)
        gen_ids = t5_model.generate(
            inputs_embeds=proj_feats,
            attention_mask=attention_mask,
            max_length=512,
            decoder_start_token_id=t5_model.config.decoder_start_token_id,
        )

    result = tokenizer.decode(gen_ids[0], skip_special_tokens=True)
    return {"result": result}


def _find_entry(ndjson_stream, img_name):
    """Return the first NDJSON object whose "img_name" equals *img_name*.

    Blank lines and lines that are valid JSON but not objects are skipped.
    Scanning stops at the first match. Returns None when nothing matches.

    Raises:
        ValueError: a non-blank line read before a match is not valid
            UTF-8 encoded JSON.
    """
    for line_no, raw_line in enumerate(ndjson_stream, start=1):
        if not raw_line.strip():
            continue
        try:
            obj = json.loads(raw_line.decode("utf-8"))
        except (UnicodeDecodeError, json.JSONDecodeError) as exc:
            raise ValueError(f"Invalid JSON on line {line_no}: {exc}") from exc
        if isinstance(obj, dict) and obj.get("img_name") == img_name:
            return obj
    return None
101
+
102
+ # ── 4) FASTAPI ENDPOINT ──────────────────────────────────────────────────
103
# POST /infer: accepts an image and an NDJSON file as multipart uploads and
# returns the dict produced by infer_from_files as a JSON response.
#
# Declared with plain `def` (not `async def`) on purpose: the inference call
# is blocking, CPU-bound work, and FastAPI runs non-async path operations in
# a worker threadpool instead of on the event loop.
#
# NOTE(review): {"error": ...} results are still sent with the default
# JSONResponse status code; confirm whether clients expect an error status.
@app.post("/infer")
def infer_api(
    image_file: UploadFile = File(..., description="The image file"),
    json_file: UploadFile = File(..., description="The NDJSON file"),
):
    output = infer_from_files(image_file, json_file)
    return JSONResponse(content=output)
110
+
111
# GET /: liveness probe that returns a fixed status message.
@app.get("/")
def healthcheck():
    status = {"message": "OCR FastAPI server is running."}
    return status
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ torch
4
+ transformers
5
+ huggingface_hub
6
+ Pillow
7
+ python-multipart