Souvik3333 committed on
Commit
50060d5
·
verified ·
1 Parent(s): c36e3a0

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +173 -0
app.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from PIL import Image
3
+ from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText
4
+ import torch
5
+ import spaces
6
+
7
# Identifier of the OCR checkpoint passed to every from_pretrained() call below.
model_path = "nanonets/Nanonets-OCR-s"

# Load model once at startup: this runs at import time, so request handlers
# reuse the same module-level objects instead of reloading them per call.
print("Loading Nanonets OCR model...")
# .eval() is chained here; it returns the module itself, so `model` is the
# loaded network already switched to inference mode.
model = AutoModelForImageTextToText.from_pretrained(
    model_path,
    torch_dtype="auto",
    device_map="auto",
    attn_implementation="flash_attention_2",
).eval()

# Text tokenizer and multimodal processor for the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_path)
processor = AutoProcessor.from_pretrained(model_path)
print("Model loaded successfully!")
22
+
23
+
24
def process_tags(content: str) -> str:
    """Entity-escape the angle brackets of the model's semantic tags.

    The opening and closing forms of ``img``, ``watermark``, ``page_number``
    and ``signature`` have ``<`` / ``>`` rewritten as ``&lt;`` / ``&gt;``.
    Any other markup (for example HTML tables) is returned untouched.
    Presumably this keeps the tags visible as text in the Markdown output
    component instead of being interpreted as HTML -- confirm against the UI.

    Args:
        content: Raw text decoded from the model.

    Returns:
        ``content`` with the eight tag markers escaped.
    """
    for tag_name in ("img", "watermark", "page_number", "signature"):
        for inner in (tag_name, f"/{tag_name}"):
            content = content.replace(f"<{inner}>", f"&lt;{inner}&gt;")
    return content
35
+
36
@spaces.GPU()
def ocr_image_gradio(image, max_tokens=4096):
    """Run the Nanonets OCR model on one image and return text for the Gradio UI.

    Args:
        image: The uploaded document, either a ``PIL.Image.Image`` or an array
            accepted by ``Image.fromarray``. ``None`` means nothing was
            uploaded.
        max_tokens: Upper bound on the number of newly generated tokens. It is
            coerced to ``int`` before generation because a UI slider may hand
            the value over as a float.

    Returns:
        The decoded model output with its semantic tags escaped by
        ``process_tags``; or a short message when no image was supplied or
        when processing raised an exception (errors are reported as text
        rather than propagated, so the UI always gets a string).
    """
    if image is None:
        return "Please upload an image."

    try:
        prompt = """Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."""

        # Convert PIL image if needed
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)

        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt},
            ]},
        ]

        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt")
        inputs = inputs.to(model.device)

        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=int(max_tokens),
                do_sample=False,
            )

        # Each returned sequence starts with the prompt tokens; slice them off
        # so that only the newly generated continuation is decoded.
        generated_ids = [
            full_ids[len(prompt_ids):]
            for prompt_ids, full_ids in zip(inputs.input_ids, output_ids)
        ]

        output_text = processor.batch_decode(
            generated_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        return process_tags(output_text[0])

    except Exception as e:
        # Deliberate catch-all at the UI boundary: show the failure to the user
        # instead of crashing the request.
        return f"Error processing image: {str(e)}"
70
+
71
# Create Gradio interface
with gr.Blocks(title="Nanonets OCR Demo") as demo:
    # Styled HTML header (instead of plain markdown) that also carries the
    # links to the model card, release blog and GitHub repository.
    gr.HTML("""
        <div class="title" style="text-align: center">
            <h1>🔍 Nanonets OCR - Document Text Extraction</h1>
            <p style="font-size: 1.1em; color: #6b7280; margin-bottom: 0.6em;">
                A model for transforming documents into structured markdown with intelligent content recognition and semantic tagging
            </p>
            <div style="display: flex; justify-content: center; gap: 20px; margin: 15px 0;">
                <a href="https://huggingface.co/nanonets/Nanonets-OCR-s" target="_blank" style="text-decoration: none; color: #2563eb; font-weight: 500;">
                    📚 Hugging Face Model
                </a>
                <a href="https://nanonets.com/research/nanonets-ocr-s/" target="_blank" style="text-decoration: none; color: #2563eb; font-weight: 500;">
                    📝 Release Blog
                </a>
                <a href="https://github.com/NanoNets/docext" target="_blank" style="text-decoration: none; color: #2563eb; font-weight: 500;">
                    💻 GitHub Repository
                </a>
            </div>
        </div>
    """)

    with gr.Row():
        # Left column: inputs (document image, generation budget, trigger button).
        with gr.Column(scale=1):
            image_input = gr.Image(
                label="Upload Document Image",
                type="pil",
                height=400
            )
            max_tokens_slider = gr.Slider(
                minimum=1024,
                maximum=8192,
                value=4096,
                step=512,
                label="Max Tokens",
                info="Maximum number of tokens to generate"
            )
            extract_btn = gr.Button("Extract Text", variant="primary", size="lg")

        # Right column: rendered model output, with LaTeX delimiters enabled.
        with gr.Column(scale=2):
            output_text = gr.Markdown(
                label="Formatted model prediction",
                latex_delimiters=[
                    {"left": "$$", "right": "$$", "display": True},
                    {"left": "$", "right": "$", "display": False},
                    {
                        "left": "\\begin{align*}",
                        "right": "\\end{align*}",
                        "display": True,
                    },
                ],
                line_breaks=True,
                show_copy_button=True,
            )

    # Event handlers: OCR runs both on an explicit button click and whenever
    # the image component's value changes.
    extract_btn.click(
        fn=ocr_image_gradio,
        inputs=[image_input, max_tokens_slider],
        outputs=output_text,
        show_progress=True
    )

    image_input.change(
        fn=ocr_image_gradio,
        inputs=[image_input, max_tokens_slider],
        outputs=output_text,
        show_progress=True
    )

    # Add model information section
    with gr.Accordion("About Nanonets-OCR-s", open=False):
        gr.Markdown("""
        ## Nanonets-OCR-s

        Nanonets-OCR-s is a powerful, state-of-the-art image-to-markdown OCR model that goes far beyond traditional text extraction.
        It transforms documents into structured markdown with intelligent content recognition and semantic tagging, making it ideal
        for downstream processing by Large Language Models (LLMs).

        ### Key Features

        - **LaTeX Equation Recognition**: Automatically converts mathematical equations and formulas into properly formatted LaTeX syntax.
          It distinguishes between inline ($...$) and display ($$...$$) equations.

        - **Intelligent Image Description**: Describes images within documents using structured `<img>` tags, making them digestible
          for LLM processing. It can describe various image types, including logos, charts, graphs and so on, detailing their content,
          style, and context.

        - **Signature Detection & Isolation**: Identifies and isolates signatures from other text, outputting them within a `<signature>` tag.
          This is crucial for processing legal and business documents.

        - **Watermark Extraction**: Detects and extracts watermark text from documents, placing it within a `<watermark>` tag.

        - **Smart Checkbox Handling**: Converts form checkboxes and radio buttons into standardized Unicode symbols (☐, ☑, ☒)
          for consistent and reliable processing.

        - **Complex Table Extraction**: Accurately extracts complex tables from documents and converts them into both markdown
          and HTML table formats.
        """)

# Only start the server when run as a script; queue() is enabled before launch.
if __name__ == "__main__":
    demo.queue().launch()