Nanonets-OCR

Running

App Files Community

Toughen1 committed on Jul 13

Commit

bc062f5

verified ·

1 Parent(s): c97dcff

使用CPU

Browse files

Files changed (1) hide show

app.py +121 -70

app.py CHANGED Viewed

@@ -1,8 +1,7 @@
 import gradio as gr
-import spaces
 import torch
-from gradio_pdf import PDF
-from pdf2image import convert_from_path
 from PIL import Image
 from transformers import AutoModelForImageTextToText, AutoProcessor, AutoTokenizer
@@ -13,8 +12,7 @@ print("Loading Nanonets OCR model...")
 model = AutoModelForImageTextToText.from_pretrained(
     model_path,
     torch_dtype="auto",
-    device_map="auto",
-    attn_implementation="flash_attention_2",
 )
 model.eval()
@@ -23,7 +21,6 @@ processor = AutoProcessor.from_pretrained(model_path)
 print("Model loaded successfully!")
-@spaces.GPU()
 def ocr_image_gradio(image, max_tokens=4096):
     """Process image through Nanonets OCR model for Gradio interface"""
     if image is None:
@@ -70,30 +67,24 @@ def ocr_image_gradio(image, max_tokens=4096):
     return output_text[0]
-@spaces.GPU()
-def ocr_pdf_gradio(pdf_path, max_tokens=4096, progress=gr.Progress()):
-    """Process each page of a PDF through Nanonets OCR model
-
-    Args:
-        pdf_path: Path of the uploaded PDF; ``None`` when nothing was uploaded.
-        max_tokens: Per-page token budget forwarded to ``ocr_image_gradio``.
-        progress: Gradio progress tracker, called with a fraction and a
-            description as each page is handled.
-
-    Returns:
-        The per-page OCR results joined into one string, each page preceded
-        by a ``--- PAGE n ---`` header, or a prompt to upload a file when
-        ``pdf_path`` is ``None``.
-    """
-    if pdf_path is None:
-        return "Please upload a PDF file."
-    # Convert PDF to images
-    progress(0, desc="Converting PDF to images...")
-    pdf_images = convert_from_path(pdf_path)
-    # Process each page
-    all_text = []
-    total_pages = len(pdf_images)
-    for i, image in enumerate(pdf_images):
-        # Progress is reported 1-based, so the last page reports 1.0
-        progress(
-            (i + 1) / total_pages, desc=f"Processing page {i + 1}/{total_pages}..."
-        )
-        # Reuse the single-image OCR path for every rendered page
-        page_text = ocr_image_gradio(image, max_tokens)
-        all_text.append(f"--- PAGE {i + 1} ---\n{page_text}\n")
-    # Combine results
-    combined_text = "\n".join(all_text)
-    return combined_text
 # Create Gradio interface
@@ -125,51 +116,55 @@ with gr.Blocks(title="Nanonets OCR Demo") as demo:
             with gr.Row():
                 with gr.Column(scale=1):
                     image_input = gr.Image(
-                        label="Upload Document Image", type="pil", height=400
                     )
                     image_max_tokens = gr.Slider(
                         minimum=1024,
                         maximum=8192,
                         value=4096,
                         step=512,
-                        label="Max Tokens",
-                        info="Maximum number of tokens to generate",
                     )
                     image_extract_btn = gr.Button(
-                        "Extract Text", variant="primary", size="lg"
                     )
                 with gr.Column(scale=2):
                     image_output_text = gr.Textbox(
-                        label="Extracted Text",
                         lines=20,
                         show_copy_button=True,
-                        placeholder="Extracted text will appear here...",
                     )
-        # PDF tab
-        with gr.TabItem("PDF OCR"):
             with gr.Row():
                 with gr.Column(scale=1):
-                    pdf_input = PDF(label="Upload PDF Document", height=400)
-                    pdf_max_tokens = gr.Slider(
                         minimum=1024,
                         maximum=8192,
                         value=4096,
                         step=512,
-                        label="Max Tokens per Page",
-                        info="Maximum number of tokens to generate for each page",
                     )
-                    pdf_extract_btn = gr.Button(
-                        "Extract PDF Text", variant="primary", size="lg"
                     )
                 with gr.Column(scale=2):
-                    pdf_output_text = gr.Textbox(
-                        label="Extracted Text (All Pages)",
                         lines=20,
                         show_copy_button=True,
-                        placeholder="Extracted text will appear here...",
                     )
     # Event handlers for Image tab
@@ -187,43 +182,99 @@ with gr.Blocks(title="Nanonets OCR Demo") as demo:
         show_progress=True,
     )
-    # Event handlers for PDF tab
-    pdf_extract_btn.click(
-        fn=ocr_pdf_gradio,
-        inputs=[pdf_input, pdf_max_tokens],
-        outputs=pdf_output_text,
         show_progress=True,
     )
     # Add model information section
-    with gr.Accordion("About Nanonets-OCR-s", open=False):
         gr.Markdown("""
         ## Nanonets-OCR-s
-        Nanonets-OCR-s is a powerful, state-of-the-art image-to-markdown OCR model that goes far beyond traditional text extraction.
-        It transforms documents into structured markdown with intelligent content recognition and semantic tagging, making it ideal
-        for downstream processing by Large Language Models (LLMs).
-        ### Key Features
-        - **LaTeX Equation Recognition**: Automatically converts mathematical equations and formulas into properly formatted LaTeX syntax.
-          It distinguishes between inline ($...$) and display ($$...$$) equations.
-        - **Intelligent Image Description**: Describes images within documents using structured `<img>` tags, making them digestible
-          for LLM processing. It can describe various image types, including logos, charts, graphs and so on, detailing their content,
-          style, and context.
-        - **Signature Detection & Isolation**: Identifies and isolates signatures from other text, outputting them within a `<signature>` tag.
-          This is crucial for processing legal and business documents.
-        - **Watermark Extraction**: Detects and extracts watermark text from documents, placing it within a `<watermark>` tag.
-        - **Smart Checkbox Handling**: Converts form checkboxes and radio buttons into standardized Unicode symbols (☐, ☑, ☒)
-          for consistent and reliable processing.
-        - **Complex Table Extraction**: Accurately extracts complex tables from documents and converts them into both markdown
-          and HTML table formats.
         """)
 if __name__ == "__main__":
-    # Enable the request queue, then start the app with ssr_mode turned off
-    demo.queue().launch(ssr_mode=False)

 import gradio as gr
+import base64
+import io
 import torch
 from PIL import Image
 from transformers import AutoModelForImageTextToText, AutoProcessor, AutoTokenizer
 model = AutoModelForImageTextToText.from_pretrained(
     model_path,
     torch_dtype="auto",
+    device_map="cpu",  # 使用CPU
 )
 model.eval()
 print("Model loaded successfully!")
 def ocr_image_gradio(image, max_tokens=4096):
     """Process image through Nanonets OCR model for Gradio interface"""
     if image is None:
     return output_text[0]
+def ocr_base64_image(base64_string, max_tokens=4096):
+    """Process base64 encoded image through Nanonets OCR model"""
+    if not base64_string or base64_string.strip() == "":
+        return "Please provide a valid base64 image string."
+    try:
+        # Remove data URL prefix if present
+        if "base64," in base64_string:
+            base64_string = base64_string.split("base64,")[1]
+        # Decode base64 to image
+        image_data = base64.b64decode(base64_string)
+        image = Image.open(io.BytesIO(image_data))
+        # Process image using existing OCR function
+        return ocr_image_gradio(image, max_tokens)
+    except Exception as e:
+        return f"Error processing base64 image: {str(e)}"
 # Create Gradio interface
             with gr.Row():
                 with gr.Column(scale=1):
                     image_input = gr.Image(
+                        label="上传文档图片", type="pil", height=400
                     )
                     image_max_tokens = gr.Slider(
                         minimum=1024,
                         maximum=8192,
                         value=4096,
                         step=512,
+                        label="最大Token数",
+                        info="生成的最大token数量",
                     )
                     image_extract_btn = gr.Button(
+                        "提取文本", variant="primary", size="lg"
                     )
                 with gr.Column(scale=2):
                     image_output_text = gr.Textbox(
+                        label="提取的文本",
                         lines=20,
                         show_copy_button=True,
+                        placeholder="提取的文本将显示在这里...",
                     )
+        # Base64 Image tab
+        with gr.TabItem("Base64图片OCR"):
             with gr.Row():
                 with gr.Column(scale=1):
+                    base64_input = gr.Textbox(
+                        label="输入Base64编码的图片",
+                        lines=10,
+                        placeholder="粘贴Base64编码的图片数据...",
+                    )
+                    base64_max_tokens = gr.Slider(
                         minimum=1024,
                         maximum=8192,
                         value=4096,
                         step=512,
+                        label="最大Token数",
+                        info="生成的最大token数量",
                     )
+                    base64_extract_btn = gr.Button(
+                        "提取文本", variant="primary", size="lg"
                     )
                 with gr.Column(scale=2):
+                    base64_output_text = gr.Textbox(
+                        label="提取的文本",
                         lines=20,
                         show_copy_button=True,
+                        placeholder="提取的文本将显示在这里...",
                     )
     # Event handlers for Image tab
         show_progress=True,
     )
+    # Event handlers for Base64 tab
+    base64_extract_btn.click(
+        fn=ocr_base64_image,
+        inputs=[base64_input, base64_max_tokens],
+        outputs=base64_output_text,
         show_progress=True,
     )
     # Add model information section
+    with gr.Accordion("关于 Nanonets-OCR-s", open=False):
         gr.Markdown("""
         ## Nanonets-OCR-s
+        Nanonets-OCR-s 是一个强大的最先进的图像到markdown的OCR模型，远超传统的文本提取功能。
+        它将文档转换为带有智能内容识别和语义标记的结构化markdown，非常适合大型语言模型(LLM)的下游处理。
+        ### 主要特点
+        - **LaTeX公式识别**：自动将数学公式转换为格式正确的LaTeX语法。
+          它区分内联($...$)和显示($$...$$)公式。
+        - **智能图像描述**：使用结构化的`<img>`标签描述文档中的图像，使它们易于LLM处理。
+          它可以描述各种图像类型，包括徽标、图表、图形等，详细说明它们的内容、风格和上下文。
+        - **签名检测与隔离**：识别并隔离签名与其他文本，将其输出在`<signature>`标签内。
+          这对处理法律和商业文件至关重要。
+        - **水印提取**：检测并提取文档中的水印文本，将其放在`<watermark>`标签内。
+        - **智能复选框处理**：将表单复选框和单选按钮转换为标准化的Unicode符号(☐, ☑, ☒)，
+          以实现一致可靠的处理。
+        - **复杂表格提取**：准确地从文档中提取复杂表格，并将它们转换为markdown和HTML表格格式。
+        """)
+    # API Usage Information
+    with gr.Accordion("API使用说明", open=True):
+        gr.Markdown("""
+        ## API使用方法
+        ### Base64图片识别API
+        您可以通过HTTP POST请求使用Base64图片识别API：
+        ```
+        curl -X POST "http://localhost:7860/api/predict" \\
+             -H "Content-Type: application/json" \\
+             -d '{
+                  "fn_index": 1,
+                  "data": [
+                    "YOUR_BASE64_STRING_HERE",
+                    4096
+                  ]
+                }'
+        ```
+        - `fn_index: 1` 对应Base64图片OCR功能
+        - 第一个参数是Base64编码的图片字符串
+        - 第二个参数是最大token数量
+        ### 普通图片上传API
+        ```
+        curl -X POST "http://localhost:7860/api/predict" \\
+             -H "Content-Type: application/json" \\
+             -d '{
+                  "fn_index": 0,
+                  "data": [
+                    "IMAGE_DATA_HERE",
+                    4096
+                  ]
+                }'
+        ```
+        - `fn_index: 0` 对应普通图片OCR功能
+        """)
+    # CPU Usage Warning
+    with gr.Accordion("CPU环境说明", open=True):
+        gr.Markdown("""
+        ## CPU环境性能说明
+        此应用程序当前运行在CPU环境下（2核16G），请注意：
+        - 处理大型图像可能需要更长时间
+        - 建议使用较小的图像以获得更快的响应速度
+        - 如果处理时间过长，可以考虑降低最大Token数
+        - 模型已针对CPU环境进行了优化配置
         """)
 if __name__ == "__main__":
+    import torch
+    # Report the thread count torch sees before it is capped below
+    print(f"使用设备: CPU - 可用线程数: {torch.get_num_threads()}")
+    # Set the thread count to optimize CPU performance
+    torch.set_num_threads(2)  # set to the 2 available cores
+    # Enable the request queue, then launch with a share link requested and
+    # the server bound to all network interfaces
+    demo.queue().launch(share=True, server_name="0.0.0.0")