Spaces:

ZennyKenny
/

Novoyaz

Runtime error

App Files Files Community

ZennyKenny committed 20 days ago

Commit

aaaf2a4

verified ·

1 Parent(s): 77fd050

Create app.py

Browse files

Files changed (1) hide show

app.py +169 -0

app.py ADDED Viewed

	@@ -0,0 +1,169 @@

+# app.py
+import os
+import gradio as gr
+from PIL import Image
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor
+from qwen_vl_utils import process_vision_info
+# ---- 1) Load OCR model (dots.ocr) ----
+# Uses trust_remote_code per model card instructions.
+# Tip from model card: they sometimes recommend saving weights in a folder name without dots,
+# but loading by repo id works on Spaces with trust_remote_code.
+OCR_REPO = "rednote-hilab/dots.ocr"
+_HAS_CUDA = torch.cuda.is_available()
+# flash_attention_2 needs a CUDA device AND the optional `flash_attn` package.
+# Requesting it when the package is missing makes from_pretrained fail at start-up,
+# so probe for the package first and fall back to the eager implementation.
+_OCR_ATTN = "eager"
+if _HAS_CUDA:
+    try:
+        import flash_attn  # noqa: F401  (imported only to test availability)
+    except ImportError:
+        # Package not installed: keep the eager fallback chosen above.
+        pass
+    else:
+        _OCR_ATTN = "flash_attention_2"
+ocr_model = AutoModelForCausalLM.from_pretrained(
+    OCR_REPO,
+    # bfloat16 when a GPU is available, float32 otherwise.
+    torch_dtype=torch.bfloat16 if _HAS_CUDA else torch.float32,
+    device_map="auto",
+    attn_implementation=_OCR_ATTN,
+    trust_remote_code=True,
+)
+ocr_processor = AutoProcessor.from_pretrained(OCR_REPO, trust_remote_code=True)
+# We ask for raw OCR text only (no translation, reading order preserved).
+# The repo provides prompt presets under `dict_promptmode_to_prompt`;
+# "prompt_ocr" = text extraction only.
+try:
+    from dots_ocr.utils import dict_promptmode_to_prompt  # provided by the model repo
+    # NOTE(review): upstream appears to expose this as a plain dict rather than a
+    # function - confirm. Accept both shapes so that a dict is not discarded through
+    # a TypeError swallowed by the handler below.
+    _ocr_presets = (
+        dict_promptmode_to_prompt()
+        if callable(dict_promptmode_to_prompt)
+        else dict_promptmode_to_prompt
+    )
+    OCR_PROMPT = _ocr_presets["prompt_ocr"]
+except Exception:
+    # Deliberately broad: the helper package is optional and its import may fail in
+    # several ways (missing package, missing key, changed API). Any failure falls
+    # back to a hand-written prompt aligned with the model card's guidance.
+    OCR_PROMPT = (
+        "Extract the original text from this image as plain text. "
+        "Keep the reading order. Do not translate. Do not add extra formatting."
+    )
+# ---- 2) Conversion model: pre-reform orthography -> modern Russian ----
+CONVERT_REPO = "ZennyKenny/oss-20b-prereform-to-modern-ru-merged"
+# Tokenizer for the fine-tuned checkpoint (fast implementation requested).
+convert_tokenizer = AutoTokenizer.from_pretrained(
+    CONVERT_REPO,
+    use_fast=True,
+)
+# Both dtype and device placement are left to the library ("auto").
+convert_model = AutoModelForCausalLM.from_pretrained(
+    CONVERT_REPO,
+    torch_dtype="auto",
+    device_map="auto",
+)
+# System turn sent with every conversion request.
+SYSTEM_MSG = " ".join(
+    (
+        "You convert Russian text from pre-1918 orthography to modern Russian spelling.",
+        "Keep wording and punctuation; change only orthography.",
+    )
+)
+def run_ocr(pil_image: Image.Image) -> str:
+    """Transcribe the text in one image with the dots.ocr model.
+
+    Args:
+        pil_image: The image to transcribe.
+
+    Returns:
+        The decoded model output with the prompt tokens removed and
+        leading/trailing whitespace stripped.
+    """
+    # Build messages for dots.ocr: one image + one text prompt.
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": pil_image},
+                {"type": "text", "text": OCR_PROMPT},
+            ],
+        }
+    ]
+    # Prepare inputs (use the processor's chat template + vision utils).
+    text = ocr_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    image_inputs, video_inputs = process_vision_info(messages)
+    inputs = ocr_processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    )
+    # Follow the model's own placement instead of hard-coding "cuda", matching how
+    # prereform_to_modern targets convert_model.device. Calling .to() on the
+    # processor output also keeps its container type rather than rebuilding a dict.
+    inputs = inputs.to(ocr_model.device)
+    with torch.no_grad():
+        generated_ids = ocr_model.generate(**inputs, max_new_tokens=4096)
+    # generate() returns prompt + continuation; drop the prompt part of each row.
+    trimmed = [out[len(inp):] for inp, out in zip(inputs["input_ids"], generated_ids)]
+    output_text = ocr_processor.batch_decode(
+        trimmed,
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=False,
+    )[0]
+    # dots.ocr can sometimes return JSON/markdown for layout tasks; we asked for plain text.
+    # Light normalization only:
+    return output_text.strip()
+def prereform_to_modern(pre_reform_text: str) -> str:
+    """Convert pre-1918 Russian orthography to modern spelling with the fine-tuned model.
+
+    Args:
+        pre_reform_text: Text in pre-reform orthography.
+
+    Returns:
+        The model's reply with the prompt removed and whitespace stripped.
+        Blank input returns an empty string without invoking the model.
+    """
+    # Nothing to convert: avoid a generation pass on an empty user turn.
+    if not pre_reform_text or not pre_reform_text.strip():
+        return ""
+    # Compose system + user turns and rely on the tokenizer's chat template.
+    messages = [
+        {"role": "system", "content": SYSTEM_MSG},
+        {"role": "user", "content": pre_reform_text},
+    ]
+    prompt = convert_tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    inputs = convert_tokenizer([prompt], return_tensors="pt")
+    # Always place the inputs where the model lives; this is a no-op on CPU, so it
+    # does not need to be conditional on CUDA availability.
+    inputs = {k: v.to(convert_model.device) for k, v in inputs.items()}
+    with torch.no_grad():
+        # Greedy decoding. `temperature` is a sampling-only knob and 0.0 is not a
+        # valid sampling temperature, so it is omitted rather than passed alongside
+        # do_sample=False.
+        gen = convert_model.generate(
+            **inputs,
+            max_new_tokens=1024,
+            do_sample=False,
+            repetition_penalty=1.05,
+        )
+    # Drop the prompt portion to get pure assistant text (works for most chat templates).
+    prompt_len = inputs["input_ids"].shape[1]
+    generated = gen[0][prompt_len:]
+    text = convert_tokenizer.decode(generated, skip_special_tokens=True)
+    return text.strip()
+def transcribe_and_convert(pil_image: Image.Image):
+    """Run the full pipeline for the UI: OCR, then orthography conversion.
+
+    Args:
+        pil_image: The uploaded image, or None when nothing was uploaded.
+
+    Returns:
+        A 4-tuple ``(preview_image, ocr_text, modern_text, markdown)`` matching the
+        four output components wired to the button. When no image is given or no
+        text is recognised, the text fields are empty and the markdown slot carries
+        a short status message instead.
+    """
+    if pil_image is None:
+        return None, "", "", "Please upload an image."
+    # 1) OCR
+    ocr_text = run_ocr(pil_image)
+    if not ocr_text:
+        # Nothing was recognised: skip the conversion model, which would otherwise
+        # be asked to rewrite an empty string.
+        return pil_image, "", "", "No text was detected in the image."
+    # 2) Convert to modern Russian
+    modern_text = prereform_to_modern(ocr_text)
+    # 3) Markdown code block view
+    md = "```text\n" + modern_text + "\n```"
+    return pil_image, ocr_text, modern_text, md
+# ---------------- UI ----------------
+# Layout: a narrow left column with the upload widget and the action button, and a
+# wider right column with preview / OCR text / converted text side by side and the
+# markdown rendering underneath.
+with gr.Blocks(title="Pre-reform → Modern Russian OCR & Converter") as demo:
+    gr.Markdown(
+        value=(
+            "## Pre-reform → Modern Russian\n"
+            "Upload an image with pre-1918 Russian text → OCR with **dots.ocr** → convert to modern Russian with your fine-tuned model."
+        )
+    )
+    with gr.Row():
+        # Left: input side.
+        with gr.Column(scale=1):
+            image_in = gr.Image(
+                type="pil",
+                label="Upload image (pre-reform Russian)",
+            )
+            run_btn = gr.Button(
+                value="Transcribe & Convert",
+                variant="primary",
+            )
+            note = gr.Markdown(
+                value="Tip: high-res images OCR better. For PDFs, export a page as an image first."
+            )
+        # Right: results side.
+        with gr.Column(scale=2):
+            with gr.Row():
+                image_preview = gr.Image(
+                    label="Preview",
+                    interactive=False,
+                )
+                ocr_box = gr.Textbox(
+                    label="Transcribed (pre-reform)",
+                    lines=14,
+                )
+                modern_box = gr.Textbox(
+                    label="Modern Russian",
+                    lines=14,
+                )
+            md_block = gr.Markdown(label="Modern Russian (markdown code block)")
+    # The handler returns four values, one per output component, in this order.
+    run_btn.click(
+        fn=transcribe_and_convert,
+        inputs=[image_in],
+        outputs=[image_preview, ocr_box, modern_box, md_block],
+        api_name="transcribe_convert",
+    )
+    # Placeholder: sample image paths can be listed here later.
+    gr.Examples(
+        examples=[],
+        inputs=image_in,
+        label="Examples",
+    )
+# Queue at most 10 pending requests, then start the server.
+demo.queue(max_size=10)
+demo.launch()