leonarb committed
Commit 8be5494 · verified · 1 Parent(s): 0f8f93a

Update app.py

Files changed (1)
  app.py +66 -34
app.py CHANGED
@@ -1,46 +1,78 @@
- import gradio as gr
  import torch
+ import base64
+ from io import BytesIO
- import pypdfium2
  from PIL import Image
- from transformers import Qwen2VLProcessor, Qwen2VLImageProcessor, AutoTokenizer, Qwen2VLModel
-
- # Load model and processor
- model_name = "Qwen/Qwen-VL"  # You may replace with your preferred VL model
- image_processor = Qwen2VLImageProcessor.from_pretrained(model_name)
- tokenizer = AutoTokenizer.from_pretrained(model_name)
- processor = Qwen2VLProcessor(image_processor=image_processor, tokenizer=tokenizer)
- model = Qwen2VLModel.from_pretrained(
-     model_name,
-     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
- )
- model.eval()
+ import gradio as gr
+
+ from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
+ from olmocr.data.renderpdf import render_pdf_to_base64png
+ from olmocr.prompts import build_finetuning_prompt
+ from olmocr.prompts.anchor import get_anchor_text
+
+ # Load processor and model
+ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
+ model = Qwen2VLForConditionalGeneration.from_pretrained(
+     "allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16
+ ).eval()
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model.to(device)
+
+ def process_pdf(file, page=1):
+     # Path of the uploaded file (Gradio stores uploads in a temp directory)
+     file_path = file.name
+
+     # Render the selected PDF page to a base64-encoded PNG
+     image_base64 = render_pdf_to_base64png(file_path, page, target_longest_image_dim=1024)
+     main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
+
+     # Extract anchor text from the PDF and build the prompt
+     anchor_text = get_anchor_text(file_path, page, pdf_engine="pdfreport", target_length=4000)
+     prompt = build_finetuning_prompt(anchor_text)

- # Convert PDF to list of PIL images (one per page)
- def pdf_to_images(pdf_path):
-     pdf = pypdfium2.PdfDocument(pdf_path)
-     return [page.render().to_pil() for page in pdf]
+     # Construct the chat message pairing the prompt with the page image
+     messages = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "text", "text": prompt},
+                 {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
+             ],
+         }
+     ]

- # Generate text from each image using the vision-language model
- def process_pdf(pdf_file):
-     images = pdf_to_images(pdf_file.name)
-     results = []
+     # Tokenize inputs
+     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+     inputs = processor(text=[text], images=[main_image], return_tensors="pt", padding=True)
+     inputs = {k: v.to(device) for k, v in inputs.items()}

-     for image in images:
-         inputs = processor(images=image, return_tensors="pt").to(model.device)
-         with torch.no_grad():
-             outputs = model.generate(**inputs, max_new_tokens=256)
-         text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
-         results.append(text.strip())
+     # Run model
+     with torch.no_grad():
+         output = model.generate(
+             **inputs,
+             temperature=0.8,
+             max_new_tokens=256,
+             num_return_sequences=1,
+             do_sample=True,
+         )

-     return "\n\n".join(results)
+     # Decode only the newly generated tokens, skipping the echoed prompt
+     prompt_len = inputs["input_ids"].shape[1]
+     new_tokens = output[:, prompt_len:]
+     decoded = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
+     return decoded[0]

- # Gradio UI
- demo = gr.Interface(
+ # Gradio interface
+ iface = gr.Interface(
      fn=process_pdf,
-     inputs=gr.File(type="file", file_types=[".pdf"]),
+     inputs=[
+         gr.File(label="Upload PDF"),
+         gr.Number(value=1, precision=0, label="Page Number"),
+     ],
      outputs="text",
-     title="olmOCR PDF Processor"
+     title="olmOCR PDF Text Extractor",
+     description="Upload a PDF and select a page to extract text using the olmOCR model.",
  )

  if __name__ == "__main__":
-     demo.launch()
+     iface.launch()
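
To sanity-check the new pipeline outside the Gradio UI, a minimal smoke-test sketch could look like the following. It assumes app.py is importable from the working directory, the olmocr package and model weights are installed, and "sample.pdf" is a hypothetical local file standing in for a real upload; none of it is part of this commit.

# Minimal smoke test for the updated process_pdf (illustrative sketch only).
# Assumes a CUDA-capable torch install; on CPU the bfloat16 model will be slow.
from types import SimpleNamespace

from app import process_pdf  # importing app.py loads the model once

# gr.File hands process_pdf an object exposing .name; mimic that here
# with a stand-in pointing at a hypothetical local PDF.
upload = SimpleNamespace(name="sample.pdf")
print(process_pdf(upload, page=1))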