prithivMLmods committed on
Commit
91b5b67
·
verified ·
1 Parent(s): 72868c0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -11
app.py CHANGED
@@ -24,6 +24,9 @@ from transformers import (
24
 
25
  from transformers.image_utils import load_image
26
 
 
 
 
27
  # Constants for text generation
28
  MAX_MAX_NEW_TOKENS = 2048
29
  DEFAULT_MAX_NEW_TOKENS = 1024
@@ -58,13 +61,15 @@ model_k = Qwen2VLForConditionalGeneration.from_pretrained(
58
  torch_dtype=torch.float16
59
  ).to(device).eval()
60
 
61
- # Load Imgscope-OCR-2B-0527
62
- MODEL_ID_Y = "prithivMLmods/Imgscope-OCR-2B-0527"
63
- processor_y = AutoProcessor.from_pretrained(MODEL_ID_Y, trust_remote_code=True)
64
- model_y = Qwen2VLForConditionalGeneration.from_pretrained(
65
  MODEL_ID_Y,
66
  trust_remote_code=True,
67
- torch_dtype=torch.float16
 
 
68
  ).to(device).eval()
69
 
70
 
@@ -108,8 +113,8 @@ def generate_image(model_name: str, text: str, image: Image.Image,
108
  elif model_name == "coreOCR-7B-050325-preview":
109
  processor = processor_k
110
  model = model_k
111
- elif model_name == "Imgscope-OCR-2B-0527":
112
- processor = processor_y
113
  model = model_y
114
  else:
115
  yield "Invalid model selected."
@@ -165,8 +170,8 @@ def generate_video(model_name: str, text: str, video_path: str,
165
  elif model_name == "coreOCR-7B-050325-preview":
166
  processor = processor_k
167
  model = model_k
168
- elif model_name == "Imgscope-OCR-2B-0527":
169
- processor = processor_y
170
  model = model_y
171
  else:
172
  yield "Invalid model selected."
@@ -269,7 +274,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
269
  with gr.Column():
270
  output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
271
  model_choice = gr.Radio(
272
- choices=["SkyCaptioner-V1", "SpaceThinker-3B", "coreOCR-7B-050325-preview", "Imgscope-OCR-2B-0527"],
273
  label="Select Model",
274
  value="SkyCaptioner-V1"
275
  )
@@ -277,7 +282,6 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
277
  gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/VisionScope-R2/discussions)")
278
  gr.Markdown("> [SkyCaptioner-V1](https://huggingface.co/Skywork/SkyCaptioner-V1): structural video captioning model designed to generate high-quality, structural descriptions for video data. It integrates specialized sub-expert models.")
279
  gr.Markdown("> [SpaceThinker-Qwen2.5VL-3B](https://huggingface.co/remyxai/SpaceThinker-Qwen2.5VL-3B): thinking/reasoning multimodal/vision-language model (VLM) trained to enhance spatial reasoning.")
280
- gr.Markdown("> [Imgscope-OCR-2B-0527](https://huggingface.co/prithivMLmods/Imgscope-OCR-2B-0527): fine-tuned version of qwen2-vl-2b-instruct, specifically optimized for messy handwriting recognition, document ocr, realistic handwritten ocr, and math problem solving with latex formatting.")
281
  gr.Markdown("> [coreOCR-7B-050325-preview](https://huggingface.co/prithivMLmods/coreOCR-7B-050325-preview): model is a fine-tuned version of qwen/qwen2-vl-7b, optimized for document-level optical character recognition (ocr), long-context vision-language understanding.")
282
 
283
  image_submit.click(
 
24
 
25
  from transformers.image_utils import load_image
26
 
27
+ import subprocess
28
+ subprocess.run('pip install flash-attn==2.6.3 --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
29
+
30
  # Constants for text generation
31
  MAX_MAX_NEW_TOKENS = 2048
32
  DEFAULT_MAX_NEW_TOKENS = 1024
 
61
  torch_dtype=torch.float16
62
  ).to(device).eval()
63
 
64
+ # Load llama-nemoretriever-colembed-1b-v1
65
+ MODEL_ID_Y = "nvidia/llama-nemoretriever-colembed-1b-v1"
66
+ #processor_y = AutoProcessor.from_pretrained(MODEL_ID_Y, trust_remote_code=True)
67
+ model_y = AutoModel.from_pretrained(
68
  MODEL_ID_Y,
69
  trust_remote_code=True,
70
+ torch_dtype=torch.float16,
71
+ attn_implementation="flash_attention_2",
72
+ revision='1f0fdea7f5b19532a750be109b19072d719b8177'
73
  ).to(device).eval()
74
 
75
 
 
113
  elif model_name == "coreOCR-7B-050325-preview":
114
  processor = processor_k
115
  model = model_k
116
+ elif model_name == "llama-nemoretriever-colembed-1b-v1":
117
+ #processor = processor_y
118
  model = model_y
119
  else:
120
  yield "Invalid model selected."
 
170
  elif model_name == "coreOCR-7B-050325-preview":
171
  processor = processor_k
172
  model = model_k
173
+ elif model_name == "llama-nemoretriever-colembed-1b-v1":
174
+ #processor = processor_y
175
  model = model_y
176
  else:
177
  yield "Invalid model selected."
 
274
  with gr.Column():
275
  output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
276
  model_choice = gr.Radio(
277
+ choices=["SkyCaptioner-V1", "SpaceThinker-3B", "coreOCR-7B-050325-preview", "llama-nemoretriever-colembed-1b-v1"],
278
  label="Select Model",
279
  value="SkyCaptioner-V1"
280
  )
 
282
  gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/VisionScope-R2/discussions)")
283
  gr.Markdown("> [SkyCaptioner-V1](https://huggingface.co/Skywork/SkyCaptioner-V1): structural video captioning model designed to generate high-quality, structural descriptions for video data. It integrates specialized sub-expert models.")
284
  gr.Markdown("> [SpaceThinker-Qwen2.5VL-3B](https://huggingface.co/remyxai/SpaceThinker-Qwen2.5VL-3B): thinking/reasoning multimodal/vision-language model (VLM) trained to enhance spatial reasoning.")
 
285
  gr.Markdown("> [coreOCR-7B-050325-preview](https://huggingface.co/prithivMLmods/coreOCR-7B-050325-preview): model is a fine-tuned version of qwen/qwen2-vl-7b, optimized for document-level optical character recognition (ocr), long-context vision-language understanding.")
286
 
287
  image_submit.click(