Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -1,45 +1,37 @@
-import gradio as gr
 import torch
 from janus.janusflow.models import MultiModalityCausalLM, VLChatProcessor
 from PIL import Image
 from diffusers.models import AutoencoderKL
 import numpy as np
-import
+import gradio as gr  # Import gradio for UI

+# CUDA availability check
 cuda_device = 'cuda' if torch.cuda.is_available() else 'cpu'
+print(f"Using device: {cuda_device}")

-# Load model and processor
-model_path = "deepseek-ai/JanusFlow-1.3B"
+# Load model and processor (adjust path if needed)
+model_path = "deepseek-ai/JanusFlow-1.3B"  # You may need to change to your local path
 vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
 tokenizer = vl_chat_processor.tokenizer

 vl_gpt = MultiModalityCausalLM.from_pretrained(model_path)
 vl_gpt = vl_gpt.to(torch.bfloat16).to(cuda_device).eval()

-#
-vae = AutoencoderKL.from_pretrained("stabilityai/sdxl-vae")
+# Load VAE for image generation
+vae = AutoencoderKL.from_pretrained("stabilityai/sdxl-vae")  # You may need to change to your local path
 vae = vae.to(torch.bfloat16).to(cuda_device).eval()

-# Multimodal Understanding function
+# Multimodal Understanding function (modified for medical context)
 @torch.inference_mode()
-@spaces.GPU(duration=120)
 def multimodal_understanding(image, question, seed, top_p, temperature):
-    # Clear CUDA cache before generating
+    # Clear CUDA cache before generating to prevent memory leaks
     torch.cuda.empty_cache()

-    #
+    # Set seed for reproducibility
     torch.manual_seed(seed)
     np.random.seed(seed)
     torch.cuda.manual_seed(seed)

-    # Medical image preprocessing (this is a placeholder, implement based on your specific needs)
-    # NOTE: If input is DICOM or another medical format, add custom loading and preprocessing steps here
-    # Example: if input is DICOM:
-    # 1. load with pydicom.dcmread()
-    # 2. normalize pixel values based on windowing/leveling if necessary
-    # 3. convert to np.array
-    # else: if the input is a regular numpy array (e.g. png or jpg) no action is needed, image = image
-
     conversation = [
         {
             "role": "User",
@@ -48,15 +40,14 @@ def multimodal_understanding(image, question, seed, top_p, temperature):
         },
         {"role": "Assistant", "content": ""},
     ]

     pil_images = [Image.fromarray(image)]
     prepare_inputs = vl_chat_processor(
         conversations=conversation, images=pil_images, force_batchify=True
     ).to(cuda_device, dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float16)
-

     inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

     outputs = vl_gpt.language_model.generate(
         inputs_embeds=inputs_embeds,
         attention_mask=prepare_inputs.attention_mask,
@@ -69,14 +60,13 @@ def multimodal_understanding(image, question, seed, top_p, temperature):
         temperature=temperature,
         top_p=top_p,
     )

     answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)

     return answer

-
+# Image Generation Function (modified for medical context)
 @torch.inference_mode()
-@spaces.GPU(duration=120)
 def generate(
     input_ids,
     cfg_weight: float = 2.0,
@@ -158,8 +148,8 @@ def unpack(dec, width, height, parallel_size=5):
     return visual_img


+# Main image generation function
 @torch.inference_mode()
-@spaces.GPU(duration=120)
 def generate_image(prompt,
                    seed=None,
                    guidance=5,
@@ -185,80 +175,73 @@ def generate_image(prompt,
                    num_inference_steps=num_inference_steps)
     return [Image.fromarray(images[i]).resize((1024, 1024), Image.LANCZOS) for i in range(images.shape[0])]

-

 # Gradio interface
-with gr.Blocks() as demo:
-    gr.Markdown(value="# Medical Image
-
-    with gr.
-    [… several old interface lines are truncated in the diff view …]
-    gr.Markdown(value="# Medical Image Generation with Hugging Face Logo")
-
-    with gr.Row():
-        cfg_weight_input = gr.Slider(minimum=1, maximum=10, value=2, step=0.5, label="CFG Weight")
-        step_input = gr.Slider(minimum=1, maximum=50, value=30, step=1, label="Number of Inference Steps")
-
-    prompt_input = gr.Textbox(label="Generation Prompt (e.g., 'Generate a CT scan with the Hugging Face logo', 'Create an MRI scan showing the Hugging Face logo', 'Render a medical x-ray with the Hugging Face logo.')")
-    seed_input = gr.Number(label="Seed (Optional)", precision=0, value=12345)
-
-    generation_button = gr.Button("Generate Images")
-
-    image_output = gr.Gallery(label="Generated Images", columns=2, rows=2, height=300)
-
-    examples_t2i = gr.Examples(
-        label="Medical image generation examples with Hugging Face logo.",
-        examples=[
-            "Generate a CT scan with the Hugging Face logo clearly visible.",
-            "Create an MRI scan showing the Hugging Face logo embedded within the tissue.",
-            "Render a medical x-ray with the Hugging Face logo subtly visible in the background.",
-            "Generate an ultrasound image with a faint Hugging Face logo on the screen",
-        ],
-        inputs=prompt_input,
-    )
+with gr.Blocks(title="JanusFlow Medical Image Assistant") as demo:
+    gr.Markdown(value="# Medical Image Understanding and Generation")
+
+    with gr.Tab("Multimodal Understanding"):
+        with gr.Row():
+            image_input = gr.Image(label="Medical Image Input")
+            with gr.Column():
+                question_input = gr.Textbox(label="Medical Question")
+                und_seed_input = gr.Number(label="Seed", precision=0, value=42)
+                top_p = gr.Slider(minimum=0, maximum=1, value=0.95, step=0.05, label="Top P")
+                temperature = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.05, label="Temperature")
+
+        understanding_button = gr.Button("Analyze Image")
+        understanding_output = gr.Textbox(label="Analysis Response")
+
+        examples_understanding = gr.Examples(
+            label="Examples: Image Analysis",
+            examples=[
+                [
+                    "What are the visible structures in this ultrasound?",
+                    "./ultrasound.jpeg"
+                ],
+                [
+                    "Identify abnormalities in the image.",
+                    "./cardiac_ultrasound.jpeg"
+                ],
+                [
+                    "Describe the features and histological analysis in this image.",
+                    "./histology.jpeg"
+                ],
+            ],
+            inputs=[question_input, image_input],
+        )
+
+    with gr.Tab("Text-to-Image Generation"):
+        with gr.Row():
+            cfg_weight_input = gr.Slider(minimum=1, maximum=10, value=2, step=0.5, label="CFG Weight")
+            step_input = gr.Slider(minimum=1, maximum=50, value=30, step=1, label="Inference Steps")
+
+        prompt_input = gr.Textbox(label="Medical Image Generation Prompt")
+        seed_input = gr.Number(label="Seed (Optional)", precision=0, value=12345)
+        generation_button = gr.Button("Generate Medical Image")
+        image_output = gr.Gallery(label="Generated Images", columns=2, rows=2, height=300)
+
+        examples_t2i = gr.Examples(
+            label="Examples: Image Generation",
+            examples=[
+                "Generate a coronal view of a brain MRI with a tumor.",
+                "Create an X-ray image showing a fractured femur.",
+                "Create an image of Histology of Liver Cirrhosis.",
+            ],
+            inputs=prompt_input,
+        )

     understanding_button.click(
         multimodal_understanding,
         inputs=[image_input, question_input, und_seed_input, top_p, temperature],
         outputs=understanding_output
     )

     generation_button.click(
         fn=generate_image,
         inputs=[prompt_input, seed_input, cfg_weight_input, step_input],
         outputs=image_output
     )

-demo.launch(share=True
+demo.launch(share=True)
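
Note: the comments removed from multimodal_understanding described a DICOM preprocessing path (load with pydicom.dcmread(), apply windowing/leveling, convert to a NumPy array) that was never implemented. A minimal sketch of that idea, assuming pydicom is installed and using a hypothetical load_medical_image helper (not part of the committed app.py), could look like this:

# Hypothetical helper sketching the DICOM path described in the removed comments.
# Assumes pydicom is installed; PNG/JPEG inputs need no special handling.
import numpy as np
import pydicom
from PIL import Image

def load_medical_image(path, window_center=None, window_width=None):
    """Return an RGB uint8 array suitable for Image.fromarray / the chat processor."""
    if path.lower().endswith(".dcm"):
        ds = pydicom.dcmread(path)                    # 1. load with pydicom.dcmread()
        pixels = ds.pixel_array.astype(np.float32)
        if window_center is not None and window_width is not None:
            low = window_center - window_width / 2    # 2. window/level normalization
            high = window_center + window_width / 2
            pixels = np.clip(pixels, low, high)
        span = max(float(pixels.max() - pixels.min()), 1e-6)
        pixels = (pixels - pixels.min()) / span * 255.0
        array = pixels.astype(np.uint8)               # 3. convert to np.array (uint8)
        if array.ndim == 2:                           # grayscale -> 3-channel RGB
            array = np.stack([array] * 3, axis=-1)
        return array
    return np.array(Image.open(path).convert("RGB"))

The array returned by this helper could then be passed as the image argument of multimodal_understanding, which wraps it with Image.fromarray.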
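Note: this commit also removes the @spaces.GPU(duration=120) decorators from the GPU-bound functions. On Spaces that run on ZeroGPU hardware, CUDA work must happen inside a function decorated with @spaces.GPU; if that applies to this Space, the decorators would need to be restored, roughly as in this sketch (assumes the spaces package is available):

# Sketch only: restoring the ZeroGPU decorator removed by this commit.
import spaces
import torch

@torch.inference_mode()
@spaces.GPU(duration=120)  # request a GPU for up to 120 seconds per call
def multimodal_understanding(image, question, seed, top_p, temperature):
    ...  # body unchanged from app.py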