LoufAn committed on
Commit d6b6106 · 1 Parent(s): 3453c0f

Update app.py

Files changed (1)
  1. app.py +15 -63
app.py CHANGED
@@ -1,68 +1,20 @@
-import math
-import torch
-from transformers import AutoTokenizer, AutoModel, AutoProcessor
-import gradio as gr
-from PIL import Image
+import spaces
+from diffusers import DiffusionPipeline
 
-# === Distribute layers across multiple GPUs ===
-def split_model(model_path):
-    from transformers import AutoConfig
-    device_map = {}
-    world_size = torch.cuda.device_count()
-    print(f"world_size:{world_size}")
-    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
-    num_layers = config.llm_config.num_hidden_layers
-    num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
-    num_layers_per_gpu = [num_layers_per_gpu] * world_size
-    num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
-    layer_cnt = 0
-    for i, num_layer in enumerate(num_layers_per_gpu):
-        for _ in range(num_layer):
-            device_map[f'language_model.model.layers.{layer_cnt}'] = i
-            layer_cnt += 1
-    device_map['vision_model'] = 0
-    device_map['mlp1'] = 0
-    device_map['language_model.model.tok_embeddings'] = 0
-    device_map['language_model.model.embed_tokens'] = 0
-    device_map['language_model.output'] = 0
-    device_map['language_model.model.norm'] = 0
-    device_map['language_model.model.rotary_emb'] = 0
-    device_map['language_model.lm_head'] = 0
-    device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
-    return device_map
+model_id = "google/gemma-3-27b-it"
 
-# === Model path ===
-model_path = "OpenGVLab/InternVL3-14B"
-device_map = split_model(model_path)
+pipe = DiffusionPipeline.from_pretrained(
+    model_id,
+    device_map="auto"
+)
+pipe.to('cuda')
 
-# === Load model and processor ===
-model = AutoModel.from_pretrained(
-    model_path,
-    torch_dtype=torch.bfloat16,
-    low_cpu_mem_usage=True,
-    use_flash_attn=True,
-    trust_remote_code=True,
-    device_map=device_map
-).eval()
+@spaces.GPU
+def generate(prompt):
+    return pipe(prompt).images
 
-tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
-
-# === Inference function ===
-def infer(image: Image.Image, prompt: str):
-    inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda")
-    output = model.generate(**inputs, max_new_tokens=512)
-    answer = tokenizer.decode(output[0], skip_special_tokens=True)
-    return answer
-
-# === Gradio interface ===
 gr.Interface(
-    fn=infer,
-    inputs=[
-        gr.Image(type="pil", label="Upload Image"),
-        gr.Textbox(label="Your Prompt", placeholder="Ask a question about the image...")
-    ],
-    outputs="text",
-    title="InternVL3-14B Multimodal Demo",
-    description="Upload an image and ask a question. InternVL3-14B will answer using vision + language."
-).launch()
+    fn=generate,
+    inputs=gr.Text(),
+    outputs=gr.Gallery(),
+).launch()