yongyeol committed · verified
Commit 06f6c9e · 1 Parent(s): 4ae4657

Update app.py

Files changed (1)
  1. app.py +28 -26
app.py CHANGED
@@ -1,70 +1,72 @@
- import os, sys, types, subprocess
  os.environ["HF_FORCE_SAFE_SERIALIZATION"] = "1"
- os.environ["XFORMERS_FORCE_DISABLE"] = "1"  # the import still happens → inject a fake module

- # ── ✨ insert a dummy xformers module ─────────────────────────
  dummy = types.ModuleType("xformers")
  dummy.ops = types.ModuleType("xformers.ops")  # audiocraft also looks for the ops submodule
  sys.modules["xformers"] = dummy
  sys.modules["xformers.ops"] = dummy.ops
- # ─────────────────────────────────────────────────────

- # ── dynamic install of audiocraft ──
  try:
      from audiocraft.models import MusicGen
  except ModuleNotFoundError:
      subprocess.check_call([
          sys.executable, "-m", "pip", "install",
-         "git+https://github.com/facebookresearch/audiocraft@main", "--no-deps"
      ])
      from audiocraft.models import MusicGen

-
  import gradio as gr
  from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
- from audiocraft.models import MusicGen
  from audiocraft.data.audio import audio_write
  from PIL import Image
  import torch
- import tempfile

- # ───── load the image-captioning model ─────
  caption_model = VisionEncoderDecoderModel.from_pretrained(
      "nlpconnect/vit-gpt2-image-captioning",
-     use_safetensors=True,   # ✅ correct option name
-     low_cpu_mem_usage=True  # (optional) saves memory
  )
  feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
  tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

- # ───── load the MusicGen model ─────
  musicgen = MusicGen.get_pretrained("facebook/musicgen-small")
- musicgen.set_generation_params(duration=10)  # length of the generated music (seconds)

- # ───── image → caption generation ─────
- def generate_caption(image):
      pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
      output_ids = caption_model.generate(pixel_values, max_length=50)
-     caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
-     return caption

- # ───── caption → music generation ─────
- def generate_music(prompt):
-     wav = musicgen.generate([prompt])  # batch size 1
      tmp_dir = tempfile.mkdtemp()
      audio_path = os.path.join(tmp_dir, "musicgen_output.wav")
      audio_write(audio_path, wav[0], musicgen.sample_rate, strategy="loudness")
      return audio_path

- # ───── wire up the full pipeline ─────
- def process(image):
      caption = generate_caption(image)
      prompt = f"A cheerful melody inspired by: {caption}"
      audio_path = generate_music(prompt)
      return caption, audio_path

- # ───── build the Gradio interface ─────
  demo = gr.Interface(
      fn=process,
      inputs=gr.Image(type="pil"),
@@ -72,8 +74,8 @@ demo = gr.Interface(
          gr.Text(label="AI-generated image description"),
          gr.Audio(label="Generated AI music (MusicGen)")
      ],
-     title="🎨 AI Picture Music Generator",
-     description="Upload a picture and the AI writes a description, then makes music from it and plays it."
  )

  if __name__ == "__main__":
 
+ import os
+ import sys
+ import types
+ import subprocess
+ import tempfile

+ # ── environment variables ──────────────────────────────────────
  os.environ["HF_FORCE_SAFE_SERIALIZATION"] = "1"
+ os.environ["XFORMERS_FORCE_DISABLE"] = "1"  # disable xformers

+ # ── ✨ insert a dummy xformers module ──────────────────────────
  dummy = types.ModuleType("xformers")
  dummy.ops = types.ModuleType("xformers.ops")  # audiocraft also looks for the ops submodule
  sys.modules["xformers"] = dummy
  sys.modules["xformers.ops"] = dummy.ops
+ # ───────────────────────────────────────────────────────────────

+ # ── dynamic install of audiocraft ──────────────────────────────
  try:
      from audiocraft.models import MusicGen
  except ModuleNotFoundError:
      subprocess.check_call([
          sys.executable, "-m", "pip", "install",
+         "git+https://github.com/facebookresearch/audiocraft@main",
+         "--use-pep517"  # PEP 517 build; dependencies are installed (no --no-deps)
      ])
      from audiocraft.models import MusicGen

  import gradio as gr
  from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
  from audiocraft.data.audio import audio_write
  from PIL import Image
  import torch

+ # ───── load the image-captioning model ─────────────────────────
  caption_model = VisionEncoderDecoderModel.from_pretrained(
      "nlpconnect/vit-gpt2-image-captioning",
+     use_safetensors=True,
+     low_cpu_mem_usage=True
  )
  feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
  tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

+ # ───── load the MusicGen model ─────────────────────────────────
  musicgen = MusicGen.get_pretrained("facebook/musicgen-small")
+ musicgen.set_generation_params(duration=10)  # length of the generated music (seconds)

+ # ───── image → caption generation ──────────────────────────────
+ def generate_caption(image: Image.Image) -> str:
      pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
      output_ids = caption_model.generate(pixel_values, max_length=50)
+     return tokenizer.decode(output_ids[0], skip_special_tokens=True)

+ # ───── caption → music generation ──────────────────────────────
+ def generate_music(prompt: str) -> str:
+     wav = musicgen.generate([prompt])  # batch size = 1
      tmp_dir = tempfile.mkdtemp()
      audio_path = os.path.join(tmp_dir, "musicgen_output.wav")
      audio_write(audio_path, wav[0], musicgen.sample_rate, strategy="loudness")
      return audio_path

+ # ───── full pipeline ───────────────────────────────────────────
+ def process(image: Image.Image):
      caption = generate_caption(image)
      prompt = f"A cheerful melody inspired by: {caption}"
      audio_path = generate_music(prompt)
      return caption, audio_path

+ # ───── Gradio interface ────────────────────────────────────────
  demo = gr.Interface(
      fn=process,
      inputs=gr.Image(type="pil"),
 
          gr.Text(label="AI-generated image description"),
          gr.Audio(label="Generated AI music (MusicGen)")
      ],
+     title="🎨 AI Picture-to-Music Generator",
+     description="Upload a picture and the AI writes a description, then generates music from it and plays it."
  )

  if __name__ == "__main__":
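A few notes on the code in this commit. The dummy-module block works because Python's import system checks sys.modules before searching for a real package, so registering placeholder module objects for "xformers" and "xformers.ops" before audiocraft is imported makes those imports succeed even though xformers is not installed. A minimal, generalized sketch of the same pattern (the helper name stub_module is ours, not part of the app):

    import sys
    import types

    def stub_module(name: str) -> types.ModuleType:
        # Register an empty placeholder so `import name` succeeds.
        # Attribute access on the stub still raises AttributeError, so this only
        # helps when the importing library never actually calls into the package.
        mod = types.ModuleType(name)
        sys.modules[name] = mod
        return mod

    xformers_stub = stub_module("xformers")
    xformers_stub.ops = stub_module("xformers.ops")  # expose ops as an attribute too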
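The try/except around the audiocraft import installs the package at runtime on first launch. A self-contained sketch of that idea (the helper name ensure_package is illustrative; importlib.invalidate_caches() is added here because a package installed after interpreter start-up may otherwise not be picked up by the import system):

    import importlib
    import subprocess
    import sys

    def ensure_package(import_name: str, pip_spec: str) -> None:
        # Import a module, installing its distribution first if it is missing.
        try:
            importlib.import_module(import_name)
        except ModuleNotFoundError:
            subprocess.check_call([sys.executable, "-m", "pip", "install", pip_spec])
            importlib.invalidate_caches()  # pick up the freshly installed files
            importlib.import_module(import_name)

    ensure_package("audiocraft", "git+https://github.com/facebookresearch/audiocraft@main")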
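The captioning half of the pipeline can be exercised on its own, independent of Gradio and MusicGen. A small sketch using the same nlpconnect/vit-gpt2-image-captioning checkpoint as the app (the file name example.jpg is a placeholder):

    from PIL import Image
    from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer

    model_id = "nlpconnect/vit-gpt2-image-captioning"
    caption_model = VisionEncoderDecoderModel.from_pretrained(model_id, use_safetensors=True)
    feature_extractor = ViTImageProcessor.from_pretrained(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    image = Image.open("example.jpg").convert("RGB")  # placeholder image path
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
    output_ids = caption_model.generate(pixel_values, max_length=50)
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))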
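One caveat in generate_music, which this commit leaves unchanged: in current audiocraft releases, audio_write treats its first argument as a stem name and appends the file extension itself, so passing a path that already ends in .wav writes musicgen_output.wav.wav while the function returns a path that does not exist. A minimal sketch of a safer variant, assuming that audio_write behavior and a CPU-resident output tensor:

    import os
    import tempfile

    from audiocraft.models import MusicGen
    from audiocraft.data.audio import audio_write

    musicgen = MusicGen.get_pretrained("facebook/musicgen-small")
    musicgen.set_generation_params(duration=10)

    def generate_music(prompt: str) -> str:
        wav = musicgen.generate([prompt])                            # shape: (batch=1, channels, samples)
        stem = os.path.join(tempfile.mkdtemp(), "musicgen_output")   # no extension: audio_write adds it
        out_path = audio_write(stem, wav[0].cpu(), musicgen.sample_rate, strategy="loudness")
        return str(out_path)                                         # path of the file actually written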