import os, sys, types, subprocess, tempfile
import torch, gradio as gr
from transformers import (
    VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
)
from PIL import Image

# ── Environment variables ────────────────────────────────────
os.environ["HF_FORCE_SAFE_SERIALIZATION"] = "1"
os.environ["XFORMERS_FORCE_DISABLE"] = "1"  # audiocraft-internal flag

# ── Dummy xformers module ────────────────────────────────────
# audiocraft imports xformers at load time; on machines without it,
# register a stub that routes attention through PyTorch's built-in
# scaled_dot_product_attention instead.
dummy = types.ModuleType("xformers")
dummy.__version__ = "0.0.0"
ops = types.ModuleType("xformers.ops")


class _FakeLowerTriangularMask:
    pass


def _fake_mem_eff_attn(q, k, v, attn_bias=None, p: float = 0.0, **_):
    # xformers uses a [batch, seq, heads, dim] layout, while SDPA expects
    # [batch, heads, seq, dim], so transpose on the way in and out.
    # A LowerTriangularMask attn_bias maps onto SDPA's is_causal flag.
    is_causal = isinstance(attn_bias, _FakeLowerTriangularMask)
    out = torch.nn.functional.scaled_dot_product_attention(
        q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2),
        dropout_p=p, is_causal=is_causal
    )
    return out.transpose(1, 2)


ops.memory_efficient_attention = _fake_mem_eff_attn
ops.LowerTriangularMask = _FakeLowerTriangularMask
dummy.ops = ops
sys.modules["xformers"] = dummy
sys.modules["xformers.ops"] = ops
# ─────────────────────────────────────────────────────────────

# ── Load audiocraft (already installed by postInstall) ───────
try:
    from audiocraft.models import MusicGen
    from audiocraft.data.audio import audio_write
except ModuleNotFoundError:
    # Fallback for exceptional local runs
    subprocess.check_call([
        sys.executable, "-m", "pip", "install",
        "git+https://github.com/facebookresearch/audiocraft@main",
        "--no-deps", "--use-pep517"
    ])
    from audiocraft.models import MusicGen
    from audiocraft.data.audio import audio_write

# ── Image-captioning model ───────────────────────────────────
caption_model = VisionEncoderDecoderModel.from_pretrained(
    "nlpconnect/vit-gpt2-image-captioning",
    use_safetensors=True, low_cpu_mem_usage=True
)
feature_extractor = ViTImageProcessor.from_pretrained(
    "nlpconnect/vit-gpt2-image-captioning"
)
tokenizer = AutoTokenizer.from_pretrained(
    "nlpconnect/vit-gpt2-image-captioning"
)

# ── MusicGen model ───────────────────────────────────────────
musicgen = MusicGen.get_pretrained("facebook/musicgen-small")
musicgen.set_generation_params(duration=10)  # 10 seconds of audio


# ── Pipeline functions ───────────────────────────────────────
def generate_caption(image: Image.Image) -> str:
    """Describe the image with the ViT-GPT2 captioning model."""
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
    ids = caption_model.generate(pixel_values, max_length=50)
    return tokenizer.decode(ids[0], skip_special_tokens=True)


def generate_music(prompt: str) -> str:
    """Generate a short clip with MusicGen and return the .wav path."""
    wav = musicgen.generate([prompt])  # shape: [batch, channels, samples]
    tmpdir = tempfile.mkdtemp()
    # audio_write takes a stem (no extension), appends ".wav" itself,
    # and returns the path of the file it actually wrote.
    stem = os.path.join(tmpdir, "musicgen")
    path = audio_write(stem, wav[0].cpu(), musicgen.sample_rate, strategy="loudness")
    return str(path)


def process(image: Image.Image):
    caption = generate_caption(image)
    path = generate_music(f"A cheerful melody inspired by: {caption}")
    return caption, path


# ── Gradio UI ────────────────────────────────────────────────
demo = gr.Interface(
    fn=process,
    inputs=gr.Image(type="pil"),
    outputs=[
        gr.Text(label="AI-generated image description"),
        gr.Audio(label="Generated AI music (MusicGen)")
    ],
    title="🎨 AI Image-to-Music Generator",
    description="Upload an image: the AI writes a description of it, then generates 10 seconds of music based on that description."
)

if __name__ == "__main__":
    demo.launch()
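
# ── Optional: headless smoke test ────────────────────────────
# A minimal sketch for exercising the pipeline without the Gradio UI.
# It only uses functions defined above; "sample.jpg" is a hypothetical
# local test image path, not part of the app.
#
#   from PIL import Image
#   img = Image.open("sample.jpg")          # hypothetical test image
#   caption, wav_path = process(img)        # caption + path to .wav clip
#   print(caption, wav_path)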