JoannaKOKO committed
Commit e881a5a · verified · 1 Parent(s): e95d795

Update app.py

Files changed (1): app.py (+53 -66)
app.py CHANGED
@@ -1,11 +1,12 @@
  import streamlit as st
- import transformers
  from transformers import AutoProcessor, AutoModelForCausalLM, pipeline
  import torch
  from PIL import Image
  import io
  import numpy as np
- from kokoro import KokoroTTS # for text-to-speech
+ from kokoro import KPipeline # for text-to-speech
+ from IPython.display import display, Audio
+ import soundfile as sf

  # Load models
  # Image-to-Text model
@@ -16,40 +17,54 @@ caption_model = AutoModelForCausalLM.from_pretrained("Ertugrul/Qwen2-VL-7B-Capti
  story_generator = pipeline("text-generation", model="deepseek-ai/DeepSeek-R1-Distill-Qwen-14B")

  # Load the text-to-speech model
- tts_model = KokoroTTS("hexgrad/Kokoro-82M")
-
- def generate_story_audio(image_bytes):
-     """
-     Process an image to generate story audio.
-
-     Args:
-         image_bytes: Bytes of the uploaded image
-
-     Returns:
-         audio (numpy array): Audio waveform
-         sample_rate (int): Sample rate of the audio
-     """
-     # Convert bytes to PIL Image
-     image = Image.open(io.BytesIO(image_bytes))
-
-     # Step 1: Generate caption from image
-     inputs = processor(images=image, text="Generate a caption:", return_tensors="pt")
-     outputs = caption_model.generate(**inputs)
-     caption = processor.decode(outputs[0], skip_special_tokens=True)
-
-     # Step 2: Generate story from caption
-     prompt = f"Based on the description '{caption}', tell a short story for children aged 3 to 10 in no more than 100 words."
-     story_output = story_generator(prompt, max_length=150, num_return_sequences=1)
-     story = story_output[0]["generated_text"]
-
-     # Truncate to 100 words if necessary
-     story_words = story.split()
-     if len(story_words) > 100:
-         story = " ".join(story_words[:100])
-
-     # Step 3: Generate audio from story using Kokoro TTS
-     audio, sample_rate = tts_model.generate(story)
-     return audio, sample_rate
+
+ for i, (gs, ps, audio) in enumerate(audio_generator):
+     print(i)  # i => index
+     print(gs)  # gs => graphemes/text
+     print(ps)  # ps => phonemes
+     display(Audio(data=audio, rate=24000, autoplay=i==0))
+     sf.write(f'{i}.wav', audio, 24000)  # save each audio file
+
+
+ def generate_text(image_bytes):
+     # load image-to-text model
+     processor = AutoProcessor.from_pretrained("Ertugrul/Qwen2-VL-7B-Captioner-Relaxed")
+     caption_model = AutoModelForCausalLM.from_pretrained("Ertugrul/Qwen2-VL-7B-Captioner-Relaxed")
+
+     # Convert bytes to PIL Image
+     image = Image.open(io.BytesIO(image_bytes))
+
+     # Step 1: Generate text from image
+     inputs = processor(images=image, text="Generate a caption:", return_tensors="pt")
+     outputs = caption_model.generate(**inputs)
+     text = processor.decode(outputs[0], skip_special_tokens=True)
+     return text
+
+ def generate_story(text):
+     # load text-to-story model
+     story_generator = pipeline("text-generation", model="deepseek-ai/DeepSeek-R1-Distill-Qwen-14B")
+
+     # Step 2: Generate story from caption
+     prompt = f"Based on the description '{text}', tell a short story for children aged 3 to 10 in no more than 100 words."
+     story_output = story_generator(prompt, max_length=150, num_return_sequences=1)
+     story = story_output[0]["generated_text"]
+     return story
+
+
+ def generate_audio(story):
+     audio_pipeline = KPipeline(lang_code='a')
+     audio_generator = audio_pipeline(
+         story, voice='af_heart',  # <= change voice here
+         speed=1, split_pattern=r'\n+'
+     )
+     for i, (gs, ps, audio) in enumerate(audio_generator):
+         print(i)  # i => index
+         print(gs)  # gs => graphemes/text
+         print(ps)  # ps => phonemes
+         display(Audio(data=audio, rate=24000, autoplay=i==0))
+         sf.write(f'{i}.wav', audio, 24000)  # save each audio file
+

  # Streamlit UI
  st.title("Image to Story Audio Generator")
@@ -62,35 +77,7 @@ if uploaded_file is not None:
  st.image(image_bytes, caption="Uploaded Image", use_column_width=True)

  with st.spinner("Generating story audio..."):
-     audio, sample_rate = generate_story_audio(image_bytes)
-
-     # Save audio to a BytesIO object for Streamlit compatibility
-     audio_buffer = io.BytesIO()
-     # Simple WAV header for mono 32-bit float audio (minimal implementation)
-     def write_wav_header(buffer, data, sample_rate):
-         buffer.write(b'RIFF')
-         buffer.write((36 + len(data) * 4).to_bytes(4, 'little'))  # Chunk size
-         buffer.write(b'WAVE')
-         buffer.write(b'fmt ')
-         buffer.write((16).to_bytes(4, 'little'))  # Subchunk1 size
-         buffer.write((3).to_bytes(2, 'little'))  # Audio format (3 = IEEE float)
-         buffer.write((1).to_bytes(2, 'little'))  # Num channels (mono)
-         buffer.write(sample_rate.to_bytes(4, 'little'))  # Sample rate
-         buffer.write((sample_rate * 4).to_bytes(4, 'little'))  # Byte rate
-         buffer.write((4).to_bytes(2, 'little'))  # Block align
-         buffer.write((32).to_bytes(2, 'little'))  # Bits per sample
-         buffer.write(b'data')
-         buffer.write((len(data) * 4).to_bytes(4, 'little'))  # Data size
-         data.tofile(buffer)  # Write audio data
-
-     write_wav_header(audio_buffer, audio, sample_rate)
-     audio_buffer.seek(0)
-
-     # Provide audio playback and download
-     st.audio(audio_buffer, format="audio/wav")
-     st.download_button(
-         label="Download Story Audio",
-         data=audio_buffer,
-         file_name="story_audio.wav",
-         mime="audio/wav"
-     )
+     #audio, sample_rate = generate_story_audio(image_bytes)
+     text = generate_text(image_bytes)
+     story = generate_story(text)
+     generate_audio(story)
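As committed, the new helper functions pay a heavy price on every run: generate_text calls from_pretrained on the 7B captioner and generate_story rebuilds the 14B text-generation pipeline each time an image is uploaded, on top of the module-level processor, caption_model, and story_generator that are still created at import. A minimal sketch of the usual Streamlit remedy, caching the heavyweight objects once per process (load_models is a hypothetical helper, not part of this commit):

import streamlit as st
from transformers import AutoProcessor, AutoModelForCausalLM, pipeline

@st.cache_resource  # loaded once per server process, reused across reruns
def load_models():
    processor = AutoProcessor.from_pretrained("Ertugrul/Qwen2-VL-7B-Captioner-Relaxed")
    caption_model = AutoModelForCausalLM.from_pretrained("Ertugrul/Qwen2-VL-7B-Captioner-Relaxed")
    story_generator = pipeline("text-generation", model="deepseek-ai/DeepSeek-R1-Distill-Qwen-14B")
    return processor, caption_model, story_generator

processor, caption_model, story_generator = load_models()

The helper functions can then take the cached objects as arguments instead of reloading them on every call.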
 
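The captioning path may also not load at all as written: Qwen2-VL checkpoints such as Ertugrul/Qwen2-VL-7B-Captioner-Relaxed are vision-language models that are normally loaded through Qwen2VLForConditionalGeneration rather than AutoModelForCausalLM, and they expect a chat-formatted prompt containing an image placeholder rather than the bare "Generate a caption:" string. A hedged sketch of the common pattern (the prompt wording and max_new_tokens value are illustrative, not from the commit):

from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from PIL import Image

processor = AutoProcessor.from_pretrained("Ertugrul/Qwen2-VL-7B-Captioner-Relaxed")
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Ertugrul/Qwen2-VL-7B-Captioner-Relaxed", torch_dtype="auto"
)

def caption_image(image: Image.Image) -> str:
    # Chat-formatted prompt with an image placeholder, as Qwen2-VL expects
    messages = [{
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image."},
        ],
    }]
    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[prompt], images=[image], return_tensors="pt")
    output_ids = model.generate(**inputs, max_new_tokens=128)
    # Decode only the newly generated tokens, not the echoed prompt
    new_tokens = output_ids[:, inputs["input_ids"].shape[1]:]
    return processor.batch_decode(new_tokens, skip_special_tokens=True)[0]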
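generate_story likewise returns more than the story: with default settings the text-generation pipeline echoes the prompt inside generated_text (return_full_text defaults to True), and DeepSeek-R1 distills emit their reasoning in <think>...</think> tags before the answer, so the text handed to TTS would open with the instruction and the model's chain of thought. Note too that max_length=150 counts the prompt tokens, so max_new_tokens is usually the safer knob, and the 100-word truncation from the removed generate_story_audio was dropped in this commit. A sketch of the post-processing, assuming the output shapes above:

import re

def extract_story(prompt: str, generated: str, max_words: int = 100) -> str:
    # Drop the echoed prompt (or pass return_full_text=False to the pipeline instead)
    if generated.startswith(prompt):
        generated = generated[len(prompt):]
    # Strip DeepSeek-R1-style reasoning blocks before speaking the text
    generated = re.sub(r"<think>.*?</think>", "", generated, flags=re.DOTALL).strip()
    # Reinstate the 100-word cap from the old code
    return " ".join(generated.split()[:max_words])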
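The TTS section has two problems. The module-level for loop pasted just above generate_text runs at import time and iterates over audio_generator, a name only ever assigned inside generate_audio, so Streamlit raises a NameError before the UI renders; it looks like a stray copy of the loop inside generate_audio and can simply be deleted. Inside generate_audio itself, display(Audio(...)) is an IPython call that does nothing useful in a Streamlit process, and sf.write only leaves numbered .wav files on the server disk, with nothing returned to the caller. A sketch of a return-value version, keeping the commit's own assumption that KPipeline yields (graphemes, phonemes, audio) chunks at 24 kHz:

import numpy as np
from kokoro import KPipeline

def generate_audio(story):
    audio_pipeline = KPipeline(lang_code='a')
    audio_generator = audio_pipeline(
        story, voice='af_heart',
        speed=1, split_pattern=r'\n+'
    )
    chunks = []
    for gs, ps, audio in audio_generator:
        chunks.append(np.asarray(audio))  # one waveform per text segment
    # Stitch the segments into a single 24 kHz mono waveform
    return np.concatenate(chunks), 24000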
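Finally, the removed hand-rolled RIFF/WAV header was the only thing feeding st.audio, and the commit replaces it with nothing, so the app no longer plays or offers the generated audio. Since soundfile is now imported anyway, it can serialize a complete WAV (header plus samples) into memory in one call. A sketch of the spinner block, relying on the app's existing imports and assuming generate_audio returns (audio, sample_rate) as in the previous sketch:

with st.spinner("Generating story audio..."):
    text = generate_text(image_bytes)
    story = generate_story(text)
    audio, sample_rate = generate_audio(story)

    audio_buffer = io.BytesIO()
    sf.write(audio_buffer, audio, sample_rate, format="WAV")  # full WAV into the buffer
    audio_buffer.seek(0)

    st.audio(audio_buffer, format="audio/wav")
    st.download_button(
        label="Download Story Audio",
        data=audio_buffer,
        file_name="story_audio.wav",
        mime="audio/wav",
    )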