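The script below generates a short two-speaker conversation with Sesame's CSM-1B model. Each speaker is seeded with a reference prompt (audio plus its transcript), and every generated turn is appended to the context for the turns that follow. Besides `torch`, `torchaudio`, and `huggingface_hub`, it needs the `generator` module (providing `load_csm_1b` and `Segment`) from the CSM codebase.
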
```python
import os

import torch
import torchaudio
from huggingface_hub import hf_hub_download

from generator import load_csm_1b, Segment

# Disable Triton compilation
os.environ["NO_TORCH_COMPILE"] = "1"

# Default prompts are available at https://hf.co/sesame/csm-1b
prompt_filepath_conversational_a = hf_hub_download(
    repo_id="sesame/csm-1b",
    filename="prompts/conversational_a.wav"
)
prompt_filepath_conversational_b = hf_hub_download(
    repo_id="sesame/csm-1b",
    filename="prompts/conversational_b.wav"
)
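
# hf_hub_download caches these prompt files locally and returns their paths.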

SPEAKER_PROMPTS = {
    "conversational_a": {
        "text": (
            "like revising for an exam I'd have to try and like keep up the momentum because I'd "
            "start really early I'd be like okay I'm gonna start revising now and then like "
            "you're revising for ages and then I just like start losing steam I didn't do that "
            "for the exam we had recently to be fair that was a more of a last minute scenario "
            "but like yeah I'm trying to like yeah I noticed this yesterday that like Mondays I "
            "sort of start the day with this not like a panic but like a"
        ),
        "audio": prompt_filepath_conversational_a
    },
    "conversational_b": {
        "text": (
            "like a super Mario level. Like it's very like high detail. And like, once you get "
            "into the park, it just like, everything looks like a computer game and they have all "
            "these, like, you know, if, if there's like a, you know, like in a Mario game, they "
            "will have like a question block. And if you like, you know, punch it, a coin will "
            "come out. So like everyone, when they come into the park, they get like this little "
            "bracelet and then you can go punching question blocks around."
        ),
        "audio": prompt_filepath_conversational_b
    }
}


def load_prompt_audio(audio_path: str, target_sample_rate: int) -> torch.Tensor:
    audio_tensor, sample_rate = torchaudio.load(audio_path)
    audio_tensor = audio_tensor.squeeze(0)
    # Resampling is a no-op when the rates already match, so it is safe to call unconditionally
    audio_tensor = torchaudio.functional.resample(
        audio_tensor, orig_freq=sample_rate, new_freq=target_sample_rate
    )
    return audio_tensor


def prepare_prompt(text: str, speaker: int, audio_path: str, sample_rate: int) -> Segment:
    audio_tensor = load_prompt_audio(audio_path, sample_rate)
    return Segment(text=text, speaker=speaker, audio=audio_tensor)


def main():
    # Select the best available device, skipping MPS due to float64 limitations
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    print(f"Using device: {device}")

    # Load model
    generator = load_csm_1b(device)

    # Prepare prompts
    prompt_a = prepare_prompt(
        SPEAKER_PROMPTS["conversational_a"]["text"],
        0,
        SPEAKER_PROMPTS["conversational_a"]["audio"],
        generator.sample_rate
    )
    prompt_b = prepare_prompt(
        SPEAKER_PROMPTS["conversational_b"]["text"],
        1,
        SPEAKER_PROMPTS["conversational_b"]["audio"],
        generator.sample_rate
    )

    # Generate conversation
    conversation = [
        {"text": "Hey how are you doing?", "speaker_id": 0},
        {"text": "Pretty good, pretty good. How about you?", "speaker_id": 1},
        {"text": "I'm great! So happy to be speaking with you today.", "speaker_id": 0},
        {"text": "Me too! This is some cool stuff, isn't it?", "speaker_id": 1}
    ]

    # Generate each utterance
    generated_segments = []
    prompt_segments = [prompt_a, prompt_b]
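
    # Each generate() call is conditioned on both speaker prompts plus every
    # turn generated so far, which keeps the two voices consistent across turns.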
    for utterance in conversation:
        print(f"Generating: {utterance['text']}")
        audio_tensor = generator.generate(
            text=utterance['text'],
            speaker=utterance['speaker_id'],
            context=prompt_segments + generated_segments,
            max_audio_length_ms=10_000,
        )
        generated_segments.append(
            Segment(text=utterance['text'], speaker=utterance['speaker_id'], audio=audio_tensor)
        )

    # Concatenate all generations
    all_audio = torch.cat([seg.audio for seg in generated_segments], dim=0)
    torchaudio.save(
        "full_conversation.wav",
        all_audio.unsqueeze(0).cpu(),
        generator.sample_rate
    )
    print("Successfully generated full_conversation.wav")


if __name__ == "__main__":
    main()
```
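
If you also want each turn as its own file, a few extra lines at the end of `main()` suffice. This is a sketch rather than part of the original script; the filename pattern is an arbitrary choice:

```python
    # Optional: additionally write each generated turn to its own WAV file.
    # (Sketch only; the naming scheme here is arbitrary.)
    for i, seg in enumerate(generated_segments):
        torchaudio.save(
            f"utterance_{i:02d}_speaker_{seg.speaker}.wav",
            seg.audio.unsqueeze(0).cpu(),
            generator.sample_rate,
        )
```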