leenag committed on
Commit 18e4b07 · verified · 1 Parent(s): 16df6a6

Update app.py

Files changed (1)
  1. app.py +33 -37
app.py CHANGED
@@ -23,64 +23,60 @@ quantized_model = torch.quantization.quantize_dynamic(
     dtype=torch.qint8
 )
 
-# Sentence splitter (splits by full stop, exclamation, or question mark)
+# Sentence splitter
 def split_text(text, max_len=150):
-    # First, try to split by sentence punctuation
     chunks = re.split(r'(?<=[.!?]) +', text)
-
-    # If any chunk is still too long, split further
-    refined_chunks = []
+    refined = []
     for chunk in chunks:
         if len(chunk) <= max_len:
-            refined_chunks.append(chunk)
+            refined.append(chunk)
         else:
-            # Break on space while respecting max_len
             words = chunk.split()
-            buffer = []
-            length = 0
+            temp = []
+            buf_len = 0
             for word in words:
-                buffer.append(word)
-                length += len(word) + 1
-                if length > max_len:
-                    refined_chunks.append(' '.join(buffer))
-                    buffer = []
-                    length = 0
-            if buffer:
-                refined_chunks.append(' '.join(buffer))
-    return refined_chunks
+                temp.append(word)
+                buf_len += len(word) + 1
+                if buf_len > max_len:
+                    refined.append(' '.join(temp))
+                    temp = []
+                    buf_len = 0
+            if temp:
+                refined.append(' '.join(temp))
+    return refined
 
-# Main synthesis function
-def synthesize(language, text, gender, emotion, speed, pitch, quality):
+# Core TTS function
+def synthesize(language, text, gender, emotion, speed):
     description = (
         f"A native {language.lower()} {gender.lower()} speaker with a {emotion.lower()} and expressive tone, "
         f"speaking at a {speed.lower()} rate."
     )
 
-    description_input = desc_tokenizer(description, return_tensors="pt").to(device)
-
-    chunks = split_text(text)
-    audio_pieces = []
+    audio_chunks = []
+    text_chunks = split_text(text)
 
-    for chunk in chunks:
+    for chunk in text_chunks:
+        # New tokenization for each chunk
+        desc_input = desc_tokenizer(description, return_tensors="pt").to(device)
         prompt_input = tokenizer(chunk, return_tensors="pt").to(device)
+
         with torch.no_grad():
-            generation = quantized_model.generate(
-                input_ids=description_input.input_ids,
-                attention_mask=description_input.attention_mask,
+            output = quantized_model.generate(
+                input_ids=desc_input.input_ids,
+                attention_mask=desc_input.attention_mask,
                 prompt_input_ids=prompt_input.input_ids,
                 prompt_attention_mask=torch.ones_like(prompt_input.input_ids).to(device)
            )
-        audio_chunk = generation.cpu().numpy().squeeze()
-        audio_pieces.append(audio_chunk)
 
-    # Concatenate all audio chunks
-    final_audio = np.concatenate(audio_pieces)
+        audio = output.cpu().numpy().squeeze()
+        audio_chunks.append(audio)
 
+    full_audio = np.concatenate(audio_chunks)
     filename = f"{uuid.uuid4().hex}.wav"
-    sf.write(filename, final_audio, quantized_model.config.sampling_rate)
+    sf.write(filename, full_audio, quantized_model.config.sampling_rate)
     return filename
 
-# Gradio Interface
+# Gradio UI
 iface = gr.Interface(
     fn=synthesize,
     inputs=[
@@ -89,12 +85,12 @@ iface = gr.Interface(
         gr.Radio(["Male", "Female"], label="Speaker Gender"),
         gr.Dropdown(["Neutral", "Happy", "Sad", "Angry"], label="Emotion"),
         gr.Dropdown(["Slow", "Moderate", "Fast"], label="Speaking Rate"),
-        gr.Dropdown(["Low", "Normal", "High"], label="Pitch"),
-        gr.Dropdown(["Basic", "Refined"], label="Voice Quality"),
+        #gr.Dropdown(["Low", "Normal", "High"], label="Pitch"),
+        #gr.Dropdown(["Basic", "Refined"], label="Voice Quality"),
     ],
     outputs=gr.Audio(type="filepath", label="Synthesized Speech"),
     title="Multilingual Indic TTS (Quantized + Chunked)",
-    description="Fast CPU-based TTS with quantized Parler-TTS and text chunking for Malayalam, Hindi, Tamil, and English.",
+    description="CPU-based TTS with quantized Parler-TTS and chunked input for Malayalam, Hindi, Tamil, and English.",
 )
 
 iface.launch()
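
Below is a quick standalone check, not part of the commit, that reproduces the new split_text() helper and shows how it behaves: sentences are first split on ., !, or ?, and any chunk still longer than max_len is re-split on word boundaries. The sample text and max_len=50 are arbitrary illustration values.

import re

def split_text(text, max_len=150):
    # Split on sentence-ending punctuation followed by spaces
    chunks = re.split(r'(?<=[.!?]) +', text)
    refined = []
    for chunk in chunks:
        if len(chunk) <= max_len:
            refined.append(chunk)
        else:
            # Re-split overly long chunks on word boundaries
            words = chunk.split()
            temp = []
            buf_len = 0
            for word in words:
                temp.append(word)
                buf_len += len(word) + 1
                if buf_len > max_len:
                    refined.append(' '.join(temp))
                    temp = []
                    buf_len = 0
            if temp:
                refined.append(' '.join(temp))
    return refined

sample = "Short sentence. " + "word " * 60 + "end."
for i, piece in enumerate(split_text(sample, max_len=50)):
    print(i, len(piece), piece[:40])

Note that a buffer is flushed only after buf_len exceeds max_len, so individual chunks can run slightly past the limit; the committed code has the same behaviour.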
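The per-chunk waveforms are stitched back together with np.concatenate and written once with soundfile. A minimal sketch of just that step, using synthetic sine-wave "chunks" and a placeholder 16 kHz rate in place of real Parler-TTS output and quantized_model.config.sampling_rate:

import numpy as np
import soundfile as sf

sampling_rate = 16000  # placeholder; app.py uses quantized_model.config.sampling_rate

# Two fake one-second "chunks" standing in for per-chunk model output
t = np.arange(sampling_rate) / sampling_rate
chunk_a = 0.1 * np.sin(2 * np.pi * 440 * t)
chunk_b = 0.1 * np.sin(2 * np.pi * 660 * t)

# Same stitching pattern as the updated synthesize(): collect 1-D arrays,
# concatenate them, and write a single WAV file
full_audio = np.concatenate([chunk_a, chunk_b])
sf.write("stitched.wav", full_audio, sampling_rate)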