khurrameycon committed
Commit af836e4 · verified · 1 parent: 9354354

Update app.py: switch the InferenceClient provider to sambanova, pass the request text into the chat prompt, and keep the previous implementation as a commented-out block.

Files changed (1)
  1. app.py +123 -49
app.py CHANGED
@@ -1,7 +1,108 @@
  from fastapi import FastAPI, Response
  from fastapi.responses import FileResponse
  from kokoro import KPipeline
- import soundfile as sf
  import os
  import numpy as np
  import torch
@@ -10,33 +111,27 @@ from huggingface_hub import InferenceClient
  def llm_chat_response(text):
      HF_TOKEN = os.getenv("HF_TOKEN")
      client = InferenceClient(
-         provider="hf-inference",
-         api_key=HF_TOKEN,)
-
-     response_from_llama = client.chat.completions.create(
-         model="meta-llama/Llama-3.2-11B-Vision-Instruct",
-         messages=[
              {
-                 "role": "user",
-                 "content": [
-                     {
-                         "type": "text",
-                         "text": "Describe this image in one sentence."
-                     }#,
-                     # {
-                     #     "type": "image_url",
-                     #     "image_url": {
-                     #         "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
-                     #     }
-                     # }
-                 ]
              }
-         ],
          max_tokens=500,
      )

-
-
      return response_from_llama.choices[0].message['content']

  app = FastAPI()
@@ -46,10 +141,9 @@ pipeline = KPipeline(lang_code='a')

  @app.post("/generate")
  async def generate_audio(text: str, voice: str = "af_heart", speed: float = 1.0):
-
      text_reply = llm_chat_response(text)

-     # Generate audio
      generator = pipeline(
          text_reply,
          voice=voice,
@@ -57,43 +151,23 @@ async def generate_audio(text: str, voice: str = "af_heart", speed: float = 1.0)
          split_pattern=r'\n+'
      )

-     # # Save first segment only for demo
-     # for i, (gs, ps, audio) in enumerate(generator):
-     #     sf.write(f"output_{i}.wav", audio, 24000)
-     #     return FileResponse(
-     #         f"output_{i}.wav",
-     #         media_type="audio/wav",
-     #         filename="output.wav"
-     #     )
-
-     # return Response("No audio generated", status_code=400)
-
-
-     # Process only the first segment for demo
      for i, (gs, ps, audio) in enumerate(generator):
-
-         # Convert PyTorch tensor to NumPy array
          audio_numpy = audio.cpu().numpy()
-         # Convert to 16-bit PCM
-
-         # Ensure the audio is in the range [-1, 1]
          audio_numpy = np.clip(audio_numpy, -1, 1)
-         # Convert to 16-bit signed integers
          pcm_data = (audio_numpy * 32767).astype(np.int16)
-
-         # Convert to bytes (automatically uses row-major order)
          raw_audio = pcm_data.tobytes()

-         # Return PCM data with minimal necessary headers
          return Response(
              content=raw_audio,
              media_type="application/octet-stream",
              headers={
-                 "Content-Disposition": f'attachment; filename="output.pcm"',
                  "X-Sample-Rate": "24000",
                  "X-Bits-Per-Sample": "16",
                  "X-Endianness": "little"
              }
          )

-     return Response("No audio generated", status_code=400)
 
+ # from fastapi import FastAPI, Response
+ # from fastapi.responses import FileResponse
+ # from kokoro import KPipeline
+ # import soundfile as sf
+ # import os
+ # import numpy as np
+ # import torch
+ # from huggingface_hub import InferenceClient
+
+ # def llm_chat_response(text):
+ #     HF_TOKEN = os.getenv("HF_TOKEN")
+ #     client = InferenceClient(
+ #         provider="hf-inference",
+ #         api_key=HF_TOKEN,)
+
+ #     response_from_llama = client.chat.completions.create(
+ #         model="meta-llama/Llama-3.2-11B-Vision-Instruct",
+ #         messages=[
+ #             {
+ #                 "role": "user",
+ #                 "content": [
+ #                     {
+ #                         "type": "text",
+ #                         "text": "Describe this image in one sentence."
+ #                     }#,
+ #                     # {
+ #                     #     "type": "image_url",
+ #                     #     "image_url": {
+ #                     #         "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
+ #                     #     }
+ #                     # }
+ #                 ]
+ #             }
+ #         ],
+ #         max_tokens=500,
+ #     )
+
+
+
+ #     return response_from_llama.choices[0].message['content']
+
+ # app = FastAPI()
+
+ # # Initialize pipeline once at startup
+ # pipeline = KPipeline(lang_code='a')
+
+ # @app.post("/generate")
+ # async def generate_audio(text: str, voice: str = "af_heart", speed: float = 1.0):
+
+ #     text_reply = llm_chat_response(text)
+
+ #     # Generate audio
+ #     generator = pipeline(
+ #         text_reply,
+ #         voice=voice,
+ #         speed=speed,
+ #         split_pattern=r'\n+'
+ #     )
+
+ #     # # Save first segment only for demo
+ #     # for i, (gs, ps, audio) in enumerate(generator):
+ #     #     sf.write(f"output_{i}.wav", audio, 24000)
+ #     #     return FileResponse(
+ #     #         f"output_{i}.wav",
+ #     #         media_type="audio/wav",
+ #     #         filename="output.wav"
+ #     #     )
+
+ #     # return Response("No audio generated", status_code=400)
+
+
+ #     # Process only the first segment for demo
+ #     for i, (gs, ps, audio) in enumerate(generator):
+
+ #         # Convert PyTorch tensor to NumPy array
+ #         audio_numpy = audio.cpu().numpy()
+ #         # Convert to 16-bit PCM
+
+ #         # Ensure the audio is in the range [-1, 1]
+ #         audio_numpy = np.clip(audio_numpy, -1, 1)
+ #         # Convert to 16-bit signed integers
+ #         pcm_data = (audio_numpy * 32767).astype(np.int16)
+
+ #         # Convert to bytes (automatically uses row-major order)
+ #         raw_audio = pcm_data.tobytes()
+
+ #         # Return PCM data with minimal necessary headers
+ #         return Response(
+ #             content=raw_audio,
+ #             media_type="application/octet-stream",
+ #             headers={
+ #                 "Content-Disposition": f'attachment; filename="output.pcm"',
+ #                 "X-Sample-Rate": "24000",
+ #                 "X-Bits-Per-Sample": "16",
+ #                 "X-Endianness": "little"
+ #             }
+ #         )
+
+ #     return Response("No audio generated", status_code=400)
+
+
+
  from fastapi import FastAPI, Response
  from fastapi.responses import FileResponse
  from kokoro import KPipeline
  import os
  import numpy as np
  import torch
  from huggingface_hub import InferenceClient

  def llm_chat_response(text):
      HF_TOKEN = os.getenv("HF_TOKEN")
      client = InferenceClient(
+         provider="sambanova",  # Use the provider that supports conversational image-text tasks.
+         api_key=HF_TOKEN,
+     )
+
+     # Build the message payload; here we append a prompt suffix when no image is involved.
+     messages = [{
+         "role": "user",
+         "content": [
              {
+                 "type": "text",
+                 "text": text + " describe in one line only"
              }
+         ]
+     }]
+
+     response_from_llama = client.chat.completions.create(
+         model="meta-llama/Llama-3.2-11B-Vision-Instruct",
+         messages=messages,
          max_tokens=500,
      )

      return response_from_llama.choices[0].message['content']

  app = FastAPI()

  # Initialize pipeline once at startup
  pipeline = KPipeline(lang_code='a')

  @app.post("/generate")
  async def generate_audio(text: str, voice: str = "af_heart", speed: float = 1.0):
      text_reply = llm_chat_response(text)

+     # Generate audio using the pipeline
      generator = pipeline(
          text_reply,
          voice=voice,
          speed=speed,
          split_pattern=r'\n+'
      )

+     # Process only the first segment for demonstration
      for i, (gs, ps, audio) in enumerate(generator):
+         # Convert PyTorch tensor to NumPy array and prepare 16-bit PCM data
          audio_numpy = audio.cpu().numpy()
          audio_numpy = np.clip(audio_numpy, -1, 1)
          pcm_data = (audio_numpy * 32767).astype(np.int16)
          raw_audio = pcm_data.tobytes()

          return Response(
              content=raw_audio,
              media_type="application/octet-stream",
              headers={
+                 "Content-Disposition": 'attachment; filename="output.pcm"',
                  "X-Sample-Rate": "24000",
                  "X-Bits-Per-Sample": "16",
                  "X-Endianness": "little"
              }
          )

+     return Response("No audio generated", status_code=400)
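
For reference, a minimal client sketch for consuming the endpoint this commit produces. It is not part of the commit: the base URL is an assumption (adjust to wherever the app is served), and the WAV wrapping simply follows the X-Sample-Rate / X-Bits-Per-Sample / X-Endianness headers the endpoint sets; mono output is assumed, matching the single-channel tensor converted above.

# Hypothetical client for POST /generate; the base URL is an assumption.
import wave
import requests

resp = requests.post(
    "http://localhost:7860/generate",  # assumed address of the running app
    params={"text": "What is the Eiffel Tower?", "voice": "af_heart", "speed": 1.0},
)
resp.raise_for_status()

# Wrap the raw PCM in a WAV container, driven by the response headers.
sample_rate = int(resp.headers.get("X-Sample-Rate", "24000"))
bits = int(resp.headers.get("X-Bits-Per-Sample", "16"))

with wave.open("reply.wav", "wb") as wav:
    wav.setnchannels(1)            # assumed mono (single-channel tensor)
    wav.setsampwidth(bits // 8)    # 16-bit samples -> 2 bytes each
    wav.setframerate(sample_rate)  # 24000 Hz per X-Sample-Rate
    wav.writeframes(resp.content)  # wave stores little-endian PCM, matching X-Endianness

Note that text, voice, and speed travel as query parameters: FastAPI treats scalar handler arguments on a POST route as query parameters unless told otherwise.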