khurrameycon committed
Commit c034a74 · verified · 1 Parent(s): e1bc235

Update app.py

Files changed (1):
  app.py  +159 -42
app.py CHANGED
@@ -1,79 +1,196 @@
  from fastapi import FastAPI, Response
  from fastapi.responses import FileResponse
  from kokoro import KPipeline
  import soundfile as sf
  import os
  import numpy as np
- import torch
  from huggingface_hub import InferenceClient

- def llm_chat_response(text):
      HF_TOKEN = os.getenv("HF_TOKEN")
      client = InferenceClient(api_key=HF_TOKEN)
      messages = [
-         {
-             "role": "user",
-             "content": [
-                 {
-                     "type": "text",
-                     "text": text + str('describe in one line only')
-                 } #,
-                 # {
-                 #     "type": "image_url",
-                 #     "image_url": {
-                 #         "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
-                 #     }
-                 # }
-             ]
-         }
      ]
-
      response_from_llama = client.chat.completions.create(
-         model="meta-llama/Llama-3.2-11B-Vision-Instruct",
-         messages=messages,
-         max_tokens=500)
-
      return response_from_llama.choices[0].message['content']

  app = FastAPI()
-
  # Initialize pipeline once at startup
  pipeline = KPipeline(lang_code='a')

  @app.post("/generate")
- async def generate_audio(text: str, voice: str = "af_heart", speed: float = 1.0):
-
-     text_reply = llm_chat_response(text)

      # Generate audio
      generator = pipeline(
          text_reply,
-         voice=voice,
-         speed=speed,
          split_pattern=r'\n+'
      )

-     # # Save first segment only for demo
-     # for i, (gs, ps, audio) in enumerate(generator):
-     #     sf.write(f"output_{i}.wav", audio, 24000)
-     #     return FileResponse(
-     #         f"output_{i}.wav",
-     #         media_type="audio/wav",
-     #         filename="output.wav"
-     #     )
-
-     # return Response("No audio generated", status_code=400)
-
-
      # Process only the first segment for demo
      for i, (gs, ps, audio) in enumerate(generator):
-
          # Convert PyTorch tensor to NumPy array
          audio_numpy = audio.cpu().numpy()
-         # Convert to 16-bit PCM

          # Ensure the audio is in the range [-1, 1]
          audio_numpy = np.clip(audio_numpy, -1, 1)
          # Convert to 16-bit signed integers
          pcm_data = (audio_numpy * 32767).astype(np.int16)
 
+ # from fastapi import FastAPI, Response
+ # from fastapi.responses import FileResponse
+ # from kokoro import KPipeline
+ # import soundfile as sf
+ # import os
+ # import numpy as np
+ # import torch
+ # from huggingface_hub import InferenceClient
+
+ # def llm_chat_response(text):
+ #     HF_TOKEN = os.getenv("HF_TOKEN")
+ #     client = InferenceClient(api_key=HF_TOKEN)
+ #     messages = [
+ #         {
+ #             "role": "user",
+ #             "content": [
+ #                 {
+ #                     "type": "text",
+ #                     "text": text + str('describe in one line only')
+ #                 } #,
+ #                 # {
+ #                 #     "type": "image_url",
+ #                 #     "image_url": {
+ #                 #         "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
+ #                 #     }
+ #                 # }
+ #             ]
+ #         }
+ #     ]
+
+ #     response_from_llama = client.chat.completions.create(
+ #         model="meta-llama/Llama-3.2-11B-Vision-Instruct",
+ #         messages=messages,
+ #         max_tokens=500)
+
+ #     return response_from_llama.choices[0].message['content']
+
+ # app = FastAPI()
+
+ # # Initialize pipeline once at startup
+ # pipeline = KPipeline(lang_code='a')
+
+ # @app.post("/generate")
+ # async def generate_audio(text: str, voice: str = "af_heart", speed: float = 1.0):
+
+ #     text_reply = llm_chat_response(text)
+
+ #     # Generate audio
+ #     generator = pipeline(
+ #         text_reply,
+ #         voice=voice,
+ #         speed=speed,
+ #         split_pattern=r'\n+'
+ #     )
+
+ #     # # Save first segment only for demo
+ #     # for i, (gs, ps, audio) in enumerate(generator):
+ #     #     sf.write(f"output_{i}.wav", audio, 24000)
+ #     #     return FileResponse(
+ #     #         f"output_{i}.wav",
+ #     #         media_type="audio/wav",
+ #     #         filename="output.wav"
+ #     #     )
+
+ #     # return Response("No audio generated", status_code=400)
+
+
+ #     # Process only the first segment for demo
+ #     for i, (gs, ps, audio) in enumerate(generator):
+
+ #         # Convert PyTorch tensor to NumPy array
+ #         audio_numpy = audio.cpu().numpy()
+ #         # Convert to 16-bit PCM
+
+ #         # Ensure the audio is in the range [-1, 1]
+ #         audio_numpy = np.clip(audio_numpy, -1, 1)
+ #         # Convert to 16-bit signed integers
+ #         pcm_data = (audio_numpy * 32767).astype(np.int16)
+
+ #         # Convert to bytes (automatically uses row-major order)
+ #         raw_audio = pcm_data.tobytes()
+
+ #         # Return PCM data with minimal necessary headers
+ #         return Response(
+ #             content=raw_audio,
+ #             media_type="application/octet-stream",
+ #             headers={
+ #                 "Content-Disposition": f'attachment; filename="output.pcm"',
+ #                 "X-Sample-Rate": "24000",
+ #                 "X-Bits-Per-Sample": "16",
+ #                 "X-Endianness": "little"
+ #             }
+ #         )
+
+ #     return Response("No audio generated", status_code=400)
+
+
+
  from fastapi import FastAPI, Response
  from fastapi.responses import FileResponse
  from kokoro import KPipeline
  import soundfile as sf
  import os
  import numpy as np
+ import torch
  from huggingface_hub import InferenceClient
+ from pydantic import BaseModel
+ import base64
+ from io import BytesIO
+ from PIL import Image

+ class TextImageRequest(BaseModel):
+     text: str = None
+     image_base64: str = None
+     voice: str = "af_heart"
+     speed: float = 1.0
+
+ def llm_chat_response(text, image_base64=None):
      HF_TOKEN = os.getenv("HF_TOKEN")
      client = InferenceClient(api_key=HF_TOKEN)
+
+     message_content = [
+         {
+             "type": "text",
+             "text": text + str('describe in one line only')
+         }
+     ]
+
+     # If image_base64 is provided, add it to the message content
+     if image_base64:
+         # Convert base64 to PIL Image for validation
+         try:
+             image_bytes = base64.b64decode(image_base64)
+             # Validate that it's a proper image
+             Image.open(BytesIO(image_bytes))
+
+             # Add the image to message content
+             message_content.append({
+                 "type": "image",
+                 "image": {
+                     "data": image_base64
+                 }
+             })
+         except Exception as e:
+             print(f"Error processing image: {e}")
+
      messages = [
+         {
+             "role": "user",
+             "content": message_content
+         }
      ]
+
      response_from_llama = client.chat.completions.create(
+         model="meta-llama/Llama-3.2-11B-Vision-Instruct",
+         messages=messages,
+         max_tokens=500
+     )
      return response_from_llama.choices[0].message['content']

  app = FastAPI()
  # Initialize pipeline once at startup
  pipeline = KPipeline(lang_code='a')

  @app.post("/generate")
+ async def generate_audio(request: TextImageRequest):
+     # If no text is provided but image is provided, use default prompt
+     user_text = request.text
+     if user_text is None and request.image_base64:
+         user_text = "Describe what you see in the image"
+     elif user_text is None:
+         user_text = ""
+
+     # Generate response using text and image if provided
+     text_reply = llm_chat_response(user_text, request.image_base64)

      # Generate audio
      generator = pipeline(
          text_reply,
+         voice=request.voice,
+         speed=request.speed,
          split_pattern=r'\n+'
      )

      # Process only the first segment for demo
      for i, (gs, ps, audio) in enumerate(generator):

          # Convert PyTorch tensor to NumPy array
          audio_numpy = audio.cpu().numpy()

+         # Convert to 16-bit PCM
          # Ensure the audio is in the range [-1, 1]
          audio_numpy = np.clip(audio_numpy, -1, 1)
+
          # Convert to 16-bit signed integers
          pcm_data = (audio_numpy * 32767).astype(np.int16)
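
For reference, below is a minimal client-side sketch of how the updated /generate endpoint might be called after this commit. The base URL and image filename are placeholders, and the raw 16-bit PCM response format (24 kHz, little-endian, mono) is assumed from the X-Sample-Rate / X-Bits-Per-Sample headers in the commented-out return path above; adjust if the deployed app responds differently.

import base64
import wave

import requests  # assumed available in the client environment

SERVER_URL = "http://localhost:8000"  # placeholder; use the deployed Space URL

# Encode a local image so it fits the TextImageRequest.image_base64 field
with open("photo.jpg", "rb") as f:  # hypothetical input file
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

payload = {
    "text": "What is in this picture?",
    "image_base64": image_b64,
    "voice": "af_heart",
    "speed": 1.0,
}

resp = requests.post(f"{SERVER_URL}/generate", json=payload)
resp.raise_for_status()

# Assuming the endpoint returns raw 16-bit little-endian PCM, wrap the
# bytes in a WAV container so standard players can open the audio.
sample_rate = int(resp.headers.get("X-Sample-Rate", 24000))
with wave.open("reply.wav", "wb") as wav_file:
    wav_file.setnchannels(1)   # Kokoro segments are mono
    wav_file.setsampwidth(2)   # 16-bit samples
    wav_file.setframerate(sample_rate)
    wav_file.writeframes(resp.content)

Sending only image_base64 (with no text) exercises the new default-prompt branch in generate_audio, and omitting both fields falls back to an empty prompt.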