Nymbo committed on
Commit
208563c
·
verified ·
1 Parent(s): dbd129c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -174
app.py CHANGED
@@ -25,7 +25,6 @@ from duckduckgo_search import DDGS
25
  from PIL import Image
26
  from huggingface_hub import InferenceClient
27
  import time
28
- import wave
29
 
30
  # Optional imports for Kokoro TTS (loaded lazily)
31
  import numpy as np
@@ -502,15 +501,14 @@ def Generate_Speech( # <-- MCP tool #4 (Generate Speech)
502
  text: Annotated[str, "The text to synthesize (English)."],
503
  speed: Annotated[float, "Speech speed multiplier in 0.5–2.0; 1.0 = normal speed."] = 1.0,
504
  voice: Annotated[str, "Voice identifier. Example: 'af_heart' (US English, female, Heart)."] = "af_heart",
505
- ) -> str:
506
  """
507
  Synthesize speech from text using the Kokoro-82M model.
508
 
509
- Per current HF Gradio MCP guidance (see hf-docs-search), tools should return
510
- browser/client-friendly artifacts where possible. This function returns the
511
- path to a WAV file on disk so the UI renders an HTML5 audio player and MCP
512
- clients receive a file URL that opens in the browser rather than forcing a
513
- direct download.
514
 
515
  Args:
516
  text: The text to synthesize (English).
@@ -518,8 +516,9 @@ def Generate_Speech( # <-- MCP tool #4 (Generate Speech)
518
  voice: Voice identifier. Example: 'af_heart' (US English, female, Heart).
519
 
520
  Returns:
521
- str: Path to a 24 kHz mono WAV file on disk (served by Gradio; MCP converts
522
- paths to file URLs).
 
523
 
524
  Notes:
525
  - Requires the 'kokoro' package (>=0.9.4). If unavailable, an error is
@@ -545,11 +544,8 @@ def Generate_Speech( # <-- MCP tool #4 (Generate Speech)
545
  audio = model(ps, ref_s, float(speed))
546
  except Exception as e: # propagate as UI-friendly error
547
  raise gr.Error(f"Error generating audio: {str(e)}")
548
- # Save 24 kHz mono waveform to WAV and return its path for in-browser playback
549
- sr = 24_000
550
- wav = audio.detach().cpu().numpy()
551
- path = _write_audio_tmp(wav, sample_rate=sr, suffix=".wav")
552
- return path
553
 
554
  # If pipeline produced no segments
555
  raise gr.Error("No audio was generated (empty synthesis result).")
@@ -641,7 +637,7 @@ CSS_STYLES = """
641
  /* Place bold tools list on line 2, normal auth note on line 3 (below title) */
642
  .gradio-container h1::before {
643
  grid-row: 2;
644
- content: "Fetch Webpage | Search DuckDuckGo | Code Interpreter | Kokoro TTS | Image Generation | Video Generation | Generate Code";
645
  display: block;
646
  font-size: 1rem;
647
  font-weight: 700;
@@ -651,7 +647,7 @@ CSS_STYLES = """
651
  }
652
  .gradio-container h1::after {
653
  grid-row: 3;
654
- content: "Authentication is optional. Image/Video (and some Code) generation may require `HF_READ_TOKEN`; Image/Video tabs hide without it.";
655
  display: block;
656
  font-size: 1rem;
657
  font-weight: 400;
@@ -675,14 +671,15 @@ kokoro_interface = gr.Interface(
675
  gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed"),
676
  gr.Textbox(label="Voice", value="af_heart", placeholder="e.g., af_heart"),
677
  ],
678
- outputs=gr.Audio(label="Audio", type="filepath"),
679
  title="Kokoro TTS",
680
  description=(
681
  "<div style=\"text-align:center\">Generate English speech with Kokoro-82M. 30 second max output. Runs on CPU or CUDA if available.</div>"
682
  ),
683
  api_description=(
684
- "Synthesize speech from text using Kokoro-82M. Returns a file path to a 24 kHz mono WAV, which renders in-browser and is exposed as a file URL over MCP. "
685
- "Parameters: text (str), speed (float 0.5–2.0), voice (str)."
 
686
  ),
687
  allow_flagging="never",
688
  )
@@ -990,158 +987,6 @@ video_generation_interface = gr.Interface(
990
  allow_flagging="never",
991
  )
992
 
993
- # ==========================
994
- # Audio helper (save WAV)
995
- # ==========================
996
-
997
- def _write_audio_tmp(audio: np.ndarray, sample_rate: int = 24_000, suffix: str = ".wav") -> str:
998
- """Write mono float32 waveform [-1,1] to 16-bit PCM WAV and return path."""
999
- if audio.ndim > 1:
1000
- audio = np.mean(audio, axis=0)
1001
- audio = np.clip(audio.astype(np.float32), -1.0, 1.0)
1002
- pcm = (audio * 32767.0).astype(np.int16)
1003
- os.makedirs("outputs", exist_ok=True)
1004
- fname = f"outputs/audio_{int(time.time())}_{random.randint(1000,9999)}{suffix}"
1005
- with wave.open(fname, "wb") as wf:
1006
- wf.setnchannels(1)
1007
- wf.setsampwidth(2)
1008
- wf.setframerate(sample_rate)
1009
- wf.writeframes(pcm.tobytes())
1010
- return fname
1011
-
1012
- # ==========================
1013
- # Code Generation (Serverless)
1014
- # ==========================
1015
-
1016
- def Generate_Code(
1017
- instruction: Annotated[str, "Describe the code to generate (requirements, I/O, constraints)."],
1018
- language: Annotated[str, "Optional language/framework hint (e.g., 'python', 'typescript react')."] = "",
1019
- model_id: Annotated[str, "HF text-generation model id (e.g., 'bigcode/starcoder2-3b')."] = "bigcode/starcoder2-3b",
1020
- max_new_tokens: Annotated[int, "Maximum tokens to generate (64–4096, model dependent)."] = 512,
1021
- temperature: Annotated[float, "Sampling temperature (0–1.5). Lower = more deterministic."] = 0.2,
1022
- top_p: Annotated[float, "Nucleus sampling p (0–1)."] = 0.95,
1023
- top_k: Annotated[int, "Top-k sampling cutoff (0 disables)."] = 50,
1024
- repetition_penalty: Annotated[float, "Discourage repeats (>1.0)."] = 1.05,
1025
- seed: Annotated[int, "Random seed (-1 = random)."] = -1,
1026
- save_to_file: Annotated[bool, "If true, save under ./outputs and prepend 'Saved to:' path."] = False,
1027
- filename: Annotated[str, "Optional filename when saving (e.g., main.py)."] = "",
1028
- ) -> str:
1029
- """
1030
- Generate source code via Hugging Face Inference text-generation models and return code as plain text.
1031
-
1032
- Per current MCP docs (via hf-docs-search), schemas are inferred from type hints and docstrings. Returning
1033
- text is broadly compatible; when save_to_file is enabled, the response is prefixed with the saved path so
1034
- MCP clients can expose a file URL.
1035
- """
1036
- if not instruction or not instruction.strip():
1037
- raise gr.Error("Please provide a non-empty instruction.")
1038
-
1039
- token = os.getenv("HF_READ_TOKEN") or os.getenv("HF_TOKEN")
1040
- providers = ["auto", "replicate", "fal-ai"]
1041
- lang_hint = f" in {language.strip()}" if language and language.strip() else ""
1042
- system_preamble = (
1043
- "You are a precise coding assistant. Output only runnable code without explanations. "
1044
- "Prefer idiomatic patterns, minimal comments, and include necessary imports."
1045
- )
1046
- prompt = (
1047
- f"{system_preamble}\n\nTask{lang_hint}:\n{instruction.strip()}\n\n"
1048
- "Return only the code, no backticks."
1049
- )
1050
-
1051
- last_error: Exception | None = None
1052
- for provider in providers:
1053
- try:
1054
- client = InferenceClient(api_key=token, provider=provider)
1055
- out = client.text_generation(
1056
- model=model_id,
1057
- prompt=prompt,
1058
- max_new_tokens=max_new_tokens,
1059
- temperature=temperature,
1060
- top_p=top_p,
1061
- top_k=top_k,
1062
- repetition_penalty=repetition_penalty,
1063
- seed=(None if seed == -1 else seed),
1064
- stream=False,
1065
- )
1066
- code = (out or "").strip()
1067
- if not code:
1068
- raise gr.Error("Model returned empty output.")
1069
- prefix = ""
1070
- if save_to_file:
1071
- os.makedirs("outputs", exist_ok=True)
1072
- base = filename.strip() or f"code_{int(time.time())}_{random.randint(1000,9999)}"
1073
- if "." not in base and language:
1074
- ext_map = {
1075
- "python": ".py",
1076
- "py": ".py",
1077
- "typescript": ".ts",
1078
- "tsx": ".tsx",
1079
- "javascript": ".js",
1080
- "jsx": ".jsx",
1081
- "go": ".go",
1082
- "rust": ".rs",
1083
- "java": ".java",
1084
- "csharp": ".cs",
1085
- "c#": ".cs",
1086
- "cpp": ".cpp",
1087
- "c++": ".cpp",
1088
- "c": ".c",
1089
- "bash": ".sh",
1090
- "shell": ".sh",
1091
- "html": ".html",
1092
- "css": ".css",
1093
- "json": ".json",
1094
- "yaml": ".yaml",
1095
- "yml": ".yml",
1096
- }
1097
- key = language.lower().split()[0]
1098
- base += ext_map.get(key, "")
1099
- path = os.path.join("outputs", base)
1100
- with open(path, "w", encoding="utf-8") as f:
1101
- f.write(code)
1102
- prefix = f"Saved to: {path}\n\n"
1103
- return f"{prefix}{code}"
1104
- except Exception as e:
1105
- last_error = e
1106
- continue
1107
- msg = str(last_error) if last_error else "Unknown error"
1108
- if "401" in msg or "403" in msg:
1109
- raise gr.Error("Authentication failed or not permitted. Set HF_READ_TOKEN/HF_TOKEN with inference access.")
1110
- if "404" in msg:
1111
- raise gr.Error(f"Model not found or unavailable: {model_id}.")
1112
- if "503" in msg:
1113
- raise gr.Error("The model is warming up. Please try again shortly.")
1114
- raise gr.Error(f"Code generation failed: {msg}")
1115
-
1116
-
1117
- code_generation_interface = gr.Interface(
1118
- fn=Generate_Code,
1119
- inputs=[
1120
- gr.Textbox(label="Instruction", placeholder="Describe what to build, inputs/outputs, edge cases…", lines=6),
1121
- gr.Textbox(label="Language (optional)", value="", placeholder="e.g., python, typescript react"),
1122
- gr.Textbox(label="Model", value="bigcode/starcoder2-3b", placeholder="creator/model-name"),
1123
- gr.Slider(minimum=64, maximum=4096, value=512, step=16, label="Max new tokens"),
1124
- gr.Slider(minimum=0.0, maximum=1.5, value=0.2, step=0.05, label="Temperature"),
1125
- gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.01, label="Top-p"),
1126
- gr.Slider(minimum=0, maximum=200, value=50, step=1, label="Top-k"),
1127
- gr.Slider(minimum=1.0, maximum=2.0, value=1.05, step=0.01, label="Repetition penalty"),
1128
- gr.Slider(minimum=-1, maximum=1_000_000_000, value=-1, step=1, label="Seed (-1 = random)"),
1129
- gr.Checkbox(value=False, label="Save to file (./outputs)"),
1130
- gr.Textbox(label="Filename (optional)", value="", placeholder="e.g., main.py"),
1131
- ],
1132
- outputs=gr.Code(label="Generated Code"),
1133
- title="Generate Code",
1134
- description=(
1135
- "<div style=\"text-align:center\">Generate code via Hugging Face Inference text-generation models. Provide a clear instruction and (optionally) a language hint.</div>"
1136
- ),
1137
- api_description=(
1138
- "Generate source code using a HF Inference text-generation model. Parameters: instruction (str), language (str), model_id (str), "
1139
- "max_new_tokens (int), temperature (float), top_p (float), top_k (int), repetition_penalty (float), seed (int), save_to_file (bool), filename (str). "
1140
- "Returns the code as text; if saved, prepends 'Saved to: <path>'."
1141
- ),
1142
- allow_flagging="never",
1143
- )
1144
-
1145
  # Build tabbed app; disable Image/Video tools if no HF token is present
1146
  HAS_HF_TOKEN = bool(HF_API_TOKEN or HF_VIDEO_TOKEN)
1147
 
@@ -1162,9 +1007,6 @@ if HAS_HF_TOKEN:
1162
  _interfaces.extend([image_generation_interface, video_generation_interface])
1163
  _tab_names.extend(["Image Generation", "Video Generation"])
1164
 
1165
- # Always add Generate Code as the last tab
1166
- _interfaces.append(code_generation_interface)
1167
- _tab_names.append("Generate Code")
1168
  demo = gr.TabbedInterface(
1169
  interface_list=_interfaces,
1170
  tab_names=_tab_names,
 
25
  from PIL import Image
26
  from huggingface_hub import InferenceClient
27
  import time
 
28
 
29
  # Optional imports for Kokoro TTS (loaded lazily)
30
  import numpy as np
 
501
  text: Annotated[str, "The text to synthesize (English)."],
502
  speed: Annotated[float, "Speech speed multiplier in 0.5–2.0; 1.0 = normal speed."] = 1.0,
503
  voice: Annotated[str, "Voice identifier. Example: 'af_heart' (US English, female, Heart)."] = "af_heart",
504
+ ) -> Tuple[int, np.ndarray]:
505
  """
506
  Synthesize speech from text using the Kokoro-82M model.
507
 
508
+ This function returns raw audio suitable for a Gradio Audio component and is
509
+ also exposed as an MCP tool (per the latest Hugging Face/Gradio MCP docs, a
510
+ tool is created for each function wired into your app; docstrings and type
511
+ hints are used to describe the tool).
 
512
 
513
  Args:
514
  text: The text to synthesize (English).
 
516
  voice: Voice identifier. Example: 'af_heart' (US English, female, Heart).
517
 
518
  Returns:
519
+ A tuple of (sample_rate_hz, audio_waveform) where:
520
+ - sample_rate_hz: int sample rate in Hz (24_000)
521
+ - audio_waveform: numpy.ndarray float32 mono waveform in range [-1, 1]
522
 
523
  Notes:
524
  - Requires the 'kokoro' package (>=0.9.4). If unavailable, an error is
 
544
  audio = model(ps, ref_s, float(speed))
545
  except Exception as e: # propagate as UI-friendly error
546
  raise gr.Error(f"Error generating audio: {str(e)}")
547
+ # Return 24 kHz mono waveform
548
+ return 24_000, audio.detach().cpu().numpy()
 
 
 
549
 
550
  # If pipeline produced no segments
551
  raise gr.Error("No audio was generated (empty synthesis result).")
 
637
  /* Place bold tools list on line 2, normal auth note on line 3 (below title) */
638
  .gradio-container h1::before {
639
  grid-row: 2;
640
+ content: "Fetch Webpage | Search DuckDuckGo | Code Interpreter | Kokoro TTS | Image Generation | Video Generation";
641
  display: block;
642
  font-size: 1rem;
643
  font-weight: 700;
 
647
  }
648
  .gradio-container h1::after {
649
  grid-row: 3;
650
+ content: "Authentication is optional but Image/Video Generation require a `HF_READ_TOKEN` in env variables. They are hidden otherwise.";
651
  display: block;
652
  font-size: 1rem;
653
  font-weight: 400;
 
671
  gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed"),
672
  gr.Textbox(label="Voice", value="af_heart", placeholder="e.g., af_heart"),
673
  ],
674
+ outputs=gr.Audio(label="Audio", type="numpy"),
675
  title="Kokoro TTS",
676
  description=(
677
  "<div style=\"text-align:center\">Generate English speech with Kokoro-82M. 30 second max output. Runs on CPU or CUDA if available.</div>"
678
  ),
679
  api_description=(
680
+ "Synthesize speech from text using Kokoro-82M. Returns (sample_rate, waveform) suitable for playback. "
681
+ "Parameters: text (str), speed (float 0.5–2.0), voice (str). "
682
+ "Return the generated audio to the user."
683
  ),
684
  allow_flagging="never",
685
  )
 
987
  allow_flagging="never",
988
  )
989
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
990
  # Build tabbed app; disable Image/Video tools if no HF token is present
991
  HAS_HF_TOKEN = bool(HF_API_TOKEN or HF_VIDEO_TOKEN)
992
 
 
1007
  _interfaces.extend([image_generation_interface, video_generation_interface])
1008
  _tab_names.extend(["Image Generation", "Video Generation"])
1009
 
 
 
 
1010
  demo = gr.TabbedInterface(
1011
  interface_list=_interfaces,
1012
  tab_names=_tab_names,