Nymbo committed on
Commit
12e0f66
·
verified ·
1 Parent(s): 1c962e3

Adding Video Generation MCP tool, default model is Wan2.2-T2V-A14B

Browse files
Files changed (1) hide show
  1. app.py +188 -3
app.py CHANGED
@@ -3,7 +3,6 @@
3
  # 2) DuckDuckGo Search — compact JSONL search output (short keys to minimize tokens)
4
  # 3) Python Code Executor — run Python code and capture stdout/errors
5
  # 4) Kokoro TTS — synthesize speech from text using Kokoro-82M
6
- # 5) Serverless Image Gen — Generate image on Hugging Face inference, default model is FLUX.1-Krea-dev.
7
 
8
  from __future__ import annotations
9
 
@@ -23,6 +22,7 @@ from urllib.parse import urljoin, urldefrag, urlparse
23
  from duckduckgo_search import DDGS
24
  from PIL import Image
25
  from huggingface_hub import InferenceClient
 
26
 
27
  # Optional imports for Kokoro TTS (loaded lazily)
28
  import numpy as np
@@ -631,7 +631,7 @@ CSS_STYLES = """
631
  }
632
  /* Default: add subtitle under titles */
633
  .gradio-container h1::after {
634
- content: "Fetch Webpage | Search DuckDuckGo | Code Interpreter | Kokoro TTS | Image Generation";
635
  display: block;
636
  font-size: 1rem;
637
  font-weight: 500;
@@ -792,15 +792,200 @@ image_generation_interface = gr.Interface(
792
  allow_flagging="never",
793
  )
794
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
795
  # Build tabbed app including Image Generation
796
  demo = gr.TabbedInterface(
797
- interface_list=[fetch_interface, concise_interface, code_interface, kokoro_interface, image_generation_interface],
 
 
 
 
 
 
 
798
  tab_names=[
799
  "Fetch Webpage",
800
  "DuckDuckGo Search",
801
  "Python Code Executor",
802
  "Kokoro TTS",
803
  "Image Generation",
 
804
  ],
805
  title="Tools MCP",
806
  theme="Nymbo/Nymbo_Theme",
 
3
  # 2) DuckDuckGo Search — compact JSONL search output (short keys to minimize tokens)
4
  # 3) Python Code Executor — run Python code and capture stdout/errors
5
  # 4) Kokoro TTS — synthesize speech from text using Kokoro-82M
 
6
 
7
  from __future__ import annotations
8
 
 
22
  from duckduckgo_search import DDGS
23
  from PIL import Image
24
  from huggingface_hub import InferenceClient
25
+ import time
26
 
27
  # Optional imports for Kokoro TTS (loaded lazily)
28
  import numpy as np
 
631
  }
632
  /* Default: add subtitle under titles */
633
  .gradio-container h1::after {
634
+ content: "Fetch Webpage | Search DuckDuckGo | Code Interpreter | Kokoro TTS | Image Generation | Video Generation";
635
  display: block;
636
  font-size: 1rem;
637
  font-weight: 500;
 
792
  allow_flagging="never",
793
  )
794
 
795
+ # ==========================
796
+ # Video Generation (Serverless)
797
+ # ==========================
798
+
799
+ def _write_video_tmp(data_iter_or_bytes: object, suffix: str = ".mp4") -> str:
800
+ """Write video bytes or iterable of bytes to a temporary file and return its path."""
801
+ os.makedirs("outputs", exist_ok=True)
802
+ fname = f"outputs/video_{int(time.time())}_{random.randint(1000,9999)}{suffix}"
803
+ mode = "wb"
804
+ with open(fname, mode) as f:
805
+ # bytes-like
806
+ if isinstance(data_iter_or_bytes, (bytes, bytearray)):
807
+ f.write(data_iter_or_bytes) # type: ignore[arg-type]
808
+ # file-like with read()
809
+ elif hasattr(data_iter_or_bytes, "read"):
810
+ f.write(data_iter_or_bytes.read()) # type: ignore[call-arg]
811
+ # response-like with content
812
+ elif hasattr(data_iter_or_bytes, "content"):
813
+ f.write(data_iter_or_bytes.content) # type: ignore[attr-defined]
814
+ # iterable of chunks
815
+ elif hasattr(data_iter_or_bytes, "__iter__") and not isinstance(data_iter_or_bytes, (str, dict)):
816
+ for chunk in data_iter_or_bytes: # type: ignore[assignment]
817
+ if chunk:
818
+ f.write(chunk)
819
+ else:
820
+ raise gr.Error("Unsupported video data type returned by provider.")
821
+ return fname
822
+
823
+
824
# Token for serverless inference: prefer the read-scoped HF_READ_TOKEN, fall
# back to HF_TOKEN. May be None; Generate_Video still attempts unauthenticated
# calls in that case.
HF_VIDEO_TOKEN = os.getenv("HF_READ_TOKEN") or os.getenv("HF_TOKEN")
825
+
826
+
827
def Generate_Video(  # <-- MCP tool #6 (Generate Video)
    prompt: Annotated[str, "Text description of the video to generate (e.g., 'a red fox running through a snowy forest at sunrise')."],
    model_id: Annotated[str, "Hugging Face model id in the form 'creator/model-name'. Defaults to Wan-AI/Wan2.2-T2V-A14B."] = "Wan-AI/Wan2.2-T2V-A14B",
    negative_prompt: Annotated[str, "What should NOT appear in the video."] = "",
    steps: Annotated[int, "Number of denoising steps (1–100). Higher can improve quality but is slower."] = 25,
    cfg_scale: Annotated[float, "Guidance scale (1–20). Higher = follow the prompt more closely, lower = more creative."] = 3.5,
    seed: Annotated[int, "Random seed for reproducibility. Use -1 for a random seed per call."] = -1,
    width: Annotated[int, "Output width in pixels (multiples of 8 recommended)."] = 768,
    height: Annotated[int, "Output height in pixels (multiples of 8 recommended)."] = 768,
    fps: Annotated[int, "Frames per second of the output video (e.g., 24)."] = 24,
    duration: Annotated[float, "Target duration in seconds (provider/model dependent, commonly 2–6s)."] = 4.0,
) -> str:
    """
    Generate a short video from a text prompt using Hugging Face Inference Providers (Serverless Inference).

    This tool follows the latest MCP guidance for Gradio-based MCP servers: clear type hints and
    docstrings define the tool schema automatically. The returned file path will be converted to a file URL
    for MCP clients.

    Args:
        prompt (str): Text description of the video to generate.
        model_id (str): The Hugging Face model id (creator/model-name). Defaults to "Wan-AI/Wan2.2-T2V-A14B".
        negative_prompt (str): What should NOT appear in the video.
        steps (int): Number of denoising steps (1–100). Higher can improve quality but is slower.
        cfg_scale (float): Guidance scale (1–20). Higher = follow the prompt more closely.
        seed (int): Random seed. Use -1 to randomize on each call.
        width (int): Output width in pixels.
        height (int): Output height in pixels.
        fps (int): Frames per second.
        duration (float): Target duration in seconds.

    Returns:
        str: Path to an MP4 file on disk (Gradio will serve this file; MCP converts it to a file URL).

    Error modes:
        - Raises gr.Error with a user-friendly message on auth/model/load errors or unsupported parameters.
    """
    if not prompt or not prompt.strip():
        raise gr.Error("Please provide a non-empty prompt.")

    # NOTE: HF_VIDEO_TOKEN may be None. Public models can still work without a
    # token; a failure will surface through the error mapping below.

    # Resolve the seed once so every provider attempt (and the fallback POST
    # path) uses the same value within a single call.
    resolved_seed = seed if seed != -1 else random.randint(1, 1_000_000_000)

    # Try "auto" first, then specific providers, keeping the last failure for
    # the final error message.
    providers = ["auto", "replicate", "fal-ai"]
    last_error: Exception | None = None

    # Parameters for the legacy POST fallback; providers may ignore
    # unsupported keys.
    parameters = {
        "negative_prompt": negative_prompt or None,
        "num_inference_steps": steps,
        "guidance_scale": cfg_scale,
        "seed": resolved_seed,
        "width": width,
        "height": height,
        "fps": fps,
        # Some providers/models expect num_frames instead of duration; extra
        # keys may simply be ignored by the backend.
        "duration": duration,
    }

    for provider in providers:
        try:
            client = InferenceClient(api_key=HF_VIDEO_TOKEN, provider=provider)
            if hasattr(client, "text_to_video"):
                # Derive frame count from duration and fps when both are set.
                num_frames = int(duration * fps) if duration and fps else None

                # Provider-specific knobs travel via extra_body; only include
                # truthy values so providers see a minimal payload.
                extra_body = {}
                if width:
                    extra_body["width"] = width
                if height:
                    extra_body["height"] = height
                if fps:
                    extra_body["fps"] = fps
                if duration:
                    extra_body["duration"] = duration

                result = client.text_to_video(
                    prompt=prompt,
                    model=model_id,
                    guidance_scale=cfg_scale,
                    # NOTE(review): passed as a one-item list; some
                    # huggingface_hub versions type this parameter as a plain
                    # str — confirm against the installed version.
                    negative_prompt=[negative_prompt] if negative_prompt else None,
                    num_frames=num_frames,
                    num_inference_steps=steps,
                    seed=resolved_seed,
                    extra_body=extra_body or None,
                )
            else:
                # Generic POST fallback for older huggingface_hub versions
                # that lack text_to_video.
                result = client.post(
                    model=model_id,
                    json={
                        "inputs": prompt,
                        "parameters": {k: v for k, v in parameters.items() if v is not None},
                    },
                )

            # Save whatever payload shape the provider returned as an .mp4.
            return _write_video_tmp(result, suffix=".mp4")
        except Exception as e:
            # Remember the failure and try the next provider.
            last_error = e
            continue

    # All providers failed: map common HTTP status codes in the message to
    # friendlier guidance.
    msg = str(last_error) if last_error else "Unknown error"
    if "404" in msg:
        raise gr.Error(f"Model not found or unavailable: {model_id}. Check the id and HF token access.")
    if "503" in msg:
        raise gr.Error("The model is warming up. Please try again shortly.")
    if "401" in msg or "403" in msg:
        raise gr.Error("Authentication failed or not permitted. Set HF_READ_TOKEN/HF_TOKEN with inference access.")
    raise gr.Error(f"Video generation failed: {msg}")
942
+
943
+
944
# Gradio UI tab (and MCP tool surface) for Generate_Video.
# Input component order must match Generate_Video's positional parameters.
video_generation_interface = gr.Interface(
    fn=Generate_Video,
    inputs=[
        gr.Textbox(label="Prompt", placeholder="Enter a prompt for the video", lines=2),
        # Default matches Generate_Video's model_id default.
        gr.Textbox(label="Model", value="Wan-AI/Wan2.2-T2V-A14B", placeholder="creator/model-name"),
        gr.Textbox(label="Negative Prompt", value="", lines=2),
        gr.Slider(minimum=1, maximum=100, value=25, step=1, label="Steps"),
        gr.Slider(minimum=1.0, maximum=20.0, value=3.5, step=0.1, label="CFG Scale"),
        # -1 sentinel triggers per-call random seeding in Generate_Video.
        gr.Slider(minimum=-1, maximum=1_000_000_000, value=-1, step=1, label="Seed (-1 = random)"),
        # step=8 matches the "multiples of 8 recommended" guidance.
        gr.Slider(minimum=64, maximum=1920, value=768, step=8, label="Width"),
        gr.Slider(minimum=64, maximum=1920, value=768, step=8, label="Height"),
        gr.Slider(minimum=4, maximum=60, value=24, step=1, label="FPS"),
        gr.Slider(minimum=1.0, maximum=10.0, value=4.0, step=0.5, label="Duration (s)"),
    ],
    # Generate_Video returns a file path; gr.Video serves it as playable media.
    outputs=gr.Video(label="Generated Video"),
    title="Video Generation",
    description=(
        "<div style=\"text-align:center\">Generate short videos via Hugging Face Inference Providers. "
        "Default model is Wan2.2-T2V-A14B.</div>"
    ),
    # api_description is surfaced to MCP clients as the tool description.
    api_description=(
        "Generate a short video from a text prompt using a Hugging Face model (Serverless Inference). "
        "Parameters: prompt (str), model_id (str), negative_prompt (str), steps (int), cfg_scale (float), seed (int), "
        "width/height (int), fps (int), duration (float). Returns a file path to an MP4 that MCP exposes as a file URL."
    ),
    allow_flagging="never",
)
971
+
972
  # Build tabbed app including Image Generation
973
  demo = gr.TabbedInterface(
974
+ interface_list=[
975
+ fetch_interface,
976
+ concise_interface,
977
+ code_interface,
978
+ kokoro_interface,
979
+ image_generation_interface,
980
+ video_generation_interface,
981
+ ],
982
  tab_names=[
983
  "Fetch Webpage",
984
  "DuckDuckGo Search",
985
  "Python Code Executor",
986
  "Kokoro TTS",
987
  "Image Generation",
988
+ "Video Generation",
989
  ],
990
  title="Tools MCP",
991
  theme="Nymbo/Nymbo_Theme",