Update app.py
app.py
CHANGED
@@ -5,14 +5,13 @@ import asyncio
 import threading
 import time
 import torch
-from fastapi import FastAPI, HTTPException, BackgroundTasks
-from fastapi.responses import StreamingResponse, JSONResponse, PlainTextResponse, HTMLResponse
+from fastapi import FastAPI, HTTPException, BackgroundTasks, Response
+from fastapi.responses import StreamingResponse, JSONResponse, PlainTextResponse, HTMLResponse
 from pydantic import BaseModel, Field
 from transformers import (
     AutoConfig,
     AutoTokenizer,
     GenerationConfig,
-    BitsAndBytesConfig,
     AutoModelForCausalLM,
     AutoProcessor,
     MusicgenForConditionalGeneration
@@ -28,25 +27,15 @@ from duckduckgo_search import DDGS
 # -----------------------
 MODEL_NAME = "jnjj/gemma-3-4b-it-1layer-actual"
 MAX_CONTEXT_LEN = 1024
-# Cuantización y offload para texto
-bnb_config = BitsAndBytesConfig(
-    load_in_8bit=False,
-    llm_int8_threshold=6.0,
-    llm_int8_has_fp16_weight=False
-)
-# Contexto máximo para MusicGen
 MUSICGEN_MAX_TOKENS = 256
 
 global_model = None
 global_tokenizer = None
 global_tokens = {}
-# Diffusers animation pipeline globals
 motion_adapter = None
 anim_pipe = None
-# MusicGen globals
 music_processor = None
 music_model = None
-
 executor = ThreadPoolExecutor(max_workers=4)
 
 # -----------------------
@@ -74,7 +63,6 @@ def load_global_models():
     global_model = AutoModelForCausalLM.from_pretrained(
         MODEL_NAME,
         config=text_config,
-        quantization_config=bnb_config,
         device_map="auto",
         offload_folder="./offload",
         offload_state_dict=True,
@@ -85,7 +73,6 @@ def load_global_models():
     )
     global_model = torch.compile(global_model, backend="inductor")
 
-    # Tokens especiales
     if global_tokenizer.eos_token_id is not None and global_tokenizer.pad_token_id is None:
         global_tokenizer.pad_token_id = global_tokenizer.eos_token_id
     global_tokens.update({
@@ -121,7 +108,7 @@ def load_global_models():
     )
     music_model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
 
-    print("Modelos de texto, animación y audio cargados
+    print("Modelos de texto, animación y audio cargados sin bitsandbytes.")
 
 @app.get("/", response_class=HTMLResponse)
 def index():
@@ -131,7 +118,7 @@ def index():
     <body>
     <h1>Servicio de Generación Multimedia</h1>
     <ul>
-      <li>Texto:
+      <li>Texto: FP16, offload, torch.compile.</li>
       <li>Animación: AnimateDiffPipeline con LoRA y CPU offload.</li>
       <li>Audio: MusicGen small, max tokens 256.</li>
     </ul>
@@ -144,9 +131,6 @@ def index():
 def health():
     return {"status": "ok"}
 
-# -----------------------
-# Pydantic Schemas
-# -----------------------
 class GenerateRequest(BaseModel):
     input_text: str = ""
     max_new_tokens: int = 2
@@ -178,10 +162,7 @@ class MusicRequest(BaseModel):
     texts: list[str]
     max_new_tokens: int = MUSICGEN_MAX_TOKENS
 
-
-# Utility Functions
-# -----------------------
-def cleanup_memory():
+async def cleanup_memory():
     gc.collect()
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
@@ -199,9 +180,6 @@ async def perform_duckasgo_search(query: str, max_results: int = 3) -> str:
         text += f"{i}. {r.get('title','')}\n URL: {r.get('href','')}\n {r.get('body','')}\n"
     return text
 
-# -----------------------
-# Text Generation
-# -----------------------
 async def generate_next_token(input_ids, past_key_values, gen_config, device):
     with torch.autocast(device_type=device, dtype=torch.float16):
         outputs = global_model(
@@ -240,7 +218,7 @@ async def stream_text(request: GenerateRequest, device: str):
         top_p=request.top_p,
         top_k=request.top_k,
         repetition_penalty=request.repetition_penalty,
-        frequency_penalty=request.
+        frequency_penalty=request.frequency_penalty,
         presence_penalty=request.presence_penalty,
         do_sample=request.do_sample
     )
@@ -266,11 +244,8 @@ async def stream_text(request: GenerateRequest, device: str):
         yield buffer
     if request.include_duckasgo:
         yield "\n" + await perform_duckasgo_search(request.input_text)
-    cleanup_memory()
+    await cleanup_memory()
 
-# -----------------------
-# Endpoints
-# -----------------------
 @app.post("/generate")
 async def generate_text(request: GenerateRequest, background_tasks: BackgroundTasks):
     if global_model is None:
@@ -318,14 +293,10 @@ async def generate_music(request: MusicRequest):
         return_tensors="pt"
     ).to(device)
     with torch.no_grad():
-        audio = music_model.generate(
-            **inputs,
-            max_new_tokens=request.max_new_tokens
-        )
-    # audio is a tensor of shape (batch, seq_len)
-    # convert to WAV bytes
+        audio = music_model.generate(**inputs, max_new_tokens=request.max_new_tokens)
     wav_bytes = music_processor.decode(audio[0].cpu()).numpy().tobytes()
     return Response(wav_bytes, media_type="audio/wav")
 
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=7860)
+
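
For quick manual testing after this change, a minimal client sketch against the endpoints visible in the diff follows. Only GET /health and POST /generate appear explicitly in the hunks; the music route path used below is a guess from the generate_music handler name, and the streaming behaviour of /generate is inferred from the stream_text generator, so treat both as assumptions.

# Client sketch under the assumptions stated above.
import requests

BASE = "http://localhost:7860"  # uvicorn.run(..., port=7860) in app.py

# Health check: GET /health returns {"status": "ok"} per the diff.
print(requests.get(f"{BASE}/health").json())

# Text generation: field names follow the GenerateRequest model in the diff;
# assumes the endpoint streams plain text chunks.
payload = {"input_text": "Hola", "max_new_tokens": 64, "do_sample": True}
with requests.post(f"{BASE}/generate", json=payload, stream=True) as r:
    for chunk in r.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)

# Music generation: MusicRequest takes a list of prompts and a token budget.
# The /generate_music path is hypothetical; only the handler body is shown.
music = {"texts": ["lo-fi beat"], "max_new_tokens": 256}
resp = requests.post(f"{BASE}/generate_music", json=music)
with open("out.wav", "wb") as f:
    f.write(resp.content)  # raw bytes returned with media_type="audio/wav"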