laserbeam2045 committed on
Commit 6dd176e · 1 Parent(s): 9736832
Files changed (1) hide show
  1. app.py +34 -44
app.py CHANGED
@@ -1,52 +1,49 @@
1
  # app.py
2
  import os
3
- import torch
4
  from fastapi import FastAPI, HTTPException
5
  from pydantic import BaseModel
6
- from transformers import AutoTokenizer, AutoModelForCausalLM
 
7
 
8
  # -----------------------------------------------------------------------------
9
- # 設定
10
  # -----------------------------------------------------------------------------
11
- MODEL_ID = "google/gemma-3-4b-it"
12
- HF_TOKEN = os.environ.get("HF_TOKEN", None)
13
- DEVICE = torch.device("cpu") # 無料枠は CPU のみ
 
 
14
 
15
- # -----------------------------------------------------------------------------
16
- # トークナイザーのロード
17
- # -----------------------------------------------------------------------------
18
- tokenizer = AutoTokenizer.from_pretrained(
19
- MODEL_ID,
20
- token=HF_TOKEN,
21
- trust_remote_code=True
22
- )
23
 
24
  # -----------------------------------------------------------------------------
25
- # モデルのロード+低メモリモード
26
  # -----------------------------------------------------------------------------
27
- model = AutoModelForCausalLM.from_pretrained(
28
- MODEL_ID,
29
- token=HF_TOKEN,
30
- trust_remote_code=True,
31
- torch_dtype=torch.float32,
32
- low_cpu_mem_usage=True
33
- ).to(DEVICE)
 
 
 
34
 
35
  # -----------------------------------------------------------------------------
36
- # 動的量子化の適用
37
  # -----------------------------------------------------------------------------
38
- # - {torch.nn.Linear} を INT8 化
39
- # - dtype=torch.qint8 で重みのみ量子化
40
- model = torch.quantization.quantize_dynamic(
41
- model,
42
- {torch.nn.Linear},
43
- dtype=torch.qint8
44
  )
45
 
46
  # -----------------------------------------------------------------------------
47
- # FastAPI サーバー定義
48
  # -----------------------------------------------------------------------------
49
- app = FastAPI(title="Gemma3-4B-IT with Dynamic Quantization")
50
 
51
  class GenerationRequest(BaseModel):
52
  prompt: str
@@ -58,22 +55,15 @@ class GenerationRequest(BaseModel):
58
  async def generate(req: GenerationRequest):
59
  if not req.prompt:
60
  raise HTTPException(status_code=400, detail="`prompt` は必須です。")
61
- # トークナイズして推論
62
- inputs = tokenizer(
63
  req.prompt,
64
- return_tensors="pt",
65
- truncation=True,
66
- padding=True
67
- ).to(DEVICE)
68
- output_ids = model.generate(
69
- **inputs,
70
- max_new_tokens=req.max_new_tokens,
71
- temperature=req.temperature,
72
  top_p=req.top_p,
73
- do_sample=True,
74
- pad_token_id=tokenizer.eos_token_id
 
 
75
  )
76
- text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
77
  return {"generated_text": text}
78
 
79
  # -----------------------------------------------------------------------------
 
1
  # app.py
2
  import os
 
3
  from fastapi import FastAPI, HTTPException
4
  from pydantic import BaseModel
5
+ from huggingface_hub import hf_hub_download
6
+ from pyllamacpp.model import Model
7
 
8
  # -----------------------------------------------------------------------------
9
+ # Hugging Face Hub の設定
10
  # -----------------------------------------------------------------------------
11
+ HF_TOKEN = os.environ.get("HF_TOKEN") # 必要に応じて Secrets にセット
12
+ REPO_ID = "google/gemma-3-12b-it-qat-q4_0-gguf"
13
+ # 実際にリポジトリに置かれている GGUF ファイル名を確認してください。
14
+ # 例: "gemma-3-12b-it-qat-q4_0-gguf.gguf"
15
+ GGUF_FILENAME = "gemma-3-12b-it-qat-q4_0-gguf.gguf"
16
 
17
+ # キャッシュ先のパス(リポジトリ直下に置く場合)
18
+ MODEL_PATH = os.path.join(os.getcwd(), GGUF_FILENAME)
 
 
 
 
 
 
19
 
20
  # -----------------------------------------------------------------------------
21
+ # 起動時に一度だけダウンロード
22
  # -----------------------------------------------------------------------------
23
+ if not os.path.exists(MODEL_PATH):
24
+ print(f"Downloading {GGUF_FILENAME} from {REPO_ID} …")
25
+ hf_hub_download(
26
+ repo_id=REPO_ID,
27
+ filename=GGUF_FILENAME,
28
+ token=HF_TOKEN,
29
+ repo_type="model", # 明示的にモデルリポジトリを指定
30
+ local_dir=os.getcwd(), # カレントディレクトリに保存
31
+ local_dir_use_symlinks=False
32
+ )
33
 
34
  # -----------------------------------------------------------------------------
35
+ # llama.cpp (pyllamacpp) で 4bit GGUF モデルをロード
36
  # -----------------------------------------------------------------------------
37
+ llm = Model(
38
+ model_path=MODEL_PATH,
39
+ n_ctx=512, # 必要に応じて調整
40
+ n_threads=4, # 実マシンのコア数に合わせて
 
 
41
  )
42
 
43
  # -----------------------------------------------------------------------------
44
+ # FastAPI 定義
45
  # -----------------------------------------------------------------------------
46
+ app = FastAPI(title="Gemma3-12B-IT Q4_0 GGUF API")
47
 
48
  class GenerationRequest(BaseModel):
49
  prompt: str
 
55
  async def generate(req: GenerationRequest):
56
  if not req.prompt:
57
  raise HTTPException(status_code=400, detail="`prompt` は必須です。")
58
+ # llama.cpp の generate を呼び出し
59
+ text = llm.generate(
60
  req.prompt,
 
 
 
 
 
 
 
 
61
  top_p=req.top_p,
62
+ temp=req.temperature,
63
+ n_predict=req.max_new_tokens,
64
+ repeat_last_n=64,
65
+ repeat_penalty=1.1
66
  )
 
67
  return {"generated_text": text}
68
 
69
  # -----------------------------------------------------------------------------
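
For reference, a minimal client sketch against the updated handler. It assumes the handler is mounted at POST /generate on the Space's default port 7860 (the route decorator and the GenerationRequest defaults sit outside this diff); the request fields and the generated_text response key follow the handler shown above.

# client_example.py — minimal sketch; route path and port are assumptions.
import requests

resp = requests.post(
    "http://localhost:7860/generate",    # assumed route and port
    json={
        "prompt": "Explain GGUF in one sentence.",
        "max_new_tokens": 64,            # forwarded as n_predict
        "temperature": 0.7,              # forwarded as temp
        "top_p": 0.9,
    },
    timeout=300,                         # CPU-only generation can be slow
)
resp.raise_for_status()
print(resp.json()["generated_text"])

The generous timeout reflects that a 12B Q4_0 model served on CPU via llama.cpp can take well over a minute per response.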