VietCat committed on
Commit fb4a646 · 1 Parent(s): 08a672b

reduce processing time

Files changed (1)
app.py +11 -13
app.py CHANGED
@@ -16,6 +16,11 @@ model = AutoModelForSeq2SeqLM.from_pretrained("VietAI/vit5-large-vietnews-summar
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
 
+# Warm-up model to reduce first-request latency
+dummy_input = tokenizer("Tin nhanh: Đây là văn bản mẫu để warmup mô hình.", return_tensors="pt").to(device)
+with torch.no_grad():
+    _ = model.generate(**dummy_input, max_length=32)
+
 class SummarizeRequest(BaseModel):
     text: str
 
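Why the warm-up pass helps: the first call to model.generate pays one-time costs (CUDA context and kernel setup, memory-allocator growth, tokenizer caches) that later calls skip, so running a throwaway generation at startup moves that cost out of the first user request. A minimal way to observe the effect, reusing the tokenizer, model, and device objects defined in app.py; the timed_generate helper below is an illustrative sketch, not part of the commit:

import time

import torch

def timed_generate(text: str) -> float:
    # Run a single generation and return its wall-clock duration in seconds.
    inputs = tokenizer(text, return_tensors="pt").to(device)
    start = time.time()
    with torch.no_grad():
        model.generate(**inputs, max_length=32)
    if device.type == "cuda":
        torch.cuda.synchronize()  # wait for queued GPU work before stopping the clock
    return time.time() - start

# The first call includes the one-time setup cost; repeat calls should be faster.
print("cold:", timed_generate("Tin nhanh: Đây là văn bản mẫu để warmup mô hình."))
print("warm:", timed_generate("Tin nhanh: Đây là văn bản mẫu để warmup mô hình."))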
 
 
@@ -37,27 +42,20 @@ async def summarize(req: Request, body: SummarizeRequest):
     else:
         text = "Vietnews: " + text
 
-
     input_text = text + " </s>"
     encoding = tokenizer(input_text, return_tensors="pt")
     input_ids = encoding["input_ids"].to(device)
     attention_mask = encoding["attention_mask"].to(device)
 
-    # Generate the summary with a stable configuration
-    # outputs = model.generate(
-    #     input_ids=input_ids,
-    #     attention_mask=attention_mask,
-    #     max_length=128,
-    #     num_beams=1,
-    #     early_stopping=True,
-    #     no_repeat_ngram_size=2,
-    #     num_return_sequences=1
-    # )
+    # Generate the summary with a stable configuration (drop early_stopping and use greedy decoding)
     outputs = model.generate(
-        input_ids=input_ids, attention_mask=attention_mask,
+        input_ids=input_ids,
+        attention_mask=attention_mask,
         max_length=256,
-        early_stopping=True
+        num_beams=1,  # greedy decoding
+        no_repeat_ngram_size=2
     )
+
     summary = tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
 
     end_time = time.time()
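On the decoding change itself: beam search advances num_beams candidate hypotheses through the decoder at every step, so switching to num_beams=1 (greedy decoding) cuts per-request decoding compute roughly in proportion to the beam width, usually at some cost in summary fluency; no_repeat_ngram_size=2 keeps greedy output from looping on repeated phrases. A hedged comparison sketch along the lines of app.py's request path, reusing its tokenizer, model, and device; the beam width of 4 is illustrative, not a setting from the repo:

import time

import torch

# Same input format app.py builds: "Vietnews: " prefix plus the </s> suffix.
sample = "Vietnews: Đây là văn bản mẫu để warmup mô hình. </s>"
enc = tokenizer(sample, return_tensors="pt").to(device)

for beams in (1, 4):  # 1 = greedy decoding (the new setting); 4 = a typical beam width
    start = time.time()
    with torch.no_grad():
        out = model.generate(**enc, max_length=256, num_beams=beams, no_repeat_ngram_size=2)
    print(f"num_beams={beams}: {time.time() - start:.2f}s")
    print(tokenizer.decode(out[0], skip_special_tokens=True))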