AbstractPhil committed on
Commit
4b732ce
·
1 Parent(s): 3d65633
Files changed (1) hide show
  1. app.py +14 -8
app.py CHANGED
@@ -73,8 +73,8 @@ def _hf_login() -> None:
73
  else:
74
  print("[HF Auth] No token found in environment variables")
75
 
76
- # Login before loading any models
77
- _hf_login()
78
 
79
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
80
 
@@ -364,12 +364,17 @@ def zerogpu_generate(full_prompt,
364
  out_ids = model.generate(
365
  **inputs,
366
  do_sample=bool(gen_kwargs.get("do_sample", True)),
367
- temperature=float(gen_kwargs.get("temperature", 0.7)),
368
- top_p=float(gen_kwargs.get("top_p", 0.9)),
369
- top_k=(int(gen_kwargs.get("top_k")) if gen_kwargs.get("top_k") and int(gen_kwargs.get("top_k")) > 0 else None),
370
  max_new_tokens=int(gen_kwargs.get("max_new_tokens", MAX_DEF)),
371
  pad_token_id=model.config.pad_token_id,
372
  eos_token_id=tokenizer.eos_token_id,
 
 
 
 
 
373
 
374
  logits_processor=logits_processor,
375
  repetition_penalty=float(gen_kwargs.get("repetition_penalty", 1.2)),
@@ -424,7 +429,7 @@ def zerogpu_generate(full_prompt,
424
  # Simple (non-Harmony) GPU path — matches your minimal example
425
  # -----------------------
426
  @spaces.GPU(duration=120)
427
- def zerogpu_generate_simple(prompt_str: str, gen_kwargs: Dict[str, Any], rose_map: Optional[Dict[str, float]], rose_alpha: float, seed: Optional[int]) -> Dict[str, str]:
428
  """Straight chat_template path. No Harmony tokens. Slices completion from prompt_len.
429
  Mirrors the minimal HF example and avoids header loops entirely."""
430
  model = None
@@ -445,7 +450,8 @@ def zerogpu_generate_simple(prompt_str: str, gen_kwargs: Dict[str, Any], rose_ma
445
  logits_processor = None
446
  if rose_map:
447
  bias = build_bias_from_tokens(tokenizer, rose_map).to(device)
448
- logits_processor = [RoseGuidedLogits(bias, float(rose_alpha))]
 
449
 
450
  out_ids = model.generate(
451
  **inputs,
@@ -512,7 +518,6 @@ def zerogpu_generate_debug(full_prompt, gen_kwargs: Dict[str, Any]) -> Dict[str,
512
  max_new_tokens=int(gen_kwargs.get("max_new_tokens", MAX_DEF)),
513
  pad_token_id=model.config.pad_token_id,
514
  eos_token_id=tokenizer.eos_token_id,
515
- bad_words_ids=bad_words_ids,
516
  stopping_criteria=sc,
517
  repetition_penalty=float(gen_kwargs.get("repetition_penalty", 1.15)),
518
  no_repeat_ngram_size=int(gen_kwargs.get("no_repeat_ngram_size", 6)),
@@ -653,6 +658,7 @@ def generate_response(message: str, history: List[List[str]], system_prompt: str
653
  },
654
  rose_map,
655
  float(rose_alpha),
 
656
  int(seed) if seed is not None else None,
657
  )
658
  else:
 
73
  else:
74
  print("[HF Auth] No token found in environment variables")
75
 
76
+ # Login is handled by Space OAuth/session; avoid explicit CLI login here to prevent OAuth var errors
77
+ # _hf_login()
78
 
79
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
80
 
 
364
  out_ids = model.generate(
365
  **inputs,
366
  do_sample=bool(gen_kwargs.get("do_sample", True)),
367
+ temperature=float(gen_kwargs.get("temperature", 0.6)),
368
+ top_p=(float(gen_kwargs.get("top_p")) if gen_kwargs.get("top_p") is not None else None),
369
+ top_k=(int(gen_kwargs.get("top_k")) if gen_kwargs.get("top_k") else None),
370
  max_new_tokens=int(gen_kwargs.get("max_new_tokens", MAX_DEF)),
371
  pad_token_id=model.config.pad_token_id,
372
  eos_token_id=tokenizer.eos_token_id,
373
+ repetition_penalty=float(gen_kwargs.get("repetition_penalty", 1.1)),
374
+ no_repeat_ngram_size=int(gen_kwargs.get("no_repeat_ngram_size", 6)),
375
+ logits_processor=logits_processor,
376
+ )
377
+ eos_token_id=tokenizer.eos_token_id,
378
 
379
  logits_processor=logits_processor,
380
  repetition_penalty=float(gen_kwargs.get("repetition_penalty", 1.2)),
 
429
  # Simple (non-Harmony) GPU path — matches your minimal example
430
  # -----------------------
431
  @spaces.GPU(duration=120)
432
+ def zerogpu_generate_simple(prompt_str: str, gen_kwargs: Dict[str, Any], rose_map: Optional[Dict[str, float]], rose_alpha: float, rose_score: Optional[float], seed: Optional[int]) -> Dict[str, str]:
433
  """Straight chat_template path. No Harmony tokens. Slices completion from prompt_len.
434
  Mirrors the minimal HF example and avoids header loops entirely."""
435
  model = None
 
450
  logits_processor = None
451
  if rose_map:
452
  bias = build_bias_from_tokens(tokenizer, rose_map).to(device)
453
+ eff_alpha = float(rose_alpha) * (float(rose_score) if rose_score is not None else 1.0)
454
+ logits_processor = [RoseGuidedLogits(bias, eff_alpha)]
455
 
456
  out_ids = model.generate(
457
  **inputs,
 
518
  max_new_tokens=int(gen_kwargs.get("max_new_tokens", MAX_DEF)),
519
  pad_token_id=model.config.pad_token_id,
520
  eos_token_id=tokenizer.eos_token_id,
 
521
  stopping_criteria=sc,
522
  repetition_penalty=float(gen_kwargs.get("repetition_penalty", 1.15)),
523
  no_repeat_ngram_size=int(gen_kwargs.get("no_repeat_ngram_size", 6)),
 
658
  },
659
  rose_map,
660
  float(rose_alpha),
661
+ float(rose_score) if rose_score is not None else None,
662
  int(seed) if seed is not None else None,
663
  )
664
  else: