import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, StopStringCriteria, StoppingCriteriaList
import torch
import subprocess

# Create a non-root user, equivalent to: useradd -m -u 1000 user
subprocess.run(["useradd", "-m", "-u", "1000", "user"])

import torch._dynamo

# Fall back to eager execution instead of raising if torch.compile fails
# on the model's custom code.
torch._dynamo.config.suppress_errors = True

import os

# import pwd
# print("HERE will print PWD")
# print(pwd.getpwuid(os.getuid())[0])
# os.system("nvidia-smi")
# print("TORCH_CUDA", torch.cuda.is_available())
| print("loading model") | |
| # Load the tokenizer and model | |
| repo_name = "nvidia/Hymba-1.5B-Instruct" | |
| # repo_name = "HuggingFaceTB/SmolLM2-1.7B-Instruct" | |
| tokenizer = AutoTokenizer.from_pretrained(repo_name, trust_remote_code=True) | |
| model = AutoModelForCausalLM.from_pretrained(repo_name, trust_remote_code=True) | |
| model = model.cuda().to(torch.bfloat16) | |
| print("model is loaded") | |
# Chat with Hymba: build a chat-formatted prompt, generate, and decode
# only the newly generated tokens.
stopping_criteria = StoppingCriteriaList(
    [StopStringCriteria(tokenizer=tokenizer, stop_strings="</s>")]
)


def chat(prompt):
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    # Apply chat template
    tokenized_chat = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    ).to("cuda")
    print("generating response")
    outputs = model.generate(
        tokenized_chat,
        max_new_tokens=256,
        do_sample=False,  # greedy decoding, so no temperature is needed
        use_cache=True,
        stopping_criteria=stopping_criteria,
    )
    # Skip the prompt tokens and decode only the model's reply.
    input_length = tokenized_chat.shape[1]
    return tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)


# Smoke-test the model once at startup.
prompt = "Who are you?"
response = chat(prompt)
print(f"Model response: {response}")
def greet(prompt):
    # Gradio handler: run one chat turn and return the model's reply.
    print(f"User: {prompt}")
    response = chat(prompt)
    print(f"Model response: {response}")
    return response


demo = gr.Interface(fn=greet, inputs="text", outputs="text")
demo.launch()
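
# A minimal sketch (untested, run from a separate process) of querying this
# app with the gradio_client package; "/predict" is the default endpoint
# name exposed by gr.Interface, and the URL assumes the default local port:
#
#   from gradio_client import Client
#   client = Client("http://127.0.0.1:7860/")
#   print(client.predict("Who are you?", api_name="/predict"))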