	Upload 2 files
- app.py +2 -2
- llama_generate.py +9 -3
    	
app.py CHANGED
@@ -3,7 +3,7 @@ from llama_generate import run
 
 
 def greet(query):
-    results = run(query)
+    results = run(query, 5)
     return results
 
 
@@ -12,5 +12,5 @@ sample_list = [
     "Who is Gaël Varoquaux?"
 ]
 
-iface = gr.Interface(fn=greet, inputs="text", outputs="text", examples=sample_list, cache_examples=
+iface = gr.Interface(fn=greet, inputs="text", outputs="text", examples=sample_list, cache_examples=True)
 iface.launch()
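
For reference, a minimal sketch of what app.py amounts to after this commit. Only the lines visible in the hunks above (including the run import in the hunk header) are taken from the repo; the gradio import and the meaning of the second argument to run() are assumptions made for illustration.

    import gradio as gr
    from llama_generate import run

    def greet(query):
        # The commit adds a second positional argument (5); judging by
        # llama_generate.py's run(query=..., sample_size=10), it is likely a sample size.
        results = run(query, 5)
        return results

    sample_list = [
        "Who is Gaël Varoquaux?"
    ]

    # cache_examples=True tells Gradio to pre-compute greet() for each example prompt,
    # so clicking an example serves the cached output instead of re-running the model.
    iface = gr.Interface(fn=greet, inputs="text", outputs="text",
                         examples=sample_list, cache_examples=True)
    iface.launch()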
    	
llama_generate.py CHANGED
@@ -2,7 +2,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 from nltk.tokenize import sent_tokenize
 
-torch.device('cuda' if torch.cuda.is_available() else 'cpu') # the device to load the model onto
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # the device to load the model onto
 model_name_or_path = "TheBloke/Llama-2-7b-Chat-GPTQ"
 
 model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
@@ -11,6 +11,12 @@ model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                              trust_remote_code=False,
                                              revision="main")
 
+from ctransformers import AutoModelForCausalLM
+
+# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
+llm = AutoModelForCausalLM.from_pretrained("TheBloke/Llama-2-7b-Chat-GGUF", model_file="llama-2-7b-chat.q4_K_M.gguf", model_type="llama", gpu_layers=50)
+
+print(llm("AI is going to"))
 
 
 tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
@@ -41,7 +47,7 @@ def single_generate(query):
     model_inputs = encodeds.to(device)
     model.to(device)
 
-    generated_ids = model.generate(model_inputs, max_new_tokens=
+    generated_ids = model.generate(model_inputs, max_new_tokens=150,  do_sample=True, temperature=1.0)
     decoded = tokenizer.batch_decode(generated_ids)
     results = list()
     for index, result in enumerate(decoded):
@@ -158,5 +164,5 @@ if __name__ == '__main__':
     # print(result)
     # result = """
 
-    answer = run(query='
+    answer = run(query='Tell me something about Gaël Varoquaux, e.g., birth date and place and short bio ', sample_size=10)
     print(answer)
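
To make the generation-side changes easier to follow, here is a self-contained sketch of the pattern these hunks modify: a module-level device, inputs and model moved onto it, and sampled decoding with the new max_new_tokens=150, do_sample=True, temperature=1.0 arguments. The model id, the device line, and the generate() call come from the diff; the tokenization step, the prompt handling, and any from_pretrained() keyword not visible in the hunk are assumptions.

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # New in this commit: bind the chosen device to a name so the .to(device) calls below work.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model_name_or_path = "TheBloke/Llama-2-7b-Chat-GPTQ"
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                                 trust_remote_code=False,
                                                 revision="main")
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

    def single_generate(query):
        # Illustrative tokenization; the real file builds `encodeds` earlier in the function.
        encodeds = tokenizer(query, return_tensors="pt").input_ids
        model_inputs = encodeds.to(device)
        model.to(device)
        # Sampled decoding with the parameters introduced by this commit.
        generated_ids = model.generate(model_inputs, max_new_tokens=150, do_sample=True, temperature=1.0)
        decoded = tokenizer.batch_decode(generated_ids)
        return decoded

One design note on the added ctransformers block: `from ctransformers import AutoModelForCausalLM` rebinds the class name already imported from transformers at the top of the file. The GPTQ model created above it is unaffected (it is already bound to `model`), but any later AutoModelForCausalLM.from_pretrained call in this module resolves to ctransformers, so the GGUF snippet would typically use an aliased import or live in its own module.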