	Upload 2 files
- app.py +2 -2
- llama_generate.py +9 -3
    	
app.py CHANGED
@@ -3,7 +3,7 @@ from llama_generate import run
 
 
 def greet(query):
-    results = run(query)
+    results = run(query, 5)
     return results
 
 
@@ -12,5 +12,5 @@ sample_list = [
     "Who is Gaël Varoquaux?"
 ]
 
-iface = gr.Interface(fn=greet, inputs="text", outputs="text", examples=sample_list, cache_examples=
+iface = gr.Interface(fn=greet, inputs="text", outputs="text", examples=sample_list, cache_examples=True)
 iface.launch()
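
For reference, a minimal sketch of what app.py amounts to after this commit. Only the lines visible in the hunks above (including the run import in the hunk header) are taken from the repo; the gradio import and the meaning of the second argument to run() are assumptions made for illustration.

    import gradio as gr
    from llama_generate import run

    def greet(query):
        # The commit adds a second positional argument (5); judging by
        # llama_generate.py's run(query=..., sample_size=10), it is likely a sample size.
        results = run(query, 5)
        return results

    sample_list = [
        "Who is Gaël Varoquaux?"
    ]

    # cache_examples=True tells Gradio to pre-compute greet() for each example prompt,
    # so clicking an example serves the cached output instead of re-running the model.
    iface = gr.Interface(fn=greet, inputs="text", outputs="text",
                         examples=sample_list, cache_examples=True)
    iface.launch()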
    	
llama_generate.py CHANGED
@@ -2,7 +2,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 from nltk.tokenize import sent_tokenize
 
-torch.device('cuda' if torch.cuda.is_available() else 'cpu') # the device to load the model onto
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # the device to load the model onto
 model_name_or_path = "TheBloke/Llama-2-7b-Chat-GPTQ"
 
 model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
@@ -11,6 +11,12 @@ model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                              trust_remote_code=False,
                                              revision="main")
 
+from ctransformers import AutoModelForCausalLM
+
+# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
+llm = AutoModelForCausalLM.from_pretrained("TheBloke/Llama-2-7b-Chat-GGUF", model_file="llama-2-7b-chat.q4_K_M.gguf", model_type="llama", gpu_layers=50)
+
+print(llm("AI is going to"))
 
 
 tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
@@ -41,7 +47,7 @@ def single_generate(query):
     model_inputs = encodeds.to(device)
     model.to(device)
 
-    generated_ids = model.generate(model_inputs, max_new_tokens=
+    generated_ids = model.generate(model_inputs, max_new_tokens=150,  do_sample=True, temperature=1.0)
     decoded = tokenizer.batch_decode(generated_ids)
     results = list()
     for index, result in enumerate(decoded):
@@ -158,5 +164,5 @@ if __name__ == '__main__':
     # print(result)
     # result = """
 
-    answer = run(query='
+    answer = run(query='Tell me something about Gaël Varoquaux, e.g., birth date and place and short bio ', sample_size=10)
     print(answer)
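
To make the generation-side changes easier to follow, here is a self-contained sketch of the pattern these hunks modify: a module-level device, inputs and model moved onto it, and sampled decoding with the new max_new_tokens=150, do_sample=True, temperature=1.0 arguments. The model id, the device line, and the generate() call come from the diff; the tokenization step, the prompt handling, and any from_pretrained() keyword not visible in the hunk are assumptions.

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # New in this commit: bind the chosen device to a name so the .to(device) calls below work.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model_name_or_path = "TheBloke/Llama-2-7b-Chat-GPTQ"
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                                 trust_remote_code=False,
                                                 revision="main")
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

    def single_generate(query):
        # Illustrative tokenization; the real file builds `encodeds` earlier in the function.
        encodeds = tokenizer(query, return_tensors="pt").input_ids
        model_inputs = encodeds.to(device)
        model.to(device)
        # Sampled decoding with the parameters introduced by this commit.
        generated_ids = model.generate(model_inputs, max_new_tokens=150, do_sample=True, temperature=1.0)
        decoded = tokenizer.batch_decode(generated_ids)
        return decoded

One design note on the added ctransformers block: `from ctransformers import AutoModelForCausalLM` rebinds the class name already imported from transformers at the top of the file. The GPTQ model created above it is unaffected (it is already bound to `model`), but any later AutoModelForCausalLM.from_pretrained call in this module resolves to ctransformers, so the GGUF snippet would typically use an aliased import or live in its own module.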