umarigan committed
Commit c54e666 · 1 Parent(s): 2cdfc91

Update app.py

Files changed (1)
  app.py  +10 -1
app.py CHANGED
@@ -49,7 +49,16 @@ def load_model():
     logging.info("uploading model from hf pub")
     #model_path = '/content/llama.cpp/models/llama-2-7b-chat.ggmlv3.q4_K_M.bin'
     llm = LlamaCpp(model_path=model_path, n_ctx=4096)
-    llm_chain = LLMChain(llm=llm, prompt=prompt)
+    #llm_chain = LLMChain(llm=llm, prompt=prompt)
+    n_gpu_layers = 1  # Change this value based on your model and your GPU VRAM pool.
+    n_batch = 512  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
+    llm = LlamaCpp(model_path=model_path, n_ctx=2048,
+                   input={"temperature": 0.75, "max_length": 2000, "top_p": 1},
+                   callback_manager=callback_manager,
+                   n_gpu_layers=n_gpu_layers,
+                   n_batch=n_batch,
+                   verbose=True,)
+
     #llm_chain = ConversationChain(llm=llm, prompt=promptmemory=ConversationBufferMemory())
     logging.info("uploading model done")
     return llm_chain
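
As committed, the hunk leaves two loose ends: the only assignment to llm_chain is now commented out, so return llm_chain will raise a NameError unless a module-level llm_chain happens to exist, and callback_manager must already be defined above the hunk. The input={...} argument also does not appear to be a field of LangChain's LlamaCpp wrapper, which takes sampling parameters (temperature, top_p, max_tokens) as top-level constructor arguments. A minimal sketch of a load_model() that keeps the commit's intent, assuming the 2023-era langchain import paths and that model_path and prompt (a PromptTemplate) are defined elsewhere in app.py:

# A sketch only, not the committed code: it assumes langchain's 2023-era
# import paths, llama-cpp-python installed, and module-level model_path and
# prompt (a PromptTemplate) as in the rest of app.py.
import logging

from langchain.llms import LlamaCpp
from langchain.chains import LLMChain
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler


def load_model():
    logging.info("uploading model from hf pub")
    # Stream tokens to stdout as they are generated.
    callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
    n_gpu_layers = 1  # layers to offload to the GPU; raise if VRAM allows
    n_batch = 512     # tokens evaluated per batch; keep between 1 and n_ctx
    llm = LlamaCpp(
        model_path=model_path,
        n_ctx=2048,
        temperature=0.75,   # top-level kwargs, not an input={...} dict
        top_p=1.0,
        max_tokens=2000,    # closest LlamaCpp equivalent of "max_length"
        callback_manager=callback_manager,
        n_gpu_layers=n_gpu_layers,
        n_batch=n_batch,
        verbose=True,
    )
    # Re-create the chain so the function still returns a defined value.
    llm_chain = LLMChain(llm=llm, prompt=prompt)
    logging.info("uploading model done")
    return llm_chain

Note that n_gpu_layers=1 offloads only a single layer; on a GPU with enough VRAM, raising it gives most of the speedup, and newer llama-cpp-python releases accept -1 to offload every layer.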