Update app.py
app.py CHANGED
@@ -49,7 +49,16 @@ def load_model():
     logging.info("uploading model from hf pub")
     #model_path = '/content/llama.cpp/models/llama-2-7b-chat.ggmlv3.q4_K_M.bin'
     llm = LlamaCpp(model_path=model_path, n_ctx=4096)
-    llm_chain = LLMChain(llm=llm, prompt=prompt)
+    #llm_chain = LLMChain(llm=llm, prompt=prompt)
+    n_gpu_layers = 1  # Change this value based on your model and your GPU VRAM pool.
+    n_batch = 512  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
+    llm = LlamaCpp(model_path=model_path, n_ctx=2048,
+                   input={"temperature": 0.75, "max_length": 2000, "top_p": 1},
+                   callback_manager=callback_manager,
+                   n_gpu_layers=n_gpu_layers,
+                   n_batch=n_batch,
+                   verbose=True,)
+
     #llm_chain = ConversationChain(llm=llm, prompt=promptmemory=ConversationBufferMemory())
     logging.info("uploading model done")
     return llm_chain
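The new LlamaCpp call passes callback_manager, which is not defined anywhere in this hunk. A minimal sketch of how it is typically set up elsewhere in app.py with LangChain's streaming callback (an assumption about the surrounding code, not part of this commit):

    # Assumed setup elsewhere in app.py (not shown in this diff):
    # stream generated tokens to stdout through LangChain's callback machinery.
    from langchain.callbacks.manager import CallbackManager
    from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

    callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

Note that n_gpu_layers only takes effect when llama-cpp-python is installed with GPU support (for example a cuBLAS or Metal build); on a CPU-only build all layers stay on the CPU. Also, with the llm_chain = LLMChain(...) assignment commented out, the function still ends in return llm_chain, so load_model() will raise a NameError unless llm_chain is assigned somewhere outside this hunk.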