Update app.py
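Gate the llama-cpp-python install on is_gpu: install the plain pip wheel on CPU-only Spaces, and compile the cuBLAS build only when a GPU is available. Also set GPU_LAYERS from the environment (default 35, minus 10 on GPU; -1 on CPU) and pass it, along with explicit context and batch settings, to the Llama() constructor.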
app.py CHANGED
@@ -5,22 +5,20 @@ is_gpu=False
 
 #download for mecab
 os.system('python -m unidic download')
-
-# we need to compile a CUBLAS version
-# Or get it from https://jllllll.github.io/llama-cpp-python-cuBLAS-wheels/
-os.system('CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.11')
-
 # By using XTTS you agree to CPML license https://coqui.ai/cpml
 os.environ["COQUI_TOS_AGREED"] = "1"
-
 # NOTE: for streaming will require gradio audio streaming fix
 # pip install --upgrade -y gradio==0.50.2 git+https://github.com/gorkemgoknar/gradio.git@patch-1
-
 #Now you’re ready to install 🤗 Transformers with the following command:
 if not is_gpu:
     #For CPU-support only, Transformers and PyTorch with:
     os.system('pip install transformers[torch]')
     #pip install 'transformers[tf-cpu]' #Transformers and TensorFlow 2.0:
+    os.system('pip install llama-cpp-python==0.2.11')
+else:
+    # we need to compile a CUBLAS version
+    # Or get it from https://jllllll.github.io/llama-cpp-python-cuBLAS-wheels/
+    os.system('CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.11')
 import textwrap
 from scipy.io.wavfile import write
 from pydub import AudioSegment
@@ -155,14 +153,21 @@ from llama_cpp import Llama
 # set GPU_LAYERS to 15 if you have a 8GB GPU so both models can fit in
 # else 35 full layers + XTTS works fine on T4 16GB
 # 5gb per llm, 4gb XTTS -> full layers should fit T4 16GB , 2LLM + XTTS
-
-
+if is_gpu:
+    GPU_LAYERS=int(os.environ.get("GPU_LAYERS", 35))-10
+else:
+    GPU_LAYERS=-1
 LLM_STOP_WORDS= ["</s>","<|user|>","/s>"]
-
 LLAMA_VERBOSE=False
 
 print("Running LLM Zephyr")
-llm_zephyr = Llama(model_path=zephyr_model_path,
+llm_zephyr = Llama(model_path=zephyr_model_path,
+                   n_gpu_layers=GPU_LAYERS,
+                   max_new_tokens=512,
+                   context_window=4096,
+                   n_ctx=4096,
+                   n_batch=128,
+                   verbose=LLAMA_VERBOSE)
 
 def split_sentences(text, max_len):
     # Apply custom rules to enforce sentence breaks with double punctuation