Update app.py
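Gate the llama-cpp-python install on is_gpu: install the plain pip wheel on CPU-only Spaces, and compile the cuBLAS build only when a GPU is available. Also set GPU_LAYERS from the environment (default 35, minus 10 on GPU; -1 on CPU) and pass it, along with explicit context and batch settings, to the Llama() constructor.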
app.py CHANGED
@@ -5,22 +5,20 @@ is_gpu=False
 
 #download for mecab
 os.system('python -m unidic download')
-
-# we need to compile a CUBLAS version
-# Or get it from https://jllllll.github.io/llama-cpp-python-cuBLAS-wheels/
-os.system('CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.11')
-
 # By using XTTS you agree to CPML license https://coqui.ai/cpml
 os.environ["COQUI_TOS_AGREED"] = "1"
-
 # NOTE: for streaming will require gradio audio streaming fix
 # pip install --upgrade -y gradio==0.50.2 git+https://github.com/gorkemgoknar/gradio.git@patch-1
-
 #Now you’re ready to install 🤗 Transformers with the following command:
 if not is_gpu:
     #For CPU-support only, Transformers and PyTorch with:
     os.system('pip install transformers[torch]')
     #pip install 'transformers[tf-cpu]' #Transformers and TensorFlow 2.0:
+    os.system('pip install llama-cpp-python==0.2.11')
+else:
+    # we need to compile a CUBLAS version
+    # Or get it from https://jllllll.github.io/llama-cpp-python-cuBLAS-wheels/
+    os.system('CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.11')
 import textwrap
 from scipy.io.wavfile import write
 from pydub import AudioSegment
@@ -155,14 +153,21 @@ from llama_cpp import Llama
 # set GPU_LAYERS to 15 if you have a 8GB GPU so both models can fit in
 # else 35 full layers + XTTS works fine on T4 16GB
 # 5gb per llm, 4gb XTTS -> full layers should fit T4 16GB , 2LLM + XTTS
-
-
+if is_gpu:
+    GPU_LAYERS=int(os.environ.get("GPU_LAYERS", 35))-10
+else:
+    GPU_LAYERS=-1
 LLM_STOP_WORDS= ["</s>","<|user|>","/s>"]
-
 LLAMA_VERBOSE=False
 
 print("Running LLM Zephyr")
-llm_zephyr = Llama(model_path=zephyr_model_path,
+llm_zephyr = Llama(model_path=zephyr_model_path,
+                   n_gpu_layers=GPU_LAYERS,
+                   max_new_tokens=512,
+                   context_window=4096,
+                   n_ctx=4096,
+                   n_batch=128,
+                   verbose=LLAMA_VERBOSE)
 
 def split_sentences(text, max_len):
     # Apply custom rules to enforce sentence breaks with double punctuation