release
app.py
CHANGED
@@ -91,14 +91,7 @@ hf_hub_download(
     local_dir="./models",
 )
 
-
-title = "Lhama.cpp Qwen2.5-0.5B-Rag-Thinking-Flan-T5"
-description = """## My Best CPU Rag Solution
-- I use forked [lhamacpp-python](https://github.com/fairydreaming/llama-cpp-python/tree/t5) server and this doesn't support new model
-- search query generation(query reformulation) Tasks - I use flan-t5-base (large make better result,but too large for just this task)
-- Qwen2.5-0.5B as good as small-size.
-- anyway google T5 series on CPU is amazing
-"""
+
 
 
 
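For context, the `hf_hub_download(` call that this hunk ends in is the standard huggingface_hub API for fetching the GGUF files into `./models`. A minimal sketch; the `repo_id` and `filename` below are placeholders, not the Space's actual values:

```python
from huggingface_hub import hf_hub_download

# Download a GGUF file into the local ./models directory.
# repo_id and filename are illustrative placeholders; the Space downloads
# its own Qwen2.5-0.5B and flan-t5 GGUF files this way.
hf_hub_download(
    repo_id="your-user/your-gguf-repo",
    filename="model.Q6_K.gguf",
    local_dir="./models",
)
```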
@@ -142,13 +135,12 @@ def generate_t5(llama,message):#text size must be smaller than ctx(default=512)
     try:
         tokens = llama.tokenize(f"{message}".encode("utf-8"))
         print(f"text length={len(tokens)}")
-        #print(tokens)
         llama.encode(tokens)
         tokens = [llama.decoder_start_token()]
 
 
         outputs =""
-
+
         iteration = 1
         temperature = 0.5
         top_k = 40
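`generate_t5` encodes the prompt with the T5 encoder and then generates from the decoder start token. A minimal sketch of that encoder-decoder loop, assuming the `eval`/`sample`/`detokenize` methods of the llama-cpp-python `Llama` object; the function name is hypothetical and the exact loop in app.py may differ:

```python
def generate_t5_sketch(llama, message, max_new_tokens=256):
    # Encode the input with the T5 encoder (forked llama-cpp-python).
    # Note: the input must fit within the context size (default 512).
    tokens = llama.tokenize(f"{message}".encode("utf-8"))
    llama.encode(tokens)

    # Decode token by token, starting from the decoder start token.
    tokens = [llama.decoder_start_token()]
    output = ""
    for _ in range(max_new_tokens):
        llama.eval(tokens)
        token_id = llama.sample(temp=0.5, top_k=40)  # sampling values mirror the diff above
        if token_id == llama.token_eos():
            break
        output += llama.detokenize([token_id]).decode("utf-8", errors="ignore")
        tokens = [token_id]
    return output
```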
@@ -234,12 +226,6 @@ def answer(document:str,question:str,model:str="Qwen2.5-0.5B-Rag-Thinking.i1-Q6_
         verbose=False
     )
     llm_model = model
-    #provider = LlamaCppPythonProvider(llm)
-
-
-    #answer = to_answer(provider,document,question)
-    #return result['choices'][0]['text']
-
 
 def respond(
     message: str,
@@ -276,13 +262,25 @@ def respond(
     answer(document,message)
     response = ""
     #do direct in here
-    for chunk in llm(system_message%(document,message),max_tokens=
+    for chunk in llm(system_message%(document,message),max_tokens=max_tokens,stream=True,top_k=top_k, top_p=top_p, temperature=temperature, repeat_penalty=repeat_penalty):
         text = chunk['choices'][0]['text']
-        #print(text, end='', flush=True)  # sequential display
         response += text
         yield response
 
+
 # Create a chat interface
+# Set the title and description
+title = "Llama.cpp Qwen2.5-0.5B-Rag-Thinking-Flan-T5"
+description = """
+- I use a forked [llama-cpp-python](https://github.com/fairydreaming/llama-cpp-python/tree/t5) which supports T5 on the server, but it does not support newer models (like Gemma 3)
+- Search query generation (query reformulation) task: I use flan-t5-base (the large variant gives better results, but is too big for just this task)
+- Qwen2.5-0.5B works well for its small size.
+- In any case, Google's T5 series on CPU is amazing
+## Huggingface Free CPU Limitations
+- When duplicating a Space, the build process can occasionally become stuck, requiring a manual restart to finish.
+- Spaces may unexpectedly stop functioning or even be deleted, requiring them to be reworked. Refer to [this issue](https://github.com/huggingface/hub-docs/issues/1633) for more information.
+"""
+
 demo = gr.ChatInterface(
     respond,
     examples=[["What is the Diffuser?"], ["Tell me About Huggingface."], ["How to upload dataset?"]],
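The new `for chunk in llm(...)` line streams the completion: calling the `Llama` object with `stream=True` yields chunks whose text deltas are accumulated and yielded so `gr.ChatInterface` can render the growing answer. A minimal standalone sketch of that pattern; the model path, prompt, and sampling values below are placeholders for whatever app.py actually passes in:

```python
from llama_cpp import Llama

# Placeholder path; the Space loads its own Qwen2.5-0.5B GGUF from ./models.
llm = Llama(model_path="./models/your-qwen2.5-0.5b.gguf", verbose=False)

def stream_answer(prompt, max_tokens=2048):
    response = ""
    # stream=True returns an iterator of chunks, each carrying a text delta.
    for chunk in llm(prompt, max_tokens=max_tokens, stream=True,
                     top_k=40, top_p=0.95, temperature=0.5, repeat_penalty=1.1):
        response += chunk['choices'][0]['text']
        yield response  # yield the accumulated text for incremental display
```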
@@ -306,9 +304,9 @@ demo = gr.ChatInterface(
             lines=2,visible=False
         ),
         gr.Slider(
-            minimum=
-            maximum=
-            value=
+            minimum=1024,
+            maximum=8192,
+            value=2048,
             step=1,
             label="Max Tokens",
             info="Maximum length of response (higher = longer replies)",
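The new slider bounds (1024-8192, default 2048) reach `respond()` through `gr.ChatInterface`'s `additional_inputs`: each extra component's value is passed as a positional argument after `message` and `history`. A minimal sketch of that wiring, assuming the hidden document textbox precedes the slider as shown in the hunk; the parameter names and the trivial body are illustrative, not the full app.py configuration:

```python
import gradio as gr

def respond(message, history, document, max_tokens):
    # document and max_tokens arrive from additional_inputs, in order.
    return f"(demo) got {len(document)} chars of context, max_tokens={max_tokens}"

demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(lines=2, visible=False),  # hidden document/context field
        gr.Slider(minimum=1024, maximum=8192, value=2048, step=1,
                  label="Max Tokens",
                  info="Maximum length of response (higher = longer replies)"),
    ],
)
```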