release
app.py
CHANGED
@@ -91,14 +91,7 @@ hf_hub_download(
     local_dir="./models",
 )
 
-
-title = "Lhama.cpp Qwen2.5-0.5B-Rag-Thinking-Flan-T5"
-description = """## My Best CPU Rag Solution
-- I use forked [lhamacpp-python](https://github.com/fairydreaming/llama-cpp-python/tree/t5) server and this doesn't support new model
-- search query generation(query reformulation) Tasks - I use flan-t5-base (large make better result,but too large for just this task)
-- Qwen2.5-0.5B as good as small-size.
-- anyway google T5 series on CPU is amazing
-"""
+
 
 
 
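For context, the `hf_hub_download(` call that this hunk ends in is the standard huggingface_hub API for fetching the GGUF files into `./models`. A minimal sketch; the `repo_id` and `filename` below are placeholders, not the Space's actual values:

```python
from huggingface_hub import hf_hub_download

# Download a GGUF file into the local ./models directory.
# repo_id and filename are illustrative placeholders; the Space downloads
# its own Qwen2.5-0.5B and flan-t5 GGUF files this way.
hf_hub_download(
    repo_id="your-user/your-gguf-repo",
    filename="model.Q6_K.gguf",
    local_dir="./models",
)
```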
@@ -142,13 +135,12 @@ def generate_t5(llama,message):#text size must be smaller than ctx(default=512)
     try:
         tokens = llama.tokenize(f"{message}".encode("utf-8"))
         print(f"text length={len(tokens)}")
-        #print(tokens)
         llama.encode(tokens)
         tokens = [llama.decoder_start_token()]
 
 
         outputs =""
-
+
         iteration = 1
         temperature = 0.5
         top_k = 40
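`generate_t5` encodes the prompt with the T5 encoder and then generates from the decoder start token. A minimal sketch of that encoder-decoder loop, assuming the `eval`/`sample`/`detokenize` methods of the llama-cpp-python `Llama` object; the function name is hypothetical and the exact loop in app.py may differ:

```python
def generate_t5_sketch(llama, message, max_new_tokens=256):
    # Encode the input with the T5 encoder (forked llama-cpp-python).
    # Note: the input must fit within the context size (default 512).
    tokens = llama.tokenize(f"{message}".encode("utf-8"))
    llama.encode(tokens)

    # Decode token by token, starting from the decoder start token.
    tokens = [llama.decoder_start_token()]
    output = ""
    for _ in range(max_new_tokens):
        llama.eval(tokens)
        token_id = llama.sample(temp=0.5, top_k=40)  # sampling values mirror the diff above
        if token_id == llama.token_eos():
            break
        output += llama.detokenize([token_id]).decode("utf-8", errors="ignore")
        tokens = [token_id]
    return output
```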
@@ -234,12 +226,6 @@ def answer(document:str,question:str,model:str="Qwen2.5-0.5B-Rag-Thinking.i1-Q6_
         verbose=False
     )
     llm_model = model
-    #provider = LlamaCppPythonProvider(llm)
-
-
-    #answer = to_answer(provider,document,question)
-    #return result['choices'][0]['text']
-
 
 def respond(
     message: str,
@@ -276,13 +262,25 @@ def respond(
     answer(document,message)
     response = ""
     #do direct in here
-    for chunk in llm(system_message%(document,message),max_tokens=
+    for chunk in llm(system_message%(document,message),max_tokens=max_tokens,stream=True,top_k=top_k, top_p=top_p, temperature=temperature, repeat_penalty=repeat_penalty):
         text = chunk['choices'][0]['text']
-        #print(text, end='', flush=True)  # sequential display
         response += text
         yield response
 
+
 # Create a chat interface
+# Set the title and description
+title = "Llama.cpp Qwen2.5-0.5B-Rag-Thinking-Flan-T5"
+description = """
+- I use a forked [llama-cpp-python](https://github.com/fairydreaming/llama-cpp-python/tree/t5) which supports T5 on the server, but it does not support newer models (like Gemma 3)
+- Search query generation (query reformulation) task: I use flan-t5-base (the large variant gives better results, but is too big for just this task)
+- Qwen2.5-0.5B works well for its small size.
+- In any case, Google's T5 series on CPU is amazing
+## Huggingface Free CPU Limitations
+- When duplicating a Space, the build process can occasionally become stuck, requiring a manual restart to finish.
+- Spaces may unexpectedly stop functioning or even be deleted, requiring them to be reworked. Refer to [this issue](https://github.com/huggingface/hub-docs/issues/1633) for more information.
+"""
+
 demo = gr.ChatInterface(
     respond,
     examples=[["What is the Diffuser?"], ["Tell me About Huggingface."], ["How to upload dataset?"]],
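The new `for chunk in llm(...)` line streams the completion: calling the `Llama` object with `stream=True` yields chunks whose text deltas are accumulated and yielded so `gr.ChatInterface` can render the growing answer. A minimal standalone sketch of that pattern; the model path, prompt, and sampling values below are placeholders for whatever app.py actually passes in:

```python
from llama_cpp import Llama

# Placeholder path; the Space loads its own Qwen2.5-0.5B GGUF from ./models.
llm = Llama(model_path="./models/your-qwen2.5-0.5b.gguf", verbose=False)

def stream_answer(prompt, max_tokens=2048):
    response = ""
    # stream=True returns an iterator of chunks, each carrying a text delta.
    for chunk in llm(prompt, max_tokens=max_tokens, stream=True,
                     top_k=40, top_p=0.95, temperature=0.5, repeat_penalty=1.1):
        response += chunk['choices'][0]['text']
        yield response  # yield the accumulated text for incremental display
```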
@@ -306,9 +304,9 @@ demo = gr.ChatInterface(
             lines=2,visible=False
         ),
         gr.Slider(
-            minimum=
-            maximum=
-            value=
+            minimum=1024,
+            maximum=8192,
+            value=2048,
             step=1,
             label="Max Tokens",
             info="Maximum length of response (higher = longer replies)",
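The new slider bounds (1024-8192, default 2048) reach `respond()` through `gr.ChatInterface`'s `additional_inputs`: each extra component's value is passed as a positional argument after `message` and `history`. A minimal sketch of that wiring, assuming the hidden document textbox precedes the slider as shown in the hunk; the parameter names and the trivial body are illustrative, not the full app.py configuration:

```python
import gradio as gr

def respond(message, history, document, max_tokens):
    # document and max_tokens arrive from additional_inputs, in order.
    return f"(demo) got {len(document)} chars of context, max_tokens={max_tokens}"

demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(lines=2, visible=False),  # hidden document/context field
        gr.Slider(minimum=1024, maximum=8192, value=2048, step=1,
                  label="Max Tokens",
                  info="Maximum length of response (higher = longer replies)"),
    ],
)
```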