Hilda Cran May mikeee committed on
Commit
fce4951
0 Parent(s):

Duplicate from mikeee/qwen-7b-chat


Co-authored-by: mikeee <[email protected]>

Files changed (7)
  1. .gitattributes +35 -0
  2. .gitignore +1 -0
  3. .ruff.toml +17 -0
  4. README.md +13 -0
  5. app.py +535 -0
  6. example_list.py +56 -0
  7. requirements.txt +19 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
+ .ruff_cache
.ruff.toml ADDED
@@ -0,0 +1,17 @@
+ # Assume Python 3.10.
+ target-version = "py310"
+ # Raise the maximum line length to 300 characters.
+ line-length = 300
+
+ # pyflakes, pycodestyle, isort
+ # flake8 YTT, pydocstyle D, pylint PLC
+ select = ["F", "E", "W", "I001", "YTT", "D", "PLC"]
+ # select = ["ALL"]
+
+ # D103 Missing docstring in public function
+ # D101 Missing docstring in public class
+ # `multi-line-summary-first-line` (D212)
+ # `one-blank-line-before-class` (D203)
+ extend-ignore = ["D103", "D101", "D212", "D203"]
+
+ exclude = [".venv"]
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: Qwen 7b Chat
+ emoji: ⚡
+ colorFrom: purple
+ colorTo: yellow
+ sdk: gradio
+ sdk_version: 3.39.0
+ app_file: app.py
+ pinned: false
+ duplicated_from: mikeee/qwen-7b-chat
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,535 @@
+ """
+ Run qwen 7b chat.
+
+ transformers 4.31.0
+
+ import torch
+ torch.cuda.empty_cache()
+
+ model.chat(
+     tokenizer: transformers.tokenization_utils.PreTrainedTokenizer,
+     query: str,
+     history: Optional[List[Tuple[str, str]]],
+     system: str = 'You are a helpful assistant.',
+     append_history: bool = True,
+     stream: Optional[bool] = <object object at 0x7f905797ec20>,
+     stop_words_ids: Optional[List[List[int]]] = None,
+     **kwargs) -> Tuple[str, List[Tuple[str, str]]]
+ )
+
+ model.generation_config
+ GenerationConfig {
+   "chat_format": "chatml",
+   "do_sample": true,
+   "eos_token_id": 151643,
+   "max_new_tokens": 512,
+   "max_window_size": 6144,
+   "pad_token_id": 151643,
+   "top_k": 0,
+   "top_p": 0.5,
+   "transformers_version": "4.31.0",
+   "trust_remote_code": true
+ }
+ """
+ # pylint: disable=line-too-long, invalid-name, no-member, redefined-outer-name, missing-function-docstring, missing-class-docstring, broad-except,
+ import gc
+ import os
+ import sys
+ import time
+ from collections import deque
+ from dataclasses import asdict, dataclass
+ from textwrap import dedent
+ from types import SimpleNamespace
+ from typing import List, Optional
+
+ import gradio as gr
+ import torch
+ from loguru import logger
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from transformers.generation import GenerationConfig
+
+ from example_list import css, example_list
+
+ if not torch.cuda.is_available():
+     raise gr.Error("No CUDA, can't continue...")
+
+ os.environ["TZ"] = "Asia/Shanghai"
+ try:
+     time.tzset()  # type: ignore # pylint: disable=no-member
+ except Exception:
+     # Windows
+     logger.warning("Windows, can't run time.tzset()")
+
+ model_name = "Qwen/Qwen-7B-Chat"
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+ n_gpus = torch.cuda.device_count()
+ try:
+     _ = f"{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB"
+ except AssertionError:
+     _ = 0
+ max_memory = {i: _ for i in range(n_gpus)}
+
+ del sys
+ # logger.remove()  # to turn on trace
+ # logger.add(sys.stderr, level="TRACE")
+ # logger.trace(f"{chat_history=}")
+
+
+ def gen_model(model_name: str):
+     model = AutoModelForCausalLM.from_pretrained(
+         model_name,
+         trust_remote_code=True,
+         device_map="auto",
+         load_in_4bit=True,
+         max_memory=max_memory,
+         fp16=True,
+         torch_dtype=torch.float16,
+         bnb_4bit_quant_type="nf4",
+         bnb_4bit_compute_dtype=torch.bfloat16,
+     )
+     model = model.eval()
+     model.generation_config = GenerationConfig.from_pretrained(
+         model_name,
+         trust_remote_code=True,
+     )
+     return model
+
+
+ def user_clear(message, chat_history):
+     """Gen a response, clear message in user textbox."""
+     logger.debug(f"{message=}")
+
+     try:
+         chat_history.append([message, ""])
+     except Exception:
+         # start a fresh history of [query, reply] pairs
+         chat_history = deque([[message, ""]], maxlen=5)
+
+     logger.trace(f"{chat_history=}")
+     return "", chat_history
+
+
+ def user(message, chat_history):
+     """Gen a response."""
+     logger.debug(f"{message=}")
+     logger.trace(f"{chat_history=}")
+
+     try:
+         chat_history.append([message, ""])
+     except Exception:
+         # start a fresh history of [query, reply] pairs
+         chat_history = deque([[message, ""]], maxlen=5)
+     return message, chat_history
+
+
+ # for rerun in tests
+ model = None
+ gc.collect()
+ torch.cuda.empty_cache()
+
+ if not torch.cuda.is_available():
+     # raise gr.Error("GPU not available, can't run. Turn on GPU and retry")
+     raise SystemExit("GPU not available, can't run. Turn on GPU and retry")
+
+ model = gen_model(model_name)
+
+
+ def bot(chat_history, **kwargs):
+     try:
+         message = chat_history[-1][0]
+     except Exception as exc:
+         logger.error(f"{chat_history=}: {exc}")
+         return chat_history
+     logger.debug(f"{chat_history=}")
+     try:
+         _ = """
+         response, chat_history = model.chat(
+             tokenizer,
+             message,
+             history=chat_history,
+             temperature=0.7,
+             repetition_penalty=1.2,
+             # max_length=128,
+         )
+         """
+         logger.debug("run model.chat...")
+         model.generation_config.update(**kwargs)
+         response, chat_history = model.chat(
+             tokenizer,
+             message,
+             chat_history[:-1],
+             # **kwargs,
+         )
+         del response
+         return chat_history
+     except Exception as exc:
+         logger.error(exc)
+         chat_history[-1] = [message, str(exc)]  # surface the error as the reply
+         return chat_history
+
+
+ def bot_stream(chat_history, **kwargs):
+     logger.trace(f"{chat_history=}")
+     logger.trace(f"{kwargs=}")
+
+     try:
+         message = chat_history[-1][0]
+     except Exception as exc:
+         logger.error(f"{chat_history=}: {exc}")
+         raise gr.Error(f"{chat_history=}")
+         # yield chat_history
+
+     # for elm in model.chat_stream(tokenizer, message, chat_history):
+     model.generation_config.update(**kwargs)
+     response = ""
+     for elm in model.chat_stream(tokenizer, message, chat_history):
+         chat_history[-1] = [message, elm]
+         response = elm
+         yield chat_history
+     logger.debug(f"{response=}")
+     logger.debug(f"{model.generation_config=}")
+
+
+ SYSTEM_PROMPT = "You are a helpful assistant."
+ MAX_MAX_NEW_TOKENS = 2048  # sequence length 2048
+ MAX_NEW_TOKENS = 256
+
+
+ @dataclass
+ class Config:
+     max_new_tokens: int = MAX_NEW_TOKENS
+     repetition_penalty: float = 1.1
+     temperature: float = 1.0
+     top_k: int = 0
+     top_p: float = 0.9
+
+
+ # stats_default = SimpleNamespace(llm=model, system_prompt=SYSTEM_PROMPT, config=Config())
+ stats_default = SimpleNamespace(llm=None, system_prompt=SYSTEM_PROMPT, config=Config())
+
+
+ # input max_new_tokens temperature repetition_penalty top_k top_p system_prompt history
+ def api_fn(  # pylint: disable=too-many-arguments
+     input_text: Optional[str],
+     # max_length: int = 256,
+     max_new_tokens: int = stats_default.config.max_new_tokens,
+     temperature: float = stats_default.config.temperature,
+     repetition_penalty: float = stats_default.config.repetition_penalty,
+     top_k: int = stats_default.config.top_k,
+     top_p: float = stats_default.config.top_p,
+     system_prompt: Optional[str] = None,
+     history: Optional[List[str]] = None,
+ ):
+     if input_text is None:
+         input_text = ""
+     try:
+         input_text = str(input_text).strip()
+     except Exception as exc:
+         logger.error(exc)
+         input_text = ""
+     if not input_text:
+         return ""
+     if history is None:
+         history = []
+     try:
+         temperature = float(temperature)
+     except Exception:
+         temperature = stats_default.config.temperature
+
+     if system_prompt is None:
+         system_prompt = stats_default.system_prompt
+     # if max_length < 10: max_length = 4096
+     if max_new_tokens < 10:
+         max_new_tokens = stats_default.config.max_new_tokens
+     if top_p < 0.1 or top_p > 1:
+         top_p = stats_default.config.top_p
+     if temperature <= 0.5:
+         temperature = stats_default.config.temperature
+
+     _ = {
+         "max_new_tokens": max_new_tokens,
+         "temperature": temperature,
+         "repetition_penalty": repetition_penalty,
+         "top_k": top_k,
+         "top_p": top_p,
+     }
+     model.generation_config.update(**_)
+     try:
+         res, _ = model.chat(
+             tokenizer,
+             input_text,
+             history=history,
+             # max_length=max_length,
+             append_history=False,
+         )
+         # logger.debug(f"{res=} \n{_=}")
+     except Exception as exc:
+         logger.error(f"{exc=}")
+         res = str(exc)
+
+     logger.debug(f"api {res=}")
+     logger.debug(f"api {model.generation_config=}")
+
+     return res
+
+
+ theme = gr.themes.Soft(text_size="sm")
+ with gr.Blocks(
+     theme=theme,
+     title=model_name.lower(),
+     css=css,
+ ) as block:
+     stats = gr.State(stats_default)
+
+     # would this reset model?
+     model.generation_config = GenerationConfig.from_pretrained(
+         model_name,
+         trust_remote_code=True,
+     )
+     config = asdict(stats.value.config)
+
+     def bot_stream_state(chat_history):
+         logger.trace(f"{chat_history=}")
+         yield from bot_stream(chat_history, **config)
+
+     with gr.Accordion("🎈 Info", open=False):
+         gr.Markdown(
+             dedent(
+                 f"""
+                 ## {model_name.lower()}
+
+                 * Temperature range: 0.51 and up; a higher temperature means more randomness. A temperature around 1.1 is suggested for chatting and creative writing, 0.51-1.0 for summarizing and translation.
+                 * Set `repetition_penalty` to 2.1 or higher for a chatty conversation (more unpredictable and undesirable output). Lower it to 1.1 or less if more focused answers are desired (for example for translations or fact-oriented queries).
+                 * A smaller `top_k` will probably result in smoother sentences.
+                 (`top_k=0` is effectively the same as a very large `top_k`.) Consult the `transformers` documentation for more details.
+                 * An API is available at https://mikeee-qwen-7b-chat.hf.space/ that can be queried, e.g., in python
+                 ```python
+                 from gradio_client import Client
+
+                 client = Client("https://mikeee-qwen-7b-chat.hf.space/")
+
+                 result = client.predict(
+                     "你好!",  # user prompt
+                     256,  # max_new_tokens
+                     1.2,  # temperature
+                     1.1,  # repetition_penalty
+                     0,  # top_k
+                     0.9,  # top_p
+                     "You are a helpful assistant.",  # system_prompt
+                     None,  # history
+                     api_name="/api"
+                 )
+                 print(result)
+                 ```
+                 or in javascript
+                 ```js
+                 import {{ client }} from "@gradio/client";
+
+                 const app = await client("https://mikeee-qwen-7b-chat.hf.space/");
+                 const result = await app.predict("api", [...]);
+                 console.log(result.data);
+                 ```
+                 Check the documentation and examples by clicking `Use via API` at the very bottom of [https://huggingface.co/spaces/mikeee/qwen-7b-chat](https://huggingface.co/spaces/mikeee/qwen-7b-chat).
+
+                 <p></p>
+                 Most examples were written for another model, so you may want to adapt them
+                 or try some related prompts. The system prompt can be changed
+                 under Advanced Options as well."""
+             ),
+             elem_classes="xsmall",
+         )
+
+     chatbot = gr.Chatbot(height=500, value=deque([], maxlen=5))  # type: ignore
+
+     with gr.Row():
+         with gr.Column(scale=5):
+             msg = gr.Textbox(
+                 label="Chat Message Box",
+                 placeholder="Ask me anything (press Shift+Enter or click Submit to send)",
+                 show_label=False,
+                 # container=False,
+                 lines=4,
+                 max_lines=30,
+                 show_copy_button=True,
+                 # ).style(container=False)
+             )
+         with gr.Column(scale=1, min_width=50):
+             with gr.Row():
+                 submit = gr.Button("Submit", elem_classes="xsmall")
+                 stop = gr.Button("Stop", visible=True)
+                 clear = gr.Button("Clear History", visible=True)
+
+     msg_submit_event = msg.submit(
+         # fn=conversation.user_turn,
+         fn=user,
+         inputs=[msg, chatbot],
+         outputs=[msg, chatbot],
+         queue=True,
+         show_progress="full",
+         # api_name=None,
+     ).then(bot_stream_state, chatbot, chatbot, queue=True)
+     submit_click_event = submit.click(
+         # fn=lambda x, y: ("",) + user(x, y)[1:],  # clear msg
+         fn=user_clear,  # clear msg
+         inputs=[msg, chatbot],
+         outputs=[msg, chatbot],
+         queue=True,
+         show_progress="full",
+         # api_name=None,
+     ).then(bot_stream_state, chatbot, chatbot, queue=True)
+     stop.click(
+         fn=None,
+         inputs=None,
+         outputs=None,
+         cancels=[msg_submit_event, submit_click_event],
+         queue=False,
+     )
+     clear.click(lambda: None, None, chatbot, queue=False)
+
+     with gr.Accordion(label="Advanced Options", open=False):
+         system_prompt = gr.Textbox(
+             label="System prompt",
+             value=stats_default.system_prompt,
+             lines=3,
+             visible=True,
+         )
+         max_new_tokens = gr.Slider(
+             label="Max new tokens",
+             minimum=1,
+             maximum=MAX_MAX_NEW_TOKENS,
+             step=1,
+             value=stats_default.config.max_new_tokens,
+         )
+         repetition_penalty = gr.Slider(
+             label="Repetition penalty",
+             minimum=0.1,
+             maximum=40.0,
+             step=0.1,
+             value=stats_default.config.repetition_penalty,
+         )
+         temperature = gr.Slider(
+             label="Temperature",
+             minimum=0.51,
+             maximum=40.0,
+             step=0.1,
+             value=stats_default.config.temperature,
+         )
+         top_p = gr.Slider(
+             label="Top-p (nucleus sampling)",
+             minimum=0.05,
+             maximum=1.0,
+             step=0.05,
+             value=stats_default.config.top_p,
+         )
+         top_k = gr.Slider(
+             label="Top-k",
+             minimum=0,
+             maximum=1000,
+             step=1,
+             value=stats_default.config.top_k,
+         )
+
+         def system_prompt_fn(system_prompt):
+             stats.value.system_prompt = system_prompt
+             logger.debug(f"{stats.value.system_prompt=}")
+
+         def max_new_tokens_fn(max_new_tokens):
+             stats.value.config.max_new_tokens = max_new_tokens
+             logger.debug(f"{stats.value.config.max_new_tokens=}")
+
+         def repetition_penalty_fn(repetition_penalty):
+             stats.value.config.repetition_penalty = repetition_penalty
+             logger.debug(f"{stats.value=}")
+
+         def temperature_fn(temperature):
+             stats.value.config.temperature = temperature
+             logger.debug(f"{stats.value=}")
+
+         def top_p_fn(top_p):
+             stats.value.config.top_p = top_p
+             logger.debug(f"{stats.value=}")
+
+         def top_k_fn(top_k):
+             stats.value.config.top_k = top_k
+             logger.debug(f"{stats.value=}")
+
+         system_prompt.change(system_prompt_fn, system_prompt)
+         max_new_tokens.change(max_new_tokens_fn, max_new_tokens)
+         repetition_penalty.change(repetition_penalty_fn, repetition_penalty)
+         temperature.change(temperature_fn, temperature)
+         top_p.change(top_p_fn, top_p)
+         top_k.change(top_k_fn, top_k)
+
+         def reset_fn(stats_):
+             logger.debug("reset_fn")
+             stats_ = gr.State(stats_default)
+             logger.debug(f"{stats_.value=}")
+             return (
+                 stats_,
+                 stats_default.system_prompt,
+                 stats_default.config.max_new_tokens,
+                 stats_default.config.repetition_penalty,
+                 stats_default.config.temperature,
+                 stats_default.config.top_p,
+                 stats_default.config.top_k,
+             )
+
+         reset_btn = gr.Button("Reset")
+         reset_btn.click(
+             reset_fn,
+             stats,
+             [
+                 stats,
+                 system_prompt,
+                 max_new_tokens,
+                 repetition_penalty,
+                 temperature,
+                 top_p,
+                 top_k,
+             ],
+         )
+
+     with gr.Accordion("Example inputs", open=True):
+         etext = """In America, where cars are an important part of the national psyche, a decade ago people had suddenly started to drive less, which had not happened since the oil shocks of the 1970s. """
+         examples = gr.Examples(
+             examples=example_list,
+             inputs=[msg],
+             examples_per_page=60,
+         )
+     with gr.Accordion("Disclaimer", open=False):
+         _ = model_name.lower()
+         gr.Markdown(
+             f"Disclaimer: {_} can produce factually incorrect output, and should not be relied on to produce "
+             f"factually accurate information. {_} was trained on various public datasets; while great efforts "
+             "have been taken to clean the pretraining data, it is possible that this model could generate lewd, "
+             "biased, or otherwise offensive outputs.",
+             elem_classes=["disclaimer"],
+         )
+
+     with gr.Accordion("For Chat/Translation API", open=False, visible=False):
+         input_text = gr.Text()
+         api_history = gr.Chatbot(value=[])
+         api_btn = gr.Button("Go", variant="primary")
+         out_text = gr.Text()
+
+     # api_fn args order
+     # input_text max_new_tokens temperature repetition_penalty top_k top_p system_prompt history
+     api_btn.click(
+         api_fn,
+         [
+             input_text,
+             max_new_tokens,
+             temperature,
+             repetition_penalty,
+             top_k,
+             top_p,
+             system_prompt,
+             api_history,  # don't know how to pass this in gradio_client.Client calls
+         ],
+         out_text,
+         api_name="api",
+     )
+
+
+ if __name__ == "__main__":
+     logger.info("Just record start time")
+     block.queue(max_size=8).launch(debug=True)
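
For reference, here is a condensed sketch of the load-and-chat flow that app.py wraps in the Gradio UI. It mirrors `gen_model()` and the `model.chat` signature quoted in the module docstring, and assumes a CUDA GPU with transformers==4.31.0 and bitsandbytes installed (this is a sketch, not part of the commit).

```python
# Minimal sketch of the same load-and-chat flow as gen_model()/bot() above.
# Assumes a CUDA GPU, transformers==4.31.0, and bitsandbytes for the 4-bit load.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig

model_name = "Qwen/Qwen-7B-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    load_in_4bit=True,                      # NF4 4-bit quantization via bitsandbytes
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
).eval()
model.generation_config = GenerationConfig.from_pretrained(
    model_name, trust_remote_code=True
)

# model.chat returns (response, updated_history); history is a list of (query, reply) pairs.
response, history = model.chat(tokenizer, "你好!", history=None)
print(response)
```

The Space's callbacks (`bot_stream`, `api_fn`) wrap essentially this call, updating `model.generation_config` from the slider values first.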
example_list.py ADDED
@@ -0,0 +1,56 @@
+ """Define example_list and css."""
+ # pylint: disable=invalid-name, line-too-long,
+ css = """
+ .importantButton {
+     background: linear-gradient(45deg, #7e0570,#5d1c99, #6e00ff) !important;
+     border: none !important;
+ }
+ .importantButton:hover {
+     background: linear-gradient(45deg, #ff00e0,#8500ff, #6e00ff) !important;
+     border: none !important;
+ }
+ .disclaimer {font-variant-caps: all-small-caps; font-size: xx-small;}
+ .xsmall {font-size: x-small;}
+ """
+
+ etext = """In America, where cars are an important part of the national psyche, a decade ago people had suddenly started to drive less, which had not happened since the oil shocks of the 1970s. """
+ example_list = [
+     ["What NFL team won the Super Bowl in the year Justin Bieber was born?"],
+     [
+         "What NFL team won the Super Bowl in the year Justin Bieber was born? Think step by step."
+     ],
+     ["How to pick a lock? Provide detailed steps."],
+     [
+         "If it takes 10 hours to dry 10 clothes, assuming all the clothes are hung together at the same time for drying, then how long will it take to dry a cloth?"
+     ],
+     [
+         "If it takes 10 hours to dry 10 clothes, assuming all the clothes are hung together at the same time for drying, then how long will it take to dry 23 clothes? Think step by step."
+     ],
+     ["is infinity + 1 bigger than infinity?"],
+     ["Explain the plot of Cinderella in a sentence."],
+     [
+         "How long does it take to become proficient in French, and what are the best methods for retaining information?"
+     ],
+     ["What are some common mistakes to avoid when writing code?"],
+     ["Build a prompt to generate a beautiful portrait of a horse"],
+     ["Suggest four metaphors to describe the benefits of AI"],
+     ["Write a pop song about leaving home for the sandy beaches."],
+     ["Write a summary demonstrating my ability to tame lions"],
+     ["鲁迅和周树人什么关系"],
+     ["从前有一头牛,这头牛后面有什么?"],
+     ["正无穷大加一大于正无穷大吗?"],
+     ["正无穷大加正无穷大大于正无穷大吗?"],
+     ["-2的平方根等于什么"],
+     ["树上有5只鸟,猎人开枪打死了一只。树上还有几只鸟?"],
+     ["树上有11只鸟,猎人开枪打死了一只。树上还有几只鸟?提示:需考虑鸟可能受惊吓飞走。"],
+     ["鲁迅和周树人什么关系 用英文回答"],
+     ["以红楼梦的行文风格写一张委婉的请假条。不少于320字。"],
+     [f"{etext} 翻成中文,列出3个版本"],
+     [f"{etext} \n 翻成中文,保留原意,但使用文学性的语言。不要写解释。列出3个版本"],
+     ["js 判断一个数是不是质数"],
+     ["js 实现python 的 range(10)"],
+     ["js 实现python 的 [*(range(10)]"],
+     ["假定 1 + 2 = 4, 试求 7 + 8"],
+     ["Erkläre die Handlung von Cinderella in einem Satz."],
+     ["Erkläre die Handlung von Cinderella in einem Satz. Auf Deutsch"],
+ ]
requirements.txt ADDED
@@ -0,0 +1,19 @@
+ transformers==4.31.0
+ accelerate
+ tiktoken
+ einops
+
+ # flash-attention
+ # git clone -b v1.0.8 https://github.com/Dao-AILab/flash-attention
+ # cd flash-attention && pip install .
+ # pip install csrc/layer_norm
+ # pip install csrc/rotary
+
+ torch  # 2.0.1
+ safetensors
+ bitsandbytes
+ transformers_stream_generator
+ scipy
+
+ loguru
+ about-time
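
An optional sanity check before launching app.py locally might look like the sketch below; it only uses the package names pinned in this requirements.txt and the same `torch.cuda.mem_get_info()` call app.py uses when computing `max_memory` (a sketch, not part of the Space).

```python
# Optional environment check before launching app.py; a sketch, not part of the Space.
import importlib.metadata as md

import torch

for pkg in ("transformers", "accelerate", "bitsandbytes", "tiktoken", "einops", "loguru"):
    try:
        print(f"{pkg}=={md.version(pkg)}")
    except md.PackageNotFoundError:
        print(f"{pkg} missing -- run `pip install -r requirements.txt`")

print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    free, total = torch.cuda.mem_get_info()  # same call app.py uses for max_memory
    print(f"free GPU memory: {free / 1024**3:.1f} / {total / 1024**3:.1f} GB")
```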