Tonic committed
Commit d75506d · 1 Parent(s): eda6426

Update app.py

Files changed (1): app.py +64 -71
app.py CHANGED
@@ -1,99 +1,92 @@
 
  import transformers
- from transformers import AutoTokenizer, MistralForCausalLM
- from transformers import AutoConfig, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, MistralForCausalLM
  import torch
  import gradio as gr
- import random
- from textwrap import wrap
- from peft import PeftModel, PeftConfig
- import torch
- import gradio as gr
-
- # Functions to Wrap the Prompt Correctly
- def wrap_text(text, width=90):
-     lines = text.split('\n')
-     wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
-     wrapped_text = '\n'.join(wrapped_lines)
-     return wrapped_text
-
- def multimodal_prompt(user_input, system_prompt="You are an expert medical analyst:"):
-     # Combine user input and system prompt
-     formatted_input = f"{system_prompt} {user_input}"

-     # Encode the input text
-     encodeds = tokenizer(formatted_input, return_tensors="pt", add_special_tokens=False)
-     model_inputs = encodeds.to(device)

-     # Generate a response using the model
-     output = model.generate(
-         **model_inputs,
-         max_length=max_length,
-         use_cache=True,
-         early_stopping=True,
-         bos_token_id=model.config.bos_token_id,
-         eos_token_id=model.config.eos_token_id,
-         pad_token_id=model.config.eos_token_id,
-         temperature=0.1,
-         do_sample=True
-     )

-     response_text = tokenizer.decode(output[0], skip_special_tokens=True)

-     return response_text

  device = "cuda" if torch.cuda.is_available() else "cpu"
- model_name = "OpenLLM-France/Claire-Mistral-7B-0.1"

  tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
  model = transformers.AutoModelForCausalLM.from_pretrained(model_name,
      device_map="auto",
      torch_dtype=torch.bfloat16,
-     load_in_4bit=True # For efficient inference, if supported by the GPU card
  )
-
- class ChatBot:
-     def __init__(self):
-         self.history = []
-
-     def predict(self, user_input, system_prompt):
-         # Combine user input and system prompt
-         formatted_input = f"{system_prompt} {user_input}"
-
-         # Encode user input
-         user_input_ids = tokenizer.encode(formatted_input, return_tensors="pt")
-
-         # Concatenate the user input with chat history
-         if len(self.history) > 0:
-             chat_history_ids = torch.cat([self.history, user_input_ids], dim=-1)
-         else:
-             chat_history_ids = user_input_ids
-
-         # Generate a response using the PEFT model
-         response = model.generate(input_ids=chat_history_ids, max_length=200, pad_token_id=tokenizer.eos_token_id)
-
-         # Update chat history
-         self.history = chat_history_ids
-
-         # Decode and return the response
          response_text = tokenizer.decode(response[0], skip_special_tokens=True)
          return response_text

- bot = ChatBot()

- title = "👋🏻Welcome to Tonic's Claire Chat🚀"
- description = "You can use this Space to test out the current model ([ClaireLLM](https://huggingface.co/OpenLLM-France/Claire-Mistral-7B-0.1)) or duplicate this Space and use it for any other model on 🤗HuggingFace. Join me on [Discord to build together](https://discord.gg/nXx5wbX9)."
- examples = [["[Estragon :] On va voir. Tiens. Ils prennent chacun un bout de la corde et tirent. La corde se casse. Ils manquent de tomber.", "[Vladimir] Fais voir quand même. (Estragon dénoue la corde qui maintient son pantalon.Celui-ci, beaucoup trop large, lui tombe autour des chevilles. Ils regardent la corde.) À la rigueur ça pourrait aller. Mais est-elle solide ?"]]
  iface = gr.Interface(
-     fn=bot.predict,
      title=title,
      description=description,
      examples=examples,
      inputs=[
-         gr.Textbox(label="Deuxieme partie d'un dialogue"),
-         gr.Textbox(label="Premiere partie d'un dialogue")
      ],
-     outputs=gr.outputs.Textbox(label="Claire LLM Dialogue"),
      theme="ParityError/Anime"
  )

- iface.launch()
+ import optimum
  import transformers
+ from transformers import AutoConfig, AutoTokenizer, AutoModel, AutoModelForCausalLM
+ from optimum.bettertransformer import BetterTransformer
  import torch
  import gradio as gr
+ import json
+ import os
+ import shutil
+ import requests


+ title = "👋🏻Welcome to Tonic's 💫🌠Starling 7B"
+ description = "You can use [💫🌠Starling 7B](https://huggingface.co/berkeley-nest/Starling-RM-7B-alpha) or duplicate it for local use or on Hugging Face! [Join me on Discord to build together](https://discord.gg/VqTxc76K3u)."
+ examples = [
+     [
+         "The following dialogue is a conversation between Emmanuel Macron and Elon Musk:",  # user_message
+         "[Emmanuel Macron]: Hello Mr. Musk. Thank you for receiving me today.",  # assistant_message
+         0.9,  # temperature
+         450,  # max_new_tokens
+         0.90,  # top_p
+         1.9,  # repetition_penalty
+     ]
+ ]

+ model_name = "berkeley-nest/Starling-RM-7B-alpha"

  device = "cuda" if torch.cuda.is_available() else "cpu"
+ temperature=0.4
+ max_new_tokens=240
+ top_p=0.92
+ repetition_penalty=1.7
+

  tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
  model = transformers.AutoModelForCausalLM.from_pretrained(model_name,
      device_map="auto",
      torch_dtype=torch.bfloat16,
+     load_in_4bit=True
  )
+ model = BetterTransformer.transform(model)
+
+ class StarlingBot:
+     def __init__(self, system_prompt="The following dialogue is a conversation"):
+         self.system_prompt = system_prompt
+
+     def predict(self, user_message, assistant_message, system_prompt, do_sample, temperature=0.4, max_new_tokens=700, top_p=0.99, repetition_penalty=1.9):
+         conversation = f" <s> [INST] {self.system_prompt} [INST] {assistant_message if assistant_message else ''} </s> [/INST] {user_message} </s> "
+         # Encode the conversation using the tokenizer
+         input_ids = tokenizer.encode(conversation, return_tensors="pt", add_special_tokens=False)
+         input_ids = input_ids.to(device)
+         response = model.generate(
+             input_ids=input_ids,
+             use_cache=False,
+             early_stopping=False,
+             bos_token_id=model.config.bos_token_id,
+             eos_token_id=model.config.eos_token_id,
+             pad_token_id=model.config.eos_token_id,
+             temperature=temperature,
+             do_sample=True,
+             max_new_tokens=max_new_tokens,
+             top_p=top_p,
+             repetition_penalty=repetition_penalty
+         )
          response_text = tokenizer.decode(response[0], skip_special_tokens=True)
          return response_text

+ # Create the Falcon chatbot instance
+ StarlingBot_bot = StarlingBot()
+
+ starling_bot = StarlingBot()  # Renamed for consistency

  iface = gr.Interface(
+     fn=starling_bot.predict,  # Corrected to match the instance name
      title=title,
      description=description,
      examples=examples,
      inputs=[
+         gr.Textbox(label="User Message", type="text", lines=5),
+         gr.Textbox(label="💫🌠Starling Assistant Message or Instructions ", lines=2),
+         gr.Textbox(label="💫🌠Starling System Prompt or Instruction", lines=2),
+         gr.Checkbox(label="Advanced", value=False),  # Ensure this is connected to functionality
+         gr.Slider(label="Temperature", value=0.7, minimum=0.05, maximum=1.0, step=0.05),
+         gr.Slider(label="Max new tokens", value=100, minimum=25, maximum=256, step=1),
+         gr.Slider(label="Top-p (nucleus sampling)", value=0.90, minimum=0.01, maximum=0.99, step=0.05),
+         gr.Slider(label="Repetition penalty", value=1.9, minimum=1.0, maximum=2.0, step=0.05)
      ],
+     outputs="text",
      theme="ParityError/Anime"
  )
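
After this commit, the Space's generation path is StarlingBot.predict wired into gr.Interface. For a quick local smoke test outside the Gradio UI, the method can be called directly with the bundled example values. The snippet below is an illustrative sketch, not part of the commit; it assumes app.py is importable as a module and that downloading and loading the 4-bit model succeeds on the local hardware.

# Hypothetical local smoke test; importing app loads the tokenizer and model.
from app import starling_bot

reply = starling_bot.predict(
    user_message="The following dialogue is a conversation between Emmanuel Macron and Elon Musk:",
    assistant_message="[Emmanuel Macron]: Hello Mr. Musk. Thank you for receiving me today.",
    system_prompt="The following dialogue is a conversation",
    do_sample=True,  # accepted by the signature, though generate() always samples as committed
    temperature=0.9,
    max_new_tokens=450,
    top_p=0.90,
    repetition_penalty=1.9,
)
print(reply)

Note that, as committed, predict uses self.system_prompt rather than the system_prompt argument, so the third textbox in the interface does not change the prompt that is actually generated from.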