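# Gradio demo for lang-uk/dragoman, an English-to-Ukrainian translation model:
# a Mistral-7B base with a LoRA adapter, loaded in 4-bit NF4 quantization.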
import gradio as gr

from peft import PeftModel
from transformers import MistralForCausalLM, TextIteratorStreamer, AutoTokenizer, BitsAndBytesConfig
from threading import Thread
from torch import float16
import spaces

# 4-bit NF4 quantization keeps the 7B base model within a small GPU memory budget.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=float16,
    bnb_4bit_use_double_quant=False,
)

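# Load the quantized base model, then attach the Dragoman LoRA adapter on top.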
model = MistralForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    quantization_config=quant_config,
)
model = PeftModel.from_pretrained(model, "lang-uk/dragoman")
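# Slow tokenizer without an automatic BOS token; the [INST] prompt is built manually below.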
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", use_fast=False, add_bos_token=False)

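# Request a short-lived ZeroGPU allocation for each call when hosted on Hugging Face Spaces.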
@spaces.GPU(duration=20)
def translate(input_text):
    # Wrap the input in Mistral's instruction format and stream the translation as it is generated.
    input_text = input_text.strip()
    input_text = f"[INST] {input_text} [/INST]"
    inputs = tokenizer([input_text], return_tensors="pt").to(model.device)

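    # The streamer yields decoded tokens as generate() produces them in a background thread.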
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=200)

    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

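    # Accumulate streamed chunks and yield the growing translation so the UI updates live.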
    generated_text = ""
    for new_text in streamer:
        generated_text += new_text
        yield generated_text


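# Minimal Gradio UI: text in, streamed translation out.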
iface = gr.Interface(
    fn=translate,
    inputs="text",
    outputs="text",
    examples=[["who holds this neighborhood?"]],
)
iface.launch()