gregorlied committed · verified
Commit 78f48b0 · Parent(s): 58306e1

Update app.py

Files changed (1):
  1. app.py +78 -34
app.py CHANGED
@@ -3,10 +3,10 @@ import spaces
 import gradio as gr
 
 import torch
-from transformers import AutoTokenizer
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
 from huggingface_hub import login as hf_login
 
-from vllm import LLM
+import xgrammar as xgr
 from pydantic import BaseModel
 
 os.environ["VLLM_LOGGING_LEVEL"]="DEBUG"
@@ -16,35 +16,70 @@ hf_login(token=os.getenv("HF_TOKEN"))
 
 model_name = "meta-llama/Llama-3.2-1B-Instruct"
 
-model = LLM(
-    model=model_name,
-    dtype=torch.bfloat16,
-    trust_remote_code=True,
-    enforce_eager=True,
+model = AutoModelForCausalLM.from_pretrained(
+    model_name, torch_dtype=torch.float32, device_map="auto"
 )
 
-class Info(BaseModel):
-    name: str
-    age: int
-
-json_schema = Info.model_json_schema()
-guided_decoding_params = GuidedDecodingParams(json=json_schema)
-sampling_params = SamplingParams(
-    temperature=0.1,
-    max_tokens=2048,
-    guided_decoding=guided_decoding_params,
-)
-
-prompt = "You are a helpful assistant."
+class Person(BaseModel):
+    life_style: str
+    family_history: str
+    social_history: str
+    medical_surgical_history: str
+    signs_symptoms: str
+    comorbidities: str
+    diagnostic_techniques_procedures: str
+    diagnosis: str
+    laboratory_values: str
+    pathology: str
+    pharmacological_therapy: str
+    interventional_therapy: str
+    patient_outcome_assessment: str
+    age: str
+    gender: str
+
+config = AutoConfig.from_pretrained(model_name)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
 
-tokenizer = AutoTokenizer.from_pretrained(
-    model_name,
-    padding_side='right',
-    trust_remote_code=True,
+tokenizer_info = xgr.TokenizerInfo.from_huggingface(
+    tokenizer, vocab_size=config.vocab_size
 )
 
-if tokenizer.pad_token is None:
-    tokenizer.add_special_tokens({'pad_token': '<pad>'})
+grammar_compiler = xgr.GrammarCompiler(tokenizer_info)
+compiled_grammar = grammar_compiler.compile_json_schema(Person)
+xgr_logits_processor = xgr.contrib.hf.LogitsProcessor(compiled_grammar)
+
+prompt = """You are a text extraction system for clinical reports.
+Please extract relevant clinical information from the report.
+
+### Instructions
+
+- Use the JSON Schema given below.
+- Return only a valid JSON object – no markdown, no comments.
+- If no relevant facts are given for a field, set its value to "N/A".
+- If multiple relevant facts are given for a field, separate them with "; ".
+
+### JSON Schema
+
+{
+    'life_style': '',
+    'family_history': '',
+    'social_history': '',
+    'medical_surgical_history': '',
+    'signs_symptoms': '',
+    'comorbidities': '',
+    'diagnostic_techniques_procedures': '',
+    'diagnosis': '',
+    'laboratory_values': '',
+    'pathology': '',
+    'pharmacological_therapy': '',
+    'interventional_therapy': '',
+    'patient_outcome_assessment': '',
+    'age': '',
+    'gender': '',
+}
+
+### Clinical Report
+"""
 
 @spaces.GPU(duration=60)
 def summarize(text):
@@ -56,16 +91,25 @@ def summarize(text):
         {"role": "user", "content": text},
     ]
 
-    input_text = tokenizer.apply_chat_template(
-        messages,
-        tokenize=False,
-        add_generation_prompt=True,
-        enable_thinking=False,
+    model_inputs = tokenizer([text], return_tensors="pt").to('cuda')
+
+    generated_ids = model.generate(
+        input_ids=model_inputs["input_ids"],
+        attention_mask=model_inputs["attention_mask"],
+        # num_beams=8,
+        # top_p=0.9,
+        # do_sample=True,
+        # temperature=0.6,
+        max_new_tokens=2048,
+        logits_processor=[xgr_logits_processor]
     )
+
+    generated_ids = [
+        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+    ]
 
-    outputs = model.generate([input_text], sampling_params)
-    prediction = outputs[0].outputs[0].text
-    return prediction
+    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
+    return response[0]
 
 with gr.Blocks() as demo:
     gr.Markdown("## 📝 Summarization for News, SciTLDR and Dialog Texts")