import os

import gradio as gr
import spaces
import torch
import xgrammar as xgr
from huggingface_hub import login as hf_login
from pydantic import BaseModel
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

# vLLM debug settings (only relevant if the model is served through vLLM).
os.environ["VLLM_LOGGING_LEVEL"] = "DEBUG"
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

hf_login(token=os.getenv("HF_TOKEN"))

model_name = "meta-llama/Llama-3.2-1B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
    device_map="auto",
)


class Person(BaseModel):
    """Structured fields extracted from a clinical report."""

    life_style: str
    family_history: str
    social_history: str
    medical_surgical_history: str
    signs_symptoms: str
    comorbidities: str
    diagnostic_techniques_procedures: str
    diagnosis: str
    laboratory_values: str
    pathology: str
    pharmacological_therapy: str
    interventional_therapy: str
    patient_outcome_assessment: str
    age: str
    gender: str


# Build an xgrammar logits processor that constrains generation to the
# JSON schema derived from the Person model.
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer_info = xgr.TokenizerInfo.from_huggingface(
    tokenizer, vocab_size=config.vocab_size
)
grammar_compiler = xgr.GrammarCompiler(tokenizer_info)
compiled_grammar = grammar_compiler.compile_json_schema(Person)
xgr_logits_processor = xgr.contrib.hf.LogitsProcessor(compiled_grammar)

prompt = """You are a text extraction system for clinical reports.
Please extract relevant clinical information from the report.

### Instructions
- Use the JSON Schema given below.
- Return only a valid JSON object – no markdown, no comments.
- If no relevant facts are given for a field, set its value to "N/A".
- If multiple relevant facts are given for a field, separate them with "; ".

### JSON Schema
{
    "life_style": "",
    "family_history": "",
    "social_history": "",
    "medical_surgical_history": "",
    "signs_symptoms": "",
    "comorbidities": "",
    "diagnostic_techniques_procedures": "",
    "diagnosis": "",
    "laboratory_values": "",
    "pathology": "",
    "pharmacological_therapy": "",
    "interventional_therapy": "",
    "patient_outcome_assessment": "",
    "age": "",
    "gender": ""
}

### Clinical Report
"""


@spaces.GPU(duration=60)
def summarize(text):
    """Extract structured clinical information from a report as a JSON string."""
    if not text.strip():
        return "Please enter a clinical report to process."

    messages = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": text},
    ]
    # Apply the chat template so the system prompt is actually part of the
    # model input (tokenizing the raw text alone would drop the instructions).
    model_inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    generated_ids = model.generate(
        input_ids=model_inputs["input_ids"],
        attention_mask=model_inputs["attention_mask"],
        # num_beams=8,
        # top_p=0.9,
        # do_sample=True,
        # temperature=0.6,
        max_new_tokens=2048,
        logits_processor=[xgr_logits_processor],
    )
    # Strip the prompt tokens so only the newly generated JSON remains.
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs["input_ids"], generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return response[0]


with gr.Blocks() as demo:
    gr.Markdown("## 📝 Clinical Report Information Extraction")
    with gr.Row():
        input_text = gr.Textbox(
            label="Clinical Report",
            autoscroll=False,
            lines=15,
            max_lines=15,
            placeholder="Paste the clinical report here...",
        )
        output_text = gr.Textbox(
            label="Extracted Information (JSON)",
            autoscroll=False,
            lines=15,
            max_lines=15,
            show_copy_button=True,
        )
    with gr.Row():
        summarize_btn = gr.Button("Extract")
    summarize_btn.click(
        fn=summarize,
        inputs=input_text,
        outputs=output_text,
        show_progress=True,
    )

if __name__ == "__main__":
    demo.launch()
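
# --- Optional: validating the output (sketch) --------------------------------
# Because generation is constrained by the xgrammar JSON-schema grammar, the
# returned string should parse directly into the Person model. A minimal,
# untested sketch (``report_text`` is a hypothetical clinical report string):
#
#     result = Person.model_validate_json(summarize(report_text))
#     print(result.diagnosis)
#
# Note that the empty-input guard above returns a plain message rather than
# JSON, so downstream parsing should only be attempted for non-empty inputs.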