kgupta21 committed on
Commit 746ae2b · 1 Parent(s): eb5c340

Local inference page: fix GPU handling for ZeroGPU, add accelerate for device mapping; remove the previous model-initialization code and clean up overall

Files changed (2)
  1. app.py +69 -98
  2. requirements.txt +6 -5
app.py CHANGED
@@ -21,59 +21,27 @@ logger = logging.getLogger(__name__)
 APP_VERSION = "1.0.0"
 logger.info(f"Starting Radiology Teaching App v{APP_VERSION}")
 
-# Global variables
-pipe = None
-llm = None
-tokenizer = None
-device = 0 if torch.cuda.is_available() else "cpu"
-logger.info(f"Using device: {device}")
-
-# Initialize Whisper
+# Model configuration
 MODEL_NAME = "openai/whisper-large-v3-turbo"
 BATCH_SIZE = 8
 FILE_LIMIT_MB = 5000
+MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+device = 0 if torch.cuda.is_available() else "cpu"
 
-try:
-    logger.info("Initializing Whisper model...")
-    pipe = pipeline(
-        task="automatic-speech-recognition",
-        model=MODEL_NAME,
-        chunk_length_s=30,
-        device=device,
-    )
-except Exception as e:
-    logger.error(f"Error initializing Whisper model: {str(e)}")
-    pipe = None
-
-# Initialize Llama
-try:
-    logger.info("Initializing Llama model...")
+# Initialize the LLM
+if torch.cuda.is_available():
 llm_model_id = "chuanli11/Llama-3.2-3B-Instruct-uncensored"
-
-    # Initialize tokenizer first
+llm = AutoModelForCausalLM.from_pretrained(llm_model_id, torch_dtype=torch.float16, device_map="auto")
 tokenizer = AutoTokenizer.from_pretrained(llm_model_id)
 tokenizer.use_default_system_prompt = False
-
-    # Initialize model with proper device mapping
-    if torch.cuda.is_available():
-        logger.info("Loading Llama model on GPU...")
-        llm = AutoModelForCausalLM.from_pretrained(
-            llm_model_id,
-            torch_dtype=torch.float16,
-            device_map="auto",
-            load_in_8bit=True  # Use 8-bit quantization to reduce memory usage
-        )
-    else:
-        logger.info("Loading Llama model on CPU...")
-        llm = AutoModelForCausalLM.from_pretrained(
-            llm_model_id,
-            device_map={"": "cpu"},
-            low_cpu_mem_usage=True
-        )
-except Exception as e:
-    logger.error(f"Error initializing Llama model: {str(e)}")
-    llm = None
-    tokenizer = None
+
+# Initialize the transcription pipeline
+pipe = pipeline(
+    task="automatic-speech-recognition",
+    model=MODEL_NAME,
+    chunk_length_s=30,
+    device=device,
+)
 
 try:
     # Load only 10 rows from the dataset
@@ -133,8 +101,6 @@ def transcribe(inputs, task="transcribe"):
     """Transcribe audio using Whisper"""
     if inputs is None:
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
-    if pipe is None:
-        raise gr.Error("Whisper model not initialized properly!")
 
     try:
         logger.info("Transcribing audio...")
@@ -151,61 +117,60 @@ def analyze_with_llama(
     ground_truth_impression: str,
     max_new_tokens: int = 1024,
     temperature: float = 0.6,
+    top_p: float = 0.9,
+    top_k: int = 50,
+    repetition_penalty: float = 1.2,
 ) -> Iterator[str]:
     """Analyze transcribed report against ground truth using Llama"""
-    global llm, tokenizer  # Add global declaration
-
-    if llm is None or tokenizer is None:
-        raise gr.Error("Llama model not initialized properly!")
-
-    try:
-        task_prompt = f"""You are an expert radiologist. Compare the following transcribed radiology report with the ground truth and provide detailed feedback.
+    task_prompt = f"""You are an expert radiologist. Compare the following transcribed radiology report with the ground truth and provide detailed feedback.
 
-Transcribed Report:
-{transcribed_text}
+Transcribed Report:
+{transcribed_text}
 
-Ground Truth Findings:
-{ground_truth_findings}
-
-Ground Truth Impression:
-{ground_truth_impression}
+Ground Truth Findings:
+{ground_truth_findings}
 
-Please analyze:
-1. Accuracy of findings
-2. Completeness of report
-3. Structure and clarity
-4. Areas for improvement
-
-Provide your analysis in a clear, structured format."""
+Ground Truth Impression:
+{ground_truth_impression}
 
-        conversation = [
-            {"role": "system", "content": "You are an expert radiologist providing detailed feedback."},
-            {"role": "user", "content": task_prompt}
-        ]
+Please analyze:
+1. Accuracy of findings
+2. Completeness of report
+3. Structure and clarity
+4. Areas for improvement
+
+Provide your analysis in a clear, structured format."""
 
-        input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
-        input_ids = input_ids.to(llm.device)
+    conversation = [
+        {"role": "system", "content": "You are an expert radiologist providing detailed feedback."},
+        {"role": "user", "content": task_prompt}
+    ]
 
-        streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
-        generate_kwargs = dict(
-            input_ids=input_ids,
-            streamer=streamer,
-            max_new_tokens=max_new_tokens,
-            do_sample=True,
-            temperature=temperature,
-            num_beams=1,
-        )
+    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
+    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+    input_ids = input_ids.to(llm.device)
 
-        t = Thread(target=llm.generate, kwargs=generate_kwargs)
-        t.start()
+    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+    generate_kwargs = dict(
+        {"input_ids": input_ids},
+        streamer=streamer,
+        max_new_tokens=max_new_tokens,
+        do_sample=True,
+        top_p=top_p,
+        top_k=top_k,
+        temperature=temperature,
+        num_beams=1,
+        repetition_penalty=repetition_penalty,
+    )
+    t = Thread(target=llm.generate, kwargs=generate_kwargs)
+    t.start()
 
-        outputs = []
-        for text in streamer:
-            outputs.append(text)
-            yield "".join(outputs)
-    except Exception as e:
-        logger.error(f"Error in Llama analysis: {str(e)}")
-        raise gr.Error(f"Analysis failed: {str(e)}")
+    outputs = []
+    for text in streamer:
+        outputs.append(text)
+        yield "".join(outputs)
 
 def load_random_case(hide_ground_truth):
     try:
@@ -279,14 +244,18 @@ with gr.Blocks() as demo:
 
             # Load case for comparison
            load_case_btn = gr.Button("Load Random Case for Comparison")
+            local_image_display = gr.Image(label="Chest X-ray Image", type="pil")
             local_ground_truth_findings = gr.Textbox(label="Ground Truth Findings", lines=5, interactive=False)
             local_ground_truth_impression = gr.Textbox(label="Ground Truth Impression", lines=5, interactive=False)
 
         with gr.Column():
             # Editable transcription and analysis interface
             edited_transcription = gr.Textbox(label="Edit Transcription", lines=10)
-            temperature_input = gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, value=0.6, step=0.1)
+            temperature_input = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, value=0.6, step=0.1)
+            top_p_input = gr.Slider(label="Top-p", minimum=0.05, maximum=1.0, value=0.9, step=0.05)
+            top_k_input = gr.Slider(label="Top-k", minimum=1, maximum=1000, value=50, step=1)
             max_tokens_input = gr.Slider(label="Max Tokens", minimum=256, maximum=2048, value=1024, step=128)
+            repetition_penalty_input = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, value=1.2, step=0.05)
             analyze_btn = gr.Button("Analyze with Llama")
             llama_analysis_output = gr.Textbox(label="Llama Analysis Output", lines=15, interactive=False)
 
@@ -305,12 +274,11 @@ with gr.Blocks() as demo:
         )
 
         # Load case for local analysis
-        local_image_display = gr.Image(label="Chest X-ray Image", type="pil")  # Add this line
         load_case_btn.click(
             fn=load_random_case,
             inputs=[gr.Checkbox(value=False, visible=False)],  # Hidden checkbox for hide_ground_truth
             outputs=[
-                local_image_display,  # Update this line
+                local_image_display,
                 local_ground_truth_findings,
                 local_ground_truth_impression,
                 gr.State(),  # Hidden state
@@ -326,7 +294,10 @@ with gr.Blocks() as demo:
                 local_ground_truth_findings,
                 local_ground_truth_impression,
                 max_tokens_input,
-                temperature_input
+                temperature_input,
+                top_p_input,
+                top_k_input,
+                repetition_penalty_input
             ],
             outputs=llama_analysis_output
         )
@@ -370,4 +341,4 @@ with gr.Blocks() as demo:
 )
 
 logger.info("Starting Gradio interface...")
-demo.launch()
+demo.queue().launch(ssr_mode=False)
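For readers unfamiliar with the ZeroGPU fix this commit refers to: on a ZeroGPU Space the GPU is attached only for the duration of a decorated call, so the usual pattern is to load models at import time with device_map="auto" and wrap GPU-bound entry points in @spaces.GPU from the spaces package (kept in requirements.txt below). A minimal sketch of that pattern, assuming a hypothetical analyze entry point rather than the app's actual functions:

import spaces  # the Hugging Face `spaces` package, required for ZeroGPU
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

llm_model_id = "chuanli11/Llama-3.2-3B-Instruct-uncensored"
tokenizer = AutoTokenizer.from_pretrained(llm_model_id)
# device_map="auto" lets accelerate decide weight placement; on ZeroGPU
# the weights are moved to the GPU only while a decorated call runs.
llm = AutoModelForCausalLM.from_pretrained(
    llm_model_id, torch_dtype=torch.float16, device_map="auto"
)

@spaces.GPU(duration=120)  # request a GPU for up to 120 s per call
def analyze(prompt: str) -> str:
    # hypothetical entry point; the real app streams tokens via TextIteratorStreamer
    inputs = tokenizer(prompt, return_tensors="pt").to(llm.device)
    output_ids = llm.generate(**inputs, max_new_tokens=256)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

The decorator is, as far as documented, a no-op outside Spaces, so the same file should run unchanged on a local GPU.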
requirements.txt CHANGED
@@ -1,10 +1,11 @@
-gradio>=4.16.0
+transformers
+gradio
+torch
+accelerate
+SentencePiece
 pandas>=2.0.0
 datasets>=2.15.0
 openai>=1.0.0
 Pillow>=10.0.0
 huggingface-hub>=0.20.0
-torch>=2.0.0
-transformers>=4.36.0
-spaces>=0.19.3
-accelerate>=0.27.0
+spaces>=0.19.3
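On the accelerate line the commit message mentions: device_map="auto" in from_pretrained is implemented by accelerate, so transformers refuses to load with a device_map if accelerate is missing. A quick sanity check, a sketch assuming only the packages listed above are installed:

import torch
from transformers import AutoModelForCausalLM

# Loading with device_map="auto" delegates weight placement (GPU, CPU,
# or disk offload) to accelerate; this call errors out if accelerate is absent.
model = AutoModelForCausalLM.from_pretrained(
    "chuanli11/Llama-3.2-3B-Instruct-uncensored",
    torch_dtype=torch.float16,
    device_map="auto",
)
print(model.hf_device_map)  # e.g. {'': 0} when the whole model fits on one GPU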