thunder-007 committed · a440325 · 1 Parent(s): 13679ee

whisper audio question input

.gitignore CHANGED
@@ -1,3 +1,4 @@
 .idea
 .env
 /venv/
+/rough.py
app.py CHANGED
@@ -14,6 +14,8 @@ from monai.transforms import (
     Orientationd,
     EnsureChannelFirstd,
 )
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
+import librosa
 import torch
 
 title = 'Detect and Segment Brain Tumors 🧠'
@@ -36,6 +38,9 @@ preproc_transforms = Compose(
 )
 post_trans = Compose([Activations(sigmoid=True), AsDiscrete(threshold=0.5)])
 
+processor_whisper = WhisperProcessor.from_pretrained("whisper-tiny")
+model_whisper = WhisperForConditionalGeneration.from_pretrained("whisper-tiny")
+
 model_tumor_seg = SegResNet(
     blocks_down=[1, 2, 2, 4],
     blocks_up=[1, 1, 1],
@@ -62,13 +67,31 @@ def inference(input):
 
 
 examples = [
-    ['examples/BRATS_225.nii.gz', 83, 2],
-    ['examples/BRATS_485.nii.gz', 90, 1],
-    ['examples/BRATS_485.nii.gz', 110, 0]
+    ['examples/BRATS_225.nii.gz', 83, 2, 'english', 'examples/sample1_en.mp3'],
+    ['examples/BRATS_485.nii.gz', 90, 1, 'japanese', 'examples/sample2_jp.mp3'],
+    ['examples/BRATS_485.nii.gz', 110, 0, 'german', 'examples/sample3_gr.mp3'],
 ]
 
 
-def detector(tumor_file, slice_number, channel, audio_question):
+def process_audio(sampling_rate, waveform):
+    waveform = waveform / 32678.0
+    if len(waveform.shape) > 1:
+        waveform = librosa.to_mono(waveform.T)
+    if sampling_rate != 16000:
+        waveform = librosa.resample(waveform, orig_sr=sampling_rate, target_sr=16000)
+    waveform = waveform[:16000 * 30]
+    waveform = torch.tensor(waveform)
+    return waveform
+
+
+def detector(tumor_file, slice_number, channel, language, audio_question):
+    sampling_rate, waveform = audio_question
+    forced_decoder_ids = processor_whisper.get_decoder_prompt_ids(language=language, task="transcribe")
+    waveform = process_audio(sampling_rate, waveform)
+    audio_inputs = processor_whisper(audio=waveform, sampling_rate=16000, return_tensors="pt")
+    predicted_ids = model_whisper.generate(**audio_inputs, max_length=400, forced_decoder_ids=forced_decoder_ids)
+    transcription = processor_whisper.batch_decode(predicted_ids, skip_special_tokens=True)
+    output_text = transcription[0]
     tumor_file_path = tumor_file.name
     processed_data = preproc_transforms({'image': [tumor_file_path]})
     tensor_3d_input = processed_data['image'].unsqueeze(0).to('cpu')
@@ -87,18 +110,19 @@ def detector(tumor_file, slice_number, channel, audio_question):
     plt.savefig(output_image_path, bbox_inches='tight', pad_inches=0)
     segment_image = np.asarray(Image.open(output_image_path))
     os.remove(output_image_path)
-    return (channel_image, segment_image, "Question answer")
+    return (channel_image, segment_image, output_text)
 
 
 interface = gr.Interface(fn=detector, inputs=[gr.File(label="Tumor File"),
                                               gr.Slider(0, 200, 50, step=1, label="Slice Number"),
-                                              gr.Radio([0, 1, 2], label="Channel"),
+                                              gr.Radio((0, 1, 2), label="Channel"),
+                                              gr.Radio(("english", "japanese", "german", "spanish"), label="Language"),
                                               gr.Audio(source="microphone"), ],
                          outputs=[gr.Image(label='channel', shape=(1, 1)),
                                   gr.Image(label='Segmented Tumor', shape=(1, 1)),
                                   gr.Textbox(label="Medical Summary")], title=title,
                          examples=examples,
-                         description=description, outputs_layout="row", theme='dark')
+                         description=description, theme='dark')
 
 theme = gr.themes.Default().set(
     button_primary_background_fill="#FF0000",
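
The speech path added here follows the stock transformers Whisper recipe: a WhisperProcessor turns 16 kHz mono float audio into log-mel input features, WhisperForConditionalGeneration generates token ids constrained by forced decoder ids for the chosen language, and batch_decode yields the transcript. The standalone sketch below shows that same flow under a couple of assumptions: it loads the full Hub id "openai/whisper-tiny" (the commit loads the bare name "whisper-tiny", which only resolves if a matching local folder or Hub repo exists), it normalizes int16 samples by 32768, the int16 full scale, where process_audio divides by 32678, and the transcribe helper name is illustrative, not from app.py.

# Minimal sketch of the Whisper transcription path used by detector(),
# assuming the Hub id "openai/whisper-tiny" and int16 audio from gr.Audio.
import numpy as np
import torch
import librosa
from transformers import WhisperProcessor, WhisperForConditionalGeneration

processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

def transcribe(sampling_rate: int, waveform: np.ndarray, language: str = "english") -> str:
    # gr.Audio(source="microphone") yields a (sample_rate, int16 ndarray) tuple
    waveform = waveform.astype(np.float32) / 32768.0      # int16 full scale -> [-1, 1)
    if waveform.ndim > 1:                                  # stereo -> mono
        waveform = librosa.to_mono(waveform.T)
    if sampling_rate != 16000:                             # Whisper expects 16 kHz
        waveform = librosa.resample(waveform, orig_sr=sampling_rate, target_sr=16000)
    waveform = waveform[:16000 * 30]                       # Whisper's 30 s window

    inputs = processor(audio=waveform, sampling_rate=16000, return_tensors="pt")
    prompt_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe")
    predicted_ids = model.generate(inputs.input_features, forced_decoder_ids=prompt_ids)
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]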
examples/sample1_en.mp3 ADDED
Binary file (23.5 kB)
 
examples/sample2_jp.mp3 ADDED
Binary file (30.4 kB)
 
examples/sample3_gr.mp3 ADDED
Binary file (19.8 kB)
 
examples/sample4_sp.mp3 ADDED
Binary file (22.2 kB)
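
The four clips above back the expanded examples rows: in gr.Interface, each example list supplies one value per input component, in order, so every row now needs a scan path, slice, channel, language, and an audio file. Below is a minimal sketch of that five-input wiring, with a hypothetical stub handler answer in place of the real detector, assuming Gradio 3.x (where gr.Audio still accepts source="microphone").

# Sketch only: "answer" is a stand-in for detector(); paths assume the repo's examples/ folder.
import gradio as gr

def answer(tumor_file, slice_number, channel, language, audio_question):
    return f"{language}: received slice {slice_number}, channel {channel}"

demo = gr.Interface(
    fn=answer,
    inputs=[gr.File(label="Tumor File"),
            gr.Slider(0, 200, 50, step=1, label="Slice Number"),
            gr.Radio((0, 1, 2), label="Channel"),
            gr.Radio(("english", "japanese", "german", "spanish"), label="Language"),
            gr.Audio(source="microphone")],
    outputs=gr.Textbox(label="Medical Summary"),
    examples=[['examples/BRATS_225.nii.gz', 83, 2, 'english', 'examples/sample1_en.mp3']],
)

if __name__ == "__main__":
    demo.launch()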
 
requirements.txt CHANGED
@@ -4,4 +4,5 @@ torchvision
 torchaudio
 nibabel
 monai
-matplotlib
+matplotlib
+librosa
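
librosa enters the requirements for the mono mixing and 16 kHz resampling done in process_audio. One related detail: the division by roughly 32768 there implies the microphone component hands back int16 PCM, whose full scale is 2^15 = 32768, and the committed constant 32678 looks like a digit transposition of that value. A tiny NumPy check of the normalization, as an illustration only:

import numpy as np

# int16 samples span [-32768, 32767]; dividing by 2**15 maps them into [-1.0, 1.0).
pcm = np.array([-32768, 0, 32767], dtype=np.int16)
normalized = pcm.astype(np.float32) / 32768.0
print(normalized)  # approximately [-1.0, 0.0, 0.99997]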