bluenevus committed on
Commit
a80c887
·
verified ·
1 Parent(s): ffd5e97

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -29
app.py CHANGED
@@ -70,38 +70,30 @@ def download_audio_from_url(url):
70
  def transcribe_audio(audio_file):
71
  try:
72
  logger.info("Loading audio file...")
73
- audio = AudioSegment.from_file(audio_file)
74
- audio = audio.set_channels(1).set_frame_rate(16000)
75
- audio_array = torch.tensor(audio.get_array_of_samples()).float()
 
 
 
 
76
 
77
- logger.info(f"Audio duration: {len(audio) / 1000:.2f} seconds")
78
  logger.info("Starting transcription...")
79
- input_features = whisper_processor(audio_array, sampling_rate=16000, return_tensors="pt").input_features.to(device)
80
-
81
- # Create attention mask
82
- attention_mask = torch.ones_like(input_features)
83
-
84
- max_retries = 3
85
- for attempt in range(max_retries):
86
- # Generate with specific parameters
87
- predicted_ids = whisper_model.generate(
88
- input_features,
89
- attention_mask=attention_mask,
90
- language='en',
91
- task='translate',
92
- temperature=0.7, # Adjust temperature for potentially better results
93
- num_beams=5, # Increase beam search for potentially better results
94
- max_length=448, # Increase max length to allow for longer transcriptions
95
- )
96
  transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)
97
-
98
- logger.info(f"Transcription attempt {attempt + 1} complete. Length: {len(transcription[0])} characters")
99
- if len(transcription[0]) >= 10:
100
- return transcription[0]
101
- else:
102
- logger.warning(f"Transcription too short on attempt {attempt + 1}: {transcription[0]}")
103
-
104
- raise ValueError(f"Failed to generate a valid transcription after {max_retries} attempts")
 
 
105
  except Exception as e:
106
  logger.error(f"Error in transcribe_audio: {str(e)}")
107
  raise
 
70
  def transcribe_audio(audio_file):
71
  try:
72
  logger.info("Loading audio file...")
73
+ audio_input, sr = librosa.load(audio_file, sr=16000)
74
+ audio_input = audio_input.astype(np.float32)
75
+ logger.info(f"Audio duration: {len(audio_input) / sr:.2f} seconds")
76
+
77
+ chunk_length = 30 * sr
78
+ overlap = 5 * sr
79
+ transcriptions = []
80
 
 
81
  logger.info("Starting transcription...")
82
+ for i in range(0, len(audio_input), chunk_length - overlap):
83
+ chunk = audio_input[i:i+chunk_length]
84
+ input_features = whisper_processor(chunk, sampling_rate=16000, return_tensors="pt").input_features.to(device)
85
+ predicted_ids = whisper_model.generate(input_features)
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)
87
+ transcriptions.extend(transcription)
88
+ logger.info(f"Processed {i / sr:.2f} to {(i + chunk_length) / sr:.2f} seconds")
89
+
90
+ full_transcription = " ".join(transcriptions)
91
+ logger.info(f"Transcription complete. Full transcription length: {len(full_transcription)} characters")
92
+
93
+ logger.info("Applying speaker separation using Qwen...")
94
+ separated_transcript = separate_speakers(full_transcription)
95
+
96
+ return separated_transcript
97
  except Exception as e:
98
  logger.error(f"Error in transcribe_audio: {str(e)}")
99
  raise