Spaces:

tahirsher
/

ASR_Model_for_Transcription_into_Text

Sleeping

App Files Files Community

tahirsher committed on Mar 9

Commit

8d55ac9

verified ·

1 Parent(s): 9b5528f

Update app.py

Browse files

Files changed (1) hide show

app.py +5 -26

app.py CHANGED Viewed

@@ -14,24 +14,19 @@ from transformers import (
 )
 # ================================
-# 1️⃣ Authenticate with Hugging Face Hub
 # ================================
-# Get HF token securely from environment variables
-HF_TOKEN = os.getenv("hf_token")
 if HF_TOKEN is None:
     raise ValueError("❌ Hugging Face API token not found. Please set it in Secrets.")
-# Login using the stored token
 login(token=HF_TOKEN)
 # ================================
 # 2️⃣ Load Model & Processor
 # ================================
 MODEL_NAME = "AqeelShafy7/AudioSangraha-Audio_to_Text"
-# Load ASR model and processor
 processor = AutoProcessor.from_pretrained(MODEL_NAME)
 model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_NAME)
@@ -163,9 +158,11 @@ if audio_file:
     # Convert audio to model input
     input_features = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_features[0]
     # Perform ASR inference
     with torch.no_grad():
-        input_tensor = torch.tensor([input_features]).to(device)
         logits = model(input_tensor).logits
         predicted_ids = torch.argmax(logits, dim=-1)
         transcription = processor.batch_decode(predicted_ids)[0]
@@ -173,21 +170,3 @@ if audio_file:
     # Display transcription
     st.success("📄 Transcription:")
     st.write(transcription)
-    # ================================
-    # 8️⃣ Fine-Tune Model with User Correction
-    # ================================
-    user_correction = st.text_area("🔧 Correct the transcription (if needed):", transcription)
-    if st.button("Fine-Tune with Correction"):
-        if user_correction:
-            corrected_input = processor.tokenizer(user_correction).input_ids
-            # Dynamically add new example to dataset
-            dataset.append({"input_features": input_features, "labels": corrected_input})
-            # Perform quick re-training (1 epoch)
-            trainer.args.num_train_epochs = 1
-            trainer.train()
-            st.success("✅ Model fine-tuned with new correction! Try another audio file.")

 )
 # ================================
+# 1️⃣ Authenticate with Hugging Face Hub (Securely)
 # ================================
+HF_TOKEN = os.getenv("hf_token")  # Ensure it's set in Hugging Face Spaces Secrets
 if HF_TOKEN is None:
     raise ValueError("❌ Hugging Face API token not found. Please set it in Secrets.")
 login(token=HF_TOKEN)
 # ================================
 # 2️⃣ Load Model & Processor
 # ================================
 MODEL_NAME = "AqeelShafy7/AudioSangraha-Audio_to_Text"
 processor = AutoProcessor.from_pretrained(MODEL_NAME)
 model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_NAME)
     # Convert audio to model input
     input_features = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_features[0]
+    # ✅ FIX: Ensure input tensor is correctly formatted
+    input_tensor = input_features.unsqueeze(0).to(device)  # Adds batch dimension
     # Perform ASR inference
     with torch.no_grad():
         logits = model(input_tensor).logits
         predicted_ids = torch.argmax(logits, dim=-1)
         transcription = processor.batch_decode(predicted_ids)[0]
     # Display transcription
     st.success("📄 Transcription:")
     st.write(transcription)