Spaces:

Gigaverse
/

ivrit-ai-streaming

Sleeping

File size: 3,764 Bytes

import logging
import os
import tempfile
from datetime import datetime

import soundfile as sf
import torch
import uvicorn
from fastapi import FastAPI, File, UploadFile
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Set up logging: every record goes to transcription_log.log with a timestamp
# and level prefix.
logging.basicConfig(
    filename="transcription_log.log",
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO
)

# Initialize FastAPI app
app = FastAPI()

# NOTE: this runs at import time, i.e. when the module is loaded, not when the
# server starts accepting requests.
logging.info("FastAPI application started.")

# Load the Whisper model and processor once at import so every request
# reuses the same in-memory weights.
model_name = "openai/whisper-base"
logging.info(f"Loading Whisper model: {model_name}")

try:
    processor = WhisperProcessor.from_pretrained(model_name)
    model = WhisperForConditionalGeneration.from_pretrained(model_name)
except Exception as e:
    # logging.exception records the traceback alongside the message; the bare
    # `raise` re-raises the original exception so the import still fails.
    logging.exception(f"Error loading the model: {e}")
    raise
else:
    logging.info(f"Model {model_name} successfully loaded.")

# Move model to the appropriate device (GPU if available)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
logging.info(f"Model is using device: {device}")


# Sampling rate handed to the Whisper processor for every request.
_TARGET_SAMPLE_RATE = 16000


def _remove_temp_file(path):
    """Delete the temporary upload at *path*; failures are logged, not raised."""
    if not path:
        return
    try:
        os.remove(path)
        logging.info(f"Temporary file {path} deleted.")
    except OSError as e:
        logging.error(f"Error deleting the temporary file: {e}")


def _to_mono_16k(audio, sample_rate):
    """Return *audio* as a 1-D waveform sampled at ``_TARGET_SAMPLE_RATE``.

    Args:
        audio: NumPy array as returned by ``soundfile.read`` -- 1-D for mono,
            ``(frames, channels)`` for multi-channel files.
        sample_rate: Sampling rate of *audio* in Hz.

    Returns:
        A 1-D array. Multi-channel input is averaged across channels; input at
        a different rate is resampled.
    """
    if audio.ndim > 1:
        audio = audio.mean(axis=1)
    if sample_rate == _TARGET_SAMPLE_RATE or audio.shape[0] == 0:
        return audio

    # Dependency-free resampling using linear interpolation. There is no
    # anti-aliasing filter, so quality is below a dedicated resampler, but it
    # keeps the time axis correct instead of mislabelling the rate.
    target_len = max(1, round(audio.shape[0] * _TARGET_SAMPLE_RATE / sample_rate))
    waveform = torch.as_tensor(audio, dtype=torch.float32).reshape(1, 1, -1)
    resampled = torch.nn.functional.interpolate(
        waveform, size=target_len, mode="linear", align_corners=False
    )
    return resampled.reshape(-1).numpy()


@app.post("/transcribe/")
async def transcribe_audio(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file with the module-level Whisper model.

    The upload is written to a uniquely named temporary file, decoded with
    soundfile, downmixed to mono and resampled to 16 kHz if needed, then run
    through the processor and ``model.generate``. The temporary file is
    removed as soon as decoding has been attempted, on success or failure.

    Args:
        file: The uploaded audio file.

    Returns:
        On success, ``{"transcription": str, "processing_time_seconds": float}``.
        On failure at any stage, ``{"error": str}`` describing that stage.
    """
    logging.info(f"Received audio file: {file.filename}")
    start_time = datetime.now()

    # Only the extension of the client-supplied name is reused. The rest of the
    # path is generated by tempfile, so a hostile filename cannot escape the
    # temp directory and concurrent uploads with the same name cannot collide.
    suffix = os.path.splitext(os.path.basename(file.filename or ""))[1]
    file_location = None
    try:
        with tempfile.NamedTemporaryFile(
            prefix="temp_", suffix=suffix, delete=False
        ) as f:
            file_location = f.name
            f.write(await file.read())
        logging.info(f"File saved to: {file_location}")
    except Exception as e:
        logging.exception(f"Error saving the file: {e}")
        _remove_temp_file(file_location)
        return {"error": f"Error saving the file: {e}"}

    # Load the audio file and preprocess it
    try:
        try:
            audio_input, sample_rate = sf.read(file_location)
        finally:
            # The file is not needed after decoding; removing it here covers
            # every later exit path.
            _remove_temp_file(file_location)
        logging.info(f"Audio file {file.filename} successfully read.")

        audio_input = _to_mono_16k(audio_input, sample_rate)
        inputs = processor(
            audio_input, return_tensors="pt", sampling_rate=_TARGET_SAMPLE_RATE
        )
        logging.info("Audio file preprocessed for transcription.")
    except Exception as e:
        logging.exception(f"Error processing the audio file: {e}")
        return {"error": f"Error processing the audio file: {e}"}

    # Generate the transcription. The device transfer is inside the try so a
    # failure there is reported like any other generation error.
    try:
        inputs = {key: value.to(device) for key, value in inputs.items()}
        logging.info("Inputs moved to the appropriate device.")

        with torch.no_grad():
            predicted_ids = model.generate(inputs["input_features"])
        logging.info("Transcription successfully generated.")
    except Exception as e:
        logging.exception(f"Error during transcription generation: {e}")
        return {"error": f"Error during transcription generation: {e}"}

    # Decode the transcription
    try:
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        logging.info("Transcription successfully decoded.")
    except Exception as e:
        logging.exception(f"Error decoding the transcription: {e}")
        return {"error": f"Error decoding the transcription: {e}"}

    time_taken = datetime.now() - start_time
    logging.info(f"Transcription completed in {time_taken.total_seconds()} seconds.")

    return {"transcription": transcription, "processing_time_seconds": time_taken.total_seconds()}


if __name__ == "__main__":
    # Script entry point: serve the app with Uvicorn on all interfaces,
    # port 7860.
    bind_host = "0.0.0.0"
    bind_port = 7860

    logging.info("Starting FastAPI server with Uvicorn...")
    uvicorn.run(app, host=bind_host, port=bind_port)