File size: 5,064 Bytes
d41c42c
 
 
 
 
 
 
27999b6
 
 
 
 
 
 
 
d41c42c
 
 
 
 
27999b6
 
 
d41c42c
 
 
 
 
 
 
 
 
27999b6
 
 
d41c42c
 
 
 
 
 
 
 
 
 
 
 
 
27999b6
 
d41c42c
27999b6
d41c42c
 
 
 
 
 
 
 
 
27999b6
 
d41c42c
27999b6
d41c42c
 
 
 
 
 
 
 
 
27999b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import streamlit as st 
import soundfile as sf
import librosa
from transformers import HubertForCTC, Wav2Vec2Processor , pipeline , Wav2Vec2ForCTC , Wav2Vec2Tokenizer
import torch
import spacy 
from spacy import displacy
import en_core_web_sm
import spacy.cli
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import nltk
from nltk import tokenize
nltk.download('punkt')
import spacy_streamlit


# ---------------------------------------------------------------------------
# Page header and shared audio input.
# Every task button below re-transcribes this single uploaded file.
# ---------------------------------------------------------------------------
st.title('Audio-to-Text')

# `audio_file` stays None until the user uploads something; each button
# is responsible for checking that before trying to decode audio.
audio_file = st.file_uploader('Upload Audio', type=['wav', 'mp3', 'm4a'])

st.title('Please select any of the NLP tasks')

# Transcribe the uploaded audio to plain text and show it.
# NOTE: button label typo fixed ('Trascribe' -> 'Transcribe').
if st.button('Transcribe Audio'):
    if audio_file is not None:
        # Wav2Vec2 processor provides feature extraction + CTC decoding;
        # the acoustic model is HuBERT fine-tuned on LibriSpeech 960h.
        processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
        # Resample to 16 kHz, the rate the model expects.
        speech, rate = librosa.load(audio_file, sr=16000)
        input_values = processor(speech, return_tensors="pt", padding="longest", sampling_rate=rate).input_values
        # Inference only: no_grad avoids building an autograd graph.
        with torch.no_grad():
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        # batch_decode returns a list of strings (one per batch item).
        text = processor.batch_decode(predicted_ids)
        result = ' '.join(str(sentence) for sentence in text)
        st.markdown(result)
    else:
        st.error('please upload the audio file')



# Transcribe the audio, then summarize the transcript.
if st.button('Summarize'):
    if audio_file is None:
        # Guard added: clicking before uploading previously crashed inside
        # librosa.load (consistent with the Transcribe button's handling).
        st.error('please upload the audio file')
    else:
        processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
        # Resample to the model's expected 16 kHz.
        speech, rate = librosa.load(audio_file, sr=16000)
        input_values = processor(speech, return_tensors="pt", padding="longest", sampling_rate=rate).input_values
        with torch.no_grad():  # inference only
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        text = processor.batch_decode(predicted_ids)
        result = ' '.join(str(sentence) for sentence in text)
        # Default HF summarization pipeline over the transcript.
        summarize = pipeline("summarization")
        st.markdown(summarize(result)[0]['summary_text'])

# Transcribe the audio, then run sentiment analysis on the transcript.
if st.button('sentiment-analysis'):
    if audio_file is None:
        # Guard added: previously crashed if clicked before an upload.
        st.error('please upload the audio file')
    else:
        processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
        speech, rate = librosa.load(audio_file, sr=16000)
        input_values = processor(speech, return_tensors="pt", padding="longest", sampling_rate=rate).input_values
        with torch.no_grad():  # inference only
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        text = processor.batch_decode(predicted_ids)
        result = ' '.join(str(sentence) for sentence in text)
        # Default HF sentiment pipeline; returns a list of {label, score}.
        nlp_sa = pipeline("sentiment-analysis")
        st.markdown(nlp_sa(result))

# Transcribe the audio, then visualize named entities in the transcript.
if st.button('Name'):
    if audio_file is None:
        # Guard added: previously crashed if clicked before an upload.
        st.error('please upload the audio file')
    else:
        processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
        speech, rate = librosa.load(audio_file, sr=16000)
        input_values = processor(speech, return_tensors="pt", padding="longest", sampling_rate=rate).input_values
        with torch.no_grad():  # inference only
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        text = processor.batch_decode(predicted_ids)
        result = ' '.join(str(sentence) for sentence in text)
        # spaCy NER over the transcript, rendered with spacy_streamlit.
        nlp = spacy.load('en_core_web_sm')
        doc = nlp(result)
        spacy_streamlit.visualize_ner(doc, labels=nlp.get_pipe("ner").labels, title="List of Entities")


# T5 tokenizer for translation. Loaded eagerly on every rerun; it is cheap
# compared to the model weights, which are cached below.
tokenizer = AutoTokenizer.from_pretrained("t5-base")


@st.cache(allow_output_mutation=True)
def load_model():
    """Load the T5 seq2seq model once and reuse it across Streamlit reruns."""
    return AutoModelForSeq2SeqLM.from_pretrained("t5-base")


model1 = load_model()

# Language pickers feeding the 'translate <src> to <tgt>' task prefix.
st.subheader('Select your source and target language below.')
source_lang = st.selectbox("Source language", ['English'])
target_lang = st.selectbox("Target language", ['German', 'French'])


# Transcribe the audio, then translate it sentence-by-sentence with T5.
if st.button('Translate'):
    if audio_file is None:
        # Guard added: previously crashed if clicked before an upload.
        st.error('please upload the audio file')
    else:
        processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
        speech, rate = librosa.load(audio_file, sr=16000)
        input_values = processor(speech, return_tensors="pt", padding="longest", sampling_rate=rate).input_values
        with torch.no_grad():  # inference only
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        text = processor.batch_decode(predicted_ids)
        result = ' '.join(str(sentence) for sentence in text)
        # T5's documented task format is 'translate X to Y: <text>'. The
        # original code concatenated the prefix directly onto the sentence
        # (no ': ' separator), which degrades translation quality.
        prefix = 'translate ' + str(source_lang) + ' to ' + str(target_lang) + ': '
        # Translate per sentence so each stays under the generation limit.
        sentence_token = tokenize.sent_tokenize(result)
        output = tokenizer([prefix + sentence for sentence in sentence_token], padding=True, return_tensors="pt")
        with torch.no_grad():
            translated_id = model1.generate(output["input_ids"], attention_mask=output['attention_mask'], max_length=100)
        translated_word = tokenizer.batch_decode(translated_id, skip_special_tokens=True)
        st.subheader('Translated Text')
        st.write(' '.join(translated_word))