# -*- coding: utf-8 -*-
"""Gradio NLP Group Project.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1hDGMwj7G7avlxrqmXe6SIN9LjLRRsuqE
"""

import gradio as gr
from youtube_transcript_api import YouTubeTranscriptApi
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, MarianMTModel, MarianTokenizer, AutoModelForSequenceClassification
import torch
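
# The models used below are downloaded from the Hugging Face Hub on first use
# and cached locally. Note that each TextProcessor method re-loads its model on
# every call, which is simple but slow for repeated requests.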

class TextProcessor:
    def __init__(self, text):
        self.text = text

    def summarize_text(self, text):
        # Summarize with a Pegasus model fine-tuned on lecture transcripts.
        tokenizer = AutoTokenizer.from_pretrained("cranonieu2021/pegasus-on-lectures")
        model = AutoModelForSeq2SeqLM.from_pretrained("cranonieu2021/pegasus-on-lectures")
        # Truncate long transcripts to at most 1024 input tokens.
        inputs = tokenizer(text, max_length=1024, return_tensors="pt", truncation=True)
        # Beam search with a length penalty keeps the summary between 40 and 150 tokens.
        summary_ids = model.generate(inputs.input_ids, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
        return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    def translate_text(self, text):
        # Translate the English summary to Spanish with a fine-tuned MarianMT model.
        model_name = "sfarjebespalaia/enestranslatorforsummaries"
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        # prepare_seq2seq_batch() is deprecated in transformers; calling the
        # tokenizer directly is the current API and produces the same batch.
        tokenized_text = tokenizer([text], return_tensors="pt", truncation=True)
        translated = model.generate(**tokenized_text)
        return tokenizer.decode(translated[0], skip_special_tokens=True)

    def classify_text(self, text):
        # Classify the summary into one of five academic subject areas
        # using a fine-tuned RoBERTa sequence classifier.
        model_name = "gserafico/roberta-base-finetuned-classifier-roberta1"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():  # inference only, no gradients needed
            outputs = model(**inputs)
        predicted_class_idx = torch.argmax(outputs.logits, dim=1).item()
        labels = {
            0: 'Social Sciences',
            1: 'Arts',
            2: 'Natural Sciences',
            3: 'Business and Law',
            4: 'Engineering and Technology'
        }
        return labels[predicted_class_idx]
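
# Example usage of TextProcessor (a sketch; the sample transcript is made up):
#   processor = TextProcessor("Today we cover gradient descent ...")
#   summary = processor.summarize_text(processor.text)
#   spanish = processor.translate_text(summary)
#   subject = processor.classify_text(summary)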

def get_transcript(video_id):
    # List every transcript (manual and auto-generated) available for the video.
    transcripts = YouTubeTranscriptApi.list_transcripts(video_id)

    # Collect the language code of each available transcript.
    available_languages = [transcript.language_code for transcript in transcripts]

    if 'en' in available_languages:
        # Fetch the English transcript and join its snippets into one string.
        transcript_list = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
        transcript_text = ' '.join([i['text'] for i in transcript_list])
        return transcript_text, 'en'
    else:
        # Return a (message, None) pair so callers can always unpack two values.
        return 'Transcript in unsupported language.', None
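
# Note: video_id is the 11-character ID from the watch URL
# (e.g. "dQw4w9WgXcQ" in https://www.youtube.com/watch?v=dQw4w9WgXcQ),
# not the full URL.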

def process_text(video_id):
    transcript, language = get_transcript(video_id)
    # get_transcript returns (message, None) when no English transcript exists.
    if language is None:
        return {"Error": transcript, "Language Detected": "None"}
    processor = TextProcessor(transcript)

    results = {"Language Detected": language}  # Include language in the output for debugging

    if language == 'en':
        summarized_text = processor.summarize_text(transcript)
        translated_text = processor.translate_text(summarized_text)
        classification_result = processor.classify_text(summarized_text)
        results.update({
            'Summarized Text': summarized_text,
            'Translated Text': translated_text,
            'Classification Result': classification_result
        })
    else:
        results.update({'Error': 'Unsupported language'})

    return results

iface = gr.Interface(
    fn=process_text,
    inputs=[gr.Textbox(label="YouTube Video ID")],
    outputs=[gr.JSON(label="Results")],
    title="Text Processing App with YouTube Transcript",
    description="This app lets you fetch, summarize, translate, and classify YouTube video transcripts based on their language. Currently, only English-to-Spanish translation is supported."
)
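
# launch() serves the app locally (default http://127.0.0.1:7860); pass
# share=True to get a temporary public URL, e.g. when running in Colab.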

def main():
    iface.launch()

if __name__ == '__main__':
    main()