# NOTE: removed Hugging Face Spaces page residue ("Spaces: Sleeping", file size,
# commit hashes, and a column ruler) that was scraped into this file and made it
# invalid Python.
# -*- coding: utf-8 -*-
"""Gradio NLP Group Project.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1hDGMwj7G7avlxrqmXe6SIN9LjLRRsuqE
"""
import gradio as gr
from youtube_transcript_api import YouTubeTranscriptApi
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, MarianMTModel, MarianTokenizer, AutoModelForSequenceClassification
import torch
class TextProcessor:
    """Summarize, translate, and classify text with Hugging Face models.

    Each method loads its model from the Hub on every call; this is slow but
    keeps the methods self-contained.  The constructor's ``text`` is stored
    for reference only — every method takes its input explicitly.
    """

    def __init__(self, text):
        # Kept for callers that want the original transcript around.
        self.text = text

    def summarize_text(self, text):
        """Return an abstractive summary of ``text``.

        Uses a Pegasus model fine-tuned on lectures; input is truncated to
        1024 tokens, output bounded to 40-150 tokens with beam search.
        """
        tokenizer = AutoTokenizer.from_pretrained('cranonieu2021/pegasus-on-lectures')
        model = AutoModelForSeq2SeqLM.from_pretrained("cranonieu2021/pegasus-on-lectures")
        inputs = tokenizer(text, max_length=1024, return_tensors="pt", truncation=True)
        # no_grad: inference only — avoids building an autograd graph
        # (classify_text already did this; summarize/translate did not).
        with torch.no_grad():
            summary_ids = model.generate(
                inputs.input_ids,
                max_length=150,
                min_length=40,
                length_penalty=2.0,
                num_beams=4,
                early_stopping=True,
            )
        return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    def translate_text(self, text):
        """Translate ``text`` with a Marian MT model (EN->ES per the repo name)."""
        model_name = "sfarjebespalaia/enestranslatorforsummaries"
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        # FIX: tokenizer.prepare_seq2seq_batch() is deprecated and removed in
        # recent transformers releases; calling the tokenizer directly is the
        # supported replacement and produces the same encoded batch.
        tokenized_text = tokenizer([text], return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            translated = model.generate(**tokenized_text)
        return tokenizer.decode(translated[0], skip_special_tokens=True)

    def classify_text(self, text):
        """Classify ``text`` into one of five academic subject areas."""
        model_name = "gserafico/roberta-base-finetuned-classifier-roberta1"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        predicted_class_idx = torch.argmax(outputs.logits, dim=1).item()
        # Label order must match the fine-tuned model's output head.
        labels = {
            0: 'Social Sciences',
            1: 'Arts',
            2: 'Natural Sciences',
            3: 'Business and Law',
            4: 'Engineering and Technology'
        }
        return labels[predicted_class_idx]
def get_transcript(video_id):
    """Fetch the English transcript of a YouTube video.

    Returns a ``(text, language_code)`` tuple.  On failure, ``text`` is a
    human-readable error message (the caller pattern-matches on it).
    """
    # FIX: these exception names were referenced below but never imported,
    # which turned a caught error into a NameError at runtime.
    from youtube_transcript_api import NoTranscriptFound, TranscriptsDisabled

    try:
        transcripts = YouTubeTranscriptApi.list_transcripts(video_id)
    except NoTranscriptFound:
        return "No transcript found for this video.", 'en'
    except TranscriptsDisabled:
        return "Transcripts are disabled for this video.", 'en'
    except Exception as e:
        return f"An error occurred: {str(e)}", 'en'

    # Only English is supported downstream; collect codes and check membership.
    available_codes = [transcript.language_code for transcript in transcripts]
    if 'en' in available_codes:
        transcript_list = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
        transcript_text = ' '.join(segment['text'] for segment in transcript_list)
        return transcript_text, 'en'
    # FIX: the original returned a bare string here, which crashed the
    # caller's ``transcript, language = get_transcript(...)`` unpacking.
    return 'Transcript in unsupported language.', 'en'
def process_text(video_id):
    """Fetch a video's transcript, then summarize, translate, and classify it.

    Returns a dict of results, or a dict with an ``Error`` key when the
    transcript could not be fetched or processing failed.
    """
    transcript, language = get_transcript(video_id)
    # FIX: the original check missed the "Transcripts are disabled" and
    # "unsupported language" failure messages, so those error strings were
    # fed into the models as if they were real transcripts.
    error_markers = (
        "An error occurred:",
        "No transcript",
        "Transcripts are disabled",
        "Transcript in unsupported language",
    )
    if any(marker in transcript for marker in error_markers):
        return {"Error": transcript, "Language Detected": "None"}
    processor = TextProcessor(transcript)
    try:
        summarized_text = processor.summarize_text(transcript)
        translated_text = processor.translate_text(summarized_text)
        classification_result = processor.classify_text(summarized_text)
        results = {
            'Language Detected': language,
            'Summarized Text': summarized_text,
            'Translated Text': translated_text,
            'Classification Result': classification_result
        }
    except Exception as e:
        # Best-effort boundary: surface the failure to the UI instead of crashing.
        results = {'Error': f"An error occurred during processing: {str(e)}"}
    return results
# Gradio UI wiring: one video-ID textbox in, one JSON panel out.
_video_id_box = gr.Textbox(label="YouTube Video ID")
_results_panel = gr.JSON(label="Results")

iface = gr.Interface(
    fn=process_text,
    inputs=[_video_id_box],
    outputs=[_results_panel],
    title="Text Processing App with YouTube Transcript",
    description="This app allows you to fetch, summarize, translate, and classify YouTube video transcripts. Errors are handled and displayed.",
)
def main():
    """Launch the Gradio web interface."""
    iface.launch()


if __name__ == '__main__':
    main()