Spaces:
Sleeping
Sleeping
import os
import random

import gradio as gr
import requests
from bs4 import BeautifulSoup
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Checkpoint shared by the tokenizer and the model so the two cannot drift apart.
MODEL_NAME = "liamvbetts/bart-large-cnn-v4"

# Loaded once at import time; summarize() reuses these module-level objects.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# All splits are loaded; get_random_article() reads dataset["validation"].
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Prefer a key supplied through the environment (e.g. a Space secret).
# NOTE(security): the fallback literal below is committed in source and should
# be treated as leaked -- rotate it, configure NEWS_API_KEY in the environment,
# and then remove the fallback.
NEWS_API_KEY = os.environ.get("NEWS_API_KEY", "da2cc601304341e7a39cb5604d0b076b")
def summarize(article):
    """Summarize ``article`` with the module-level seq2seq model.

    Args:
        article: Plain text to summarize.

    Returns:
        The decoded summary string, or an empty string when ``article`` is
        ``None``, empty, or whitespace-only.
    """
    if not article or not article.strip():
        # Nothing to summarize; avoid a pointless (and slow) model call.
        return ""

    # Cap the encoded input at the model's position limit. Without truncation
    # a long article (the scraper keeps up to 1024 *words*, which is usually
    # more than 1024 tokens) would overflow the encoder's position embeddings.
    max_input_tokens = getattr(model.config, "max_position_embeddings", 1024)
    input_ids = tokenizer(
        article,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    ).input_ids

    # Greedy decoding, capped at 128 newly generated tokens.
    outputs = model.generate(input_ids, max_new_tokens=128, do_sample=False)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
def get_random_article():
    """Return the first 1024 characters of a random validation-split article.

    Returns:
        The ``article`` field of one randomly chosen row of
        ``dataset["validation"]``, cut to at most 1024 characters.
    """
    validation_split = dataset["validation"]
    # Index one row directly instead of shuffling the entire split just to
    # keep a single example; the choice is still uniformly random.
    row_index = random.randrange(len(validation_split))
    return validation_split[row_index]["article"][:1024]
def load_article():
    """Return one random dataset article by delegating to get_random_article()."""
    article_text = get_random_article()
    return article_text
def get_news_article():
    """Fetch a random US top headline from NewsAPI and scrape its text.

    Returns:
        A ``(text, title)`` tuple. When the API call fails or yields no usable
        article URL, ``text`` holds a human-readable message and ``title`` is
        an empty string, so the caller always receives two values.
    """
    url = 'https://newsapi.org/v2/top-headlines'
    params = {
        'apiKey': NEWS_API_KEY,
        'country': 'us',  # You can change this as needed
        'pageSize': 100,
    }

    try:
        # A timeout keeps the UI callback from hanging on a stalled connection.
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        articles = response.json().get('articles') or []
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on older requests versions.
        return "Error fetching news: " + str(e), ""

    # Only consider entries that actually carry a URL to scrape.
    candidate_urls = [item.get('url') for item in articles if item.get('url')]
    if not candidate_urls:
        return "No news article found.", ""

    return scrape_article(random.choice(candidate_urls))
def scrape_article(url):
    """Download ``url`` and extract a rough article text and page title.

    The text is every ``<p>`` element's text joined by spaces and cut to the
    first 1024 whitespace-separated words; this is a generic heuristic, not a
    site-specific extractor.

    Args:
        url: Address of the page to scrape.

    Returns:
        A ``(text, title)`` tuple. On any failure ``text`` is an
        ``"Error scraping article: ..."`` message and ``title`` is ``""``.
    """
    try:
        # Bound the wait and reject HTTP error pages instead of scraping them.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extracting the title - this is a general approach.
        # ``.string`` is None when <title> is empty or has nested markup.
        raw_title = soup.title.string if soup.title else None
        title = raw_title.strip() if raw_title and raw_title.strip() else "No Title Available"

        paragraphs = soup.find_all('p')  # This is a simplification
        text = ' '.join(p.get_text() for p in paragraphs)
        truncated_text = ' '.join(text.split()[:1024])  # Truncate to first 1024 words
        return truncated_text, title
    except Exception as e:
        # Deliberately broad: this feeds a UI callback, so any failure is
        # reported as text rather than raised.
        return "Error scraping article: " + str(e), ""
# Using Gradio Blocks
# Builds the UI: a title label, an input textbox and a summary textbox, plus
# buttons that load an article (from the dataset or from the news API) and a
# button that summarizes whatever is in the input textbox.
# NOTE(review): indentation was lost in this copy of the file; the Row is
# assumed to contain the title, input and summary components, with the
# buttons below it -- confirm against the intended layout.
with gr.Blocks() as demo:
    gr.Markdown("## News Summary App")
    gr.Markdown("Enter a news text and get its summary, or load a random article.")
    with gr.Row():
        article_title = gr.Label()  # Component to display the article title
        input_text = gr.Textbox(lines=10, label="Input Text")
        output_text = gr.Textbox(label="Summary")
    load_dataset_article_button = gr.Button("Load Random Article from Dataset")
    load_news_article_button = gr.Button("Load News Article")
    # Dataset loader fills only the input textbox.
    load_dataset_article_button.click(fn=load_article, inputs=[], outputs=input_text)
    # News loader returns two values: article text and title, in that order.
    load_news_article_button.click(fn=get_news_article, inputs=[], outputs=[input_text, article_title])
    summarize_button = gr.Button("Summarize")
    summarize_button.click(fn=summarize, inputs=input_text, outputs=output_text)
# Start the app when the module is executed.
demo.launch()