import gradio as gr                 # Gradio: build web-based user interfaces
from transformers import pipeline   # pipeline: ready-to-use pre-trained models
import torch                        # PyTorch backend used by the transformers pipeline
from bs4 import BeautifulSoup       # BeautifulSoup: parse HTML & XML documents
import requests                     # requests: make HTTP requests to retrieve web content

# Build the summarization pipeline once at import time so the model
# is not reloaded on every request
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_article(url, min_len, max_len):
    # Gradio sliders can deliver floats; the pipeline expects integer lengths
    min_len, max_len = int(min_len), int(max_len)
    try:
        # Send an HTTP GET request to the user-supplied URL and retrieve the page
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Create a BeautifulSoup object to parse the HTML and extract the text content
        soup = BeautifulSoup(r.text, 'html.parser')
        # Find all <h1>/<h2> headings and <p> paragraphs in the HTML content
        results = soup.find_all(['h1', 'h2', 'p'])
        # Extract the text content from each element
        text = [result.text for result in results]
        # Join all the extracted text into a single string representing the article
        ARTICLE = ' '.join(text)
        # Append a special token (<eos>) to sentence-ending punctuation;
        # this makes it easy to split the article into chunks for summarization
        ARTICLE = ARTICLE.replace('.', '.<eos>')
        ARTICLE = ARTICLE.replace('?', '?<eos>')
        ARTICLE = ARTICLE.replace('!', '!<eos>')
        # Split the article into sentences on the <eos> token
        sentences = ARTICLE.split('<eos>')
        # Maximum length (in words) of each chunk sent to the model
        max_chunk = 500
        # Index of the chunk currently being filled
        current_chunk = 0
        # List of chunks; each chunk is a list of words
        chunks = []
        # Greedily pack sentences into chunks: while adding the next sentence
        # keeps the current chunk within max_chunk words, it is appended to
        # that chunk; otherwise the index is advanced and the sentence
        # becomes the first in a new chunk
        for sentence in sentences:
            if len(chunks) == current_chunk + 1:
                if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
                    chunks[current_chunk].extend(sentence.split(' '))
                else:
                    current_chunk += 1
                    chunks.append(sentence.split(' '))
            else:
                chunks.append(sentence.split(' '))
        # Re-join each chunk's word list into a single string so the
        # pipeline receives plain text rather than lists of words
        for chunk_id in range(len(chunks)):
            chunks[chunk_id] = ' '.join(chunks[chunk_id])
        # Summarize every chunk, bounding each summary by the
        # user-selected minimum and maximum lengths
        res = summarizer(chunks, max_length=max_len, min_length=min_len, do_sample=False)
        # Stitch the 'summary_text' value of every chunk into one summary
        summary = ' '.join([summ['summary_text'] for summ in res])
        return summary
    # Surface any failure during the request, parsing, or summarization
    except Exception as e:
        return f"Error: {str(e)}"
# Create the Gradio interface: a textbox for the article URL, two sliders
# for the summary length bounds, and a textbox for the resulting summary
interface = gr.Interface(
    fn=summarize_article,
    inputs=[
        gr.Textbox(label="Enter the article URL"),
        gr.Slider(minimum=10, maximum=100, step=1, label="Adjust Minimum Length"),
        gr.Slider(minimum=50, maximum=1000, step=1, label="Adjust Maximum Length")
    ],
    outputs=gr.Textbox(label="Summary")
)

# Guard the launch so the module can be imported without starting the server
if __name__ == "__main__":
    interface.launch()
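Running the file starts a local Gradio server (by default at http://127.0.0.1:7860). Because the launch is guarded, the handler can also be exercised without the UI; a minimal smoke-test sketch, assuming the code above is saved as app.py (the URL is a placeholder, any publicly reachable article page should work):

# smoke_test.py - hypothetical direct call that bypasses the Gradio UI
from app import summarize_article   # assumes the code above is saved as app.py

# Placeholder URL; substitute any publicly reachable article
url = "https://en.wikipedia.org/wiki/Natural_language_processing"
print(summarize_article(url, min_len=30, max_len=120))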