import gradio as gr                 # Gradio: build web-based user interfaces
from transformers import pipeline   # pipeline: ready-to-use pre-trained models
import torch                        # PyTorch backend used by the transformers pipeline
from bs4 import BeautifulSoup       # BeautifulSoup: parse HTML & XML documents
import requests                     # requests: make HTTP requests to retrieve web content

# Build the summarization pipeline once at import time so the model
# is not reloaded on every request
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_article(url, min_len, max_len):
    # Gradio sliders can deliver floats; the pipeline expects integer lengths
    min_len, max_len = int(min_len), int(max_len)
    try:
        # Send an HTTP GET request to the user-supplied URL and retrieve the page
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Create a BeautifulSoup object to parse the HTML and extract the text content
        soup = BeautifulSoup(r.text, 'html.parser')
        # Find all <h1>/<h2> headings and <p> paragraphs in the HTML content
        results = soup.find_all(['h1', 'h2', 'p'])
        # Extract the text content from each element
        text = [result.text for result in results]
        # Join all the extracted text into a single string representing the article
        ARTICLE = ' '.join(text)
        # Append a special token (<eos>) to sentence-ending punctuation;
        # this makes it easy to split the article into chunks for summarization
        ARTICLE = ARTICLE.replace('.', '.<eos>')
        ARTICLE = ARTICLE.replace('?', '?<eos>')
        ARTICLE = ARTICLE.replace('!', '!<eos>')
        # Split the article into sentences on the <eos> token
        sentences = ARTICLE.split('<eos>')
        # Maximum length (in words) of each chunk sent to the model
        max_chunk = 500
        # Index of the chunk currently being filled
        current_chunk = 0
        # List of chunks; each chunk is a list of words
        chunks = []
        # Greedily pack sentences into chunks: while adding the next sentence
        # keeps the current chunk within max_chunk words, it is appended to
        # that chunk; otherwise the index is advanced and the sentence
        # becomes the first in a new chunk
        for sentence in sentences:
            if len(chunks) == current_chunk + 1:
                if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
                    chunks[current_chunk].extend(sentence.split(' '))
                else:
                    current_chunk += 1
                    chunks.append(sentence.split(' '))
            else:
                chunks.append(sentence.split(' '))
        # Re-join each chunk's word list into a single string so the
        # pipeline receives plain text rather than lists of words
        for chunk_id in range(len(chunks)):
            chunks[chunk_id] = ' '.join(chunks[chunk_id])
        # Summarize every chunk, bounding each summary by the
        # user-selected minimum and maximum lengths
        res = summarizer(chunks, max_length=max_len, min_length=min_len, do_sample=False)
        # Stitch the 'summary_text' value of every chunk into one summary
        summary = ' '.join([summ['summary_text'] for summ in res])
        return summary
    # Surface any failure during the request, parsing, or summarization
    except Exception as e:
        return f"Error: {str(e)}"
# Create the Gradio interface: a textbox for the article URL, two sliders
# for the summary length bounds, and a textbox for the resulting summary
interface = gr.Interface(
    fn=summarize_article,
    inputs=[
        gr.Textbox(label="Enter the article URL"),
        gr.Slider(minimum=10, maximum=100, step=1, label="Adjust Minimum Length"),
        gr.Slider(minimum=50, maximum=1000, step=1, label="Adjust Maximum Length")
    ],
    outputs=gr.Textbox(label="Summary")
)

# Guard the launch so the module can be imported without starting the server
if __name__ == "__main__":
    interface.launch()
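Running the file starts a local Gradio server (by default at http://127.0.0.1:7860). Because the launch is guarded, the handler can also be exercised without the UI; a minimal smoke-test sketch, assuming the code above is saved as app.py (the URL is a placeholder, any publicly reachable article page should work):

# smoke_test.py - hypothetical direct call that bypasses the Gradio UI
from app import summarize_article   # assumes the code above is saved as app.py

# Placeholder URL; substitute any publicly reachable article
url = "https://en.wikipedia.org/wiki/Natural_language_processing"
print(summarize_article(url, min_len=30, max_len=120))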