Spaces:

Kawthar12h
/

Text_Summarization

Build error

App Files Files Community

Kawthar12h committed on Sep 14, 2024

Commit

c0ea65f

verified ·

1 Parent(s): 62dc08f

Create app.py

Browse files

Files changed (1) hide show

app.py +97 -0

app.py ADDED Viewed

	@@ -0,0 +1,97 @@

+import gradio as gr
+from transformers import pipeline
+import torch
+from bs4 import BeautifulSoup
+import requests
# Shared summarization pipeline, created on first use so the (large) model is
# loaded once per process instead of once per request.
_SUMMARIZER = None

# Seconds to wait for the article's web server before giving up.
_REQUEST_TIMEOUT = 15

# Upper bound, in whitespace-separated words, for one chunk sent to the model.
_MAX_CHUNK_WORDS = 500


def _get_summarizer():
    """Return the shared summarization pipeline, building it on first call."""
    global _SUMMARIZER
    if _SUMMARIZER is None:
        _SUMMARIZER = pipeline("summarization", model="facebook/bart-large-cnn")
    return _SUMMARIZER


def _split_into_chunks(article, max_chunk=_MAX_CHUNK_WORDS):
    """Group the sentences of *article* into chunks of at most *max_chunk* words.

    Sentences are kept whole: a sentence that would push the current chunk
    past the limit starts a new chunk instead. A single sentence longer than
    *max_chunk* words therefore becomes one oversized chunk of its own.

    Returns a list of strings; the list is empty when *article* has no words.
    """
    import re  # local import: the file's import header does not provide it

    # Split only where sentence-ending punctuation is followed by whitespace,
    # so decimals ("3.14") and dotted names ("example.com") stay intact.
    sentences = re.split(r"(?<=[.!?])\s+", article)

    chunks = []
    current_words = []
    for sentence in sentences:
        words = sentence.split()
        if not words:
            continue
        if current_words and len(current_words) + len(words) > max_chunk:
            chunks.append(' '.join(current_words))
            current_words = []
        current_words.extend(words)
    if current_words:
        chunks.append(' '.join(current_words))
    return chunks


def summarize_article(url, min_len, max_len):
    """Download the web page at *url* and return a summary of its text.

    The text of every <h1> and <p> element is collected, split into chunks of
    at most ``_MAX_CHUNK_WORDS`` words, and each chunk is summarized
    separately; the per-chunk summaries are joined with spaces.

    Args:
        url: Address of the article to summarize.
        min_len: Passed to the model as ``min_length`` for each chunk.
        max_len: Passed to the model as ``max_length`` for each chunk.
            (The model applies both limits per chunk, so the combined
            summary of a multi-chunk article can be longer than *max_len*.)

    Returns:
        The summary text, or a string starting with ``"Error: "`` describing
        what went wrong. This function never raises: it is the UI callback
        and reports every failure as text.
    """
    # Reject bad input before doing any network or model work.
    url = (url or "").strip()
    if not url:
        return "Error: Please enter an article URL."
    try:
        # UI sliders may deliver floats; the generation limits must be ints.
        min_len = int(min_len)
        max_len = int(max_len)
    except (TypeError, ValueError):
        return "Error: Minimum and maximum length must be numbers."
    if min_len > max_len:
        return "Error: Minimum length must not exceed maximum length."

    try:
        # A timeout keeps an unresponsive site from hanging the request, and
        # raise_for_status() stops an HTTP error page from being summarized.
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        pieces = [element.text for element in soup.find_all(['h1', 'p'])]

        # Collapse every run of whitespace (including newlines) to one space
        # so words on either side of a line break are not glued together.
        article = ' '.join(' '.join(pieces).split())

        chunks = _split_into_chunks(article)
        if not chunks:
            return "Error: No article text (<h1> or <p> elements) was found at that URL."

        # truncation=True guards against a chunk whose token count exceeds
        # the model's maximum input size.
        results = _get_summarizer()(
            chunks,
            max_length=max_len,
            min_length=min_len,
            do_sample=False,
            truncation=True,
        )
        return ' '.join(item['summary_text'] for item in results)
    # Top-level UI boundary: surface any failure (network, parsing, model)
    # to the user as text rather than crashing the app.
    except Exception as e:
        return f"Error: {str(e)}"
# Build the Gradio UI: one text box for the article URL plus two sliders that
# feed the min_len / max_len arguments of summarize_article, in that order.
article_inputs = [
    gr.Textbox(label="Enter the article URL"),
    gr.Slider(label="Adjust Minimum Length", minimum=10, maximum=100, step=1),
    gr.Slider(label="Adjust Maximum Length", minimum=50, maximum=1000, step=1),
]
summary_output = gr.Textbox(label="Summary")

interface = gr.Interface(
    fn=summarize_article,
    inputs=article_inputs,
    outputs=summary_output,
)
# Start the web app.
interface.launch()