Summarizer

Runtime error

App Files Community

eevaw committed on Oct 21, 2024

Commit

d11e31d

verified ·

1 Parent(s): afb302c

Create app.py

Browse files

Files changed (1) hide show

app.py +75 -0

app.py ADDED Viewed

	@@ -0,0 +1,75 @@

+!pip install gradio PyMuPDF
+import gradio as gr
+from transformers import T5Tokenizer, MT5ForConditionalGeneration
+import fitz  # PyMuPDF
+# Load the fine-tuned tokenizer and model
+model_name = "fine-tuned-mt5"
+new_tokenizer = T5Tokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
+new_model = MT5ForConditionalGeneration.from_pretrained(model_name)
+# Function to extract text from PDF using PyMuPDF
+def extract_text_from_pdf(pdf_file):
+    text = ""
+    # Open the PDF file
+    with fitz.open(pdf_file) as doc:
+        for page in doc:
+            text += page.get_text()  # Extract text from each page
+    return text
+# Summarization function
+def summarize_pdf(pdf_file, max_summary_length):
+    # Extract text from the PDF
+    input_text = extract_text_from_pdf(pdf_file)
+    # Tokenize the input to check length
+    tokenized_input = new_tokenizer.encode(input_text, return_tensors='pt')
+    try:
+        # Generate the summary
+        summary_ids = new_model.generate(
+            tokenized_input,
+            max_length=max_summary_length,
+            min_length=30,
+            num_beams=15,
+            repetition_penalty=5.0,
+            no_repeat_ngram_size=2
+        )
+        # Decode the generated summary
+        summary = new_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+        # Clean up the summary to remove unwanted tokens
+        cleaned_summary = ' '.join([token for token in summary.split() if not token.startswith('<extra_id_')]).strip()
+        # Ensure the summary ends with a complete sentence
+        if cleaned_summary:
+            last_period_index = cleaned_summary.rfind('.')
+            if last_period_index != -1 and last_period_index < len(cleaned_summary) - 1:
+                cleaned_summary = cleaned_summary[:last_period_index + 1]
+            else:
+                cleaned_summary = cleaned_summary.strip()
+        return cleaned_summary if cleaned_summary else "No valid summary generated."
+    except Exception as e:
+        return str(e)  # Return the error message for debugging
+# Define the Gradio interface
+interface = gr.Interface(
+    fn=summarize_pdf,
+    inputs=[
+        gr.File(label="Upload PDF"),
+        gr.Slider(50, 300, step=10, label="Max summary length")
+    ],
+    outputs="textbox",  # A textbox for the output summary
+    title="PDF Text Summarizer",
+    description="Upload a PDF file to summarize its content."
+)
+# Launch the interface
+# Launch the interface with debug mode enabled
+interface.launch(debug=True)