eevaw committed on
Commit
d11e31d
·
verified ·
1 Parent(s): afb302c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +75 -0
app.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# NOTE: `!pip install gradio PyMuPDF` is IPython/Jupyter shell syntax and is a
# SyntaxError in a plain Python file, so it must not appear here. Install the
# dependencies outside the script instead (for example by listing gradio,
# PyMuPDF and transformers, plus whatever backend they need, in
# requirements.txt).

import fitz  # PyMuPDF
import gradio as gr
from transformers import MT5ForConditionalGeneration, T5Tokenizer

# Load the fine-tuned tokenizer and model once at import time so every request
# reuses the same instances.
# NOTE(review): "fine-tuned-mt5" has no namespace, so it presumably refers to a
# local directory next to this script -- confirm it is shipped with the app.
model_name = "fine-tuned-mt5"
new_tokenizer = T5Tokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
new_model = MT5ForConditionalGeneration.from_pretrained(model_name)
11
+
12
+ # Function to extract text from PDF using PyMuPDF
13
def extract_text_from_pdf(pdf_file):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_file: Path to the PDF (``str`` or ``os.PathLike``), or an object
            that exposes the path as ``.name`` (for example a temporary-file
            wrapper handed over by an upload widget).

    Returns:
        The text of all pages joined in page order. The string is empty when
        no page yields any text.
    """
    import os

    # Upload widgets may hand over a temp-file object instead of a path
    # string; fitz.open() needs the path itself. Real paths are passed through
    # untouched (note that pathlib.Path.name is only the final component, so
    # it must not be used for path-like inputs).
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = getattr(pdf_file, "name", pdf_file)

    with fitz.open(pdf_path) as doc:
        # Build the result in one pass instead of repeated string +=.
        return "".join(page.get_text() for page in doc)
20
+
21
+ # Summarization function
22
def _strip_sentinel_tokens(summary):
    """Drop whitespace-separated tokens that start with ``<extra_id_``."""
    return ' '.join(
        token for token in summary.split() if not token.startswith('<extra_id_')
    )


def _trim_to_last_sentence(text):
    """Cut *text* right after its last period; return it unchanged if it has none."""
    last_period_index = text.rfind('.')
    if last_period_index == -1:
        return text
    return text[:last_period_index + 1]


def summarize_pdf(pdf_file, max_summary_length, max_input_tokens=512):
    """Summarize the text of an uploaded PDF with the fine-tuned model.

    Args:
        pdf_file: The uploaded PDF, passed unchanged to
            ``extract_text_from_pdf``; ``None`` when nothing was uploaded.
        max_summary_length: Upper bound passed to ``generate`` as
            ``max_length``. Coerced to ``int`` because UI sliders may deliver
            floats.
        max_input_tokens: The tokenized input is truncated to this many
            tokens before generation. NOTE(review): the default of 512 is an
            assumption about the length the model was fine-tuned with --
            adjust it to match the actual training setup.

    Returns:
        The cleaned summary, a short notice when there is nothing to
        summarize, or the exception message if any step fails.
    """
    # Guard first: without an upload there is nothing to open or tokenize.
    if pdf_file is None:
        return "Please upload a PDF file."

    try:
        # Extraction and tokenization live inside the try so that a corrupt
        # PDF or tokenizer failure is reported the same way as a generation
        # failure.
        input_text = extract_text_from_pdf(pdf_file)
        if not input_text.strip():
            return "No text could be extracted from the PDF."

        max_length = int(max_summary_length)

        # Truncate so an arbitrarily long document cannot produce an
        # unbounded input sequence.
        tokenized_input = new_tokenizer.encode(
            input_text,
            return_tensors='pt',
            truncation=True,
            max_length=max_input_tokens,
        )

        summary_ids = new_model.generate(
            tokenized_input,
            max_length=max_length,
            # Keep min_length <= max_length even for very small limits.
            min_length=min(30, max_length),
            num_beams=15,
            repetition_penalty=5.0,
            no_repeat_ngram_size=2,
        )

        summary = new_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    except Exception as e:
        # UI boundary: surface the error text in the output box for debugging.
        return str(e)

    # Remove leftover sentinel tokens, then drop any trailing partial sentence.
    cleaned_summary = _trim_to_last_sentence(_strip_sentinel_tokens(summary))
    return cleaned_summary if cleaned_summary else "No valid summary generated."
60
+
61
+ # Define the Gradio interface
62
# Input widgets, listed in the same order as summarize_pdf's parameters.
pdf_upload = gr.File(label="Upload PDF")
length_slider = gr.Slider(50, 300, step=10, label="Max summary length")

# Wire the summarizer to a simple form: two inputs in, one text box out.
interface = gr.Interface(
    fn=summarize_pdf,
    inputs=[pdf_upload, length_slider],
    outputs="textbox",  # the summary (or an error message) is shown as text
    title="PDF Text Summarizer",
    description="Upload a PDF file to summarize its content.",
)

# Launch the interface with debug mode enabled
interface.launch(debug=True)