englissi commited on
Commit
2a9930b
·
verified ·
1 Parent(s): ff7c6ad

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -0
app.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+ import gradio as gr
3
+ from transformers import pipeline
4
+ import re
5
+
6
+ # Load the summarization model once at module level; summarize_section() reuses this pipeline on every call.
7
+ summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
8
+
9
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of the PDF at *pdf_path*.

    Each page's text is followed by a newline, so the pages stay separated
    in the combined string.

    Args:
        pdf_path: Filesystem path of the PDF document to open.

    Returns:
        The text of all pages concatenated in page order.

    Raises:
        Whatever ``fitz.open`` or page loading raises for an unreadable or
        invalid document; callers are expected to handle it.
    """
    # Open the document as a context manager so it is closed even when text
    # extraction fails part-way through (the handle used to be leaked).
    with fitz.open(pdf_path) as doc:
        page_texts = (
            doc.load_page(page_num).get_text("text")
            for page_num in range(doc.page_count)
        )
        # join() avoids the repeated string re-allocation of `text += ...`.
        return "".join(f"{page_text}\n" for page_text in page_texts)
16
+
17
def find_section(text, section_title):
    """Return the paragraph beginning at the first line that mentions a title.

    The search is case-insensitive and matches the title anywhere within a
    line. The returned section runs from the start of that line up to the
    next blank line (``"\\n\\n"``) or, if there is none, to the end of *text*.

    Args:
        text: Full document text to search.
        section_title: Title to look for; it is matched literally.

    Returns:
        The stripped section text, or ``None`` when no line contains the title.
    """
    # Escape the title so regex metacharacters in it (e.g. "(" or "?") are
    # matched literally instead of changing the pattern or raising re.error.
    heading = re.compile(
        r'^.*' + re.escape(section_title) + r'.*$',
        re.IGNORECASE | re.MULTILINE,
    )
    # Only the first occurrence is needed, so search() replaces finditer().
    match = heading.search(text)
    if match is None:
        return None

    start_idx = match.start()
    end_idx = text.find('\n\n', start_idx)
    if end_idx == -1:
        # No blank line after the heading: the section runs to the end.
        end_idx = len(text)
    return text[start_idx:end_idx].strip()
29
+
30
def summarize_section(text, section_title, max_length=150):
    """Summarize the section of *text* that is introduced by *section_title*.

    Args:
        text: Full document text.
        section_title: Title of the section to locate via ``find_section``.
        max_length: Upper bound on the summary length passed to the model.

    Returns:
        The generated summary, a "not found" message when the section is
        missing or empty, or an error message when summarization fails. The
        function never raises; failures are reported in the returned string
        so they can be shown directly in the UI.
    """
    try:
        section_text = find_section(text, section_title)
        if not section_text:
            return f"Section '{section_title}' not found."
        # truncation=True clips input that exceeds the model's maximum input
        # length instead of letting the model fail on over-long sections.
        summary = summarizer(
            section_text,
            max_length=max_length,
            min_length=30,
            do_sample=False,
            truncation=True,
        )
        return summary[0]['summary_text']
    except Exception as e:
        # UI boundary: report the failure as text rather than crash the app.
        return f"Error processing section '{section_title}': {str(e)}"
39
+
40
def process_pdf(file):
    """Summarize the abstract, research question and results of a PDF.

    Args:
        file: The uploaded file as delivered by the ``gr.File`` input. It may
            be an object exposing the path as ``.name``, a plain path string,
            or ``None`` when nothing was uploaded.

    Returns:
        A list of three strings, in order: abstract summary, research
        question summary, results summary. If the PDF cannot be read, all
        three entries carry the same error message.
    """
    if file is None:
        # Submitting without an upload: say so instead of surfacing an
        # AttributeError message about NoneType.
        return ["Error extracting text from PDF: no file was uploaded."] * 3

    try:
        # Accept both a path string and a file-like wrapper with `.name`.
        pdf_path = file if isinstance(file, str) else file.name
        text = extract_text_from_pdf(pdf_path)
    except Exception as e:
        return [f"Error extracting text from PDF: {str(e)}"] * 3

    section_titles = ("abstract", "research question", "results")
    return [summarize_section(text, title) for title in section_titles]
51
+
52
# Gradio interface setup: one uploaded PDF in, three summary text boxes out.
interface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF"),
    # One text box per value returned by process_pdf, in the same order.
    outputs=[
        gr.Textbox(label=caption)
        for caption in (
            "Abstract Summary",
            "Research Question Summary",
            "Results Summary",
        )
    ],
)
62
+
63
+ # Run the interface. NOTE: this executes whenever the module is loaded; no `__main__` guard is visible here.
64
+ interface.launch()