import PyPDF2
import spacy
import subprocess
import sys
from collections import Counter
import heapq
import io

# Automatically check for and download the spaCy language model (en_core_web_sm)
# so the Space does not fail when the model is missing.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=True)
    nlp = spacy.load("en_core_web_sm")


def read_pdf(file_stream):
    """Read the text content of a PDF."""
    text = ''
    reader = PyPDF2.PdfReader(file_stream)
    for page in reader.pages:
        # extract_text() may return None for pages with no extractable text
        text += (page.extract_text() or '') + ' '
    return text.strip()


def extract_key_phrases(text):
    """Extract key noun phrases and named entities from the text."""
    doc = nlp(text)
    key_phrases = [chunk.text for chunk in doc.noun_chunks] + [ent.text for ent in doc.ents]
    return key_phrases


def score_sentences(text, key_phrases):
    """Score each sentence by how many key phrases it contains."""
    sentence_scores = {}
    doc = nlp(text)
    for sent in doc.sents:
        for phrase in key_phrases:
            if phrase in sent.text:
                if sent in sentence_scores:
                    sentence_scores[sent] += 1
                else:
                    sentence_scores[sent] = 1
    return sentence_scores


def summarize_text(sentence_scores, num_points=5):
    """Pick the highest-scoring sentences and format them as bullet points."""
    summary_sentences = heapq.nlargest(num_points, sentence_scores, key=sentence_scores.get)
    summary = '\n'.join([f"- {sent.text}" for sent in summary_sentences])
    return summary
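

# A minimal usage sketch of the pipeline above, assuming a local PDF file;
# "example.pdf" is a hypothetical placeholder path, not part of the original module.
if __name__ == "__main__":
    with open("example.pdf", "rb") as f:
        pdf_text = read_pdf(f)

    phrases = extract_key_phrases(pdf_text)
    scores = score_sentences(pdf_text, phrases)
    print(summarize_text(scores, num_points=5))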