import sqlite3
import json
import random
import os
import re
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize the GPT model
model_name = "EleutherAI/pythia-410m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Data folder and SQLite cache path
DATA_DIR = "./data"
DB_PATH = os.path.join(DATA_DIR, "sentences.db")
# Create the sentence cache table if it does not exist
def init_db():
    conn = sqlite3.connect(DB_PATH)
    c = conn.cursor()
    c.execute('''
        CREATE TABLE IF NOT EXISTS sentences (
            word TEXT PRIMARY KEY,
            phonetic TEXT,
            sentence TEXT,
            created_at DATETIME DEFAULT CURRENT_TIMESTAMP
        )
    ''')
    conn.commit()
    conn.close()
# Scan the data folder automatically to build the source menu
def get_sources():
    # sentences.db also lives in DATA_DIR, but the .json filter excludes it
    files = os.listdir(DATA_DIR)
    sources = [f.split(".json")[0] for f in files if f.endswith(".json")]
    return sources
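# Each {source}.json is expected to hold a list of {"word", "phonetic"} objects
# (inferred from how get_words_with_sentences reads word_data below). A
# hypothetical data/basic.json might look like:
# [
#     {"word": "apple", "phonetic": "/ˈæp.əl/"},
#     {"word": "run", "phonetic": "/rʌn/"}
# ]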
# Look up the sentence cache
def get_sentence(word):
    conn = sqlite3.connect(DB_PATH)
    c = conn.cursor()
    c.execute('SELECT word, phonetic, sentence FROM sentences WHERE word=?', (word,))
    result = c.fetchone()
    conn.close()
    return result
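# Example (illustrative values): a hit returns the stored row, a miss returns None.
#   get_sentence("apple")  # -> ("apple", "/ˈæp.əl/", "I ate an apple.") or None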
# Save a sentence to SQLite (upsert: refresh the cached row on conflict)
def save_sentence(word, phonetic, sentence):
    conn = sqlite3.connect(DB_PATH)
    c = conn.cursor()
    c.execute('''
        INSERT INTO sentences (word, phonetic, sentence)
        VALUES (?, ?, ?)
        ON CONFLICT(word) DO UPDATE SET sentence=excluded.sentence, phonetic=excluded.phonetic
    ''', (word, phonetic, sentence))
    conn.commit()
    conn.close()
# Clean the noise out of GPT-generated sentences
def clean_sentence(output):
    output = output.split(":")[-1].strip()                    # drop the echoed prompt up to the colon
    output = re.sub(r"^\d+\.\s*", "", output).strip()         # drop leading list numbering like "1. "
    output = re.sub(r"Write.*?beginners\.", "", output, flags=re.IGNORECASE).strip()
    output = re.sub(r"\*\*?\d+\.*\*\*", "", output).strip()   # drop markdown-bold numbering like "**1.**"
    if not output.endswith("."):
        output += "."
    return output
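# Illustrative example (hypothetical model output) of the cleanup above:
#   raw:     "A simple English sentence with the word 'apple': 1. I ate an apple"
#   cleaned: "I ate an apple."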
# Core: draw words, then answer from the cache or generate sentences with GPT
def get_words_with_sentences(source, n):
    status = []
    display_result = ""
    try:
        # Load the word list
        data_path = os.path.join(DATA_DIR, f"{source}.json")
        with open(data_path, 'r', encoding='utf-8') as f:
            words = json.load(f)
        # Randomly draw n words
        selected_words = random.sample(words, n)
        for i, word_data in enumerate(selected_words):
            word = word_data['word']
            phonetic = word_data['phonetic']
            # Check the cache for an existing example sentence
            cached_result = get_sentence(word)
            if cached_result:
                sentence = cached_result[2]
                status.append(f"✅ {word} already has a sentence; read from the cache")
            else:
                # Cache miss: generate a sentence with GPT
                status.append(f"📝 Generating a sentence for word {i + 1}/{n} [{word}]...")
                prompt = f"A simple English sentence with the word '{word}':"
                inputs = tokenizer(prompt, return_tensors="pt")
                outputs = model.generate(**inputs, max_new_tokens=30)
                sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
                # Clean up the generated sentence
                sentence = clean_sentence(sentence)
                # Store it in the cache
                save_sentence(word, phonetic, sentence)
            # Format the output as an HTML card
            display_result += f"""
            <div style="border-bottom: 1px solid #ddd; margin-bottom: 10px; padding-bottom: 5px;">
                <p><strong>📖 Word:</strong> {word}</p>
                <p><strong>🔤 Phonetic:</strong> {phonetic}</p>
                <p><strong>✍️ Sentence:</strong> {sentence}</p>
            </div>
            """
        status.append("✅ Done!")
        return display_result, "\n".join(status)
    except Exception as e:
        status.append(f"❌ Error: {str(e)}")
        return f"<p style='color:red;'>Error: {str(e)}</p>", "\n".join(status)
# Create the table automatically at startup
init_db()
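# --- Usage sketch ---
# A minimal, assumed wiring for a Gradio Space UI: the original interface code
# is not shown above, so every component name and label below is illustrative.
# The sketch relies only on the two functions defined in this file:
# get_sources() for the dropdown choices and get_words_with_sentences() for
# the click handler.
import gradio as gr

def draw(source, n):
    # Gradio sliders may deliver floats; random.sample() needs an int count.
    return get_words_with_sentences(source, int(n))

with gr.Blocks() as demo:
    source = gr.Dropdown(choices=get_sources(), label="Word list")
    count = gr.Slider(1, 10, value=5, step=1, label="Number of words")
    btn = gr.Button("Draw words")
    cards = gr.HTML()                            # receives display_result
    log = gr.Textbox(label="Status", lines=6)    # receives the joined status log
    btn.click(draw, inputs=[source, count], outputs=[cards, log])

demo.launch()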