File size: 6,956 Bytes
67add1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
# app.py
import os
import threading
from functools import wraps
import google.generativeai as genai
from flask import Flask, request, jsonify, send_from_directory
from flask_cors import CORS
from dotenv import load_dotenv
import chromadb

# Import the knowledge-base class from the project's core logic module (app_chromadb.py)
from app_chromadb import MarkdownKnowledgeBase

# --- Initialization and configuration ---
# Load variables from a local .env file into the process environment before
# any of the os.environ lookups below run.
load_dotenv()
# NOTE: static_folder='.' combined with static_url_path='' maps every file in
# the application directory to a URL ("/.env", "/app.py", "/chroma_db/...").
# The layout is kept so front-end assets next to index.html keep working; the
# before_request guard below refuses the files that must never be published.
app = Flask(__name__, static_folder='.', static_url_path='')
# Enable cross-origin requests for all routes.
CORS(app)

# Top-level directories that must never be reachable through the static route
# ('chroma_db' is the on-disk ChromaDB store created next to this file).
_PRIVATE_STATIC_DIRS = frozenset({'chroma_db', '__pycache__'})
# File suffixes that must never be served as static assets.
_PRIVATE_STATIC_SUFFIXES = ('.py', '.pyc', '.pyo', '.env', '.sqlite3', '.db')


@app.before_request
def _deny_private_static_files():
    """Reject requests that would expose secrets, source code or database files.

    Runs before every request. A path is refused when any segment starts with
    a dot (e.g. ``.env``, ``.git``), when its first segment is one of
    ``_PRIVATE_STATIC_DIRS``, or when its last segment ends with one of
    ``_PRIVATE_STATIC_SUFFIXES``. The comparison is case-insensitive so the
    guard also holds on case-insensitive file systems.

    Returns:
        ``None`` to let the request continue, or a JSON 404 response.
    """
    segments = [part.lower() for part in request.path.split('/') if part]
    if not segments:
        return None

    is_private = (
        any(part.startswith('.') for part in segments)
        or segments[0] in _PRIVATE_STATIC_DIRS
        or segments[-1].endswith(_PRIVATE_STATIC_SUFFIXES)
    )
    if is_private:
        # 404 rather than 403 so the response does not reveal that the file exists.
        return jsonify({"error": "Not Found"}), 404
    return None

# --- API key authentication settings ---
# KNOWLEDGE_BASE_API_KEYS holds a comma-separated list of accepted keys; it is
# read once at import time. Surrounding whitespace is stripped and empty
# entries are dropped.
VALID_API_KEYS_STR = os.environ.get("KNOWLEDGE_BASE_API_KEYS", "")
VALID_API_KEYS = set(filter(None, map(str.strip, VALID_API_KEYS_STR.split(','))))
if len(VALID_API_KEYS) == 0:
    # With no keys configured, authentication is effectively disabled.
    print("⚠️ 警告: 未配置 KNOWLEDGE_BASE_API_KEYS。API 将对所有人开放!")

# --- ChromaDB, Gemini, SiliconFlow instance configuration ---
# Each service is initialized in its own try block. On failure the
# corresponding global is set to None and the error is printed, so the module
# still imports; the request handlers check these globals before using them.
try:
    # The vector store lives in a "chroma_db" directory next to this file.
    CHROMA_DATA_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), "chroma_db")
    COLLECTION_NAME = "markdown_knowledge_base_m3"
    chroma_client = chromadb.PersistentClient(path=CHROMA_DATA_PATH)
    collection = chroma_client.get_or_create_collection(name=COLLECTION_NAME)
    print(f"✅ ChromaDB 客户端已连接,数据存储在 '{CHROMA_DATA_PATH}'")
except Exception as e:
    chroma_client = None
    collection = None
    print(f"❌ 初始化 ChromaDB 失败: {e}")

try:
    # NOTE(review): os.environ.get returns None when GEMINI_API_KEY is unset;
    # whether genai.configure() rejects a None key is not visible from this
    # file, so a missing key may only surface on the first model call — confirm.
    GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
    genai.configure(api_key=GEMINI_API_KEY)
    gemini_model = genai.GenerativeModel('gemini-1.5-flash')
    print("✅ Gemini API 已配置。")
except Exception as e:
    gemini_model = None
    print(f"❌ Gemini API 配置失败: {e}")

try:
    SF_API_TOKEN = os.environ.get("SILICONFLOW_API_TOKEN")
    # `collection` is None here if the ChromaDB setup above failed.
    kb_instance = MarkdownKnowledgeBase(api_token=SF_API_TOKEN, chroma_collection=collection)
    print("✅ SiliconFlow 与知识库实例已配置。")
except Exception as e:
    kb_instance = None
    print(f"❌ 知识库实例配置失败: {e}")

# Mutable status dict shared by the request handlers and the background build
# thread. Starts with only 'is_building'; the /status and /build handlers add
# 'total_items', 'is_built' and 'message' as they run.
kb_status = { "is_building": False }

# --- API key authentication decorator ---
def require_api_key(f):
    """Wrap a view so it only runs for requests carrying a valid API key.

    The key is read from the ``X-API-Key`` request header and must be a member
    of ``VALID_API_KEYS``. When ``VALID_API_KEYS`` is empty, authentication is
    disabled and every request is passed through to the view.

    Args:
        f: The view function to protect.

    Returns:
        The wrapped view. Rejected requests get a JSON error body with HTTP 403.
    """
    @wraps(f)
    def guarded_view(*args, **kwargs):
        # No keys configured: the API is open to everyone.
        auth_disabled = not VALID_API_KEYS
        if auth_disabled:
            return f(*args, **kwargs)

        supplied_key = request.headers.get('X-API-Key')
        if not supplied_key or supplied_key not in VALID_API_KEYS:
            return jsonify({"error": "授权失败。请提供有效'X-API-Key'请求头。"}), 403
        return f(*args, **kwargs)
    return guarded_view

# --- Front-end page route ---
@app.route('/')
def serve_index():
    """Serve the front-end entry page, ``index.html``, from the directory ``'.'``."""
    page_dir, page_name = '.', 'index.html'
    return send_from_directory(page_dir, page_name)

# --- API endpoints ---
@app.route('/status', methods=['GET'])
def get_status():
    """Report the current knowledge-base status as JSON.

    Refreshes ``kb_status`` in place and returns it. When a collection is
    available, ``total_items`` and ``is_built`` are updated from the live item
    count, and ``message`` is replaced with a "ready" summary unless a build is
    in progress. Without a collection only ``message`` is set.
    """
    if not collection:
        kb_status['message'] = "ChromaDB 未连接。"
        return jsonify(kb_status)

    item_count = collection.count()
    kb_status['total_items'] = item_count
    kb_status['is_built'] = item_count > 0
    # While a build is running, keep the progress message set by the build task.
    if not kb_status['is_building']:
        kb_status['message'] = f"知识库已就绪,共有 {item_count} 个条目。"
    return jsonify(kb_status)

# Serializes the check-and-set of kb_status['is_building'] so that two
# concurrent POST /build requests cannot both start a build.
_build_state_lock = threading.Lock()


@app.route('/build', methods=['POST'])
@require_api_key
def build_knowledge_base():
    """Start a knowledge-base build in a background thread.

    Expects a JSON object body with the optional keys ``folder_path``,
    ``chunk_size`` (default 4096), ``overlap`` (default 400), ``max_files``
    (default 500), ``sample_mode`` (default ``'largest'``) and
    ``clear_existing`` (default ``False``). When ``clear_existing`` is true and
    a ChromaDB client is available, the collection is deleted and recreated
    before building.

    Returns:
        202 once the background task has been started;
        400 when the body is not a JSON object;
        409 when a build is already running;
        500 when the knowledge-base instance is not initialized.

    Side effects:
        Updates ``kb_status`` (``is_building``, ``message`` and
        ``last_build_error``) and may rebind the module-level ``collection``.
    """
    # Cheap early rejection; the authoritative check is repeated under the lock.
    if kb_status['is_building']:
        return jsonify({"error": "知识库已在构建中,请稍后。"}), 409

    if not kb_instance:
        return jsonify({"error": "知识库实例未初始化,无法构建。"}), 500

    # silent=True: a missing or malformed JSON body yields None instead of
    # raising, so every kind of bad body gets the same JSON 400 below.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "请求体必须是 JSON 对象。"}), 400

    clear_existing = data.get('clear_existing', False)

    build_params = {
        'folder_path': data.get('folder_path'),
        'chunk_size': data.get('chunk_size', 4096),
        'overlap': data.get('overlap', 400),
        'max_files': data.get('max_files', 500),
        'sample_mode': data.get('sample_mode', 'largest')
    }

    # Claim the build slot in the request thread, atomically. Setting the flag
    # only inside the worker would leave a window in which a second request
    # still sees is_building == False and starts a parallel build.
    with _build_state_lock:
        if kb_status['is_building']:
            return jsonify({"error": "知识库已在构建中,请稍后。"}), 409
        kb_status['is_building'] = True
        kb_status['message'] = "构建任务开始..."
        kb_status['last_build_error'] = None

    def build_in_background():
        """Run the (optional) collection reset and the build, then release the slot."""
        global collection
        try:
            if clear_existing and chroma_client:
                print(f"正在清空现有集合: {COLLECTION_NAME}")
                chroma_client.delete_collection(name=COLLECTION_NAME)
                collection = chroma_client.get_or_create_collection(name=COLLECTION_NAME)
                # Point the knowledge base at the freshly created collection.
                kb_instance.collection = collection
                print("集合已清空并重建。")

            kb_instance.build_knowledge_base(**build_params)
            kb_status['message'] = f"构建完成!知识库现有 {collection.count()} 个条目。"
        except Exception as e:
            kb_status['message'] = f"构建时出错: {e}"
            # 'message' is replaced by the next /status call once the build has
            # stopped, so keep the failure reason under a key of its own.
            kb_status['last_build_error'] = str(e)
            print(f"Error during build: {e}")
        finally:
            kb_status['is_building'] = False

    try:
        threading.Thread(target=build_in_background).start()
    except Exception:
        # The worker never ran, so its `finally` cannot release the slot.
        kb_status['is_building'] = False
        raise
    return jsonify({"message": "知识库构建任务已在后台启动。"}), 202

@app.route('/search', methods=['GET'])
@require_api_key
def search_in_kb():
    """Search the knowledge base and return the matches as JSON.

    Query parameters:
        query: The search text (required).
        top_k: Number of results to request; falls back to 5 when absent or
            not parseable as an integer.

    Returns:
        The search results as JSON on success;
        400 when the knowledge base is empty or ``query`` is missing;
        500 when the knowledge-base instance is unavailable or the search raises.
    """
    if not collection or collection.count() <= 0:
        return jsonify({"error": "知识库为空,请先构建。"}), 400

    if not kb_instance:
        return jsonify({"error": "知识库实例未初始化,无法搜索。"}), 500

    params = request.args
    query = params.get('query')
    top_k = params.get('top_k', default=5, type=int)

    if not query:
        return jsonify({"error": "必须提供 'query' 参数"}), 400

    try:
        return jsonify(kb_instance.search(query, top_k=top_k))
    except Exception as e:
        return jsonify({"error": f"搜索时发生错误: {e}"}), 500

@app.route('/summarize', methods=['POST'])
@require_api_key
def summarize_results():
    """Ask the Gemini model to answer a question from supplied search results.

    Expects a JSON object body with:
        query: The user's question.
        results: A non-empty list of objects, each with a string ``content``
            field holding one retrieved passage.

    Returns:
        ``{"summary": ...}`` on success;
        400 when the body is not a JSON object, when ``query`` or ``results``
        is missing or empty, or when ``results`` is malformed;
        500 when the Gemini model is unavailable or the model call fails.
    """
    if not gemini_model:
        return jsonify({"error": "Gemini API 未配置或初始化失败。"}), 500

    # silent=True: a missing or malformed JSON body yields None instead of
    # raising, so it is reported through the same JSON 400 as other bad input.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "必须提供查询和搜索结果。"}), 400

    query = data.get('query')
    search_results = data.get('results')

    if not query or not search_results:
        return jsonify({"error": "必须提供查询和搜索结果。"}), 400

    # Validate the shape up front: indexing item['content'] on arbitrary client
    # data would otherwise raise KeyError/TypeError and surface as a bare 500.
    invalid_results = (
        jsonify({"error": "搜索结果格式无效:每个条目都必须包含文本类型的 'content' 字段。"}),
        400,
    )
    if not isinstance(search_results, list):
        return invalid_results
    contents = []
    for item in search_results:
        content = item.get('content') if isinstance(item, dict) else None
        if not isinstance(content, str):
            return invalid_results
        contents.append(content)

    context = "\n\n---\n\n".join(contents)
    prompt = f"""
    根据以下本地知识库中搜索到的内容,请用清晰、简洁的中文直接回答用户的问题。
    如果内容不足以回答,请说明现有信息无法直接回答。

    用户问题: "{query}"

    搜索到的内容:
    ---
    {context}
    ---

    你的回答:
    """

    try:
        print(f"正在向 Gemini 模型 '{gemini_model.model_name}' 发送请求...")
        response = gemini_model.generate_content(prompt)
        summary = response.text
        return jsonify({"summary": summary})
    except Exception as e:
        print(f"调用 Gemini API 时出错: {e}")
        return jsonify({"error": f"调用 AI 服务时出错: {e}"}), 500


if __name__ == '__main__':
    startup_banner = (
        "知识库后端服务 (最终版) 启动...",
        "✅ 服务已启动!请在浏览器中打开 http://127.0.0.1:5000",
    )
    for banner_line in startup_banner:
        print(banner_line)
    # app.run() starts Flask's built-in server, listening on all interfaces
    # (0.0.0.0) at port 5000 with debug mode off. When the app is deployed
    # behind a production WSGI server, this call is not used.
    app.run(host='0.0.0.0', port=5000, debug=False)