parkerjj committed on
Commit
a743ea2
·
1 Parent(s): 3fdaf75

优化情感得分计算逻辑,增加长文本处理功能,改进模型得分组合方式,添加 nltk 依赖

Browse files
Files changed (3) hide show
  1. blkeras.py +2 -4
  2. preprocess.py +103 -54
  3. requirements.txt +2 -1
blkeras.py CHANGED
@@ -96,7 +96,7 @@ def ensure_fixed_shape(data, shape, variable_name=""):
96
 
97
  def predict(text: str, stock_codes: list):
98
  from tensorflow.keras.preprocessing.sequence import pad_sequences # type: ignore
99
- from preprocess import get_document_vector, get_stock_info, preprocessing_entry, process_entities, process_pos_tags, processing_entry
100
 
101
  try:
102
 
@@ -110,10 +110,8 @@ def predict(text: str, stock_codes: list):
110
  #print(f"predict() Input text: {input_text}")
111
 
112
  # 使用预处理函数处理文本
113
- processed_entry = processing_entry(input_text)
114
-
115
  # 解包 processed_entry 中的各个值
116
- lemmatized_entry, pos_tag, ner, _ , sentiment_score = processed_entry
117
 
118
  # 分别打印每个变量,便于调试
119
  #print("Lemmatized Entry:", lemmatized_entry)
 
96
 
97
  def predict(text: str, stock_codes: list):
98
  from tensorflow.keras.preprocessing.sequence import pad_sequences # type: ignore
99
+ from preprocess import get_document_vector, get_stock_info, process_entities, process_pos_tags, processing_entry
100
 
101
  try:
102
 
 
110
  #print(f"predict() Input text: {input_text}")
111
 
112
  # 使用预处理函数处理文本
 
 
113
  # 解包 processed_entry 中的各个值
114
+ lemmatized_entry, pos_tag, ner, _ , sentiment_score = processing_entry(input_text)
115
 
116
  # 分别打印每个变量,便于调试
117
  #print("Lemmatized Entry:", lemmatized_entry)
preprocess.py CHANGED
@@ -223,72 +223,121 @@ def get_document_vector(words, model = word2vec_model):
223
 
224
 
225
  # 函数:获取情感得分
226
- def get_sentiment_score(text):
 
 
 
 
227
  try:
228
- import torch
229
-
230
- # 获取第一个模型的结果 (ProsusAI/finbert)
231
- # result_one = sentiment_analyzer_one(text, truncation=True, max_length=512)[0]
232
- # 获取模型输出
233
- with torch.no_grad():
234
- outputs_one = sa_model_one(**tokenizer_one(text, return_tensors="pt", truncation=False))
235
- predictions_one = torch.nn.functional.softmax(outputs_one.logits, dim=-1)
236
-
237
-
238
- outputs_two = sa_model_two(**tokenizer_two(text, return_tensors="pt", truncation=False))
239
- predictions_two = torch.nn.functional.softmax(outputs_two.logits, dim=-1)
240
-
241
- # 获取所有标签的概率
242
- scores_one = predictions_one[0].tolist()
243
- scores_two = predictions_two[0].tolist()
244
-
245
- # 获取标签映射
246
- # labels_one = sa_model_one.config.id2label
247
- # labels_two = sa_model_two.config.id2label
248
-
249
- # 打印所有标签的概率
250
- score_one_positive = scores_one[0]
251
- score_one_negative = scores_one[1]
252
- score_one_neutral = scores_one[2]
253
-
254
-
255
- final_score_one = 0.0
256
- final_score_one += score_one_positive
257
- final_score_one -= score_one_negative
258
- if score_one_positive > score_one_negative:
259
- final_score_one += score_one_neutral
260
- else:
261
- final_score_one -= score_one_neutral
262
-
263
- final_score_one = max(-1.0, min(1.0, final_score_one))
264
-
265
- score_two_neutral = scores_two[0]
266
- score_two_positive = scores_two[1]
267
- score_two_negative = scores_two[2]
268
 
269
- final_score_two = 0.0
270
- final_score_two += score_two_positive
271
- final_score_two -= score_two_negative
272
- if score_two_positive > score_two_negative:
273
- final_score_two += score_two_neutral
 
 
 
 
 
 
 
 
 
 
 
 
 
274
  else:
275
- final_score_two -= score_two_neutral
 
 
 
 
276
 
 
277
 
278
- # 将两个模型的得分组合(加权平均)
279
- final_score = np.average([final_score_one, final_score_two], weights=[0.3, 0.7])
 
280
 
281
- # 确保最终得分在 [-1, 1] 范围内
282
- final_score = np.clip(final_score, -1.0, 1.0)
 
283
 
284
- return final_score
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
 
286
  except Exception as e:
287
  print(f"Error in get_sentiment_score for text: {text[:50]}... Error: {str(e)}")
288
  traceback.print_exc()
289
  return 0.0
290
-
291
-
292
 
293
 
294
  def get_stock_info(stock_code: str, history_days=30):
 
223
 
224
 
225
  # 函数:获取情感得分
226
def _split_into_sentences(text):
    """Split *text* into sentences, preferring NLTK's punkt tokenizer.

    The punkt resources are looked up and, when missing, downloaded on
    demand.  If NLTK is not installed or its data cannot be obtained (for
    example on a machine without network access), a simple punctuation-based
    splitter is used instead so callers still get a best-effort result.
    """
    import re

    try:
        import nltk

        for resource in ("punkt", "punkt_tab"):
            try:
                nltk.data.find(f"tokenizers/{resource}")
            except LookupError:
                nltk.download(resource)
        return nltk.sent_tokenize(text)
    except (ImportError, LookupError):
        # Fallback: break after Latin sentence punctuation followed by
        # whitespace, or directly after CJK full-width sentence punctuation.
        pieces = re.split(r"(?<=[.!?])\s+|(?<=[。!?])\s*", text.strip())
        return [piece for piece in pieces if piece.strip()]


def _split_oversized_sentence(sentence, count_tokens, budget):
    """Greedily cut one over-long sentence into whitespace-delimited chunks.

    Each chunk holds as many consecutive words as fit in *budget* tokens.
    A single word that alone exceeds the budget (or text without any
    whitespace) is returned as its own chunk and left for the model-side
    truncation to handle.
    """
    chunks = []
    current = ""
    for word in sentence.split():
        candidate = f"{current} {word}" if current else word
        if current and count_tokens(candidate) > budget:
            chunks.append(current)
            current = word
        else:
            current = candidate
    if current:
        chunks.append(current)
    return chunks or [sentence]


def process_long_text(text, tokenizer, max_length=512):
    """Split long text into segments while keeping sentences intact.

    Sentences are packed greedily: a sentence is appended to the current
    segment as long as the segment's token count stays within the budget,
    otherwise the current segment is closed and a new one is started.

    Args:
        text: The raw text to segment.
        tokenizer: Object exposing ``tokenize(str) -> list``.  If it also
            exposes ``num_special_tokens_to_add()`` (as Hugging Face
            tokenizers do), that many positions are reserved so the encoded
            segment, including special tokens, still fits in ``max_length``.
        max_length: Maximum number of tokens per encoded segment.

    Returns:
        A list of stripped segment strings; empty when *text* contains no
        sentences.  A sentence that is longer than the budget on its own is
        split on whitespace instead of being emitted whole.
    """
    # tokenize() does not count special tokens, so reserve room for them;
    # otherwise a "full" segment is silently truncated when encoded later.
    reserve_fn = getattr(tokenizer, "num_special_tokens_to_add", None)
    reserved = reserve_fn() if callable(reserve_fn) else 0
    budget = max(1, max_length - reserved)

    def count_tokens(fragment):
        return len(tokenizer.tokenize(fragment))

    segments = []
    current_segment = ""

    for sentence in _split_into_sentences(text):
        # Check whether adding this sentence would exceed the budget.
        candidate = f"{current_segment} {sentence}" if current_segment else sentence
        if count_tokens(candidate) <= budget:
            current_segment = candidate
            continue

        if current_segment:
            segments.append(current_segment.strip())

        if count_tokens(sentence) <= budget:
            current_segment = sentence
        else:
            # The sentence alone is too long: emit all full chunks and keep
            # the last one open so following sentences can still join it.
            *full_chunks, current_segment = _split_oversized_sentence(
                sentence, count_tokens, budget
            )
            segments.extend(chunk.strip() for chunk in full_chunks)

    # Flush the trailing segment.
    if current_segment:
        segments.append(current_segment.strip())

    return segments
262
 
263
+ def get_sentiment_score(text):
264
+ try:
265
+ import torch
266
 
267
+ # 将长文本分段
268
+ segments_one = process_long_text(text, tokenizer_one)
269
+ segments_two = process_long_text(text, tokenizer_two)
270
 
271
+ final_scores_one = []
272
+ final_scores_two = []
273
+ weights_one = []
274
+ weights_two = []
275
+
276
+ # 处理每个段落 - 模型一
277
+ for segment in segments_one:
278
+ with torch.no_grad():
279
+ inputs = tokenizer_one(segment, return_tensors="pt", truncation=True, max_length=512)
280
+ outputs = sa_model_one(**inputs)
281
+ predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
282
+
283
+ scores = predictions[0].tolist()
284
+ score_positive = scores[0]
285
+ score_negative = scores[1]
286
+ score_neutral = scores[2]
287
+
288
+ segment_score = 0.0
289
+ segment_score += score_positive
290
+ segment_score -= score_negative
291
+ if score_positive > score_negative:
292
+ segment_score += score_neutral
293
+ else:
294
+ segment_score -= score_neutral
295
+
296
+ final_scores_one.append(np.clip(segment_score, -1.0, 1.0))
297
+ weights_one.append(len(tokenizer_one.tokenize(segment)))
298
+
299
+ # 处理每个段落 - 模型二
300
+ for segment in segments_two:
301
+ with torch.no_grad():
302
+ inputs = tokenizer_two(segment, return_tensors="pt", truncation=True, max_length=512)
303
+ outputs = sa_model_two(**inputs)
304
+ predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
305
+
306
+ scores = predictions[0].tolist()
307
+ score_neutral = scores[0]
308
+ score_positive = scores[1]
309
+ score_negative = scores[2]
310
+
311
+ segment_score = 0.0
312
+ segment_score += score_positive
313
+ segment_score -= score_negative
314
+ if score_positive > score_negative:
315
+ segment_score += score_neutral
316
+ else:
317
+ segment_score -= score_neutral
318
+
319
+ final_scores_two.append(np.clip(segment_score, -1.0, 1.0))
320
+ weights_two.append(len(tokenizer_two.tokenize(segment)))
321
+
322
+ # 加权平均
323
+ if final_scores_one:
324
+ final_score_one = np.average(final_scores_one, weights=weights_one)
325
+ else:
326
+ final_score_one = 0.0
327
+
328
+ if final_scores_two:
329
+ final_score_two = np.average(final_scores_two, weights=weights_two)
330
+ else:
331
+ final_score_two = 0.0
332
+
333
+ # 组合两个模型的结果
334
+ final_score = np.average([final_score_one, final_score_two], weights=[0.3, 0.7])
335
+ return np.clip(final_score, -1.0, 1.0)
336
 
337
  except Exception as e:
338
  print(f"Error in get_sentiment_score for text: {text[:50]}... Error: {str(e)}")
339
  traceback.print_exc()
340
  return 0.0
 
 
341
 
342
 
343
  def get_stock_info(stock_code: str, history_days=30):
requirements.txt CHANGED
@@ -16,4 +16,5 @@ yfinance==0.2.47
16
  jsonpath==0.82.2
17
  tensorflow==2.16.2
18
  pydantic==2.9.2
19
- pydantic_core==2.23.4
 
 
16
  jsonpath==0.82.2
17
  tensorflow==2.16.2
18
  pydantic==2.9.2
19
+ pydantic_core==2.23.4
20
+ nltk