Spaces:

sundea
/

text-classification

Runtime error

App Files Files Community

sundea committed on Jun 12, 2023

Commit

e43d2e0

1 Parent(s): 06477d9

Upload app.py

Browse files

Files changed (1) hide show

app.py +187 -0

app.py ADDED Viewed

	@@ -0,0 +1,187 @@

+import argparse
+import os
+from importlib import import_module
+import gradio as gr
+from tqdm import tqdm
+import models.TextCNN
+import torch
+import pickle as pkl
+from utils import build_dataset
+classes = ['finance', 'realty', 'stocks', 'education', 'science', 'society', 'politics', 'sports', 'game',
+           'entertainment']
+MAX_VOCAB_SIZE = 10000  # upper bound on the vocabulary size
+UNK, PAD = '<UNK>', '<PAD>'  # unknown-token symbol, padding symbol
+def build_vocab(file_path, tokenizer, max_size, min_freq):
+    vocab_dic = {}
+    with open(file_path, 'r', encoding='UTF-8') as f:
+        for line in tqdm(f):
+            lin = line.strip()
+            if not lin:
+                continue
+            content = lin.split('\t')[0]
+            for word in tokenizer(content):
+                vocab_dic[word] = vocab_dic.get(word, 0) + 1
+        vocab_list = sorted([_ for _ in vocab_dic.items() if _[1] >= min_freq], key=lambda x: x[1], reverse=True)[
+                     :max_size]
+        vocab_dic = {word_count[0]: idx for idx, word_count in enumerate(vocab_list)}
+        vocab_dic.update({UNK: len(vocab_dic), PAD: len(vocab_dic) + 1})
+    return vocab_dic
+# parser = argparse.ArgumentParser(description='Chinese Text Classification')
+# parser.add_argument('--word', default=False, type=bool, help='True for word, False for char')
+# args = parser.parse_args()
+# model_name = 'TextCNN'
+# dataset = 'THUCNews'  # 数据集
+# embedding = 'embedding_SougouNews.npz'
+# x = import_module('models.' + model_name)
+#
+# config = x.Config(dataset, embedding)
+# device = 'cuda:0'
+# model = models.TextCNN.Model(config)
+#
+# # vocab, train_data, dev_data, test_data = build_dataset(config, args.word)
+# model.load_state_dict(torch.load('THUCNews/saved_dict/TextCNN.ckpt'))
+# model.to(device)
+# model.eval()
+#
+# tokenizer = lambda x: [y for y in x]  # char-level
+# if os.path.exists(config.vocab_path):
+#     vocab = pkl.load(open(config.vocab_path, 'rb'))
+# else:
+#     vocab = build_vocab(config.train_path, tokenizer=tokenizer, max_size=MAX_VOCAB_SIZE, min_freq=1)
+#     pkl.dump(vocab, open(config.vocab_path, 'wb'))
+# print(f"Vocab size: {len(vocab)}")
+#
+# # content='时评：“国学小天才”录取缘何少佳话'
+# content = input('输入语句:')
+#
+# words_line = []
+# token = tokenizer(content)
+# seq_len = len(token)
+# pad_size = 32
+# contents = []
+#
+# if pad_size:
+#     if len(token) < pad_size:
+#         token.extend([PAD] * (pad_size - len(token)))
+#     else:
+#         token = token[:pad_size]
+#         seq_len = pad_size
+# # word to id
+# for word in token:
+#     words_line.append(vocab.get(word, vocab.get(UNK)))
+#
+# contents.append((words_line, seq_len))
+# print(words_line)
+# # input = torch.LongTensor(words_line).unsqueeze(1).to(device)  # convert words_line to LongTensor and add batch dimension
+# x = torch.LongTensor([_[0] for _ in contents]).to(device)
+#
+# # pad前的长度(超过pad_size的设为pad_size)
+# seq_len = torch.LongTensor([_[1] for _ in contents]).to(device)
+# input = (x, seq_len)
+# # print(input)
+# with torch.no_grad():
+#     output = model(input)
+#     predic = torch.max(output.data, 1)[1].cpu().numpy()
+# print(predic)
+# print('类别为：{}'.format(classes[predic[0]]))
+def greet(text):
+    parser = argparse.ArgumentParser(description='Chinese Text Classification')
+    parser.add_argument('--word', default=False, type=bool, help='True for word, False for char')
+    args = parser.parse_args()
+    model_name = 'TextCNN'
+    dataset = 'THUCNews'  # 数据集
+    embedding = 'embedding_SougouNews.npz'
+    x = import_module('models.' + model_name)
+    config = x.Config(dataset, embedding)
+    device = 'cuda:0'
+    model = models.TextCNN.Model(config)
+    # vocab, train_data, dev_data, test_data = build_dataset(config, args.word)
+    model.load_state_dict(torch.load('THUCNews/saved_dict/TextCNN.ckpt'))
+    model.to(device)
+    model.eval()
+    tokenizer = lambda x: [y for y in x]  # char-level
+    if os.path.exists(config.vocab_path):
+        vocab = pkl.load(open(config.vocab_path, 'rb'))
+    else:
+        vocab = build_vocab(config.train_path, tokenizer=tokenizer, max_size=MAX_VOCAB_SIZE, min_freq=1)
+        pkl.dump(vocab, open(config.vocab_path, 'wb'))
+    # print(f"Vocab size: {len(vocab)}")
+    # content='时评：“国学小天才”录取缘何少佳话'
+    content = text
+    words_line = []
+    token = tokenizer(content)
+    seq_len = len(token)
+    pad_size = 32
+    contents = []
+    if pad_size:
+        if len(token) < pad_size:
+            token.extend([PAD] * (pad_size - len(token)))
+        else:
+            token = token[:pad_size]
+            seq_len = pad_size
+    # word to id
+    for word in token:
+        words_line.append(vocab.get(word, vocab.get(UNK)))
+    contents.append((words_line, seq_len))
+    # print(words_line)
+    # input = torch.LongTensor(words_line).unsqueeze(1).to(device)  # convert words_line to LongTensor and add batch dimension
+    x = torch.LongTensor([_[0] for _ in contents]).to(device)
+    # pad前的长度(超过pad_size的设为pad_size)
+    seq_len = torch.LongTensor([_[1] for _ in contents]).to(device)
+    input = (x, seq_len)
+    # print(input)
+    with torch.no_grad():
+        output = model(input)
+        predic = torch.max(output.data, 1)[1].cpu().numpy()
+    # print(predic)
+    # print('类别为：{}'.format(classes[predic[0]]))
+    return classes[predic[0]]
+#
+demo = gr.Interface(fn=greet, inputs="text", outputs="text")
+demo.launch(server_port=9090)
+# with torch.no_grad():
+#     output=model(input)
+# print(output)
+#
+# start_time = time.time()
+# test_iter = build_iterator(test_data, config)
+# with torch.no_grad():
+#     predict_all = np.array([], dtype=int)
+#     labels_all = np.array([], dtype=int)
+#     for texts, labels in test_iter:
+#         # texts=texts.to(device)
+#         print(texts)
+#         outputs = model(texts)
+#         loss = F.cross_entropy(outputs, labels)
+#         labels = labels.data.cpu().numpy()
+#         predic = torch.max(outputs.data, 1)[1].cpu().numpy()
+#         labels_all = np.append(labels_all, labels)
+#         predict_all = np.append(predict_all, predic)
+#         break
+#     print(labels_all)
+#     print(predict_all)
+#
+#