rubensmau committed
Commit 5c46efb
1 Parent(s): 1f363c0
Files changed (1)
  1. chat_dov.py +110 -0
chat_dov.py ADDED
@@ -0,0 +1,110 @@
+import argparse
+from dataclasses import asdict
+import json
+import os
+import streamlit as st
+from datasets import load_dataset
+
+from data_driven_characters.character import get_character_definition
+from data_driven_characters.corpus import (
+    get_corpus_summaries,
+    load_docs,
+)
+
+from data_driven_characters.chatbots import (
+    SummaryChatBot,
+    RetrievalChatBot,
+    SummaryRetrievalChatBot,
+)
+from data_driven_characters.interfaces import CommandLine, Streamlit
+
+
+OUTPUT_ROOT = "output"
+
+
+
+
+
+
+def create_chatbot(corpus, character_name, chatbot_type, retrieval_docs, summary_type):
+    # output directories for cached summaries and character definitions
+    corpus_name = os.path.splitext(os.path.basename(corpus))[0]
+    output_dir = f"{OUTPUT_ROOT}/{corpus_name}/summarytype_{summary_type}"
+    #### the corpus is fixed to Dov Tzamir's text, loaded in main()
+    ####
+    os.makedirs(output_dir, exist_ok=True)
+    summaries_dir = f"{output_dir}/summaries"
+    character_definitions_dir = f"{output_dir}/character_definitions"
+    os.makedirs(character_definitions_dir, exist_ok=True)
+
+    # load docs
+    docs = load_docs(corpus_path=corpus, chunk_size=2048, chunk_overlap=64)
+
+    # generate summaries
+    corpus_summaries = get_corpus_summaries(
+        docs=docs, summary_type=summary_type, cache_dir=summaries_dir
+    )
+
+    # get character definition
+    character_definition = get_character_definition(
+        name=character_name,
+        corpus_summaries=corpus_summaries,
+        cache_dir=character_definitions_dir,
+    )
+    print(json.dumps(asdict(character_definition), indent=4))
+
+    # construct retrieval documents
+    if retrieval_docs == "raw":
+        documents = [
+            doc.page_content
+            for doc in load_docs(corpus_path=corpus, chunk_size=256, chunk_overlap=16)
+        ]
+    elif retrieval_docs == "summarized":
+        documents = corpus_summaries
+    else:
+        raise ValueError(f"Unknown retrieval docs type: {retrieval_docs}")
+
+    # initialize chatbot
+    if chatbot_type == "summary":
+        chatbot = SummaryChatBot(character_definition=character_definition)
+    elif chatbot_type == "retrieval":
+        chatbot = RetrievalChatBot(
+            character_definition=character_definition,
+            documents=documents,
+        )
+    elif chatbot_type == "summary_retrieval":
+        chatbot = SummaryRetrievalChatBot(
+            character_definition=character_definition,
+            documents=documents,
+        )
+    else:
+        raise ValueError(f"Unknown chatbot type: {chatbot_type}")
+    return chatbot
+
+
+## python -m streamlit run chat_dov.py -- --corpus data/tzamir.txt --character_name Dov --chatbot_type retrieval --retrieval_docs raw --interface streamlit
+
+def main():
+    # fixed parameters for Dov Tzamir; the files are already processed, except the index, which is built in memory
+    chatbot = st.cache_resource(create_chatbot)(
+        "data/tzamir.txt",  # args.corpus
+        "Dov",  # args.character_name
+        "retrieval",  # args.chatbot_type
+        "raw",  # args.retrieval_docs
+        "map_reduce",  # args.summary_type
+    )
+    st.title("Data Driven Characters")
+    st.write("Create your own character chatbots, grounded in existing corpora.")
+    st.divider()
+    """
+    st.markdown(f"**chatbot type**: *{args.chatbot_type}*")
+    if "retrieval" in args.chatbot_type:
+        st.markdown(f"**retrieving from**: *{args.retrieval_docs} corpus*")
+    """
+    app = Streamlit(chatbot=chatbot)
+
+    app.run()
+
+
+if __name__ == "__main__":
+    main()
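
Usage note: `st.cache_resource(create_chatbot)` wraps the builder so the chatbot (summaries, character definition, and retrieval index) is constructed once and reused across Streamlit reruns. Since `argparse` is imported but never invoked and all parameters are hardcoded in `main()`, the flags after `--` in the comment above have no effect; assuming Streamlit and the project's dependencies are installed, the app can be launched with just:

python -m streamlit run chat_dov.py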