MVPilgrim committed on
Commit
5981d4c
·
1 Parent(s): 17fe29d
Files changed (1) hide show
  1. semsearch.py +15 -123
semsearch.py CHANGED
@@ -19,116 +19,6 @@ logger = logging.getLogger(__name__)
19
  logging.basicConfig(level=logging.INFO)
20
 
21
 
22
- #################################################################
23
- # Create the chunks collection for the Weaviate database.
24
- def createChunksCollection():
25
- logger.info("#### createChunksCollection() entered.")
26
- if client.collections.exists("Chunks"):
27
- client.collections.delete("Chunks")
28
-
29
- class_obj = {
30
- "class": "Chunks",
31
- "description": "Collection for document chunks.",
32
- "vectorizer": "text2vec-transformers",
33
- "moduleConfig": {
34
- "text2vec-transformers": {
35
- "vectorizeClassName": True
36
- }
37
- },
38
- "vectorIndexType": "hnsw",
39
- "vectorIndexConfig": {
40
- "distance": "cosine",
41
- },
42
- "properties": [
43
- {
44
- "name": "chunk",
45
- "dataType": ["text"],
46
- "description": "Single webpage chunk.",
47
- "vectorizer": "text2vec-transformers",
48
- "moduleConfig": {
49
- "text2vec-transformers": {
50
- "vectorizePropertyName": False,
51
- "skip": False,
52
- "tokenization": "lowercase"
53
- }
54
- }
55
- },
56
- {
57
- "name": "chunk_index",
58
- "dataType": ["int"]
59
- },
60
- {
61
- "name": "webpage",
62
- "dataType": ["Documents"],
63
- "description": "Webpage content chunks.",
64
-
65
- "invertedIndexConfig": {
66
- "bm25": {
67
- "b": 0.75,
68
- "k1": 1.2
69
- }
70
- }
71
- }
72
- ]
73
- }
74
- return(client.collections.create_from_dict(class_obj))
75
-
76
-
77
- #####################################################################
78
- # Create the document collection for the Weaviate database.
79
- def createWebpageCollection():
80
- logger.info("#### createWebpageCollection() entered.")
81
- if client.collections.exists("Documents"):
82
- client.collections.delete("Documents")
83
-
84
- class_obj = {
85
- "class": "Documents",
86
- "description": "For first attempt at loading a Weviate database.",
87
- "vectorizer": "text2vec-transformers",
88
- "moduleConfig": {
89
- "text2vec-transformers": {
90
- "vectorizeClassName": False
91
- }
92
- },
93
- "vectorIndexType": "hnsw",
94
- "vectorIndexConfig": {
95
- "distance": "cosine",
96
- },
97
- "properties": [
98
- {
99
- "name": "title",
100
- "dataType": ["text"],
101
- "description": "HTML doc title.",
102
- "vectorizer": "text2vec-transformers",
103
- "moduleConfig": {
104
- "text2vec-transformers": {
105
- "vectorizePropertyName": True,
106
- "skip": False,
107
- "tokenization": "lowercase"
108
- }
109
- },
110
- "invertedIndexConfig": {
111
- "bm25": {
112
- "b": 0.75,
113
- "k1": 1.2
114
- },
115
- }
116
- },
117
- {
118
- "name": "content",
119
- "dataType": ["text"],
120
- "description": "HTML page content.",
121
- "moduleConfig": {
122
- "text2vec-transformers": {
123
- "vectorizePropertyName": True,
124
- "tokenization": "whitespace"
125
- }
126
- }
127
- }
128
- ]
129
- }
130
- return(client.collections.create_from_dict(class_obj))
131
-
132
 
133
  ######################################################################
134
  # MAINLINE
@@ -145,19 +35,6 @@ webpageTitles = []
145
  webpageChunksDocNames = []
146
 
147
 
148
- ######################################################
149
- # Connect to the Weaviate vector database.
150
- logger.info("#### Create Weaviate db client connection.")
151
- client = weaviate.connect_to_custom(
152
- http_host="127.0.0.1",
153
- http_port=8080,
154
- http_secure=False,
155
- grpc_host="127.0.0.1",
156
- grpc_port=50051,
157
- grpc_secure=False
158
- )
159
- client.connect()
160
-
161
  #######################################################
162
  # Read each text input file, parse it into a document,
163
  # chunk it, collect chunks and document name.
@@ -196,6 +73,21 @@ for filename in os.listdir(pathString):
196
 
197
  logger.debug(f"### webpageDocNames: {webpageDocNames}")
198
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  ######################################################
200
  # Create database webpage and chunks collections.
201
  #wpCollection = createWebpageCollection()
 
19
  logging.basicConfig(level=logging.INFO)
20
 
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  ######################################################################
24
  # MAINLINE
 
35
  webpageChunksDocNames = []
36
 
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  #######################################################
39
  # Read each text input file, parse it into a document,
40
  # chunk it, collect chunks and document name.
 
73
 
74
  logger.debug(f"### webpageDocNames: {webpageDocNames}")
75
 
76
+
77
+ ######################################################
78
+ # Connect to the Weaviate vector database.
79
+ logger.info("#### Create Weaviate db client connection.")
80
+ client = weaviate.connect_to_custom(
81
+ http_host="127.0.0.1",
82
+ http_port=8080,
83
+ http_secure=False,
84
+ grpc_host="127.0.0.1",
85
+ grpc_port=50051,
86
+ grpc_secure=False
87
+ )
88
+ client.connect()
89
+
90
+
91
  ######################################################
92
  # Create database webpage and chunks collections.
93
  #wpCollection = createWebpageCollection()