Spaces:
Running
Running
MVPilgrim
committed on
Commit
·
5981d4c
1
Parent(s):
17fe29d
fdsa
Browse files: semsearch.py +15 -123
semsearch.py
CHANGED
@@ -19,116 +19,6 @@ logger = logging.getLogger(__name__)
|
|
19 |
logging.basicConfig(level=logging.INFO)
|
20 |
|
21 |
|
22 |
-
#################################################################
|
23 |
-
# Create the chunks collection for the Weaviate database.
|
24 |
-
def createChunksCollection():
|
25 |
-
logger.info("#### createChunksCollection() entered.")
|
26 |
-
if client.collections.exists("Chunks"):
|
27 |
-
client.collections.delete("Chunks")
|
28 |
-
|
29 |
-
class_obj = {
|
30 |
-
"class": "Chunks",
|
31 |
-
"description": "Collection for document chunks.",
|
32 |
-
"vectorizer": "text2vec-transformers",
|
33 |
-
"moduleConfig": {
|
34 |
-
"text2vec-transformers": {
|
35 |
-
"vectorizeClassName": True
|
36 |
-
}
|
37 |
-
},
|
38 |
-
"vectorIndexType": "hnsw",
|
39 |
-
"vectorIndexConfig": {
|
40 |
-
"distance": "cosine",
|
41 |
-
},
|
42 |
-
"properties": [
|
43 |
-
{
|
44 |
-
"name": "chunk",
|
45 |
-
"dataType": ["text"],
|
46 |
-
"description": "Single webpage chunk.",
|
47 |
-
"vectorizer": "text2vec-transformers",
|
48 |
-
"moduleConfig": {
|
49 |
-
"text2vec-transformers": {
|
50 |
-
"vectorizePropertyName": False,
|
51 |
-
"skip": False,
|
52 |
-
"tokenization": "lowercase"
|
53 |
-
}
|
54 |
-
}
|
55 |
-
},
|
56 |
-
{
|
57 |
-
"name": "chunk_index",
|
58 |
-
"dataType": ["int"]
|
59 |
-
},
|
60 |
-
{
|
61 |
-
"name": "webpage",
|
62 |
-
"dataType": ["Documents"],
|
63 |
-
"description": "Webpage content chunks.",
|
64 |
-
|
65 |
-
"invertedIndexConfig": {
|
66 |
-
"bm25": {
|
67 |
-
"b": 0.75,
|
68 |
-
"k1": 1.2
|
69 |
-
}
|
70 |
-
}
|
71 |
-
}
|
72 |
-
]
|
73 |
-
}
|
74 |
-
return(client.collections.create_from_dict(class_obj))
|
75 |
-
|
76 |
-
|
77 |
-
#####################################################################
|
78 |
-
# Create the document collection for the Weaviate database.
|
79 |
-
def createWebpageCollection():
|
80 |
-
logger.info("#### createWebpageCollection() entered.")
|
81 |
-
if client.collections.exists("Documents"):
|
82 |
-
client.collections.delete("Documents")
|
83 |
-
|
84 |
-
class_obj = {
|
85 |
-
"class": "Documents",
|
86 |
-
"description": "For first attempt at loading a Weviate database.",
|
87 |
-
"vectorizer": "text2vec-transformers",
|
88 |
-
"moduleConfig": {
|
89 |
-
"text2vec-transformers": {
|
90 |
-
"vectorizeClassName": False
|
91 |
-
}
|
92 |
-
},
|
93 |
-
"vectorIndexType": "hnsw",
|
94 |
-
"vectorIndexConfig": {
|
95 |
-
"distance": "cosine",
|
96 |
-
},
|
97 |
-
"properties": [
|
98 |
-
{
|
99 |
-
"name": "title",
|
100 |
-
"dataType": ["text"],
|
101 |
-
"description": "HTML doc title.",
|
102 |
-
"vectorizer": "text2vec-transformers",
|
103 |
-
"moduleConfig": {
|
104 |
-
"text2vec-transformers": {
|
105 |
-
"vectorizePropertyName": True,
|
106 |
-
"skip": False,
|
107 |
-
"tokenization": "lowercase"
|
108 |
-
}
|
109 |
-
},
|
110 |
-
"invertedIndexConfig": {
|
111 |
-
"bm25": {
|
112 |
-
"b": 0.75,
|
113 |
-
"k1": 1.2
|
114 |
-
},
|
115 |
-
}
|
116 |
-
},
|
117 |
-
{
|
118 |
-
"name": "content",
|
119 |
-
"dataType": ["text"],
|
120 |
-
"description": "HTML page content.",
|
121 |
-
"moduleConfig": {
|
122 |
-
"text2vec-transformers": {
|
123 |
-
"vectorizePropertyName": True,
|
124 |
-
"tokenization": "whitespace"
|
125 |
-
}
|
126 |
-
}
|
127 |
-
}
|
128 |
-
]
|
129 |
-
}
|
130 |
-
return(client.collections.create_from_dict(class_obj))
|
131 |
-
|
132 |
|
133 |
######################################################################
|
134 |
# MAINLINE
|
@@ -145,19 +35,6 @@ webpageTitles = []
|
|
145 |
webpageChunksDocNames = []
|
146 |
|
147 |
|
148 |
-
######################################################
|
149 |
-
# Connect to the Weaviate vector database.
|
150 |
-
logger.info("#### Create Weaviate db client connection.")
|
151 |
-
client = weaviate.connect_to_custom(
|
152 |
-
http_host="127.0.0.1",
|
153 |
-
http_port=8080,
|
154 |
-
http_secure=False,
|
155 |
-
grpc_host="127.0.0.1",
|
156 |
-
grpc_port=50051,
|
157 |
-
grpc_secure=False
|
158 |
-
)
|
159 |
-
client.connect()
|
160 |
-
|
161 |
#######################################################
|
162 |
# Read each text input file, parse it into a document,
|
163 |
# chunk it, collect chunks and document name.
|
@@ -196,6 +73,21 @@ for filename in os.listdir(pathString):
|
|
196 |
|
197 |
logger.debug(f"### webpageDocNames: {webpageDocNames}")
|
198 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
199 |
######################################################
|
200 |
# Create database webpage and chunks collections.
|
201 |
#wpCollection = createWebpageCollection()
|
|
|
19 |
logging.basicConfig(level=logging.INFO)
|
20 |
|
21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
######################################################################
|
24 |
# MAINLINE
|
|
|
35 |
webpageChunksDocNames = []
|
36 |
|
37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
#######################################################
|
39 |
# Read each text input file, parse it into a document,
|
40 |
# chunk it, collect chunks and document name.
|
|
|
73 |
|
74 |
logger.debug(f"### webpageDocNames: {webpageDocNames}")
|
75 |
|
76 |
+
|
77 |
+
######################################################
|
78 |
+
# Connect to the Weaviate vector database.
|
79 |
+
logger.info("#### Create Weaviate db client connection.")
|
80 |
+
client = weaviate.connect_to_custom(
|
81 |
+
http_host="127.0.0.1",
|
82 |
+
http_port=8080,
|
83 |
+
http_secure=False,
|
84 |
+
grpc_host="127.0.0.1",
|
85 |
+
grpc_port=50051,
|
86 |
+
grpc_secure=False
|
87 |
+
)
|
88 |
+
client.connect()
|
89 |
+
|
90 |
+
|
91 |
######################################################
|
92 |
# Create database webpage and chunks collections.
|
93 |
#wpCollection = createWebpageCollection()
|