Spaces:
Runtime error
Runtime error
| from sentence_transformers import SentenceTransformer, util | |
| import json | |
| import time | |
| import pandas as pd | |
| import numpy as np | |
| import pickle | |
| import chromadb | |
| from chromadb.config import Settings | |
| from chromadb.utils import embedding_functions | |
| from chromadb.db.clickhouse import NoDatapointsException | |
# Category used when a unit or datatype is not covered by the mappings below.
_NO_CATEGORY = "NaN"

# Datatype (lower-cased) -> coarse datatype category. Lookups lower-case the
# incoming value first, so every key here has to be lower case as well.
# NOTE(review): "data_time" may have been meant as "date_time" - confirm
# against the data the metalabels were built from before renaming it.
_DATATYPE_MAPPING = {
    "boolean": "BOOLEAN",
    "string": "STRING",
    "string_translatable": "STRING",
    "translatable_string": "STRING",
    "non_translatable_string": "STRING",
    "date": "DATE",
    "data_time": "DATE",
    "uri": "URI",
    "int": "INT",
    "int_measure": "INT",
    "int_currency": "INT",
    "integer": "INT",
    "real": "REAL",
    "real_measure": "REAL",
    "real_currency": "REAL",
    "enum_code": "ENUM_CODE",
    "enum_int": "ENUM_CODE",
    "enum_real": "ENUM_CODE",
    "enum_rational": "ENUM_CODE",
    "enum_boolean": "ENUM_CODE",
    "enum_string": "ENUM_CODE",
    "enum_reference": "ENUM_CODE",
    "enum_instance": "ENUM_CODE",
    "set(b1,b2)": "SET",
    "constrained_set(b1,b2,cmn,cmx)": "SET",
    "set [0,?]": "SET",
    "set [1,?]": "SET",
    "set [1, ?]": "SET",
    "nan": "NaN",
    "media_type": "LARGE_OBJECT_TYPE",
}

# Unit (lower-cased) -> coarse unit category. Keys must be lower case for the
# same reason as above.
_UNIT_MAPPING = {
    "nan": "NaN",
    "hertz": "FREQUENCY",
    "hz": "FREQUENCY",
    "pa": "PRESSURE",
    "pascal": "PRESSURE",
    "n/m²": "PRESSURE",
    "bar": "PRESSURE",
    "%": "SCALARS_PERC",
    "w": "POWER",
    "watt": "POWER",
    "kw": "POWER",
    "kg/m³": "CHEMISTRY",
    "m²/s": "CHEMISTRY",
    "pa*s": "CHEMISTRY",
    "v": "ELECTRICAL",
    "volt": "ELECTRICAL",
    "db": "ACOUSTICS",
    "db(a)": "ACOUSTICS",
    "k": "TEMPERATURE",
    "°c": "TEMPERATURE",
    "n": "MECHANICS",
    "newton": "MECHANICS",
    "kg/s": "FLOW",
    "kg/h": "FLOW",
    "m³/s": "FLOW",
    "m³/h": "FLOW",
    "l/s": "FLOW",
    "l/h": "FLOW",
    "µm": "LENGTH",
    "mm": "LENGTH",
    "cm": "LENGTH",
    "dm": "LENGTH",
    "m": "LENGTH",
    "meter": "LENGTH",
    "m/s": "SPEED",
    "km/h": "SPEED",
    "s^(-1)": "FREQUENCY",
    "1/s": "FREQUENCY",
    "s": "TIME",
    "h": "TIME",
    "min": "TIME",
    "d": "TIME",
    "hours": "TIME",
    "a": "ELECTRICAL",
    "m³": "VOLUME",
    "m²": "AREA",
    "rpm": "FLOW",
    "nm": "MECHANICS",
    "m/m": "MECHANICS",
    "m³/m²s": "MECHANICS",
    "w(m²*k)": "HEAT_TRANSFER",
    "kwh": "ELECTRICAL",
    "kg/(s*m²)": "FLOW",
    "kg": "MASS",
    "w/(m*k)": "HEAT_TRANSFER",
    "m²*k/w": "HEAT_TRANSFER",
    "j/s": "POWER",
}

_METHOD_SAME_ID = "Semantic equivalent , same semantic Id"
_METHOD_WITH_METADATA = "Semantically not equivalent, NLP with Metadata"
_METHOD_WITHOUT_METADATA = "Semantically not equivalent, NLP without Metadata"
_NLP_ALGORITHM = (
    "Semantic search, k-nearest-neighbor with squared L2 distance "
    "(euclidean distance), with model gart-labor/eng-distilBERT-se-eclass"
)


def _query_or_none(collection, **query_kwargs):
    """Run ``collection.query`` and return None instead of raising."""
    try:
        return collection.query(**query_kwargs)
    except Exception:
        # Deliberate best effort, kept from the original code: a filtered
        # lookup that fails is treated as "nothing found". Presumably this
        # covers chromadb's NoDatapointsException (imported by this file) -
        # TODO confirm and narrow the exception type.
        return None


def _hit_count(result):
    """Return the number of matches a query result holds for the single query."""
    if result is None:
        return 0
    documents = result["documents"]
    return len(documents[0]) if documents else 0


def _build_match(method, algorithm, distance, meta, matched_object):
    """Assemble one entry of the list returned by ``query_aas``."""
    return {
        "matching_method": method,
        "matching_algorithm": algorithm,
        "matching_distance": distance,
        "aas_id": meta["AASId"],
        "aas_id_short": meta["AASIdShort"],
        "submodel_id_short": meta["SubmodelName"],
        "submodel_id": meta["SubmodelId"],
        "matched_object": matched_object,
    }


def query_aas(query_json, collection, model, metalabel):
    """Find the entries of one collection that best match a queried property.

    Args:
        query_json: JSON string holding the keys ``Name``, ``Definition``,
            ``Unit``, ``Datatype``, ``SemanticId`` and ``ReturnMatches``.
        collection: Object exposing ``query(query_embeddings=..., n_results=...,
            where=...)`` and returning a dict with ``documents``,
            ``distances`` and ``metadatas`` (a Chroma collection in this file).
        model: Object exposing ``encode(text)``; the two resulting vectors are
            concatenated (definition first, then name) to form the query
            embedding. The reported algorithm string names the model
            gart-labor/eng-distilBERT-se-eclass.
        metalabel: Mapping of metalabel key -> ``(unit_category,
            datatype_category)`` tuple.

    Returns:
        A list of dicts with the keys ``matching_method``,
        ``matching_algorithm``, ``matching_distance``, ``aas_id``,
        ``aas_id_short``, ``submodel_id_short``, ``submodel_id`` and
        ``matched_object``. If an entry with the same semantic id exists, the
        list holds exactly that entry (distance 0, ``matched_object`` is the
        raw stored document). Otherwise it holds up to ``ReturnMatches``
        nearest neighbours, with ``matched_object`` parsed from JSON; the list
        is shorter (possibly empty) when the collection has fewer matches.

    Raises:
        KeyError: If a required key is missing from the query or from the
            metadata of a match.
        json.JSONDecodeError: If ``query_json`` or a matched document is not
            valid JSON.
    """
    query = json.loads(query_json)
    name = query["Name"]
    definition = query["Definition"]
    semantic_id = query["SemanticId"]
    return_matches = query["ReturnMatches"]

    # str() keeps non-string values (e.g. a float NaN or null) from crashing
    # the lookup; they simply end up in the "NaN" category.
    unit_categ = _UNIT_MAPPING.get(str(query["Unit"]).lower(), _NO_CATEGORY)
    datatype_categ = _DATATYPE_MAPPING.get(
        str(query["Datatype"]).lower(), _NO_CATEGORY
    )
    wanted = (unit_categ, datatype_categ)
    # First metalabel whose category pair equals the queried one; None when
    # the pair is unknown, in which case the metadata-filtered search is skipped.
    metadata = next(
        (key for key, value in metalabel.items() if value == wanted), None
    )

    name_embedding = model.encode(name)
    definition_embedding = model.encode(definition)
    query_vector = np.concatenate(
        (definition_embedding, name_embedding), axis=0
    ).tolist()
    queries = [query_vector]

    # The query is a semantic search (k-nearest-neighbour). Chroma uses
    # hnswlib for this (https://github.com/nmslib/hnswlib), which supports
    # cosine, squared L2 or inner product as distance. Chroma is set to L2, cf.
    # https://github.com/chroma-core/chroma/blob/4463d13f951a4d28ade1f7e777d07302ff09069b/chromadb/db/index/hnswlib.py

    # Homogeneous case: look for an entry with the same semantic id. If one is
    # found, the match is considered exact and no NLP comparison is needed.
    homogen = _query_or_none(
        collection,
        query_embeddings=queries,
        n_results=1,
        where={"SESemanticId": semantic_id},
    )
    if _hit_count(homogen):
        # NOTE: unlike the NLP branch below, the document is returned as the
        # raw stored string here; kept as-is for backward compatibility.
        return [
            _build_match(
                _METHOD_SAME_ID,
                "None",
                0,
                homogen["metadatas"][0][0],
                homogen["documents"][0][0],
            )
        ]

    # No matching semantic id: continue with NLP, once restricted to entries
    # with the same metalabel and once without any restriction.
    with_metadata = None
    if metadata is not None:
        with_metadata = _query_or_none(
            collection,
            query_embeddings=queries,
            n_results=return_matches,
            where={"Metalabel": metadata},
        )
    without_metadata = collection.query(
        query_embeddings=queries,
        n_results=return_matches,
    )

    # Compare the best distance of both searches; ties go to the unfiltered
    # search. NOTE(review): with an exact nearest-neighbour search the
    # filtered result can never be strictly closer than the unfiltered one, so
    # the filtered branch is presumably only reached through index
    # approximation - confirm that this is the intended rule.
    use_filtered = _hit_count(with_metadata) > 0 and (
        _hit_count(without_metadata) == 0
        or with_metadata["distances"][0][0] < without_metadata["distances"][0][0]
    )
    if use_filtered:
        result = with_metadata
        method = _METHOD_WITH_METADATA
    else:
        result = without_metadata
        method = _METHOD_WITHOUT_METADATA

    if not _hit_count(result):
        return []

    # Prepare the final result. Iterating over what was actually returned
    # (capped at return_matches) avoids indexing past the end when the
    # collection holds fewer entries than requested.
    documents = result["documents"][0][:return_matches]
    distances = result["distances"][0]
    metadatas = result["metadatas"][0]
    return [
        _build_match(method, _NLP_ALGORITHM, distance, meta, json.loads(document))
        for document, distance, meta in zip(documents, distances, metadatas)
    ]
def ask_database(query, metalabel, model, collections, client_chroma):
    """Run one query against every given collection and gather the results.

    Args:
        query: JSON-serialisable query object; it is serialised once and the
            resulting string is handed to ``query_aas`` for each collection.
        metalabel: Passed through unchanged to ``query_aas``.
        model: Passed through unchanged to ``query_aas``.
        collections: Iterable of objects with a ``name`` attribute.
        client_chroma: Client whose ``get_collection(name)`` yields the
            collection that is actually queried.

    Returns:
        A list with one ``query_aas`` result per collection, in iteration
        order.
    """
    serialized_query = json.dumps(query, indent=4)
    gathered = []
    # All AAS are queried one after another.
    for listed in collections:
        print(listed.name)
        handle = client_chroma.get_collection(listed.name)
        gathered.append(query_aas(serialized_query, handle, model, metalabel))
    return gathered