ALLOUNE committed
Commit · c97a50e
1 Parent(s): 367af23

add dataset
Browse files
- app.py +9 -7
- requirements.txt +2 -1
- src/core.py +8 -12
- src/services/processor.py +18 -50
- src/services/utils.py +23 -51
app.py CHANGED

@@ -27,12 +27,12 @@ class InputConstraints(BaseModel):
 # This schema defines the structure for a single technology object
 class Technology(BaseModel):
     """Represents a single technology entry with its details."""
-
+    name: str
     purpose: str
-
+    problem_types_solved: str
     advantages: str
     limitations: str
-
+    domain_tags: str
 
 class OutputPriorArt(BaseModel):
     """Represents the search of prior art using the technology combinations"""

@@ -55,12 +55,12 @@ class TechnologyData(BaseModel):
 
 @app.post("/process", response_model=TechnologyData)
 async def process(data: InputProblem):
-    result= process_input(data,
+    result= process_input(data, dataset, "problem")
     return {"technologies": result}
 
 @app.post("/process-constraints", response_model=TechnologyData)
 async def process_constraints(constraints: InputConstraints):
-    result= process_input(constraints.constraints,
+    result= process_input(constraints.constraints, dataset, "constraints")
     return {"technologies": result}
 
 @app.post("/prior-art-constraints", response_model=OutputPriorArt)

@@ -70,7 +70,7 @@ async def prior_art_constraints(data: InputPriorArtConstraints):
 
 @app.post("/prior-art-problems", response_model=OutputPriorArt)
 async def prior_art_problems(data: InputPriorArtProblem):
-    prior_art = process_prior_art(data.technologies, data.
+    prior_art = process_prior_art(data.technologies, data.problem, "problem", "pydantic")
     return prior_art
 
 def make_json_serializable(data):

@@ -268,7 +268,6 @@ def process_input_gradio(problem_description: str):
     # Step 3: Stem Constraints
     constraints_stemmed = stem(constraints, "constraints")
     save_dataframe(pd.DataFrame({"stemmed_constraints": constraints_stemmed}), "constraints_stemmed.xlsx")
-    print(constraints_stemmed)
 
     # Step 4: Global Tech (already loaded, just acknowledge)
     # save_dataframe(global_tech_df, "global_tech.xlsx") # This is already done implicitly by loading

@@ -282,6 +281,9 @@ def process_input_gradio(problem_description: str):
     # Step 6: Find Best List Combinations
     best_combinations = find_best_list_combinations(constraints_stemmed, global_tech, matrix)
 
+    print("best_combinations")
+    print(best_combinations)
+
     # Step 7: Select Technologies
     best_technologies_id = select_technologies(best_combinations)
 
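Net effect on the API surface: each endpoint now passes the shared dataset (loaded once in src/core.py) instead of per-request embeddings, and Technology gains name, problem_types_solved, and domain_tags. A minimal sketch of exercising /process — the payload text and the port are illustrative assumptions, not part of the commit:

import requests

# Hypothetical input; InputProblem exposes a "problem" field, per process_input().
payload = {"problem": "Reduce the power consumption of a battery-driven sensor node"}

# Assumes the Space runs locally on port 7860; adjust the URL for your deployment.
resp = requests.post("http://localhost:7860/process", json=payload)
resp.raise_for_status()

# TechnologyData wraps a list of Technology objects, which now carry the new fields.
for tech in resp.json()["technologies"]:
    print(tech["name"], "|", tech["domain_tags"])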
requirements.txt CHANGED

@@ -9,4 +9,5 @@ pydantic
 openpyxl
 gradio
 google.generativeai
-google.genai
+google.genai
+datasets
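The new datasets dependency backs the load_data() helper added in src/services/utils.py, which replaces the bundled pickle with a dataset pulled from the Hub. A quick sanity check of the dependency and the expected columns, assuming the heymenn/Technologies repo is accessible from your environment:

from datasets import load_dataset

dataset = load_dataset("heymenn/Technologies", split="train")
print(dataset)            # row count and column names
print(dataset[0].keys())  # expected to include "name", "purpose", "problem_types_solved",
                          # "advantages", "limitations", "domain_tags", and "embeddings"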
src/core.py CHANGED

@@ -1,10 +1,10 @@
 from src.services.utils import *
 from src.services.processor import *
 
-
+dataset = load_data()
 
 
-def process_input(data, global_tech, global_tech_embeddings, data_type):
+def process_input(data, dataset, data_type):
     if data_type == "problem":
         prompt = set_prompt(data.problem)
         constraints = retrieve_constraints(prompt)

@@ -14,19 +14,13 @@ def process_input(data, global_tech, global_tech_embeddings, data_type):
 
     constraints_stemmed = stem(constraints, "constraints")
 
-
-
-    save_dataframe(global_tech, "global_tech.xlsx")
-
-    result_similarities, matrix = get_contrastive_similarities(constraints_stemmed, global_tech, global_tech_embeddings, )
+    result_similarities, matrix = get_contrastive_similarities(constraints_stemmed, dataset)
 
     save_to_pickle(result_similarities)
 
-
-
-    best_combinations = find_best_list_combinations(constraints_stemmed, global_tech, matrix)
+    best_combinations = find_best_list_combinations(constraints_stemmed, dataset, matrix)
     best_technologies_id = select_technologies(best_combinations)
-    best_technologies = get_technologies_by_id(best_technologies_id,
+    best_technologies = get_technologies_by_id(best_technologies_id, dataset)
 
     return best_technologies
 

@@ -38,5 +32,7 @@ def process_prior_art(technologies, data, data_type, techno_type):
         print(f"An error occured during the process, trying again : {e}")
         prior_art_reponse = search_prior_art(technologies, data, data_type, techno_type)
     prior_art_search = add_citations_and_collect_uris(prior_art_reponse)
-
+    print("PRIOR ART SEARCH")
+    print(prior_art_reponse)
+    print(prior_art_search)
     return prior_art_search
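After this refactor the pipeline reads: stem the constraints, score them against the dataset's precomputed embeddings, keep mid-band matches, pick technology ids, then hydrate the chosen rows from the dataset. A hypothetical driver, assuming a configured GEMINI_API key and network access; ProblemStub is a stand-in for app.py's InputProblem model, not part of the commit:

from src.core import process_input, dataset

class ProblemStub:
    # Mimics InputProblem: process_input only reads data.problem for this path.
    problem = "Filter vibration noise out of low-cost accelerometer readings"

best = process_input(ProblemStub(), dataset, "problem")
for tech in best:
    print(tech)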
src/services/processor.py CHANGED

@@ -1,4 +1,4 @@
-from src.services.utils import
+from src.services.utils import load_data, stem, set_gemini
 import requests as r
 import json
 import nltk

@@ -23,51 +23,15 @@ def retrieve_constraints(prompt):
 
     constraints_json = json.loads("{"+json_str+"}")
 
-    print(f"Whats returned : {constraints_json}")
     return constraints_json
 
-
-def preprocess_tech_data(_df):
-    if _df is None or "description" not in _df.columns:
-        return [], []
-
-    technologies_list = _df["description"].to_list()
-    tech_dict_raw = tech_to_dict(technologies_list)
-
-    tech_dict_filtered = [
-        t for t in tech_dict_raw if (
-            len(t.get("title", "")) >= 5 and
-            len(t.get("advantages", "")) >= 5 and
-            len(t.get("key_components", "")) >= 5
-        )
-    ]
-
-    if not tech_dict_filtered:
-        return [], []
-
-    processed_tech_wt = stem(tech_dict_filtered,"technologies")
-
-    for t_item_wt in processed_tech_wt:
-        kc = t_item_wt.get("key_components")
-        if isinstance(kc, str):
-            t_item_wt["key_components"] = ''.join(nltk.sent_tokenize(kc))
-        else:
-            t_item_wt["key_components"] = ""
-
-    original_tech_for_display = tech_dict_filtered[:len(processed_tech_wt)]
-
-
-    _keys = list(processed_tech_wt[0].keys()) if processed_tech_wt else []
-    return processed_tech_wt, _keys, original_tech_for_display
-
-
 def remove_over_repeated_technologies(result):
     total_lists = len(result)
     tech_title = {}
 
     for idx, item in enumerate(result):
         for tech in item['technologies']:
-            tech_title[tech[0]['
+            tech_title[tech[0]['name']] = 0 if tech[0]['name'] not in tech_title else tech_title[tech[0]['name']] + 1
 
     threshold = total_lists * 0.3
     print(threshold)

@@ -79,11 +43,11 @@ def remove_over_repeated_technologies(result):
             to_delete.append(tech)
 
     for idx, item in enumerate(result):
-        result[idx]['technologies'] = [tech for tech in item['technologies'] if tech[0]['
+        result[idx]['technologies'] = [tech for tech in item['technologies'] if tech[0]['name'] not in to_delete]
 
     return result
 
-def get_contrastive_similarities(constraints,
+def get_contrastive_similarities(constraints, dataset):
     selected_pairs = []
     matrix = []
 

@@ -93,8 +57,8 @@ def get_contrastive_similarities(constraints, pre_encoded_tech_data, pre_encoded
     for i, constraint in enumerate(constraints):
         constraint_embedding = constraint_embeddings[i]
         constraint_matrix = []
-        for j,
-            tech_embedding =
+        for j, row in enumerate(dataset):
+            tech_embedding = row["embeddings"]
 
             purpose_sim = model.similarity(constraint_embedding, tech_embedding)
 

@@ -103,7 +67,7 @@ def get_contrastive_similarities(constraints, pre_encoded_tech_data, pre_encoded
 
             selected_pairs.append({
                 "constraint": constraint,
-                "id2":
+                "id2": j,
                 "similarity": purpose_sim
             })
             constraint_matrix.append(purpose_sim)

@@ -119,21 +83,25 @@ def find_best_list_combinations(list1: list[str], list2: list[str], matrix) -> l
     MAX_SIMILARITY = 0.8
 
     possible_matches_for_each_l1 = []
-    for i in
+    for i, row_i in enumerate(list1):
         valid_matches_for_l1_element = []
-        for j in
+        for j, row_j in enumerate(list2):
             score = matrix[i][j]
 
+            # print(row_j)
+            # print(type(row_j))
             if MIN_SIMILARITY <= score <= MAX_SIMILARITY:
-
+                del row_j["embeddings"]
+                row_j["id"] = j
+                valid_matches_for_l1_element.append((row_j, score))
 
         if not valid_matches_for_l1_element:
-            print(f"No valid matches found in list2 for '{
+            print(f"No valid matches found in list2 for '{row_i}' from list1 "
                   f"(score between {MIN_SIMILARITY} and {MAX_SIMILARITY}). "
                   "Returning an empty list as no complete combinations can be formed.")
 
         else:
-            possible_matches_for_each_l1.append((valid_matches_for_l1_element,
+            possible_matches_for_each_l1.append((valid_matches_for_l1_element, row_i))
 
     result = []
     for tech_list, problem in possible_matches_for_each_l1:

@@ -219,10 +187,10 @@ def select_technologies(problem_technology_list):
 
 def load_titles(techno, data_type):
     if data_type == "pydantic":
-        technology_titles = [tech.
+        technology_titles = [tech.name for tech in techno]
     else: # data_type == "dict"
         technologies = techno["technologies"]
-        technology_titles = [tech["
+        technology_titles = [tech["name"] for tech in technologies]
     return technology_titles
 
 def search_prior_art(technologies_input: list, data: str, data_type: str, techno_type: str) -> json:
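The core of the rewritten find_best_list_combinations is a similarity band: a technology only counts as a match for a constraint when its score falls inside [MIN_SIMILARITY, MAX_SIMILARITY]. A self-contained sketch of that filter with made-up data — MIN_SIMILARITY is not visible in this hunk, so 0.3 is a placeholder, and the sketch copies each row instead of the committed in-place del row_j["embeddings"], which would raise KeyError when the same row is revisited for a second constraint:

MIN_SIMILARITY, MAX_SIMILARITY = 0.3, 0.8  # 0.3 is an assumed lower bound

constraints = ["low power", "waterproof housing"]
techs = [{"name": "e-ink display", "embeddings": [0.1]},
         {"name": "conformal coating", "embeddings": [0.7]}]
matrix = [[0.55, 0.12],   # matrix[i][j]: constraint i vs. technology j
          [0.25, 0.64]]

possible_matches = []
for i, constraint in enumerate(constraints):
    valid = []
    for j, row in enumerate(techs):
        if MIN_SIMILARITY <= matrix[i][j] <= MAX_SIMILARITY:
            kept = {k: v for k, v in row.items() if k != "embeddings"}  # drop embeddings on a copy
            kept["id"] = j
            valid.append((kept, matrix[i][j]))
    if valid:
        possible_matches.append((valid, constraint))

print(possible_matches)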
src/services/utils.py CHANGED

@@ -1,20 +1,20 @@
 import pickle
 import numpy as np
 import pandas as pd
-
 import nltk
 from nltk.stem import *
 nltk.download("punkt_tab")
-
 from pathlib import Path
+from dotenv import load_dotenv
+load_dotenv()
 import os
 import google.generativeai as genai
 import json
 from google.genai import Client, types
+from datasets import load_dataset
+
 
-BASE_DIR = Path(__file__).resolve().parent.parent
 
-FILE_PATH = BASE_DIR / 'ressources' / 'global_tech_embeddings.pkl'
 
 def set_prompt(problem):
     prompt = """

@@ -51,71 +51,44 @@ Output each constraints in a JSON such as : {"title of the constraints1":"descri
 """ + problem
     return prompt
 
-
-
-    return
-
-def load_technologies():
-    EMBEDDINGS_FILE = FILE_PATH
-
-    try:
-        with open(EMBEDDINGS_FILE, 'rb') as f:
-            loaded_data = pickle.load(f)
-            global_tech = loaded_data['global_tech']
-            global_tech_embedding = loaded_data['global_tech_embeddings']
-        return global_tech, global_tech_embedding
-    except Exception as e:
-        print(f"Error: {e}")
-
-def tech_to_dict(technologies):
-    tech_dict = []
-    for index, tech in enumerate(technologies):
-        if not tech.find("<title>") > 1:
-            tab = tech.split("\n")
-            tab.pop(0)
-            tab.pop(len(tab)-1)
-            tech_dict.append({"title": tab[0][tab[0].find(": ")+2:],
-                              "purpose": tab[1][tab[1].find(": ")+2:],
-                              "key_components": tab[2][tab[2].find(": ")+2:],
-                              "advantages": tab[3][tab[3].find(": ")+2:],
-                              "limitations": tab[4][tab[4].find(": ")+2:],
-                              "id": index})
-    return tech_dict
-
-def save_dataframe(df, title):
-    pd.DataFrame(df).to_excel(title)
-    return title
+
+def load_data():
+    return load_dataset("heymenn/Technologies", split="train")
+
 
 def stem(data,data_type):
     stemmer = SnowballStemmer("english")
     processed_data = []
     if data_type == "technologies":
-        for t_item in data:
+        for index, t_item in enumerate(data):
             processed_data.append({
-                "
+                "name": stemmer.stem(t_item["name"]),
                 "purpose": stemmer.stem(t_item["purpose"]),
-                "
+                "problem_types_solved": stemmer.stem(t_item["problem_types_solved"]),
                 "advantages": stemmer.stem(t_item["advantages"]),
                 "limitations": stemmer.stem(t_item["limitations"]),
-                "
+                "domain_tags": stemmer.stem(t_item["domain_tags"]),
+                "id": index
             })
+
     else:
         for t_item in data:
-            print(t_item)
             processed_data.append({
                 "title": stemmer.stem(t_item),
                 "description": stemmer.stem(data[t_item])
             })
-
+
     return processed_data
 
 
-def get_technologies_by_id(
+def get_technologies_by_id(technologies,dataset):
     result = []
-
-
-
-
+    for id in technologies:
+        print(id)
+        data = dataset[id]
+        del data["embeddings"]
+        print(data)
+        result.append(data)
     return result
 
 def save_to_pickle(result_similarites):

@@ -133,7 +106,7 @@ def save_to_pickle(result_similarites):
 
     for item in result_similarites:
         row_idx = row_label_to_index[item['constraint']['title']]
-        col_idx = item['id2'] - 1
+        col_idx = item['id2'] - 1
     similarity_value = item['similarity'].item()
 
     matrix[row_idx, col_idx] = similarity_value

@@ -157,7 +130,6 @@ def save_to_pickle(result_similarites):
     print(f"\nMatrix and labels saved to {output_filename}")
     return output_filename
 
-
 def set_gemini():
     gemini_api = os.getenv("GEMINI_API")
     client = Client(api_key=gemini_api)
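For reference, what stem(..., "technologies") now does per field: NLTK's SnowballStemmer lowercases the whole string and applies suffix stripping, which in practice mostly rewrites the final word of each field. A minimal, self-contained check with made-up values:

from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer("english")
row = {"name": "Edge Computing", "purpose": "Local processing of sensor streams"}
print(stemmer.stem(row["name"]))     # -> "edge comput"
print(stemmer.stem(row["purpose"]))  # -> "local processing of sensor stream"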