ALLOUNE committed
Commit c97a50e · 1 Parent(s): 367af23

add dataset

Files changed (5):
  1. app.py +9 -7
  2. requirements.txt +2 -1
  3. src/core.py +8 -12
  4. src/services/processor.py +18 -50
  5. src/services/utils.py +23 -51
app.py CHANGED
@@ -27,12 +27,12 @@ class InputConstraints(BaseModel):
 # This schema defines the structure for a single technology object
 class Technology(BaseModel):
     """Represents a single technology entry with its details."""
-    title: str
+    name: str
     purpose: str
-    key_components: str
+    problem_types_solved: str
     advantages: str
     limitations: str
-    id: int
+    domain_tags: str

 class OutputPriorArt(BaseModel):
     """Represents the search of prior art using the technology combinations"""
@@ -55,12 +55,12 @@ class TechnologyData(BaseModel):

 @app.post("/process", response_model=TechnologyData)
 async def process(data: InputProblem):
-    result = process_input(data, global_tech, global_tech_embeddings, "problem")
+    result = process_input(data, dataset, "problem")
     return {"technologies": result}

 @app.post("/process-constraints", response_model=TechnologyData)
 async def process_constraints(constraints: InputConstraints):
-    result = process_input(constraints.constraints, global_tech, global_tech_embeddings, "constraints")
+    result = process_input(constraints.constraints, dataset, "constraints")
     return {"technologies": result}

 @app.post("/prior-art-constraints", response_model=OutputPriorArt)
@@ -70,7 +70,7 @@ async def prior_art_constraints(data: InputPriorArtConstraints):

 @app.post("/prior-art-problems", response_model=OutputPriorArt)
 async def prior_art_problems(data: InputPriorArtProblem):
-    prior_art = process_prior_art(data.technologies, data.problems, "problem", "pydantic")
+    prior_art = process_prior_art(data.technologies, data.problem, "problem", "pydantic")
     return prior_art

 def make_json_serializable(data):
@@ -268,7 +268,6 @@ def process_input_gradio(problem_description: str):
     # Step 3: Stem Constraints
     constraints_stemmed = stem(constraints, "constraints")
     save_dataframe(pd.DataFrame({"stemmed_constraints": constraints_stemmed}), "constraints_stemmed.xlsx")
-    print(constraints_stemmed)

     # Step 4: Global Tech (already loaded, just acknowledge)
     # save_dataframe(global_tech_df, "global_tech.xlsx") # This is already done implicitly by loading
@@ -282,6 +281,9 @@ def process_input_gradio(problem_description: str):
     # Step 6: Find Best List Combinations
     best_combinations = find_best_list_combinations(constraints_stemmed, global_tech, matrix)

+    print("best_combinations")
+    print(best_combinations)
+
     # Step 7: Select Technologies
     best_technologies_id = select_technologies(best_combinations)
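For context, a call against one of the updated endpoints might look like the sketch below. The route, the {"technologies": [...]} response shape, and the new Technology fields come from the diff above; the host, port, and constraint values are illustrative assumptions.

import requests

# Hypothetical client call; localhost:7860 and the payload text are
# assumptions, the route and schemas come from app.py above.
payload = {"constraints": {"low power": "must run on battery for 24 hours"}}
resp = requests.post("http://localhost:7860/process-constraints", json=payload)
# Response follows TechnologyData: each item now carries name, purpose,
# problem_types_solved, advantages, limitations, and domain_tags instead
# of title/key_components/id.
print(resp.json()["technologies"])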
requirements.txt CHANGED
@@ -9,4 +9,5 @@ pydantic
 openpyxl
 gradio
 google.generativeai
-google.genai
+google.genai
+datasets
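The new datasets dependency provides load_dataset, which the load_data() helper added in src/services/utils.py builds on. A minimal sanity check, assuming the heymenn/Technologies dataset named in this commit is reachable:

from datasets import load_dataset

# Pull the technology table this commit switches to and inspect one row.
# The dataset id and split come from load_data() below; the expectation
# that rows expose "name" and "embeddings" columns follows from the diffs.
ds = load_dataset("heymenn/Technologies", split="train")
print(ds.column_names)
print(ds[0]["name"])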
src/core.py CHANGED
@@ -1,10 +1,10 @@
 from src.services.utils import *
 from src.services.processor import *

-global_tech, global_tech_embeddings = load_technologies()
+dataset = load_data()


-def process_input(data, global_tech, global_tech_embeddings, data_type):
+def process_input(data, dataset, data_type):
     if data_type == "problem":
         prompt = set_prompt(data.problem)
         constraints = retrieve_constraints(prompt)
@@ -14,19 +14,13 @@ def process_input(data, global_tech, global_tech_embeddings, data_type):

     constraints_stemmed = stem(constraints, "constraints")

-    save_dataframe(constraints_stemmed, "constraints_stemmed.xlsx")
-
-    save_dataframe(global_tech, "global_tech.xlsx")
-
-    result_similarities, matrix = get_contrastive_similarities(constraints_stemmed, global_tech, global_tech_embeddings, )
+    result_similarities, matrix = get_contrastive_similarities(constraints_stemmed, dataset)

     save_to_pickle(result_similarities)

-    print(f"Matrix : {matrix} \n Constraints : {constraints_stemmed} \n Gloabl tech : {global_tech}")
-
-    best_combinations = find_best_list_combinations(constraints_stemmed, global_tech, matrix)
+    best_combinations = find_best_list_combinations(constraints_stemmed, dataset, matrix)
     best_technologies_id = select_technologies(best_combinations)
-    best_technologies = get_technologies_by_id(best_technologies_id, global_tech)
+    best_technologies = get_technologies_by_id(best_technologies_id, dataset)

     return best_technologies

@@ -38,5 +32,7 @@ def process_prior_art(technologies, data, data_type, techno_type):
         print(f"An error occured during the process, trying again : {e}")
         prior_art_reponse = search_prior_art(technologies, data, data_type, techno_type)
     prior_art_search = add_citations_and_collect_uris(prior_art_reponse)
-
+    print("PRIOR ART SEARCH")
+    print(prior_art_reponse)
+    print(prior_art_search)
     return prior_art_search
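Stripped of the debug prints and spreadsheet dumps, the refactored process_input reduces to a single dataset-driven pipeline. A minimal sketch of that flow, assuming the function names from this commit (the wrapper itself is illustrative, not the author's code):

from src.services.utils import load_data, get_technologies_by_id
from src.services.processor import (get_contrastive_similarities,
                                    find_best_list_combinations,
                                    select_technologies)

dataset = load_data()  # Hugging Face dataset, replacing the embeddings pickle

def rank_technologies(constraints_stemmed):
    # similarity of each stemmed constraint against every dataset row
    _, matrix = get_contrastive_similarities(constraints_stemmed, dataset)
    combos = find_best_list_combinations(constraints_stemmed, dataset, matrix)
    ids = select_technologies(combos)
    return get_technologies_by_id(ids, dataset)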
src/services/processor.py CHANGED
@@ -1,4 +1,4 @@
-from src.services.utils import tech_to_dict, stem, set_gemini
+from src.services.utils import load_data, stem, set_gemini
 import requests as r
 import json
 import nltk
@@ -23,51 +23,15 @@ def retrieve_constraints(prompt):

     constraints_json = json.loads("{"+json_str+"}")

-    print(f"Whats returned : {constraints_json}")
     return constraints_json

-
-def preprocess_tech_data(_df):
-    if _df is None or "description" not in _df.columns:
-        return [], []
-
-    technologies_list = _df["description"].to_list()
-    tech_dict_raw = tech_to_dict(technologies_list)
-
-    tech_dict_filtered = [
-        t for t in tech_dict_raw if (
-            len(t.get("title", "")) >= 5 and
-            len(t.get("advantages", "")) >= 5 and
-            len(t.get("key_components", "")) >= 5
-        )
-    ]
-
-    if not tech_dict_filtered:
-        return [], []
-
-    processed_tech_wt = stem(tech_dict_filtered,"technologies")
-
-    for t_item_wt in processed_tech_wt:
-        kc = t_item_wt.get("key_components")
-        if isinstance(kc, str):
-            t_item_wt["key_components"] = ''.join(nltk.sent_tokenize(kc))
-        else:
-            t_item_wt["key_components"] = ""
-
-    original_tech_for_display = tech_dict_filtered[:len(processed_tech_wt)]
-
-
-    _keys = list(processed_tech_wt[0].keys()) if processed_tech_wt else []
-    return processed_tech_wt, _keys, original_tech_for_display
-
-
 def remove_over_repeated_technologies(result):
     total_lists = len(result)
     tech_title = {}

     for idx, item in enumerate(result):
         for tech in item['technologies']:
-            tech_title[tech[0]['title']] = 0 if tech[0]['title'] not in tech_title else tech_title[tech[0]['title']] + 1
+            tech_title[tech[0]['name']] = 0 if tech[0]['name'] not in tech_title else tech_title[tech[0]['name']] + 1

     threshold = total_lists * 0.3
     print(threshold)
@@ -79,11 +43,11 @@ def remove_over_repeated_technologies(result):
             to_delete.append(tech)

     for idx, item in enumerate(result):
-        result[idx]['technologies'] = [tech for tech in item['technologies'] if tech[0]['title'] not in to_delete]
+        result[idx]['technologies'] = [tech for tech in item['technologies'] if tech[0]['name'] not in to_delete]

     return result

-def get_contrastive_similarities(constraints, pre_encoded_tech_data, pre_encoded_tech_embeddings):
+def get_contrastive_similarities(constraints, dataset):
     selected_pairs = []
     matrix = []

@@ -93,8 +57,8 @@ def get_contrastive_similarities(constraints, pre_encoded
     for i, constraint in enumerate(constraints):
         constraint_embedding = constraint_embeddings[i]
         constraint_matrix = []
-        for j, tech2 in enumerate(pre_encoded_tech_data):
-            tech_embedding = pre_encoded_tech_embeddings[j]
+        for j, row in enumerate(dataset):
+            tech_embedding = row["embeddings"]

             purpose_sim = model.similarity(constraint_embedding, tech_embedding)

@@ -103,7 +67,7 @@ def get_contrastive_similarities(constraints, pre_encoded

             selected_pairs.append({
                 "constraint": constraint,
-                "id2": tech2["id"],
+                "id2": j,
                 "similarity": purpose_sim
             })
             constraint_matrix.append(purpose_sim)
@@ -119,21 +83,25 @@ def find_best_list_combinations(list1: list[str], list2: list[str], matrix) -> l
     MAX_SIMILARITY = 0.8

     possible_matches_for_each_l1 = []
-    for i in range(len(list1)):
+    for i, row_i in enumerate(list1):
         valid_matches_for_l1_element = []
-        for j in range(len(list2)):
+        for j, row_j in enumerate(list2):
             score = matrix[i][j]

+            # print(row_j)
+            # print(type(row_j))
             if MIN_SIMILARITY <= score <= MAX_SIMILARITY:
-                valid_matches_for_l1_element.append((list2[j], score))
+                del row_j["embeddings"]
+                row_j["id"] = j
+                valid_matches_for_l1_element.append((row_j, score))

         if not valid_matches_for_l1_element:
-            print(f"No valid matches found in list2 for '{list1[i]}' from list1 "
+            print(f"No valid matches found in list2 for '{row_i}' from list1 "
                   f"(score between {MIN_SIMILARITY} and {MAX_SIMILARITY}). "
                   "Returning an empty list as no complete combinations can be formed.")

         else:
-            possible_matches_for_each_l1.append((valid_matches_for_l1_element, list1[i]))
+            possible_matches_for_each_l1.append((valid_matches_for_l1_element, row_i))

     result = []
     for tech_list, problem in possible_matches_for_each_l1:
@@ -219,10 +187,10 @@ def select_technologies(problem_technology_list):

 def load_titles(techno, data_type):
     if data_type == "pydantic":
-        technology_titles = [tech.title for tech in techno]
+        technology_titles = [tech.name for tech in techno]
     else: # data_type == "dict"
         technologies = techno["technologies"]
-        technology_titles = [tech["title"] for tech in technologies]
+        technology_titles = [tech["name"] for tech in technologies]
     return technology_titles

 def search_prior_art(technologies_input: list, data: str, data_type: str, techno_type: str) -> json:
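After this change the similarity step no longer receives a separate embeddings list: each dataset row carries its own precomputed vector in an "embeddings" column, and ids are simply the row index j. A self-contained sketch of that pairing, assuming a sentence-transformers model (the model name is an assumption; model.similarity is the same call used in the diff above):

from datasets import load_dataset
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed model
dataset = load_dataset("heymenn/Technologies", split="train")

constraints = ["low power consumpt", "real-tim process"]  # already stemmed
constraint_embeddings = model.encode(constraints)

# One similarity row per constraint, one column per dataset row, mirroring
# the loop in get_contrastive_similarities.
matrix = [[model.similarity(constraint_embeddings[i], tech["embeddings"])
           for tech in dataset]
          for i in range(len(constraints))]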
src/services/utils.py CHANGED
@@ -1,20 +1,20 @@
 import pickle
 import numpy as np
 import pandas as pd
-
 import nltk
 from nltk.stem import *
 nltk.download("punkt_tab")
-
 from pathlib import Path
+from dotenv import load_dotenv
+load_dotenv()
 import os
 import google.generativeai as genai
 import json
 from google.genai import Client, types
+from datasets import load_dataset
+

-BASE_DIR = Path(__file__).resolve().parent.parent

-FILE_PATH = BASE_DIR / 'ressources' / 'global_tech_embeddings.pkl'

 def set_prompt(problem):
     prompt = """
@@ -51,71 +51,44 @@ Output each constraints in a JSON such as : {"title of the constraints1":"descri
 """ + problem
     return prompt

-def load_technologies_excel():
-    df = pd.read_excel(FILE_PATH)
-    return df
-
-def load_technologies():
-    EMBEDDINGS_FILE = FILE_PATH
-
-    try:
-        with open(EMBEDDINGS_FILE, 'rb') as f:
-            loaded_data = pickle.load(f)
-        global_tech = loaded_data['global_tech']
-        global_tech_embedding = loaded_data['global_tech_embeddings']
-        return global_tech, global_tech_embedding
-    except Exception as e:
-        print(f"Error: {e}")
-
-def tech_to_dict(technologies):
-    tech_dict = []
-    for index, tech in enumerate(technologies):
-        if not tech.find("<title>") > 1:
-            tab = tech.split("\n")
-            tab.pop(0)
-            tab.pop(len(tab)-1)
-            tech_dict.append({"title": tab[0][tab[0].find(": ")+2:],
-                              "purpose": tab[1][tab[1].find(": ")+2:],
-                              "key_components": tab[2][tab[2].find(": ")+2:],
-                              "advantages": tab[3][tab[3].find(": ")+2:],
-                              "limitations": tab[4][tab[4].find(": ")+2:],
-                              "id": index})
-    return tech_dict
-
-def save_dataframe(df, title):
-    pd.DataFrame(df).to_excel(title)
-    return title
+
+def load_data():
+    return load_dataset("heymenn/Technologies", split="train")
+

 def stem(data,data_type):
     stemmer = SnowballStemmer("english")
     processed_data = []
     if data_type == "technologies":
-        for t_item in data:
+        for index, t_item in enumerate(data):
             processed_data.append({
-                "title": stemmer.stem(t_item["title"]),
+                "name": stemmer.stem(t_item["name"]),
                 "purpose": stemmer.stem(t_item["purpose"]),
-                "key_components": stemmer.stem(t_item["key_components"]),
+                "problem_types_solved": stemmer.stem(t_item["problem_types_solved"]),
                 "advantages": stemmer.stem(t_item["advantages"]),
                 "limitations": stemmer.stem(t_item["limitations"]),
-                "id": t_item["id"]
+                "domain_tags": stemmer.stem(t_item["domain_tags"]),
+                "id": index
             })
+
     else:
         for t_item in data:
-            print(t_item)
             processed_data.append({
                 "title": stemmer.stem(t_item),
                 "description": stemmer.stem(data[t_item])
             })
-
+
     return processed_data


-def get_technologies_by_id(id_list, technologies):
+def get_technologies_by_id(technologies,dataset):
     result = []
-    id_set = set(id_list)
-    for tech in technologies:
-        if tech.get('id') in id_set:
-            result.append(tech)
+    for id in technologies:
+        print(id)
+        data = dataset[id]
+        del data["embeddings"]
+        print(data)
+        result.append(data)
     return result

 def save_to_pickle(result_similarites):
@@ -133,7 +106,7 @@ def save_to_pickle(result_similarites):

     for item in result_similarites:
         row_idx = row_label_to_index[item['constraint']['title']]
-        col_idx = item['id2'] - 1 #
+        col_idx = item['id2'] - 1
         similarity_value = item['similarity'].item()

         matrix[row_idx, col_idx] = similarity_value
@@ -157,7 +130,6 @@ def save_to_pickle(result_similarites):
     print(f"\nMatrix and labels saved to {output_filename}")
     return output_filename

-
 def set_gemini():
     gemini_api = os.getenv("GEMINI_API")
     client = Client(api_key=gemini_api)
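One consequence of the rewritten get_technologies_by_id is that selected rows are fetched straight from the dataset by index and stripped of their bulky "embeddings" vector, so the returned dicts stay small and JSON-serializable for the API response. A usage sketch (the ids are illustrative; in the app they come from select_technologies):

from src.services.utils import load_data, get_technologies_by_id

dataset = load_data()
# Ids index directly into the Hugging Face dataset rows.
for tech in get_technologies_by_id([0, 3, 7], dataset):
    print(tech["name"], "-", tech["purpose"])  # no "embeddings" key anymore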