ALLOUNE committed
Commit a95c4ad · 1 Parent(s): c97a50e

add forced technologies

Files changed (3):
  1. app.py +12 -10
  2. src/core.py +10 -2
  3. src/services/processor.py +46 -9
app.py CHANGED
@@ -1,6 +1,6 @@
 from fastapi import FastAPI
-from pydantic import BaseModel
-from typing import Dict, List
+from pydantic import BaseModel, Field
+from typing import Dict, List, Optional
 import gradio as gr
 import pandas as pd
 import json
@@ -18,13 +18,6 @@ app = FastAPI(
     description="Find relevant technologies from a problem",
 )
 
-class InputProblem(BaseModel):
-    problem: str
-
-class InputConstraints(BaseModel):
-    constraints: Dict[str, str]
-
-# This schema defines the structure for a single technology object
 class Technology(BaseModel):
     """Represents a single technology entry with its details."""
     name: str
@@ -34,6 +27,14 @@ class Technology(BaseModel):
     limitations: str
     domain_tags: str
 
+class InputProblem(BaseModel):
+    problem: str
+    forced_technologies: Optional[List[str]] = Field(default=None)
+
+class InputConstraints(BaseModel):
+    constraints: Dict[str, str]
+    forced_technologies: Optional[List[str]] = Field(default=None)
+
 class OutputPriorArt(BaseModel):
     """Represents the search of prior art using the technology combinations"""
     content: str
@@ -60,7 +61,7 @@ async def process(data: InputProblem):
 
 @app.post("/process-constraints", response_model=TechnologyData)
 async def process_constraints(constraints: InputConstraints):
-    result = process_input(constraints.constraints, dataset, "constraints")
+    result = process_input(constraints, dataset, "constraints")
     return {"technologies": result}
 
 @app.post("/prior-art-constraints", response_model=OutputPriorArt)
@@ -73,6 +74,7 @@ async def prior_art_problems(data: InputPriorArtProblem):
     prior_art = process_prior_art(data.technologies, data.problem, "problem", "pydantic")
     return prior_art
 
+
 def make_json_serializable(data):
     if isinstance(data, dict):
         return {k: make_json_serializable(v) for k, v in data.items()}
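
Both request schemas now carry an optional forced_technologies list, so a client can pin specific technologies into the selection. A minimal sketch of the new request shape against /process-constraints; the host, port, constraint keys, and technology name below are illustrative assumptions, not part of the commit:

import requests

# Hypothetical call to a local deployment of this app; the constraint
# keys and the forced name are made up for illustration.
payload = {
    "constraints": {"power": "low energy consumption", "range": "long range"},
    "forced_technologies": ["LoRaWAN"],  # optional; omit for the old behaviour
}
resp = requests.post("http://localhost:8000/process-constraints", json=payload)
print(resp.json()["technologies"])
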
src/core.py CHANGED
@@ -5,12 +5,13 @@ dataset = load_data()
 
 
 def process_input(data, dataset, data_type):
+    print(data)
     if data_type == "problem":
         prompt = set_prompt(data.problem)
         constraints = retrieve_constraints(prompt)
 
     elif data_type == "constraints":
-        constraints = data
+        constraints = data.constraints
 
     constraints_stemmed = stem(constraints, "constraints")
 
@@ -18,8 +19,15 @@ def process_input(data, dataset, data_type):
 
     save_to_pickle(result_similarities)
 
+    if data.forced_technologies:
+        forced_technologies = search_technology_by_name(data.forced_technologies, dataset)
+        res, forced_matrix = get_contrastive_similarities(constraints_stemmed, dataset)
+        forced_combination = find_best_list_combinations(constraints_stemmed, forced_technologies, forced_matrix)
+    else:
+        forced_combination = []
+
     best_combinations = find_best_list_combinations(constraints_stemmed, dataset, matrix)
-    best_technologies_id = select_technologies(best_combinations)
+    best_technologies_id = select_technologies(best_combinations, forced_combination)
     best_technologies = get_technologies_by_id(best_technologies_id, dataset)
 
     return best_technologies
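
Note that /process-constraints now hands the whole InputConstraints model to process_input rather than the bare constraints dict, which is what makes data.forced_technologies reachable here. A minimal sketch of the two code paths, using a stand-in model with illustrative values (not part of the commit):

from typing import Dict, List, Optional
from pydantic import BaseModel, Field

class Request(BaseModel):  # stand-in for app.InputConstraints
    constraints: Dict[str, str]
    forced_technologies: Optional[List[str]] = Field(default=None)

plain = Request(constraints={"power": "low energy consumption"})
forced = Request(constraints={"power": "low energy consumption"},
                 forced_technologies=["LoRaWAN"])

# process_input takes the forced branch only when the list is set and non-empty.
for req in (plain, forced):
    if req.forced_technologies:
        print("forced branch:", req.forced_technologies)
    else:
        print("default branch: forced_combination = []")
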
src/services/processor.py CHANGED
@@ -4,6 +4,8 @@ import json
 import nltk
 import itertools
 import numpy as np
+import requests
+from datasets import concatenate_datasets
 
 from sentence_transformers import *
 model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
@@ -79,6 +81,8 @@ def find_best_list_combinations(list1: list[str], list2: list[str], matrix) -> l
         print("Warning: One or both input lists are empty. Returning an empty list.")
         return []
 
+    print(list2)
+
     MIN_SIMILARITY = 0.3
     MAX_SIMILARITY = 0.8
 
@@ -88,12 +92,11 @@ def find_best_list_combinations(list1: list[str], list2: list[str], matrix) -> l
         for j, row_j in enumerate(list2):
             score = matrix[i][j]
 
-            # print(row_j)
-            # print(type(row_j))
             if MIN_SIMILARITY <= score <= MAX_SIMILARITY:
-                del row_j["embeddings"]
-                row_j["id"] = j
-                valid_matches_for_l1_element.append((row_j, score))
+                data = dict(row_j)  # copy so the original row keeps its embeddings
+                del data["embeddings"]
+                data["id"] = j
+                valid_matches_for_l1_element.append((data, score))
 
         if not valid_matches_for_l1_element:
             print(f"No valid matches found in list2 for '{row_i}' from list1 "
@@ -119,18 +122,57 @@ def find_best_list_combinations(list1: list[str], list2: list[str], matrix) -> l
     result = remove_over_repeated_technologies(result)
     return result
 
-
-def select_technologies(problem_technology_list):
+def search_technology_by_name(user_input, dataset):
+    url = "https://heymenn-search-technologies-api.hf.space/search-technologies"
+
+    headers = {
+        "accept": "application/json",
+        "Content-Type": "application/json"
+    }
+
+    # Resolve each user-supplied name through the external search API.
+    results = []
+    for name in user_input:
+        payload = {
+            "title": name,
+            "type": "title"
+        }
+        response = requests.post(url, headers=headers, json=payload)
+        print(response.json())
+        results.append(response.json())
+
+    # Keep the dataset rows whose name matches a returned title.
+    technologies = []
+    for result in results:
+        technology = dataset.filter(lambda row: row["name"] == result["title"])
+        technologies.append(technology)
+
+    combined_dataset = concatenate_datasets(technologies)
+    return combined_dataset
+
+
+def select_technologies(problem_technology_list, forced_technology_list=[]):
     distinct_techs = set()
     candidate_map = []
 
+    # Register forced combinations as candidates first.
+    if len(forced_technology_list) != 0:
+        for problem_data in forced_technology_list:
+            cand_dict = {}
+            for tech_info, sim in problem_data['technologies']:
+                tech_id = tech_info['id']
+                distinct_techs.add(tech_id)
+                cand_dict[tech_id] = float(sim)
+            candidate_map.append(cand_dict)
+
     for problem_data in problem_technology_list:
         cand_dict = {}
         for tech_info, sim in problem_data['technologies']:
             tech_id = tech_info['id']
             distinct_techs.add(tech_id)
             cand_dict[tech_id] = float(sim)
-        candidate_map.append(cand_dict)
+        if cand_dict not in candidate_map:
+            candidate_map.append(cand_dict)
 
     distinct_techs = sorted(list(distinct_techs))
     n = len(problem_technology_list)
@@ -242,7 +280,6 @@ def add_citations_and_collect_uris(response):
     for i in support.grounding_chunk_indices:
         if i < len(chunks):
             uri = chunks[i].web.uri
-            # Add URI only if not already in text or collected
             if uri not in text and uri not in uris_added:
                 citation_links.append(f"[{i + 1}]({uri})")
                 uris_added.add(uri)
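
For reference, search_technology_by_name resolves each user-supplied name through the external search Space and then keeps the dataset rows whose name field matches the returned title. A sketch of a single round trip; the response shape ({"title": ...}) is inferred from how the code above consumes it, not from the API's documentation, and "LoRaWAN" plus the timeout are illustrative additions:

import requests

url = "https://heymenn-search-technologies-api.hf.space/search-technologies"
headers = {"accept": "application/json", "Content-Type": "application/json"}

# One lookup, mirroring the per-name loop in search_technology_by_name.
response = requests.post(url, headers=headers,
                         json={"title": "LoRaWAN", "type": "title"},
                         timeout=30)
match = response.json()
print(match.get("title"))  # rows with this name become the forced technologies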