ALLOUNE committed
Commit a95c4ad · Parent(s): c97a50e

add forced technologies

Files changed:
- app.py (+12 -10)
- src/core.py (+10 -2)
- src/services/processor.py (+46 -9)
app.py
CHANGED
@@ -1,6 +1,6 @@
 from fastapi import FastAPI
-from pydantic import BaseModel
-from typing import Dict, List
+from pydantic import BaseModel, Field
+from typing import Dict, List, Optional
 import gradio as gr
 import pandas as pd
 import json
@@ -18,13 +18,6 @@ app = FastAPI(
     description="Find relevant technologies from a problem",
 )
 
-class InputProblem(BaseModel):
-    problem: str
-
-class InputConstraints(BaseModel):
-    constraints: Dict[str, str]
-
-# This schema defines the structure for a single technology object
 class Technology(BaseModel):
     """Represents a single technology entry with its details."""
     name: str
@@ -34,6 +27,14 @@ class Technology(BaseModel):
     limitations: str
     domain_tags: str
 
+class InputProblem(BaseModel):
+    problem: str
+    forced_technologies: Optional[List[str]] = Field(default=None)
+
+class InputConstraints(BaseModel):
+    constraints: Dict[str, str]
+    forced_technologies: Optional[List[str]] = Field(default=None)
+
 class OutputPriorArt(BaseModel):
     """Represents the search of prior art using the technology combinations"""
     content: str
@@ -60,7 +61,7 @@ async def process(data: InputProblem):
 
 @app.post("/process-constraints", response_model=TechnologyData)
 async def process_constraints(constraints: InputConstraints):
-    result = process_input(constraints.constraints, dataset, "constraints")
+    result = process_input(constraints, dataset, "constraints")
     return {"technologies": result}
 
 @app.post("/prior-art-constraints", response_model=OutputPriorArt)
@@ -73,6 +74,7 @@ async def prior_art_problems(data: InputPriorArtProblem):
     prior_art = process_prior_art(data.technologies, data.problem, "problem", "pydantic")
     return prior_art
 
+
 def make_json_serializable(data):
     if isinstance(data, dict):
         return {k: make_json_serializable(v) for k, v in data.items()}
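With these schema changes, both input models carry an optional forced_technologies field, and existing clients remain compatible because the field defaults to None. A minimal sketch of exercising the new field against /process-constraints, assuming the app is served locally (the base URL and constraint values below are illustrative, not taken from the Space):

    import requests

    # Hypothetical host; point this at wherever the FastAPI app is running.
    BASE_URL = "http://localhost:7860"

    payload = {
        "constraints": {"c1": "low power consumption", "c2": "operates underwater"},
        # New optional field from this commit; omit it for the old behaviour.
        "forced_technologies": ["piezoelectric energy harvesting"],
    }

    resp = requests.post(f"{BASE_URL}/process-constraints", json=payload)
    resp.raise_for_status()
    print(resp.json()["technologies"])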
src/core.py
CHANGED
@@ -5,12 +5,13 @@ dataset = load_data()
 
 
 def process_input(data, dataset, data_type):
+    print(data)
     if data_type == "problem":
         prompt = set_prompt(data.problem)
         constraints = retrieve_constraints(prompt)
 
     elif data_type == "constraints":
-        constraints = data
+        constraints = data.constraints
 
     constraints_stemmed = stem(constraints, "constraints")
 
@@ -18,8 +19,15 @@ def process_input(data, dataset, data_type):
 
     save_to_pickle(result_similarities)
 
+    if data.forced_technologies:
+        forced_technologies = search_technology_by_name(data.forced_technologies, dataset)
+        res, forced_matrix = get_contrastive_similarities(constraints_stemmed, dataset)
+        forced_combination = find_best_list_combinations(constraints_stemmed, forced_technologies, forced_matrix)
+    else:
+        forced_combination = []
+
     best_combinations = find_best_list_combinations(constraints_stemmed, dataset, matrix)
-    best_technologies_id = select_technologies(best_combinations)
+    best_technologies_id = select_technologies(best_combinations, forced_combination)
     best_technologies = get_technologies_by_id(best_technologies_id, dataset)
 
     return best_technologies
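Note that process_input now requires the full Pydantic model rather than a bare dict: it reads data.constraints on the constraints path and data.forced_technologies on both paths, which is why the /process-constraints handler above now passes the model itself. A minimal sketch of the new control flow with the pipeline calls stubbed out (the model is redefined locally and all values are illustrative):

    from typing import Dict, List, Optional
    from pydantic import BaseModel, Field

    class InputConstraints(BaseModel):
        constraints: Dict[str, str]
        forced_technologies: Optional[List[str]] = Field(default=None)

    def process_input_sketch(data: InputConstraints) -> list:
        constraints = data.constraints
        if data.forced_technologies:
            # The real code resolves these names via search_technology_by_name
            # and scores them against the constraints; here we only tag them.
            forced_combination = [f"forced:{n}" for n in data.forced_technologies]
        else:
            forced_combination = []
        # The regular similarity pipeline still runs either way; forced
        # combinations are merged in at selection time.
        best_combinations = [f"matched:{k}" for k in constraints]
        return forced_combination + best_combinations

    print(process_input_sketch(InputConstraints(
        constraints={"c1": "low power consumption"},
        forced_technologies=["LoRa"],
    )))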
src/services/processor.py
CHANGED
@@ -4,6 +4,8 @@ import json
 import nltk
 import itertools
 import numpy as np
+import requests
+from datasets import concatenate_datasets
 
 from sentence_transformers import *
 model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
@@ -79,6 +81,8 @@ def find_best_list_combinations(list1: list[str], list2: list[str], matrix) -> l
         print("Warning: One or both input lists are empty. Returning an empty list.")
         return []
 
+    print(list2)
+
    MIN_SIMILARITY = 0.3
    MAX_SIMILARITY = 0.8
 
@@ -88,12 +92,11 @@ def find_best_list_combinations(list1: list[str], list2: list[str], matrix) -> l
        for j, row_j in enumerate(list2):
            score = matrix[i][j]
 
-            # print(row_j)
-            # print(type(row_j))
            if MIN_SIMILARITY <= score <= MAX_SIMILARITY:
-
-
-
+                data = row_j
+                del data["embeddings"]
+                data["id"] = j
+                valid_matches_for_l1_element.append((data, score))
 
        if not valid_matches_for_l1_element:
            print(f"No valid matches found in list2 for '{row_i}' from list1 "
@@ -119,18 +122,53 @@ def find_best_list_combinations(list1: list[str], list2: list[str], matrix) -> l
    result = remove_over_repeated_technologies(result)
    return result
 
-
-def select_technologies(problem_technology_list):
+def search_technology_by_name(user_input, dataset):
+    url = "https://heymenn-search-technologies-api.hf.space/search-technologies"
+
+    headers = {
+        "accept": "application/json",
+        "Content-Type": "application/json"
+    }
+
+    results = []
+    for input in user_input:
+        payload = {
+            "title": input,
+            "type": "title"
+        }
+        response = requests.post(url, headers=headers, json=payload)
+        print(response.json())
+        results.append(response.json())
+
+    technologies = []
+    for result in results:
+        technology = dataset.filter(lambda row: row["name"] == result["title"])
+        technologies.append(technology)
+
+    combined_dataset = concatenate_datasets(technologies)
+    return combined_dataset
+
+
+def select_technologies(problem_technology_list, forced_technology_list=[]):
    distinct_techs = set()
    candidate_map = []
 
+    if len(forced_technology_list) == 0:
+        for problem_data in forced_technology_list:
+            cand_dict = {}
+            for tech_info, sim in problem_data['technologies']:
+                tech_id = tech_info['id']
+                distinct_techs.add(tech_id)
+                cand_dict[tech_id] = float(sim)
+
    for problem_data in problem_technology_list:
        cand_dict = {}
        for tech_info, sim in problem_data['technologies']:
            tech_id = tech_info['id']
            distinct_techs.add(tech_id)
            cand_dict[tech_id] = float(sim)
-        candidate_map.append(cand_dict)
+        if cand_dict not in candidate_map:
+            candidate_map.append(cand_dict)
 
    distinct_techs = sorted(list(distinct_techs))
    n = len(problem_technology_list)
@@ -242,7 +280,6 @@ def add_citations_and_collect_uris(response):
    for i in support.grounding_chunk_indices:
        if i < len(chunks):
            uri = chunks[i].web.uri
-            # Add URI only if not already in text or collected
            if uri not in text and uri not in uris_added:
                citation_links.append(f"[{i + 1}]({uri})")
                uris_added.add(uri)
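Two details in the committed processor.py are worth flagging. In find_best_list_combinations, `data = row_j` binds a reference rather than a copy, so `del data["embeddings"]` mutates the original row; `data = dict(row_j)` would avoid that if the row is reused elsewhere. In select_technologies, the guard `if len(forced_technology_list) == 0:` iterates the forced list only when it is empty, so the forced entries are never actually scanned, and their cand_dict is never appended to candidate_map. A corrected sketch of what the merge appears to intend, assuming forced combinations share the {'technologies': [(tech_info, sim), ...]} shape of the regular ones (the guard and append fixes are my reading, not part of the commit):

    def select_technologies_sketch(problem_technology_list, forced_technology_list=None):
        # None instead of a mutable [] default, which Python shares across calls.
        forced_technology_list = forced_technology_list or []
        distinct_techs = set()
        candidate_map = []

        # Scan forced combinations first (the commit's `== 0` guard skips
        # this exactly when it has work to do), then the regular ones.
        for problem_data in forced_technology_list + problem_technology_list:
            cand_dict = {}
            for tech_info, sim in problem_data["technologies"]:
                tech_id = tech_info["id"]
                distinct_techs.add(tech_id)
                cand_dict[tech_id] = float(sim)
            if cand_dict and cand_dict not in candidate_map:
                candidate_map.append(cand_dict)

        return sorted(distinct_techs), candidate_map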