ALLOUNE
committed on
Commit
·
7d1249d
1
Parent(s):
7a7787a
add faiss search
Browse files
- main.py +10 -9
- requirements.txt +3 -1
- src/processor.py +18 -20
main.py
CHANGED
@@ -1,16 +1,18 @@
|
|
1 |
# api/main.py
|
2 |
-
from fastapi import FastAPI, HTTPException
|
3 |
import sentence_transformers
|
4 |
-
from huggingface_hub import hf_hub_download, login
|
5 |
import pandas as pd
|
|
|
6 |
|
|
|
|
|
7 |
from src.processor import send_to_dataset,search_and_retrieve,generate_tech
|
8 |
from typing import List, Dict
|
9 |
from pydantic import BaseModel
|
10 |
-
|
11 |
from datasets import load_dataset
|
|
|
|
|
|
|
12 |
|
13 |
-
import os
|
14 |
|
15 |
login(token=os.getenv("HF_TOKEN"))
|
16 |
|
@@ -23,12 +25,11 @@ app = FastAPI(
|
|
23 |
|
24 |
|
25 |
model = sentence_transformers.SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
|
26 |
-
dataset = load_dataset("OrganizedProgrammers/Technologies",
|
27 |
-
|
28 |
|
29 |
class SearchInput(BaseModel):
|
30 |
title: str
|
31 |
-
type: str = "title"
|
32 |
|
33 |
class SearchOutput(BaseModel):
|
34 |
title: str
|
@@ -54,7 +55,7 @@ def post_search(payload: SearchInput):
|
|
54 |
"""
|
55 |
Endpoint that returns a search result.
|
56 |
"""
|
57 |
-
config = {"dataset": dataset, "model": model
|
58 |
res = search_and_retrieve(payload.title, config)
|
59 |
return res
|
60 |
|
@@ -64,7 +65,7 @@ def post_generate_and_push(payload: GenerateInput):
|
|
64 |
Endpoint to generate a technology and push it to the dataset
|
65 |
"""
|
66 |
|
67 |
-
config = {"dataset": dataset, "model": model
|
68 |
res = search_and_retrieve(payload.title, config)
|
69 |
if res["score"] >= 0.7 and not payload.force:
|
70 |
raise HTTPException(status_code=500, detail=f"Cannot generate the technology a high score of {res['score']} have been found for the technology : {res['title']}")
|
|
|
1 |
# api/main.py
|
|
|
2 |
import sentence_transformers
|
|
|
3 |
import pandas as pd
|
4 |
+
import os
|
5 |
|
6 |
+
from fastapi import FastAPI, HTTPException
|
7 |
+
from huggingface_hub import hf_hub_download, login
|
8 |
from src.processor import send_to_dataset,search_and_retrieve,generate_tech
|
9 |
from typing import List, Dict
|
10 |
from pydantic import BaseModel
|
|
|
11 |
from datasets import load_dataset
|
12 |
+
from dotenv import load_dotenv
|
13 |
+
|
14 |
+
load_dotenv()
|
15 |
|
|
|
16 |
|
17 |
login(token=os.getenv("HF_TOKEN"))
|
18 |
|
|
|
25 |
|
26 |
|
27 |
model = sentence_transformers.SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
|
28 |
+
dataset = load_dataset("OrganizedProgrammers/Technologies", split="train")
|
29 |
+
dataset.add_faiss_index(column='embeddings')
|
30 |
|
31 |
class SearchInput(BaseModel):
|
32 |
title: str
|
|
|
33 |
|
34 |
class SearchOutput(BaseModel):
|
35 |
title: str
|
|
|
55 |
"""
|
56 |
Endpoint that returns a search result.
|
57 |
"""
|
58 |
+
config = {"dataset": dataset, "model": model}
|
59 |
res = search_and_retrieve(payload.title, config)
|
60 |
return res
|
61 |
|
|
|
65 |
Endpoint to generate a technology and push it to the dataset
|
66 |
"""
|
67 |
|
68 |
+
config = {"dataset": dataset, "model": model}
|
69 |
res = search_and_retrieve(payload.title, config)
|
70 |
if res["score"] >= 0.7 and not payload.force:
|
71 |
raise HTTPException(status_code=500, detail=f"Cannot generate the technology a high score of {res['score']} have been found for the technology : {res['title']}")
|
requirements.txt
CHANGED
@@ -4,4 +4,6 @@ sentence_transformers
|
|
4 |
pandas
|
5 |
fuzzywuzzy
|
6 |
google.genai
|
7 |
-
datasets
|
|
|
|
|
|
4 |
pandas
|
5 |
fuzzywuzzy
|
6 |
google.genai
|
7 |
+
datasets
|
8 |
+
faiss-cpu
|
9 |
+
python-dotenv
|
src/processor.py
CHANGED
@@ -10,33 +10,31 @@ def search_and_retrieve(user_input, config):
|
|
10 |
model = config["model"]
|
11 |
|
12 |
user_embedding = model.encode(user_input)
|
13 |
-
results =
|
14 |
-
max_result = {"score":0, "technology": "", "type":""}
|
15 |
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
|
20 |
-
|
21 |
-
if config["type"] == "purpose":
|
22 |
-
token_set_ratio = fuzz.token_set_ratio(user_input, purpose)
|
23 |
-
else:
|
24 |
-
token_set_ratio = fuzz.token_set_ratio(user_input, name)
|
25 |
|
26 |
-
|
27 |
-
|
28 |
-
combined_score = alpha * cosim + (1 - alpha) * fuzzy_score
|
29 |
-
result = {"title": name, "purpose": purpose, "score": combined_score.item()}
|
30 |
-
if combined_score > max_result["score"]:
|
31 |
-
max_result = result
|
32 |
|
33 |
-
|
|
|
|
|
34 |
|
|
|
35 |
|
36 |
-
|
37 |
-
|
|
|
|
|
|
|
|
|
38 |
|
39 |
-
return
|
40 |
|
41 |
|
42 |
def generate_tech(user_input, user_instructions):
|
|
|
10 |
model = config["model"]
|
11 |
|
12 |
user_embedding = model.encode(user_input)
|
13 |
+
results = dataset.get_nearest_examples('embeddings', user_embedding, k=5)
|
|
|
14 |
|
15 |
+
s=results.scores
|
16 |
+
t=results.examples
|
17 |
+
n = len(t['name'])
|
18 |
|
19 |
+
result = []
|
|
|
|
|
|
|
|
|
20 |
|
21 |
+
for i in range(n):
|
22 |
+
item = {}
|
|
|
|
|
|
|
|
|
23 |
|
24 |
+
for key, value in t.items():
|
25 |
+
if key!="embeddings":
|
26 |
+
item[key] = value[i]
|
27 |
|
28 |
+
result.append(item)
|
29 |
|
30 |
+
for i,r in enumerate(result):
|
31 |
+
r["score"]=float(s[i])
|
32 |
+
|
33 |
+
final_output = {"title": result[0]["name"], "purpose": result[0]["purpose"], "score": result[0]["score"]}
|
34 |
+
final_output["top5"] = result
|
35 |
+
print(final_output)
|
36 |
|
37 |
+
return final_output
|
38 |
|
39 |
|
40 |
def generate_tech(user_input, user_instructions):
|