ALLOUNE committed · 7d1249d
Parent(s): 7a7787a

add faiss search

Files changed:
- main.py (+10 -9)
- requirements.txt (+3 -1)
- src/processor.py (+18 -20)
main.py
CHANGED
@@ -1,16 +1,18 @@
 # api/main.py
-from fastapi import FastAPI, HTTPException
 import sentence_transformers
-from huggingface_hub import hf_hub_download, login
 import pandas as pd
+import os
 
+from fastapi import FastAPI, HTTPException
+from huggingface_hub import hf_hub_download, login
 from src.processor import send_to_dataset,search_and_retrieve,generate_tech
 from typing import List, Dict
 from pydantic import BaseModel
-
 from datasets import load_dataset
+from dotenv import load_dotenv
+
+load_dotenv()
 
-import os
 
 login(token=os.getenv("HF_TOKEN"))
 
@@ -23,12 +25,11 @@ app = FastAPI(
 
 
 model = sentence_transformers.SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
-dataset = load_dataset("OrganizedProgrammers/Technologies",
-
+dataset = load_dataset("OrganizedProgrammers/Technologies", split="train")
+dataset.add_faiss_index(column='embeddings')
 
 class SearchInput(BaseModel):
     title: str
-    type: str = "title"
 
 class SearchOutput(BaseModel):
     title: str
@@ -54,7 +55,7 @@ def post_search(payload: SearchInput):
     """
     Endpoint that returns a search result.
    """
-    config = {"dataset": dataset, "model": model
+    config = {"dataset": dataset, "model": model}
     res = search_and_retrieve(payload.title, config)
     return res
 
@@ -64,7 +65,7 @@ def post_generate_and_push(payload: GenerateInput):
     Endpoint to generate a technology and push it to the dataset
     """
 
-    config = {"dataset": dataset, "model": model
+    config = {"dataset": dataset, "model": model}
     res = search_and_retrieve(payload.title, config)
     if res["score"] >= 0.7 and not payload.force:
         raise HTTPException(status_code=500, detail=f"Cannot generate the technology a high score of {res['score']} have been found for the technology : {res['title']}")
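A note on the index: dataset.add_faiss_index(column='embeddings') assumes the "OrganizedProgrammers/Technologies" train split already carries a precomputed 'embeddings' column; that column is not created in this commit. A minimal sketch of how it could be produced with the same MiniLM model follows (the map-based encoding and the choice to embed the 'name' field are assumptions for illustration, not part of the repository):

# Sketch: build the 'embeddings' column the FAISS index relies on.
# Assumption: each row has a 'name' field, as src/processor.py suggests.
import sentence_transformers
from datasets import load_dataset

model = sentence_transformers.SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
ds = load_dataset("OrganizedProgrammers/Technologies", split="train")

def embed(batch):
    # Encode a batch of names into vectors; store them as plain lists.
    batch["embeddings"] = model.encode(batch["name"]).tolist()
    return batch

ds = ds.map(embed, batched=True)
ds.add_faiss_index(column="embeddings")  # same call main.py now makes at startup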
requirements.txt
CHANGED
@@ -4,4 +4,6 @@ sentence_transformers
 pandas
 fuzzywuzzy
 google.genai
-datasets
+datasets
+faiss-cpu
+python-dotenv
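The added faiss-cpu backs the new add_faiss_index/get_nearest_examples calls, and python-dotenv backs the load_dotenv() call in main.py, so login(token=os.getenv("HF_TOKEN")) can read the token from a local file during development. A minimal sketch of that pattern, assuming a .env file next to main.py with a placeholder token:

# .env (not committed):
#   HF_TOKEN=hf_xxx   <- placeholder, use a real token locally
import os
from dotenv import load_dotenv

load_dotenv()                  # loads .env into the process environment, if present
token = os.getenv("HF_TOKEN")  # None when the variable is missing
if token is None:
    raise RuntimeError("HF_TOKEN is not set; add it to .env or the environment")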
src/processor.py
CHANGED
@@ -10,33 +10,31 @@ def search_and_retrieve(user_input, config):
     model = config["model"]
 
     user_embedding = model.encode(user_input)
-    results =
-    max_result = {"score":0, "technology": "", "type":""}
+    results = dataset.get_nearest_examples('embeddings', user_embedding, k=5)
 
-
-
-
+    s=results.scores
+    t=results.examples
+    n = len(t['name'])
 
-
-    if config["type"] == "purpose":
-        token_set_ratio = fuzz.token_set_ratio(user_input, purpose)
-    else:
-        token_set_ratio = fuzz.token_set_ratio(user_input, name)
+    result = []
 
-
-
-    combined_score = alpha * cosim + (1 - alpha) * fuzzy_score
-    result = {"title": name, "purpose": purpose, "score": combined_score.item()}
-    if combined_score > max_result["score"]:
-        max_result = result
+    for i in range(n):
+        item = {}
 
-
+        for key, value in t.items():
+            if key!="embeddings":
+                item[key] = value[i]
 
+        result.append(item)
 
-
-
+    for i,r in enumerate(result):
+        r["score"]=float(s[i])
+
+    final_output = {"title": result[0]["name"], "purpose": result[0]["purpose"], "score": result[0]["score"]}
+    final_output["top5"] = result
+    print(final_output)
 
-    return
+    return final_output
 
 
 def generate_tech(user_input, user_instructions):
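The reworked search_and_retrieve delegates ranking to Dataset.get_nearest_examples, which returns parallel scores and examples (a dict of columns); the loop above transposes that column dict into a list of row dicts and attaches the FAISS score to each. A small self-contained sketch of the same transposition, with made-up values standing in for results.examples and results.scores (column names mirror the diff; the data is illustrative only):

# Stand-ins for what get_nearest_examples returns for k=2 neighbours.
examples = {
    "name": ["FAISS", "Annoy"],
    "purpose": ["vector similarity search", "approximate nearest neighbours"],
    "embeddings": [[0.1, 0.2], [0.3, 0.4]],
}
scores = [0.92, 0.55]

rows = []
for i in range(len(examples["name"])):
    # Keep every column except the raw embedding vector, as the diff does.
    row = {key: values[i] for key, values in examples.items() if key != "embeddings"}
    row["score"] = float(scores[i])
    rows.append(row)

best = rows[0]
print({"title": best["name"], "purpose": best["purpose"], "score": best["score"], "top5": rows})

One caveat worth flagging: depending on how the FAISS index is built, get_nearest_examples scores can be distances (lower is better) rather than similarities, in which case the res["score"] >= 0.7 guard in post_generate_and_push would not mean what it appears to; worth verifying against the index's metric.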