ALLOUNE committed on
Commit
7d1249d
·
1 Parent(s): 7a7787a

add faiss search

Browse files
Files changed (3) hide show
  1. main.py +10 -9
  2. requirements.txt +3 -1
  3. src/processor.py +18 -20
main.py CHANGED
@@ -1,16 +1,18 @@
1
  # api/main.py
2
- from fastapi import FastAPI, HTTPException
3
  import sentence_transformers
4
- from huggingface_hub import hf_hub_download, login
5
  import pandas as pd
 
6
 
 
 
7
  from src.processor import send_to_dataset,search_and_retrieve,generate_tech
8
  from typing import List, Dict
9
  from pydantic import BaseModel
10
-
11
  from datasets import load_dataset
 
 
 
12
 
13
- import os
14
 
15
  login(token=os.getenv("HF_TOKEN"))
16
 
@@ -23,12 +25,11 @@ app = FastAPI(
23
 
24
 
25
  model = sentence_transformers.SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
26
- dataset = load_dataset("OrganizedProgrammers/Technologies", streaming=True, split="train")
27
-
28
 
29
  class SearchInput(BaseModel):
30
  title: str
31
- type: str = "title"
32
 
33
  class SearchOutput(BaseModel):
34
  title: str
@@ -54,7 +55,7 @@ def post_search(payload: SearchInput):
54
  """
55
  Endpoint that returns a search result.
56
  """
57
- config = {"dataset": dataset, "model": model, "type": payload.type}
58
  res = search_and_retrieve(payload.title, config)
59
  return res
60
 
@@ -64,7 +65,7 @@ def post_generate_and_push(payload: GenerateInput):
64
  Endpoint to generate a technology and push it to the dataset
65
  """
66
 
67
- config = {"dataset": dataset, "model": model, "type": "title"}
68
  res = search_and_retrieve(payload.title, config)
69
  if res["score"] >= 0.7 and not payload.force:
70
  raise HTTPException(status_code=500, detail=f"Cannot generate the technology a high score of {res['score']} have been found for the technology : {res['title']}")
 
1
  # api/main.py
 
2
  import sentence_transformers
 
3
  import pandas as pd
4
+ import os
5
 
6
+ from fastapi import FastAPI, HTTPException
7
+ from huggingface_hub import hf_hub_download, login
8
  from src.processor import send_to_dataset,search_and_retrieve,generate_tech
9
  from typing import List, Dict
10
  from pydantic import BaseModel
 
11
  from datasets import load_dataset
12
+ from dotenv import load_dotenv
13
+
14
+ load_dotenv()
15
 
 
16
 
17
  login(token=os.getenv("HF_TOKEN"))
18
 
 
25
 
26
 
27
  model = sentence_transformers.SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
28
+ dataset = load_dataset("OrganizedProgrammers/Technologies", split="train")
29
+ dataset.add_faiss_index(column='embeddings')
30
 
31
  class SearchInput(BaseModel):
32
  title: str
 
33
 
34
  class SearchOutput(BaseModel):
35
  title: str
 
55
  """
56
  Endpoint that returns a search result.
57
  """
58
+ config = {"dataset": dataset, "model": model}
59
  res = search_and_retrieve(payload.title, config)
60
  return res
61
 
 
65
  Endpoint to generate a technology and push it to the dataset
66
  """
67
 
68
+ config = {"dataset": dataset, "model": model}
69
  res = search_and_retrieve(payload.title, config)
70
  if res["score"] >= 0.7 and not payload.force:
71
  raise HTTPException(status_code=500, detail=f"Cannot generate the technology a high score of {res['score']} have been found for the technology : {res['title']}")
requirements.txt CHANGED
@@ -4,4 +4,6 @@ sentence_transformers
4
  pandas
5
  fuzzywuzzy
6
  google.genai
7
- datasets
 
 
 
4
  pandas
5
  fuzzywuzzy
6
  google.genai
7
+ datasets
8
+ faiss-cpu
9
+ python-dotenv
src/processor.py CHANGED
@@ -10,33 +10,31 @@ def search_and_retrieve(user_input, config):
10
  model = config["model"]
11
 
12
  user_embedding = model.encode(user_input)
13
- results = []
14
- max_result = {"score":0, "technology": "", "type":""}
15
 
16
- for row in dataset:
17
- name = row["name"]
18
- purpose = row["purpose"]
19
 
20
- cosim = model.similarity(row["embeddings"], user_embedding)
21
- if config["type"] == "purpose":
22
- token_set_ratio = fuzz.token_set_ratio(user_input, purpose)
23
- else:
24
- token_set_ratio = fuzz.token_set_ratio(user_input, name)
25
 
26
- fuzzy_score = token_set_ratio / 100
27
- alpha = 0.6
28
- combined_score = alpha * cosim + (1 - alpha) * fuzzy_score
29
- result = {"title": name, "purpose": purpose, "score": combined_score.item()}
30
- if combined_score > max_result["score"]:
31
- max_result = result
32
 
33
- results.append(result)
 
 
34
 
 
35
 
36
- top_5 = sorted(results, key=lambda x: x['score'], reverse=True)[1:6]
37
- max_result["top5"] = top_5
 
 
 
 
38
 
39
- return max_result
40
 
41
 
42
  def generate_tech(user_input, user_instructions):
 
10
  model = config["model"]
11
 
12
  user_embedding = model.encode(user_input)
13
+ results = dataset.get_nearest_examples('embeddings', user_embedding, k=5)
 
14
 
15
+ s=results.scores
16
+ t=results.examples
17
+ n = len(t['name'])
18
 
19
+ result = []
 
 
 
 
20
 
21
+ for i in range(n):
22
+ item = {}
 
 
 
 
23
 
24
+ for key, value in t.items():
25
+ if key!="embeddings":
26
+ item[key] = value[i]
27
 
28
+ result.append(item)
29
 
30
+ for i,r in enumerate(result):
31
+ r["score"]=float(s[i])
32
+
33
+ final_output = {"title": result[0]["name"], "purpose": result[0]["purpose"], "score": result[0]["score"]}
34
+ final_output["top5"] = result
35
+ print(final_output)
36
 
37
+ return final_output
38
 
39
 
40
  def generate_tech(user_input, user_instructions):