# Repository page header captured with the file (not part of the module):
# ALLOUNE
# add purpose search
# 62151ed
# raw
# history blame
# 4.43 kB
from fuzzywuzzy import fuzz
from google.genai import Client, types
from datasets import load_dataset
import json
import os
def search_and_retrieve(user_input, config):
    """Score every dataset row against ``user_input`` and return the best one.

    Each row gets a combined score: ``0.6 * similarity + 0.4 * fuzzy``, where
    ``similarity`` comes from ``config["model"].similarity`` on the row's stored
    embedding and the encoded query, and ``fuzzy`` is the fuzzywuzzy
    token-set ratio (scaled to 0..1) between the query and one text field of
    the row.

    Args:
        user_input: Query text.
        config: Mapping with the keys
            ``"dataset"``: iterable of rows exposing ``"name"``, ``"purpose"``
            and ``"embeddings"``;
            ``"model"``: object exposing ``encode(text)`` and
            ``similarity(a, b)``;
            ``"type"``: ``"purpose"`` to fuzzy-match on the row's purpose;
            any other value (or a missing key) fuzzy-matches on the name.

    Returns:
        dict: The best-scoring row as ``{"title", "purpose", "score"}`` plus a
        ``"top5"`` list holding up to five next-best rows in descending score
        order. When no row scores above 0 the placeholder
        ``{"score": 0, "technology": "", "type": ""}`` is returned instead,
        with ``"top5"`` holding the five highest-ranked rows.
    """
    dataset = config["dataset"]
    model = config["model"]
    user_embedding = model.encode(user_input)

    # Loop invariants: blend weight and the row field used for fuzzy matching.
    alpha = 0.6
    fuzzy_field = "purpose" if config.get("type") == "purpose" else "name"

    results = []
    max_result = {"score": 0, "technology": "", "type": ""}
    for row in dataset:
        name = row["name"]
        purpose = row["purpose"]
        cosim = model.similarity(row["embeddings"], user_embedding)
        fuzzy_score = fuzz.token_set_ratio(user_input, row[fuzzy_field]) / 100

        # NOTE(review): relies on ``similarity`` returning a single-element
        # tensor/array so that ``.item()`` yields a plain float.
        score = (alpha * cosim + (1 - alpha) * fuzzy_score).item()

        result = {"title": name, "purpose": purpose, "score": score}
        if score > max_result["score"]:
            max_result = result
        results.append(result)

    ranked = sorted(results, key=lambda entry: entry["score"], reverse=True)
    # Exclude the best match by identity rather than by position: when no row
    # beat the placeholder, the top-ranked row must stay in the list.
    top_5 = [entry for entry in ranked if entry is not max_result][:5]
    max_result["top5"] = top_5
    return max_result
def _extract_json_object(text):
    """Return the first JSON object embedded in ``text`` as a dict.

    Newlines are stripped before parsing (as the caller always did), then the
    object starting at the first ``{`` is decoded. Decoding with
    ``raw_decode`` stops at the object's real closing brace, so nested
    objects and ``}`` characters inside string values are handled, and any
    text after the object is ignored.

    Raises:
        json.JSONDecodeError: If ``text`` is empty/None, contains no ``{``,
            or the text following the first ``{`` is not valid JSON.
    """
    flattened = (text or "").replace("\n", "")
    start = flattened.find("{")
    if start == -1:
        raise json.JSONDecodeError(
            "No JSON object found in model response", flattened, 0
        )
    parsed, _end = json.JSONDecoder().raw_decode(flattened, start)
    return parsed


def generate_tech(user_input, user_instructions):
    """Ask Gemini to describe a technology and return it as a dict.

    Builds a prompt from ``user_input`` and ``user_instructions``, sends it
    to the ``gemini-2.5-flash`` model with the Google Search grounding tool
    enabled, and parses the first JSON object found in the reply.

    Args:
        user_input: Text describing the technology to analyse; inserted in
            the ``<USER_INPUT>`` section of the prompt.
        user_instructions: Extra guidance inserted verbatim into the prompt.

    Returns:
        dict: The JSON object decoded from the model's response.

    Raises:
        json.JSONDecodeError: If the response contains no parseable JSON
            object.
    """
    # NOTE(review): the prompt asks for a hyphen-led list / YAML objects and
    # its example omits the commas between keys, while the reply is parsed
    # below as a single JSON object. Text left unchanged here; worth aligning.
    prompt = f"""
# ROLE
You are a meticulous senior technical analyst and technology scout. Your task is to generate a technology into a structured JSON object.
# OBJECTIVE
Analyze the provided `<USER_INPUT>`. Identify what is technology discussed, focus on the highest level of the technology.
Create a complete JSON object according to the schema below.
Your final output must be a single, valid JSON document containing a technology you created.
The technology should be described with sentences.
# INSTRUCTIONS & RULES
1. **JSON List Output**: Your entire response MUST be a single JSON code block starting with a hyphen (`-`) to denote a list.
Do not include any explanatory text before or after the JSON.
2. **Discover and Iterate**: Your primary task is to understand the technology and create a JSON entry for it.
3. **Descriptive Sentences**: You MUST write clear, full sentences that describe the technology's abilities and the issues it resolves.
Do not use single keywords.
4. **Infer Where Necessary**: The source material may not contain all details. Infer plausible information based on the context.
# YAML SCHEMA & EXAMPLE
Your output must be a list of YAML objects matching this structure. Note how `functional_capabilities` and `problem_types_solved` contain full sentences.
{{"name": "Generative Watermarking"
"purpose": "Add an invisible, machine-readable tags to content generated by AI models and enables the tracing and authentication of digital media to its source."
"problem_types_solved": "Helps to combat digital misinformation by providing a method to verify content authenticity and addresses the erosion of trust in digital media caused by the proliferation of deepfakes."
"advantages": "Way faster to generate by an AI"
"limitations": "Takes a lot of computational time to generate"
"domain_tags": "Present in the domains of : AI ethics, cybersecurity, digital media, content moderation"
}}
Take into account those additionnal informations if there is any:
{user_instructions}
---
***NOW, BEGIN THE TASK.***
<USER_INPUT>
{user_input}
</USER_INPUT>
"""
    # The API key is read from the environment at call time.
    client = Client(api_key=os.getenv("GEMINI_API_KEY"))

    # Define the grounding tool
    grounding_tool = types.Tool(
        google_search=types.GoogleSearch()
    )

    # Configure generation settings
    config = types.GenerateContentConfig(
        tools=[grounding_tool]
    )

    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=prompt,
        config=config,
    )

    return _extract_json_object(response.text)
def send_to_dataset(data, model, repo_id="heymenn/Technologies"):
    """Embed ``data``, append it to a Hub dataset and push the result.

    Args:
        data: Mutable mapping describing one record. It is modified in place:
            an ``"embeddings"`` entry holding ``model.encode(str(data))`` is
            added before the record is appended.
        model: Object exposing ``encode(text)``.
        repo_id: Hub dataset repository to load the ``train`` split from and
            push the extended dataset back to. Defaults to the repository
            this function originally targeted.

    Returns:
        None.
    """
    # The embedding is computed from the record's string form *before* the
    # "embeddings" key is attached.
    data["embeddings"] = model.encode(str(data))

    # NOTE(review): this is a load -> append -> push of the whole split;
    # presumably concurrent callers could overwrite each other - confirm.
    dataset = load_dataset(repo_id, split="train")
    updated_dataset = dataset.add_item(data)
    updated_dataset.push_to_hub(repo_id)