|
from fuzzywuzzy import fuzz |
|
from google.genai import Client, types |
|
from datasets import load_dataset |
|
import json |
|
import os |
|
|
|
|
|
def search_and_retrieve(user_input, config):
    """Rank every row of the configured dataset against *user_input*.

    Each row is scored with a weighted blend of embedding cosine
    similarity (60%) and fuzzywuzzy token-set ratio (40%). The
    best-scoring row is returned, with the five runners-up attached
    under the ``"top5"`` key.

    Parameters:
        user_input: free-text query string.
        config: dict with keys:
            "dataset" - iterable of rows, each with "name", "purpose"
                        and a precomputed "embeddings" vector,
            "model"   - encoder exposing .encode() and .similarity()
                        (presumably a SentenceTransformer - TODO confirm),
            "type"    - "purpose" to fuzzy-match against the purpose
                        text, anything else matches against the name.

    Returns:
        dict with "title", "purpose", "score" and "top5" keys. For an
        empty dataset (or all-zero scores) the sentinel with score 0 is
        returned; its keys now match the regular result schema (bug fix:
        it previously carried unrelated "technology"/"type" keys).
    """
    dataset = config["dataset"]
    model = config["model"]
    # Decide once which text field the fuzzy matcher compares against,
    # instead of re-testing config["type"] on every row.
    match_on_purpose = config["type"] == "purpose"

    user_embedding = model.encode(user_input)

    # Blend weight: embedding similarity vs. lexical fuzzy score.
    # Hoisted out of the loop - it is loop-invariant.
    alpha = 0.6

    results = []
    # Sentinel uses the same keys as real results so callers always see
    # a consistent schema.
    max_result = {"score": 0, "title": "", "purpose": ""}

    for row in dataset:
        name = row["name"]
        purpose = row["purpose"]

        # Semantic similarity between the stored row embedding and the
        # query embedding (returns a tensor-like scalar; .item() below).
        cosim = model.similarity(row["embeddings"], user_embedding)

        # Lexical overlap, normalized from fuzzywuzzy's 0-100 range.
        target_text = purpose if match_on_purpose else name
        fuzzy_score = fuzz.token_set_ratio(user_input, target_text) / 100

        combined_score = alpha * cosim + (1 - alpha) * fuzzy_score
        result = {"title": name, "purpose": purpose, "score": combined_score.item()}

        if result["score"] > max_result["score"]:
            max_result = result

        results.append(result)

    # Index 0 of the sorted list is max_result itself, so [1:6] yields
    # the five runners-up without duplicating the winner.
    max_result["top5"] = sorted(results, key=lambda x: x['score'], reverse=True)[1:6]

    return max_result
|
|
|
|
|
def _extract_json(text):
    """Pull the single JSON object out of a raw model response.

    The model is asked for a bare JSON object but may wrap it in prose
    or a code fence. Slice from the first "{" to the LAST "}" (rfind),
    so nested objects are no longer truncated at the first closing
    brace (bug fix), then strip newlines before parsing.

    Raises:
        ValueError: if no JSON object delimiters are found.
        json.JSONDecodeError: if the extracted span is not valid JSON.
    """
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end == -1:
        raise ValueError("no JSON object found in model response")
    return json.loads(text[start:end + 1].replace('\n', ''))


def generate_tech(user_input, user_instructions):
    """Ask Gemini to describe the technology discussed in *user_input*.

    Builds a structured-extraction prompt, calls gemini-2.5-flash with
    Google Search grounding enabled, and parses the JSON object out of
    the free-text response.

    Parameters:
        user_input: source text describing a technology.
        user_instructions: optional extra guidance injected verbatim
            into the prompt (may be empty).

    Returns:
        dict parsed from the model's JSON output.
    """
    prompt = f"""
# ROLE

You are a meticulous senior technical analyst and technology scout. Your task is to generate a technology into a structured JSON object.

# OBJECTIVE

Analyze the provided `<USER_INPUT>`. Identify what is technology discussed, focus on the highest level of the technology.
Create a complete JSON object according to the schema below.
Your final output must be a single, valid JSON document containing a technology you created.
The technology should be described with sentences.

# INSTRUCTIONS & RULES

1. **JSON List Output**: Your entire response MUST be a single JSON code block starting with a hyphen (`-`) to denote a list.
Do not include any explanatory text before or after the JSON.
2. **Discover and Iterate**: Your primary task is to understand the technology and create a JSON entry for it.
3. **Descriptive Sentences**: You MUST write clear, full sentences that describe the technology's abilities and the issues it resolves.
Do not use single keywords.
4. **Infer Where Necessary**: The source material may not contain all details. Infer plausible information based on the context.

# YAML SCHEMA & EXAMPLE

Your output must be a list of YAML objects matching this structure. Note how `functional_capabilities` and `problem_types_solved` contain full sentences.

{{"name": "Generative Watermarking"
"purpose": "Add an invisible, machine-readable tags to content generated by AI models and enables the tracing and authentication of digital media to its source."
"problem_types_solved": "Helps to combat digital misinformation by providing a method to verify content authenticity and addresses the erosion of trust in digital media caused by the proliferation of deepfakes."
"advantages": "Way faster to generate by an AI"
"limitations": "Takes a lot of computational time to generate"
"domain_tags": "Present in the domains of : AI ethics, cybersecurity, digital media, content moderation"
}}

Take into account those additionnal informations if there is any:
{user_instructions}
---
***NOW, BEGIN THE TASK.***

<USER_INPUT>
{user_input}
</USER_INPUT>
"""

    # Single client construction (the original built it twice in a row).
    client = Client(api_key=os.getenv("GEMINI_API_KEY"))

    # Let the model ground its answer with live Google Search results.
    grounding_tool = types.Tool(
        google_search=types.GoogleSearch()
    )

    config = types.GenerateContentConfig(
        tools=[grounding_tool]
    )

    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=prompt,
        config=config,
    )

    # One extraction pass (the original sliced/replaced the text twice).
    return _extract_json(response.text)
|
|
|
|
|
def send_to_dataset(data, model):
    """Append one technology record to the hosted dataset.

    Embeds the record's string form with *model*, stores the vector on
    the record under "embeddings", then adds the record to the train
    split of the Hub dataset and pushes the updated split back.

    Parameters:
        data: mutable mapping describing a technology (mutated in place
            to carry its embedding).
        model: encoder exposing .encode() - presumably the same model
            used by search_and_retrieve; TODO confirm.
    """
    # Embed the full record text so it can later be retrieved by
    # similarity search.
    data["embeddings"] = model.encode(str(data))

    train_split = load_dataset("heymenn/Technologies", split="train")
    train_split.add_item(data).push_to_hub("heymenn/Technologies")