ALLOUNE
committed on
Commit
·
ceaeaf3
1
Parent(s):
097e47f
add app
Browse files- Dockerfile +25 -0
- __init__.py +0 -0
- main.py +71 -0
- requirements.txt +7 -0
- src/__pycache__/processor.cpython-310.pyc +0 -0
- src/processor.py +113 -0
Dockerfile
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Use an official Python runtime as a parent image
FROM python:3.9

# Create a non-root user for security
RUN useradd -m -u 1000 user
USER user

# Set the working directory in the container
ENV HOME=/home/user PATH=/home/user/.local/bin:$PATH
WORKDIR $HOME/app

# Copy the requirements file into the container
COPY --chown=user ./requirements.txt requirements.txt

# Install any needed packages specified in requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Copy the rest of the application code into the container
COPY --chown=user . .

# Command to run the application.
# Hugging Face Spaces expects the app to listen on port 7860.
# BUG FIX: this commit adds `main.py` at the repository root with a FastAPI
# object named `app` — there is no `api/main.py` or `api_app` in the tree, so
# the old module path `api.main:api_app` would make uvicorn fail at startup.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
__init__.py
ADDED
|
File without changes
|
main.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# api/main.py
"""FastAPI entry point.

Wires the shared sentence-embedding model and the hosted Technologies
dataset into the search / generate endpoints defined below.
"""
from typing import Dict, List

import pandas as pd
import sentence_transformers
from datasets import load_dataset
from fastapi import FastAPI, HTTPException
from huggingface_hub import hf_hub_download
from pydantic import BaseModel

from src.processor import generate_tech, search_and_retrieve, send_to_dataset

# This is the main application object that Uvicorn will run.
app = FastAPI(
    title="My Standalone API",
    description="An API hosted on Hugging Face Spaces",
    version="1.0.0",
)

# Loaded once at import time and shared by every request handler.
model = sentence_transformers.SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2"
)
dataset = load_dataset("heymenn/Technologies", split="train")
class SearchInput(BaseModel):
    """Request body for POST /search-technologies."""

    title: str


class SearchOutput(BaseModel):
    """Best match returned by the search, plus runner-up candidates."""

    title: str
    purpose: str
    score: float
    # The next-best candidates after the top match (see search_and_retrieve).
    top5: List[Dict]


class GenerateInput(BaseModel):
    """Request body for POST /generate-technology."""

    title: str
    instructions: str
    # When True, generate even if a near-duplicate entry already exists.
    force: bool = False


class GenerateOutput(BaseModel):
    """Structured description of a generated technology entry."""

    name: str
    purpose: str
    problem_types_solved: str
    advantages: str
    limitations: str
    domain_tags: str
@app.post("/search-technologies", response_model=SearchOutput)
def post_search(payload: SearchInput):
    """Return the technology entry closest to ``payload.title``.

    Delegates to :func:`search_and_retrieve`, which combines embedding
    similarity with fuzzy name matching over the shared dataset.
    """
    return search_and_retrieve(
        payload.title,
        {"dataset": dataset, "model": model},
    )
@app.post("/generate-technology", response_model=GenerateOutput)
def post_generate_and_push(payload: GenerateInput):
    """Generate a technology entry with the LLM and push it to the dataset.

    Refuses to generate when an existing entry matches the requested title
    too closely (combined score >= 0.7), unless ``payload.force`` is set.

    Raises:
        HTTPException: 409 when a near-duplicate already exists and
            ``force`` is False.
    """
    config = {"dataset": dataset, "model": model}
    res = search_and_retrieve(payload.title, config)
    # A near-duplicate is a client-resolvable conflict, not a server fault:
    # 409 Conflict (the original 500 misreported it as an internal error).
    if res["score"] >= 0.7 and not payload.force:
        raise HTTPException(
            status_code=409,
            detail=(
                f"Cannot generate the technology: a high score of "
                f"{res['score']} has been found for the technology: "
                f"{res['title']}"
            ),
        )

    json_response = generate_tech(payload.title, payload.instructions)
    send_to_dataset(json_response, model)
    return json_response
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
fastapi
uvicorn[standard]
sentence-transformers
pandas
fuzzywuzzy
google-genai
datasets
src/__pycache__/processor.cpython-310.pyc
ADDED
|
Binary file (4.28 kB). View file
|
|
|
src/processor.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fuzzywuzzy import fuzz
|
| 2 |
+
from google.genai import Client, types
|
| 3 |
+
import json
|
| 4 |
+
from datasets import load_dataset
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def search_and_retrieve(user_input, config):
|
| 8 |
+
dataset = config["dataset"]
|
| 9 |
+
model = config["model"]
|
| 10 |
+
|
| 11 |
+
user_embedding = model.encode(user_input)
|
| 12 |
+
results = []
|
| 13 |
+
max_result = {"score":0, "technology": "", "type":""}
|
| 14 |
+
|
| 15 |
+
for row in dataset:
|
| 16 |
+
name = row["name"]
|
| 17 |
+
purpose = row["purpose"]
|
| 18 |
+
|
| 19 |
+
cosim = model.similarity(row["embeddings"], user_embedding)
|
| 20 |
+
token_set_ratio = fuzz.token_set_ratio(user_input, name)
|
| 21 |
+
|
| 22 |
+
fuzzy_score = token_set_ratio / 100
|
| 23 |
+
alpha = 0.6
|
| 24 |
+
combined_score = alpha * cosim + (1 - alpha) * fuzzy_score
|
| 25 |
+
result = {"title": name, "purpose": purpose, "score": combined_score.item()}
|
| 26 |
+
if combined_score > max_result["score"]:
|
| 27 |
+
max_result = result
|
| 28 |
+
|
| 29 |
+
results.append(result)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
top_5 = sorted(results, key=lambda x: x['score'], reverse=True)[1:6]
|
| 33 |
+
max_result["top5"] = top_5
|
| 34 |
+
|
| 35 |
+
return max_result
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def generate_tech(user_input, user_instructions):
|
| 39 |
+
prompt = f"""
|
| 40 |
+
# ROLE
|
| 41 |
+
|
| 42 |
+
You are a meticulous senior technical analyst and technology scout. Your task is to generate a technology into a structured JSON object.
|
| 43 |
+
|
| 44 |
+
# OBJECTIVE
|
| 45 |
+
|
| 46 |
+
Analyze the provided `<USER_INPUT>`. Identify what is technology discussed, focus on the highest level of the technology.
|
| 47 |
+
Create a complete JSON object according to the schema below.
|
| 48 |
+
Your final output must be a single, valid JSON document containing a technology you created.
|
| 49 |
+
The technology should be described with sentences.
|
| 50 |
+
|
| 51 |
+
# INSTRUCTIONS & RULES
|
| 52 |
+
|
| 53 |
+
1. **JSON List Output**: Your entire response MUST be a single JSON code block starting with a hyphen (`-`) to denote a list.
|
| 54 |
+
Do not include any explanatory text before or after the JSON.
|
| 55 |
+
2. **Discover and Iterate**: Your primary task is to understand the technology and create a JSON entry for it.
|
| 56 |
+
3. **Descriptive Sentences**: You MUST write clear, full sentences that describe the technology's abilities and the issues it resolves.
|
| 57 |
+
Do not use single keywords.
|
| 58 |
+
4. **Infer Where Necessary**: The source material may not contain all details. Infer plausible information based on the context.
|
| 59 |
+
|
| 60 |
+
# YAML SCHEMA & EXAMPLE
|
| 61 |
+
|
| 62 |
+
Your output must be a list of YAML objects matching this structure. Note how `functional_capabilities` and `problem_types_solved` contain full sentences.
|
| 63 |
+
|
| 64 |
+
{{"name": "Generative Watermarking"
|
| 65 |
+
"purpose": "Add an invisible, machine-readable tags to content generated by AI models and enables the tracing and authentication of digital media to its source."
|
| 66 |
+
"problem_types_solved": "Helps to combat digital misinformation by providing a method to verify content authenticity and addresses the erosion of trust in digital media caused by the proliferation of deepfakes."
|
| 67 |
+
"advantages": "Way faster to generate by an AI"
|
| 68 |
+
"limitations": "Takes a lot of computational time to generate"
|
| 69 |
+
"domain_tags": "Present in the domains of : AI ethics, cybersecurity, digital media, content moderation"
|
| 70 |
+
}}
|
| 71 |
+
|
| 72 |
+
Take into account those additionnal informations if there is any:
|
| 73 |
+
{user_instructions}
|
| 74 |
+
---
|
| 75 |
+
***NOW, BEGIN THE TASK.***
|
| 76 |
+
|
| 77 |
+
<USER_INPUT>
|
| 78 |
+
{user_input}
|
| 79 |
+
</USER_INPUT>
|
| 80 |
+
"""
|
| 81 |
+
|
| 82 |
+
client = Client(api_key="AIzaSyCHcw26RRXZAlb1twhc_cRkoqCFW3e8QKk")
|
| 83 |
+
|
| 84 |
+
# Define the grounding tool
|
| 85 |
+
grounding_tool = types.Tool(
|
| 86 |
+
google_search=types.GoogleSearch()
|
| 87 |
+
)
|
| 88 |
+
|
| 89 |
+
# Configure generation settings
|
| 90 |
+
config = types.GenerateContentConfig(
|
| 91 |
+
tools=[grounding_tool]
|
| 92 |
+
)
|
| 93 |
+
|
| 94 |
+
response = client.models.generate_content(
|
| 95 |
+
model="gemini-2.5-flash",
|
| 96 |
+
contents=prompt,
|
| 97 |
+
config=config,
|
| 98 |
+
)
|
| 99 |
+
|
| 100 |
+
data = response.text
|
| 101 |
+
data = data[data.find("{"):data.find("}")+1].replace('\n','')
|
| 102 |
+
json_data = json.loads(data[data.find("{"):data.find("}")+1].replace('\n',''))
|
| 103 |
+
|
| 104 |
+
return json_data
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def send_to_dataset(data, model):
|
| 108 |
+
data_embedding = model.encode(str(data))
|
| 109 |
+
data["embeddings"] = data_embedding
|
| 110 |
+
|
| 111 |
+
dataset = load_dataset("heymenn/Technologies", split="train")
|
| 112 |
+
updated_dataset = dataset.add_item(data)
|
| 113 |
+
updated_dataset.push_to_hub("heymenn/Technologies")
|