ALLOUNE committed
Commit ceaeaf3 · 1 Parent(s): 097e47f
Dockerfile ADDED
@@ -0,0 +1,25 @@
+ # Use an official Python runtime as a parent image
+ FROM python:3.9
+
+ # Create a non-root user for security
+ RUN useradd -m -u 1000 user
+ USER user
+
+ # Set the working directory in the container
+ ENV HOME=/home/user PATH=/home/user/.local/bin:$PATH
+ WORKDIR $HOME/app
+
+ # Copy the requirements file into the container
+ COPY --chown=user ./requirements.txt requirements.txt
+
+ # Install any needed packages specified in requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+ # Copy the rest of the application code into the container
+ COPY --chown=user . .
+
+ # Command to run the application.
+ # Hugging Face Spaces expects the app to listen on port 7860.
+ # The command points to the `app` object inside the `main.py` file at the repo root.
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
+
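To try the image locally, one can build and run it with `docker build -t tech-api .` followed by `docker run -p 7860:7860 tech-api` (the `tech-api` tag is illustrative), then call the endpoints defined in `main.py` below.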
__init__.py ADDED
File without changes
main.py ADDED
@@ -0,0 +1,71 @@
+ # main.py
+ from fastapi import FastAPI, HTTPException
+ import sentence_transformers
+ from huggingface_hub import hf_hub_download
+ import pandas as pd
+
+ from src.processor import send_to_dataset, search_and_retrieve, generate_tech
+ from typing import List, Dict
+ from pydantic import BaseModel
+
+ from datasets import load_dataset
+
+ # This is the main application object that Uvicorn will run
+ app = FastAPI(
+     title="My Standalone API",
+     description="An API hosted on Hugging Face Spaces",
+     version="1.0.0"
+ )
+
+ # Load the embedding model and the technologies dataset once at startup.
+ model = sentence_transformers.SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+ dataset = load_dataset("heymenn/Technologies", split="train")
+
+
+ class SearchInput(BaseModel):
+     title: str
+
+ class SearchOutput(BaseModel):
+     title: str
+     purpose: str
+     score: float
+     top5: List[Dict]
+
+ class GenerateInput(BaseModel):
+     title: str
+     instructions: str
+     force: bool = False
+
+ class GenerateOutput(BaseModel):
+     name: str
+     purpose: str
+     problem_types_solved: str
+     advantages: str
+     limitations: str
+     domain_tags: str
+
+ @app.post("/search-technologies", response_model=SearchOutput)
+ def post_search(payload: SearchInput):
+     """
+     Endpoint that returns the closest matching technology for a title.
+     """
+     config = {"dataset": dataset, "model": model}
+     res = search_and_retrieve(payload.title, config)
+     return res
+
+ @app.post("/generate-technology", response_model=GenerateOutput)
+ def post_generate_and_push(payload: GenerateInput):
+     """
+     Endpoint to generate a technology and push it to the dataset.
+     """
+
+     config = {"dataset": dataset, "model": model}
+     res = search_and_retrieve(payload.title, config)
+     if res["score"] >= 0.7 and not payload.force:
+         raise HTTPException(status_code=409, detail=f"Cannot generate the technology: a similarity score of {res['score']} was found for the existing technology '{res['title']}'. Set force=true to override.")
+
+     json_response = generate_tech(payload.title, payload.instructions)
+
+     send_to_dataset(json_response, model)
+
+     return json_response
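A minimal client sketch against the running container, assuming it is reachable on `localhost:7860` (the payload values and the `requests` dependency are illustrative, not part of the commit):

```python
import requests

BASE = "http://localhost:7860"  # port set in the Dockerfile CMD

# Find the closest existing technology for a title.
search = requests.post(f"{BASE}/search-technologies",
                       json={"title": "generative watermarking"})
body = search.json()
print(body["title"], body["score"])

# Generate a new entry and push it to the dataset.
# force=True bypasses the duplicate guard triggered at score >= 0.7.
gen = requests.post(f"{BASE}/generate-technology",
                    json={"title": "generative watermarking",
                          "instructions": "focus on image generation models",
                          "force": True})
print(gen.json()["name"])
```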
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ fastapi
+ uvicorn[standard]
+ sentence-transformers
+ pandas
+ fuzzywuzzy
+ google-genai
+ datasets
src/__pycache__/processor.cpython-310.pyc ADDED
Binary file (4.28 kB).
src/processor.py ADDED
@@ -0,0 +1,113 @@
+ from fuzzywuzzy import fuzz
+ from google.genai import Client, types
+ import json
+ import os
+ from datasets import load_dataset
+
+
+ def search_and_retrieve(user_input, config):
+     dataset = config["dataset"]
+     model = config["model"]
+
+     user_embedding = model.encode(user_input)
+     results = []
+     max_result = {"score": 0, "technology": "", "type": ""}
+
+     for row in dataset:
+         name = row["name"]
+         purpose = row["purpose"]
+
+         # Blend semantic similarity with a fuzzy string match on the name.
+         cosim = model.similarity(row["embeddings"], user_embedding)
+         token_set_ratio = fuzz.token_set_ratio(user_input, name)
+
+         fuzzy_score = token_set_ratio / 100
+         alpha = 0.6
+         combined_score = alpha * cosim + (1 - alpha) * fuzzy_score
+         result = {"title": name, "purpose": purpose, "score": combined_score.item()}
+         if combined_score > max_result["score"]:
+             max_result = result
+
+         results.append(result)
+
+     # Keep the five next-best matches, skipping the top result itself.
+     top_5 = sorted(results, key=lambda x: x['score'], reverse=True)[1:6]
+     max_result["top5"] = top_5
+
+     return max_result
+
+
+ def generate_tech(user_input, user_instructions):
+     prompt = f"""
+ # ROLE
+
+ You are a meticulous senior technical analyst and technology scout. Your task is to turn a technology into a structured JSON object.
+
+ # OBJECTIVE
+
+ Analyze the provided `<USER_INPUT>`. Identify which technology is discussed and focus on the highest level of that technology.
+ Create a complete JSON object according to the schema below.
+ Your final output must be a single, valid JSON document containing the technology you created.
+ The technology should be described with sentences.
+
+ # INSTRUCTIONS & RULES
+
+ 1. **JSON Output**: Your entire response MUST be a single JSON object.
+    Do not include any explanatory text before or after the JSON.
+ 2. **Discover and Iterate**: Your primary task is to understand the technology and create a JSON entry for it.
+ 3. **Descriptive Sentences**: You MUST write clear, full sentences that describe the technology's abilities and the issues it resolves.
+    Do not use single keywords.
+ 4. **Infer Where Necessary**: The source material may not contain all details. Infer plausible information based on the context.
+
+ # JSON SCHEMA & EXAMPLE
+
+ Your output must be a JSON object matching this structure. Note how `purpose` and `problem_types_solved` contain full sentences.
+
+ {{"name": "Generative Watermarking",
+ "purpose": "Adds invisible, machine-readable tags to content generated by AI models and enables the tracing and authentication of digital media to its source.",
+ "problem_types_solved": "Helps to combat digital misinformation by providing a method to verify content authenticity and addresses the erosion of trust in digital media caused by the proliferation of deepfakes.",
+ "advantages": "Much faster to generate with an AI.",
+ "limitations": "Takes a lot of computational time to generate.",
+ "domain_tags": "Present in the domains of: AI ethics, cybersecurity, digital media, content moderation"
+ }}
+
+ Take the following additional information into account, if any:
+ {user_instructions}
+ ---
+ ***NOW, BEGIN THE TASK.***
+
+ <USER_INPUT>
+ {user_input}
+ </USER_INPUT>
+ """
+
+     # Read the Gemini API key from the environment rather than hardcoding a secret.
+     client = Client(api_key=os.environ["GEMINI_API_KEY"])
+
+     # Define the grounding tool
+     grounding_tool = types.Tool(
+         google_search=types.GoogleSearch()
+     )
+
+     # Configure generation settings
+     config = types.GenerateContentConfig(
+         tools=[grounding_tool]
+     )
+
+     response = client.models.generate_content(
+         model="gemini-2.5-flash",
+         contents=prompt,
+         config=config,
+     )
+
+     # Extract the JSON object from the model's reply and parse it.
+     raw = response.text
+     json_data = json.loads(raw[raw.find("{"):raw.rfind("}") + 1])
+
+     return json_data
+
+
+ def send_to_dataset(data, model):
+     # Embed the full record so it can be searched later.
+     data_embedding = model.encode(str(data))
+     data["embeddings"] = data_embedding.tolist()
+
+     dataset = load_dataset("heymenn/Technologies", split="train")
+     updated_dataset = dataset.add_item(data)
+     updated_dataset.push_to_hub("heymenn/Technologies")
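
A standalone sketch of the hybrid scoring that `search_and_retrieve` applies per row, assuming `sentence-transformers` (v3+, for `model.similarity`) and `fuzzywuzzy` are installed; the query and name values are illustrative:

```python
from fuzzywuzzy import fuzz
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

query = "watermarking for AI-generated images"
name = "Generative Watermarking"

# Semantic similarity between the query and the stored name embedding.
cosim = model.similarity(model.encode(name), model.encode(query)).item()
# Fuzzy token-set match on the raw strings, scaled to [0, 1].
fuzzy = fuzz.token_set_ratio(query, name) / 100
# The same 60/40 blend used in search_and_retrieve (alpha = 0.6).
print(0.6 * cosim + 0.4 * fuzzy)
```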