|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import subprocess |
|
import sys |
|
|
|
def install_packages():

    """Install required packages if they are not already importable."""

    # Map pip package names to their import names where they differ (e.g. python-dotenv -> dotenv).
    packages = {"openai": "openai", "gradio": "gradio", "python-dotenv": "dotenv", "requests": "requests", "pandas": "pandas"}

    for package, import_name in packages.items():

        try:

            __import__(import_name)

        except ImportError:

            print(f"Installing {package}...")

            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
|
|
|
|
|
# Ensure required dependencies are installed before the heavier imports below.
install_packages()
|
|
|
|
|
import gradio as gr |
|
import json |
|
import random |
|
import re |
|
import time |
|
import os |
|
import io |
|
import zipfile |
|
from datetime import datetime |
|
from typing import Dict, List, Any, Optional, Tuple |
|
from openai import OpenAI |
|
import pandas as pd |
|
|
|
|
|
|
|
|
|
|
|
class MedicalLiteratureSimulator: |
|
"""Simulates medical literature research for health dataset generation""" |
|
|
|
def __init__(self): |
|
self.research_domains = { |
|
"longevity": { |
|
"interventions": ["NAD+ supplementation", "resveratrol", "metformin", "caloric restriction"], |
|
"biomarkers": ["telomere length", "cellular senescence", "inflammatory markers", "mitochondrial function"], |
|
"outcomes": ["biological age reduction", "improved healthspan", "enhanced cellular repair"] |
|
}, |
|
"metabolic_health": { |
|
"interventions": ["berberine", "intermittent fasting", "alpha-lipoic acid", "chromium"], |
|
"biomarkers": ["glucose levels", "insulin sensitivity", "HbA1c", "HOMA-IR"], |
|
"outcomes": ["improved glucose control", "enhanced insulin sensitivity", "reduced inflammation"] |
|
}, |
|
"cardiovascular": { |
|
"interventions": ["omega-3 fatty acids", "coenzyme Q10", "magnesium", "nattokinase"], |
|
"biomarkers": ["blood pressure", "cholesterol levels", "CRP", "endothelial function"], |
|
"outcomes": ["reduced blood pressure", "improved lipid profile", "decreased inflammation"] |
|
}, |
|
"cognitive": { |
|
"interventions": ["lion's mane mushroom", "phosphatidylserine", "bacopa monnieri", "acetyl-L-carnitine"], |
|
"biomarkers": ["cognitive performance", "BDNF levels", "neuroinflammation", "memory function"], |
|
"outcomes": ["enhanced memory", "improved cognitive function", "neuroprotection"] |
|
}, |
|
"hormonal": { |
|
"interventions": ["ashwagandha", "vitamin D", "DHEA", "maca root"], |
|
"biomarkers": ["cortisol levels", "thyroid hormones", "sex hormones", "stress markers"], |
|
"outcomes": ["hormone balance", "improved energy", "better sleep quality"] |
|
}, |
|
"inflammation": { |
|
"interventions": ["curcumin", "omega-3", "quercetin", "boswellia"], |
|
"biomarkers": ["CRP", "IL-6", "TNF-alpha", "oxidative stress"], |
|
"outcomes": ["reduced inflammation", "improved immune function", "enhanced recovery"] |
|
} |
|
} |
|
|
|
def generate_study_data(self, domain: str) -> Dict[str, Any]: |
|
"""Generate realistic medical study data""" |
|
if domain not in self.research_domains: |
|
domain = "longevity" |
|
|
|
domain_data = self.research_domains[domain] |
|
|
|
study = { |
|
"pmid": f"PMID{random.randint(35000000, 40000000)}", |
|
"title": self._generate_study_title(domain, domain_data), |
|
"abstract": self._generate_study_abstract(domain, domain_data), |
|
"journal": random.choice([ |
|
"Nature Medicine", "Cell Metabolism", "Journal of Clinical Medicine", |
|
"Circulation", "Aging Cell", "Nutrients", "Clinical Nutrition" |
|
]), |
|
"year": random.choice([2023, 2024]), |
|
"domain": domain, |
|
"interventions": random.sample(domain_data["interventions"], min(2, len(domain_data["interventions"]))), |
|
"biomarkers": random.sample(domain_data["biomarkers"], min(3, len(domain_data["biomarkers"]))), |
|
"outcomes": random.sample(domain_data["outcomes"], min(2, len(domain_data["outcomes"]))), |
|
"participant_count": random.randint(50, 300), |
|
"duration_weeks": random.choice([8, 12, 16, 24]), |
|
"dosages": self._generate_dosages(domain_data["interventions"][0]) |
|
} |
|
|
|
return study |
|
|
|
def _generate_study_title(self, domain: str, domain_data: Dict) -> str: |
|
intervention = random.choice(domain_data["interventions"]) |
|
outcome = random.choice(domain_data["outcomes"]) |
|
|
|
titles = [ |
|
f"Effects of {intervention} on {outcome}: A randomized controlled trial", |
|
f"{intervention} supplementation improves {outcome} in healthy adults", |
|
f"Clinical evaluation of {intervention} for {outcome} optimization", |
|
f"Randomized trial of {intervention} in {outcome} enhancement" |
|
] |
|
|
|
return random.choice(titles) |
|
|
|
def _generate_study_abstract(self, domain: str, domain_data: Dict) -> str: |
|
intervention = domain_data["interventions"][0] |
|
biomarker = random.choice(domain_data["biomarkers"]) |
|
outcome = random.choice(domain_data["outcomes"]) |
|
|
|
abstract = f""" |
|
Background: {intervention} has shown promise in preliminary studies for health optimization. |
|
|
|
Objective: To evaluate the effects of {intervention} supplementation on {biomarker} and related health outcomes. |
|
|
|
Methods: Randomized, double-blind, placebo-controlled trial with {random.randint(120, 250)} participants aged 40-65 years. |
|
Subjects received {intervention} or placebo for {random.randint(12, 24)} weeks. |
|
|
|
Results: {intervention} supplementation significantly improved {outcome} compared to placebo (p<0.05). |
|
{biomarker.capitalize()} showed {random.randint(15, 35)}% improvement from baseline. |
|
Secondary outcomes included improved quality of life and no serious adverse events. |
|
|
|
        Conclusions: {intervention} supplementation provides significant benefits for {outcome} with an excellent safety profile.
|
""".strip() |
|
|
|
return abstract |
|
|
|
def _generate_dosages(self, intervention: str) -> List[str]: |
|
dosage_ranges = { |
|
"NAD+": ["250mg", "500mg", "1000mg"], |
|
"resveratrol": ["100mg", "250mg", "500mg"], |
|
"berberine": ["500mg", "1000mg", "1500mg"], |
|
"omega-3": ["1000mg", "2000mg", "3000mg"], |
|
"magnesium": ["200mg", "400mg", "600mg"], |
|
"curcumin": ["500mg", "1000mg", "1500mg"] |
|
} |
|
|
|
for key in dosage_ranges: |
|
if key.lower() in intervention.lower(): |
|
return random.sample(dosage_ranges[key], min(2, len(dosage_ranges[key]))) |
|
|
|
return ["500mg", "1000mg"] |
|
|
|
class HealthProfileGenerator: |
|
"""Generates realistic health profiles based on medical studies""" |
|
|
|
def __init__(self): |
|
self.severity_levels = { |
|
"optimal": {"multiplier": 1.0, "description": "excellent baseline health with optimization focus"}, |
|
"mild": {"multiplier": 1.2, "description": "minor health concerns with good overall function"}, |
|
"moderate": {"multiplier": 1.5, "description": "noticeable health issues requiring intervention"}, |
|
"severe": {"multiplier": 2.0, "description": "significant health challenges needing intensive protocols"} |
|
} |
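        # The multiplier pushes markers away from healthy baselines: values where higher is
        # worse (e.g. LDL, hsCRP) are multiplied by it, while values where higher is better
        # (e.g. HDL, vitamin D, omega-3 index) are divided by it in _generate_lab_values.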
|
|
|
def generate_profile_from_study(self, study: Dict[str, Any], severity: str = "moderate") -> Dict[str, Any]: |
|
"""Generate complete health profile based on study data and severity level""" |
|
domain = study.get("domain", "longevity") |
|
severity_data = self.severity_levels.get(severity, self.severity_levels["moderate"]) |
|
multiplier = severity_data["multiplier"] |
|
|
|
age = random.randint(35, 65) |
|
gender = random.choice(["male", "female"]) |
|
|
|
labs = self._generate_lab_values(domain, multiplier) |
|
|
|
health_profile = { |
|
"user_tests_result_data": { |
|
"Labs": labs, |
|
"gut_microbiome": self._generate_gut_microbiome(severity), |
|
"epigenetics": self._generate_epigenetics(severity), |
|
"wearables": self._generate_wearables(severity), |
|
"cgm": self._generate_cgm(severity) |
|
}, |
|
"user_query": self._generate_user_query(study, age, gender, severity), |
|
"source_study": { |
|
"pmid": study.get("pmid"), |
|
"domain": domain, |
|
"severity": severity, |
|
"title": study.get("title") |
|
} |
|
} |
|
|
|
return health_profile |
|
|
|
def _generate_lab_values(self, domain: str, multiplier: float) -> Dict[str, Any]: |
|
"""Generate realistic lab values based on domain and severity""" |
|
base_labs = { |
|
"blood_tests": { |
|
"systolic_bp": int(random.randint(120, 140) * multiplier), |
|
"diastolic_bp": int(random.randint(70, 90) * multiplier), |
|
"total_cholesterol": int(random.randint(180, 220) * multiplier), |
|
"ldl": int(random.randint(100, 140) * multiplier), |
|
"hdl": int(random.randint(40, 60) / multiplier), |
|
"triglycerides": int(random.randint(80, 150) * multiplier), |
|
"apoB": int(random.randint(70, 110) * multiplier), |
|
"lp_a": random.randint(10, 50) |
|
}, |
|
"inflammatory": { |
|
"hscrp": round(random.uniform(1.0, 4.0) * multiplier, 1), |
|
"esr": int(random.randint(5, 25) * multiplier), |
|
"il6": round(random.uniform(1.0, 5.0) * multiplier, 1), |
|
"tnf_alpha": round(random.uniform(1.0, 3.0) * multiplier, 1), |
|
"oxidative_stress_markers": "elevated" if multiplier > 1.3 else "normal", |
|
"homocysteine": round(random.uniform(8, 15) * multiplier, 1) |
|
}, |
|
"nutritional": { |
|
"vitamin_d": int(random.randint(25, 50) / multiplier), |
|
"b12": random.randint(250, 400), |
|
"folate": round(random.uniform(6, 14), 1), |
|
"iron": random.randint(60, 120), |
|
"ferritin": random.randint(30, 100), |
|
"selenium": random.randint(80, 120), |
|
"zinc": random.randint(70, 110), |
|
"magnesium": round(random.uniform(1.5, 2.2), 1), |
|
"omega3_index": round(random.uniform(4, 8) / multiplier, 1) |
|
} |
|
} |
|
|
|
if domain == "metabolic_health": |
|
base_labs["metabolic"] = { |
|
"fasting_glucose": int(random.randint(85, 110) * multiplier), |
|
"hba1c": round(random.uniform(5.2, 6.0) * min(multiplier, 1.4), 1), |
|
"insulin_fasting": round(random.uniform(5, 15) * multiplier, 1), |
|
"homa_ir": round(random.uniform(1.5, 4.0) * multiplier, 1) |
|
} |
|
|
|
return base_labs |
|
|
|
def _generate_gut_microbiome(self, severity: str) -> str: |
|
scores = { |
|
"optimal": random.uniform(8.5, 9.5), |
|
"mild": random.uniform(7.0, 8.5), |
|
"moderate": random.uniform(5.5, 7.0), |
|
"severe": random.uniform(3.5, 5.5) |
|
} |
|
|
|
score = scores.get(severity, 6.5) |
|
|
|
descriptions = { |
|
"optimal": "excellent diversity with optimal bacterial balance", |
|
"mild": "good diversity with minor imbalances", |
|
"moderate": "moderate dysbiosis with reduced beneficial bacteria", |
|
"severe": "significant dysbiosis with pathogenic overgrowth" |
|
} |
|
|
|
desc = descriptions.get(severity, "moderate dysbiosis") |
|
return f"Diversity score {score:.1f}/10, {desc}, beneficial bacteria {random.randint(60, 90)}%" |
|
|
|
def _generate_epigenetics(self, severity: str) -> str: |
|
age_acceleration = { |
|
"optimal": random.randint(-2, 1), |
|
"mild": random.randint(1, 3), |
|
"moderate": random.randint(3, 6), |
|
"severe": random.randint(6, 12) |
|
} |
|
|
|
acceleration = age_acceleration.get(severity, 4) |
|
telomere_percentile = max(10, random.randint(30, 80) - acceleration * 5) |
|
|
|
return f"Biological age acceleration: {acceleration} years, telomere length: {telomere_percentile}th percentile, DunedinPACE: {round(random.uniform(0.9, 1.4), 2)}" |
|
|
|
def _generate_wearables(self, severity: str) -> Dict[str, int]: |
|
base_ranges = { |
|
"optimal": {"hrv": (55, 75), "rhr": (45, 60), "sleep": (85, 95)}, |
|
"mild": {"hrv": (45, 65), "rhr": (55, 70), "sleep": (75, 85)}, |
|
"moderate": {"hrv": (30, 50), "rhr": (65, 80), "sleep": (60, 75)}, |
|
"severe": {"hrv": (20, 35), "rhr": (75, 95), "sleep": (45, 65)} |
|
} |
|
|
|
ranges = base_ranges.get(severity, base_ranges["moderate"]) |
|
|
|
return { |
|
"hrv_avg": random.randint(*ranges["hrv"]), |
|
"rhr": random.randint(*ranges["rhr"]), |
|
"sleep_score": random.randint(*ranges["sleep"]), |
|
"recovery_score": random.randint(ranges["sleep"][0]-10, ranges["sleep"][1]-5), |
|
"stress_score": random.randint(100-ranges["sleep"][1], 100-ranges["sleep"][0]+20), |
|
"vo2_max": random.randint(25, 50), |
|
"fitness_age": random.randint(30, 65) |
|
} |
|
|
|
def _generate_cgm(self, severity: str) -> str: |
|
glucose_ranges = { |
|
"optimal": (80, 95, 92, 98), |
|
"mild": (85, 105, 85, 95), |
|
"moderate": (95, 120, 70, 85), |
|
"severe": (110, 140, 55, 75) |
|
} |
|
|
|
avg_min, avg_max, tir_min, tir_max = glucose_ranges.get(severity, glucose_ranges["moderate"]) |
|
return f"Average glucose {random.randint(avg_min, avg_max)} mg/dL, time in range {random.randint(tir_min, tir_max)}%" |
|
|
|
def _generate_user_query(self, study: Dict[str, Any], age: int, gender: str, severity: str) -> str: |
|
domain = study.get("domain", "longevity") |
|
|
|
base_queries = { |
|
"longevity": f"I'm a {age}-year-old {gender} interested in longevity optimization and anti-aging protocols", |
|
"metabolic_health": f"I'm a {age}-year-old {gender} with metabolic dysfunction seeking evidence-based glucose control", |
|
"cardiovascular": f"I'm a {age}-year-old {gender} with cardiovascular risk factors wanting heart health optimization", |
|
"cognitive": f"I'm a {age}-year-old {gender} seeking cognitive enhancement and brain health optimization", |
|
"hormonal": f"I'm a {age}-year-old {gender} with hormonal imbalances needing optimization protocols", |
|
"inflammation": f"I'm a {age}-year-old {gender} with chronic inflammation seeking anti-inflammatory interventions" |
|
} |
|
|
|
base_query = base_queries.get(domain, base_queries["longevity"]) |
|
|
|
severity_context = { |
|
"optimal": "I have excellent baseline health but want to push the boundaries of optimization", |
|
"mild": "I have minor health concerns and want targeted interventions", |
|
"moderate": "I have noticeable health issues and need comprehensive protocols", |
|
"severe": "I have significant health challenges and require intensive interventions" |
|
} |
|
|
|
context = severity_context.get(severity, "") |
|
return f"{base_query}. {context}." |
|
|
|
class AIProtocolGenerator: |
|
"""Uses OpenAI to generate health optimization protocols""" |
|
|
|
def __init__(self, api_key: str, model: str = "gpt-4"): |
|
self.client = OpenAI(api_key=api_key) |
|
self.model = model |
|
self.total_cost = 0.0 |
|
|
|
def generate_protocol(self, health_profile: Dict[str, Any], study_context: Dict[str, Any], progress_callback=None) -> Optional[str]: |
|
"""Generate comprehensive health optimization protocol""" |
|
|
|
system_prompt = self._create_system_prompt(study_context) |
|
user_prompt = self._create_user_prompt(health_profile, study_context) |
|
|
|
try: |
|
if progress_callback: |
|
                progress_callback(f"Generating protocol using {self.model}...")
|
|
|
response = self.client.chat.completions.create( |
|
model=self.model, |
|
messages=[ |
|
{"role": "system", "content": system_prompt}, |
|
{"role": "user", "content": user_prompt} |
|
], |
|
max_tokens=4000, |
|
temperature=0.7, |
|
top_p=0.9 |
|
) |
|
|
|
self._update_cost(response.usage) |
|
|
|
if progress_callback: |
|
                progress_callback(f"Protocol generated ({response.usage.total_tokens} tokens)")
|
|
|
return response.choices[0].message.content |
|
|
|
except Exception as e: |
|
if progress_callback: |
|
                progress_callback(f"Error generating protocol: {e}")
|
return None |
|
|
|
def _create_system_prompt(self, study_context: Dict[str, Any]) -> str: |
|
domain = study_context.get("domain", "health") |
|
interventions = ", ".join(study_context.get("interventions", [])) |
|
|
|
return f"""You are an advanced AI health optimization system specializing in evidence-based medicine and personalized protocols. |
|
|
|
RESEARCH CONTEXT: |
|
- Domain: {domain} optimization |
|
- Key Interventions: {interventions} |
|
- Evidence Level: Peer-reviewed clinical research |
|
|
|
PROTOCOL REQUIREMENTS: |
|
1. Executive Summary with current health assessment |
|
2. Multi-Phase Protocol: |
|
- Phase 1: Foundation (0-3 months) |
|
- Phase 2: Optimization (3-6 months) |
|
- Phase 3: Advanced Enhancement (6-12 months) |
|
3. Specific supplement protocols with dosages and timing |
|
4. Lifestyle interventions (exercise, nutrition, sleep) |
|
5. Monitoring and assessment plans |
|
6. Expected outcomes with realistic timelines |
|
|
|
STYLE: Professional, authoritative, using Medicine 3.0 terminology. Reference biological age, biomarkers, and cellular health. |
|
|
|
SAFETY: Keep dosages within evidence-based safe ranges. Include monitoring recommendations. |
|
|
|
Generate comprehensive protocols (3000+ words) with actionable precision medicine recommendations.""" |
|
|
|
def _create_user_prompt(self, health_profile: Dict[str, Any], study_context: Dict[str, Any]) -> str: |
|
return f""" |
|
COMPREHENSIVE HEALTH OPTIMIZATION REQUEST: |
|
|
|
Health Profile Analysis: |
|
{json.dumps(health_profile, indent=2)} |
|
|
|
Research Context: |
|
- Study: {study_context.get('title', 'Health Optimization Study')} |
|
- Domain: {study_context.get('domain', 'general health')} |
|
- Key Findings: Based on clinical research showing significant improvements in health biomarkers |
|
|
|
Please analyze this health profile and generate a detailed, personalized optimization protocol. Address the specific biomarker patterns, deficiencies, and health challenges identified in the data. Provide targeted interventions with precise dosing, timing, and monitoring protocols. |
|
""" |
|
|
|
def _update_cost(self, usage): |
|
pricing = { |
|
"gpt-3.5-turbo": {"input": 0.0015, "output": 0.002}, |
|
"gpt-4": {"input": 0.03, "output": 0.06}, |
|
"gpt-4-turbo": {"input": 0.01, "output": 0.03} |
|
} |
|
|
|
model_pricing = pricing.get(self.model, pricing["gpt-4"]) |
|
input_cost = usage.prompt_tokens * model_pricing["input"] / 1000 |
|
output_cost = usage.completion_tokens * model_pricing["output"] / 1000 |
|
|
|
self.total_cost += input_cost + output_cost |
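        # Illustrative arithmetic (hypothetical token counts): with the gpt-4 rates above,
        # 1,500 prompt tokens and 3,000 completion tokens add
        # 1500 * 0.03 / 1000 + 3000 * 0.06 / 1000 = 0.045 + 0.18 = $0.225 to the running total.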
|
|
|
class HealthDatasetGenerator: |
|
"""Complete system that orchestrates the entire dataset generation process""" |
|
|
|
def __init__(self, api_key: str, model: str = "gpt-4"): |
|
self.literature_sim = MedicalLiteratureSimulator() |
|
self.profile_gen = HealthProfileGenerator() |
|
self.protocol_gen = AIProtocolGenerator(api_key, model) |
|
self.generated_examples = [] |
|
|
|
def generate_dataset(self, |
|
domains: List[str] = None, |
|
examples_per_domain: int = 2, |
|
rate_limit_delay: float = 2.0, |
|
progress_callback=None) -> Tuple[List[Dict[str, Any]], str]: |
|
"""Generate complete health optimization dataset with progress updates""" |
|
|
|
if domains is None: |
|
domains = ["longevity", "metabolic_health", "cardiovascular", "cognitive"] |
|
|
|
if progress_callback: |
|
            progress_callback("Starting Health Dataset Generation")
|
progress_callback(f"Domains: {domains}") |
|
progress_callback(f"Examples per domain: {examples_per_domain}") |
|
progress_callback(f"Total examples to generate: {len(domains) * examples_per_domain}") |
|
|
|
examples = [] |
|
total_examples = len(domains) * examples_per_domain |
|
current_example = 0 |
|
|
|
for domain in domains: |
|
if progress_callback: |
|
                progress_callback(f"\nProcessing domain: {domain}")
|
|
|
for i in range(examples_per_domain): |
|
current_example += 1 |
|
try: |
|
if progress_callback: |
|
progress_callback(f" Creating example {i+1}/{examples_per_domain} (Overall: {current_example}/{total_examples})") |
|
|
|
|
|
study = self.literature_sim.generate_study_data(domain) |
|
if progress_callback: |
|
                        progress_callback(f" Generated study: {study['title'][:50]}...")
|
|
|
|
|
severity = random.choice(["mild", "moderate", "severe"]) |
|
health_profile = self.profile_gen.generate_profile_from_study(study, severity) |
|
if progress_callback: |
|
                        progress_callback(f" Created {severity} health profile")
|
|
|
|
|
protocol = self.protocol_gen.generate_protocol(health_profile, study, progress_callback) |
|
|
|
if protocol: |
|
training_example = { |
|
"user_context": health_profile, |
|
"response": protocol, |
|
"citations": self._generate_citations(study), |
|
"metadata": { |
|
"domain": domain, |
|
"severity": severity, |
|
"study_pmid": study["pmid"], |
|
"generated_at": datetime.now().isoformat() |
|
} |
|
} |
|
|
|
examples.append(training_example) |
|
if progress_callback: |
|
                            progress_callback("Complete example generated")
|
|
|
|
|
if i < examples_per_domain - 1: |
|
if progress_callback: |
|
                            progress_callback(f" Rate limit delay: {rate_limit_delay}s")
|
time.sleep(rate_limit_delay) |
|
|
|
except Exception as e: |
|
if progress_callback: |
|
                        progress_callback(f" Error generating example: {e}")
|
continue |
|
|
|
if progress_callback: |
|
            progress_callback("\nDataset generation complete!")
|
progress_callback(f"Generated: {len(examples)} examples") |
|
progress_callback(f"Total cost: ${self.protocol_gen.total_cost:.4f}") |
|
|
|
self.generated_examples = examples |
|
return examples, f"Generated {len(examples)} examples. Total cost: ${self.protocol_gen.total_cost:.4f}" |
|
|
|
def _generate_citations(self, study: Dict[str, Any]) -> Dict[str, List[str]]: |
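        """Build synthetic placeholder citations (randomly generated identifiers, not real PMIDs)."""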
|
return { |
|
"tier_1_peer_reviewed": [study["pmid"], f"PMC{random.randint(1000000, 9999999)}"], |
|
"tier_2_rct": [f"{study['domain'].upper()}.2024.{random.randint(100000, 999999)}"], |
|
"tier_3_cohort": [f"HEALTH.2023.{random.randint(100000, 999999)}"], |
|
"real_world_cases": ["Evidence-based health optimization protocols"] |
|
} |
|
|
|
def export_dataset(self, filename: str = None) -> Tuple[str, List[str]]: |
|
"""Export dataset and return zip file path and file list""" |
|
|
|
if not filename: |
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") |
|
filename = f"health_dataset_{timestamp}" |
|
|
|
|
|
files_created = [] |
|
|
|
|
|
raw_data = json.dumps(self.generated_examples, indent=2, ensure_ascii=False) |
|
files_created.append((f"{filename}.json", raw_data)) |
|
|
|
|
|
        # OpenAI chat fine-tuning format: one JSON object per line (JSONL),
        # each with a system/user/assistant message triple.
        fine_tune_lines = []
|
for example in self.generated_examples: |
|
fine_tune_example = { |
|
"messages": [ |
|
{ |
|
"role": "system", |
|
"content": "You are an advanced AI health optimization system that creates evidence-based protocols." |
|
}, |
|
{ |
|
"role": "user", |
|
"content": f"Create a health optimization protocol for this profile:\n\n{json.dumps(example['user_context'], indent=2)}" |
|
}, |
|
{ |
|
"role": "assistant", |
|
"content": example["response"] |
|
} |
|
] |
|
} |
|
fine_tune_lines.append(json.dumps(fine_tune_example, ensure_ascii=False)) |
|
|
|
fine_tune_data = '\n'.join(fine_tune_lines) |
|
files_created.append((f"{filename}_fine_tuning.jsonl", fine_tune_data)) |
|
|
|
|
|
sample_size = min(3, len(self.generated_examples)) |
|
sample_data = json.dumps(self.generated_examples[:sample_size], indent=2, ensure_ascii=False) |
|
files_created.append((f"{filename}_samples.json", sample_data)) |
|
|
|
|
|
metadata = { |
|
"generation_info": { |
|
"generated_at": datetime.now().isoformat(), |
|
"total_examples": len(self.generated_examples), |
|
"total_cost": self.protocol_gen.total_cost, |
|
"model_used": self.protocol_gen.model |
|
}, |
|
"domains_covered": list(set(ex["metadata"]["domain"] for ex in self.generated_examples)), |
|
"severity_distribution": { |
|
severity: sum(1 for ex in self.generated_examples if ex["metadata"]["severity"] == severity) |
|
for severity in ["mild", "moderate", "severe"] |
|
} |
|
} |
|
|
|
metadata_data = json.dumps(metadata, indent=2, ensure_ascii=False) |
|
files_created.append((f"{filename}_metadata.json", metadata_data)) |
|
|
|
|
|
zip_buffer = io.BytesIO() |
|
with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file: |
|
for file_name, file_content in files_created: |
|
zip_file.writestr(file_name, file_content) |
|
|
|
|
|
zip_filename = f"{filename}.zip" |
|
with open(zip_filename, 'wb') as f: |
|
f.write(zip_buffer.getvalue()) |
|
|
|
file_list = [f[0] for f in files_created] |
|
return zip_filename, file_list |
|
|
|
|
|
|
|
|
|
|
|
class HealthDatasetGradioInterface: |
|
"""Gradio web interface for the health dataset generator""" |
|
|
|
def __init__(self): |
|
self.generator = None |
|
self.available_domains = list(MedicalLiteratureSimulator().research_domains.keys()) |
|
|
|
def estimate_cost(self, domains, examples_per_domain, model): |
|
"""Estimate generation cost""" |
|
if not domains: |
|
return "Please select at least one domain" |
|
|
|
total_examples = len(domains) * examples_per_domain |
|
|
|
cost_per_example = { |
|
"gpt-3.5-turbo": 0.05, |
|
"gpt-4": 0.25, |
|
"gpt-4-turbo": 0.15 |
|
} |
|
|
|
estimated_cost = total_examples * cost_per_example.get(model, 0.25) |
|
|
|
        return f"Estimated cost: ${estimated_cost:.2f} for {total_examples} examples"
|
|
|
def validate_inputs(self, api_key, domains, examples_per_domain): |
|
"""Validate user inputs""" |
|
        if not api_key or not api_key.strip():

            return False, "Please provide your OpenAI API key"




        if not domains:

            return False, "Please select at least one domain"




        if examples_per_domain < 1 or examples_per_domain > 10:

            return False, "Examples per domain must be between 1 and 10"




        return True, "Inputs are valid"
|
|
|
def generate_dataset_interface(self, api_key, domains, examples_per_domain, model, rate_limit): |
|
"""Main dataset generation function for Gradio interface""" |
|
|
|
|
|
is_valid, message = self.validate_inputs(api_key, domains, examples_per_domain) |
|
if not is_valid: |
|
yield message, "", "", None, None |
|
return |
|
|
|
|
|
try: |
|
self.generator = HealthDatasetGenerator(api_key.strip(), model) |
|
except Exception as e: |
|
            yield f"Error initializing generator: {e}", "", "", None, None
|
return |
|
|
|
|
|
progress_messages = [] |
|
|
|
def progress_callback(message): |
|
progress_messages.append(message) |
|
progress_text = "\n".join(progress_messages[-20:]) |
|
return progress_text |
|
|
|
try: |
|
|
|
            yield "Starting dataset generation...", "", "", None, None
|
|
|
dataset, summary = self.generator.generate_dataset( |
|
domains=domains, |
|
examples_per_domain=examples_per_domain, |
|
rate_limit_delay=rate_limit, |
|
progress_callback=progress_callback |
|
) |
|
|
|
if not dataset: |
|
                yield "No examples generated", "", "", None, None
|
return |
|
|
|
|
|
            progress_callback("Exporting dataset...")
|
zip_filename, file_list = self.generator.export_dataset() |
|
|
|
|
|
preview = self.create_dataset_preview(dataset) |
|
|
|
|
|
            final_progress = progress_callback(f"Generation complete! Files: {', '.join(file_list)}")
|
|
|
            yield final_progress, summary, preview, zip_filename, ", ".join(file_list)
|
|
|
except Exception as e: |
|
            yield f"Error during generation: {e}", "", "", None, None
|
|
|
def create_dataset_preview(self, dataset): |
|
"""Create a preview of the generated dataset""" |
|
if not dataset: |
|
return "No data to preview" |
|
|
|
        preview = "**Dataset Preview**\n\n"
|
|
|
|
|
preview += f"**Total Examples:** {len(dataset)}\n" |
|
|
|
|
|
domains = [ex['metadata']['domain'] for ex in dataset] |
|
domain_counts = {d: domains.count(d) for d in set(domains)} |
|
preview += f"**Domain Distribution:** {domain_counts}\n" |
|
|
|
|
|
severities = [ex['metadata']['severity'] for ex in dataset] |
|
severity_counts = {s: severities.count(s) for s in set(severities)} |
|
preview += f"**Severity Distribution:** {severity_counts}\n\n" |
|
|
|
|
|
if dataset: |
|
example = dataset[0] |
|
preview += "**Sample Example:**\n" |
|
preview += f"- **Domain:** {example['metadata']['domain']}\n" |
|
preview += f"- **Severity:** {example['metadata']['severity']}\n" |
|
preview += f"- **User Query:** {example['user_context']['user_query'][:150]}...\n" |
|
preview += f"- **Response Length:** {len(example['response'])} characters\n" |
|
preview += f"- **PMID:** {example['metadata']['study_pmid']}\n" |
|
|
|
return preview |
|
|
|
def analyze_dataset_file(self, zip_file): |
|
"""Analyze uploaded dataset file""" |
|
if zip_file is None: |
|
return "No file uploaded" |
|
|
|
try: |
|
|
|
with zipfile.ZipFile(zip_file.name, 'r') as zip_ref: |
|
|
|
json_files = [f for f in zip_ref.namelist() if f.endswith('.json') and not f.endswith('_samples.json') and not f.endswith('_metadata.json')] |
|
|
|
if json_files: |
|
dataset_file = json_files[0] |
|
with zip_ref.open(dataset_file) as f: |
|
dataset = json.load(f) |
|
|
|
                    analysis = "**Dataset Analysis**\n\n"
|
analysis += f"**Total Examples:** {len(dataset)}\n" |
|
analysis += f"**Average Response Length:** {sum(len(ex['response']) for ex in dataset) / len(dataset):.0f} characters\n" |
|
|
|
|
|
long_responses = sum(1 for ex in dataset if len(ex['response']) > 2000) |
|
has_phases = sum(1 for ex in dataset if "Phase" in ex['response']) |
|
has_dosages = sum(1 for ex in dataset if re.search(r'\d+\s*mg', ex['response'])) |
|
|
|
analysis += f"**Quality Metrics:**\n" |
|
analysis += f"- Responses >2000 chars: {long_responses}/{len(dataset)} ({long_responses/len(dataset)*100:.1f}%)\n" |
|
analysis += f"- Responses with phases: {has_phases}/{len(dataset)} ({has_phases/len(dataset)*100:.1f}%)\n" |
|
analysis += f"- Responses with dosages: {has_dosages}/{len(dataset)} ({has_dosages/len(dataset)*100:.1f}%)\n" |
|
|
|
return analysis |
|
else: |
|
return "No dataset JSON file found in zip" |
|
|
|
except Exception as e: |
|
return f"Error analyzing file: {e}" |
|
|
|
def create_interface(self): |
|
"""Create the Gradio interface""" |
|
|
|
with gr.Blocks(title="Medical Literature Health Dataset Generator", theme=gr.themes.Soft()) as interface: |
|
|
|
gr.Markdown(""" |
|
            # Medical Literature Health Dataset Generator
|
|
|
This tool generates synthetic health optimization datasets based on medical literature patterns. |
|
Perfect for training AI models on evidence-based health protocols. |
|
|
|
            **Important:** Generated content is for research/educational purposes only. Not medical advice.
|
""") |
|
|
|
            with gr.Tab("Generate Dataset"):
|
|
|
with gr.Row(): |
|
with gr.Column(scale=1): |
|
                        gr.Markdown("### Configuration")
|
|
|
api_key = gr.Textbox( |
|
label="OpenAI API Key", |
|
placeholder="sk-...", |
|
type="password", |
|
info="Your OpenAI API key for generating protocols" |
|
) |
|
|
|
domains = gr.CheckboxGroup( |
|
label="Research Domains", |
|
choices=self.available_domains, |
|
value=["longevity", "metabolic_health"], |
|
info="Select medical research domains to include" |
|
) |
|
|
|
examples_per_domain = gr.Slider( |
|
label="Examples per Domain", |
|
minimum=1, |
|
maximum=10, |
|
value=2, |
|
step=1, |
|
info="Number of examples to generate for each domain" |
|
) |
|
|
|
model = gr.Dropdown( |
|
label="OpenAI Model", |
|
choices=["gpt-3.5-turbo", "gpt-4", "gpt-4-turbo"], |
|
value="gpt-4", |
|
info="Model for generating protocols (GPT-4 recommended for quality)" |
|
) |
|
|
|
rate_limit = gr.Slider( |
|
label="Rate Limit Delay (seconds)", |
|
minimum=0.5, |
|
maximum=5.0, |
|
value=2.0, |
|
step=0.5, |
|
info="Delay between API calls to avoid rate limits" |
|
) |
|
|
|
cost_estimate = gr.Textbox( |
|
label="Cost Estimate", |
|
value="Select domains and examples to see estimate", |
|
interactive=False |
|
) |
|
|
|
generate_btn = gr.Button( |
|
"π Generate Dataset", |
|
variant="primary", |
|
size="lg" |
|
) |
|
|
|
with gr.Column(scale=2): |
|
                        gr.Markdown("### Progress & Results")
|
|
|
progress_output = gr.Textbox( |
|
label="Generation Progress", |
|
lines=15, |
|
max_lines=20, |
|
value="Ready to generate dataset...", |
|
interactive=False |
|
) |
|
|
|
summary_output = gr.Textbox( |
|
label="Generation Summary", |
|
lines=3, |
|
interactive=False |
|
) |
|
|
|
preview_output = gr.Markdown( |
|
label="Dataset Preview", |
|
value="Dataset preview will appear here..." |
|
) |
|
|
|
with gr.Row(): |
|
download_file = gr.File( |
|
                        label="Download Generated Dataset",
|
interactive=False |
|
) |
|
|
|
file_list = gr.Textbox( |
|
label="Generated Files", |
|
placeholder="Files included in download will be listed here", |
|
interactive=False |
|
) |
|
|
|
            with gr.Tab("Analyze Dataset"):
|
                gr.Markdown("### Dataset Analysis")
|
gr.Markdown("Upload a generated dataset zip file to analyze its quality and structure.") |
|
|
|
with gr.Row(): |
|
with gr.Column(): |
|
upload_file = gr.File( |
|
label="Upload Dataset Zip File", |
|
file_types=[".zip"] |
|
) |
|
|
|
analyze_btn = gr.Button( |
|
"π Analyze Dataset", |
|
variant="secondary" |
|
) |
|
|
|
with gr.Column(): |
|
analysis_output = gr.Markdown( |
|
label="Analysis Results", |
|
value="Upload a dataset file to see analysis..." |
|
) |
|
|
|
            with gr.Tab("Information"):
|
gr.Markdown(""" |
|
            ### How It Works
|
|
|
1. **Literature Simulation**: Creates realistic medical studies with proper abstracts, interventions, and outcomes |
|
2. **Health Profile Generation**: Generates comprehensive health profiles based on study domains and severity levels |
|
3. **AI Protocol Generation**: Uses OpenAI to create detailed health optimization protocols |
|
4. **Dataset Export**: Outputs data in multiple formats including OpenAI fine-tuning format |
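
            A minimal programmatic sketch of the same pipeline the interface drives (assumes an
            `OPENAI_API_KEY` environment variable; adjust domains and counts as needed):

            ```python
            import os

            generator = HealthDatasetGenerator(api_key=os.environ["OPENAI_API_KEY"], model="gpt-4")
            dataset, summary = generator.generate_dataset(
                domains=["longevity", "metabolic_health"],
                examples_per_domain=1,
            )
            zip_path, files = generator.export_dataset()
            print(summary, zip_path, files)
            ```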
|
|
|
            ### Output Files
|
|
|
- **`dataset.json`**: Complete raw dataset |
|
- **`dataset_fine_tuning.jsonl`**: OpenAI fine-tuning format |
|
- **`dataset_samples.json`**: Sample examples for review |
|
- **`dataset_metadata.json`**: Generation statistics and info |
|
|
|
            ### Cost Information
|
|
|
- **GPT-3.5-turbo**: ~$0.05 per example |
|
- **GPT-4**: ~$0.25 per example |
|
- **GPT-4-turbo**: ~$0.15 per example |
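
            For example, generating 2 examples each across 4 domains with GPT-4 is roughly 8 x $0.25 = $2.00 (a rough estimate; actual cost scales with response length).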
|
|
|
            ### Important Notes
|
|
|
- Generated content is for **research/educational purposes only** |
|
- **Not medical advice** - always consult healthcare professionals |
|
- Include appropriate medical disclaimers when using generated content |
|
- Review sample outputs before using in production |
|
|
|
            ### Recommended Settings
|
|
|
- **Start small**: Generate 2-4 examples first to test quality |
|
- **Use GPT-4**: Better quality than GPT-3.5-turbo |
|
- **Rate limiting**: Use 2+ second delays to avoid API limits |
|
- **Multiple domains**: Include diverse domains for comprehensive dataset |
|
""") |
|
|
|
|
|
|
|
|
|
def update_cost_estimate(domains, examples_per_domain, model): |
|
return self.estimate_cost(domains, examples_per_domain, model) |
|
|
|
for input_component in [domains, examples_per_domain, model]: |
|
input_component.change( |
|
fn=update_cost_estimate, |
|
inputs=[domains, examples_per_domain, model], |
|
outputs=[cost_estimate] |
|
) |
|
|
|
|
|
generate_btn.click( |
|
fn=self.generate_dataset_interface, |
|
inputs=[api_key, domains, examples_per_domain, model, rate_limit], |
|
outputs=[progress_output, summary_output, preview_output, download_file, file_list] |
|
) |
|
|
|
|
|
analyze_btn.click( |
|
fn=self.analyze_dataset_file, |
|
inputs=[upload_file], |
|
outputs=[analysis_output] |
|
) |
|
|
|
return interface |
|
|
|
|
|
|
|
|
|
|
|
def main(): |
|
"""Launch the Gradio interface""" |
|
|
|
    print("Launching Medical Literature Health Dataset Generator")
|
print("This will start a web interface accessible through your browser") |
|
|
|
|
|
interface_creator = HealthDatasetGradioInterface() |
|
interface = interface_creator.create_interface() |
|
|
|
|
|
    # share=True requests a temporary public Gradio link; server_name="0.0.0.0" listens on all interfaces.
    interface.launch(
|
share=True, |
|
server_name="0.0.0.0", |
|
server_port=7860, |
|
show_error=True, |
|
quiet=False |
|
) |
|
|
|
if __name__ == "__main__": |
|
main() |
|
|
|
|
|
|