"""Synthetic-biology research pipeline.

Expands a query with ontology terms, summarises it with LLM providers,
extracts citations from the generated text, fetches molecular structures,
generates an illustration, and optionally writes to a graph database and
narrates the summary.
"""

import os
import re
from datetime import datetime, timezone
from typing import Dict, Any, List

from .ontology import expand_terms_with_ontology
from .structures import fetch_structures_for_terms
from .narration import narrate_text
from .graphdb import write_topic_and_papers
from .providers import (
    run_deepseek_summary,
    run_gemini_polish,
    run_openai_image,
    run_hf_image,
    pubmed_fallback_search,
)

# Environment variables
UMLS_API_KEY = os.getenv("UMLS_API_KEY")
BIOPORTAL_API_KEY = os.getenv("BIOPORTAL_API_KEY")
NCBI_API_KEY = os.getenv("NCBI_API_KEY")
NCBI_EMAIL = os.getenv("NCBI_EMAIL")
ELEVEN_LABS_API_KEY = os.getenv("ELEVEN_LABS_API_KEY")
NEO4J_URI = os.getenv("NEO4J_URI")

# When true, queries are wrapped in the synthetic-biology system context
# before being sent to the summariser.
SYNBIO_MODE = True

# Citation patterns, compiled once at import time.
_DOI_RE = re.compile(r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)", re.IGNORECASE)
_PMID_RE = re.compile(r"PMID:\s*(\d+)", re.IGNORECASE)
_URL_RE = re.compile(r"(https?://[^\s)]+)", re.IGNORECASE)

# Characters that, at the very end of a match, are almost always sentence
# punctuation rather than part of the identifier.
_TRAILING_PUNCTUATION = ".,;:"


def _trim_trailing_punctuation(token: str) -> str:
    """Strip sentence punctuation and unbalanced ``)`` from the end of *token*.

    A closing parenthesis is removed only when the token has more ``)`` than
    ``(``, so identifiers that legitimately contain balanced parentheses are
    left intact.
    """
    while token:
        last = token[-1]
        if last in _TRAILING_PUNCTUATION:
            token = token[:-1]
        elif last == ")" and token.count(")") > token.count("("):
            token = token[:-1]
        else:
            break
    return token


def extract_citations(text: str) -> List[Dict[str, str]]:
    """Extract citations from model output.

    Scans *text* for DOIs, ``PMID: <digits>`` references, and http(s) URLs,
    in that order.

    Args:
        text: Free text to scan. A falsy value (``None`` or ``""``) yields
            an empty list.

    Returns:
        A list of dicts with keys ``"type"`` (``"DOI"``, ``"PMID"`` or
        ``"URL"``), ``"id"`` (empty string for plain URLs) and ``"url"``.
        Each distinct URL appears at most once; the first occurrence wins.
    """
    if not text:
        return []

    citations: List[Dict[str, str]] = []
    seen_urls = set()

    def _add(kind: str, identifier: str, url: str) -> None:
        # Ignore a trailing slash when comparing so that e.g. a PMID
        # reference and its PubMed link collapse into a single entry.
        key = url.rstrip("/")
        if key in seen_urls:
            return
        seen_urls.add(key)
        citations.append({"type": kind, "id": identifier, "url": url})

    for match in _DOI_RE.finditer(text):
        # The DOI character class admits '.', ';', ':' and ')', so a DOI at
        # the end of a sentence or inside parentheses/markdown links would
        # otherwise carry that punctuation into the generated link.
        doi = _trim_trailing_punctuation(match.group(1))
        if _DOI_RE.fullmatch(doi):
            _add("DOI", doi, f"https://doi.org/{doi}")

    for match in _PMID_RE.finditer(text):
        pmid = match.group(1)
        _add("PMID", pmid, f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/")

    for match in _URL_RE.finditer(text):
        url = _trim_trailing_punctuation(match.group(1))
        if _URL_RE.fullmatch(url):
            _add("URL", "", url)

    return citations


def synthetic_biology_prompt_inject(query: str, expanded_terms: List[str]) -> str:
    """Bias prompt toward synthetic biology domain.

    Args:
        query: The user's original query.
        expanded_terms: Related terms to list alongside the query.

    Returns:
        The fixed synthetic-biology context, followed by the query and the
        comma-separated expanded terms.
    """
    synbio_context = (
        "You are an expert synthetic biologist and AI researcher. "
        "Focus on CRISPR, metabolic engineering, living therapeutics, protein design, "
        "biosensors, and biosecurity. Integrate literature, molecular structures, market trends, "
        "and policy/regulatory outlook. Produce a structured, citation-rich report."
    )
    return f"{synbio_context}\n\nQuery: {query}\nExpanded terms: {', '.join(expanded_terms)}"


def research_once(query: str, graph_preview: bool = True, narration: bool = True) -> Dict[str, Any]:
    """Main synthetic biology research pipeline.

    Args:
        query: The research question.
        graph_preview: When true and ``NEO4J_URI`` is set, the topic,
            citations and expanded terms are passed to
            ``write_topic_and_papers``.
        narration: When true and ``ELEVEN_LABS_API_KEY`` is set, the polished
            summary is passed to ``narrate_text``.

    Returns:
        A dict with keys ``timestamp`` (naive UTC, ISO 8601), ``query``,
        ``expanded_terms``, ``summary``, ``citations``, ``structures``,
        ``visual_image_url`` and ``audio_url`` (``None`` when narration is
        skipped).
    """
    # 1. Expand query with ontology
    expanded_terms = expand_terms_with_ontology(query, UMLS_API_KEY, BIOPORTAL_API_KEY)

    # 2. Inject synthetic biology context
    if SYNBIO_MODE:
        enriched_query = synthetic_biology_prompt_inject(query, expanded_terms)
    else:
        enriched_query = query

    # 3. Run DeepSeek summarization
    raw_summary = run_deepseek_summary(enriched_query)

    # 4. Polish with Gemini
    polished_summary = run_gemini_polish(raw_summary)

    # 5. Extract citations; fall back to a PubMed search if the text has none
    citations = extract_citations(polished_summary)
    if not citations:
        fallback_cites = pubmed_fallback_search(query, NCBI_API_KEY, NCBI_EMAIL)
        # Tolerate a falsy result (e.g. None) from the fallback provider.
        citations.extend(fallback_cites or [])

    # 6. Fetch molecular structures
    structures = fetch_structures_for_terms(expanded_terms)

    # 7. Generate visual diagram
    visual_image_url = run_openai_image(query)
    if not visual_image_url:
        print("[Image] Falling back to Hugging Face Stable Diffusion...")
        visual_image_url = run_hf_image(f"Scientific diagram about {query}")

    # 8. Write to Neo4j
    if graph_preview and NEO4J_URI:
        write_topic_and_papers(query, citations, expanded_terms)

    # 9. Narrate executive summary
    audio_url = narrate_text(polished_summary) if narration and ELEVEN_LABS_API_KEY else None

    # 10. Return structured output
    # datetime.utcnow() is deprecated since Python 3.12; dropping tzinfo keeps
    # the emitted string identical to the previous naive-UTC format.
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    return {
        "timestamp": timestamp,
        "query": query,
        "expanded_terms": expanded_terms,
        "summary": polished_summary,
        "citations": citations,
        "structures": structures,
        "visual_image_url": visual_image_url,
        "audio_url": audio_url,
    }