File size: 17,425 Bytes
b074e28
 
aa7324d
b074e28
 
 
 
 
 
 
 
aa7324d
 
b074e28
 
 
 
 
028b069
 
 
 
 
 
 
 
 
 
 
 
 
 
b074e28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aa7324d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aa3d2c4
aa7324d
 
 
 
 
 
aa3d2c4
 
aa7324d
 
 
 
 
 
 
 
 
aa3d2c4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aa7324d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aa3d2c4
aa7324d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b074e28
 
 
 
 
 
 
aa7324d
b074e28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
028b069
b074e28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aa3d2c4
b074e28
 
aa3d2c4
aa7324d
 
b074e28
aa7324d
 
 
 
 
 
 
 
 
 
 
 
aa3d2c4
aa7324d
 
 
 
 
 
b074e28
 
 
 
 
 
 
 
 
 
 
 
 
9267c22
b074e28
 
869c661
 
 
 
9267c22
869c661
 
 
b074e28
 
 
 
 
 
 
 
 
 
 
 
 
aa3d2c4
b074e28
 
 
 
 
 
 
 
 
 
 
aa3d2c4
 
 
 
 
 
 
 
 
b074e28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aa3d2c4
 
 
 
 
 
b074e28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9267c22
b074e28
 
 
 
 
 
 
 
 
aa3d2c4
b074e28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0b617df
b074e28
 
0b617df
 
 
 
 
 
 
 
 
 
 
b074e28
 
 
 
 
 
3b8619b
b074e28
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
"""
DNA-Diffusion Gradio Application
Interactive DNA sequence generation with slot machine visualization and protein analysis
"""

import gradio as gr
import logging
import json
import os
from typing import Dict, Any, Tuple
import html
import requests
import time

# Logging setup: INFO threshold on the root logger, with every record prefixed
# by timestamp, logger name and level.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-scoped logger used by the rest of this file.
logger = logging.getLogger(__name__)

# Optional dependency: the `spaces` package provides the `spaces.GPU`
# decorator applied to the GPU-bound method in this file. When it is not
# installed, a no-op stand-in with the same name is defined so the decorated
# code still imports and runs.
try:
    import spaces
    SPACES_AVAILABLE = True
except ImportError:
    SPACES_AVAILABLE = False

    class spaces:
        """No-op replacement for the `spaces` module when it is unavailable."""

        @staticmethod
        def GPU(func=None, duration=60, **kwargs):
            """Return the decorated function unchanged.

            Works both as a bare decorator (``@spaces.GPU``) and as a
            decorator factory (``@spaces.GPU(duration=60)``).

            Args:
                func: The function being decorated when used bare. Any
                    non-callable value (e.g. a positional duration) is
                    ignored and the factory form is assumed.
                duration: Accepted for signature compatibility; unused.
                **kwargs: Any further options; accepted and ignored.

            Returns:
                ``func`` itself when used bare, otherwise a decorator that
                returns its argument unchanged.
            """
            # Bare usage: the first argument is the function to decorate.
            if callable(func):
                return func

            def decorator(inner):
                return inner
            return decorator

# The model package is optional: without it the app still starts (useful for
# UI development) and MODEL_AVAILABLE records that it is missing.
try:
    from dna_diffusion_model import DNADiffusionModel, get_model
except ImportError as import_error:
    MODEL_AVAILABLE = False
    logger.warning(f"DNA-Diffusion model not available: {import_error}")
else:
    MODEL_AVAILABLE = True
    logger.info("DNA-Diffusion model module loaded successfully")

# Load the HTML interface that is later embedded in the Gradio page.
# The path is relative, so it is resolved against the current working directory.
HTML_FILE = "dna-slot-machine.html"
if not os.path.exists(HTML_FILE):
    raise FileNotFoundError(f"HTML interface file '{HTML_FILE}' not found. Please ensure it exists in the same directory as app.py")

# Decode explicitly as UTF-8 rather than with the platform's locale encoding,
# so any non-ASCII text in the page is read the same way on every host.
with open(HTML_FILE, "r", encoding="utf-8") as f:
    SLOT_MACHINE_HTML = f.read()

class ProteinAnalyzer:
    """Translate DNA into protein and request an LLM analysis of the result.

    Every method is static; the class only groups the codon table with the
    helpers that use it.
    """

    # Genetic code table: DNA codon -> one-letter amino acid ('*' marks a stop codon)
    CODON_TABLE = {
        'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
        'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
        'TAT': 'Y', 'TAC': 'Y', 'TAA': '*', 'TAG': '*',
        'TGT': 'C', 'TGC': 'C', 'TGA': '*', 'TGG': 'W',
        'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
        'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
        'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
        'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
        'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
        'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
        'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
        'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
        'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
        'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
        'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'
    }

    @staticmethod
    def dna_to_protein(dna_sequence: str) -> str:
        """Translate a DNA sequence to a one-letter protein sequence.

        The input is upper-cased and every character other than A, T, C, G is
        removed before translation. Translation starts at the first base,
        reads whole codons only (a trailing 1-2 base remainder is ignored) and
        stops at the first stop codon, which is not included in the result.

        Args:
            dna_sequence: DNA text in any letter case.

        Returns:
            The translated protein; empty if there is no complete codon or the
            first codon is a stop codon.
        """
        cleaned = ''.join(base for base in dna_sequence.upper() if base in 'ATCG')

        protein = []
        # The upper bound stops before any trailing partial codon.
        for start in range(0, len(cleaned) - 2, 3):
            amino_acid = ProteinAnalyzer.CODON_TABLE.get(cleaned[start:start + 3], 'X')
            if amino_acid == '*':  # Stop codon
                break
            protein.append(amino_acid)

        return ''.join(protein)

    @staticmethod
    def _build_messages(protein_sequence: str, cell_type: str, language: str = "en") -> list:
        """Build the system and user chat messages for the analysis request.

        Korean text is used when ``language`` is "ko"; every other value gets
        the English text for both messages, so the two always agree.
        """
        if language == "ko":
            system_content = "당신은 단백질 구조와 기능 예측을 전문으로 하는 지식이 풍부한 생물정보학 어시스턴트입니다."
            prompt = f"""당신은 생물정보학 전문가입니다. 다음 단백질 서열을 분석하고 잠재적인 구조와 기능에 대한 통찰력을 제공해주세요.

단백질 서열: {protein_sequence}
세포 유형: {cell_type}

다음 내용을 포함해주세요:
1. 서열 패턴을 기반으로 예측되는 단백질 패밀리 또는 도메인
2. 잠재적인 구조적 특징 (알파 나선, 베타 시트, 루프)
3. 가능한 생물학적 기능
4. {cell_type} 세포 유형과의 관련성
5. 주목할 만한 서열 모티프나 특성

과학 애플리케이션에 표시하기에 적합하도록 간결하면서도 유익한 응답을 작성해주세요."""
        else:
            system_content = "You are a knowledgeable bioinformatics assistant specializing in protein structure and function prediction."
            prompt = f"""You are a bioinformatics expert. Analyze the following protein sequence and provide insights about its potential structure and function.

Protein sequence: {protein_sequence}
Cell type context: {cell_type}

Please provide:
1. Predicted protein family or domain based on sequence patterns
2. Potential structural features (alpha helices, beta sheets, loops)
3. Possible biological functions
4. Relevance to the {cell_type} cell type
5. Any notable sequence motifs or characteristics

Keep the response concise but informative, suitable for display in a scientific application."""

        return [
            {"role": "system", "content": system_content},
            {"role": "user", "content": prompt},
        ]

    @staticmethod
    def analyze_protein_with_llm(protein_sequence: str, cell_type: str, language: str = "en") -> str:
        """Analyze protein structure and function using the Friendli LLM API.

        Args:
            protein_sequence: One-letter protein sequence to analyze.
            cell_type: Cell type name included in the prompt as context.
            language: "ko" for Korean prompts and messages; any other value
                uses English.

        Returns:
            The text returned by the LLM, or a short human-readable message
            when there is nothing to analyze, the FRIENDLI_TOKEN environment
            variable is unset, or the request fails. Request and response
            parsing errors are caught and reported through the return value.
        """
        korean = language == "ko"

        # Nothing was translated (e.g. the DNA started with a stop codon):
        # skip the remote call instead of asking for an analysis of nothing.
        if not protein_sequence:
            if korean:
                return "단백질 분석 불가: 분석할 단백질 서열이 없습니다"
            return "Protein analysis unavailable: no protein sequence to analyze"

        # Get API token from environment
        token = os.getenv("FRIENDLI_TOKEN")
        if not token:
            logger.warning("FRIENDLI_TOKEN not found in environment variables")
            if korean:
                return "단백질 분석 불가: API 토큰이 설정되지 않았습니다"
            return "Protein analysis unavailable: API token not configured"

        try:
            url = "https://api.friendli.ai/dedicated/v1/chat/completions"
            headers = {
                "Authorization": f"Bearer {token}",
                "Content-Type": "application/json"
            }

            payload = {
                "model": "dep89a2fld32mcm",
                "messages": ProteinAnalyzer._build_messages(protein_sequence, cell_type, language),
                "max_tokens": 1000,
                "temperature": 0.7,
                "top_p": 0.8,
                "stream": False  # Disable streaming for simplicity
            }

            response = requests.post(url, json=payload, headers=headers, timeout=30)
            response.raise_for_status()

            result = response.json()
            return result['choices'][0]['message']['content']

        except requests.exceptions.RequestException as e:
            logger.error(f"Failed to analyze protein with LLM: {e}")
            return f"Protein analysis failed: {str(e)}"
        except Exception as e:
            # Includes an unexpected response shape (missing keys, bad JSON).
            logger.error(f"Unexpected error during protein analysis: {e}")
            return "Protein analysis unavailable due to an error"

class DNADiffusionApp:
    """Backend for the Gradio UI: holds the model handle and serves generation requests."""

    def __init__(self):
        # Helper used to translate and analyze every generated sequence.
        self.protein_analyzer = ProteinAnalyzer()
        # Model state: the handle, an in-progress flag, and the last load error.
        self.model = None
        self.model_loading = False
        self.model_error = None

    def initialize_model(self):
        """Load the model through get_model(), recording any failure in model_error.

        Does nothing when a load is already in progress. When the model module
        could not be imported, only the error text is recorded.
        """
        if not MODEL_AVAILABLE:
            self.model_error = "DNA-Diffusion model module not available. Please install dependencies."
            return

        if self.model_loading:
            return

        self.model_loading = True
        try:
            logger.info("Starting model initialization...")
            self.model = get_model()
            logger.info("Model initialized successfully!")
            self.model_error = None
        except Exception as exc:
            logger.error(f"Failed to initialize model: {exc}")
            self.model_error = str(exc)
            self.model = None
        finally:
            # Always clear the flag so a later call can retry.
            self.model_loading = False

    @spaces.GPU(duration=60)
    def generate_sequence(self, cell_type: str, guidance_scale: float = 1.0) -> Tuple[str, Dict[str, Any]]:
        """Return (sequence, metadata) from the model, or mock data when no model is loaded.

        The mock path produces 200 random bases, marks the metadata with
        'mock': True and sleeps two seconds to imitate generation time.
        Errors raised by the real model are logged and re-raised.
        """
        model_ready = MODEL_AVAILABLE and self.model is not None

        if model_ready:
            try:
                result = self.model.generate(cell_type, guidance_scale)
                return result['sequence'], result['metadata']
            except Exception as exc:
                logger.error(f"Generation failed: {exc}")
                raise

        # No usable model: fall back to random placeholder data.
        logger.warning("Using mock sequence generation")
        import random
        bases = []
        for _ in range(200):
            bases.append(random.choice(['A', 'T', 'C', 'G']))
        sequence = ''.join(bases)
        metadata = dict(
            cell_type=cell_type,
            guidance_scale=guidance_scale,
            generation_time=2.0,
            mock=True,
        )
        time.sleep(2.0)  # Simulate generation time
        return sequence, metadata

    def handle_generation_request(self, cell_type: str, guidance_scale: float, language: str = "en"):
        """Gradio callback: generate a sequence, translate it and attach the LLM analysis.

        Returns:
            (sequence, metadata_json). On any failure the sequence is "" and
            the JSON holds a single "error" entry with the exception text.
        """
        try:
            logger.info(f"Generating sequence for cell type: {cell_type}, language: {language}")
            sequence, metadata = self.generate_sequence(cell_type, guidance_scale)

            # Translation result and its length travel with the metadata.
            logger.info("Translating DNA to protein sequence...")
            protein_sequence = self.protein_analyzer.dna_to_protein(sequence)
            metadata['protein_sequence'] = protein_sequence
            metadata['protein_length'] = len(protein_sequence)

            # So does the free-text analysis.
            logger.info("Analyzing protein structure and function...")
            metadata['protein_analysis'] = self.protein_analyzer.analyze_protein_with_llm(
                protein_sequence, cell_type, language
            )

            logger.info("Generation and analysis complete")
            return sequence, json.dumps(metadata)

        except Exception as exc:
            failure = str(exc)
            logger.error(f"Generation request failed: {failure}")
            return "", json.dumps({"error": failure})

# Create single app instance.
# Module-level singleton: create_demo() wires its Gradio callbacks to this
# object's handle_generation_request and initialize_model methods.
app = DNADiffusionApp()

def create_demo():
    """Build and return the Gradio Blocks app.

    The visible UI is the slot-machine HTML page shown in an iframe. A hidden
    column of Gradio controls does the backend work: page-level JavaScript
    relays 'generate_request' messages from the iframe to those controls and
    posts the generated sequence and metadata back into the iframe.
    """

    # Styles: hide the backend controls, black background, no scrolling
    stylesheet = """
    #hidden-controls { display: none !important; }
    .gradio-container { 
        overflow: hidden; 
        background-color: #000000 !important;
    }
    #dna-frame { overflow: hidden; position: relative; }
    body {
        background-color: #000000 !important;
    }
    """

    # Script bridging window messages between the iframe and the hidden controls
    bridge_script = """
    function() {
        console.log('Initializing DNA-Diffusion Gradio interface...');
        
        // Set up message listener to receive requests from iframe
        window.addEventListener('message', function(event) {
            console.log('Parent received message:', event.data);
            
            if (event.data.type === 'generate_request') {
                console.log('Triggering generation for cell type:', event.data.cellType);
                console.log('Language:', event.data.language);
                
                // Update the hidden cell type input
                const radioInputs = document.querySelectorAll('#cell-type-input input[type="radio"]');
                radioInputs.forEach(input => {
                    if (input.value === event.data.cellType) {
                        input.checked = true;
                        // Trigger change event
                        input.dispatchEvent(new Event('change'));
                    }
                });
                
                // Update the language input
                const langInputs = document.querySelectorAll('#language-input input[type="radio"]');
                langInputs.forEach(input => {
                    if (input.value === event.data.language) {
                        input.checked = true;
                        input.dispatchEvent(new Event('change'));
                    }
                });
                
                // Small delay to ensure radio button update is processed
                setTimeout(() => {
                    document.querySelector('#generate-btn').click();
                }, 100);
            }
        });
        
        // Function to send sequence to iframe
        window.sendSequenceToIframe = function(sequence, metadata) {
            console.log('Sending sequence to iframe:', sequence);
            const iframe = document.querySelector('#dna-frame iframe');
            if (iframe && iframe.contentWindow) {
                try {
                    const meta = JSON.parse(metadata);
                    if (meta.error) {
                        iframe.contentWindow.postMessage({
                            type: 'generation_error',
                            error: meta.error
                        }, '*');
                    } else {
                        iframe.contentWindow.postMessage({
                            type: 'sequence_generated',
                            sequence: sequence,
                            metadata: meta
                        }, '*');
                    }
                } catch (e) {
                    console.error('Failed to parse metadata:', e);
                    // If parsing fails, still send the sequence
                    iframe.contentWindow.postMessage({
                        type: 'sequence_generated',
                        sequence: sequence,
                        metadata: {}
                    }, '*');
                }
            } else {
                console.error('Could not find iframe');
            }
        };
    }
    """

    with gr.Blocks(css=stylesheet, js=bridge_script, theme=gr.themes.Base()) as demo:

        # Backend-only controls; the element ids are the ones the script above looks up.
        with gr.Column(elem_id="hidden-controls", visible=False):
            cell_type_radio = gr.Radio(
                ["K562", "GM12878", "HepG2"],
                value="K562",
                label="Cell Type",
                elem_id="cell-type-input",
            )
            language_radio = gr.Radio(
                ["en", "ko"],
                value="en",
                label="Language",
                elem_id="language-input",
            )
            guidance_slider = gr.Slider(
                minimum=1.0,
                maximum=10.0,
                value=1.0,
                step=0.5,
                label="Guidance Scale",
                elem_id="guidance-input",
            )
            trigger_btn = gr.Button("Generate", elem_id="generate-btn")

            sequence_box = gr.Textbox(label="Sequence", elem_id="sequence-output")
            metadata_box = gr.Textbox(label="Metadata", elem_id="metadata-output")

        # The slot machine page, HTML-escaped so it can sit inside the srcdoc attribute.
        srcdoc_payload = html.escape(SLOT_MACHINE_HTML, quote=True)
        iframe_markup = f'<iframe srcdoc="{srcdoc_payload}" style="width: 100%; height: 800px; border: none; display: block;"></iframe>'
        gr.HTML(iframe_markup, elem_id="dna-frame")

        # Step 1: run the backend and fill the two hidden textboxes.
        generation_event = trigger_btn.click(
            fn=app.handle_generation_request,
            inputs=[cell_type_radio, guidance_slider, language_radio],
            outputs=[sequence_box, metadata_box],
        )
        # Step 2: browser-side only, forward both values into the iframe.
        generation_event.then(
            fn=None,
            inputs=[sequence_box, metadata_box],
            outputs=None,
            js="(seq, meta) => sendSequenceToIframe(seq, meta)",
        )

        # Start model initialization when the page loads.
        demo.load(fn=app.initialize_model, inputs=None, outputs=None)

    return demo

# Launch the app
if __name__ == "__main__":
    import argparse

    # Parse command line arguments before building the UI, so `--help` and
    # invalid arguments exit right away instead of after the interface is built.
    parser = argparse.ArgumentParser(description="DNA-Diffusion Gradio App")
    parser.add_argument("--share", action="store_true", help="Create a public shareable link")
    parser.add_argument("--port", type=int, default=7860, help="Port to run the app on")
    parser.add_argument("--host", type=str, default="0.0.0.0", help="Host to run the app on")
    args = parser.parse_args()

    # For Hugging Face Spaces deployment: when SPACE_ID is set, the host, port
    # and share settings are forced and no local browser is opened.
    on_spaces = bool(os.getenv("SPACE_ID"))
    if on_spaces:
        args.host = "0.0.0.0"
        args.port = 7860
        args.share = False
    inbrowser = not on_spaces

    demo = create_demo()

    logger.info(f"Starting DNA-Diffusion Gradio app on {args.host}:{args.port}")

    demo.launch(
        share=args.share,
        server_name=args.host,
        server_port=args.port,
        inbrowser=inbrowser
    )