---
library_name: transformers
pipeline_tag: text-generation
---
# ChatNT
[ChatNT](https://www.biorxiv.org/content/10.1101/2024.04.30.591835v1) is the first multimodal conversational agent designed with a deep understanding of biological sequences (DNA, RNA, proteins).
It enables users, even those with no coding background, to interact with biological data through natural language, and it generalizes across multiple biological tasks and modalities.
**Developed by:** [InstaDeep](https://huggingface.co/InstaDeepAI)
### Model Sources
- **Repository:** [Nucleotide Transformer](https://github.com/instadeepai/nucleotide-transformer)
- **Paper:** [ChatNT: A Multimodal Conversational Agent for DNA, RNA and Protein Tasks](https://www.biorxiv.org/content/10.1101/2024.04.30.591835v1.full.pdf)
### How to use
Until the next release of the `transformers` library, it must be installed from source with the following command in order to use the model.
PyTorch is also required.
```bash
pip install --upgrade git+https://github.com/huggingface/transformers.git
pip install torch
```
The following snippet shows how to **generate ChatNT answers with a pipeline (high-level)**.
```python
# Load the pipeline
from transformers import pipeline

pipe = pipeline(model="InstaDeepAI/ChatNT", trust_remote_code=True)

# Define custom inputs (note that the number of <DNA> tokens in the English sequence
# must equal len(dna_sequences))
english_sequence = "Is there any evidence of an acceptor splice site in this sequence <DNA> ?"
dna_sequences = ["ATCGGAAAAAGATCCAGAAAGTTATACCAGGCCAATGGGAATCACCTATTACGTGGATAATAGCGATAGTATGTTACCTATAAATTTAACTACGTGGATATCAGGCAGTTACGTTACCAGTCAAGGAGCACCCAAAACTGTCCAGCAACAAGTTAATTTACCCATGAAGATGTACTGCAAGCCTTGCCAACCAGTTAAAGTAGCTACTCATAAGGTAATAAACAGTAATATCGACTTTTTATCCATTTTGATAATTGATTTATAACAGTCTATAACTGATCGCTCTACATAATCTCTATCAGATTACTATTGACACAAACAGAAACCCCGTTAATTTGTATGATATATTTCCCGGTAAGCTTCGATTTTTAATCCTATCGTGACAATTTGGAATGTAACTTATTTCGTATAGGATAAACTAATTTACACGTTTGAATTCCTAGAATATGGAGAATCTAAAGGTCCTGGCAATGCCATCGGCTTTCAATATTATAATGGACCAAAAGTTACTCTATTAGCTTCCAAAACTTCGCGTGAGTACATTAGAACAGAAGAATAACCTTCAATATCGAGAGAGTTACTATCACTAACTATCCTATG"]

# Generate the answer
generated_english_sequence = pipe(
    inputs={
        "english_sequence": english_sequence,
        "dna_sequences": dna_sequences,
    }
)

# Expected output: "Yes, an acceptor splice site is without question present in the sequence."
```
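Since the number of `<DNA>` placeholders must match `len(dna_sequences)`, a prompt can in principle reference several sequences at once. The sketch below assumes the pipeline accepts multiple placeholders, as that constraint implies; the sequences are short hypothetical examples, not drawn from the paper.

```python
# Hypothetical multi-sequence prompt: one <DNA> placeholder per entry in dna_sequences
english_sequence = "Does either of these sequences <DNA> <DNA> contain evidence of a donor splice site?"
dna_sequences = [
    "ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG",  # toy sequence 1 (made up)
    "GATTACAGATTACAGATTACAGATTACAGATTACAGAT",  # toy sequence 2 (made up)
]

answer = pipe(
    inputs={
        "english_sequence": english_sequence,
        "dna_sequences": dna_sequences,
    }
)
print(answer)
```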
The following snippet shows how to **run inference with the model without any abstraction (low-level)**.
```python
from transformers import AutoModel, AutoTokenizer

# Load the model and both tokenizers
model = AutoModel.from_pretrained("InstaDeepAI/ChatNT", trust_remote_code=True)
english_tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/ChatNT", subfolder="english_tokenizer")
bio_tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/ChatNT", subfolder="bio_tokenizer")

# Define custom inputs (note that the number of <DNA> tokens in the English sequence
# must equal len(dna_sequences))
english_sequence = "A chat between a curious user and an artificial intelligence assistant that can handle bio sequences. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Is there any evidence of an acceptor splice site in this sequence <DNA> ?"
dna_sequences = ["ATCGGAAAAAGATCCAGAAAGTTATACCAGGCCAATGGGAATCACCTATTACGTGGATAATAGCGATAGTATGTTACCTATAAATTTAACTACGTGGATATCAGGCAGTTACGTTACCAGTCAAGGAGCACCCAAAACTGTCCAGCAACAAGTTAATTTACCCATGAAGATGTACTGCAAGCCTTGCCAACCAGTTAAAGTAGCTACTCATAAGGTAATAAACAGTAATATCGACTTTTTATCCATTTTGATAATTGATTTATAACAGTCTATAACTGATCGCTCTACATAATCTCTATCAGATTACTATTGACACAAACAGAAACCCCGTTAATTTGTATGATATATTTCCCGGTAAGCTTCGATTTTTAATCCTATCGTGACAATTTGGAATGTAACTTATTTCGTATAGGATAAACTAATTTACACGTTTGAATTCCTAGAATATGGAGAATCTAAAGGTCCTGGCAATGCCATCGGCTTTCAATATTATAATGGACCAAAAGTTACTCTATTAGCTTCCAAAACTTCGCGTGAGTACATTAGAACAGAAGAATAACCTTCAATATCGAGAGAGTTACTATCACTAACTATCCTATG"]

# Tokenize both modalities
english_tokens = english_tokenizer(english_sequence, return_tensors="pt", padding="max_length", truncation=True, max_length=512).input_ids
bio_tokens = bio_tokenizer(dna_sequences, return_tensors="pt", padding="max_length", max_length=512, truncation=True).input_ids.unsqueeze(0)  # unsqueeze to simulate batch_size = 1

# Run a forward pass
outs = model(
    multi_omics_tokens_ids=(english_tokens, bio_tokens),
    projection_english_tokens_ids=english_tokens,
    projected_bio_embeddings=None,
)

# Expected output: a dictionary with "logits" and "projected_bio_embeddings"
```
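To peek at what the forward pass returns, the sketch below inspects the logits and takes a greedy guess at the next English token. The `"logits"` key comes from the expected-output comment above, but its shape, `(batch, english_seq_len, vocab_size)`, is an assumption; a single forward pass only scores the next token, so producing a full answer requires an autoregressive loop, which the high-level pipeline handles for you.

```python
# Assumed shape: (batch, english_seq_len, vocab_size)
logits = outs["logits"]
print(logits.shape)

# Greedy guess for the next English token at the last non-padding position (sketch)
last_pos = (english_tokens != english_tokenizer.pad_token_id).sum(dim=1) - 1
next_token_id = logits[0, last_pos[0]].argmax(dim=-1)
print(english_tokenizer.decode([int(next_token_id)]))
```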