<html>
<head>
  <meta charset="utf-8">
  <meta name="description" content="LLaSM: Large Language and Speech Model">
  <meta name="keywords" content="speech-language, multi-modal, LLM, LLaSM">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>LLaSM: Large Language and Speech Model</title>

  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet">
  <link rel="stylesheet" href="./static/css/bulma.min.css">
  <link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="./static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="./static/css/index.css">
  <link rel="icon" href="./static/images/favicon.svg">

  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script defer src="./static/js/fontawesome.all.min.js"></script>
  <script src="./static/js/bulma-carousel.min.js"></script>
  <script src="./static/js/bulma-slider.min.js"></script>

  <!-- for LLaSM demo -->
  <link rel="stylesheet" href="./static/css/bootstrap.min.css">
  <link rel="stylesheet" href="./static/css/styles.css">
  <script src="./static/js/recorder.mp3.min.js"></script>
  <script src="./static/js/waveview.js"></script>
  <!-- / for LLaSM demo -->
</head>
<body>

<nav class="navbar" role="navigation" aria-label="main navigation">
  <div class="navbar-brand">
    <a role="button" class="navbar-burger" aria-label="menu" aria-expanded="false">
      <span aria-hidden="true"></span>
      <span aria-hidden="true"></span>
      <span aria-hidden="true"></span>
    </a>
  </div>
  <div class="navbar-menu">
    <div class="navbar-start" style="flex-grow: 1; justify-content: center;">
      <a class="navbar-item" href="https://keunhong.com">
        <span class="icon">
          <i class="fas fa-home"></i>
        </span>
      </a>
      <div class="navbar-item has-dropdown is-hoverable">
        <a class="navbar-link">
          More Research
        </a>
        <div class="navbar-dropdown">
          <a class="navbar-item" href="https://huggingface.co/spaces/LinkSoul/Chinese-LLaVA" target="_blank">
            Chinese-LLaVA
          </a>
          <a class="navbar-item" href="https://huggingface.co/LinkSoul/Chinese-Llama-2-7b" target="_blank">
            Chinese-Llama-2-7B
          </a>
        </div>
      </div>
    </div>
  </div>
</nav>
<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <h1 class="title is-1 publication-title">LLaSM: Large Language and Speech Model</h1>
          <div class="is-size-5 publication-authors">
            <span class="author-block" style="color:#008AD7;font-weight:normal;">
              Yu Shu<sup>2</sup>,</span>
            <span class="author-block" style="color:#008AD7;font-weight:normal;">
              Siwei Dong<sup>2</sup>,</span>
            <span class="author-block" style="color:#ed2f09;font-weight:normal;">
              Guangyao Chen<sup>1,3</sup>,
            </span>
            <span class="author-block" style="color:#cc00d7;font-weight:normal;">
              Wenhao Huang<sup>4</sup>,
            </span>
            <span class="author-block" style="color:#19e706;font-weight:normal;">
              Rita Zhang,
              <!-- Rita Zhang<sup>5</sup>, -->
            </span>
            <span class="author-block" style="color:#19e706;font-weight:normal;">
              Daochen Shi,
              <!-- Daochen Shi<sup>5</sup>, -->
            </span>
            <span class="author-block" style="color:#f68946;font-weight:normal;">
              Yemin Shi<sup>1*</sup>
            </span>
          </div>
          <div class="is-size-5 publication-authors">
            <span class="author-block" style="color:#f68946;font-weight:normal;"><sup>1</sup>LinkSoul.AI,</span>
            <span class="author-block" style="color:#008AD7;font-weight:normal;"><sup>2</sup>Beijing Academy of Artificial Intelligence, China,</span>
            <span class="author-block" style="color:#ed2f09;font-weight:normal;"><sup>3</sup>Peking University, China,</span>
            <span class="author-block" style="color:#cc00d7;font-weight:normal;"><sup>4</sup>01.ai</span>
          </div>
          <div>
            <span class="author-block"><sup>*</sup>Corresponding author: [email protected]</span>
          </div>
          <div class="column has-text-centered">
            <div class="publication-links">
              <!-- PDF Link. -->
              <!-- <span class="link-block">
                <a href="" target="_blank" class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fas fa-file-pdf"></i>
                  </span>
                  <span>Paper</span>
                </a>
              </span>
              <span class="link-block">
                <a href="" target="_blank" class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="ai ai-arxiv"></i>
                  </span>
                  <span>arXiv</span>
                </a>
              </span> -->
              <!-- Model Link. -->
              <span class="link-block">
                <a href="https://huggingface.co/LinkSoul/LLaSM-Cllama2" target="_blank" class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fas fa-atom"></i>
                  </span>
                  <span>Model</span>
                </a>
              </span>
              <!-- Code Link. -->
              <span class="link-block">
                <a href="https://github.com/LinkSoul-AI/LLaSM" target="_blank" class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fab fa-github"></i>
                  </span>
                  <span>Code</span>
                </a>
              </span>
              <!-- Dataset Link. -->
              <span class="link-block">
                <a href="https://huggingface.co/datasets/LinkSoul/LLaSM-Audio-Instructions" target="_blank" class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="far fa-images"></i>
                  </span>
                  <span>Data</span>
                </a>
              </span>
            </div>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>

<section class="section">
  <div class="container is-max-desktop">
    <!-- Abstract. -->
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Abstract</h2>
        <div class="content has-text-justified">
          <p>
            Multi-modal large language models have attracted significant interest recently. However, most existing
            work focuses on vision-language models, which provide strong capabilities in following
            vision-and-language instructions. We argue that speech is also an important modality through which
            humans interact with the world. Hence, a general-purpose assistant should be able to follow
            multi-modal speech-and-language instructions. In this work, we propose the <b>L</b>arge <b>L</b>anguage
            <b>a</b>nd <b>S</b>peech <b>M</b>odel (<b>LLaSM</b>), an end-to-end trained large multi-modal
            speech-language model with cross-modal conversational abilities, capable of following
            speech-and-language instructions. Our early experiments show that <b>LLaSM</b> offers
            a more convenient and natural way for humans to interact with artificial intelligence.
            We also release a large speech instruction-following dataset, <b>LLaSM-Audio-Instructions</b>.
          </p>
          <p>
            Our paper makes the following contributions:
          </p>
          <ul>
            <li>
              We build a speech-language multi-modal assistant that can understand and follow speech-language instructions, providing a more convenient and natural way for humans to interact with artificial intelligence.
            </li>
            <li>
              We construct and release <a href="https://huggingface.co/datasets/LinkSoul/LLaSM-Audio-Instructions" target="_blank">LLaSM-Audio-Instructions</a>, a large-scale Chinese and English speech-text cross-modal instruction-following dataset (a loading sketch follows this list).
            </li>
            <li>
              We release the code at <a href="https://github.com/LinkSoul-AI/LLaSM" target="_blank">https://github.com/LinkSoul-AI/LLaSM</a>.
            </li>
            <li>
              We release the models <a href="https://huggingface.co/LinkSoul/LLaSM-Cllama2" target="_blank">LLaSM-Chinese-Llama-2-7B</a> and <a href="https://huggingface.co/LinkSoul/LLaSM-Baichuan" target="_blank">LLaSM-Baichuan-7B</a>.
            </li>
          </ul>
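          <p>
            As a quick, unofficial illustration (not part of the paper), the released dataset can be opened with
            the Hugging Face <code>datasets</code> library. This is a minimal sketch: the <code>train</code> split
            name and streaming support are assumptions, so inspect the actual schema rather than relying on
            particular field names.
          </p>
          <pre><code># Minimal sketch: stream LLaSM-Audio-Instructions from the Hugging Face Hub.
# Assumptions (not stated on this page): a "train" split exists and the
# dataset supports streaming access.
from datasets import load_dataset

ds = load_dataset("LinkSoul/LLaSM-Audio-Instructions", split="train", streaming=True)
sample = next(iter(ds))
print(sample.keys())  # print the real field names instead of assuming them
</code></pre>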
        </div>
      </div>
    </div>
    <!--/ Abstract. -->
  </div>
</section>

<section class="hero is-light is-small">
  <div class="hero-body">
    <h2 class="title is-3" style="text-align: center;">Demo</h2>
    <!-- LLaSM Demo -->
    <div id="llasaLoading" style="position: absolute; width: 100%; z-index: 1; display: flex; justify-content: center; align-items: center;">
      <div style="text-align: center;">
        <img src="./images/duck.gif" alt="loading" />
        <h3>Loading...</h3>
      </div>
    </div>
    <div class="container" id="llasa" style="opacity: 0;">
      <div class="row mt-5 justify-content-center">
        <div class="col-md-12 mt-3">
          <div id="chat-window" class="card p-2">
            <div class="container my-3">
              <!-- <div id="info"></div> -->
              <div id="results" class="results"></div>
              <fieldset id="temp_audio" style="text-align: center; height: 100px; border: 1.4px solid #ddd;">
                <legend style="float: initial; text-align: initial; width: initial; margin-left: 10px; font-size: initial;">Audio preview</legend>
                <div id="waveform" style="text-align: center; height: 50px; width: 100%;"></div>
                <audio id="audioPlayer" style="height: 50px; width: 100%; display: none; padding: 0 20px 0 20px;" controls src=""></audio>
              </fieldset>
            </div>
          </div>
          <div id="user-input" class="mt-2">
            <div class="input-group">
              <textarea id="user-text" style="height: 60px; padding: 10px 150px 5px 10px;" placeholder="Type in your message or press the record button to speak..."></textarea>
              <div id="input-audio" class="input-group-append p-2">
                <button id="delete_button" class="mb-2 p-2">
                  <img id="delete_img" class="mb-2" src="images/error.png" alt="Del">
                </button>
                <button id="start_button" class="mb-2 p-2">
                  <img id="start_img" class="mb-2" src="images/microphone.png" alt="Record">
                </button>
| <button id="send_button" class="mb-2 p-2"> | |
| <img id="send_text_img" class="mb-2" src="images/paper-plane.png" alt="Start"> | |
| </button> | |
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
    <!-- / LLaSM Demo -->
  </div>
</section>

<section class="section">
  <div class="container is-max-desktop">
    <!-- Demo Tips. -->
    <div class="columns is-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Tips</h2>
        <div class="content has-text-justified">
          <h4>
            How to use the demo
          </h4>
          <ul>
            <li>
              Type text into the text box and click the send button on the far right to send a message and start chatting.
            </li>
            <li>
              Click the microphone button to start recording, and click it again to stop. Then click the send button to send the voice message.
            </li>
            <li>
              Before a recording is sent, you can review it in the audio preview area; voice messages in the chat history can also be replayed.
            </li>
            <li>
              Click the reset button to clear the conversation history.
            </li>
            <li>
              Note: this demo only showcases the capabilities of the LLaSM model and has limited support for topic switching in multi-turn conversations. When changing topics, we recommend clearing the history for a better experience.
            </li>
          </ul>
        </div>
      </div>
    </div>
    <!--/ Demo Tips. -->
  </div>
</section>

<section class="section" id="BibTeX">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column is-four-fifths">
        <h2 class="title">BibTeX</h2>
        <pre><code>@article{Shu2023llasm,
  author  = {Yu Shu and Siwei Dong and Guangyao Chen and Wenhao Huang and Rita Zhang and Daochen Shi and Yemin Shi},
  title   = {LLaSM: Large Language and Speech Model},
  journal = {arXiv},
  year    = {2023},
}</code></pre>
      </div>
    </div>
  </div>
</section>

<section class="section" id="Acknowledgement">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column is-four-fifths">
        <h2 class="title">Acknowledgement</h2>
        <p>
          This website is adapted from <a href="https://github.com/nerfies/nerfies.github.io" target="_blank">Nerfies</a>, licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/" target="_blank">Creative Commons Attribution-ShareAlike 4.0 International License</a>. We thank the open-source projects whose models we build on, including <a href="https://huggingface.co/LinkSoul/Chinese-Llama-2-7b" target="_blank">Chinese-Llama-2-7B</a>, <a href="https://huggingface.co/openai/whisper-large-v2" target="_blank">Whisper</a>, and <a href="https://huggingface.co/baichuan-inc/Baichuan-7B" target="_blank">Baichuan-7B</a>.
        </p>
      </div>
    </div>
  </div>
</section>

<!-- for LLaSM demo -->
<script src="./static/js/index_demo.js"></script>
<!-- / for LLaSM demo -->

</body>
</html>