lhzstar committed 6bc94ac: initial commits

This view is limited to 50 files because it contains too many changes.
- .github/workflows/main.yml +20 -0
- .gitignore +24 -0
- Dockerfile +31 -0
- README.md +13 -0
- app.py +122 -0
- celebbot.py +150 -0
- data.json +290 -0
- gen_embeds.py +124 -0
- requirements.txt +31 -0
- rtvc/.gitattributes +1 -0
- rtvc/.gitignore +27 -0
- rtvc/CHANGELOG.md +18 -0
- rtvc/LICENSE.md +24 -0
- rtvc/README.md +132 -0
- rtvc/css/bootstrap.min.css +0 -0
- rtvc/css/custom.css +196 -0
- rtvc/css/normalize.css +427 -0
- rtvc/css/skeleton.css +418 -0
- rtvc/demo_cli.py +330 -0
- rtvc/demo_results/text1/1688-142285-0000_syn.wav +0 -0
- rtvc/demo_results/text1/260-123286-0000_syn.wav +0 -0
- rtvc/demo_results/text1/4294-9934-0000_syn.wav +0 -0
- rtvc/demo_results/text1/7176-88083-0000_syn.wav +0 -0
- rtvc/demo_results/text1/README.md +1 -0
- rtvc/demo_results/text2/1688-142285-0000_syn.wav +0 -0
- rtvc/demo_results/text2/260-123286-0000_syn.wav +0 -0
- rtvc/demo_results/text2/4294-9934-0000_syn.wav +0 -0
- rtvc/demo_results/text2/7176-88083-0000_syn.wav +0 -0
- rtvc/demo_results/text2/README.md +1 -0
- rtvc/demo_results/text3/1688-142285-0000_syn.wav +0 -0
- rtvc/demo_results/text3/260-123286-0000_syn.wav +0 -0
- rtvc/demo_results/text3/4294-9934-0000_syn.wav +0 -0
- rtvc/demo_results/text3/7176-88083-0000_syn.wav +0 -0
- rtvc/demo_results/text3/README.md +1 -0
- rtvc/demo_toolbox.py +41 -0
- rtvc/docs/images/audio_icon.png +0 -0
- rtvc/docs/images/voice_cloning_arch.png +0 -0
- rtvc/encoder/__init__.py +0 -0
- rtvc/encoder/audio.py +136 -0
- rtvc/encoder/config.py +45 -0
- rtvc/encoder/data_objects/__init__.py +2 -0
- rtvc/encoder/data_objects/random_cycler.py +37 -0
- rtvc/encoder/data_objects/speaker.py +40 -0
- rtvc/encoder/data_objects/speaker_batch.py +13 -0
- rtvc/encoder/data_objects/speaker_verification_dataset.py +76 -0
- rtvc/encoder/data_objects/utterance.py +29 -0
- rtvc/encoder/data_objects/utterance_batch.py +10 -0
- rtvc/encoder/inference.py +178 -0
- rtvc/encoder/model.py +135 -0
- rtvc/encoder/params_data.py +34 -0
.github/workflows/main.yml
ADDED
@@ -0,0 +1,20 @@
+name: Sync to Hugging Face hub
+on:
+  push:
+    branches: [main]
+
+  # to run this workflow manually from the Actions tab
+  workflow_dispatch:
+
+jobs:
+  sync-to-hub:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+          lfs: true
+      - name: Push to hub
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: git push --force https://liuhaozhe6788:$HF_TOKEN@huggingface.co/spaces/liuhaozhe6788/CelebChat main
.gitignore
ADDED
@@ -0,0 +1,24 @@
+launch.json
+*.pyc
+*.aux
+*.log
+*.out
+*.synctex.gz
+*.suo
+*__pycache__
+*.idea
+*.ipynb_checkpoints
+*.pickle
+*.npy
+*.bz2
+*.blg
+*.bbl
+*.bcf
+*.toc
+*.sh
+*.pt
+*.whl
+*.m4a
+*.csv
+input_audios/
+syn_results/
Dockerfile
ADDED
@@ -0,0 +1,31 @@
+FROM python:3.9
+
+WORKDIR /code
+# Install ffmpeg, git and other dependencies for Whisper
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    ffmpeg \
+    git \
+    libsndfile1 \
+    software-properties-common \
+    && rm -rf /var/lib/apt/lists/*
+# Update pip and install dependencies
+RUN pip install --upgrade pip
+COPY ./requirements.txt /code/requirements.txt
+
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+RUN useradd -m -u 1000 user
+
+USER user
+
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+
+WORKDIR $HOME/app
+
+COPY --chown=user . $HOME/app
+
+# Launch app when container is run
+CMD ["streamlit", "run", "app.py", "--server.port", "7860", "--server.address", "0.0.0.0"]
+
README.md
ADDED
@@ -0,0 +1,13 @@
+---
+title: Img2txt App
+emoji: 🌍
+colorFrom: blue
+colorTo: green
+sdk: streamlit
+sdk_version: 1.27.2
+app_file: app.py
+pinned: false
+license: apache-2.0
+---
+
+# CelebChat
app.py
ADDED
@@ -0,0 +1,122 @@
+from celebbot import CelebBot
+import streamlit as st
+import re
+import spacy
+import json
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModel
+from utils import *
+
+
+@st.cache_resource
+def get_seq2seq_model(model_id):
+    return AutoModelForSeq2SeqLM.from_pretrained(model_id)
+
+
+@st.cache_resource
+def get_auto_model(model_id):
+    return AutoModel.from_pretrained(model_id)
+
+
+@st.cache_resource
+def get_tokenizer(model_id):
+    return AutoTokenizer.from_pretrained(model_id)
+
+
+@st.cache_data
+def get_celeb_data(fpath):
+    with open(fpath) as json_file:
+        return json.load(json_file)
+
+
+@st.cache_resource
+def preprocess_text(name, gender, text, model_id):
+    # Rewrite third-person references to the celebrity as first person
+    lname = name.split(" ")[-1]
+    lname_regex = re.compile(rf'\b({lname})\b')
+    name_regex = re.compile(rf'\b({name})\b')
+    lnames = lname + "’s" if not lname.endswith("s") else lname + "’"
+    lnames_regex = re.compile(rf'\b({lnames})\b')
+    names = name + "’s" if not name.endswith("s") else name + "’"
+    names_regex = re.compile(rf'\b({names})\b')
+    if gender == "M":
+        text = re.sub(he_regex, "I", text)
+        text = re.sub(his_regex, "my", text)
+    elif gender == "F":
+        text = re.sub(she_regex, "I", text)
+        text = re.sub(her_regex, "my", text)
+    text = re.sub(names_regex, "my", text)
+    text = re.sub(lnames_regex, "my", text)
+    text = re.sub(name_regex, "I", text)
+    text = re.sub(lname_regex, "I", text)
+    spacy_model = spacy.load(model_id)
+    texts = [i.text.strip() for i in spacy_model(text).sents]
+    return spacy_model, texts
+
+
+def main():
+    hide_footer()
+    if "messages" not in st.session_state:
+        st.session_state["messages"] = []
+    if "QA_model_path" not in st.session_state:
+        st.session_state["QA_model_path"] = "google/flan-t5-base"
+    if "sentTr_model_path" not in st.session_state:
+        st.session_state["sentTr_model_path"] = "sentence-transformers/all-mpnet-base-v2"
+    if "start_chat" not in st.session_state:
+        st.session_state["start_chat"] = False
+
+    model_list = ["base", "large", "xl", "xxl"]
+
+    for message in st.session_state["messages"]:
+        with st.chat_message(message["role"]):
+            st.markdown(message["content"])
+
+    celeb_data = get_celeb_data("data.json")
+
+    # Create a form component on the sidebar for accepting input data and parameters
+    celeb_name = st.sidebar.selectbox('Choose a celebrity', options=list(celeb_data.keys()))
+    celeb_gender = celeb_data[celeb_name]["gender"]
+    knowledge = celeb_data[celeb_name]["knowledge"]
+    model_choice = st.sidebar.selectbox("Choose Your Flan-T5 model", options=model_list)
+    st.session_state["QA_model_path"] = f"google/flan-t5-{model_choice}"
+
+    celeb_bot = CelebBot(celeb_name,
+                         get_tokenizer(st.session_state["QA_model_path"]),
+                         get_seq2seq_model(st.session_state["QA_model_path"]),
+                         get_tokenizer(st.session_state["sentTr_model_path"]),
+                         get_auto_model(st.session_state["sentTr_model_path"]),
+                         *preprocess_text(celeb_name, celeb_gender, knowledge, "en_core_web_sm"))
+
+    prompt = st.chat_input("Say something")
+    if prompt:
+        celeb_bot.text = prompt
+        # Display user message in chat message container
+        st.chat_message("user").markdown(prompt)
+        # Add user message to chat history
+        st.session_state["messages"].append({"role": "user", "content": prompt})
+
+        # Generate the assistant response
+        response = celeb_bot.question_answer()
+
+        # Disable library-level autoplay; the clip autoplays from the HTML tag instead
+        b64 = celeb_bot.text_to_speech(autoplay=False)
+        md = f"""
+        <p>{response}</p>
+        <audio controls autoplay style="display:none;">
+            <source src="data:audio/wav;base64,{b64}" type="audio/wav">
+            Your browser does not support the audio element.
+        </audio>
+        """
+        # Display assistant response in chat message container
+        st.chat_message("assistant").markdown(
+            md,
+            unsafe_allow_html=True,
+        )
+        # Add assistant response to chat history
+        st.session_state["messages"].append({"role": "assistant", "content": response})
+
+
+if __name__ == "__main__":
+    main()
celebbot.py
ADDED
@@ -0,0 +1,150 @@
+import datetime
+import numpy as np
+import torch
+import torch.nn.functional as F
+import speech_recognition as sr
+import time
+import pickle
+import streamlit as st
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModel
+from sklearn.metrics.pairwise import cosine_similarity
+import run_tts
+
+# Build the AI
+class CelebBot():
+    def __init__(self, name, QA_tokenizer, QA_model, sentTr_tokenizer, sentTr_model, spacy_model, knowledge_sents):
+        self.name = name
+        print("--- starting up", self.name, "---")
+        self.text = ""
+        self.QA_tokenizer = QA_tokenizer
+        self.QA_model = QA_model
+
+        self.sentTr_tokenizer = sentTr_tokenizer
+        self.sentTr_model = sentTr_model
+        self.spacy_model = spacy_model
+
+        self.all_knowledge = knowledge_sents
+
+    @st.cache_resource
+    def get_seq2seq_model(self, _model_id):
+        return AutoModelForSeq2SeqLM.from_pretrained(_model_id)
+
+    @st.cache_resource
+    def get_model(self, _model_id):
+        return AutoModel.from_pretrained(_model_id)
+
+    @st.cache_resource
+    def get_tokenizer(self, _model_id):
+        return AutoTokenizer.from_pretrained(_model_id)
+
+    def speech_to_text(self):
+        recognizer = sr.Recognizer()
+        with sr.Microphone() as mic:
+            recognizer.adjust_for_ambient_noise(mic, duration=1)
+            time.sleep(1)
+            print("listening")
+            audio = recognizer.listen(mic)
+        try:
+            self.text = recognizer.recognize_google(audio)
+        except Exception:
+            self.text = ""
+            print("me --> No audio recognized")
+
+    def wake_up(self, text):
+        return ("hey " + self.name).lower() in text.lower()
+
+    def text_to_speech(self, autoplay=True):
+        return run_tts.tts(self.text, "_".join(self.name.split(" ")), self.spacy_model, autoplay)
+
+    def sentence_embeds_inference(self, texts: list):
+        def _mean_pooling(model_output, attention_mask):
+            token_embeddings = model_output[0]  # first element of model_output contains all token embeddings
+            input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+            return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+        # Tokenize sentences
+        encoded_input = self.sentTr_tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
+
+        # Compute token embeddings
+        with torch.no_grad():
+            model_output = self.sentTr_model(**encoded_input)
+
+        # Perform pooling
+        sentence_embeddings = _mean_pooling(model_output, encoded_input['attention_mask'])
+
+        # Normalize embeddings
+        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
+
+        return sentence_embeddings
+
+    def retrieve_knowledge_assertions(self):
+        question_embeddings = self.sentence_embeds_inference([self.name + ', ' + self.text])
+
+        all_knowledge_embeddings = self.sentence_embeds_inference(self.all_knowledge)
+        similarity = cosine_similarity(all_knowledge_embeddings.cpu(), question_embeddings.cpu())
+        similarity = np.reshape(similarity, (1, -1))[0]
+        K = min(8, len(self.all_knowledge))
+        top_K = np.sort(np.argpartition(similarity, -K)[-K:])
+        all_knowledge_assertions = np.array(self.all_knowledge)[top_K]
+        return ' '.join(all_knowledge_assertions)
+
+    def question_answer(self, instruction1='', knowledge=''):
+        if self.text != "":
+            ## wake up
+            if self.wake_up(self.text):
+                self.text = f"Hello I am {self.name} the AI, what can I do for you?"
+            ## have a conversation
+            else:
+                instruction1 = f'[Instruction] You are a celebrity named {self.name}. You need to answer the question based on knowledge and commonsense.'
+                knowledge = self.retrieve_knowledge_assertions()
+                query = f"{instruction1} [knowledge] {knowledge} [question] {self.text} {self.name}!"
+                input_ids = self.QA_tokenizer(query, return_tensors="pt").input_ids
+                outputs = self.QA_model.generate(input_ids, max_length=1024)
+                self.text = self.QA_tokenizer.decode(outputs[0], skip_special_tokens=True)
+        return self.text
+
+    @staticmethod
+    def action_time():
+        return f"it's {datetime.datetime.now().time().strftime('%H:%M')}"
+
+    @staticmethod
+    def save_kb(kb, filename):
+        with open(filename, "wb") as f:
+            pickle.dump(kb, f)
+
+    @staticmethod
+    def load_kb(filename):
+        with open(filename, "rb") as f:
+            return pickle.load(f)
data.json
ADDED
@@ -0,0 +1,290 @@
+{
+    "Cate Blanchett": {
+        "knowledge": "Blanchett, (born May 14, 1969, Melbourne, Victoria, Australia), Australian actress known for her multidimensional characters and wide range of roles. Blanchett grew up in suburban Melbourne with an Australian mother and an American father, who died when Blanchett was 10 years old. She studied art history at the University of Melbourne before graduating from the National Institute of Dramatic Art in 1992. Her professional acting career began on the Australian stage. She performed with the Sydney Theatre Company in Caryl Churchill’s Top Girls and Timothy Daly’s Kafka Dances. In 1993 she starred opposite Geoffrey Rush in David Mamet’s Oleanna, as a student who accuses her teacher of sexual harassment. Blanchett made her television debut in 1993, and she soon landed leading roles in the miniseries Heartland (1994) and Bordertown (1995). She moved to feature films with Paradise Road (1997), a historical drama about a Japanese war camp in World War II. Blanchett’s reputation grew with her next two feature films: the bittersweet romantic comedy Thank God He Met Lizzie (1997; later released as The Wedding Party) and Oscar and Lucinda (1997), in which she played a rebellious heiress ostracized from Australian society. Her breakthrough role was as young Queen Elizabeth I in the 1998 film Elizabeth, which earned her an Academy Award nomination and a Golden Globe Award for best actress. Blanchett was praised for capturing the emotional complexity of the queen’s development from a lovestruck adolescent to an indomitable political force who represses her emotional vulnerability. Blanchett subsequently appeared in films that covered numerous genres and character types, securing her reputation as a versatile actress. She took supporting parts in Pushing Tin (1999), a comedy about air traffic controllers, and in the dramatic thriller The Talented Mr. Ripley (1999). As the lead character in The Gift (2000), she played a psychic whose visions involve her in the investigation of a local woman’s murder. In 2001 she portrayed a kidnapped housewife who falls in love with her captors in Bandits. She next appeared as the elf queen Galadriel in The Lord of the Rings trilogy (2001, 2002, and 2003), the film adaptations of J.R.R. Tolkien’s epic fantasy. In the western The Missing (2003), Blanchett brought her trademark complexity to the role of a young woman forced to confront her estranged father (played by Tommy Lee Jones) in order to reclaim her kidnapped daughter. She earned further critical acclaim for her performance as an Irish journalist who runs afoul of the mob in Veronica Guerin (2003). In 2004 she starred in Wes Anderson’s offbeat comedy The Life Aquatic with Steve Zissou, playing a pregnant reporter in a bizarre love triangle with the ship’s captain (played by Bill Murray) and someone who may be his son (played by Owen Wilson). Returning to her study of historical characters, Blanchett portrayed Hollywood star Katharine Hepburn in The Aviator (2004), Martin Scorsese’s biopic of the eccentric millionaire Howard Hughes, and won an Academy Award for the role. She later appeared in the dramas Babel (2006), The Good German (2006), and Notes on a Scandal (2006). In the unconventional biopic I’m Not There (2007), she starred as one of several characters based on the musician Bob Dylan at different stages in his life. As the character Jude, a star making the dramatic shift from acoustic folk to electric rock, Blanchett was praised for capturing the elusive and bewildering qualities attributed to Dylan. Her performance earned her another Academy Award nomination and a Golden Globe Award for best supporting actress. In 2007 Blanchett reprised her role as the English queen in Elizabeth: The Golden Age, which explores Elizabeth’s political battles with Spain and her personal relationship with Sir Walter Raleigh; she earned another Oscar nomination for her performance. The following year she played the Soviet villain Irina Spalko in Steven Spielberg’s Indiana Jones and the Kingdom of the Crystal Skull (2008), another addition to the series of action-adventure films following the dashing archaeologist. In 2008 she also starred opposite Brad Pitt in The Curious Case of Benjamin Button, a drama about a man who ages backward. Two years later she appeared as Marion Loxley in Ridley Scott’s Robin Hood. The action drama starred Russell Crowe in the title role as the outlaw hero. In the thriller Hanna (2011), Blanchett portrayed a CIA agent in pursuit of a former agent and his teenage daughter, whom he has trained to be an assassin. Blanchett again assumed the role of Galadriel in the Hobbit trilogy—An Unexpected Journey (2012), The Desolation of Smaug (2013), and The Battle of the Five Armies (2014), all based on the Tolkien novel that preceded The Lord of the Rings. Her performance in Woody Allen’s Blue Jasmine (2013), as a socialite struggling to cope with a decline in circumstances, won her further acclaim, including an Oscar for best actress; she also received her third Golden Globe. She played a French art historian and Resistance member in The Monuments Men (2014), which fictionalized Allied efforts to recover art stolen by the Nazis during World War II. Blanchett then sank her teeth into the role of the wicked stepmother of the title character in Cinderella (2015). In Truth (2015) she played CBS producer Mary Mapes, who was fired after the accuracy of a segment by reporter Dan Rather on U.S. Pres. George W. Bush’s military service was called into question. Carol, a drama in which she played a married socialite who enters a romantic relationship with a younger store clerk (Rooney Mara), earned her a seventh Oscar nomination. She then joined the ensemble of Knight of Cups (2015), Terrence Malick’s experimental meditation on Hollywood, and later appeared in his film Song to Song (2017), a romantic drama set against the Austin, Texas, music scene. Also in 2017 Blanchett earned critical praise for her vivacious portrayal of Hela, the goddess of death, in Thor: Ragnarok. The next year she starred in Ocean’s 8, the female-driven reboot of the Ocean’s Eleven franchise from the early 2000s, and The House with a Clock in Its Walls, an adaptation of a 1973 children’s fantasy novel. Blanchett was then lauded for her guest appearance as a performance artist akin to Marina Abramović on the mockumentary TV series Documentary Now! in 2019. That year she also played the eponymous character in Where’d You Go, Bernadette, a film based on the best-selling novel. Blanchett’s credits from 2020 included the TV miniseries Mrs. America, in which she portrayed the activist Phyllis Schlafly, who opposed the Equal Rights Amendment. In 2021 Blanchett appeared in the films Don’t Look Up, a dramedy about an impending comet strike that will destroy Earth, and Guillermo del Toro’s Nightmare Alley. In the latter, a film noir adapted from a novel by William Lindsay Gresham, the actress played a manipulative psychoanalyst who meets a scheming carnival worker (Bradley Cooper). Blanchett earned widespread acclaim for her performance in Tár (2022), a character study about a trailblazing conductor whose career is derailed by allegations of sexual misconduct. In addition to winning a Golden Globe, she also earned her eighth Oscar nomination. In addition to her film work, Blanchett remained active in the theatre. In 2008 she and her husband, writer Andrew Upton, became artistic directors of the Sydney Theatre Company. Blanchett left the position in 2013, though Upton remained. Her performances with the company included Hedda Gabler (2004) and The War of the Roses (2009). In 2017 she made her Broadway debut in The Present, which was based on a play by Anton Chekhov. For her performance, Blanchett received a Tony Award nomination.",
+        "questions": [
+            "When and where were you born?",
+            "What was your educational background, and where did you study art history?",
+            "Where did you begin your professional acting career?",
+            "What was your breakthrough role, and what award did you receive for it?",
+            "In which films did you play historical figures, and what were your characters' names?",
+            "In which film did you portray a character with psychic abilities?",
+            "What role did you play in 'The Lord of the Rings' trilogy?",
+            "What character did you portray in 'The Missing' (2003)?",
+            "In which film did you play a Hollywood star and win an Academy Award?",
+            "Mention three films in which you appeared in 2006.",
+            "In 'I'm Not There' (2007), which musician's various stages of life did you portray?",
+            "What was your role in 'Elizabeth: The Golden Age' (2007), and what was the movie about?",
+            "Who did you play in 'Indiana Jones and the Kingdom of the Crystal Skull' (2008)?",
+            "Describe your character in 'Blue Jasmine' (2013) and the recognition you received for it.",
+            "What was your role in 'Cinderella' (2015)?",
+            "In 'Carol' (2015), what was the character you portrayed, and who played your love interest?",
+            "Name two films you appeared in directed by Terrence Malick.",
+            "What character did you play in 'Thor: Ragnarok' (2017)?",
+            "In which TV series did you portray an activist opposing the Equal Rights Amendment?",
+            "What were two of your notable film appearances in 2021, and what were these movies about?"
+        ],
+        "answers": [
+            "I was born on May 14, 1969, in Melbourne, Victoria, Australia.",
+            "I studied art history at the University of Melbourne and graduated from the National Institute of Dramatic Art in 1992.",
+            "I began my professional acting career in Australia.",
+            "My breakthrough role was as young Queen Elizabeth I in the 1998 film 'Elizabeth,' for which I earned a Golden Globe Award.",
+            "I played historical figures in 'Elizabeth' (Queen Elizabeth I) and 'The Aviator' (Katharine Hepburn).",
+            "I portrayed a character with psychic abilities in 'The Gift'.",
+            "I played the character Galadriel.",
+            "I portrayed a young woman forced to confront my estranged father in order to reclaim my kidnapped daughter.",
+            "The Aviator",
+            "In 2006, I appeared in 'Babel,' 'The Good German,' and 'Notes on a Scandal.'",
+            "In 'I'm Not There' (2007), I portrayed Bob Dylan's character, Jude.",
+            "My role in 'Elizabeth: The Golden Age' (2007) was Queen Elizabeth I. The movie explored her political battles with Spain and her personal relationship with Sir Walter Raleigh.",
+            "I played the character Irina Spalko.",
+            "I played a socialite struggling to cope with a decline in circumstances and won an Oscar for Best Actress and a Golden Globe Award.",
+            "My role was the wicked stepmother.",
+            "I portrayed Carol, and Rooney Mara played my love interest.",
+            "I appeared in Terrence Malick's films 'Knight of Cups' and 'Song to Song.'",
+            "I played the character Hela, the goddess of death.",
+            "In the TV series 'Mrs. America.'",
+            "In 2021, I appeared in 'Don't Look Up', a dramedy about an impending comet strike that will destroy Earth, and 'Nightmare Alley', a film noir adapted from a novel by William Lindsay Gresham."
+        ],
+        "gender": "F"
+    },
+    "David Beckham": {
+        "knowledge": "Beckham, (born May 2, 1975, Leytonstone, East London, England), English football (soccer) player who gained international fame for his on-field play as well as for his highly publicized personal life. At age 11 Beckham won a football contest, and as a teenager he competed on Manchester United’s youth squad, leading it to a national championship in 1992. Three years later he began playing with the professional team in league competition, and during the 1995–96 season he helped Manchester United win the league title and the Football Association (FA) Cup. Beckham attracted national attention in August 1996 when he scored a goal from the halfway line (a feat roughly equivalent to a golfer’s hole in one). The following year Manchester United successfully defended its league title, and Beckham was voted Young Player of the Year. In the 1998–99 season Manchester United won the league title, the FA Cup, and the European Cup. Beckham was named best midfielder and Most Valuable Player. Considered one of the sport’s elite players, he was perhaps best known for his free kicks and crosses; the 2002 film Bend It Like Beckham paid homage to his kicking ability. After helping Manchester United win three more league titles (2000, 2001, and 2003), he left the team in 2003 to join the Spanish football club Real Madrid. Four years later he signed a record-setting deal with the Los Angeles Galaxy of Major League Soccer (MLS) in the United States. In October 2008 Beckham signed to play with Italian football powerhouse AC Milan during the MLS off-season. In 2011 he helped the Galaxy win an MLS Cup title. The Galaxy won a second MLS Cup title in 2012, and Beckham left the team at the end of the season. In 2013 he joined the French first-league team Paris Saint-Germain (PSG), and PSG won the French domestic title in his one season with the team. Beckham retired from football soon after winning his championship with PSG. In 1996 Beckham first played on England’s national team, in a World Cup qualifying match. At the 1998 World Cup he drew much criticism after he was ejected from a game for kicking an opponent. England lost the match and was eliminated from the competition. In 2000 Beckham was made captain of the national team. At the 2002 and 2006 World Cups, England was defeated in the quarterfinals. After the 2006 tournament, Beckham stepped down as captain, and he was later dropped from England’s national team. He was recalled to the team in 2007, and the following year he posted his 100th international appearance, becoming the fifth person to do so in the history of English football. Beckham was poised to be the first Englishman to appear in four World Cups, but he tore his Achilles tendon while playing for AC Milan in March 2010 and was ruled out for the 2010 tournament. A healthy but older Beckham was not selected for the English side at the 2012 European Championship, and he finished his national career with 115 international games played, the most in his country’s history for a non-goalkeeper. After his playing days ended, Beckham remained involved in soccer. He notably was the owner and president of the MLS team Inter Miami CF, which made its debut in 2020. In 1999 Beckham married singer Victoria Adams, best known as “Posh Spice” of the Spice Girls pop group, in a lavish ceremony. The intense media attention to the couple increased Beckham’s popularity around the world, as did his style of dress and ever-changing hairstyles. In 2003 he was made an Officer of the Order of the British Empire (OBE). He released an eponymous memoir in 2014.",
+        "questions": [
+            "When and where were you born?",
+            "What did you win at the age of 11?",
+            "Which professional football team did you join as a teenager?",
+            "In which season did you and your team win the league title and the FA Cup?",
+            "What memorable feat did you achieve in August 1996?",
+            "In which season did you and your team win the league title, FA Cup, and the European Cup?",
+            "What were you particularly known for in your football career?",
+            "Which Spanish football club did you join in 2003?",
+            "Which team did you sign a record-setting deal with in 2007?",
+            "In which year did you help the LA Galaxy win an MLS Cup title?",
+            "How many MLS Cup titles did the LA Galaxy win during your time with the team?",
+            "Which French first-league team did you join in 2013, and what did you achieve during your time with them?",
+            "When did you retire from professional football?",
+            "In which year did you first play for England's national team, and in what type of match?",
+            "What controversy surrounded you at the 1998 World Cup?",
+            "When were you made the captain of England's national team?",
+            "How far did England go in the 2002 and 2006 World Cups with you as part of the team?",
+            "What happened to you in your football career after the 2006 World Cup?",
+            "How many international appearances did you make for England, and what record did you set in 2008?",
+            "What did you do after retiring from professional football, and which soccer team were you notably involved with?"
+        ],
+        "answers": [
+            "I was born on May 2, 1975, in Leytonstone, East London, England.",
+            "I won a football contest at the age of 11.",
+            "I joined Manchester United's youth squad as a teenager.",
+            "In the 1995–96 season.",
+            "I scored a memorable goal from the halfway line.",
+            "In the 1998–99 season, Manchester United won the league title, the FA Cup, and the European Cup.",
+            "I was particularly known for my free kicks and crosses.",
+            "I joined the Spanish football club Real Madrid.",
+            "I signed a record-setting deal with the Los Angeles Galaxy of Major League Soccer (MLS) in 2007.",
+            "I helped the LA Galaxy win an MLS Cup title in 2011.",
+            "The LA Galaxy won two MLS Cup titles during my time with the team.",
+            "I joined Paris Saint-Germain (PSG) in 2013, and PSG won the French domestic title in my one season with the team.",
+            "I retired from football soon after winning the championship with PSG.",
+            "I first played on England's national team in a World Cup qualifying match in 1996.",
+            "I drew criticism at the 1998 World Cup when I was ejected from a game for kicking an opponent.",
+            "I was made captain of the national team in 2000.",
+            "At the 2002 and 2006 World Cups, England was defeated in the quarterfinals with me as part of the team.",
+            "I stepped down as captain, and I was later dropped from England's national team.",
+            "I posted my 100th international appearance, becoming the fifth person to do so in the history of English football.",
+            "I remained involved in soccer and was the owner and president of the MLS team Inter Miami CF."
+        ],
+        "gender": "M"
+    },
+    "Emma Watson": {
+        "knowledge": "Watson, (born April 15, 1990, Paris, France), British actress and activist who was perhaps best known for playing the young wizard Hermione Granger in the Harry Potter films. She also garnered attention as a spokesperson for women’s equality. Watson was born in Paris to British parents who divorced when she was young. She and her brother went to live with their mother in Oxfordshire, England. While a child, Watson decided she wanted to be an actress. Besides attending school, she took acting and singing classes. She also appeared in several school plays. Watson began acting in earnest in 1999 after she auditioned for a part in the film adaptation of J.K. Rowling’s Harry Potter and the Sorcerer’s Stone (2001). She won the role of smart and logical Hermione, one of Harry Potter’s best friends. The film was a box-office hit, and Watson reprised her role in the franchise’s other movies: Harry Potter and the Chamber of Secrets (2002), Harry Potter and the Prisoner of Azkaban (2004), Harry Potter and the Goblet of Fire (2005), Harry Potter and the Order of the Phoenix (2007), Harry Potter and the Half-Blood Prince (2009), Harry Potter and the Deathly Hallows: Part 1 (2010), and Harry Potter and the Deathly Hallows: Part 2 (2011). After the Potter films ended, Watson began to look for more mature roles. Her first major part was in the drama The Perks of Being a Wallflower (2012), in which she played a high-school senior who becomes friends with a clinically depressed freshman. She subsequently appeared in the crime drama The Bling Ring (2013), the historical thriller The Colony (also known as Colonia; 2015), and the sci-fi thriller The Circle (2017). These films had limited success at the box-office, but Watson had another blockbuster hit with the live-action Disney adaptation (2017) of Beauty and the Beast. In 2019 she appeared as Meg March in Greta Gerwig’s acclaimed Little Women, which was based on Louisa May Alcott’s classic children’s book. Meanwhile, in the midst of her acting career, Watson pursued a college degree. In 2009 she began attending Brown University in Providence, Rhode Island. She took time off as needed for filming, and she also studied for a year at the University of Oxford. Watson graduated from Brown in 2014 with a bachelor’s degree in English literature. That year she was named a UN Women Goodwill Ambassador. She was an advocate for women’s rights and gender equality. From 2016 to 2020 Watson ran an online feminist book club, Our Shared Shelf, to read and discuss books by and about women.",
+        "questions": [
+            "When and where were you born?",
+            "What are you best known for in your career?",
+            "Where did you grow up and who did you live with after your parents' divorce?",
+            "What did you decide to be as a child?",
+            "What steps did you take to pursue your interest in acting as a child?",
+            "In which year did you begin your acting career?",
+            "How did you land the role of Hermione Granger in the Harry Potter films?",
+            "Can you name some of the Harry Potter films in which you played Hermione?",
+            "What was your first major role after the Harry Potter series ended?",
+            "What was the plot of 'The Perks of Being a Wallflower'?",
+            "In which films did you subsequently appear after 'The Perks of Being a Wallflower'?",
+            "Can you name the Disney adaptation in which you had a blockbuster hit?",
+            "In 2019, who did you portray in the film 'Little Women,' and what is the source of the story?",
+            "Where and when did you pursue your college degree?",
+            "What was your major at Brown University, and when did you graduate?",
+            "What role did you take on in 2014, and for what organization?",
+            "What were some of your advocacies and causes as a UN Women Goodwill Ambassador?",
+            "What was the purpose of 'Our Shared Shelf,' the online book club you ran from 2016 to 2020?",
+            "How did you balance your education with your acting career?",
+            "What are some of your notable achievements and contributions?"
+        ],
+        "answers": [
+            "I was born on April 15, 1990, in Paris, France.",
+            "I am best known for playing the young wizard Hermione Granger in the Harry Potter films.",
+            "I grew up in Paris, France, and after my parents' divorce, I lived with my mother and brother in Oxfordshire, England.",
+            "I decided I wanted to be an actress when I was a child.",
+            "I took acting and singing classes and appeared in several school plays.",
+            "I began my acting career in 1999.",
+            "I auditioned for the role of Hermione Granger in the film adaptation of 'Harry Potter and the Sorcerer's Stone' (2001) and won the part.",
+            "I played Hermione Granger in the entire Harry Potter film series, including 'Harry Potter and the Chamber of Secrets' (2002), 'Harry Potter and the Prisoner of Azkaban' (2004), 'Harry Potter and the Goblet of Fire' (2005), 'Harry Potter and the Order of the Phoenix' (2007), 'Harry Potter and the Half-Blood Prince' (2009), 'Harry Potter and the Deathly Hallows: Part 1' (2010), and 'Harry Potter and the Deathly Hallows: Part 2' (2011).",
+            "My first major role after the Harry Potter series was in the drama 'The Perks of Being a Wallflower' (2012).",
+            "The film 'The Perks of Being a Wallflower' is about the journey of a clinically depressed high school freshman and the friendships he forms.",
+            "After 'The Perks of Being a Wallflower,' I appeared in films like 'The Bling Ring' (2013), 'The Colony' (also known as 'Colonia'; 2015), and 'The Circle' (2017).",
+            "I had a blockbuster hit with the live-action Disney adaptation of 'Beauty and the Beast' in 2017.",
+            "In 2019, I portrayed Meg March in Greta Gerwig's acclaimed adaptation of 'Little Women,' based on Louisa May Alcott's classic children's book.",
+            "I began attending Brown University in Providence, Rhode Island, in 2009. I took time off as needed for filming and also studied for a year at the University of Oxford.",
+            "I graduated from Brown in 2014 with a bachelor's degree in English literature.",
+            "In 2014, I was named a UN Women Goodwill Ambassador.",
+            "As a UN Women Goodwill Ambassador, I advocated for women's rights and gender equality.",
+            "I ran the book club to read and discuss books by and about women.",
+            "I took time off from university as needed for filming.",
+            "Some of my notable achievements and contributions include my acting career and advocacy for women's rights."
+        ],
+        "gender": "F"
+    },
+    "Lady Gaga": {
+        "knowledge": "Lady Gaga, (born March 28, 1986, New York City, New York, U.S.), American singer-songwriter and performance artist, known for her flamboyant costumes, provocative lyrics, and strong vocal talents, who achieved enormous popular success with songs such as “Just Dance,” “Bad Romance,” and “Born This Way.” Germanotta was born into an Italian American family in New York City. She learned music at an early age and was performing onstage in New York City clubs by the time she was a teenager. She attended an all-girls school, Convent of the Sacred Heart, in Manhattan before going on to study music at the Tisch School of the Arts at New York University. She studied at Tisch for two years before dropping out to manage her own career. After dropping out, she began transforming herself from Germanotta into Lady Gaga, whose style combined glam rock and over-the-top fashion design. In 2007 she and performance artist Lady Starlight formed a revue called the Ultimate Pop Burlesque Rockshow. That same year Lady Gaga, who also wrote songs for other pop artists such as Fergie, the Pussycat Dolls, and Britney Spears, was signed by the singer Akon to Interscope Records and began preparing her debut album, The Fame, which was released in 2008. Although she modeled herself on such theatrical performers as David Bowie during his Ziggy Stardust period, the New York Dolls, Grace Slick, and Freddie Mercury—her adopted stage name was derived from Queen’s song “Radio Ga Ga”—she created a character that came to occupy a unique space in the music world. Her fashion combined with her up-tempo, synthetic dance music and her edgy, theatrical performance to create stunning sounds and visuals. Indeed, while producing music, Lady Gaga also created her own sexually charged fashions—replete with dazzling wigs and space-age bodysuits—through her creative team Haus of Gaga. Her first single, “Just Dance,” became popular in clubs throughout the United States and Europe and eventually landed at number one on the Billboard Pop Songs chart (also called the radio chart). Three other singles off The Fame—“Poker Face,” “LoveGame,” and “Paparazzi”—also reached number one on the radio chart, making Lady Gaga the first artist in the 17-year history of that chart to have four number ones from a debut album. The Fame was well received critically and proved enormously successful commercially, selling more than eight million copies worldwide by the end of 2009. The album also yielded Lady Gaga five Grammy nominations, including for album of the year and song of the year (“Poker Face”); she captured two Grammys—best dance recording (“Poker Face”) and best electronic/dance album (The Fame)—and her opening duet with Sir Elton John was among the most talked-about elements of the 2010 Grammys telecast. In February 2010 she also picked up three Brit Awards (the British equivalent of the Grammys)—for best international female, best album, and breakthrough act. Her second album, The Fame Monster, was released in November 2009 (it was originally conceived as a bonus disc) and almost instantly produced another hit, “Bad Romance.” Other popular singles from the album followed, including “Telephone” (which featured Beyoncé, as did a nine-minute video produced by Jonas Åkerlund starring the pair and referencing Quentin Tarantino’s film Kill Bill: Vol. 1 [2003]) and “Alejandro.” During 2010 Lady Gaga proved to be one of the most commercially successful artists, with a sold-out concert tour (which had been launched to coincide with the release of The Fame Monster), while she also headlined Chicago’s Lollapalooza music festival and played in front of a record 20,000 people at NBC’s Today show. She was named one of Time magazine’s 100 Most Influential People and was named by Forbes magazine as one of the world’s most powerful women, and she capped off 2010 by being named Billboard magazine’s artist of the year. After arriving at the 2011 Grammy Awards ceremony encased in a giant egg, Lady Gaga went on to claim honours for best pop vocal album (for The Fame Monster) and best female pop vocal performance and best short form video (for “Bad Romance”). Lady Gaga’s third album, Born This Way (2011), found the entertainer reaching back to earlier musical eras for inspiration. As a blonde dance-pop performer with a penchant for provocation, Lady Gaga had often earned comparisons to the singer Madonna, and on the album’s first two singles the similarities were especially pronounced. The title track was a self-empowerment anthem in the style of Madonna’s 1989 single “Express Yourself,” while “Judas” brazenly mixed sexual and religious imagery. Both songs quickly became hits. Other tracks on the album featured guest appearances from guitarist Brian May of Queen and saxophonist Clarence Clemons of Bruce Springsteen’s E Street Band. In 2013 Lady Gaga released Artpop. Although the energetic lead single “Applause” extended her string of chart successes, the album was perceived as a commercial disappointment. She came back the following year with Cheek to Cheek, a collection of standards that she recorded with Tony Bennett. The recording topped the Billboard 200 as well as the jazz and traditional jazz album charts, and it earned the Grammy for best traditional pop vocal album. The duo also won that award for their second collaboration, Love for Sale (2021), a tribute album to Cole Porter. During this time Lady Gaga continued to record solo albums. The relatively understated Joanne (2016) performed poorly until Lady Gaga’s halftime Super Bowl performance in February 2017 brought it favourable attention. For her sixth studio album, Chromatica (2020), Lady Gaga returned to her earlier music, mixing disco and electronic-pop. In addition to recording music, Lady Gaga made occasional film appearances, notably in Machete Kills (2013) and Sin City: A Dame to Kill For (2014). She played a vampiric countess with no regard for life or suffering in the fifth season of the television show American Horror Story: Hotel (2015–16). For her performance in the anthology series, Lady Gaga received a Golden Globe Award. She also appeared in the sixth season, which aired in 2016. Lady Gaga garnered critical acclaim and an Academy Award nomination for her first lead role, a guileless up-and-coming singer-songwriter in the 2018 remake of the movie A Star Is Born. She cowrote most of that movie’s songs, many of which she performed with costar and director Bradley Cooper. The lead single, “Shallow,” won two Grammy Awards and the Oscar for best original song. In 2021 Lady Gaga appeared in Ridley Scott’s House of Gucci, which centres on the true story of the murder of Maurizio Gucci, who headed his family’s luxury fashion brand. Lady Gaga also contributed songs to other films. She notably cowrote and performed “Til It Happens to You” for the documentary The Hunting Ground (2015) and “Hold My Hand” for Top Gun: Maverick (2022). Both tracks received Oscar nominations for best original song. Lady Gaga cultivated a devoted following, particularly among gay men (she acknowledged her own bisexuality), who became some of her most loyal fans. She became particularly outspoken on gay rights, especially same-sex marriage, and was a featured speaker at the 2009 National Equality March in Washington, D.C. In 2021 Lady Gaga sang the national anthem at the U.S. presidential inauguration of Joe Biden.",
+        "questions": [
+            "When and where were you born?",
+            "What is your family background?",
+            "Which New York City school did you attend before pursuing music?",
+            "Where did you study music before dropping out to manage your own career?",
+            "What was the name of the revue you formed with Lady Starlight in 2007?",
+            "Who signed you to Interscope Records in 2007?",
+            "What was the title of your debut album, and when was it released?",
+            "How many Grammy nominations did you receive for 'The Fame' album, and in which categories did you win?",
+            "What was the title of your second album, and what was its initial purpose?",
+            "Which hit single from 'The Fame Monster' was released in November 2009?",
+            "Which famous artist was featured in the song 'Telephone'?",
+            "In 2010, what was the capacity of the crowd at your performance on NBC's Today show?",
+            "Which magazines named you one of the most influential people and one of the world's most powerful women in 2010?",
+            "Who did you collaborate with for the album 'Cheek to Cheek'?",
+            "What type of songs did you record for 'Cheek to Cheek,' and which Grammy did it win?",
+            "What was the title of your sixth studio album, released in 2020?",
+            "In which television series did you portray a vampiric countess and receive a Golden Globe Award?",
+            "For which role did you receive an Academy Award nomination, and what was the name of the movie?",
+            "For which song from the movie 'A Star Is Born' did you win two Grammy Awards and an Oscar for Best Original Song?",
+            "In Ridley Scott's 'House of Gucci,' what true story does the movie center on?"
+        ],
+        "answers": [
+            "I was born on March 28, 1986, in New York City.",
+            "I was born into an Italian American family.",
+            "I attended the Convent of the Sacred Heart in Manhattan.",
+            "I studied music at the Tisch School of the Arts at New York University before dropping out to manage my own career.",
+            "The revue I formed with Lady Starlight in 2007 was called the 'Ultimate Pop Burlesque Rockshow.'",
+            "I was signed by the singer Akon to Interscope Records in 2007.",
+            "My debut album was titled 'The Fame,' and it was released in 2008.",
+            "I received five Grammy nominations for 'The Fame,' and I won two Grammys for 'Poker Face' (best dance recording) and 'The Fame' (best electronic/dance album).",
+            "My second album was titled 'The Fame Monster,' and it was originally conceived as a bonus disc.",
+            "The hit single from 'The Fame Monster' that was released in November 2009 was 'Bad Romance.'",
+            "Beyoncé was featured in the song 'Telephone.'",
+            "In 2010, I played in front of a record 20,000 people at NBC's Today show.",
+            "I was named one of Time magazine's 100 Most Influential People and was named by Forbes magazine as one of the world's most powerful women in 2010.",
+            "I collaborated with Tony Bennett for the album 'Cheek to Cheek.'",
+            "'Cheek to Cheek' featured standards and won the Grammy for best traditional pop vocal album.",
+            "My sixth studio album, released in 2020, was titled 'Chromatica.'",
+            "In the fifth season of the television show 'American Horror Story: Hotel' (2015–16).",
+            "I received an Academy Award nomination for my lead role in the 2018 remake of the movie 'A Star Is Born.'",
+            "The song 'Shallow' from 'A Star Is Born'.",
+            "It centers on the true story of the murder of Maurizio Gucci, who headed his family's luxury fashion brand."
+        ],
+        "gender": "F"
+    },
+    "Madonna": {
+        "knowledge": "Madonna, (born August 16, 1958, Bay City, Michigan, U.S.), American singer, songwriter, actress, and entrepreneur whose immense popularity in the 1980s and ’90s allowed her to achieve levels of power and control that were nearly unprecedented for a woman in the entertainment industry. Born into a large Italian American family, Madonna studied dance at the University of Michigan and with the Alvin Ailey American Dance Theater in New York City in the late 1970s before relocating briefly to Paris as a member of Patrick Hernandez’s disco revue. Returning to New York City, she performed with a number of rock groups before signing with Sire Records. Her first hit, “Holiday,” in 1983, provided the blueprint for her later material—an upbeat dance club sound with sharp production and an immediate appeal. Madonna’s melodic pop incorporated catchy choruses, and her lyrics concerned love, sex, and relationships—ranging from the breezy innocence of “True Blue” (1986) to the erotic fantasies of “Justify My Love” (1990) to the spirituality of later songs such as “Ray of Light” (1998). Criticized by some as being limited in range, her sweet girlish voice nonetheless was well suited to pop music. Madonna was the first female artist to exploit fully the potential of the music video. She collaborated with top designers (Jean-Paul Gaultier), photographers (Steven Meisel and Herb Ritts), and directors (Mary Lambert and David Fincher), drawing inspiration from underground club culture or the avant-garde to create distinctive sexual and satirical images—from the knowing ingenue of “Like a Virgin” (1984) to the controversial red-dressed “sinner” who kisses a Black saint in “Like a Prayer” (1989). By 1991 she had scored 21 top ten hits in the United States and sold some 70 million albums internationally, generating $1.2 billion in sales. Committed to controlling her image and career herself, Madonna became the head of Maverick, a subsidiary of Time Warner created by the entertainment giant as part of a $60 million deal with the performer. Her success signaled a clear message of financial control to other women in the industry, but in terms of image she was a more ambivalent role model. In 1992 Madonna took her role as a sexual siren to its full extent when she published Sex, a soft-core pornographic coffee-table book featuring her in a variety of “erotic” poses. She was criticized for being exploitative and overcalculating, and writer Norman Mailer said she had become “secretary to herself.” Soon afterward Madonna temporarily withdrew from pop music to concentrate on a film career that had begun with a strong performance in Desperately Seeking Susan (1985), faltered with the flimsy Shanghai Surprise (1986) and Dick Tracy (1990), and recovered with Truth or Dare (1991, also known as In Bed with Madonna), a documentary of one of her tours, and A League of Their Own (1992). She scored massive success in 1996 with the starring role in the film musical Evita. That year she also gave birth to a daughter. In 1998 Madonna released her first album of new material in four years, Ray of Light. A fusion of techno music and self-conscious lyrics, it was a commercial and critical success, earning the singer her first musical Grammy Awards, among them the award for best pop album (her previous win had been for a video). She won another Grammy the following year, for the song “Beautiful Stranger,” which she cowrote and performed for the movie Austin Powers: The Spy Who Shagged Me (1999). Her experimentation in electronica continued with Music (2000). In 2005 she returned to her roots with Confessions on a Dance Floor, which took the Grammy for best electronic/dance album. Despite a marriage in the 1980s to actor Sean Penn and another to English director Guy Ritchie (married 2000; divorced 2008), with whom she had a son, Madonna remained resolutely independent. (She also later adopted four children from Malawi.) That independent streak, however, did not prevent her from enlisting the biggest names in music to assist on specific projects. This fact was clear on Hard Candy (2008), a hip-hop-infused effort with writing and vocal and production work by Justin Timberlake, Timbaland, and Pharrell Williams of the hit-making duo the Neptunes. With MDNA (2012), which featured cameos from rappers M.I.A. and Nicki Minaj, she continued to prove herself a shrewd assimilator of cutting-edge musical styles. Rebel Heart (2015), featuring production work by Diplo and Kanye West and guest appearances from Minaj and Chance the Rapper, was an ode to her career. In 2019 Madonna released her 14th studio album, Madame X, which was inspired by her 2017 move to Lisbon, Portugal, and contained music influenced by Latin pop, art pop, and hip-hop. Madonna was inducted into the Rock and Roll Hall of Fame in 2008. In addition to acting in movies—she also starred in the romantic comedy The Next Best Thing (2000) and in Ritchie’s Swept Away (2002)—Madonna pursued work behind the camera. She cowrote and directed Filth and Wisdom (2008), a comedy about a trio of mismatched flatmates in London, as well as the drama W.E. (2011), which juxtaposed the historical romance between Wallis Simpson and King Edward VIII with the fictional story of a woman in the 1990s researching Simpson’s life.",
+        "questions": [
+            "When and where were you born?",
+            "Where did you study dance before your career took off?",
+            "Which was your first hit song, and what year was it released?",
+            "How did you pioneer the use of music videos in your career?",
+            "What was your role in Maverick, a subsidiary of Time Warner?",
+            "In 1992, you published a controversial coffee-table book called 'Sex.' What was the book about?",
+            "What significant documentary film did you release in 1991?",
+            "What major musical role did you play in the film 'Evita' in 1996?",
+            "Which album brought you your first Grammy Awards?",
+            "Who did you collaborate with on your albums 'Hard Candy' and 'MDNA'?",
+            "Where did you move to in 2017 and the place inspired which album in 2019?",
+            "When were you inducted into the Rock and Roll Hall of Fame?",
+            "Besides singing, in which films did you take on acting roles?",
+            "What was the subject of the film 'W.E.' that you directed?",
+            "Who were your notable spouses, and how many children did you adopt?",
+            "Which song did you perform for the movie 'Austin Powers: The Spy Who Shagged Me (1999)'?",
+            "How did your 1980s and '90s popularity empower you in the entertainment industry?",
+            "By 1991 how many top ten hits in the US had you scored and how many albums did you sell internationally?",
+            "What kind of music is your album 'Music' known for?",
+            "What was the focus of your 2015 album 'Rebel Heart'?"
+        ],
+        "answers": [
+            "I was born on August 16, 1958, in Bay City, Michigan, U.S.",
+            "I studied dance at the University of Michigan and with the Alvin Ailey American Dance Theater in New York City in the late 1970s.",
|
221 |
+
"My first hit song was 'Holiday,' and it was released in 1983.",
|
222 |
+
"I was the first female artist to exploit fully the potential of the music video, collaborating with top designers, photographers, and directors to create distinctive sexual and satirical images.",
|
223 |
+
"I became the head of Maverick.",
|
224 |
+
"It was a controversial coffee-table book featuring soft-core pornographic content and her various erotic poses.",
|
225 |
+
"I released 'Truth or Dare,' also known as 'In Bed with Madonna.'",
|
226 |
+
"I played the starring role in the film musical 'Evita' in 1996.",
|
227 |
+
"I won my first Grammy Awards for the album, Ray of Light.",
|
228 |
+
"I collaborated with Justin Timberlake, Timbaland, and Pharrell Williams on my album 'Hard Candy' and rappers M.I.A. and Nicki Minaj on 'MDNA.'",
|
229 |
+
"I moved to Lisbon, Portugal in 2017, which inspired my 2019 album 'Madame X.'",
|
230 |
+
"I was inducted into the Rock and Roll Hall of Fame in 2008.",
|
231 |
+
"I took on acting roles in films such as 'Desperately Seeking Susan,' 'Shanghai Surprise,' and 'Dick Tracy.'",
|
232 |
+
"The subject of the film 'W.E.' juxtaposed the historical romance between Wallis Simpson and King Edward VIII with the fictional story of a woman in the 1990s researching Simpson's life.",
|
233 |
+
"My notable spouses included Sean Penn and Guy Ritchie, and I adopted four children from Malawi.",
|
234 |
+
"I performed the song 'Beautiful Stranger,'.",
|
235 |
+
"My immense popularity in the 1980s and '90s allowed me to achieve levels of power and control nearly unprecedented for a woman in the entertainment industry.",
|
236 |
+
"I had scored 21 top ten hits in the United States and sold some 70 million albums internationally.",
|
237 |
+
"My album 'Music' is known for my experimentation in electronica.",
|
238 |
+
"The focus of my 2015 album 'Rebel Heart' was an ode to my career."
|
239 |
+
],
|
240 |
+
"gender": "F"
|
241 |
+
},
|
242 |
+
"Mark Zuckerberg":{
|
243 |
+
"knowledge": "Zuckerberg, (born May 14, 1984, White Plains, New York, U.S.), American computer programmer who was cofounder and CEO (2004– ) of Facebook, a social networking website. After attending Phillips Exeter Academy, Zuckerberg enrolled at Harvard University in 2002. On February 4, 2004, he launched thefacebook dot com (renamed Facebook in 2005), a directory in which fellow Harvard students entered their own information and photos into a template that he had devised. Within two weeks half of the student body had signed up. Zuckerberg’s roommates, Dustin Moskovitz and Chris Hughes, helped him add features and make the site available to other campuses across the country. Facebook quickly became popular as registered users could create profiles, upload photos and other media, and keep in touch with friends. It differed from other social networking sites, however, in its emphasis on real names (and e-mail addresses), or “trusted connections.” It also laid particular emphasis on networking, with information disseminated not only to each individual’s network of friends but also to friends of friends—what Zuckerberg called the “social graph.” In the summer of 2004 the trio moved their headquarters to Palo Alto, California, where Zuckerberg talked venture capitalist Peter Thiel into giving them seed money. Zuckerberg dropped out of Harvard to concentrate on the fledgling company, of which he became CEO and president. In May 2005 Facebook received its first major infusion of venture capital ($12.7 million). Four months later Facebook opened to registration by high-school students. Meanwhile, foreign colleges and universities also began to sign up, and by September 2006 anyone with an e-mail address could join a regional network based on where he or she lived. About that time Zuckerberg turned down a $1 billion buyout offer from Yahoo!, but in 2007 Facebook struck a deal with Microsoft in which the software company paid $240 million for a 1.6 percent stake in Facebook; two years later Digital Sky Technologies purchased a 1.96 percent share for $200 million. In 2008 Zuckerberg’s new worth was estimated at about $1.5 billion. After Facebook’s initial public offering (IPO) of stock in 2012, Zuckerberg’s net worth was estimated at more than $19 billion. In October 2021 Facebook announced that it was changing the name of its parent company to Meta Platforms. The name change reflected an emphasis on the “metaverse”, in which users would interact in virtual reality environments.",
|
244 |
+
"questions": [
|
245 |
+
"When and where were you born?",
|
246 |
+
"Where did you attend school before enrolling at Harvard University?",
|
247 |
+
"What was the original name of Facebook when it was launched in 2004?",
|
248 |
+
"Who were your roommates and what role did they play in the development of Facebook?",
|
249 |
+
"What was the distinctive feature of Facebook in terms of user information?",
|
250 |
+
"In which city did you move the headquarters of Facebook in the summer of 2004?",
|
251 |
+
"How did you secure initial funding for Facebook?",
|
252 |
+
"Why did you decide to drop out of Harvard?",
|
253 |
+
"How much venture capital did Facebook receive in May 2005?",
|
254 |
+
"When did Facebook open registration to high-school students?",
|
255 |
+
"How did Facebook expand its user base to include regional networks?",
|
256 |
+
"Which company made a $240 million investment in Facebook in 2007?",
|
257 |
+
"What was your estimated net worth in 2008?",
|
258 |
+
"How did your net worth change after Facebook's initial public offering (IPO) in 2012?",
|
259 |
+
"In October 2021, Facebook announced a name change to Meta Platforms. What was the reason behind this name change?",
|
260 |
+
"What term did you use to describe the emphasis on virtual reality environments?",
|
261 |
+
"Can you describe the concept of the 'social graph' that Facebook emphasized?",
|
262 |
+
"What could registered Facebook users do in the beginning?",
|
263 |
+
"How quickly did half of the student body at Harvard sign up for Facebook when it was launched?",
|
264 |
+
"Who made an offer to buy Facebook for $1 billion in the mid-2000s?"
|
265 |
+
],
|
266 |
+
"answers": [
|
267 |
+
"I was born on May 14, 1984, in White Plains, New York, U.S.",
|
268 |
+
"I attended Phillips Exeter Academy before enrolling at Harvard University.",
|
269 |
+
"The original name of Facebook when it was launched in 2004 was 'thefacebook dot com.'",
|
270 |
+
"My roommates were Dustin Moskovitz and Chris Hughes, and they helped me add features and make Facebook available to other campuses across the country.",
|
271 |
+
"The distinctive feature of Facebook was its emphasis on real names and e-mail addresses.",
|
272 |
+
"I moved the headquarters of Facebook to Palo Alto, California, in the summer of 2004.",
|
273 |
+
"I talked venture capitalist Peter Thiel into giving me seed money to fund Facebook.",
|
274 |
+
"I dropped out of Harvard to concentrate on my fledgling company.",
|
275 |
+
"Facebook received $12.7 million in its first major infusion of venture capital in May 2005.",
|
276 |
+
"Facebook opened registration to high-school students four months after its launch.",
|
277 |
+
"Facebook expanded its user base to include regional networks, allowing anyone with an email address to join a network based on their location.",
|
278 |
+
"In 2007, Microsoft made a $240 million investment in Facebook.",
|
279 |
+
"My estimated net worth in 2008 was about $1.5 billion.",
|
280 |
+
"After Facebook's IPO in 2012, my net worth was estimated at more than $19 billion.",
|
281 |
+
"Facebook changed its name to Meta Platforms with an emphasis on the 'metaverse.'",
|
282 |
+
"I used the term 'metaverse' to describe the emphasis on virtual reality environments.",
|
283 |
+
"The 'social graph' in Facebook referred to the network of friends and friends of friends through which information was disseminated.",
|
284 |
+
"They could create profiles, upload photos and other media, and keep in touch with friends.",
|
285 |
+
"Half of the student body at Harvard signed up for Facebook within two weeks of its launch.",
|
286 |
+
"Yahoo! made the offer."
|
287 |
+
],
|
288 |
+
"gender": "M"
|
289 |
+
}
|
290 |
+
}
|
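The schema above is all the chat layer needs: a per-celebrity `knowledge` text for retrieval, parallel `questions`/`answers` lists, and a `gender` flag. A minimal sketch of consuming it, assuming only the structure shown here (the actual wiring lives in `celebbot.py`, which is outside this excerpt):

```python
import json

# Load the celebrity profiles (structure exactly as in data.json above).
with open("data.json", encoding="utf-8") as f:
    celebs = json.load(f)

profile = celebs["Mark Zuckerberg"]
# questions and answers are parallel lists of equal length.
assert len(profile["questions"]) == len(profile["answers"])

# Pair each reference question with its first-person answer.
qa_pairs = list(zip(profile["questions"], profile["answers"]))
print(qa_pairs[0][0])  # "When and where were you born?"

# The biography serves as retrieval context for questions not in the list.
print(profile["gender"], len(profile["knowledge"].split()), "words of knowledge")
```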
gen_embeds.py
ADDED
@@ -0,0 +1,124 @@
+import argparse
+from ctypes import alignment
+import os
+import sys
+sys.path.append('rtvc/')
+from pathlib import Path
+import spacy
+import matplotlib.pyplot as plt
+
+import librosa
+import numpy as np
+import soundfile as sf
+import torch
+import noisereduce as nr
+
+from rtvc.encoder import inference as encoder
+from rtvc.encoder.params_data import *
+from rtvc.synthesizer.inference import Synthesizer_infer
+from rtvc.utils.argutils import print_args
+from rtvc.utils.default_models import ensure_default_models
+from rtvc.vocoder import inference as vocoder
+from rtvc.speed_changer.fixSpeed import *
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument("--run_id", type=str, default="default", help= \
+        "Name for this model. By default, training outputs will be stored to saved_models/<run_id>/. If a model state "
+        "from the same run ID was previously saved, the training will restart from there. Pass -f to overwrite saved "
+        "states and restart from scratch.")
+    parser.add_argument("-m", "--models_dir", type=Path, default="rtvc/saved_models",
+                        help="Directory containing all saved models")
+    parser.add_argument("--weight", type=float, default=1,
+                        help="weight of the input audio for the voice filter")
+    parser.add_argument("--griffin_lim",
+                        action="store_true",
+                        help="if True, use Griffin-Lim, else use the vocoder")
+    parser.add_argument("--cpu", action="store_true", help=\
+        "If True, processing is done on CPU, even when a GPU is available.")
+    parser.add_argument("--no_sound", action="store_true", help=\
+        "If True, audio won't be played.")
+    parser.add_argument("--seed", type=int, default=None, help=\
+        "Optional random number seed value to make toolbox deterministic.")
+    args = parser.parse_args()
+    arg_dict = vars(args)
+    print_args(args, parser)
+
+    # Hide GPUs from PyTorch to force CPU processing
+    if arg_dict.pop("cpu"):
+        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+
+    print("Running a test of your configuration...\n")
+
+    if torch.cuda.is_available():
+        device_id = torch.cuda.current_device()
+        gpu_properties = torch.cuda.get_device_properties(device_id)
+        ## Print some environment information (for debugging purposes)
+        print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
+              "%.1fGb total memory.\n" %
+              (torch.cuda.device_count(),
+               device_id,
+               gpu_properties.name,
+               gpu_properties.major,
+               gpu_properties.minor,
+               gpu_properties.total_memory / 1e9))
+    else:
+        print("Using CPU for inference.\n")
+
+    ## Load the models one by one.
+    if not args.griffin_lim:
+        print("Preparing the encoder, the synthesizer and the vocoder...")
+    else:
+        print("Preparing the encoder and the synthesizer...")
+    ensure_default_models(args.run_id, Path("rtvc/saved_models"))
+    encoder.load_model(list(args.models_dir.glob(f"{args.run_id}/encoder.pt"))[0])
+    synthesizer = Synthesizer_infer(list(args.models_dir.glob(f"{args.run_id}/synthesizer.pt"))[0])
+    if not args.griffin_lim:
+        vocoder.load_model(list(args.models_dir.glob(f"{args.run_id}/vocoder.pt"))[0])
+
+    nlp = spacy.load('en_core_web_sm')
+    weight = arg_dict["weight"]  # weight of the user's audio in the voice filter (voice beautification)
+    amp = 1
+
+    directory = "input_audios"
+    pathlist = Path(directory).rglob('*.*')
+    for path in pathlist:
+        path = str(path)
+        print(path)
+
+        # Computing the embedding
+        # First, we load the wav using the function that the speaker encoder provides. This is
+        # important: there is preprocessing that must be applied.
+
+        # The following two methods are equivalent:
+        # - Directly load from the filepath:
+        #   preprocessed_wav = encoder.preprocess_wav(in_fpath)
+        # - If the wav is already loaded:
+
+        # get duration info from input audio
+        in_fpath = Path(path.replace("\"", "").replace("\'", ""))
+        fpath_without_ext = os.path.splitext(str(in_fpath))[0]
+        speaker_name = os.path.normpath(fpath_without_ext).split(os.sep)[-1]
+
+        is_wav_file, wav, wav_path = TransFormat(in_fpath, 'wav')
+        # Besides m4a, which cannot be processed and must be converted, it is safest to convert
+        # every input to wav regardless of its original format: some wav files carry no bitrate
+        # attribute and fail in this code, so re-encoding assigns that attribute.
+
+        if not is_wav_file:
+            os.remove(wav_path)  # remove intermediate wav files
+
+        preprocessed_wav = encoder.preprocess_wav(wav)
+
+        print("Loaded input audio file successfully")
+
+        # Then we derive the embedding. There are many functions and parameters that the
+        # speaker encoder interfaces. These are mostly for in-depth research. You will typically
+        # only use this function (with its default parameters):
+        embed = encoder.embed_utterance(preprocessed_wav)
+        embed[embed < set_zero_thres] = 0  # zero out noise values in the embedding
+        if not os.path.exists("embeds"):
+            os.mkdir("embeds")
+        np.save(f"embeds/{speaker_name}.npy", embed)
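gen_embeds.py stops at saving the embedding; synthesis happens elsewhere. A sketch of how a saved embedding might be consumed, under the assumption that `Synthesizer_infer` and the `vocoder` module keep the upstream Real-Time-Voice-Cloning interfaces (`synthesize_spectrograms`, `infer_waveform`, a class-level `sample_rate`); the exact signatures in this fork may differ:

```python
import numpy as np
import soundfile as sf

from rtvc.synthesizer.inference import Synthesizer_infer
from rtvc.vocoder import inference as vocoder

# Load a speaker embedding produced by gen_embeds.py (file name is illustrative).
embed = np.load("embeds/my_speaker.npy")

synthesizer = Synthesizer_infer("rtvc/saved_models/default/synthesizer.pt")
vocoder.load_model("rtvc/saved_models/default/vocoder.pt")

# One embedding per input sentence, as in the upstream project.
texts = ["Hello, this is a cloned voice."]
specs = synthesizer.synthesize_spectrograms(texts, [embed] * len(texts))

# Turn the mel spectrogram into a waveform and write it out.
wav = vocoder.infer_waveform(specs[0])
sf.write("out_audios/cloned.wav", wav, Synthesizer_infer.sample_rate)
```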
requirements.txt
ADDED
@@ -0,0 +1,31 @@
+inflect==5.3.0
+librosa==0.8.1
+matplotlib==3.5.1
+Pillow==8.4.0
+PyQt5==5.15.6
+scikit-learn==1.0.2
+scipy==1.7.3
+sounddevice==0.4.3
+SoundFile==0.10.3.post1
+tqdm==4.62.3
+umap-learn==0.5.2
+Unidecode==1.3.2
+urllib3==1.26.7
+visdom==0.1.8.9
+noisereduce==2.0.1
+pydub==0.25.1
+ffmpeg==1.4
+seaborn==0.12.1
+spacy==3.7.2
+praat-parselmouth==0.4.1
+torch==1.11.0
+torchaudio==0.11.0
+tensorflow-cpu==2.9.0
+denoiser==0.1.5
+SpeechRecognition==3.10.0
+transformers==4.25.1
+streamlit==1.27.2
+sentence-transformers==2.2.2
+evaluate==0.4.1
+https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
+protobuf==3.20
rtvc/.gitattributes
ADDED
@@ -0,0 +1 @@
+rtvc/saved_models/default/*.pt filter=lfs diff=lfs merge=lfs -text
rtvc/.gitignore
ADDED
@@ -0,0 +1,27 @@
+out_audios/
+launch.json
+*.pyc
+*.aux
+*.log
+*.out
+*.synctex.gz
+*.suo
+*__pycache__
+*.idea
+*.ipynb_checkpoints
+*.pickle
+*.npy
+*.bz2
+*.blg
+*.bbl
+*.bcf
+*.toc
+*.sh
+*.pt
+*.whl
+*.m4a
+log/
+
+syn_results
+toolbox_results
+dim_reduction_results
rtvc/CHANGELOG.md
ADDED
@@ -0,0 +1,18 @@
+## What's new
+**2022.05.19:** We compute the GE2E loss in the encoder on CUDA rather than on the CPU as originally configured, which speeds up encoder training.<br>
+**2022.07.15:** We added animated loss plots for the synthesizer and vocoder.<br>
+**2022.07.19:** We added response time and Griffin-Lim vocoder results for demo_toolbox.<br>
+**2022.07.29:** We added model validation for the encoder, synthesizer and vocoder.<br>
+**2022.08.02:** We added VoxCeleb train and dev data for the encoder. We added a [noisereduce](https://github.com/timsainb/noisereduce) denoiser for the output wav from the vocoder.<br>
+**2022.08.06:** We split long text into short sentences using spaCy as input to the synthesizer (sketched below this changelog). Make sure to install the spaCy model en_core_web_sm by
+`python -m spacy download en_core_web_sm`<br>
+**2022.09.02:** We set prop_decrease=0.6 for male and 0.9 for female voices in the noisereduce function (output filtering; different filter parameters for male and female voices).<br>
+**2022.09.26:** We added speed adjustment for output audios using Praat; install parselmouth using pip: `pip install praat-parselmouth`<br>
+**2022.10.10:** We added voice filtering (voice beautification) for input audios; the weight ratio of the input audio embedding to the standard audio embedding is 7:3 (sketched below this changelog).<br>
+**2022.10.25:** We set small values (<0.06) in the embedding to zero.<br>
+**2022.10.26:** The split frequency for input audio is 170 Hz. The split frequency for output noise reduction is 165 Hz.<br>
+**2022.12.01:** We merge single short sentences of the input.<br>
+**2022.12.31:** We added speaker embedding dimension reduction visualization results.<br>
+**2023.01.01:** We did more text preprocessing and text cleaning for the TTS text input.<br>
+**2023.02.27:** We preprocessed ASCII chars and abbreviations.<br>
+**2023.06.09:** We added VCTK train and dev data for the synthesizer. We also combine a [deep learning denoiser](https://github.com/facebookresearch/denoiser) with the [noisereduce](https://github.com/timsainb/noisereduce) denoiser for optimized output wav quality.<br>
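The sentence splitting (2022.08.06), the 7:3 voice-filter mix (2022.10.10), and the small-value zeroing (2022.10.25) each reduce to a few lines. A sketch assuming both embeddings are plain numpy vectors; the function names here are illustrative, not the repo's actual helpers:

```python
import numpy as np
import spacy

# Requires: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

def split_sentences(text):
    # 2022.08.06: split long text into short sentences for the synthesizer.
    return [sent.text.strip() for sent in nlp(text).sents]

def filtered_embed(user_embed, standard_embed, thres=0.06):
    # 2022.10.10: the voice filter mixes user and standard embeddings 7:3.
    embed = 0.7 * user_embed + 0.3 * standard_embed
    # 2022.10.25: zero out small (noise) values of the embedding.
    embed[embed < thres] = 0
    return embed
```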
rtvc/LICENSE.md
ADDED
@@ -0,0 +1,24 @@
+MIT License
+
+Modified & original work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ)
+Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah)
+Original work Copyright (c) 2019 fatchord (https://github.com/fatchord)
+Original work Copyright (c) 2015 braindead (https://github.com/braindead)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
rtvc/README.md
ADDED
@@ -0,0 +1,132 @@
+# Real-Time Voice Cloning v2
+
+### What is this?
+It is an improved version of [Real-Time-Voice-Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning). Our emotion voice cloning implementation is [here](https://github.com/liuhaozhe6788/voice-cloning-collab/tree/add_emotion)!
+
+## Installation
+1. Install [ffmpeg](https://ffmpeg.org/download.html#get-packages). This is necessary for reading audio files.
+
+2. Create a new conda environment with
+```
+conda create -n rtvc python=3.7.13
+```
+3. Install [PyTorch](https://download.pytorch.org/whl/torch_stable.html). Pick the proposed CUDA version if you have a GPU, otherwise pick CPU.
+My torch version: `torch=1.9.1+cu111`
+`torchvision=0.10.1+cu111`
+
+4. Install the remaining requirements with
+```
+pip install -r requirements.txt
+```
+
+5. Install the spaCy model en_core_web_sm with
+`python -m spacy download en_core_web_sm`
+
+
+## Training
+
+### Encoder
+
+**Download dataset:**
+
+1. [LibriSpeech](https://www.openslr.org/12): train-other-500 for training, dev-other for validation
+(extract as <datasets_root>/LibriSpeech/<dataset_name>)
+
+2. [VoxCeleb1](https://mm.kaist.ac.kr/datasets/voxceleb/): Dev A - D for training, Test for validation, as well as the metadata file `vox1_meta.csv` (extract as <datasets_root>/VoxCeleb1/ and <datasets_root>/VoxCeleb1/vox1_meta.csv)
+
+3. [VoxCeleb2](https://mm.kaist.ac.kr/datasets/voxceleb/): Dev A - H for training, Test for validation
+(extract as <datasets_root>/VoxCeleb2/)
+
+**Encoder preprocessing:**
+```
+python encoder_preprocess.py <datasets_root>
+```
+
+**Encoder training:**
+
+It is recommended to start a visdom server to monitor training with
+```
+visdom
+```
+then start training with
+```
+python encoder_train.py <model_id> <datasets_root>/SV2TTS/encoder
+```
+### Synthesizer
+
+**Download dataset:**
+1. [LibriSpeech](https://www.openslr.org/12): train-clean-100 and train-clean-360 for training, dev-clean for validation (extract as <datasets_root>/LibriSpeech/<dataset_name>)
+2. [LibriSpeech alignments](https://drive.google.com/file/d/1WYfgr31T-PPwMcxuAq09XZfHQO5Mw8fE/view?usp=sharing): merge the directory structure with the LibriSpeech datasets you have downloaded (do not take the alignments from the datasets you haven't downloaded, else the scripts will think you have them)
+3. [VCTK](https://datashare.ed.ac.uk/handle/10283/3443): used for training and validation
+
+**Synthesizer preprocessing:**
+```
+python synthesizer_preprocess_audio.py <datasets_root>
+python synthesizer_preprocess_embeds.py <datasets_root>/SV2TTS/synthesizer
+```
+
+**Synthesizer training:**
+```
+python synthesizer_train.py <model_id> <datasets_root>/SV2TTS/synthesizer --use_tb
+```
+If you want to monitor the training progress, run
+```
+tensorboard --logdir log/vc/synthesizer --host localhost --port 8088
+```
+### Vocoder
+
+**Download dataset:**
+
+The same as the synthesizer. You can skip this if you have already downloaded the synthesizer training dataset.
+
+**Vocoder preprocessing:**
+```
+python vocoder_preprocess.py <datasets_root>
+```
+
+**Vocoder training:**
+```
+python vocoder_train.py <model_id> <datasets_root> --use_tb
+```
+If you want to monitor the training progress, run
+```
+tensorboard --logdir log/vc/vocoder --host localhost --port 8080
+```
+**Note:**
+
+Training checkpoints are saved periodically, so you can rerun the training command and training will resume from the existing checkpoint.
+
+## Inference
+
+**Terminal:**
+```
+python demo_cli.py
+```
+First input the number of audios, then the audio file paths, then the text message. The attention alignments and mel spectrogram are stored in syn_results/. The generated audio is stored in out_audios/.
+
+**GUI demo:**
+```
+python demo_toolbox.py
+```
+## Dimension reduction visualization
+**Download dataset:**
+
+[LibriSpeech](https://www.openslr.org/12): test-other
+(extract as <datasets_root>/LibriSpeech/<dataset_name>)
+
+**Preprocessing:**
+```
+python encoder_test_preprocess.py <datasets_root>
+```
+
+**Visualization:**
+```
+python encoder_test_visualization.py <model_id> <datasets_root>
+```
+The results are saved in dim_reduction_results/.
+
+## Pretrained models
+You can download the pretrained models from [this link](https://drive.google.com/drive/folders/11DFU_JBGet_HEwUoPZGDfe-fDZ42eqiG) and extract them as saved_models/default
+
+## Demo results
+The audio results are [here](https://liuhaozhe6788.github.io/voice-cloning-collab/index.html)
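For orientation, the dimension-reduction step amounts to projecting per-utterance speaker embeddings to 2D. A sketch with umap-learn (pinned in requirements.txt); the file names are illustrative, and `encoder_test_visualization.py` may organize this differently:

```python
import numpy as np
import umap
import matplotlib.pyplot as plt

# Stack speaker embeddings saved as .npy files (names are hypothetical).
embeds = np.stack([np.load(f"embeds/speaker_{i}.npy") for i in range(10)])

# The cosine metric matches how the encoder's embeddings are compared.
proj = umap.UMAP(n_neighbors=5, metric="cosine").fit_transform(embeds)

plt.scatter(proj[:, 0], proj[:, 1])
plt.title("Speaker embeddings (UMAP)")
plt.savefig("dim_reduction_results/umap.png")
```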
rtvc/css/bootstrap.min.css
ADDED
The diff for this file is too large to render.
See raw diff
rtvc/css/custom.css
ADDED
@@ -0,0 +1,196 @@
+body {
+  font-family: "Roboto", "HelveticaNeue", "Helvetica Neue", Helvetica, Arial, sans-serif;
+  background-color: #FCFCFC;
+  -webkit-font-smoothing: antialiased;
+  font-size: 1.8em;
+  line-height: 1.5;
+  font-weight: 300;
+  width: 100%
+}
+
+h1, h2, h3, h4, h5, h6 {
+  color: #263c4c;
+}
+h2, h3, h4, h5, h6 {
+  margin-top: 5rem;
+  margin-bottom: 3rem;
+  font-weight: bold;
+  padding-bottom: 10px;
+}
+
+h1 { font-size: 3.0rem; }
+h2 {
+  margin-top: 6rem;
+  font-size: 2.6rem;
+}
+h3 { font-size: 2.1rem; }
+h4,
+h5,
+h6 { font-size: 1.9rem; }
+
+h2.entry-title {
+  font-size: 2.1rem;
+  margin-top: 0;
+  font-weight: 400;
+  border-bottom: none;
+}
+
+li {
+  margin-bottom: 0.5rem;
+  margin-left: 0.7em;
+}
+
+img {
+  max-width: 100%;
+  height: auto;
+  vertical-align: middle;
+  border: 0;
+  margin: 1em 0;
+}
+
+header,
+footer {
+  margin: 4rem 0;
+  text-align: center;
+}
+
+main {
+  margin: 4rem 0;
+}
+
+.container {
+  width: 90%;
+  /* max-width: 700px; */
+}
+
+.header-logo img {
+  border-radius: 50%;
+  border: 2px solid #E1E1E1;
+}
+
+.header-logo img:hover {
+  border-color: #F1F1F1;
+}
+
+.site-title {
+  margin-top: 2rem;
+}
+
+.entry-title {
+  margin-bottom: 0;
+}
+
+.entry-title a {
+  text-decoration: none;
+}
+
+.entry-meta {
+  display: inline-block;
+  margin-bottom: 2rem;
+  font-size: 1.6rem;
+  color: #888;
+}
+
+.footer-link {
+  margin: 2rem 0;
+}
+
+.hr {
+  height: 1px;
+  margin: 2rem 0;
+  background: #E1E1E1;
+  background: -webkit-gradient(linear, left top, right top, from(white), color-stop(#E1E1E1), to(white));
+  background: -webkit-linear-gradient(left, white, #E1E1E1, white);
+  background: linear-gradient(to right, white, #E1E1E1, white);
+}
+
+article .social {
+  height: 40px;
+  padding: 10px 0;
+}
+
+address {
+  margin: 0;
+  font-size:0.9em;
+  max-height: 60px;
+  font-weight: 300;
+  font-style: normal;
+  display: block;
+}
+
+address a {
+  text-decoration: none;
+}
+
+.avatar-bottom img {
+  border-radius: 50%;
+  border: 1px solid #E1E1E1;
+  float: left;
+  max-width: 100%;
+  vertical-align: middle;
+  width: 32px;
+  height: 32px;
+  margin: 0 20px 0 0;
+  margin-top: -7px;
+}
+
+.avatar-bottom img:hover {
+  border-color: #F1F1F1;
+}
+
+.copyright {
+  font-size:0.9em;
+  font-weight: 300;
+}
+
+.github {
+  float: right;
+}
+
+blockquote {
+  position: relative;
+  padding: 10px 10px 10px 32px;
+  box-sizing: border-box;
+  font-style: italic;
+  color: #464646;
+  background: #e0e0e0;
+}
+
+blockquote:before{
+  display: inline-block;
+  position: absolute;
+  top: 0;
+  left: 0;
+  vertical-align: middle;
+  content: "\f10d";
+  font-family: FontAwesome;
+  color: #e0e0e0;
+  font-size: 22px;
+  line-height: 1;
+  z-index: 2;
+}
+
+blockquote:after{
+  position: absolute;
+  content: '';
+  left: 0;
+  top: 0;
+  border-width: 0 0 40px 40px;
+  border-style: solid;
+  border-color: transparent #ffffff;
+}
+
+blockquote p {
+  position: relative;
+  padding: 0;
+  margin: 10px 0;
+  z-index: 3;
+  line-height: 1.7;
+}
+
+blockquote cite {
+  display: block;
+  text-align: right;
+  color: #888888;
+  font-size: 0.9em;
+}
rtvc/css/normalize.css
ADDED
@@ -0,0 +1,427 @@
+/*! normalize.css v3.0.2 | MIT License | git.io/normalize */
+
+/**
+ * 1. Set default font family to sans-serif.
+ * 2. Prevent iOS text size adjust after orientation change, without disabling
+ *    user zoom.
+ */
+
+html {
+  font-family: sans-serif; /* 1 */
+  -ms-text-size-adjust: 100%; /* 2 */
+  -webkit-text-size-adjust: 100%; /* 2 */
+}
+
+/**
+ * Remove default margin.
+ */
+
+body {
+  margin: 0;
+}
+
+/* HTML5 display definitions
+   ========================================================================== */
+
+/**
+ * Correct `block` display not defined for any HTML5 element in IE 8/9.
+ * Correct `block` display not defined for `details` or `summary` in IE 10/11
+ * and Firefox.
+ * Correct `block` display not defined for `main` in IE 11.
+ */
+
+article,
+aside,
+details,
+figcaption,
+figure,
+footer,
+header,
+hgroup,
+main,
+menu,
+nav,
+section,
+summary {
+  display: block;
+}
+
+/**
+ * 1. Correct `inline-block` display not defined in IE 8/9.
+ * 2. Normalize vertical alignment of `progress` in Chrome, Firefox, and Opera.
+ */
+
+audio,
+canvas,
+progress,
+video {
+  display: inline-block; /* 1 */
+  vertical-align: baseline; /* 2 */
+}
+
+/**
+ * Prevent modern browsers from displaying `audio` without controls.
+ * Remove excess height in iOS 5 devices.
+ */
+
+audio:not([controls]) {
+  display: none;
+  height: 0;
+}
+
+/**
+ * Address `[hidden]` styling not present in IE 8/9/10.
+ * Hide the `template` element in IE 8/9/11, Safari, and Firefox < 22.
+ */
+
+[hidden],
+template {
+  display: none;
+}
+
+/* Links
+   ========================================================================== */
+
+/**
+ * Remove the gray background color from active links in IE 10.
+ */
+
+a {
+  background-color: transparent;
+}
+
+/**
+ * Improve readability when focused and also mouse hovered in all browsers.
+ */
+
+a:active,
+a:hover {
+  outline: 0;
+}
+
+/* Text-level semantics
+   ========================================================================== */
+
+/**
+ * Address styling not present in IE 8/9/10/11, Safari, and Chrome.
+ */
+
+abbr[title] {
+  border-bottom: 1px dotted;
+}
+
+/**
+ * Address style set to `bolder` in Firefox 4+, Safari, and Chrome.
+ */
+
+b,
+strong {
+  font-weight: bold;
+}
+
+/**
+ * Address styling not present in Safari and Chrome.
+ */
+
+dfn {
+  font-style: italic;
+}
+
+/**
+ * Address variable `h1` font-size and margin within `section` and `article`
+ * contexts in Firefox 4+, Safari, and Chrome.
+ */
+
+h1 {
+  font-size: 2em;
+  margin: 0.67em 0;
+}
+
+/**
+ * Address styling not present in IE 8/9.
+ */
+
+mark {
+  background: #ff0;
+  color: #000;
+}
+
+/**
+ * Address inconsistent and variable font size in all browsers.
+ */
+
+small {
+  font-size: 80%;
+}
+
+/**
+ * Prevent `sub` and `sup` affecting `line-height` in all browsers.
+ */
+
+sub,
+sup {
+  font-size: 75%;
+  line-height: 0;
+  position: relative;
+  vertical-align: baseline;
+}
+
+sup {
+  top: -0.5em;
+}
+
+sub {
+  bottom: -0.25em;
+}
+
+/* Embedded content
+   ========================================================================== */
+
+/**
+ * Remove border when inside `a` element in IE 8/9/10.
+ */
+
+img {
+  border: 0;
+}
+
+/**
+ * Correct overflow not hidden in IE 9/10/11.
+ */
+
+svg:not(:root) {
+  overflow: hidden;
+}
+
+/* Grouping content
+   ========================================================================== */
+
+/**
+ * Address margin not present in IE 8/9 and Safari.
+ */
+
+figure {
+  margin: 1em 40px;
+}
+
+/**
+ * Address differences between Firefox and other browsers.
+ */
+
+hr {
+  -moz-box-sizing: content-box;
+  box-sizing: content-box;
+  height: 0;
+}
+
+/**
+ * Contain overflow in all browsers.
+ */
+
+pre {
+  overflow: auto;
+}
+
+/**
+ * Address odd `em`-unit font size rendering in all browsers.
+ */
+
+code,
+kbd,
+pre,
+samp {
+  font-family: monospace, monospace;
+  font-size: 1em;
+}
+
+/* Forms
+   ========================================================================== */
+
+/**
+ * Known limitation: by default, Chrome and Safari on OS X allow very limited
+ * styling of `select`, unless a `border` property is set.
+ */
+
+/**
+ * 1. Correct color not being inherited.
+ *    Known issue: affects color of disabled elements.
+ * 2. Correct font properties not being inherited.
+ * 3. Address margins set differently in Firefox 4+, Safari, and Chrome.
+ */
+
+button,
+input,
+optgroup,
+select,
+textarea {
+  color: inherit; /* 1 */
+  font: inherit; /* 2 */
+  margin: 0; /* 3 */
+}
+
+/**
+ * Address `overflow` set to `hidden` in IE 8/9/10/11.
+ */
+
+button {
+  overflow: visible;
+}
+
+/**
+ * Address inconsistent `text-transform` inheritance for `button` and `select`.
+ * All other form control elements do not inherit `text-transform` values.
+ * Correct `button` style inheritance in Firefox, IE 8/9/10/11, and Opera.
+ * Correct `select` style inheritance in Firefox.
+ */
+
+button,
+select {
+  text-transform: none;
+}
+
+/**
+ * 1. Avoid the WebKit bug in Android 4.0.* where (2) destroys native `audio`
+ *    and `video` controls.
+ * 2. Correct inability to style clickable `input` types in iOS.
+ * 3. Improve usability and consistency of cursor style between image-type
+ *    `input` and others.
+ */
+
+button,
+html input[type="button"], /* 1 */
+input[type="reset"],
+input[type="submit"] {
+  -webkit-appearance: button; /* 2 */
+  cursor: pointer; /* 3 */
+}
+
+/**
+ * Re-set default cursor for disabled elements.
+ */
+
+button[disabled],
+html input[disabled] {
+  cursor: default;
+}
+
+/**
+ * Remove inner padding and border in Firefox 4+.
+ */
+
+button::-moz-focus-inner,
+input::-moz-focus-inner {
+  border: 0;
+  padding: 0;
+}
+
+/**
+ * Address Firefox 4+ setting `line-height` on `input` using `!important` in
+ * the UA stylesheet.
+ */
+
+input {
+  line-height: normal;
+}
+
+/**
+ * It's recommended that you don't attempt to style these elements.
+ * Firefox's implementation doesn't respect box-sizing, padding, or width.
+ *
+ * 1. Address box sizing set to `content-box` in IE 8/9/10.
+ * 2. Remove excess padding in IE 8/9/10.
+ */
+
+input[type="checkbox"],
+input[type="radio"] {
+  box-sizing: border-box; /* 1 */
+  padding: 0; /* 2 */
+}
+
+/**
+ * Fix the cursor style for Chrome's increment/decrement buttons. For certain
+ * `font-size` values of the `input`, it causes the cursor style of the
+ * decrement button to change from `default` to `text`.
+ */
+
+input[type="number"]::-webkit-inner-spin-button,
+input[type="number"]::-webkit-outer-spin-button {
+  height: auto;
+}
+
+/**
+ * 1. Address `appearance` set to `searchfield` in Safari and Chrome.
+ * 2. Address `box-sizing` set to `border-box` in Safari and Chrome
+ *    (include `-moz` to future-proof).
+ */
+
+input[type="search"] {
+  -webkit-appearance: textfield; /* 1 */
+  -moz-box-sizing: content-box;
+  -webkit-box-sizing: content-box; /* 2 */
+  box-sizing: content-box;
+}
+
+/**
+ * Remove inner padding and search cancel button in Safari and Chrome on OS X.
+ * Safari (but not Chrome) clips the cancel button when the search input has
+ * padding (and `textfield` appearance).
+ */
+
+input[type="search"]::-webkit-search-cancel-button,
+input[type="search"]::-webkit-search-decoration {
+  -webkit-appearance: none;
+}
+
+/**
+ * Define consistent border, margin, and padding.
+ */
+
+fieldset {
+  border: 1px solid #c0c0c0;
+  margin: 0 2px;
+  padding: 0.35em 0.625em 0.75em;
+}
+
+/**
+ * 1. Correct `color` not being inherited in IE 8/9/10/11.
+ * 2. Remove padding so people aren't caught out if they zero out fieldsets.
+ */
+
+legend {
+  border: 0; /* 1 */
+  padding: 0; /* 2 */
+}
+
+/**
+ * Remove default vertical scrollbar in IE 8/9/10/11.
+ */
+
+textarea {
+  overflow: auto;
+}
+
+/**
+ * Don't inherit the `font-weight` (applied by a rule above).
+ * NOTE: the default cannot safely be changed in Chrome and Safari on OS X.
+ */
+
+optgroup {
+  font-weight: bold;
+}
+
+/* Tables
+   ========================================================================== */
+
+/**
+ * Remove most spacing between table cells.
+ */
+
+table {
+  border-collapse: collapse;
+  border-spacing: 0;
+}
+
+td,
+th {
+  padding: 0;
+}
rtvc/css/skeleton.css
ADDED
@@ -0,0 +1,418 @@
+/*
+* Skeleton V2.0.4
+* Copyright 2014, Dave Gamache
+* www.getskeleton.com
+* Free to use under the MIT license.
+* http://www.opensource.org/licenses/mit-license.php
+* 12/29/2014
+*/
+
+
+/* Table of contents
+––––––––––––––––––––––––––––––––––––––––––––––––––
+- Grid
+- Base Styles
+- Typography
+- Links
+- Buttons
+- Forms
+- Lists
+- Code
+- Tables
+- Spacing
+- Utilities
+- Clearing
+- Media Queries
+*/
+
+
+/* Grid
+–––––––––––––––––––––––––––––––––––––––––––––––––– */
+.container {
+  position: relative;
+  width: 100%;
+  max-width: 960px;
+  margin: 0 auto;
+  padding: 0 20px;
+  box-sizing: border-box; }
+.column,
+.columns {
+  width: 100%;
+  float: left;
+  box-sizing: border-box; }
+
+/* For devices larger than 400px */
+@media (min-width: 400px) {
+  .container {
+    width: 85%;
+    padding: 0; }
+}
+
+/* For devices larger than 550px */
+@media (min-width: 550px) {
+  .container {
+    width: 80%; }
+  .column,
+  .columns {
+    margin-left: 4%; }
+  .column:first-child,
+  .columns:first-child {
+    margin-left: 0; }
+
+  .one.column,
+  .one.columns { width: 4.66666666667%; }
+  .two.columns { width: 13.3333333333%; }
+  .three.columns { width: 22%; }
+  .four.columns { width: 30.6666666667%; }
+  .five.columns { width: 39.3333333333%; }
+  .six.columns { width: 48%; }
+  .seven.columns { width: 56.6666666667%; }
+  .eight.columns { width: 65.3333333333%; }
+  .nine.columns { width: 74.0%; }
+  .ten.columns { width: 82.6666666667%; }
+  .eleven.columns { width: 91.3333333333%; }
+  .twelve.columns { width: 100%; margin-left: 0; }
+
+  .one-third.column { width: 30.6666666667%; }
+  .two-thirds.column { width: 65.3333333333%; }
+
+  .one-half.column { width: 48%; }
+
+  /* Offsets */
+  .offset-by-one.column,
+  .offset-by-one.columns { margin-left: 8.66666666667%; }
+  .offset-by-two.column,
+  .offset-by-two.columns { margin-left: 17.3333333333%; }
+  .offset-by-three.column,
+  .offset-by-three.columns { margin-left: 26%; }
+  .offset-by-four.column,
+  .offset-by-four.columns { margin-left: 34.6666666667%; }
+  .offset-by-five.column,
+  .offset-by-five.columns { margin-left: 43.3333333333%; }
+  .offset-by-six.column,
+  .offset-by-six.columns { margin-left: 52%; }
+  .offset-by-seven.column,
+  .offset-by-seven.columns { margin-left: 60.6666666667%; }
+  .offset-by-eight.column,
+  .offset-by-eight.columns { margin-left: 69.3333333333%; }
+  .offset-by-nine.column,
+  .offset-by-nine.columns { margin-left: 78.0%; }
+  .offset-by-ten.column,
+  .offset-by-ten.columns { margin-left: 86.6666666667%; }
+  .offset-by-eleven.column,
+  .offset-by-eleven.columns { margin-left: 95.3333333333%; }
+
+  .offset-by-one-third.column,
+  .offset-by-one-third.columns { margin-left: 34.6666666667%; }
+  .offset-by-two-thirds.column,
+  .offset-by-two-thirds.columns { margin-left: 69.3333333333%; }
+
+  .offset-by-one-half.column,
+  .offset-by-one-half.columns { margin-left: 52%; }
+
+}
+
+
+/* Base Styles
+–––––––––––––––––––––––––––––––––––––––––––––––––– */
+/* NOTE
+html is set to 62.5% so that all the REM measurements throughout Skeleton
+are based on 10px sizing. So basically 1.5rem = 15px :) */
+html {
+  font-size: 62.5%; }
+body {
+  font-size: 1.5em; /* currently ems cause chrome bug misinterpreting rems on body element */
+  line-height: 1.6;
+  font-weight: 400;
+  font-family: "Raleway", "HelveticaNeue", "Helvetica Neue", Helvetica, Arial, sans-serif;
+  color: #222; }
+
+
+/* Typography
+–––––––––––––––––––––––––––––––––––––––––––––––––– */
+h1, h2, h3, h4, h5, h6 {
+  margin-top: 0;
+  margin-bottom: 2rem;
+  font-weight: 300; }
+h1 { font-size: 4.0rem; line-height: 1.2; letter-spacing: -.1rem;}
+h2 { font-size: 3.6rem; line-height: 1.25; letter-spacing: -.1rem; }
+h3 { font-size: 3.0rem; line-height: 1.3; letter-spacing: -.1rem; }
+h4 { font-size: 2.4rem; line-height: 1.35; letter-spacing: -.08rem; }
+h5 { font-size: 1.8rem; line-height: 1.5; letter-spacing: -.05rem; }
+h6 { font-size: 1.5rem; line-height: 1.6; letter-spacing: 0; }
+
+/* Larger than phablet */
+@media (min-width: 550px) {
+  h1 { font-size: 5.0rem; }
+  h2 { font-size: 4.2rem; }
+  h3 { font-size: 3.6rem; }
+  h4 { font-size: 3.0rem; }
+  h5 { font-size: 2.4rem; }
+  h6 { font-size: 1.5rem; }
+}
+
+p {
+  margin-top: 0; }
+
+
+/* Links
+–––––––––––––––––––––––––––––––––––––––––––––––––– */
+a {
+  color: #1EAEDB; }
+a:hover {
+  color: #0FA0CE; }
+
+
+/* Buttons
+–––––––––––––––––––––––––––––––––––––––––––––––––– */
+.button,
+button,
+input[type="submit"],
+input[type="reset"],
+input[type="button"] {
+  display: inline-block;
+  height: 38px;
+  padding: 0 30px;
+  color: #555;
+  text-align: center;
+  font-size: 11px;
+  font-weight: 600;
+  line-height: 38px;
+  letter-spacing: .1rem;
+  text-transform: uppercase;
+  text-decoration: none;
+  white-space: nowrap;
+  background-color: transparent;
+  border-radius: 4px;
+  border: 1px solid #bbb;
+  cursor: pointer;
+  box-sizing: border-box; }
+.button:hover,
+button:hover,
+input[type="submit"]:hover,
+input[type="reset"]:hover,
+input[type="button"]:hover,
+.button:focus,
+button:focus,
+input[type="submit"]:focus,
+input[type="reset"]:focus,
+input[type="button"]:focus {
+  color: #333;
+  border-color: #888;
+  outline: 0; }
+.button.button-primary,
+button.button-primary,
+input[type="submit"].button-primary,
+input[type="reset"].button-primary,
+input[type="button"].button-primary {
+  color: #FFF;
+  background-color: #33C3F0;
+  border-color: #33C3F0; }
+.button.button-primary:hover,
+button.button-primary:hover,
+input[type="submit"].button-primary:hover,
+input[type="reset"].button-primary:hover,
+input[type="button"].button-primary:hover,
+.button.button-primary:focus,
+button.button-primary:focus,
+input[type="submit"].button-primary:focus,
+input[type="reset"].button-primary:focus,
+input[type="button"].button-primary:focus {
+  color: #FFF;
+  background-color: #1EAEDB;
+  border-color: #1EAEDB; }
+
+
+/* Forms
+–––––––––––––––––––––––––––––––––––––––––––––––––– */
+input[type="email"],
+input[type="number"],
+input[type="search"],
+input[type="text"],
+input[type="tel"],
+input[type="url"],
+input[type="password"],
+textarea,
+select {
+  height: 38px;
+  padding: 6px 10px; /* The 6px vertically centers text on FF, ignored by Webkit */
+  background-color: #fff;
+  border: 1px solid #D1D1D1;
+  border-radius: 4px;
+  box-shadow: none;
+  box-sizing: border-box; }
+/* Removes awkward default styles on some inputs for iOS */
+input[type="email"],
+input[type="number"],
+input[type="search"],
+input[type="text"],
+input[type="tel"],
+input[type="url"],
+input[type="password"],
+textarea {
+  -webkit-appearance: none;
+  -moz-appearance: none;
+  appearance: none; }
+textarea {
+  min-height: 65px;
+  padding-top: 6px;
+  padding-bottom: 6px; }
+input[type="email"]:focus,
+input[type="number"]:focus,
+input[type="search"]:focus,
+input[type="text"]:focus,
+input[type="tel"]:focus,
+input[type="url"]:focus,
+input[type="password"]:focus,
+textarea:focus,
+select:focus {
+  border: 1px solid #33C3F0;
+  outline: 0; }
+label,
+legend {
+  display: block;
+  margin-bottom: .5rem;
+  font-weight: 600; }
+fieldset {
+  padding: 0;
+  border-width: 0; }
+input[type="checkbox"],
+input[type="radio"] {
+  display: inline; }
+label > .label-body {
+  display: inline-block;
+  margin-left: .5rem;
+  font-weight: normal; }
+
+
+/* Lists
+–––––––––––––––––––––––––––––––––––––––––––––––––– */
+ul {
+  list-style: circle inside; }
+ol {
+  list-style: decimal inside; }
+ol, ul {
+  padding-left: 0;
+  margin-top: 0; }
+ul ul,
+ul ol,
+ol ol,
+ol ul {
+  margin: 1.5rem 0 1.5rem 3rem;
|
302 |
+
font-size: 90%; }
|
303 |
+
li {
|
304 |
+
margin-bottom: 1rem; }
|
305 |
+
|
306 |
+
|
307 |
+
/* Code
|
308 |
+
–––––––––––––––––––––––––––––––––––––––––––––––––– */
|
309 |
+
code {
|
310 |
+
padding: .2rem .5rem;
|
311 |
+
margin: 0 .2rem;
|
312 |
+
font-size: 90%;
|
313 |
+
white-space: nowrap;
|
314 |
+
background: #F1F1F1;
|
315 |
+
border: 1px solid #E1E1E1;
|
316 |
+
border-radius: 4px; }
|
317 |
+
pre > code {
|
318 |
+
display: block;
|
319 |
+
padding: 1rem 1.5rem;
|
320 |
+
white-space: pre; }
|
321 |
+
|
322 |
+
|
323 |
+
/* Tables
|
324 |
+
–––––––––––––––––––––––––––––––––––––––––––––––––– */
|
325 |
+
th,
|
326 |
+
td {
|
327 |
+
padding: 6px 5px;
|
328 |
+
text-align: left;
|
329 |
+
border-bottom: 1px solid #E1E1E1; }
|
330 |
+
th:first-child,
|
331 |
+
td:first-child {
|
332 |
+
padding-left: 0; }
|
333 |
+
th:last-child,
|
334 |
+
td:last-child {
|
335 |
+
padding-right: 0; }
|
336 |
+
|
337 |
+
|
338 |
+
/* Spacing
|
339 |
+
–––––––––––––––––––––––––––––––––––––––––––––––––– */
|
340 |
+
button,
|
341 |
+
.button {
|
342 |
+
margin-bottom: 1rem; }
|
343 |
+
input,
|
344 |
+
textarea,
|
345 |
+
select,
|
346 |
+
fieldset {
|
347 |
+
margin-bottom: 0.5rem; }
|
348 |
+
pre,
|
349 |
+
blockquote,
|
350 |
+
dl,
|
351 |
+
figure,
|
352 |
+
table,
|
353 |
+
p,
|
354 |
+
ul,
|
355 |
+
ol,
|
356 |
+
form {
|
357 |
+
margin-bottom: 1.5rem; }
|
358 |
+
|
359 |
+
|
360 |
+
/* Utilities
|
361 |
+
–––––––––––––––––––––––���–––––––––––––––––––––––––– */
|
362 |
+
.u-full-width {
|
363 |
+
width: 100%;
|
364 |
+
box-sizing: border-box; }
|
365 |
+
.u-max-full-width {
|
366 |
+
max-width: 100%;
|
367 |
+
box-sizing: border-box; }
|
368 |
+
.u-pull-right {
|
369 |
+
float: right; }
|
370 |
+
.u-pull-left {
|
371 |
+
float: left; }
|
372 |
+
|
373 |
+
|
374 |
+
/* Misc
|
375 |
+
–––––––––––––––––––––––––––––––––––––––––––––––––– */
|
376 |
+
hr {
|
377 |
+
margin-top: 3rem;
|
378 |
+
margin-bottom: 3.5rem;
|
379 |
+
border-width: 0;
|
380 |
+
border-top: 1px solid #E1E1E1; }
|
381 |
+
|
382 |
+
|
383 |
+
/* Clearing
|
384 |
+
–––––––––––––––––––––––––––––––––––––––––––––––––– */
|
385 |
+
|
386 |
+
/* Self Clearing Goodness */
|
387 |
+
.container:after,
|
388 |
+
.row:after,
|
389 |
+
.u-cf {
|
390 |
+
content: "";
|
391 |
+
display: table;
|
392 |
+
clear: both; }
|
393 |
+
|
394 |
+
|
395 |
+
/* Media Queries
|
396 |
+
–––––––––––––––––––––––––––––––––––––––––––––––––– */
|
397 |
+
/*
|
398 |
+
Note: The best way to structure the use of media queries is to create the queries
|
399 |
+
near the relevant code. For example, if you wanted to change the styles for buttons
|
400 |
+
on small devices, paste the mobile query code up in the buttons section and style it
|
401 |
+
there.
|
402 |
+
*/
|
403 |
+
|
404 |
+
|
405 |
+
/* Larger than mobile */
|
406 |
+
@media (min-width: 400px) {}
|
407 |
+
|
408 |
+
/* Larger than phablet (also point when grid becomes active) */
|
409 |
+
@media (min-width: 550px) {}
|
410 |
+
|
411 |
+
/* Larger than tablet */
|
412 |
+
@media (min-width: 750px) {}
|
413 |
+
|
414 |
+
/* Larger than desktop */
|
415 |
+
@media (min-width: 1000px) {}
|
416 |
+
|
417 |
+
/* Larger than Desktop HD */
|
418 |
+
@media (min-width: 1200px) {}
|
rtvc/demo_cli.py
ADDED
@@ -0,0 +1,330 @@
import argparse
from ctypes import alignment
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
from pathlib import Path
import spacy
import time


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("--run_id", type=str, default="default", help= \
        "Name for this model. By default, training outputs will be stored to saved_models/<run_id>/. If a model state "
        "from the same run ID was previously saved, the training will restart from there. Pass -f to overwrite saved "
        "states and restart from scratch.")
    parser.add_argument("-m", "--models_dir", type=Path, default="saved_models",
                        help="Directory containing all saved models")
    parser.add_argument("--weight", type=float, default=1,
                        help="weight of input audio for voice filter")
    parser.add_argument("--griffin_lim",
                        action="store_true",
                        help="if True, use griffin-lim, else use vocoder")
    parser.add_argument("--cpu", action="store_true", help=\
        "If True, processing is done on CPU, even when a GPU is available.")
    parser.add_argument("--no_sound", action="store_true", help=\
        "If True, audio won't be played.")
    parser.add_argument("--seed", type=int, default=None, help=\
        "Optional random number seed value to make toolbox deterministic.")
    args = parser.parse_args()
    arg_dict = vars(args)
    # print_args(args, parser)

    # Hide GPUs from Pytorch to force CPU processing
    if arg_dict.pop("cpu"):
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

    print("Running a test of your configuration...\n")

    import numpy as np
    import soundfile as sf
    import torch

    import encoder.inference
    import encoder.params_data
    from synthesizer.inference import Synthesizer_infer
    from synthesizer.utils.cleaners import add_breaks, english_cleaners_predict
    from vocoder import inference as vocoder
    from vocoder.display import save_attention_multiple, save_spectrogram, save_stop_tokens
    from utils.argutils import print_args
    from utils.default_models import ensure_default_models
    from speed_changer.fixSpeed import *

    if torch.cuda.is_available():
        device_id = torch.cuda.current_device()
        gpu_properties = torch.cuda.get_device_properties(device_id)
        ## Print some environment information (for debugging purposes)
        print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
              "%.1fGb total memory.\n" %
              (torch.cuda.device_count(),
               device_id,
               gpu_properties.name,
               gpu_properties.major,
               gpu_properties.minor,
               gpu_properties.total_memory / 1e9))
    else:
        print("Using CPU for inference.\n")

    ## Load the models one by one.
    if not args.griffin_lim:
        print("Preparing the encoder, the synthesizer and the vocoder...")
    else:
        print("Preparing the encoder and the synthesizer...")
    ensure_default_models(args.run_id, Path("saved_models"))
    encoder.inference.load_model(list(args.models_dir.glob(f"{args.run_id}/encoder.pt"))[0])
    # Keep the model paths around so the seed-reset branches below can reload the models
    syn_model_fpath = list(args.models_dir.glob(f"{args.run_id}/synthesizer.pt"))[0]
    synthesizer = Synthesizer_infer(syn_model_fpath)
    if not args.griffin_lim:
        voc_model_fpath = list(args.models_dir.glob(f"{args.run_id}/vocoder.pt"))[0]
        vocoder.load_model(voc_model_fpath)


    # ## Run a test
    # print("Testing your configuration with small inputs.")
    # # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
    # # sampling rate, which may differ.
    # # If you're unfamiliar with digital audio, know that it is encoded as an array of floats
    # # (or sometimes integers, but mostly floats in this project) ranging from -1 to 1.
    # # The sampling rate is the number of values (samples) recorded per second, it is set to
    # # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
    # # to an audio of 1 second.
    # print("\tTesting the encoder...")
    # encoder.embed_utterance(np.zeros(encoder.sampling_rate))

    # # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
    # # returns, but here we're going to make one ourselves just for the sake of showing that it's
    # # possible.
    # embed = np.random.rand(speaker_embedding_size)
    # # Embeddings are L2-normalized (this isn't important here, but if you want to make your own
    # # embeddings it will be).
    # embed /= np.linalg.norm(embed)
    # # The synthesizer can handle multiple inputs with batching. Let's create another embedding to
    # # illustrate that
    # embeds = [embed, np.zeros(speaker_embedding_size)]
    # texts = ["test 1", "test 2"]
    # print("\tTesting the synthesizer... (loading the model will output a lot of text)")
    # mels = synthesizer.synthesize_spectrograms(texts, embeds)

    # # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
    # # can concatenate the mel spectrograms to a single one.
    # mel = np.concatenate(mels, axis=1)
    # # The vocoder can take a callback function to display the generation. More on that later. For
    # # now we'll simply hide it like this:
    # if not args.griffin_lim:
    #     no_action = lambda *args: None
    #     print("\tTesting the vocoder...")
    #     # For the sake of making this test short, we'll pass a short target length. The target length
    #     # is the length of the wav segments that are processed in parallel. E.g. for audio sampled
    #     # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
    #     # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and
    #     # that has a detrimental effect on the quality of the audio. The default parameters are
    #     # recommended in general.
    #     vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)

    # print("All tests passed! You can now synthesize speech.\n\n")


    ## Interactive speech generation
    print("This is a GUI-less example of interface to SV2TTS. The purpose of this script is to "
          "show how you can interface this project easily with your own. See the source code for "
          "an explanation of what is happening.\n")

    print("Interactive generation loop")
    num_generated = 0

    nlp = spacy.load('en_core_web_sm')
    weight = arg_dict["weight"]  # weight given to the user's voice in the voice filter
    amp = 1

    while True:
        # try:
        # Get the reference audio filepath
        num_of_input_audio = 1

        for i in range(num_of_input_audio):
            # Computing the embedding
            # First, we load the wav using the function that the speaker encoder provides. This is
            # important: there is preprocessing that must be applied.

            # The following two methods are equivalent:
            # - Directly load from the filepath:
            #   preprocessed_wav = encoder.preprocess_wav(in_fpath)
            # - If the wav is already loaded:

            # get duration info from input audio
            message2 = "Reference voice: enter an audio file of a voice to be cloned (mp3, " \
                       f"wav, m4a, flac, ...):({i+1}/{num_of_input_audio})\n"
            in_fpath = Path(input(message2).replace("\"", "").replace("\'", ""))

            fpath_without_ext = os.path.splitext(str(in_fpath))[0]
            speaker_name = os.path.normpath(fpath_without_ext).split(os.sep)[-1]

            is_wav_file, single_wav, wav_path = TransFormat(in_fpath, 'wav')

            if not is_wav_file:
                os.remove(wav_path)  # remove intermediate wav files
            # merge
            if i == 0:
                wav = single_wav
            else:
                wav = np.append(wav, single_wav)

        # write to disk
        path_ori, _ = os.path.split(wav_path)
        file_ori = 'temp.wav'
        fpath = os.path.join(path_ori, file_ori)
        sf.write(fpath, wav, samplerate=encoder.params_data.sampling_rate)

        # adjust the speed
        totDur_ori, nPause_ori, arDur_ori, nSyl_ori, arRate_ori = AudioAnalysis(path_ori, file_ori)
        DelFile(path_ori, '.TextGrid')
        os.remove(fpath)

        preprocessed_wav = encoder.inference.preprocess_wav(wav)

        print("Loaded input audio file successfully")

        # Then we derive the embedding. There are many functions and parameters that the
        # speaker encoder interfaces. These are mostly for in-depth research. You will typically
        # only use this function (with its default parameters):
        input_embed = encoder.inference.embed_utterance(preprocessed_wav)
        # Choose standard audio

        fft_max_freq = vocoder.get_dominant_freq(preprocessed_wav)
        print(f"\nthe dominant frequency of input audio is {fft_max_freq}Hz")
        if fft_max_freq < encoder.params_data.split_freq:
            vocoder.hp.sex = 1
            standard_fpath = "standard_audios/male_1.wav"
        else:
            vocoder.hp.sex = 0
            standard_fpath = "standard_audios/female_1.wav"

        if os.path.exists(standard_fpath):
            standard_wav = Synthesizer_infer.load_preprocess_wav(standard_fpath)
            preprocessed_standard_wav = encoder.inference.preprocess_wav(standard_wav)
            print("Loaded standard audio file successfully")

            standard_embed = encoder.inference.embed_utterance(preprocessed_standard_wav)

            embed1 = np.copy(input_embed).dot(weight)
            embed2 = np.copy(standard_embed).dot(1 - weight)
            embed = embed1 + embed2
        else:
            embed = np.copy(input_embed)

        embed[embed < encoder.params_data.set_zero_thres] = 0  # zero out noise values
        embed = embed * amp

        start_syn = time.time()
        # Generating the spectrogram
        text = input("Write a sentence to be synthesized:\n")

        # If seed is specified, reset torch seed and force synthesizer reload
        if args.seed is not None:
            torch.manual_seed(args.seed)
            synthesizer = Synthesizer_infer(syn_model_fpath)

        # The synthesizer works in batch, so you need to put your data in a list or numpy array
        def preprocess_text(text):
            text = add_breaks(text)
            text = english_cleaners_predict(text)
            texts = [i.text.strip() for i in nlp(text).sents]  # split paragraph to sentences
            return texts

        texts = preprocess_text(text)
        print(f"the list of input texts:\n{texts}")

        # embeds = [embed] * len(texts)

        specs = []
        alignments = []
        stop_tokens = []

        for text in texts:
            spec, align, stop_token = synthesizer.synthesize_spectrograms([text], [embed], require_visualization=True)
            specs.append(spec[0])
            alignments.append(align[0])
            stop_tokens.append(stop_token[0])

        breaks = [spec.shape[1] for spec in specs]
        spec = np.concatenate(specs, axis=1)

        ## Save synthesizer visualization results
        if not os.path.exists("syn_results"):
            os.mkdir("syn_results")
        save_attention_multiple(alignments, "syn_results/attention")
        save_stop_tokens(stop_tokens, "syn_results/stop_tokens")
        save_spectrogram(spec, "syn_results/mel")
        print("Created the mel spectrogram")

        end_syn = time.time()
        print(f"Prediction time of synthesizer is {end_syn - start_syn}s")

        start_voc = time.time()
        ## Generating the waveform
        print("Synthesizing the waveform:")

        # If seed is specified, reset torch seed and reload vocoder
        if args.seed is not None:
            torch.manual_seed(args.seed)
            if not args.griffin_lim:
                vocoder.load_model(voc_model_fpath)

        # Synthesizing the waveform is fairly straightforward. Remember that the longer the
        # spectrogram, the more time-efficient the vocoder.
        if not args.griffin_lim:
            wav = vocoder.infer_waveform(spec, target=vocoder.hp.voc_target, overlap=vocoder.hp.voc_overlap, crossfade=vocoder.hp.is_crossfade)
        else:
            wav = Synthesizer_infer.griffin_lim(spec)

        end_voc = time.time()
        print(f"Prediction time of vocoder is {end_voc - start_voc}s")
        print(f"Prediction time of TTS is {end_voc - start_syn}s")

        # Add breaks
        b_ends = np.cumsum(np.array(breaks) * Synthesizer_infer.hparams.hop_size)
        b_starts = np.concatenate(([0], b_ends[:-1]))
        wavs = [wav[start:end] for start, end in zip(b_starts, b_ends)]
        breaks = [np.zeros(int(0.15 * Synthesizer_infer.sample_rate))] * len(breaks)
        wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

        # Trim excess silences to compensate for gaps in spectrograms (issue #53)
        # generated_wav = encoder.inference.preprocess_wav(wav)
        wav = wav / np.abs(wav).max() * 4

        # Save it on the disk
        # filename = "demo_output_%02d.wav" % num_generated
        if not os.path.exists("out_audios"):
            os.mkdir("out_audios")

        dir_path = os.path.dirname(os.path.realpath(__file__))  # current dir
        filename = os.path.join(dir_path, f"out_audios/{speaker_name}_syn.wav")
        # print(wav.dtype)
        sf.write(filename, wav.astype(np.float32), synthesizer.sample_rate)
        num_generated += 1
        print("\nSaved output (speed not yet adjusted) as %s\n\n" % filename)

        # Fix speed (generate new audio)
        fix_file = work(totDur_ori,
                        nPause_ori,
                        arDur_ori,
                        nSyl_ori,
                        arRate_ori,
                        filename)
        print(f"\nSaved output (fixed speed) as {fix_file}\n\n")


        # # Play the audio (non-blocking)
        # if not args.no_sound:
        #     import sounddevice as sd
        #     try:
        #         sd.stop()
        #         sd.play(wav, synthesizer.sample_rate)
        #     except sd.PortAudioError as e:
        #         print("\nCaught exception: %s" % repr(e))
        #         print("Continuing without audio playback. Suppress this message with the \"--no_sound\" flag.\n")
        #     except:
        #         raise

        # except Exception as e:
        #     print("Caught exception: %s" % repr(e))
        #     print("Restarting\n")
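
The core of the --weight "voice filter" above is a plain linear interpolation between the user's speaker embedding and a "standard" reference embedding, followed by zeroing of small components. A minimal sketch of just that step, assuming two already-computed embeddings; blend_embeddings is a hypothetical helper for illustration, not part of this repo:

import numpy as np

def blend_embeddings(input_embed: np.ndarray, standard_embed: np.ndarray,
                     weight: float, set_zero_thres: float = 0.0) -> np.ndarray:
    # Interpolate between the user's voice (weight) and the reference voice (1 - weight)
    embed = weight * input_embed + (1 - weight) * standard_embed
    # Zero out small components treated as noise, mirroring what demo_cli.py does
    embed[embed < set_zero_thres] = 0
    return embed

With weight=1 (the default) the output is the user's embedding unchanged; weight=0 uses only the standard voice.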
rtvc/demo_results/text1/1688-142285-0000_syn.wav
ADDED
Binary file (108 kB)
rtvc/demo_results/text1/260-123286-0000_syn.wav
ADDED
Binary file (112 kB)
rtvc/demo_results/text1/4294-9934-0000_syn.wav
ADDED
Binary file (104 kB)
rtvc/demo_results/text1/7176-88083-0000_syn.wav
ADDED
Binary file (104 kB)
rtvc/demo_results/text1/README.md
ADDED
@@ -0,0 +1 @@
Life was like a box of chocolates, you never know what you're gonna get.
rtvc/demo_results/text2/1688-142285-0000_syn.wav
ADDED
Binary file (650 kB)
rtvc/demo_results/text2/260-123286-0000_syn.wav
ADDED
Binary file (611 kB)
rtvc/demo_results/text2/4294-9934-0000_syn.wav
ADDED
Binary file (617 kB)
rtvc/demo_results/text2/7176-88083-0000_syn.wav
ADDED
Binary file (595 kB)
rtvc/demo_results/text2/README.md
ADDED
@@ -0,0 +1 @@
In 2014, P&G recorded $83.1 billion in sales. On August 1, 2014, P&G announced it was streamlining the company, dropping and selling off around 100 brands from its product portfolio in order to focus on the remaining 65 brands, which produced 95% of the company's profits.
rtvc/demo_results/text3/1688-142285-0000_syn.wav
ADDED
Binary file (746 kB)
rtvc/demo_results/text3/260-123286-0000_syn.wav
ADDED
Binary file (812 kB)
rtvc/demo_results/text3/4294-9934-0000_syn.wav
ADDED
Binary file (744 kB)
rtvc/demo_results/text3/7176-88083-0000_syn.wav
ADDED
Binary file (720 kB)
rtvc/demo_results/text3/README.md
ADDED
@@ -0,0 +1 @@
Mechanics is a branch of physics that deals with the behavior of physical bodies under the influence of various forces. The study of mechanics is important in understanding the behavior of machines, the motion of objects, and the principles of engineering. Mechanics has been an essential part of physics since ancient times and has continued to evolve with advancements in science and technology. This paper will discuss the principles of mechanics, the laws of motion, and the applications of mechanics in engineering and technology.
rtvc/demo_toolbox.py
ADDED
@@ -0,0 +1,41 @@
import argparse
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
from pathlib import Path

from toolbox import Toolbox
from utils.argutils import print_args
from utils.default_models import ensure_default_models


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Runs the toolbox.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("--run_id", type=str, default="20230609", help= \
        "Name for this model. By default, training outputs will be stored to saved_models/<run_id>/. If a model state "
        "from the same run ID was previously saved, the training will restart from there. Pass -f to overwrite saved "
        "states and restart from scratch.")
    parser.add_argument("-d", "--datasets_root", type=Path, help= \
        "Path to the directory containing your datasets. See toolbox/__init__.py for a list of "
        "supported datasets.", default=None)
    parser.add_argument("-m", "--models_dir", type=Path, default="saved_models",
                        help="Directory containing all saved models")
    parser.add_argument("--cpu", action="store_true", help=\
        "If True, all inference will be done on CPU")
    parser.add_argument("--seed", type=int, default=None, help=\
        "Optional random number seed value to make toolbox deterministic.")
    args = parser.parse_args()
    arg_dict = vars(args)
    print_args(args, parser)

    # Hide GPUs from Pytorch to force CPU processing
    if arg_dict.pop("cpu"):
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

    # Remind the user to download pretrained models if needed
    ensure_default_models(args.run_id, args.models_dir)

    # Launch the toolbox
    Toolbox(**arg_dict)
rtvc/docs/images/audio_icon.png
ADDED
Binary file (image)
rtvc/docs/images/voice_cloning_arch.png
ADDED
Binary file (image)
rtvc/encoder/__init__.py
ADDED
File without changes
rtvc/encoder/audio.py
ADDED
@@ -0,0 +1,136 @@
from scipy.ndimage import binary_dilation
from encoder.params_data import *
from pathlib import Path
from typing import Optional, Union
from warnings import warn
import numpy as np
import librosa
import struct
import os
from pydub import AudioSegment
import noisereduce

try:
    import webrtcvad
except ImportError:
    warn("Unable to import 'webrtcvad'. This package enables noise removal and is recommended.")
    webrtcvad = None

int16_max = (2 ** 15) - 1


def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
                   source_sr: Optional[int] = None,
                   normalize: Optional[bool] = True,
                   trim_silence: Optional[bool] = True):
    """
    Applies the preprocessing operations used in training the Speaker Encoder to a waveform
    either on disk or in memory. The waveform will be resampled to match the data hyperparameters.

    :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
    just .wav), or the waveform as a numpy array of floats.
    :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
    preprocessing. After preprocessing, the waveform's sampling rate will match the data
    hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
    this argument will be ignored.
    """
    # Load the wav from disk if needed
    if isinstance(fpath_or_wav, (str, Path)):
        # if str(fpath_or_wav).endswith(".m4a"):
        #     try:
        #         track = AudioSegment.from_file(fpath_or_wav, format="m4a")
        #     except:
        #         return []
        #     fpath = os.path.splitext(str(fpath_or_wav))[0]
        #     path_components = os.path.normpath(fpath).split(os.sep)
        #     wav_dir = Path("D:\\liuhaozhe").joinpath(f"VoxCeleb2_wav")  # local path
        #     wav_dir.mkdir(exist_ok=True)
        #     wav_name = "_".join(path_components[-6:])
        #     wav_path = wav_dir.joinpath(f"{wav_name}.wav")
        #     track.export(wav_path, format="wav")
        #     wav, source_sr = librosa.load(str(wav_path), sr=None)
        # else:
        wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
    else:
        wav = fpath_or_wav

    # Resample the wav if needed
    if source_sr is not None and source_sr != sampling_rate:
        wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)

    # Apply the preprocessing: normalize volume and shorten long silences
    if normalize:
        wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
    if webrtcvad and trim_silence:
        wav = trim_long_silences(wav)

    return wav


def wav_to_mel_spectrogram(wav):
    """
    Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
    Note: this is not a log-mel spectrogram.
    """
    frames = librosa.feature.melspectrogram(
        y=wav,
        sr=sampling_rate,
        n_fft=int(sampling_rate * mel_window_length / 1000),
        hop_length=int(sampling_rate * mel_window_step / 1000),
        n_mels=mel_n_channels
    )
    return frames.astype(np.float32).T


def trim_long_silences(wav):
    """
    Ensures that segments without voice in the waveform remain no longer than a
    threshold determined by the VAD parameters in params.py.

    :param wav: the raw waveform as a numpy array of floats
    :return: the same waveform with silences trimmed away (length <= original wav length)
    """
    # Compute the voice detection window size
    samples_per_window = (vad_window_length * sampling_rate) // 1000

    # Trim the end of the audio to have a multiple of the window size
    wav = wav[:len(wav) - (len(wav) % samples_per_window)]

    # Convert the float waveform to 16-bit mono PCM
    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))

    # Perform voice activation detection
    voice_flags = []
    vad = webrtcvad.Vad(mode=3)
    for window_start in range(0, len(wav), samples_per_window):
        window_end = window_start + samples_per_window
        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
                                         sample_rate=sampling_rate))
    voice_flags = np.array(voice_flags)

    # Smooth the voice detection with a moving average
    def moving_average(array, width):
        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
        ret = np.cumsum(array_padded, dtype=float)
        ret[width:] = ret[width:] - ret[:-width]
        return ret[width - 1:] / width

    audio_mask = moving_average(voice_flags, vad_moving_average_width)
    audio_mask = np.round(audio_mask).astype(bool)

    # Dilate the voiced regions
    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
    audio_mask = np.repeat(audio_mask, samples_per_window)

    return wav[audio_mask]


def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
    if increase_only and decrease_only:
        raise ValueError("Both increase only and decrease only are set")
    dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2))
    if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
        return wav
    return wav * (10 ** (dBFS_change / 20))
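
For reference, a minimal sketch of how the helpers above fit together; the audio path is illustrative and assumes a local file:

from encoder import audio

wav = audio.preprocess_wav("reference.wav")   # load, resample, normalize volume, trim silences
mel = audio.wav_to_mel_spectrogram(wav)       # float32 array of shape (n_frames, mel_n_channels)
print(mel.shape)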
rtvc/encoder/config.py
ADDED
@@ -0,0 +1,45 @@
librispeech_datasets = {
    "train": {
        "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"],
        "other": ["LibriSpeech/train-other-500"]
    },
    "test": {
        "clean": ["LibriSpeech/test-clean"],
        "other": ["LibriSpeech/test-other"]
    },
    "dev": {
        "clean": ["LibriSpeech/dev-clean"],
        "other": ["LibriSpeech/dev-other"]
    },
}
libritts_datasets = {
    "train": {
        "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"],
        "other": ["LibriTTS/train-other-500"]
    },
    "test": {
        "clean": ["LibriTTS/test-clean"],
        "other": ["LibriTTS/test-other"]
    },
    "dev": {
        "clean": ["LibriTTS/dev-clean"],
        "other": ["LibriTTS/dev-other"]
    },
}
voxceleb_datasets = {
    "voxceleb1": {
        "train": ["VoxCeleb1/wav"],
        "test": ["VoxCeleb1/test_wav"]
    },
    "voxceleb2": {
        "train": ["VoxCeleb2/dev/aac"],
        "test": ["VoxCeleb2/test_wav"]
    }
}

other_datasets = [
    "LJSpeech-1.1",
    "VCTK-Corpus/wav48",
]

anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"]
rtvc/encoder/data_objects/__init__.py
ADDED
@@ -0,0 +1,2 @@
from encoder.data_objects.speaker_verification_dataset import Train_Dataset, Dev_Dataset
from encoder.data_objects.speaker_verification_dataset import DataLoader
rtvc/encoder/data_objects/random_cycler.py
ADDED
@@ -0,0 +1,37 @@
import random

class RandomCycler:
    """
    Creates an internal copy of a sequence and allows access to its items in a constrained random
    order. For a source sequence of n items and one or several consecutive queries of a total
    of m items, the following guarantees hold (one implies the other):
        - Each item will be returned between m // n and ((m - 1) // n) + 1 times.
        - Between two appearances of the same item, there may be at most 2 * (n - 1) other items.
    """

    def __init__(self, source):
        if len(source) == 0:
            raise Exception("Can't create RandomCycler from an empty collection")
        self.all_items = list(source)
        self.next_items = []

    def sample(self, count: int):
        shuffle = lambda l: random.sample(l, len(l))

        out = []
        while count > 0:
            if count >= len(self.all_items):
                out.extend(shuffle(list(self.all_items)))
                count -= len(self.all_items)
                continue
            n = min(count, len(self.next_items))
            out.extend(self.next_items[:n])
            count -= n
            self.next_items = self.next_items[n:]
            if len(self.next_items) == 0:
                self.next_items = shuffle(list(self.all_items))
        return out

    def __next__(self):
        return self.sample(1)[0]
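
A small check of the guarantee documented in the class docstring: over m = 9 samples from n = 3 items, each item is returned exactly m // n = 3 times, reshuffled every cycle.

cycler = RandomCycler(["a", "b", "c"])
items = cycler.sample(9)
assert sorted(items) == ["a", "a", "a", "b", "b", "b", "c", "c", "c"]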
rtvc/encoder/data_objects/speaker.py
ADDED
@@ -0,0 +1,40 @@
from encoder.data_objects.random_cycler import RandomCycler
from encoder.data_objects.utterance import Utterance
from pathlib import Path

# Contains the set of utterances of a single speaker
class Speaker:
    def __init__(self, root: Path):
        self.root = root
        self.name = root.name
        self.utterances = None
        self.utterance_cycler = None

    def _load_utterances(self):
        with self.root.joinpath("_sources.txt").open("r") as sources_file:
            sources = [l.split(",") for l in sources_file]
        sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources}
        self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()]
        self.utterance_cycler = RandomCycler(self.utterances)

    def random_partial(self, count, n_frames):
        """
        Samples a batch of <count> unique partial utterances from the disk in a way that all
        utterances come up at least once every two cycles and in a random order every time.

        :param count: The number of partial utterances to sample from the set of utterances from
        that speaker. Utterances are guaranteed not to be repeated if <count> is not larger than
        the number of utterances available.
        :param n_frames: The number of frames in the partial utterance.
        :return: A list of tuples (utterance, frames, range) where utterance is an Utterance,
        frames are the frames of the partial utterances and range is the range of the partial
        utterance with regard to the complete utterance.
        """
        if self.utterances is None:
            self._load_utterances()

        utterances = self.utterance_cycler.sample(count)

        a = [(u,) + u.random_partial(n_frames) for u in utterances]

        return a
rtvc/encoder/data_objects/speaker_batch.py
ADDED
@@ -0,0 +1,13 @@
import numpy as np
from typing import List
from encoder.data_objects.speaker import Speaker


class SpeakerBatch:
    def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int):
        self.speakers = speakers
        self.partials = {s: s.random_partial(utterances_per_speaker, n_frames) for s in speakers}

        # Array of shape (n_speakers * n_utterances, n_frames, mel_n), e.g. for 3 speakers with
        # 4 utterances each of 160 frames of 40 mel coefficients: (12, 160, 40)
        self.data = np.array([frames for s in speakers for _, frames, _ in self.partials[s]])
rtvc/encoder/data_objects/speaker_verification_dataset.py
ADDED
@@ -0,0 +1,76 @@
from encoder.data_objects.random_cycler import RandomCycler
from encoder.data_objects.speaker_batch import SpeakerBatch
from encoder.data_objects.utterance_batch import UtteranceBatch
from encoder.data_objects.speaker import Speaker
from encoder.params_data import partials_n_frames
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
from os import listdir
from os.path import isfile
import numpy as np

# TODO: improve with a pool of speakers for data efficiency

class Train_Dataset(Dataset):
    def __init__(self, datasets_root: Path):
        self.root = datasets_root
        speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()]
        if len(speaker_dirs) == 0:
            raise Exception("No speakers found. Make sure you are pointing to the directory "
                            "containing all preprocessed speaker directories.")
        self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs]
        self.speaker_cycler = RandomCycler(self.speakers)

    def __len__(self):
        return int(1e8)

    def __getitem__(self, index):
        return next(self.speaker_cycler)

    def get_logs(self):
        log_string = ""
        for log_fpath in self.root.glob("*.txt"):
            with log_fpath.open("r") as log_file:
                log_string += "".join(log_file.readlines())
        return log_string


class Dev_Dataset(Dataset):
    def __init__(self, datasets_root: Path):
        self.root = datasets_root
        speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()]
        if len(speaker_dirs) == 0:
            raise Exception("No speakers found. Make sure you are pointing to the directory "
                            "containing all preprocessed speaker directories.")
        self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs]
        self.speaker_cycler = RandomCycler(self.speakers)

    def __len__(self):
        return len(self.speakers)

    def __getitem__(self, index):
        return next(self.speaker_cycler)


class DataLoader(DataLoader):
    def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, shuffle, sampler=None,
                 batch_sampler=None, num_workers=0, pin_memory=False, timeout=0,
                 worker_init_fn=None):
        self.utterances_per_speaker = utterances_per_speaker

        super().__init__(
            dataset=dataset,
            batch_size=speakers_per_batch,
            shuffle=shuffle,
            sampler=sampler,
            batch_sampler=batch_sampler,
            num_workers=num_workers,
            collate_fn=self.collate,
            pin_memory=pin_memory,
            drop_last=False,
            timeout=timeout,
            worker_init_fn=worker_init_fn
        )

    def collate(self, speakers):
        return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames)
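
A minimal sketch of how these classes are meant to be wired together, assuming a directory of preprocessed speaker folders (the path is illustrative):

from pathlib import Path
from encoder.data_objects import Train_Dataset, DataLoader

dataset = Train_Dataset(Path("datasets/SV2TTS/encoder"))
loader = DataLoader(dataset, speakers_per_batch=32,
                    utterances_per_speaker=10, shuffle=False)
batch = next(iter(loader))  # a SpeakerBatch; batch.data has shape (32 * 10, partials_n_frames, mel_n_channels)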
rtvc/encoder/data_objects/utterance.py
ADDED
@@ -0,0 +1,29 @@
import numpy as np


class Utterance:
    def __init__(self, frames_fpath, wave_fpath):
        self.frames_fpath = frames_fpath
        self.wave_fpath = wave_fpath

    def get_frames(self):
        # frame_len = len(np.load(self.frames_fpath))
        return np.load(self.frames_fpath)

    def random_partial(self, n_frames):
        """
        Crops the frames into a partial utterance of n_frames

        :param n_frames: The number of frames of the partial utterance
        :return: the partial utterance frames and a tuple indicating the start and end of the
        partial utterance in the complete utterance.
        """
        frames = self.get_frames()
        if frames.shape[0] == n_frames:
            start = 0
        else:
            start = np.random.randint(0, frames.shape[0] - n_frames)
        end = start + n_frames
        # frame_len = end - start
        # frames_trim = frames[start:end]
        return frames[start:end], (start, end)
rtvc/encoder/data_objects/utterance_batch.py
ADDED
@@ -0,0 +1,10 @@
from pathlib import Path
import numpy as np
from typing import List
from encoder.data_objects.utterance import Utterance


class UtteranceBatch:
    def __init__(self, utterance_path: List[Path], n_frames: int):
        self.utterance = Utterance(utterance_path, None)
        self.data = np.array(self.utterance.random_partial(n_frames)[0])
rtvc/encoder/inference.py
ADDED
@@ -0,0 +1,178 @@
from encoder.params_data import *
from encoder.model import SpeakerEncoder
from encoder.audio import preprocess_wav  # We want to expose this function from here
from matplotlib import cm
from encoder import audio
from pathlib import Path
import numpy as np
import torch

_model = None  # type: SpeakerEncoder
_device = None  # type: torch.device


def load_model(weights_fpath: Path, device=None):
    """
    Loads the model in memory. If this function is not explicitly called, it will be run on the
    first call to embed_frames() with the default weights file.

    :param weights_fpath: the path to saved model weights.
    :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The
    model will be loaded and will run on this device. Outputs will however always be on the cpu.
    If None, will default to your GPU if it's available, otherwise your CPU.
    """
    # TODO: I think the slow loading of the encoder might have something to do with the device it
    # was saved on. Worth investigating.
    global _model, _device
    if device is None:
        _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    elif isinstance(device, str):
        _device = torch.device(device)
    else:
        _device = device  # a torch.device instance was passed in directly
    _model = SpeakerEncoder(_device, torch.device("cpu"))
    checkpoint = torch.load(weights_fpath, _device)
    _model.load_state_dict(checkpoint["model_state"])
    _model.eval()
    print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"]))


def is_loaded():
    return _model is not None


def embed_frames_batch(frames_batch):
    """
    Computes embeddings for a batch of mel spectrograms.

    :param frames_batch: a batch of mel spectrograms as a numpy array of float32 of shape
    (batch_size, n_frames, n_channels)
    :return: the embeddings as a numpy array of float32 of shape (batch_size, model_embedding_size)
    """
    if _model is None:
        raise Exception("Model was not loaded. Call load_model() before inference.")

    frames = torch.from_numpy(frames_batch).to(_device)
    embed = _model.forward(frames).detach().cpu().numpy()
    return embed


def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames,
                           min_pad_coverage=0.75, overlap=0.5):
    """
    Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain
    partial utterances of <partial_utterance_n_frames> each. Both the waveform and the mel
    spectrogram slices are returned, so as to make each partial utterance waveform correspond to
    its spectrogram. This function assumes that the mel spectrogram parameters used are those
    defined in params_data.py.

    The returned ranges may be indexing further than the length of the waveform. It is
    recommended that you pad the waveform with zeros up to wave_slices[-1].stop.

    :param n_samples: the number of samples in the waveform
    :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial
    utterance
    :param min_pad_coverage: when reaching the last partial utterance, it may or may not have
    enough frames. If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
    then the last partial utterance will be considered, as if we padded the audio. Otherwise,
    it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
    utterance, this parameter is ignored so that the function always returns at least 1 slice.
    :param overlap: by how much the partial utterance should overlap. If set to 0, the partial
    utterances are entirely disjoint.
    :return: the waveform slices and mel spectrogram slices as lists of array slices. Index
    respectively the waveform and the mel spectrogram with these slices to obtain the partial
    utterances.
    """
    assert 0 <= overlap < 1
    assert 0 < min_pad_coverage <= 1

    samples_per_frame = int((sampling_rate * mel_window_step / 1000))
    n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
    frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1)

    # Compute the slices
    wav_slices, mel_slices = [], []
    steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1)
    for i in range(0, steps, frame_step):
        mel_range = np.array([i, i + partial_utterance_n_frames])
        wav_range = mel_range * samples_per_frame
        mel_slices.append(slice(*mel_range))
        wav_slices.append(slice(*wav_range))

    # Evaluate whether extra padding is warranted or not
    last_wav_range = wav_slices[-1]
    coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start)
    if coverage < min_pad_coverage and len(mel_slices) > 1:
        mel_slices = mel_slices[:-1]
        wav_slices = wav_slices[:-1]

    return wav_slices, mel_slices


def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
    """
    Computes an embedding for a single utterance.

    # TODO: handle multiple wavs to benefit from batching on GPU
    :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32
    :param using_partials: if True, then the utterance is split in partial utterances of
    <partial_utterance_n_frames> frames and the utterance embedding is computed from their
    normalized average. If False, the utterance is instead computed from feeding the entire
    spectrogram to the network.
    :param return_partials: if True, the partial embeddings will also be returned along with the
    wav slices that correspond to the partial embeddings.
    :param kwargs: additional arguments to compute_partial_slices()
    :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
    <return_partials> is True, the partial utterances as a numpy array of float32 of shape
    (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
    returned. If <using_partials> is simultaneously set to False, both these values will be None
    instead.
    """
    # Process the entire utterance if not using partials
    if not using_partials:
        frames = audio.wav_to_mel_spectrogram(wav)
        embed = embed_frames_batch(frames[None, ...])[0]
        if return_partials:
            return embed, None, None
        return embed

    # Compute where to split the utterance into partials and pad if necessary
    wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs)
    max_wave_length = wave_slices[-1].stop
    if max_wave_length >= len(wav):
        wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")

    # Split the utterance into partials
    frames = audio.wav_to_mel_spectrogram(wav)
    frames_batch = np.array([frames[s] for s in mel_slices])
    partial_embeds = embed_frames_batch(frames_batch)

    # Compute the utterance embedding from the partial embeddings
    raw_embed = np.mean(partial_embeds, axis=0)
    embed = raw_embed / np.linalg.norm(raw_embed, 2)

    if return_partials:
        return embed, partial_embeds, wave_slices
    return embed


def embed_speaker(wavs, **kwargs):
    raise NotImplementedError()


def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)):
    import matplotlib.pyplot as plt
    if ax is None:
        ax = plt.gca()

    if shape is None:
        height = int(np.sqrt(len(embed)))
        shape = (height, -1)
    embed = embed.reshape(shape)

    cmap = cm.get_cmap()
    mappable = ax.imshow(embed, cmap=cmap)
    cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04)
    sm = cm.ScalarMappable(cmap=cmap)
    sm.set_clim(*color_range)

    ax.set_xticks([]), ax.set_yticks([])
    ax.set_title(title)
rtvc/encoder/model.py
ADDED
@@ -0,0 +1,135 @@
from encoder.params_model import *
from encoder.params_data import *
from scipy.interpolate import interp1d
from sklearn.metrics import roc_curve
from torch.nn.utils import clip_grad_norm_
from scipy.optimize import brentq
from torch import nn
import numpy as np
import torch


class SpeakerEncoder(nn.Module):
    def __init__(self, device, loss_device):
        super().__init__()
        self.loss_device = loss_device

        # Network definition
        self.lstm = nn.LSTM(input_size=mel_n_channels,
                            hidden_size=model_hidden_size,
                            num_layers=model_num_layers,
                            batch_first=True).to(device)
        self.linear = nn.Linear(in_features=model_hidden_size,
                                out_features=model_embedding_size).to(device)
        self.relu = torch.nn.ReLU().to(device)

        # Cosine similarity scaling (with fixed initial parameter values)
        self.similarity_weight = nn.Parameter(torch.tensor([10.], device=loss_device))
        self.similarity_bias = nn.Parameter(torch.tensor([-5.], device=loss_device))  ####modified####

        # Loss
        self.loss_fn = nn.CrossEntropyLoss().to(loss_device)

    def do_gradient_ops(self):
        # Gradient scale
        self.similarity_weight.grad *= 0.01
        self.similarity_bias.grad *= 0.01

        # Gradient clipping
        clip_grad_norm_(self.parameters(), 3, norm_type=2)

    def forward(self, utterances, hidden_init=None):
        """
        Computes the embeddings of a batch of utterance spectrograms.

        :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape
        (batch_size, n_frames, n_channels)
        :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers,
        batch_size, hidden_size). Will default to a tensor of zeros if None.
        :return: the embeddings as a tensor of shape (batch_size, embedding_size)
        """
        # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state
        # and the final cell state.
        out, (hidden, cell) = self.lstm(utterances, hidden_init)

        # We take only the hidden state of the last layer
        embeds_raw = self.relu(self.linear(hidden[-1]))

        # L2-normalize it
        embeds = embeds_raw / (torch.norm(embeds_raw, dim=1, keepdim=True) + 1e-5)

        return embeds

    def similarity_matrix(self, embeds):
        """
        Computes the similarity matrix according to section 2.1 of GE2E.

        :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
        utterances_per_speaker, embedding_size)
        :return: the similarity matrix as a tensor of shape (speakers_per_batch,
        utterances_per_speaker, speakers_per_batch)
        """
        speakers_per_batch, utterances_per_speaker = embeds.shape[:2]

        # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation
        centroids_incl = torch.mean(embeds, dim=1, keepdim=True)
        centroids_incl = centroids_incl.clone() / (torch.norm(centroids_incl, dim=2, keepdim=True) + 1e-5)

        # Exclusive centroids (1 per utterance)
        centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds)
        centroids_excl /= (utterances_per_speaker - 1)
        centroids_excl = centroids_excl.clone() / (torch.norm(centroids_excl, dim=2, keepdim=True) + 1e-5)

        # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot
        # product of these vectors (which is just an element-wise multiplication reduced by a sum).
        # We vectorize the computation for efficiency.
        sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker,
                                 speakers_per_batch).to(self.loss_device)
speakers_per_batch).to(self.loss_device)
|
88 |
+
mask_matrix = 1 - np.eye(speakers_per_batch, dtype=np.int)
|
89 |
+
for j in range(speakers_per_batch):
|
90 |
+
mask = np.where(mask_matrix[j])[0]
|
91 |
+
sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2)
|
92 |
+
sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1)
|
93 |
+
|
94 |
+
## Even more vectorized version (slower maybe because of transpose)
|
95 |
+
# sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker
|
96 |
+
# ).to(self.loss_device)
|
97 |
+
# eye = np.eye(speakers_per_batch, dtype=np.int)
|
98 |
+
# mask = np.where(1 - eye)
|
99 |
+
# sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2)
|
100 |
+
# mask = np.where(eye)
|
101 |
+
# sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2)
|
102 |
+
# sim_matrix2 = sim_matrix2.transpose(1, 2)
|
103 |
+
|
104 |
+
sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias
|
105 |
+
return sim_matrix
|
106 |
+
|
107 |
+
def loss(self, embeds):
|
108 |
+
"""
|
109 |
+
Computes the softmax loss according the section 2.1 of GE2E.
|
110 |
+
|
111 |
+
:param embeds: the embeddings as a tensor of shape (speakers_per_batch,
|
112 |
+
utterances_per_speaker, embedding_size)
|
113 |
+
:return: the loss and the EER for this batch of embeddings.
|
114 |
+
"""
|
115 |
+
speakers_per_batch, utterances_per_speaker = embeds.shape[:2]
|
116 |
+
|
117 |
+
# Loss
|
118 |
+
sim_matrix = self.similarity_matrix(embeds)
|
119 |
+
sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker,
|
120 |
+
speakers_per_batch))
|
121 |
+
ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker)
|
122 |
+
target = torch.from_numpy(ground_truth).long().to(self.loss_device)
|
123 |
+
loss = self.loss_fn(sim_matrix, target)
|
124 |
+
|
125 |
+
# EER (not backpropagated)
|
126 |
+
with torch.no_grad():
|
127 |
+
inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=np.int)[0]
|
128 |
+
labels = np.array([inv_argmax(i) for i in ground_truth])
|
129 |
+
preds = sim_matrix.detach().cpu().numpy()
|
130 |
+
|
131 |
+
# Snippet from https://yangcha.github.io/EER-ROC/
|
132 |
+
fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten())
|
133 |
+
eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
|
134 |
+
|
135 |
+
return loss, eer
|
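For reference, a minimal GE2E training-step sketch built on this model. The hidden and embedding sizes come from encoder/params_model.py (256 in the upstream repo); the batch shape, optimizer, and random input below are illustrative, not this project's training configuration:

import torch
from encoder.model import SpeakerEncoder

device = loss_device = torch.device("cpu")
model = SpeakerEncoder(device, loss_device)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

# A GE2E batch: speakers_per_batch speakers x utterances_per_speaker utterances each,
# every utterance a mel spectrogram of partials_n_frames frames x mel_n_channels channels.
speakers_per_batch, utterances_per_speaker = 4, 5
frames = torch.randn(speakers_per_batch * utterances_per_speaker, 160, 40)

embeds = model(frames)  # (20, embedding_size), rows L2-normalized
embeds = embeds.view(speakers_per_batch, utterances_per_speaker, -1).to(loss_device)
loss, eer = model.loss(embeds)

optimizer.zero_grad()
loss.backward()
model.do_gradient_ops()  # rescales the similarity parameters' gradients, then clips all gradients
optimizer.step()
print(f"loss={loss.item():.3f}  eer={eer:.3f}")

Note that do_gradient_ops() must run between backward() and step(), since it adjusts the gradients in place before the optimizer consumes them.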
rtvc/encoder/params_data.py
ADDED
@@ -0,0 +1,34 @@

## Mel-filterbank
mel_window_length = 25  # In milliseconds
mel_window_step = 10    # In milliseconds
mel_n_channels = 40


## Audio
sampling_rate = 16000
# Number of spectrogram frames in a partial utterance
partials_n_frames = 160     # 1600 ms
# Number of spectrogram frames at inference
inference_n_frames = 80     # 800 ms


## Voice Activity Detection
# Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
# This sets the granularity of the VAD. Should not need to be changed.
vad_window_length = 30  # In milliseconds
# Number of frames to average together when performing the moving average smoothing.
# The larger this value, the larger the VAD variations must be to not get smoothed out.
vad_moving_average_width = 8
# Maximum number of consecutive silent frames a segment can have.
vad_max_silence_length = 6


## Audio volume normalization
audio_norm_target_dBFS = -30

# Cutoff frequency (in Hz) for deciding whether the user's input voice is male or female
split_freq = 170
# Threshold below which embedding values are zeroed out to denoise the embed
set_zero_thres = 0.06
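For reference, a quick illustrative check of how the millisecond-based values above translate into sample and frame counts at sampling_rate = 16000:

sampling_rate = 16000
mel_window_length, mel_window_step = 25, 10   # ms
partials_n_frames, vad_window_length = 160, 30

samples_per_window = sampling_rate * mel_window_length // 1000   # 400 samples per STFT window
samples_per_hop = sampling_rate * mel_window_step // 1000        # 160 samples per hop
partial_duration_ms = partials_n_frames * mel_window_step        # 160 frames -> 1600 ms, as noted above
vad_window_samples = sampling_rate * vad_window_length // 1000   # 480 samples per VAD window
print(samples_per_window, samples_per_hop, partial_duration_ms, vad_window_samples)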