aluminumbox committed on
Commit eb07486 · verified · 1 parent: fc258c9

Upload 194 files

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. .gitattributes +1 -0
  2. README.md +249 -14
  3. app.py +200 -0
  4. cosyvoice/__init__.py +0 -0
  5. cosyvoice/bin/average_model.py +93 -0
  6. cosyvoice/bin/convert.py +223 -0
  7. cosyvoice/bin/export_jit.py +101 -0
  8. cosyvoice/bin/export_onnx.py +114 -0
  9. cosyvoice/bin/inference_deprecated.py +126 -0
  10. cosyvoice/bin/train.py +195 -0
  11. cosyvoice/cli/__init__.py +0 -0
  12. cosyvoice/cli/cosyvoice.py +238 -0
  13. cosyvoice/cli/frontend.py +219 -0
  14. cosyvoice/cli/model.py +430 -0
  15. cosyvoice/dataset/__init__.py +0 -0
  16. cosyvoice/dataset/dataset.py +151 -0
  17. cosyvoice/dataset/processor.py +443 -0
  18. cosyvoice/flow/DiT/dit.py +176 -0
  19. cosyvoice/flow/DiT/modules.py +616 -0
  20. cosyvoice/flow/decoder.py +494 -0
  21. cosyvoice/flow/flow.py +432 -0
  22. cosyvoice/flow/flow_matching.py +228 -0
  23. cosyvoice/flow/length_regulator.py +70 -0
  24. cosyvoice/hifigan/discriminator.py +230 -0
  25. cosyvoice/hifigan/f0_predictor.py +103 -0
  26. cosyvoice/hifigan/generator.py +746 -0
  27. cosyvoice/hifigan/hifigan.py +67 -0
  28. cosyvoice/llm/llm.py +739 -0
  29. cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +0 -0
  30. cosyvoice/tokenizer/tokenizer.py +327 -0
  31. cosyvoice/transformer/__init__.py +0 -0
  32. cosyvoice/transformer/activation.py +84 -0
  33. cosyvoice/transformer/attention.py +330 -0
  34. cosyvoice/transformer/convolution.py +258 -0
  35. cosyvoice/transformer/decoder.py +396 -0
  36. cosyvoice/transformer/decoder_layer.py +132 -0
  37. cosyvoice/transformer/embedding.py +302 -0
  38. cosyvoice/transformer/encoder.py +474 -0
  39. cosyvoice/transformer/encoder_layer.py +236 -0
  40. cosyvoice/transformer/label_smoothing_loss.py +96 -0
  41. cosyvoice/transformer/positionwise_feed_forward.py +115 -0
  42. cosyvoice/transformer/subsampling.py +383 -0
  43. cosyvoice/transformer/upsample_encoder.py +321 -0
  44. cosyvoice/utils/__init__.py +0 -0
  45. cosyvoice/utils/class_utils.py +85 -0
  46. cosyvoice/utils/common.py +213 -0
  47. cosyvoice/utils/executor.py +176 -0
  48. cosyvoice/utils/file_utils.py +118 -0
  49. cosyvoice/utils/frontend_utils.py +136 -0
  50. cosyvoice/utils/losses.py +57 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ zero_shot_prompt.wav filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,14 +1,249 @@
1
- ---
2
- title: Fun CosyVoice3 0.5B
3
- emoji: 🚀
4
- colorFrom: green
5
- colorTo: green
6
- sdk: gradio
7
- sdk_version: 6.1.0
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- short_description: Fun-CosyVoice3-0.5B
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ [![SVG Banners](https://svg-banners.vercel.app/api?type=origin&text1=CosyVoice🤠&text2=Text-to-Speech%20💖%20Large%20Language%20Model&width=800&height=210)](https://github.com/Akshay090/svg-banners)
2
+
3
+ ## 👉🏻 CosyVoice 👈🏻
4
+
5
+ **Fun-CosyVoice 3.0**: [Demos](https://funaudiollm.github.io/cosyvoice3/); [Paper](https://arxiv.org/abs/2505.17589); [Modelscope](https://www.modelscope.cn/studios/FunAudioLLM/Fun-CosyVoice3-0.5B); [CV3-Eval](https://github.com/FunAudioLLM/CV3-Eval)
6
+
7
+ **CosyVoice 2.0**: [Demos](https://funaudiollm.github.io/cosyvoice2/); [Paper](https://arxiv.org/abs/2412.10117); [Modelscope](https://www.modelscope.cn/studios/iic/CosyVoice2-0.5B); [HuggingFace](https://huggingface.co/spaces/FunAudioLLM/CosyVoice2-0.5B)
8
+
9
+ **CosyVoice 1.0**: [Demos](https://fun-audio-llm.github.io); [Paper](https://funaudiollm.github.io/pdf/CosyVoice_v1.pdf); [Modelscope](https://www.modelscope.cn/studios/iic/CosyVoice-300M)
10
+
11
+ ## Highlight🔥
12
+
13
+ **Fun-CosyVoice 3.0** is an advanced text-to-speech (TTS) system based on large language models (LLMs). It surpasses its predecessor, CosyVoice 2.0, in content consistency, speaker similarity, and prosody naturalness, and is designed for zero-shot multilingual speech synthesis in the wild.
14
+ ### Key Features
15
+ - **Language Coverage**: Covers 9 common languages (Chinese, English, Japanese, Korean, German, Spanish, French, Italian, Russian) and 18+ Chinese dialects/accents (Guangdong, Minnan, Sichuan, Dongbei, Shaanxi, Shanxi, Shanghai, Tianjin, Shandong, Ningxia, Gansu, etc.), while also supporting multilingual and cross-lingual zero-shot voice cloning.
16
+ - **Content Consistency & Naturalness**: Achieves state-of-the-art performance in content consistency, speaker similarity, and prosody naturalness.
17
+ - **Pronunciation Inpainting**: Supports pronunciation inpainting with Chinese pinyin and English CMU phonemes, providing finer control and making it suitable for production use (see the example after this list).
18
+ - **Text Normalization**: Supports reading of numbers, special symbols and various text formats without a traditional frontend module.
19
+ - **Bi-Streaming**: Supports both streaming text input and streaming audio output, achieving latency as low as 150 ms while maintaining high-quality audio.
20
+ - **Instruct Support**: Supports instructions controlling language, dialect, emotion, speaking speed, volume, and more.
21
+
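+ A concrete illustration of the bracket notation: the default input text of the demo app (`app.py`) in this commit mixes CMU phonemes for an English word and a pinyin syllable for a polyphonic Chinese character directly inside the text to synthesize. The sketch below simply reuses that string; the surrounding sentence is sample data, not a required format.
+
+ ``` python
+ # Square brackets carry phoneme-level pronunciation hints for pronunciation inpainting:
+ # [M][AY0][N][UW1][T] spells an English word with CMU phonemes, and
+ # [h][ào] fixes the reading of the polyphonic character 好 as "hào" via pinyin.
+ tts_text = "Her handwriting is [M][AY0][N][UW1][T]并且很整洁,说明她[h][ào]干净。"
+ ```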
22
+
23
+ ## Roadmap
24
+
25
+ - [x] 2025/12
26
+
27
+ - [x] release the Fun-CosyVoice3-0.5B-2512 base model, the RL model, and their training/inference scripts
28
+ - [x] release the Fun-CosyVoice3-0.5B ModelScope Gradio space
29
+
30
+ - [x] 2025/08
31
+
32
+ - [x] Thanks to a contribution from Yuekai Zhang (NVIDIA), add Triton TensorRT-LLM runtime support and CosyVoice2 GRPO training support
33
+
34
+ - [x] 2025/07
35
+
36
+ - [x] release Fun-CosyVoice 3.0 eval set
37
+
38
+ - [x] 2025/05
39
+
40
+ - [x] add CosyVoice2-0.5B vllm support
41
+
42
+ - [x] 2024/12
43
+
44
+ - [x] 25hz CosyVoice2-0.5B released
45
+
46
+ - [x] 2024/09
47
+
48
+ - [x] 25hz CosyVoice-300M base model
49
+ - [x] 25hz CosyVoice-300M voice conversion function
50
+
51
+ - [x] 2024/08
52
+
53
+ - [x] Repetition Aware Sampling (RAS) inference for LLM stability
54
+ - [x] Streaming inference mode support, including KV cache and SDPA for RTF optimization
55
+
56
+ - [x] 2024/07
57
+
58
+ - [x] Flow matching training support
59
+ - [x] WeTextProcessing support when ttsfrd is not available
60
+ - [x] FastAPI server and client
61
+
62
+ ## Evaluation
63
+ | Model | CER (%) ↓ (test-zh) | WER (%) ↓ (test-en) | CER (%) ↓ (test-hard) |
64
+ |-----|------------------|------------------|------------------|
65
+ | Human | 1.26 | 2.14 | - |
66
+ | F5-TTS | 1.53 | 2.00 | 8.67 |
67
+ | SparkTTS | 1.20 | 1.98 | - |
68
+ | Seed-TTS | 1.12 | 2.25 | 7.59 |
69
+ | CosyVoice2 | 1.45 | 2.57 | 6.83 |
70
+ | FireRedTTS-2 | 1.14 | 1.95 | - |
71
+ | IndexTTS2 | 1.01 | 1.52 | 7.12 |
72
+ | VibeVoice | 1.16 | 3.04 | - |
73
+ | HiggsAudio | 1.79 | 2.44 | - |
74
+ | MiniMax-Speech | 0.83 | 1.65 | - |
75
+ | VoxPCM | 0.93 | 1.85 | 8.87 |
76
+ | GLM-TTS | 1.03 | - | - |
77
+ | GLM-TTS_RL | 0.89 | - | - |
78
+ | Fun-CosyVoice3-0.5B-2512 | 1.21 | 2.24 | 6.71 |
79
+ | Fun-CosyVoice3-0.5B-2512_RL | 0.81 | 1.68 | 5.44 |
80
+
81
+
82
+ ## Install
83
+
84
+ ### Clone and install
85
+
86
+ - Clone the repo
87
+ ``` sh
88
+ git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git
89
+ # If cloning the submodule fails due to network issues, run the following command until it succeeds
90
+ cd CosyVoice
91
+ git submodule update --init --recursive
92
+ ```
93
+
94
+ - Install Conda: please see https://docs.conda.io/en/latest/miniconda.html
95
+ - Create Conda env:
96
+
97
+ ``` sh
98
+ conda create -n cosyvoice -y python=3.10
99
+ conda activate cosyvoice
100
+ pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com
101
+
102
+ # If you encounter sox compatibility issues
103
+ # ubuntu
104
+ sudo apt-get install sox libsox-dev
105
+ # centos
106
+ sudo yum install sox sox-devel
107
+ ```
108
+
109
+ ### Model download
110
+
111
+ We strongly recommend downloading our pretrained `Fun-CosyVoice3-0.5B`, `CosyVoice2-0.5B`, `CosyVoice-300M`, `CosyVoice-300M-SFT`, and `CosyVoice-300M-Instruct` models, as well as the `CosyVoice-ttsfrd` resource.
112
+
113
+ ``` python
114
+ # Download models via the ModelScope SDK
115
+ from modelscope import snapshot_download
116
+ snapshot_download('FunAudioLLM/Fun-CosyVoice3-0.5B-2512', local_dir='pretrained_models/Fun-CosyVoice3-0.5B')
117
+ snapshot_download('iic/CosyVoice2-0.5B', local_dir='pretrained_models/CosyVoice2-0.5B')
118
+ snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M')
119
+ snapshot_download('iic/CosyVoice-300M-SFT', local_dir='pretrained_models/CosyVoice-300M-SFT')
120
+ snapshot_download('iic/CosyVoice-300M-Instruct', local_dir='pretrained_models/CosyVoice-300M-Instruct')
121
+ snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice-ttsfrd')
122
+ ```
123
+
124
+ Optionally, you can unzip the `ttsfrd` resource and install the `ttsfrd` package for better text normalization performance.
125
+
126
+ Note that this step is not required; if you do not install the `ttsfrd` package, wetext is used by default.
127
+
128
+ ``` sh
129
+ cd pretrained_models/CosyVoice-ttsfrd/
130
+ unzip resource.zip -d .
131
+ pip install ttsfrd_dependency-0.1-py3-none-any.whl
132
+ pip install ttsfrd-0.4.2-cp310-cp310-linux_x86_64.whl
133
+ ```
134
+
135
+ ### Basic Usage
136
+
137
+ We strongly recommend using `Fun-CosyVoice3-0.5B` for better performance.
138
+ Follow the code in `example.py` for detailed usage of each model.
139
+ ```sh
140
+ python example.py
141
+ ```
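+ For a quick feel of the Python API, the sketch below mirrors the calls made by `app.py` in this commit (the `AutoModel` wrapper, the `<|endofprompt|>` prefix on the prompt transcript, the bundled `zero_shot_prompt.wav`, and the 24 kHz output rate). Treat it as an illustrative sketch rather than the authoritative reference; `example.py` remains the source of truth, and the instruct-mode texts below are only examples (the demo app picks its instructions from `cosyvoice.utils.common.instruct_list`).
+
+ ``` python
+ import sys
+ sys.path.append('third_party/Matcha-TTS')  # Matcha-TTS submodule, as done in app.py
+ import torch
+ import torchaudio
+ from cosyvoice.cli.cosyvoice import AutoModel
+
+ cosyvoice = AutoModel(model_dir='pretrained_models/Fun-CosyVoice3-0.5B')
+
+ # Zero-shot cloning: target text, prompt transcript (with system prefix), prompt audio path.
+ chunks = [out['tts_speech'] for out in cosyvoice.inference_zero_shot(
+     '收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐。',
+     'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。',
+     'zero_shot_prompt.wav', stream=False)]
+ torchaudio.save('zero_shot_out.wav', torch.concat(chunks, dim=1), 24000)
+
+ # Instruct mode: the same prompt audio plus a natural-language instruction (illustrative text).
+ chunks = [out['tts_speech'] for out in cosyvoice.inference_instruct2(
+     '今天天气真不错,我们去公园散步吧。', '请用四川话说这句话。', 'zero_shot_prompt.wav', stream=False)]
+ torchaudio.save('instruct_out.wav', torch.concat(chunks, dim=1), 24000)
+ ```
+
+ Setting `stream=True` turns the generators into chunk-by-chunk streams, which is how the bi-streaming latency figure above is achieved.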
142
+
143
+ #### CosyVoice2 vllm Usage
144
+ If you want to use vllm for inference, please install `vllm==v0.9.0`. Older vllm versions do not support CosyVoice2 inference.
145
+
146
+ Note that `vllm==v0.9.0` has many specific requirements, for example `torch==2.7.0`. You can create a new environment in case your hardware does not support vllm, so that your existing environment is not corrupted.
147
+
148
+ ``` sh
149
+ conda create -n cosyvoice_vllm --clone cosyvoice
150
+ conda activate cosyvoice_vllm
151
+ pip install vllm==v0.9.0 transformers==4.51.3 -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com
152
+ python vllm_example.py
153
+ ```
154
+
155
+ #### Start web demo
156
+
157
+ You can use our web demo page to get familiar with CosyVoice quickly.
158
+
159
+ Please see the demo website for details.
160
+
161
+ ``` sh
162
+ # change iic/CosyVoice-300M-SFT for sft inference, or iic/CosyVoice-300M-Instruct for instruct inference
163
+ python3 webui.py --port 50000 --model_dir pretrained_models/CosyVoice-300M
164
+ ```
165
+
166
+ #### Advanced Usage
167
+
168
+ For advanced users, we have provided training and inference scripts in `examples/libritts/cosyvoice/run.sh`.
169
+
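+ A hedged sketch of how such a recipe is typically driven; the stage layout is an assumption based on the WeNet-style convention these recipes follow, so check the script itself for the actual stages and options:
+
+ ``` sh
+ cd examples/libritts/cosyvoice
+ # Stages cover data preparation, feature/token extraction, training and export.
+ # Select which stages to run by editing the stage/stop_stage variables at the
+ # top of run.sh, then launch the recipe:
+ bash run.sh
+ ```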
170
+ #### Build for deployment
171
+
172
+ Optionally, if you want to deploy CosyVoice as a service,
173
+ you can follow the steps below.
174
+
175
+ ``` sh
176
+ cd runtime/python
177
+ docker build -t cosyvoice:v1.0 .
178
+ # change iic/CosyVoice-300M to iic/CosyVoice-300M-Instruct if you want to use instruct inference
179
+ # for grpc usage
180
+ docker run -d --runtime=nvidia -p 50000:50000 cosyvoice:v1.0 /bin/bash -c "cd /opt/CosyVoice/CosyVoice/runtime/python/grpc && python3 server.py --port 50000 --max_conc 4 --model_dir iic/CosyVoice-300M && sleep infinity"
181
+ cd grpc && python3 client.py --port 50000 --mode <sft|zero_shot|cross_lingual|instruct>
182
+ # for fastapi usage
183
+ docker run -d --runtime=nvidia -p 50000:50000 cosyvoice:v1.0 /bin/bash -c "cd /opt/CosyVoice/CosyVoice/runtime/python/fastapi && python3 server.py --port 50000 --model_dir iic/CosyVoice-300M && sleep infinity"
184
+ cd fastapi && python3 client.py --port 50000 --mode <sft|zero_shot|cross_lingual|instruct>
185
+ ```
186
+
187
+ #### Using Nvidia TensorRT-LLM for deployment
188
+
189
+ Using TensorRT-LLM to accelerate the CosyVoice2 LLM can give about a 4x speedup compared with the Hugging Face Transformers implementation.
190
+ To get started quickly:
191
+
192
+ ``` sh
193
+ cd runtime/triton_trtllm
194
+ docker compose up -d
195
+ ```
196
+ For more details, see [runtime/triton_trtllm](https://github.com/FunAudioLLM/CosyVoice/tree/main/runtime/triton_trtllm).
197
+
198
+ ## Discussion & Communication
199
+
200
+ You can discuss directly on [GitHub Issues](https://github.com/FunAudioLLM/CosyVoice/issues).
201
+
202
+ You can also scan the QR code to join our official Dingding chat group.
203
+
204
+ <img src="./asset/dingding.png" width="250px">
205
+
206
+ ## Acknowledgements
207
+
208
+ 1. We borrowed a lot of code from [FunASR](https://github.com/modelscope/FunASR).
209
+ 2. We borrowed a lot of code from [FunCodec](https://github.com/modelscope/FunCodec).
210
+ 3. We borrowed a lot of code from [Matcha-TTS](https://github.com/shivammehta25/Matcha-TTS).
211
+ 4. We borrowed a lot of code from [AcademiCodec](https://github.com/yangdongchao/AcademiCodec).
212
+ 5. We borrowed a lot of code from [WeNet](https://github.com/wenet-e2e/wenet).
213
+
214
+ ## Citations
215
+
216
+ ``` bibtex
217
+ @article{du2024cosyvoice,
218
+ title={Cosyvoice: A scalable multilingual zero-shot text-to-speech synthesizer based on supervised semantic tokens},
219
+ author={Du, Zhihao and Chen, Qian and Zhang, Shiliang and Hu, Kai and Lu, Heng and Yang, Yexin and Hu, Hangrui and Zheng, Siqi and Gu, Yue and Ma, Ziyang and others},
220
+ journal={arXiv preprint arXiv:2407.05407},
221
+ year={2024}
222
+ }
223
+
224
+ @article{du2024cosyvoice,
225
+ title={Cosyvoice 2: Scalable streaming speech synthesis with large language models},
226
+ author={Du, Zhihao and Wang, Yuxuan and Chen, Qian and Shi, Xian and Lv, Xiang and Zhao, Tianyu and Gao, Zhifu and Yang, Yexin and Gao, Changfeng and Wang, Hui and others},
227
+ journal={arXiv preprint arXiv:2412.10117},
228
+ year={2024}
229
+ }
230
+
231
+ @article{du2025cosyvoice,
232
+ title={CosyVoice 3: Towards In-the-wild Speech Generation via Scaling-up and Post-training},
233
+ author={Du, Zhihao and Gao, Changfeng and Wang, Yuxuan and Yu, Fan and Zhao, Tianyu and Wang, Hao and Lv, Xiang and Wang, Hui and Shi, Xian and An, Keyu and others},
234
+ journal={arXiv preprint arXiv:2505.17589},
235
+ year={2025}
236
+ }
237
+
238
+ @inproceedings{lyu2025build,
239
+ title={Build LLM-Based Zero-Shot Streaming TTS System with Cosyvoice},
240
+ author={Lyu, Xiang and Wang, Yuxuan and Zhao, Tianyu and Wang, Hao and Liu, Huadai and Du, Zhihao},
241
+ booktitle={ICASSP 2025-2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
242
+ pages={1--2},
243
+ year={2025},
244
+ organization={IEEE}
245
+ }
246
+ ```
247
+
248
+ ## Disclaimer
249
+ The content provided above is for academic purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal.
app.py ADDED
@@ -0,0 +1,200 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Liu Yue)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import os
15
+ import sys
16
+ import argparse
17
+ import gradio as gr
18
+ import numpy as np
19
+ import torch
20
+ import torchaudio
21
+ import random
22
+ import librosa
23
+ from funasr import AutoModel
24
+ from funasr.utils.postprocess_utils import rich_transcription_postprocess
25
+ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
26
+ sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))
27
+
28
+ from modelscope import snapshot_download, HubApi
29
+
30
+ api = HubApi()
31
+ _, cookies = api.login(access_token=os.environ['token'])
32
+ snapshot_download('FunAudioLLM/Fun-CosyVoice3-0.5B-2512', local_dir='pretrained_models/Fun-CosyVoice3-0.5B', cookies=cookies)
33
+ snapshot_download('iic/SenseVoiceSmall', local_dir='pretrained_models/SenseVoiceSmall', cookies=cookies)
34
+ snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice-ttsfrd', cookies=cookies)
35
+ os.system('cd pretrained_models/CosyVoice-ttsfrd/ && pip install ttsfrd_dependency-0.1-py3-none-any.whl && pip install ttsfrd-0.4.2-cp310-cp310-linux_x86_64.whl && apt install -y unzip && rm -rf resource && unzip resource.zip -d .')
36
+
37
+ from cosyvoice.cli.cosyvoice import AutoModel as CosyVoiceAutoModel
38
+ from cosyvoice.utils.file_utils import logging, load_wav
39
+ from cosyvoice.utils.common import set_all_random_seed, instruct_list
40
+
41
+ inference_mode_list = ['3s极速复刻', '自然语言控制']
42
+ instruct_dict = {'3s极速复刻': '1. 选择prompt音频文件,或录入prompt音频,注意不超过30s,若同时提供,优先选择prompt音频文件\n2. 输入prompt文本\n3. 点击生成音频按钮',
43
+ '自然语言控制': '1. 选择prompt音频文件,或录入prompt音频,注意不超过30s,若同时提供,优先选择prompt音频文件\n2. 输入instruct文本\n3. 点击生成音频按钮'}
44
+ stream_mode_list = [('否', False)]
45
+ max_val = 0.8
46
+
47
+
48
+ def generate_seed():
49
+ seed = random.randint(1, 100000000)
50
+ return {
51
+ "__type__": "update",
52
+ "value": seed
53
+ }
54
+
55
+ top_db = 60
56
+ hop_length = 220
57
+ win_length = 440
58
+ def postprocess(wav):
59
+ speech = load_wav(wav, target_sr=target_sr, min_sr=16000)
60
+ speech, _ = librosa.effects.trim(
61
+ speech, top_db=top_db,
62
+ frame_length=win_length,
63
+ hop_length=hop_length
64
+ )
65
+ if speech.abs().max() > max_val:
66
+ speech = speech / speech.abs().max() * max_val
67
+ speech = torch.concat([speech, torch.zeros(1, int(target_sr * 0.2))], dim=1)
68
+ torchaudio.save(wav, speech, target_sr)
69
+ return wav
70
+
71
+
72
+ def change_instruction(mode_checkbox_group):
73
+ return instruct_dict[mode_checkbox_group]
74
+
75
+ def prompt_wav_recognition(prompt_wav):
76
+ res = asr_model.generate(input=prompt_wav,
77
+ language="auto", # "zh", "en", "yue", "ja", "ko", "nospeech"
78
+ use_itn=True,
79
+ )
80
+ text = res[0]["text"].split('|>')[-1]
81
+ return text
82
+
83
+ def generate_audio(tts_text, mode_checkbox_group, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text,
84
+ seed, stream):
85
+ stream = False
86
+ if len(tts_text) > 200:
87
+ gr.Warning('您输入的文字过长,请限制在200字以内')
88
+ return (target_sr, default_data)
89
+ sft_dropdown, speed = '', 1.0
90
+ if prompt_wav_upload is not None:
91
+ prompt_wav = prompt_wav_upload
92
+ elif prompt_wav_record is not None:
93
+ prompt_wav = prompt_wav_record
94
+ else:
95
+ prompt_wav = None
96
+ # if instruct mode, please make sure that model is iic/CosyVoice-300M-Instruct and not cross_lingual mode
97
+ if mode_checkbox_group in ['自然语言控制']:
98
+ if instruct_text == '':
99
+ gr.Warning('您正在使用自然语言控制模式, 请输入instruct文本')
100
+ return (target_sr, default_data)
101
+ if prompt_wav is None:
102
+ gr.Info('您正在使用自然语言控制模式, 请输入prompt音频')
103
+ return (target_sr, default_data)
104
+ # if in zero_shot cross_lingual, please make sure that prompt_text and prompt_wav meets requirements
105
+ if mode_checkbox_group in ['3s极速复刻', '跨语种复刻']:
106
+ if prompt_wav is None:
107
+ gr.Warning('prompt音频为空,您是否忘记输入prompt音频?')
108
+ return (target_sr, default_data)
109
+ info = torchaudio.info(prompt_wav)
110
+ if info.sample_rate < prompt_sr:
111
+ gr.Warning('prompt音频采样率{}低于{}'.format(torchaudio.info(prompt_wav).sample_rate, prompt_sr))
112
+ return (target_sr, default_data)
113
+ if info.num_frames / info.sample_rate > 10:
114
+ gr.Warning('请限制输入音频在10s内,避免推理效果过低')
115
+ return (target_sr, default_data)
116
+ # zero_shot mode only uses prompt_wav and prompt_text
117
+ if mode_checkbox_group in ['3s极速复刻']:
118
+ if prompt_text == '':
119
+ gr.Warning('prompt文本为空,您是否忘记输入prompt文本?')
120
+ return (target_sr, default_data)
121
+ if instruct_text != '':
122
+ gr.Info('您正在使用3s极速复刻模式,instruct文本会被忽略!')
123
+ info = torchaudio.info(prompt_wav)
124
+ if info.num_frames / info.sample_rate > 10:
125
+ gr.Warning('请限制输入音频在10s内,避免推理效果过低')
126
+ return (target_sr, default_data)
127
+ if mode_checkbox_group == '3s极速复刻':
128
+ logging.info('get zero_shot inference request')
129
+ set_all_random_seed(seed)
130
+ speech_list = []
131
+ for i in cosyvoice.inference_zero_shot(tts_text, 'You are a helpful assistant.<|endofprompt|>' + prompt_text, postprocess(prompt_wav), stream=stream, speed=speed):
132
+ speech_list.append(i['tts_speech'])
133
+ return (target_sr, torch.concat(speech_list, dim=1).numpy().flatten())
134
+ elif mode_checkbox_group == '自然语言控制':
135
+ logging.info('get instruct inference request')
136
+ set_all_random_seed(seed)
137
+ speech_list = []
138
+ for i in cosyvoice.inference_instruct2(tts_text, instruct_text, postprocess(prompt_wav), stream=stream, speed=speed):
139
+ speech_list.append(i['tts_speech'])
140
+ return (target_sr, torch.concat(speech_list, dim=1).numpy().flatten())
141
+ else:
142
+ gr.Warning('无效的模式选择')
143
+
144
+
145
+ def main():
146
+ with gr.Blocks() as demo:
147
+ gr.Markdown("### 代码库 [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) \
148
+ 预训练模型 [Fun-CosyVoice3-0.5B](https://www.modelscope.cn/models/FunAudioLLM/Fun-CosyVoice3-0.5B) \
149
+ [CosyVoice2-0.5B](https://www.modelscope.cn/models/iic/CosyVoice2-0.5B) \
150
+ [CosyVoice-300M](https://www.modelscope.cn/models/iic/CosyVoice-300M) \
151
+ [CosyVoice-300M-Instruct](https://www.modelscope.cn/models/iic/CosyVoice-300M-Instruct) \
152
+ [CosyVoice-300M-SFT](https://www.modelscope.cn/models/iic/CosyVoice-300M-SFT)")
153
+ gr.Markdown("#### 请输入需要合成的文本,选择推理模式,并按照提示步骤进行操作")
154
+
155
+ tts_text = gr.Textbox(label="输入合成文本", lines=1, value="Her handwriting is [M][AY0][N][UW1][T]并且很整洁,说明她[h][ào]干净。")
156
+ with gr.Row():
157
+ mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='选择推理模式', value=inference_mode_list[0])
158
+ instruction_text = gr.Text(label="操作步骤", value=instruct_dict[inference_mode_list[0]], scale=0.5)
159
+ stream = gr.Radio(choices=stream_mode_list, label='是否流式推理', value=stream_mode_list[0][1])
160
+ with gr.Column(scale=0.25):
161
+ seed_button = gr.Button(value="\U0001F3B2")
162
+ seed = gr.Number(value=0, label="随机推理种子")
163
+
164
+ with gr.Row():
165
+ prompt_wav_upload = gr.Audio(sources='upload', type='filepath', label='选择prompt音频文件,注意采样率不低于16khz')
166
+ prompt_wav_record = gr.Audio(sources='microphone', type='filepath', label='录制prompt音频文件')
167
+ prompt_text = gr.Textbox(label="prompt文本", lines=1, placeholder="请输入prompt文本,支持自动识别,您可以自行修正识别结果...", value='')
168
+ instruct_text = gr.Dropdown(choices=instruct_list, label='选择instruct文本', value=instruct_list[0])
169
+
170
+ generate_button = gr.Button("生成音频")
171
+
172
+ audio_output = gr.Audio(label="合成音频", autoplay=True, streaming=False)
173
+
174
+ seed_button.click(generate_seed, inputs=[], outputs=seed)
175
+ generate_button.click(generate_audio,
176
+ inputs=[tts_text, mode_checkbox_group, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text,
177
+ seed, stream],
178
+ outputs=[audio_output])
179
+ mode_checkbox_group.change(fn=change_instruction, inputs=[mode_checkbox_group], outputs=[instruction_text])
180
+ prompt_wav_upload.change(fn=prompt_wav_recognition, inputs=[prompt_wav_upload], outputs=[prompt_text])
181
+ prompt_wav_record.change(fn=prompt_wav_recognition, inputs=[prompt_wav_record], outputs=[prompt_text])
182
+ demo.queue(default_concurrency_limit=4).launch(server_port=50000, server_name='0.0.0.0')
183
+
184
+
185
+ if __name__ == '__main__':
186
+ cosyvoice = CosyVoiceAutoModel(model_dir='pretrained_models/Fun-CosyVoice3-0.5B', load_trt=True, fp16=False)
187
+ sft_spk = cosyvoice.list_available_spks()
188
+ for stream in [False]:
189
+ for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。', 'zero_shot_prompt.wav', stream=stream)):
190
+ continue
191
+ prompt_sr, target_sr = 16000, 24000
192
+ default_data = np.zeros(target_sr)
193
+
194
+ model_dir = "pretrained_models/SenseVoiceSmall"
195
+ asr_model = AutoModel(
196
+ model=model_dir,
197
+ disable_update=True,
198
+ log_level='DEBUG',
199
+ device="cuda:0")
200
+ main()
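A note on running this Space script locally: `app.py` downloads its models from ModelScope using an access token read from the `token` environment variable (see the `HubApi().login` call above) and serves the Gradio UI on port 50000. A minimal launch sketch with a placeholder token:

``` sh
export token=<your_modelscope_access_token>  # consumed via os.environ['token'] in app.py
python app.py                                # Gradio UI listens on 0.0.0.0:50000
```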
cosyvoice/__init__.py ADDED
File without changes
cosyvoice/bin/average_model.py ADDED
@@ -0,0 +1,93 @@
1
+ # Copyright (c) 2020 Mobvoi Inc (Di Wu)
2
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import os
17
+ import argparse
18
+ import glob
19
+
20
+ import yaml
21
+ import torch
22
+
23
+
24
+ def get_args():
25
+ parser = argparse.ArgumentParser(description='average model')
26
+ parser.add_argument('--dst_model', required=True, help='averaged model')
27
+ parser.add_argument('--src_path',
28
+ required=True,
29
+ help='src model path for average')
30
+ parser.add_argument('--val_best',
31
+ action="store_true",
32
+ help='average the checkpoints with the best (lowest) validation loss')
33
+ parser.add_argument('--num',
34
+ default=5,
35
+ type=int,
36
+ help='number of checkpoints to average')
37
+
38
+ args = parser.parse_args()
39
+ print(args)
40
+ return args
41
+
42
+
43
+ def main():
44
+ args = get_args()
45
+ val_scores = []
46
+ if args.val_best:
47
+ yamls = glob.glob('{}/*.yaml'.format(args.src_path))
48
+ yamls = [
49
+ f for f in yamls
50
+ if not (os.path.basename(f).startswith('train')
51
+ or os.path.basename(f).startswith('init'))
52
+ ]
53
+ for y in yamls:
54
+ with open(y, 'r') as f:
55
+ dic_yaml = yaml.load(f, Loader=yaml.BaseLoader)
56
+ loss = float(dic_yaml['loss_dict']['loss'])
57
+ epoch = int(dic_yaml['epoch'])
58
+ step = int(dic_yaml['step'])
59
+ tag = dic_yaml['tag']
60
+ val_scores += [[epoch, step, loss, tag]]
61
+ sorted_val_scores = sorted(val_scores,
62
+ key=lambda x: x[2],
63
+ reverse=False)
64
+ print("best val (epoch, step, loss, tag) = " +
65
+ str(sorted_val_scores[:args.num]))
66
+ path_list = [
67
+ args.src_path + '/epoch_{}_whole.pt'.format(score[0])
68
+ for score in sorted_val_scores[:args.num]
69
+ ]
70
+ print(path_list)
71
+ avg = {}
72
+ num = args.num
73
+ assert num == len(path_list)
74
+ for path in path_list:
75
+ print('Processing {}'.format(path))
76
+ states = torch.load(path, map_location=torch.device('cpu'))
77
+ for k in states.keys():
78
+ if k not in ['step', 'epoch']:
79
+ if k not in avg.keys():
80
+ avg[k] = states[k].clone()
81
+ else:
82
+ avg[k] += states[k]
83
+ # average
84
+ for k in avg.keys():
85
+ if avg[k] is not None:
86
+ # pytorch 1.6 use true_divide instead of /=
87
+ avg[k] = torch.true_divide(avg[k], num)
88
+ print('Saving to {}'.format(args.dst_model))
89
+ torch.save(avg, args.dst_model)
90
+
91
+
92
+ if __name__ == '__main__':
93
+ main()
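`average_model.py` ranks checkpoints by the validation loss recorded in their per-checkpoint YAML files and averages the best N sets of weights. A hedged invocation sketch; the `exp/llm` paths are placeholders:

``` sh
# --src_path must contain epoch_*_whole.pt checkpoints plus their *.yaml metadata
# (loss/epoch/step/tag); --val_best picks the 5 checkpoints with the lowest loss.
python cosyvoice/bin/average_model.py \
    --src_path exp/llm \
    --dst_model exp/llm/llm.average5.pt \
    --val_best --num 5
```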
cosyvoice/bin/convert.py ADDED
@@ -0,0 +1,223 @@
1
+ import sys
2
+ import torch
3
+
4
+ def convert_llm(state_dict):
5
+ # The LM structure was reorganized: codec_lm.encoder becomes llm, codec_lm.decoder becomes llm_decoder
6
+ keys = list(state_dict.keys())
7
+ for k in keys:
8
+ if k.startswith('codec_lm.encoder.'):
9
+ v = state_dict.pop(k)
10
+ k = k.replace('codec_lm.encoder.', 'llm.')
11
+ state_dict[k] = v
12
+ if k.startswith('codec_lm.decoder.'):
13
+ v = state_dict.pop(k)
14
+ k = k.replace('codec_lm.decoder.', 'llm_decoder.')
15
+ state_dict[k] = v
16
+ # Naming differences between the ESPnet and WeNet implementations
17
+ keys = list(state_dict.keys())
18
+ for k in keys:
19
+ if k.startswith('text_encoder.embed.'):
20
+ v = state_dict.pop(k)
21
+ k = k.replace('text_encoder.embed.', 'text_encoder.embed.out.')
22
+ state_dict[k] = v
23
+ if k.startswith('llm.embed.'):
24
+ v = state_dict.pop(k)
25
+ k = k.replace('llm.embed.', 'llm.embed.out.')
26
+ state_dict[k] = v
27
+ keys = list(state_dict.keys())
28
+ for k in keys:
29
+ if k.startswith('text_enc_out_layer.'):
30
+ v = state_dict.pop(k)
31
+ k = k.replace('text_enc_out_layer.', 'text_encoder_affine_layer.')
32
+ state_dict[k] = v
33
+ if k.startswith('token_embedding.'):
34
+ v = state_dict.pop(k)
35
+ k = k.replace('token_embedding.', 'text_embedding.')
36
+ state_dict[k] = v
37
+ if k.startswith('xvec_proj.'):
38
+ v = state_dict.pop(k)
39
+ k = k.replace('xvec_proj.', 'spk_embed_affine_layer.')
40
+ state_dict[k] = v
41
+ if k.startswith('lm_embedding.'):
42
+ v = state_dict.pop(k)
43
+ k = k.replace('lm_embedding.', 'llm_embedding.')
44
+ state_dict[k] = v
45
+ if k.startswith('codec_embedder.'):
46
+ v = state_dict.pop(k)
47
+ k = k.replace('codec_embedder.', 'speech_embedding.')
48
+ state_dict[k] = v
49
+ # The instruct model lacks the spk embedding parameters; add all-zero ones
50
+ keys = list(state_dict.keys())
51
+ if 'spk_embed_affine_layer.weight' not in keys:
52
+ print('no spk_embed_affine_layer.weight, should be instruct model')
53
+ state_dict['spk_embed_affine_layer.weight'] = torch.zeros(1024, 192)
54
+ if 'spk_embed_affine_layer.bias' not in keys:
55
+ print('no spk_embed_affine_layer.bias, should be instruct model')
56
+ state_dict['spk_embed_affine_layer.bias'] = torch.zeros(1024)
57
+ return state_dict
58
+
59
+ def convert_hift(state_dict):
60
+ # The hifigan structure in cosyvoice was reorganized: f0_predictor is moved into the generator
61
+ state_dict = {k: v for k, v in state_dict.items() if not k.startswith('discriminator.')}
62
+ keys = list(state_dict.keys())
63
+ for k in keys:
64
+ if k in ['step', 'epoch']:
65
+ del state_dict[k]
66
+ if k.startswith('decoder.'):
67
+ v = state_dict.pop(k)
68
+ k = k.replace('decoder.', '')
69
+ state_dict[k] = v
70
+ if k.startswith('generator.'):
71
+ v = state_dict.pop(k)
72
+ k = k.replace('generator.', '')
73
+ state_dict[k] = v
74
+ return state_dict
75
+
76
+ def convert_flow(state_dict):
77
+ keys = list(state_dict.keys())
78
+ for k in keys:
79
+ if k.startswith('encoder.embed.'):
80
+ v = state_dict.pop(k)
81
+ k = k.replace('encoder.embed.', 'encoder.embed.out.')
82
+ state_dict[k] = v
83
+ for k in keys:
84
+ if k.startswith('xvec_proj.'):
85
+ v = state_dict.pop(k)
86
+ k = k.replace('xvec_proj.', 'spk_embed_affine_layer.')
87
+ state_dict[k] = v
88
+ return state_dict
89
+
90
+ def convert_llm2(state_dict):
91
+ # The LM structure was reorganized: codec_lm.encoder becomes llm, codec_lm.decoder becomes llm_decoder
92
+ keys = list(state_dict.keys())
93
+ for k in keys:
94
+ if k.startswith('codec_lm.encoder.'):
95
+ v = state_dict.pop(k)
96
+ k = k.replace('codec_lm.encoder.', 'llm.')
97
+ state_dict[k] = v
98
+ if k.startswith('codec_lm.decoder.'):
99
+ v = state_dict.pop(k)
100
+ k = k.replace('codec_lm.decoder.', 'llm_decoder.')
101
+ state_dict[k] = v
102
+ if k.startswith('lm_embedding.'):
103
+ v = state_dict.pop(k)
104
+ k = k.replace('lm_embedding.', 'llm_embedding.')
105
+ state_dict[k] = v
106
+ if k.startswith('codec_embedder.'):
107
+ v = state_dict.pop(k)
108
+ k = k.replace('codec_embedder.', 'speech_embedding.')
109
+ state_dict[k] = v
110
+ if k.startswith('text_enc_out_layer.'):
111
+ state_dict.pop(k)
112
+ if k.startswith('token_embedding.weight'):
113
+ state_dict.pop(k)
114
+ return state_dict
115
+
116
+ def convert_llm3(state_dict):
117
+ # The LM structure was reorganized: codec_lm.encoder becomes llm, codec_lm.decoder becomes llm_decoder
118
+ keys = list(state_dict.keys())
119
+ state_dict = {k: v for k, v in state_dict.items() if (not k.startswith('reward') and not k.startswith('ref'))}
120
+ for k in keys:
121
+ if k.startswith('llm.model.'):
122
+ v = state_dict.pop(k)
123
+ k = k.replace('llm.model.', 'llm.model.model.')
124
+ state_dict[k] = v
125
+ if k.startswith('codec_head.'):
126
+ v = state_dict.pop(k)
127
+ state_dict[k.replace('codec_head.', 'llm_decoder.')] = v
128
+ if k.startswith('codec_embed.'):
129
+ v = state_dict.pop(k)
130
+ k = k.replace('codec_embed.', 'speech_embedding.')
131
+ state_dict[k] = v
132
+ state_dict['llm.model.lm_head.weight'] = state_dict['llm.model.model.embed_tokens.weight']
133
+ return state_dict
134
+
135
+ def convert_flow2(state_dict):
136
+ keys = list(state_dict.keys())
137
+ for k in keys:
138
+ if k.startswith('encoder.embed.'):
139
+ v = state_dict.pop(k)
140
+ k = k.replace('encoder.embed.', 'encoder.embed.out.')
141
+ state_dict[k] = v
142
+ for k in keys:
143
+ if k.startswith('xvec_proj.'):
144
+ v = state_dict.pop(k)
145
+ k = k.replace('xvec_proj.', 'spk_embed_affine_layer.')
146
+ state_dict[k] = v
147
+ for k in keys:
148
+ if k.startswith('mel_extractor.'):
149
+ state_dict.pop(k)
150
+ for k in keys:
151
+ if k.startswith('encoder.upsample_blocks.0.0.'):
152
+ v = state_dict.pop(k)
153
+ k = k.replace('encoder.upsample_blocks.0.0.', 'encoder.up_layer.')
154
+ state_dict[k] = v
155
+ if k.startswith('encoder.upsample_blocks.0.1.'):
156
+ v = state_dict.pop(k)
157
+ k = k.replace('encoder.upsample_blocks.0.1.', 'encoder.up_embed.out.')
158
+ state_dict[k] = v
159
+ if k.startswith('encoder.upsample_blocks.0.2.'):
160
+ v = state_dict.pop(k)
161
+ k = k.replace('encoder.upsample_blocks.0.2.', 'encoder.up_encoders.')
162
+ state_dict[k] = v
163
+ # In CausalBlock1D the index inside the sequential block changes from 1 to 2
164
+ if k.startswith('decoder.estimator.') and k.endswith('block.1.weight'):
165
+ v = state_dict.pop(k)
166
+ k = k.replace('block.1.weight', 'block.2.weight')
167
+ state_dict[k] = v
168
+ if k.startswith('decoder.estimator.') and k.endswith('block.1.bias'):
169
+ v = state_dict.pop(k)
170
+ k = k.replace('block.1.bias', 'block.2.bias')
171
+ state_dict[k] = v
172
+ return state_dict
173
+
174
+ def convert_flow3(state_dict):
175
+ keys = list(state_dict.keys())
176
+ for k in keys:
177
+ if k.startswith('xvec_proj.'):
178
+ v = state_dict.pop(k)
179
+ k = k.replace('xvec_proj.', 'spk_embed_affine_layer.')
180
+ state_dict[k] = v
181
+ if k.startswith('codec_embedder.'):
182
+ v = state_dict.pop(k)
183
+ k = k.replace('codec_embedder.', 'input_embedding.')
184
+ state_dict[k] = v
185
+ if k.startswith('lookahead_conv1d.'):
186
+ v = state_dict.pop(k)
187
+ k = k.replace('lookahead_conv1d.', 'pre_lookahead_layer.')
188
+ state_dict[k] = v
189
+ for k in keys:
190
+ if k.startswith('mel_extractor.'):
191
+ state_dict.pop(k)
192
+ for k in keys:
193
+ # In CausalBlock1D the index inside the sequential block changes from 1 to 2
194
+ if k.startswith('dit_model.'):
195
+ v = state_dict.pop(k)
196
+ k = k.replace('dit_model.', 'decoder.estimator.')
197
+ state_dict[k] = v
198
+ if k in ['epoch', 'step']:
199
+ state_dict.pop(k)
200
+ return state_dict
201
+
202
+ if __name__ == '__main__':
203
+ # Usage: python3 convert.py <original-format llm.pt> llm <new-format llm.pt>
204
+ state_dict = torch.load(sys.argv[1], map_location='cpu')
205
+ if 'state_dict' in state_dict:
206
+ state_dict = state_dict['state_dict']
207
+ if sys.argv[2] == 'llm':
208
+ state_dict = convert_llm(state_dict)
209
+ elif sys.argv[2] == 'flow':
210
+ state_dict = convert_flow(state_dict)
211
+ elif sys.argv[2] == 'hift':
212
+ state_dict = convert_hift(state_dict)
213
+ elif sys.argv[2] == 'llm2':
214
+ state_dict = convert_llm2(state_dict)
215
+ elif sys.argv[2] == 'llm3':
216
+ state_dict = convert_llm3(state_dict)
217
+ elif sys.argv[2] == 'flow2':
218
+ state_dict = convert_flow2(state_dict)
219
+ elif sys.argv[2] == 'flow3':
220
+ state_dict = convert_flow3(state_dict)
221
+ else:
222
+ raise ValueError
223
+ torch.save(state_dict, sys.argv[3])
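`convert.py` takes three positional arguments: the original checkpoint, a mode selecting the key-renaming scheme (`llm`, `flow`, `hift`, `llm2`, `llm3`, `flow2`, or `flow3`), and the output path. A usage sketch with placeholder filenames:

``` sh
# Convert an original-format LLM checkpoint into the layout expected by cosyvoice.cli
python3 cosyvoice/bin/convert.py original_llm.pt llm llm.pt
```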
cosyvoice/bin/export_jit.py ADDED
@@ -0,0 +1,101 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import print_function
16
+
17
+ import argparse
18
+ import logging
19
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
20
+ import os
21
+ import sys
22
+ import torch
23
+ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
24
+ sys.path.append('{}/../..'.format(ROOT_DIR))
25
+ sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
26
+ from cosyvoice.cli.cosyvoice import AutoModel
27
+ from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model, CosyVoice3Model
28
+ from cosyvoice.utils.file_utils import logging
29
+ from cosyvoice.utils.class_utils import get_model_type
30
+
31
+
32
+ def get_args():
33
+ parser = argparse.ArgumentParser(description='export your model for deployment')
34
+ parser.add_argument('--model_dir',
35
+ type=str,
36
+ default='pretrained_models/CosyVoice-300M',
37
+ help='local path')
38
+ args = parser.parse_args()
39
+ print(args)
40
+ return args
41
+
42
+
43
+ def get_optimized_script(model, preserved_attrs=[]):
44
+ script = torch.jit.script(model)
45
+ if preserved_attrs != []:
46
+ script = torch.jit.freeze(script, preserved_attrs=preserved_attrs)
47
+ else:
48
+ script = torch.jit.freeze(script)
49
+ script = torch.jit.optimize_for_inference(script)
50
+ return script
51
+
52
+
53
+ def main():
54
+ args = get_args()
55
+ logging.basicConfig(level=logging.DEBUG,
56
+ format='%(asctime)s %(levelname)s %(message)s')
57
+
58
+ torch._C._jit_set_fusion_strategy([('STATIC', 1)])
59
+ torch._C._jit_set_profiling_mode(False)
60
+ torch._C._jit_set_profiling_executor(False)
61
+
62
+ model = AutoModel(model_dir=args.model_dir)
63
+
64
+ if get_model_type(model.model) == CosyVoiceModel:
65
+ # 1. export flow encoder
66
+ flow_encoder = model.model.flow.encoder
67
+ script = get_optimized_script(flow_encoder)
68
+ script.save('{}/flow.encoder.fp32.zip'.format(args.model_dir))
69
+ script = get_optimized_script(flow_encoder.half())
70
+ script.save('{}/flow.encoder.fp16.zip'.format(args.model_dir))
71
+ logging.info('successfully export flow_encoder')
72
+ elif get_model_type(model.model) == CosyVoice2Model:
73
+ # 1. export llm text_encoder
74
+ llm_text_encoder = model.model.llm.text_encoder
75
+ script = get_optimized_script(llm_text_encoder)
76
+ script.save('{}/llm.text_encoder.fp32.zip'.format(args.model_dir))
77
+ script = get_optimized_script(llm_text_encoder.half())
78
+ script.save('{}/llm.text_encoder.fp16.zip'.format(args.model_dir))
79
+ logging.info('successfully export llm_text_encoder')
80
+
81
+ # 2. export llm llm
82
+ llm_llm = model.model.llm.llm
83
+ script = get_optimized_script(llm_llm, ['forward_chunk'])
84
+ script.save('{}/llm.llm.fp32.zip'.format(args.model_dir))
85
+ script = get_optimized_script(llm_llm.half(), ['forward_chunk'])
86
+ script.save('{}/llm.llm.fp16.zip'.format(args.model_dir))
87
+ logging.info('successfully export llm_llm')
88
+
89
+ # 3. export flow encoder
90
+ flow_encoder = model.model.flow.encoder
91
+ script = get_optimized_script(flow_encoder)
92
+ script.save('{}/flow.encoder.fp32.zip'.format(args.model_dir))
93
+ script = get_optimized_script(flow_encoder.half())
94
+ script.save('{}/flow.encoder.fp16.zip'.format(args.model_dir))
95
+ logging.info('successfully export flow_encoder')
96
+ else:
97
+ raise ValueError('unsupported model type')
98
+
99
+
100
+ if __name__ == '__main__':
101
+ main()
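A usage sketch for the TorchScript export; the exported `*.zip` files are written back into the given model directory (the directory below is just one of the pretrained models mentioned in the README):

``` sh
python cosyvoice/bin/export_jit.py --model_dir pretrained_models/CosyVoice2-0.5B
```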
cosyvoice/bin/export_onnx.py ADDED
@@ -0,0 +1,114 @@
1
+ # Copyright (c) 2024 Antgroup Inc (authors: Zhoubofan, [email protected])
2
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from __future__ import print_function
17
+
18
+ import argparse
19
+ import logging
20
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
21
+ import os
22
+ import sys
23
+ import onnxruntime
24
+ import random
25
+ import torch
26
+ from tqdm import tqdm
27
+ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
28
+ sys.path.append('{}/../..'.format(ROOT_DIR))
29
+ sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
30
+ from cosyvoice.cli.cosyvoice import AutoModel
31
+ from cosyvoice.utils.file_utils import logging
32
+
33
+
34
+ def get_dummy_input(batch_size, seq_len, out_channels, device):
35
+ x = torch.rand((batch_size, out_channels, seq_len), dtype=torch.float32, device=device)
36
+ mask = torch.ones((batch_size, 1, seq_len), dtype=torch.float32, device=device)
37
+ mu = torch.rand((batch_size, out_channels, seq_len), dtype=torch.float32, device=device)
38
+ t = torch.rand((batch_size), dtype=torch.float32, device=device)
39
+ spks = torch.rand((batch_size, out_channels), dtype=torch.float32, device=device)
40
+ cond = torch.rand((batch_size, out_channels, seq_len), dtype=torch.float32, device=device)
41
+ return x, mask, mu, t, spks, cond
42
+
43
+
44
+ def get_args():
45
+ parser = argparse.ArgumentParser(description='export your model for deployment')
46
+ parser.add_argument('--model_dir',
47
+ type=str,
48
+ default='pretrained_models/CosyVoice-300M',
49
+ help='local path')
50
+ args = parser.parse_args()
51
+ print(args)
52
+ return args
53
+
54
+
55
+ @torch.no_grad()
56
+ def main():
57
+ args = get_args()
58
+ logging.basicConfig(level=logging.DEBUG,
59
+ format='%(asctime)s %(levelname)s %(message)s')
60
+
61
+ model = AutoModel(model_dir=args.model_dir)
62
+
63
+ # 1. export flow decoder estimator
64
+ estimator = model.model.flow.decoder.estimator
65
+ estimator.eval()
66
+
67
+ device = model.model.device
68
+ batch_size, seq_len = 2, 256
69
+ out_channels = model.model.flow.decoder.estimator.out_channels
70
+ x, mask, mu, t, spks, cond = get_dummy_input(batch_size, seq_len, out_channels, device)
71
+ torch.onnx.export(
72
+ estimator,
73
+ (x, mask, mu, t, spks, cond),
74
+ '{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
75
+ export_params=True,
76
+ opset_version=18,
77
+ do_constant_folding=True,
78
+ input_names=['x', 'mask', 'mu', 't', 'spks', 'cond'],
79
+ output_names=['estimator_out'],
80
+ dynamic_axes={
81
+ 'x': {2: 'seq_len'},
82
+ 'mask': {2: 'seq_len'},
83
+ 'mu': {2: 'seq_len'},
84
+ 'cond': {2: 'seq_len'},
85
+ 'estimator_out': {2: 'seq_len'},
86
+ }
87
+ )
88
+
89
+ # 2. test computation consistency
90
+ option = onnxruntime.SessionOptions()
91
+ option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
92
+ option.intra_op_num_threads = 1
93
+ providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
94
+ estimator_onnx = onnxruntime.InferenceSession('{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
95
+ sess_options=option, providers=providers)
96
+
97
+ for _ in tqdm(range(10)):
98
+ x, mask, mu, t, spks, cond = get_dummy_input(batch_size, random.randint(16, 512), out_channels, device)
99
+ output_pytorch = estimator(x, mask, mu, t, spks, cond)
100
+ ort_inputs = {
101
+ 'x': x.cpu().numpy(),
102
+ 'mask': mask.cpu().numpy(),
103
+ 'mu': mu.cpu().numpy(),
104
+ 't': t.cpu().numpy(),
105
+ 'spks': spks.cpu().numpy(),
106
+ 'cond': cond.cpu().numpy()
107
+ }
108
+ output_onnx = estimator_onnx.run(None, ort_inputs)[0]
109
+ torch.testing.assert_allclose(output_pytorch, torch.from_numpy(output_onnx).to(device), rtol=1e-2, atol=1e-4)
110
+ logging.info('successfully export estimator')
111
+
112
+
113
+ if __name__ == "__main__":
114
+ main()
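Likewise for the ONNX export of the flow decoder estimator, which also runs a quick PyTorch-vs-ONNX Runtime consistency check on random inputs:

``` sh
python cosyvoice/bin/export_onnx.py --model_dir pretrained_models/CosyVoice2-0.5B
```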
cosyvoice/bin/inference_deprecated.py ADDED
@@ -0,0 +1,126 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import print_function
16
+
17
+ import argparse
18
+ import logging
19
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
20
+ import os
21
+ import torch
22
+ from torch.utils.data import DataLoader
23
+ import torchaudio
24
+ from hyperpyyaml import load_hyperpyyaml
25
+ from tqdm import tqdm
26
+ from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model
27
+ from cosyvoice.dataset.dataset import Dataset
28
+
29
+
30
+ def get_args():
31
+ parser = argparse.ArgumentParser(description='inference with your model')
32
+ parser.add_argument('--config', required=True, help='config file')
33
+ parser.add_argument('--prompt_data', required=True, help='prompt data file')
34
+ parser.add_argument('--prompt_utt2data', required=True, help='prompt data file')
35
+ parser.add_argument('--tts_text', required=True, help='tts input file')
36
+ parser.add_argument('--qwen_pretrain_path', required=False, help='qwen pretrain path')
37
+ parser.add_argument('--llm_model', required=True, help='llm model file')
38
+ parser.add_argument('--flow_model', required=True, help='flow model file')
39
+ parser.add_argument('--hifigan_model', required=True, help='hifigan model file')
40
+ parser.add_argument('--gpu',
41
+ type=int,
42
+ default=-1,
43
+ help='gpu id for this rank, -1 for cpu')
44
+ parser.add_argument('--mode',
45
+ default='sft',
46
+ choices=['sft', 'zero_shot'],
47
+ help='inference mode')
48
+ parser.add_argument('--result_dir', required=True, help='asr result file')
49
+ args = parser.parse_args()
50
+ print(args)
51
+ return args
52
+
53
+
54
+ def main():
55
+ args = get_args()
56
+ logging.basicConfig(level=logging.DEBUG,
57
+ format='%(asctime)s %(levelname)s %(message)s')
58
+ os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
59
+
60
+ # Init cosyvoice models from configs
61
+ use_cuda = args.gpu >= 0 and torch.cuda.is_available()
62
+ device = torch.device('cuda' if use_cuda else 'cpu')
63
+ try:
64
+ with open(args.config, 'r') as f:
65
+ configs = load_hyperpyyaml(f, overrides={'qwen_pretrain_path': args.qwen_pretrain_path})
66
+ model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'])
67
+ except Exception:
68
+ try:
69
+ with open(args.config, 'r') as f:
70
+ configs = load_hyperpyyaml(f)
71
+ model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
72
+ except Exception:
73
+ raise TypeError('no valid model_type!')
74
+
75
+ model.load(args.llm_model, args.flow_model, args.hifigan_model)
76
+
77
+ test_dataset = Dataset(args.prompt_data, data_pipeline=configs['data_pipeline'], mode='inference', shuffle=False, partition=False,
78
+ tts_file=args.tts_text, prompt_utt2data=args.prompt_utt2data)
79
+ test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0)
80
+
81
+ sample_rate = configs['sample_rate']
82
+ del configs
83
+ os.makedirs(args.result_dir, exist_ok=True)
84
+ fn = os.path.join(args.result_dir, 'wav.scp')
85
+ f = open(fn, 'w')
86
+ with torch.no_grad():
87
+ for _, batch in tqdm(enumerate(test_data_loader)):
88
+ utts = batch["utts"]
89
+ assert len(utts) == 1, "inference mode only support batchsize 1"
90
+ text_token = batch["text_token"].to(device)
91
+ text_token_len = batch["text_token_len"].to(device)
92
+ tts_index = batch["tts_index"]
93
+ tts_text_token = batch["tts_text_token"].to(device)
94
+ tts_text_token_len = batch["tts_text_token_len"].to(device)
95
+ speech_token = batch["speech_token"].to(device)
96
+ speech_token_len = batch["speech_token_len"].to(device)
97
+ speech_feat = batch["speech_feat"].to(device)
98
+ speech_feat_len = batch["speech_feat_len"].to(device)
99
+ utt_embedding = batch["utt_embedding"].to(device)
100
+ spk_embedding = batch["spk_embedding"].to(device)
101
+ if args.mode == 'sft':
102
+ model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
103
+ 'llm_embedding': spk_embedding, 'flow_embedding': spk_embedding}
104
+ else:
105
+ model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
106
+ 'prompt_text': text_token, 'prompt_text_len': text_token_len,
107
+ 'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
108
+ 'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
109
+ 'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
110
+ 'llm_embedding': utt_embedding, 'flow_embedding': utt_embedding}
111
+ tts_speeches = []
112
+ for model_output in model.tts(**model_input):
113
+ tts_speeches.append(model_output['tts_speech'])
114
+ tts_speeches = torch.concat(tts_speeches, dim=1)
115
+ tts_key = '{}_{}'.format(utts[0], tts_index[0])
116
+ tts_fn = os.path.join(args.result_dir, '{}.wav'.format(tts_key))
117
+ torchaudio.save(tts_fn, tts_speeches, sample_rate=sample_rate, backend='soundfile')
118
+ f.write('{} {}\n'.format(tts_key, tts_fn))
119
+ f.flush()
120
+ f.close()
121
+ logging.info('Result wav.scp saved in {}'.format(fn))
122
+
123
+
124
+ if __name__ == '__main__':
125
+ logging.warning('this code has been deprecated, please refer to README for CosyVoice inference usage!')
126
+ main()
cosyvoice/bin/train.py ADDED
@@ -0,0 +1,195 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import print_function
16
+ import argparse
17
+ import datetime
18
+ import logging
19
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
20
+ from copy import deepcopy
21
+ import os
22
+ import torch
23
+ import torch.distributed as dist
24
+ import deepspeed
25
+
26
+ from hyperpyyaml import load_hyperpyyaml
27
+
28
+ from torch.distributed.elastic.multiprocessing.errors import record
29
+
30
+ from cosyvoice.utils.losses import DPOLoss
31
+ from cosyvoice.utils.executor import Executor
32
+ from cosyvoice.utils.train_utils import (
33
+ init_distributed,
34
+ init_dataset_and_dataloader,
35
+ init_optimizer_and_scheduler,
36
+ init_summarywriter, save_model,
37
+ wrap_cuda_model, check_modify_and_save_config)
38
+
39
+
40
+ def get_args():
41
+ parser = argparse.ArgumentParser(description='training your network')
42
+ parser.add_argument('--train_engine',
43
+ default='torch_ddp',
44
+ choices=['torch_ddp', 'deepspeed'],
45
+ help='Engine for paralleled training')
46
+ parser.add_argument('--model', required=True, help='model which will be trained')
47
+ parser.add_argument('--ref_model', required=False, help='ref model used in dpo')
48
+ parser.add_argument('--config', required=True, help='config file')
49
+ parser.add_argument('--train_data', required=True, help='train data file')
50
+ parser.add_argument('--cv_data', required=True, help='cv data file')
51
+ parser.add_argument('--qwen_pretrain_path', required=False, help='qwen pretrain path')
52
+ parser.add_argument('--checkpoint', help='checkpoint model')
53
+ parser.add_argument('--model_dir', required=True, help='save model dir')
54
+ parser.add_argument('--tensorboard_dir',
55
+ default='tensorboard',
56
+ help='tensorboard log dir')
57
+ parser.add_argument('--ddp.dist_backend',
58
+ dest='dist_backend',
59
+ default='nccl',
60
+ choices=['nccl', 'gloo'],
61
+ help='distributed backend')
62
+ parser.add_argument('--num_workers',
63
+ default=0,
64
+ type=int,
65
+ help='num of subprocess workers for reading')
66
+ parser.add_argument('--prefetch',
67
+ default=100,
68
+ type=int,
69
+ help='prefetch number')
70
+ parser.add_argument('--pin_memory',
71
+ action='store_true',
72
+ default=False,
73
+ help='Use pinned memory buffers used for reading')
74
+ parser.add_argument('--use_amp',
75
+ action='store_true',
76
+ default=False,
77
+ help='Use automatic mixed precision training')
78
+ parser.add_argument('--dpo',
79
+ action='store_true',
80
+ default=False,
81
+ help='Use Direct Preference Optimization')
82
+ parser.add_argument('--deepspeed.save_states',
83
+ dest='save_states',
84
+ default='model_only',
85
+ choices=['model_only', 'model+optimizer'],
86
+ help='save model/optimizer states')
87
+ parser.add_argument('--timeout',
88
+ default=60,
89
+ type=int,
90
+ help='timeout (in seconds) of cosyvoice_join.')
91
+ parser = deepspeed.add_config_arguments(parser)
92
+ args = parser.parse_args()
93
+ return args
94
+
95
+
96
+ @record
97
+ def main():
98
+ args = get_args()
99
+ logging.basicConfig(level=logging.DEBUG,
100
+ format='%(asctime)s %(levelname)s %(message)s')
101
+ # gan training has some special initialization logic
102
+ gan = True if args.model == 'hifigan' else False
103
+
104
+ override_dict = {k: None for k in ['llm', 'flow', 'hift', 'hifigan'] if k != args.model}
105
+ if gan is True:
106
+ override_dict.pop('hift')
107
+ try:
108
+ with open(args.config, 'r') as f:
109
+ configs = load_hyperpyyaml(f, overrides={**override_dict, 'qwen_pretrain_path': args.qwen_pretrain_path})
110
+ except Exception:
111
+ with open(args.config, 'r') as f:
112
+ configs = load_hyperpyyaml(f, overrides=override_dict)
113
+ if gan is True:
114
+ configs['train_conf'] = configs['train_conf_gan']
115
+ configs['train_conf'].update(vars(args))
116
+
117
+ # Init env for ddp
118
+ init_distributed(args)
119
+
120
+ # Get dataset & dataloader
121
+ train_dataset, cv_dataset, train_data_loader, cv_data_loader = \
122
+ init_dataset_and_dataloader(args, configs, gan, args.dpo)
123
+
124
+ # Do some sanity checks and save config to args.model_dir
125
+ configs = check_modify_and_save_config(args, configs)
126
+
127
+ # Tensorboard summary
128
+ writer = init_summarywriter(args)
129
+
130
+ # load checkpoint
131
+ if args.dpo is True:
132
+ configs[args.model].forward = configs[args.model].forward_dpo
133
+ model = configs[args.model]
134
+ start_step, start_epoch = 0, -1
135
+ if args.checkpoint is not None:
136
+ if os.path.exists(args.checkpoint):
137
+ state_dict = torch.load(args.checkpoint, map_location='cpu')
138
+ model.load_state_dict(state_dict, strict=False)
139
+ if 'step' in state_dict:
140
+ start_step = state_dict['step']
141
+ if 'epoch' in state_dict:
142
+ start_epoch = state_dict['epoch']
143
+ else:
144
+ logging.warning('checkpoint {} does not exist!'.format(args.checkpoint))
145
+
146
+ # Dispatch model from cpu to gpu
147
+ model = wrap_cuda_model(args, model)
148
+
149
+ # Get optimizer & scheduler
150
+ model, optimizer, scheduler, optimizer_d, scheduler_d = init_optimizer_and_scheduler(args, configs, model, gan)
151
+ scheduler.set_step(start_step)
152
+ if scheduler_d is not None:
153
+ scheduler_d.set_step(start_step)
154
+
155
+ # Save init checkpoints
156
+ info_dict = deepcopy(configs['train_conf'])
157
+ info_dict['step'] = start_step
158
+ info_dict['epoch'] = start_epoch
159
+ save_model(model, 'init', info_dict)
160
+
161
+ # DPO related
162
+ if args.dpo is True:
163
+ ref_model = deepcopy(configs[args.model])
164
+ state_dict = torch.load(args.ref_model, map_location='cpu')
165
+ ref_model.load_state_dict(state_dict, strict=False)
166
+ dpo_loss = DPOLoss(beta=0.01, label_smoothing=0.0, ipo=False)
167
+ # NOTE wrapping ref_model in ddp is probably unnecessary because its parameters are never updated
168
+ ref_model = wrap_cuda_model(args, ref_model)
169
+ else:
170
+ ref_model, dpo_loss = None, None
171
+
172
+ # Get executor
173
+ executor = Executor(gan=gan, ref_model=ref_model, dpo_loss=dpo_loss)
174
+ executor.step = start_step
175
+
176
+ # Init scaler, used for pytorch amp mixed precision training
177
+ scaler = torch.cuda.amp.GradScaler() if args.use_amp else None
178
+ print('start step {} start epoch {}'.format(start_step, start_epoch))
179
+
180
+ # Start training loop
181
+ for epoch in range(start_epoch + 1, info_dict['max_epoch']):
182
+ executor.epoch = epoch
183
+ train_dataset.set_epoch(epoch)
184
+ dist.barrier()
185
+ group_join = dist.new_group(backend="gloo", timeout=datetime.timedelta(seconds=args.timeout))
186
+ if gan is True:
187
+ executor.train_one_epoc_gan(model, optimizer, scheduler, optimizer_d, scheduler_d, train_data_loader, cv_data_loader,
188
+ writer, info_dict, scaler, group_join)
189
+ else:
190
+ executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, scaler, group_join, ref_model=ref_model)
191
+ dist.destroy_process_group(group_join)
192
+
193
+
194
+ if __name__ == '__main__':
195
+ main()
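
For context, train.py is meant to be launched through torch.distributed (torchrun) or deepspeed rather than invoked directly. Below is a minimal, hedged launch sketch; the config, data-list, and output paths are placeholders, and single-node/single-GPU settings are assumed purely for illustration.

```python
# Hypothetical launch sketch for cosyvoice/bin/train.py; all paths are placeholders.
import subprocess

cmd = [
    "torchrun", "--nnodes=1", "--nproc_per_node=1",   # assumed single node, single GPU
    "cosyvoice/bin/train.py",
    "--train_engine", "torch_ddp",
    "--model", "llm",                                 # which head of the yaml to train
    "--config", "conf/cosyvoice.yaml",                # placeholder config path
    "--train_data", "data/train.data.list",           # placeholder data lists
    "--cv_data", "data/dev.data.list",
    "--model_dir", "exp/llm",                         # checkpoints are written here
    "--tensorboard_dir", "tensorboard/llm",
]
subprocess.run(cmd, check=True)
```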
cosyvoice/cli/__init__.py ADDED
File without changes
cosyvoice/cli/cosyvoice.py ADDED
@@ -0,0 +1,238 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import os
15
+ import time
16
+ from typing import Generator
17
+ from tqdm import tqdm
18
+ from hyperpyyaml import load_hyperpyyaml
19
+ from modelscope import snapshot_download
20
+ import torch
21
+ from cosyvoice.cli.frontend import CosyVoiceFrontEnd
22
+ from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model, CosyVoice3Model
23
+ from cosyvoice.utils.file_utils import logging
24
+ from cosyvoice.utils.class_utils import get_model_type
25
+
26
+
27
+ class CosyVoice:
28
+
29
+ def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False, trt_concurrent=1):
30
+ self.model_dir = model_dir
31
+ self.fp16 = fp16
32
+ if not os.path.exists(model_dir):
33
+ model_dir = snapshot_download(model_dir)
34
+ hyper_yaml_path = '{}/cosyvoice.yaml'.format(model_dir)
35
+ if not os.path.exists(hyper_yaml_path):
36
+ raise ValueError('{} not found!'.format(hyper_yaml_path))
37
+ with open(hyper_yaml_path, 'r') as f:
38
+ configs = load_hyperpyyaml(f)
39
+ assert get_model_type(configs) == CosyVoiceModel, 'do not use {} for CosyVoice initialization!'.format(model_dir)
40
+ self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
41
+ configs['feat_extractor'],
42
+ '{}/campplus.onnx'.format(model_dir),
43
+ '{}/speech_tokenizer_v1.onnx'.format(model_dir),
44
+ '{}/spk2info.pt'.format(model_dir),
45
+ configs['allowed_special'])
46
+ self.sample_rate = configs['sample_rate']
47
+ if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
48
+ load_jit, load_trt, fp16 = False, False, False
49
+ logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
50
+ self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'], fp16)
51
+ self.model.load('{}/llm.pt'.format(model_dir),
52
+ '{}/flow.pt'.format(model_dir),
53
+ '{}/hift.pt'.format(model_dir))
54
+ if load_jit:
55
+ self.model.load_jit('{}/llm.text_encoder.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
56
+ '{}/llm.llm.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
57
+ '{}/flow.encoder.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'))
58
+ if load_trt:
59
+ self.model.load_trt('{}/flow.decoder.estimator.{}.mygpu.plan'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
60
+ '{}/flow.decoder.estimator.fp32.onnx'.format(model_dir),
61
+ trt_concurrent,
62
+ self.fp16)
63
+ del configs
64
+
65
+ def list_available_spks(self):
66
+ spks = list(self.frontend.spk2info.keys())
67
+ return spks
68
+
69
+ def add_zero_shot_spk(self, prompt_text, prompt_wav, zero_shot_spk_id):
70
+ assert zero_shot_spk_id != '', 'do not use empty zero_shot_spk_id'
71
+ model_input = self.frontend.frontend_zero_shot('', prompt_text, prompt_wav, self.sample_rate, '')
72
+ del model_input['text']
73
+ del model_input['text_len']
74
+ self.frontend.spk2info[zero_shot_spk_id] = model_input
75
+ return True
76
+
77
+ def save_spkinfo(self):
78
+ torch.save(self.frontend.spk2info, '{}/spk2info.pt'.format(self.model_dir))
79
+
80
+ def inference_sft(self, tts_text, spk_id, stream=False, speed=1.0, text_frontend=True):
81
+ for i in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)):
82
+ model_input = self.frontend.frontend_sft(i, spk_id)
83
+ start_time = time.time()
84
+ logging.info('synthesis text {}'.format(i))
85
+ for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
86
+ speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
87
+ logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
88
+ yield model_output
89
+ start_time = time.time()
90
+
91
+ def inference_zero_shot(self, tts_text, prompt_text, prompt_wav, zero_shot_spk_id='', stream=False, speed=1.0, text_frontend=True):
92
+ prompt_text = self.frontend.text_normalize(prompt_text, split=False, text_frontend=text_frontend)
93
+ for i in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)):
94
+ if (not isinstance(i, Generator)) and len(i) < 0.5 * len(prompt_text):
95
+ logging.warning('synthesis text {} too short than prompt text {}, this may lead to bad performance'.format(i, prompt_text))
96
+ model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_wav, self.sample_rate, zero_shot_spk_id)
97
+ start_time = time.time()
98
+ logging.info('synthesis text {}'.format(i))
99
+ for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
100
+ speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
101
+ logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
102
+ yield model_output
103
+ start_time = time.time()
104
+
105
+ def inference_cross_lingual(self, tts_text, prompt_wav, zero_shot_spk_id='', stream=False, speed=1.0, text_frontend=True):
106
+ for i in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)):
107
+ model_input = self.frontend.frontend_cross_lingual(i, prompt_wav, self.sample_rate, zero_shot_spk_id)
108
+ start_time = time.time()
109
+ logging.info('synthesis text {}'.format(i))
110
+ for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
111
+ speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
112
+ logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
113
+ yield model_output
114
+ start_time = time.time()
115
+
116
+ def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False, speed=1.0, text_frontend=True):
117
+ assert isinstance(self.model, CosyVoiceModel), 'inference_instruct is only implemented for CosyVoice!'
118
+ instruct_text = self.frontend.text_normalize(instruct_text, split=False, text_frontend=text_frontend)
119
+ for i in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)):
120
+ model_input = self.frontend.frontend_instruct(i, spk_id, instruct_text)
121
+ start_time = time.time()
122
+ logging.info('synthesis text {}'.format(i))
123
+ for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
124
+ speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
125
+ logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
126
+ yield model_output
127
+ start_time = time.time()
128
+
129
+ def inference_vc(self, source_wav, prompt_wav, stream=False, speed=1.0):
130
+ model_input = self.frontend.frontend_vc(source_wav, prompt_wav, self.sample_rate)
131
+ start_time = time.time()
132
+ for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
133
+ speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
134
+ logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
135
+ yield model_output
136
+ start_time = time.time()
137
+
138
+
139
+ class CosyVoice2(CosyVoice):
140
+
141
+ def __init__(self, model_dir, load_jit=False, load_trt=False, load_vllm=False, fp16=False, trt_concurrent=1):
142
+ self.model_dir = model_dir
143
+ self.fp16 = fp16
144
+ if not os.path.exists(model_dir):
145
+ model_dir = snapshot_download(model_dir)
146
+ hyper_yaml_path = '{}/cosyvoice2.yaml'.format(model_dir)
147
+ if not os.path.exists(hyper_yaml_path):
148
+ raise ValueError('{} not found!'.format(hyper_yaml_path))
149
+ with open(hyper_yaml_path, 'r') as f:
150
+ configs = load_hyperpyyaml(f, overrides={'qwen_pretrain_path': os.path.join(model_dir, 'CosyVoice-BlankEN')})
151
+ assert get_model_type(configs) == CosyVoice2Model, 'do not use {} for CosyVoice2 initialization!'.format(model_dir)
152
+ self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
153
+ configs['feat_extractor'],
154
+ '{}/campplus.onnx'.format(model_dir),
155
+ '{}/speech_tokenizer_v2.onnx'.format(model_dir),
156
+ '{}/spk2info.pt'.format(model_dir),
157
+ configs['allowed_special'])
158
+ self.sample_rate = configs['sample_rate']
159
+ if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or load_vllm is True or fp16 is True):
160
+ load_jit, load_trt, load_vllm, fp16 = False, False, False, False
161
+ logging.warning('no cuda device, set load_jit/load_trt/load_vllm/fp16 to False')
162
+ self.model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'], fp16)
163
+ self.model.load('{}/llm.pt'.format(model_dir),
164
+ '{}/flow.pt'.format(model_dir),
165
+ '{}/hift.pt'.format(model_dir))
166
+ if load_vllm:
167
+ self.model.load_vllm('{}/vllm'.format(model_dir))
168
+ if load_jit:
169
+ self.model.load_jit('{}/flow.encoder.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'))
170
+ if load_trt:
171
+ self.model.load_trt('{}/flow.decoder.estimator.{}.mygpu.plan'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
172
+ '{}/flow.decoder.estimator.fp32.onnx'.format(model_dir),
173
+ trt_concurrent,
174
+ self.fp16)
175
+ del configs
176
+
177
+ def inference_instruct2(self, tts_text, instruct_text, prompt_wav, zero_shot_spk_id='', stream=False, speed=1.0, text_frontend=True):
178
+ for i in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)):
179
+ model_input = self.frontend.frontend_instruct2(i, instruct_text, prompt_wav, self.sample_rate, zero_shot_spk_id)
180
+ start_time = time.time()
181
+ logging.info('synthesis text {}'.format(i))
182
+ for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
183
+ speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
184
+ logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
185
+ yield model_output
186
+ start_time = time.time()
187
+
188
+
189
+ class CosyVoice3(CosyVoice2):
190
+
191
+ def __init__(self, model_dir, load_trt=False, load_vllm=False, fp16=False, trt_concurrent=1):
192
+ self.model_dir = model_dir
193
+ self.fp16 = fp16
194
+ if not os.path.exists(model_dir):
195
+ model_dir = snapshot_download(model_dir)
196
+ hyper_yaml_path = '{}/cosyvoice3.yaml'.format(model_dir)
197
+ if not os.path.exists(hyper_yaml_path):
198
+ raise ValueError('{} not found!'.format(hyper_yaml_path))
199
+ with open(hyper_yaml_path, 'r') as f:
200
+ configs = load_hyperpyyaml(f, overrides={'qwen_pretrain_path': os.path.join(model_dir, 'CosyVoice-BlankEN')})
201
+ assert get_model_type(configs) == CosyVoice3Model, 'do not use {} for CosyVoice3 initialization!'.format(model_dir)
202
+ self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
203
+ configs['feat_extractor'],
204
+ '{}/campplus.onnx'.format(model_dir),
205
+ '{}/speech_tokenizer_v3.onnx'.format(model_dir),
206
+ '{}/spk2info.pt'.format(model_dir),
207
+ configs['allowed_special'])
208
+ self.sample_rate = configs['sample_rate']
209
+ if torch.cuda.is_available() is False and (load_trt is True or fp16 is True):
210
+ load_trt, fp16 = False, False
211
+ logging.warning('no cuda device, set load_trt/fp16 to False')
212
+ self.model = CosyVoice3Model(configs['llm'], configs['flow'], configs['hift'], fp16)
213
+ self.model.load('{}/llm.pt'.format(model_dir),
214
+ '{}/flow.pt'.format(model_dir),
215
+ '{}/hift.pt'.format(model_dir))
216
+ if load_vllm:
217
+ self.model.load_vllm('{}/vllm'.format(model_dir))
218
+ if load_trt:
219
+ if self.fp16 is True:
220
+ logging.warning('DiT tensorRT fp16 engine has known performance issues, use with caution!')
221
+ self.model.load_trt('{}/flow.decoder.estimator.{}.mygpu.plan'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
222
+ '{}/flow.decoder.estimator.fp32.onnx'.format(model_dir),
223
+ trt_concurrent,
224
+ self.fp16)
225
+ del configs
226
+
227
+
228
+ def AutoModel(**kwargs):
229
+ if not os.path.exists(kwargs['model_dir']):
230
+ kwargs['model_dir'] = snapshot_download(kwargs['model_dir'])
231
+ if os.path.exists('{}/cosyvoice.yaml'.format(kwargs['model_dir'])):
232
+ return CosyVoice(**kwargs)
233
+ elif os.path.exists('{}/cosyvoice2.yaml'.format(kwargs['model_dir'])):
234
+ return CosyVoice2(**kwargs)
235
+ elif os.path.exists('{}/cosyvoice3.yaml'.format(kwargs['model_dir'])):
236
+ return CosyVoice3(**kwargs)
237
+ else:
238
+ raise TypeError('No valid model type found!')
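
A minimal usage sketch of the API defined above, assuming a downloaded CosyVoice2-style model; the model id, prompt wav path, and text strings are placeholders rather than values taken from this repository.

```python
# Hypothetical usage sketch; model id, wav path and texts are placeholders.
import torchaudio
from cosyvoice.cli.cosyvoice import AutoModel

cosyvoice = AutoModel(model_dir='iic/CosyVoice2-0.5B')     # AutoModel picks the class from the yaml found in model_dir
for idx, out in enumerate(cosyvoice.inference_zero_shot(
        'Text to synthesize goes here.',                   # tts_text
        'Transcript of the prompt audio.',                 # prompt_text
        './zero_shot_prompt.wav',                          # placeholder prompt wav path
        stream=False)):
    torchaudio.save('zero_shot_{}.wav'.format(idx), out['tts_speech'], cosyvoice.sample_rate)
```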
cosyvoice/cli/frontend.py ADDED
@@ -0,0 +1,219 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from functools import partial
15
+ from typing import Generator
16
+ import json
17
+ import onnxruntime
18
+ import torch
19
+ import numpy as np
20
+ import whisper
21
+ from typing import Callable
22
+ import torchaudio.compliance.kaldi as kaldi
23
+ import torchaudio
24
+ import os
25
+ import re
26
+ import inflect
27
+ try:
28
+ import ttsfrd
29
+ use_ttsfrd = True
30
+ except ImportError:
31
+ print("failed to import ttsfrd, falling back to wetext")
32
+ from wetext import Normalizer as ZhNormalizer
33
+ from wetext import Normalizer as EnNormalizer
34
+ use_ttsfrd = False
35
+ from cosyvoice.utils.file_utils import logging, load_wav
36
+ from cosyvoice.utils.frontend_utils import contains_chinese, replace_blank, replace_corner_mark, remove_bracket, spell_out_number, split_paragraph, is_only_punctuation
37
+
38
+
39
+ class CosyVoiceFrontEnd:
40
+
41
+ def __init__(self,
42
+ get_tokenizer: Callable,
43
+ feat_extractor: Callable,
44
+ campplus_model: str,
45
+ speech_tokenizer_model: str,
46
+ spk2info: str = '',
47
+ allowed_special: str = 'all'):
48
+ self.tokenizer = get_tokenizer()
49
+ self.feat_extractor = feat_extractor
50
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
51
+ option = onnxruntime.SessionOptions()
52
+ option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
53
+ option.intra_op_num_threads = 1
54
+ self.campplus_session = onnxruntime.InferenceSession(campplus_model, sess_options=option, providers=["CPUExecutionProvider"])
55
+ self.speech_tokenizer_session = onnxruntime.InferenceSession(speech_tokenizer_model, sess_options=option,
56
+ providers=["CUDAExecutionProvider" if torch.cuda.is_available() else
57
+ "CPUExecutionProvider"])
58
+ if os.path.exists(spk2info):
59
+ self.spk2info = torch.load(spk2info, map_location=self.device)
60
+ else:
61
+ self.spk2info = {}
62
+ self.allowed_special = allowed_special
63
+ self.use_ttsfrd = use_ttsfrd
64
+ if self.use_ttsfrd:
65
+ self.frd = ttsfrd.TtsFrontendEngine()
66
+ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
67
+ assert self.frd.initialize('{}/../../pretrained_models/CosyVoice-ttsfrd/resource'.format(ROOT_DIR)) is True, \
68
+ 'failed to initialize ttsfrd resource'
69
+ self.frd.set_lang_type('pinyinvg')
70
+ else:
71
+ self.zh_tn_model = ZhNormalizer(remove_erhua=False)
72
+ self.en_tn_model = EnNormalizer()
73
+ self.inflect_parser = inflect.engine()
74
+
75
+ def _extract_text_token(self, text):
76
+ if isinstance(text, Generator):
77
+ logging.info('get tts_text generator, will return _extract_text_token_generator!')
78
+ # NOTE add a dummy text_token_len for compatibility
79
+ return self._extract_text_token_generator(text), torch.tensor([0], dtype=torch.int32).to(self.device)
80
+ else:
81
+ text_token = self.tokenizer.encode(text, allowed_special=self.allowed_special)
82
+ text_token = torch.tensor([text_token], dtype=torch.int32).to(self.device)
83
+ text_token_len = torch.tensor([text_token.shape[1]], dtype=torch.int32).to(self.device)
84
+ return text_token, text_token_len
85
+
86
+ def _extract_text_token_generator(self, text_generator):
87
+ for text in text_generator:
88
+ text_token, _ = self._extract_text_token(text)
89
+ for i in range(text_token.shape[1]):
90
+ yield text_token[:, i: i + 1]
91
+
92
+ def _extract_speech_token(self, prompt_wav):
93
+ speech = load_wav(prompt_wav, 16000)
94
+ assert speech.shape[1] / 16000 <= 30, 'extracting speech tokens is not supported for audio longer than 30s'
95
+ feat = whisper.log_mel_spectrogram(speech, n_mels=128)
96
+ speech_token = self.speech_tokenizer_session.run(None,
97
+ {self.speech_tokenizer_session.get_inputs()[0].name:
98
+ feat.detach().cpu().numpy(),
99
+ self.speech_tokenizer_session.get_inputs()[1].name:
100
+ np.array([feat.shape[2]], dtype=np.int32)})[0].flatten().tolist()
101
+ speech_token = torch.tensor([speech_token], dtype=torch.int32).to(self.device)
102
+ speech_token_len = torch.tensor([speech_token.shape[1]], dtype=torch.int32).to(self.device)
103
+ return speech_token, speech_token_len
104
+
105
+ def _extract_spk_embedding(self, prompt_wav):
106
+ speech = load_wav(prompt_wav, 16000)
107
+ feat = kaldi.fbank(speech,
108
+ num_mel_bins=80,
109
+ dither=0,
110
+ sample_frequency=16000)
111
+ feat = feat - feat.mean(dim=0, keepdim=True)
112
+ embedding = self.campplus_session.run(None,
113
+ {self.campplus_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()})[0].flatten().tolist()
114
+ embedding = torch.tensor([embedding]).to(self.device)
115
+ return embedding
116
+
117
+ def _extract_speech_feat(self, prompt_wav):
118
+ speech = load_wav(prompt_wav, 24000)
119
+ speech_feat = self.feat_extractor(speech).squeeze(dim=0).transpose(0, 1).to(self.device)
120
+ speech_feat = speech_feat.unsqueeze(dim=0)
121
+ speech_feat_len = torch.tensor([speech_feat.shape[1]], dtype=torch.int32).to(self.device)
122
+ return speech_feat, speech_feat_len
123
+
124
+ def text_normalize(self, text, split=True, text_frontend=True):
125
+ if isinstance(text, Generator):
126
+ logging.info('get tts_text generator, will skip text_normalize!')
127
+ return [text]
128
+ # NOTE skip text_frontend when an ssml symbol appears in the text
129
+ if '<|' in text and '|>' in text:
130
+ text_frontend = False
131
+ if text_frontend is False or text == '':
132
+ return [text] if split is True else text
133
+ text = text.strip()
134
+ if self.use_ttsfrd:
135
+ texts = [i["text"] for i in json.loads(self.frd.do_voicegen_frd(text))["sentences"]]
136
+ text = ''.join(texts)
137
+ else:
138
+ if contains_chinese(text):
139
+ text = self.zh_tn_model.normalize(text)
140
+ text = text.replace("\n", "")
141
+ text = replace_blank(text)
142
+ text = replace_corner_mark(text)
143
+ text = text.replace(".", "。")
144
+ text = text.replace(" - ", ",")
145
+ text = remove_bracket(text)
146
+ text = re.sub(r'[,,、]+$', '。', text)
147
+ texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "zh", token_max_n=80,
148
+ token_min_n=60, merge_len=20, comma_split=False))
149
+ else:
150
+ text = self.en_tn_model.normalize(text)
151
+ text = spell_out_number(text, self.inflect_parser)
152
+ texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "en", token_max_n=80,
153
+ token_min_n=60, merge_len=20, comma_split=False))
154
+ texts = [i for i in texts if not is_only_punctuation(i)]
155
+ return texts if split is True else text
156
+
157
+ def frontend_sft(self, tts_text, spk_id):
158
+ tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
159
+ embedding = self.spk2info[spk_id]['embedding']
160
+ model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, 'llm_embedding': embedding, 'flow_embedding': embedding}
161
+ return model_input
162
+
163
+ def frontend_zero_shot(self, tts_text, prompt_text, prompt_wav, resample_rate, zero_shot_spk_id):
164
+ tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
165
+ if zero_shot_spk_id == '':
166
+ prompt_text_token, prompt_text_token_len = self._extract_text_token(prompt_text)
167
+ speech_feat, speech_feat_len = self._extract_speech_feat(prompt_wav)
168
+ speech_token, speech_token_len = self._extract_speech_token(prompt_wav)
169
+ if resample_rate == 24000:
170
+ # cosyvoice2, force speech_feat length to be exactly 2x speech_token length
171
+ token_len = min(int(speech_feat.shape[1] / 2), speech_token.shape[1])
172
+ speech_feat, speech_feat_len[:] = speech_feat[:, :2 * token_len], 2 * token_len
173
+ speech_token, speech_token_len[:] = speech_token[:, :token_len], token_len
174
+ embedding = self._extract_spk_embedding(prompt_wav)
175
+ model_input = {'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
176
+ 'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
177
+ 'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
178
+ 'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
179
+ 'llm_embedding': embedding, 'flow_embedding': embedding}
180
+ else:
181
+ model_input = self.spk2info[zero_shot_spk_id]
182
+ model_input['text'] = tts_text_token
183
+ model_input['text_len'] = tts_text_token_len
184
+ return model_input
185
+
186
+ def frontend_cross_lingual(self, tts_text, prompt_wav, resample_rate, zero_shot_spk_id):
187
+ model_input = self.frontend_zero_shot(tts_text, '', prompt_wav, resample_rate, zero_shot_spk_id)
188
+ # in cross lingual mode, we remove prompt in llm
189
+ del model_input['prompt_text']
190
+ del model_input['prompt_text_len']
191
+ del model_input['llm_prompt_speech_token']
192
+ del model_input['llm_prompt_speech_token_len']
193
+ return model_input
194
+
195
+ def frontend_instruct(self, tts_text, spk_id, instruct_text):
196
+ model_input = self.frontend_sft(tts_text, spk_id)
197
+ # in instruct mode, we remove spk_embedding in llm due to information leakage
198
+ del model_input['llm_embedding']
199
+ instruct_text_token, instruct_text_token_len = self._extract_text_token(instruct_text)
200
+ model_input['prompt_text'] = instruct_text_token
201
+ model_input['prompt_text_len'] = instruct_text_token_len
202
+ return model_input
203
+
204
+ def frontend_instruct2(self, tts_text, instruct_text, prompt_wav, resample_rate, zero_shot_spk_id):
205
+ model_input = self.frontend_zero_shot(tts_text, instruct_text, prompt_wav, resample_rate, zero_shot_spk_id)
206
+ del model_input['llm_prompt_speech_token']
207
+ del model_input['llm_prompt_speech_token_len']
208
+ return model_input
209
+
210
+ def frontend_vc(self, source_speech_16k, prompt_wav, resample_rate):
211
+ prompt_speech_token, prompt_speech_token_len = self._extract_speech_token(prompt_wav)
212
+ prompt_speech_feat, prompt_speech_feat_len = self._extract_speech_feat(prompt_wav)
213
+ embedding = self._extract_spk_embedding(prompt_wav)
214
+ source_speech_token, source_speech_token_len = self._extract_speech_token(source_speech_16k)
215
+ model_input = {'source_speech_token': source_speech_token, 'source_speech_token_len': source_speech_token_len,
216
+ 'flow_prompt_speech_token': prompt_speech_token, 'flow_prompt_speech_token_len': prompt_speech_token_len,
217
+ 'prompt_speech_feat': prompt_speech_feat, 'prompt_speech_feat_len': prompt_speech_feat_len,
218
+ 'flow_embedding': embedding}
219
+ return model_input
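
For orientation, a hedged sketch of driving the frontend directly, assuming a `frontend` instance constructed as in cosyvoice.py and a 24 kHz feature pipeline; the prompt wav path and texts are placeholders.

```python
# Hypothetical sketch: build and inspect a zero-shot model_input (paths/texts are placeholders).
texts = frontend.text_normalize('A long paragraph that will be split into sentence chunks.', split=True)
model_input = frontend.frontend_zero_shot(texts[0],
                                          'Transcript of the prompt audio.',
                                          './prompt.wav',          # placeholder prompt wav
                                          24000,                   # resample_rate of the feat pipeline
                                          '')                      # no cached zero-shot speaker
print(sorted(model_input.keys()))
# expected keys: text, text_len, prompt_text, prompt_text_len, llm_prompt_speech_token(_len),
# flow_prompt_speech_token(_len), prompt_speech_feat(_len), llm_embedding, flow_embedding
```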
cosyvoice/cli/model.py ADDED
@@ -0,0 +1,430 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
2
+ # 2025 Alibaba Inc (authors: Xiang Lyu, Bofan Zhou)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import os
16
+ from typing import Generator
17
+ import torch
18
+ import numpy as np
19
+ import threading
20
+ import time
21
+ from torch.nn import functional as F
22
+ from contextlib import nullcontext
23
+ import uuid
24
+ from cosyvoice.utils.common import fade_in_out
25
+ from cosyvoice.utils.file_utils import convert_onnx_to_trt, export_cosyvoice2_vllm
26
+ from cosyvoice.utils.common import TrtContextWrapper
27
+
28
+
29
+ class CosyVoiceModel:
30
+
31
+ def __init__(self,
32
+ llm: torch.nn.Module,
33
+ flow: torch.nn.Module,
34
+ hift: torch.nn.Module,
35
+ fp16: bool = False):
36
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
37
+ self.llm = llm
38
+ self.flow = flow
39
+ self.hift = hift
40
+ self.fp16 = fp16
41
+ self.token_min_hop_len = 2 * self.flow.input_frame_rate
42
+ self.token_max_hop_len = 4 * self.flow.input_frame_rate
43
+ self.token_overlap_len = 20
44
+ # mel fade in out
45
+ self.mel_overlap_len = int(self.token_overlap_len / self.flow.input_frame_rate * 22050 / 256)
46
+ self.mel_window = np.hamming(2 * self.mel_overlap_len)
47
+ # hift cache
48
+ self.mel_cache_len = 20
49
+ self.source_cache_len = int(self.mel_cache_len * 256)
50
+ # speech fade in out
51
+ self.speech_window = np.hamming(2 * self.source_cache_len)
52
+ # rtf and decoding related
53
+ self.stream_scale_factor = 1
54
+ assert self.stream_scale_factor >= 1, 'stream_scale_factor should be at least 1, change it according to your actual rtf'
55
+ self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
56
+ self.lock = threading.Lock()
57
+ # dict used to store session related variable
58
+ self.tts_speech_token_dict = {}
59
+ self.llm_end_dict = {}
60
+ self.mel_overlap_dict = {}
61
+ self.flow_cache_dict = {}
62
+ self.hift_cache_dict = {}
63
+
64
+ def load(self, llm_model, flow_model, hift_model):
65
+ self.llm.load_state_dict(torch.load(llm_model, map_location=self.device), strict=True)
66
+ self.llm.to(self.device).eval()
67
+ self.flow.load_state_dict(torch.load(flow_model, map_location=self.device), strict=True)
68
+ self.flow.to(self.device).eval()
69
+ # in case hift_model is a hifigan model
70
+ hift_state_dict = {k.replace('generator.', ''): v for k, v in torch.load(hift_model, map_location=self.device).items()}
71
+ self.hift.load_state_dict(hift_state_dict, strict=True)
72
+ self.hift.to(self.device).eval()
73
+
74
+ def load_jit(self, llm_text_encoder_model, llm_llm_model, flow_encoder_model):
75
+ llm_text_encoder = torch.jit.load(llm_text_encoder_model, map_location=self.device)
76
+ self.llm.text_encoder = llm_text_encoder
77
+ llm_llm = torch.jit.load(llm_llm_model, map_location=self.device)
78
+ self.llm.llm = llm_llm
79
+ flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device)
80
+ self.flow.encoder = flow_encoder
81
+
82
+ def load_trt(self, flow_decoder_estimator_model, flow_decoder_onnx_model, trt_concurrent, fp16):
83
+ assert torch.cuda.is_available(), 'tensorrt only supports gpu!'
84
+ if not os.path.exists(flow_decoder_estimator_model) or os.path.getsize(flow_decoder_estimator_model) == 0:
85
+ convert_onnx_to_trt(flow_decoder_estimator_model, self.get_trt_kwargs(), flow_decoder_onnx_model, fp16)
86
+ del self.flow.decoder.estimator
87
+ import tensorrt as trt
88
+ with open(flow_decoder_estimator_model, 'rb') as f:
89
+ estimator_engine = trt.Runtime(trt.Logger(trt.Logger.INFO)).deserialize_cuda_engine(f.read())
90
+ assert estimator_engine is not None, 'failed to load trt {}'.format(flow_decoder_estimator_model)
91
+ self.flow.decoder.estimator = TrtContextWrapper(estimator_engine, trt_concurrent=trt_concurrent, device=self.device)
92
+
93
+ def get_trt_kwargs(self):
94
+ min_shape = [(2, 80, 4), (2, 1, 4), (2, 80, 4), (2, 80, 4)]
95
+ opt_shape = [(2, 80, 500), (2, 1, 500), (2, 80, 500), (2, 80, 500)]
96
+ max_shape = [(2, 80, 3000), (2, 1, 3000), (2, 80, 3000), (2, 80, 3000)]
97
+ input_names = ["x", "mask", "mu", "cond"]
98
+ return {'min_shape': min_shape, 'opt_shape': opt_shape, 'max_shape': max_shape, 'input_names': input_names}
99
+
100
+ def llm_job(self, text, prompt_text, llm_prompt_speech_token, llm_embedding, uuid):
101
+ with self.llm_context, torch.cuda.amp.autocast(self.fp16 is True and hasattr(self.llm, 'vllm') is False):
102
+ if isinstance(text, Generator):
103
+ assert isinstance(self, CosyVoice2Model) and not hasattr(self.llm, 'vllm'), 'streaming input text is only implemented for CosyVoice2 and does not support vllm!'
104
+ for i in self.llm.inference_bistream(text=text,
105
+ prompt_text=prompt_text.to(self.device),
106
+ prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
107
+ prompt_speech_token=llm_prompt_speech_token.to(self.device),
108
+ prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device),
109
+ embedding=llm_embedding.to(self.device)):
110
+ self.tts_speech_token_dict[uuid].append(i)
111
+ else:
112
+ for i in self.llm.inference(text=text.to(self.device),
113
+ text_len=torch.tensor([text.shape[1]], dtype=torch.int32).to(self.device),
114
+ prompt_text=prompt_text.to(self.device),
115
+ prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
116
+ prompt_speech_token=llm_prompt_speech_token.to(self.device),
117
+ prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device),
118
+ embedding=llm_embedding.to(self.device),
119
+ uuid=uuid):
120
+ self.tts_speech_token_dict[uuid].append(i)
121
+ self.llm_end_dict[uuid] = True
122
+
123
+ def vc_job(self, source_speech_token, uuid):
124
+ self.tts_speech_token_dict[uuid] = source_speech_token.flatten().tolist()
125
+ self.llm_end_dict[uuid] = True
126
+
127
+ def token2wav(self, token, prompt_token, prompt_feat, embedding, uuid, finalize=False, speed=1.0):
128
+ with torch.cuda.amp.autocast(self.fp16):
129
+ tts_mel, self.flow_cache_dict[uuid] = self.flow.inference(token=token.to(self.device, dtype=torch.int32),
130
+ token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
131
+ prompt_token=prompt_token.to(self.device),
132
+ prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
133
+ prompt_feat=prompt_feat.to(self.device),
134
+ prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device),
135
+ embedding=embedding.to(self.device),
136
+ flow_cache=self.flow_cache_dict[uuid])
137
+
138
+ # mel overlap fade in out
139
+ if self.mel_overlap_dict[uuid].shape[2] != 0:
140
+ tts_mel = fade_in_out(tts_mel, self.mel_overlap_dict[uuid], self.mel_window)
141
+ # append hift cache
142
+ if self.hift_cache_dict[uuid] is not None:
143
+ hift_cache_mel, hift_cache_source = self.hift_cache_dict[uuid]['mel'], self.hift_cache_dict[uuid]['source']
144
+ tts_mel = torch.concat([hift_cache_mel, tts_mel], dim=2)
145
+ else:
146
+ hift_cache_source = torch.zeros(1, 1, 0)
147
+ # keep overlap mel and hift cache
148
+ if finalize is False:
149
+ self.mel_overlap_dict[uuid] = tts_mel[:, :, -self.mel_overlap_len:]
150
+ tts_mel = tts_mel[:, :, :-self.mel_overlap_len]
151
+ tts_speech, tts_source = self.hift.inference(speech_feat=tts_mel, cache_source=hift_cache_source)
152
+ if self.hift_cache_dict[uuid] is not None:
153
+ tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
154
+ self.hift_cache_dict[uuid] = {'mel': tts_mel[:, :, -self.mel_cache_len:],
155
+ 'source': tts_source[:, :, -self.source_cache_len:],
156
+ 'speech': tts_speech[:, -self.source_cache_len:]}
157
+ tts_speech = tts_speech[:, :-self.source_cache_len]
158
+ else:
159
+ if speed != 1.0:
160
+ assert self.hift_cache_dict[uuid] is None, 'speed change is only supported in non-stream inference mode'
161
+ tts_mel = F.interpolate(tts_mel, size=int(tts_mel.shape[2] / speed), mode='linear')
162
+ tts_speech, tts_source = self.hift.inference(speech_feat=tts_mel, cache_source=hift_cache_source)
163
+ if self.hift_cache_dict[uuid] is not None:
164
+ tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
165
+ return tts_speech
166
+
167
+ def tts(self, text=torch.zeros(1, 0, dtype=torch.int32), flow_embedding=torch.zeros(0, 192), llm_embedding=torch.zeros(0, 192),
168
+ prompt_text=torch.zeros(1, 0, dtype=torch.int32),
169
+ llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
170
+ flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
171
+ prompt_speech_feat=torch.zeros(1, 0, 80), source_speech_token=torch.zeros(1, 0, dtype=torch.int32), stream=False, speed=1.0, **kwargs):
172
+ # this_uuid is used to track variables related to this inference thread
173
+ this_uuid = str(uuid.uuid1())
174
+ with self.lock:
175
+ self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = [], False
176
+ self.hift_cache_dict[this_uuid] = None
177
+ self.mel_overlap_dict[this_uuid] = torch.zeros(1, 80, 0)
178
+ self.flow_cache_dict[this_uuid] = torch.zeros(1, 80, 0, 2)
179
+ if source_speech_token.shape[1] == 0:
180
+ p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid))
181
+ else:
182
+ p = threading.Thread(target=self.vc_job, args=(source_speech_token, this_uuid))
183
+ p.start()
184
+ if stream is True:
185
+ token_hop_len = self.token_min_hop_len
186
+ while True:
187
+ time.sleep(0.1)
188
+ if len(self.tts_speech_token_dict[this_uuid]) >= token_hop_len + self.token_overlap_len:
189
+ this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_hop_len + self.token_overlap_len]) \
190
+ .unsqueeze(dim=0)
191
+ this_tts_speech = self.token2wav(token=this_tts_speech_token,
192
+ prompt_token=flow_prompt_speech_token,
193
+ prompt_feat=prompt_speech_feat,
194
+ embedding=flow_embedding,
195
+ uuid=this_uuid,
196
+ finalize=False)
197
+ yield {'tts_speech': this_tts_speech.cpu()}
198
+ with self.lock:
199
+ self.tts_speech_token_dict[this_uuid] = self.tts_speech_token_dict[this_uuid][token_hop_len:]
200
+ # increase token_hop_len for better speech quality
201
+ token_hop_len = min(self.token_max_hop_len, int(token_hop_len * self.stream_scale_factor))
202
+ if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) < token_hop_len + self.token_overlap_len:
203
+ break
204
+ p.join()
205
+ # deal with remaining tokens, make sure the remaining token length equals token_hop_len when cache_speech is not None
206
+ this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
207
+ this_tts_speech = self.token2wav(token=this_tts_speech_token,
208
+ prompt_token=flow_prompt_speech_token,
209
+ prompt_feat=prompt_speech_feat,
210
+ embedding=flow_embedding,
211
+ uuid=this_uuid,
212
+ finalize=True)
213
+ yield {'tts_speech': this_tts_speech.cpu()}
214
+ else:
215
+ # deal with all tokens
216
+ p.join()
217
+ this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
218
+ this_tts_speech = self.token2wav(token=this_tts_speech_token,
219
+ prompt_token=flow_prompt_speech_token,
220
+ prompt_feat=prompt_speech_feat,
221
+ embedding=flow_embedding,
222
+ uuid=this_uuid,
223
+ finalize=True,
224
+ speed=speed)
225
+ yield {'tts_speech': this_tts_speech.cpu()}
226
+ with self.lock:
227
+ self.tts_speech_token_dict.pop(this_uuid)
228
+ self.llm_end_dict.pop(this_uuid)
229
+ self.mel_overlap_dict.pop(this_uuid)
230
+ self.hift_cache_dict.pop(this_uuid)
231
+ self.flow_cache_dict.pop(this_uuid)
232
+ if torch.cuda.is_available():
233
+ torch.cuda.empty_cache()
234
+ torch.cuda.current_stream().synchronize()
235
+
236
+
237
+ class CosyVoice2Model(CosyVoiceModel):
238
+
239
+ def __init__(self,
240
+ llm: torch.nn.Module,
241
+ flow: torch.nn.Module,
242
+ hift: torch.nn.Module,
243
+ fp16: bool = False):
244
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
245
+ self.llm = llm
246
+ self.flow = flow
247
+ self.hift = hift
248
+ self.fp16 = fp16
249
+ # NOTE must match the training static_chunk_size
250
+ self.token_hop_len = 25
251
+ # hift cache
252
+ self.mel_cache_len = 8
253
+ self.source_cache_len = int(self.mel_cache_len * 480)
254
+ # speech fade in out
255
+ self.speech_window = np.hamming(2 * self.source_cache_len)
256
+ # rtf and decoding related
257
+ self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
258
+ self.lock = threading.Lock()
259
+ # dict used to store session related variable
260
+ self.tts_speech_token_dict = {}
261
+ self.llm_end_dict = {}
262
+ self.hift_cache_dict = {}
263
+
264
+ def load_jit(self, flow_encoder_model):
265
+ flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device)
266
+ self.flow.encoder = flow_encoder
267
+
268
+ def load_vllm(self, model_dir):
269
+ export_cosyvoice2_vllm(self.llm, model_dir, self.device)
270
+ from vllm import EngineArgs, LLMEngine
271
+ engine_args = EngineArgs(model=model_dir,
272
+ skip_tokenizer_init=True,
273
+ enable_prompt_embeds=True,
274
+ gpu_memory_utilization=0.2)
275
+ self.llm.vllm = LLMEngine.from_engine_args(engine_args)
276
+ self.llm.lock = threading.Lock()
277
+ del self.llm.llm.model.model.layers
278
+
279
+ def token2wav(self, token, prompt_token, prompt_feat, embedding, token_offset, uuid, stream=False, finalize=False, speed=1.0):
280
+ with torch.cuda.amp.autocast(self.fp16):
281
+ tts_mel, _ = self.flow.inference(token=token.to(self.device, dtype=torch.int32),
282
+ token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
283
+ prompt_token=prompt_token.to(self.device),
284
+ prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
285
+ prompt_feat=prompt_feat.to(self.device),
286
+ prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device),
287
+ embedding=embedding.to(self.device),
288
+ streaming=stream,
289
+ finalize=finalize)
290
+ tts_mel = tts_mel[:, :, token_offset * self.flow.token_mel_ratio:]
291
+ # append hift cache
292
+ if self.hift_cache_dict[uuid] is not None:
293
+ hift_cache_mel, hift_cache_source = self.hift_cache_dict[uuid]['mel'], self.hift_cache_dict[uuid]['source']
294
+ tts_mel = torch.concat([hift_cache_mel, tts_mel], dim=2)
295
+ else:
296
+ hift_cache_source = torch.zeros(1, 1, 0)
297
+ # keep overlap mel and hift cache
298
+ if finalize is False:
299
+ tts_speech, tts_source = self.hift.inference(speech_feat=tts_mel, cache_source=hift_cache_source)
300
+ if self.hift_cache_dict[uuid] is not None:
301
+ tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
302
+ self.hift_cache_dict[uuid] = {'mel': tts_mel[:, :, -self.mel_cache_len:],
303
+ 'source': tts_source[:, :, -self.source_cache_len:],
304
+ 'speech': tts_speech[:, -self.source_cache_len:]}
305
+ tts_speech = tts_speech[:, :-self.source_cache_len]
306
+ else:
307
+ if speed != 1.0:
308
+ assert self.hift_cache_dict[uuid] is None, 'speed change only support non-stream inference mode'
309
+ tts_mel = F.interpolate(tts_mel, size=int(tts_mel.shape[2] / speed), mode='linear')
310
+ tts_speech, tts_source = self.hift.inference(speech_feat=tts_mel, cache_source=hift_cache_source)
311
+ if self.hift_cache_dict[uuid] is not None:
312
+ tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
313
+ return tts_speech
314
+
315
+ def tts(self, text=torch.zeros(1, 0, dtype=torch.int32), flow_embedding=torch.zeros(0, 192), llm_embedding=torch.zeros(0, 192),
316
+ prompt_text=torch.zeros(1, 0, dtype=torch.int32),
317
+ llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
318
+ flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
319
+ prompt_speech_feat=torch.zeros(1, 0, 80), source_speech_token=torch.zeros(1, 0, dtype=torch.int32), stream=False, speed=1.0, **kwargs):
320
+ # this_uuid is used to track variables related to this inference thread
321
+ this_uuid = str(uuid.uuid1())
322
+ with self.lock:
323
+ self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = [], False
324
+ self.hift_cache_dict[this_uuid] = None
325
+ if source_speech_token.shape[1] == 0:
326
+ p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid))
327
+ else:
328
+ p = threading.Thread(target=self.vc_job, args=(source_speech_token, this_uuid))
329
+ p.start()
330
+ if stream is True:
331
+ token_offset = 0
332
+ prompt_token_pad = int(np.ceil(flow_prompt_speech_token.shape[1] / self.token_hop_len) * self.token_hop_len - flow_prompt_speech_token.shape[1])
333
+ while True:
334
+ time.sleep(0.1)
335
+ this_token_hop_len = self.token_hop_len + prompt_token_pad if token_offset == 0 else self.token_hop_len
336
+ if len(self.tts_speech_token_dict[this_uuid]) - token_offset >= this_token_hop_len + self.flow.pre_lookahead_len:
337
+ this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_offset + this_token_hop_len + self.flow.pre_lookahead_len]).unsqueeze(dim=0)
338
+ this_tts_speech = self.token2wav(token=this_tts_speech_token,
339
+ prompt_token=flow_prompt_speech_token,
340
+ prompt_feat=prompt_speech_feat,
341
+ embedding=flow_embedding,
342
+ token_offset=token_offset,
343
+ uuid=this_uuid,
344
+ stream=stream,
345
+ finalize=False)
346
+ token_offset += this_token_hop_len
347
+ yield {'tts_speech': this_tts_speech.cpu()}
348
+ if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) - token_offset < this_token_hop_len + self.flow.pre_lookahead_len:
349
+ break
350
+ p.join()
351
+ # deal with remaining tokens, make sure the remaining token length equals token_hop_len when cache_speech is not None
352
+ this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
353
+ this_tts_speech = self.token2wav(token=this_tts_speech_token,
354
+ prompt_token=flow_prompt_speech_token,
355
+ prompt_feat=prompt_speech_feat,
356
+ embedding=flow_embedding,
357
+ token_offset=token_offset,
358
+ uuid=this_uuid,
359
+ finalize=True)
360
+ yield {'tts_speech': this_tts_speech.cpu()}
361
+ else:
362
+ # deal with all tokens
363
+ p.join()
364
+ this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
365
+ this_tts_speech = self.token2wav(token=this_tts_speech_token,
366
+ prompt_token=flow_prompt_speech_token,
367
+ prompt_feat=prompt_speech_feat,
368
+ embedding=flow_embedding,
369
+ token_offset=0,
370
+ uuid=this_uuid,
371
+ finalize=True,
372
+ speed=speed)
373
+ yield {'tts_speech': this_tts_speech.cpu()}
374
+ with self.lock:
375
+ self.tts_speech_token_dict.pop(this_uuid)
376
+ self.llm_end_dict.pop(this_uuid)
377
+ self.hift_cache_dict.pop(this_uuid)
378
+ if torch.cuda.is_available():
379
+ torch.cuda.empty_cache()
380
+ torch.cuda.current_stream().synchronize()
381
+
382
+
383
+ class CosyVoice3Model(CosyVoice2Model):
384
+
385
+ def __init__(self,
386
+ llm: torch.nn.Module,
387
+ flow: torch.nn.Module,
388
+ hift: torch.nn.Module,
389
+ fp16: bool = False):
390
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
391
+ self.llm = llm
392
+ self.flow = flow
393
+ self.hift = hift
394
+ self.fp16 = fp16
395
+ # NOTE must matching training static_chunk_size
396
+ self.token_hop_len = 25
397
+ # rtf and decoding related
398
+ self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
399
+ self.lock = threading.Lock()
400
+ # dict used to store session related variable
401
+ self.tts_speech_token_dict = {}
402
+ self.llm_end_dict = {}
403
+ self.hift_cache_dict = {}
404
+
405
+ def token2wav(self, token, prompt_token, prompt_feat, embedding, token_offset, uuid, stream=False, finalize=False, speed=1.0):
406
+ with torch.cuda.amp.autocast(self.fp16):
407
+ tts_mel, _ = self.flow.inference(token=token.to(self.device, dtype=torch.int32),
408
+ token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
409
+ prompt_token=prompt_token.to(self.device),
410
+ prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
411
+ prompt_feat=prompt_feat.to(self.device),
412
+ prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device),
413
+ embedding=embedding.to(self.device),
414
+ streaming=stream,
415
+ finalize=finalize)
416
+ tts_mel = tts_mel[:, :, token_offset * self.flow.token_mel_ratio:]
417
+ # append mel cache
418
+ if self.hift_cache_dict[uuid] is not None:
419
+ hift_cache_mel = self.hift_cache_dict[uuid]['mel']
420
+ tts_mel = torch.concat([hift_cache_mel, tts_mel], dim=2)
421
+ self.hift_cache_dict[uuid]['mel'] = tts_mel
422
+ else:
423
+ self.hift_cache_dict[uuid] = {'mel': tts_mel, 'speech_offset': 0}
424
+ if speed != 1.0:
425
+ assert token_offset == 0 and finalize is True, 'speed change is only supported in non-stream inference mode'
426
+ tts_mel = F.interpolate(tts_mel, size=int(tts_mel.shape[2] / speed), mode='linear')
427
+ tts_speech, _ = self.hift.inference(speech_feat=tts_mel, finalize=finalize)
428
+ tts_speech = tts_speech[:, self.hift_cache_dict[uuid]['speech_offset']:]
429
+ self.hift_cache_dict[uuid]['speech_offset'] += tts_speech.shape[1]
430
+ return tts_speech
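
Since tts() is a generator that yields audio chunk dicts, a streaming caller typically concatenates the chunks; a minimal sketch, assuming a loaded model and a model_input dict prepared by the frontend:

```python
# Hypothetical streaming consumption sketch; `model` and `model_input` are prepared elsewhere.
import torch

chunks = []
for out in model.tts(**model_input, stream=True):
    chunks.append(out['tts_speech'])       # each chunk is a [1, num_samples] CPU tensor
speech = torch.cat(chunks, dim=1)          # full utterance, ready to save or play back
```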
cosyvoice/dataset/__init__.py ADDED
File without changes
cosyvoice/dataset/dataset.py ADDED
@@ -0,0 +1,151 @@
1
+ # Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
2
+ # 2024 Alibaba Inc (authors: Xiang Lyu)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import random
17
+ import math
18
+ from functools import partial
19
+
20
+ import torch
21
+ import torch.distributed as dist
22
+ from torch.utils.data import IterableDataset
23
+ from cosyvoice.utils.file_utils import read_lists
24
+
25
+
26
+ class Processor(IterableDataset):
27
+
28
+ def __init__(self, source, f, *args, **kw):
29
+ assert callable(f)
30
+ self.source = source
31
+ self.f = f
32
+ self.args = args
33
+ self.kw = kw
34
+
35
+ def set_epoch(self, epoch):
36
+ self.source.set_epoch(epoch)
37
+
38
+ def __iter__(self):
39
+ """ Return an iterator over the source dataset processed by the
40
+ given processor.
41
+ """
42
+ assert self.source is not None
43
+ assert callable(self.f)
44
+ return self.f(iter(self.source), *self.args, **self.kw)
45
+
46
+ def apply(self, f):
47
+ assert callable(f)
48
+ return Processor(self, f, *self.args, **self.kw)
49
+
50
+
51
+ class DistributedSampler:
52
+
53
+ def __init__(self, shuffle=True, partition=True):
54
+ self.epoch = -1
55
+ self.update()
56
+ self.shuffle = shuffle
57
+ self.partition = partition
58
+
59
+ def update(self):
60
+ assert dist.is_available()
61
+ if dist.is_initialized():
62
+ self.rank = dist.get_rank()
63
+ self.world_size = dist.get_world_size()
64
+ else:
65
+ self.rank = 0
66
+ self.world_size = 1
67
+ worker_info = torch.utils.data.get_worker_info()
68
+ if worker_info is None:
69
+ self.worker_id = 0
70
+ self.num_workers = 1
71
+ else:
72
+ self.worker_id = worker_info.id
73
+ self.num_workers = worker_info.num_workers
74
+ return dict(rank=self.rank,
75
+ world_size=self.world_size,
76
+ worker_id=self.worker_id,
77
+ num_workers=self.num_workers)
78
+
79
+ def set_epoch(self, epoch):
80
+ self.epoch = epoch
81
+
82
+ def sample(self, data):
83
+ """ Sample data according to rank/world_size/num_workers
84
+
85
+ Args:
86
+ data(List): input data list
87
+
88
+ Returns:
89
+ List: data list after sample
90
+ """
91
+ data = list(range(len(data)))
92
+ # pad the data list so it partitions evenly across ranks and workers
93
+ if self.partition:
94
+ if self.shuffle:
95
+ random.Random(self.epoch).shuffle(data)
96
+ if len(data) < self.world_size:
97
+ data = data * math.ceil(self.world_size / len(data))
98
+ data = data[:self.world_size]
99
+ data = data[self.rank::self.world_size]
100
+ if len(data) < self.num_workers:
101
+ data = data * math.ceil(self.num_workers / len(data))
102
+ data = data[:self.num_workers]
103
+ data = data[self.worker_id::self.num_workers]
104
+ return data
105
+
106
+
107
+ class DataList(IterableDataset):
108
+
109
+ def __init__(self, lists, shuffle=True, partition=True):
110
+ self.lists = lists
111
+ self.sampler = DistributedSampler(shuffle, partition)
112
+
113
+ def set_epoch(self, epoch):
114
+ self.sampler.set_epoch(epoch)
115
+
116
+ def __iter__(self):
117
+ sampler_info = self.sampler.update()
118
+ indexes = self.sampler.sample(self.lists)
119
+ for index in indexes:
120
+ data = dict(src=self.lists[index])
121
+ data.update(sampler_info)
122
+ yield data
123
+
124
+
125
+ def Dataset(data_list_file,
126
+ data_pipeline,
127
+ mode='train',
128
+ gan=False,
129
+ dpo=False,
130
+ shuffle=True,
131
+ partition=True):
132
+ """ Construct dataset from arguments
133
+
134
+ We have two shuffle stages in the Dataset. The first is a global
135
+ shuffle at the shard/raw file level. The second is a global shuffle
136
+ at the training-sample level.
137
+
138
+ Args:
139
+ data_list_file(str): file listing the parquet/raw data shards
140
+ data_pipeline(List[Callable]): processor functions applied in order
141
+ partition(bool): whether to do data partition in terms of rank
142
+ """
143
+ lists = read_lists(data_list_file)
144
+ dataset = DataList(lists,
145
+ shuffle=shuffle,
146
+ partition=partition)
147
+ # map partial arg to padding func
148
+ data_pipeline[-1] = partial(data_pipeline[-1], gan=gan, dpo=dpo)
149
+ for func in data_pipeline:
150
+ dataset = Processor(dataset, func, mode=mode)
151
+ return dataset
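A minimal usage sketch of the pieces above, for orientation only: it assumes a data.list file whose lines point at parquet shards and it borrows the processor functions added in cosyvoice/dataset/processor.py below. The tokenizer factory and feature extractor are hypothetical placeholders, and the exact pipeline order in the released recipes may differ.

# Hypothetical pipeline sketch; not part of the upload. Iterating it requires real shards.
from functools import partial
from torch.utils.data import DataLoader

from cosyvoice.dataset.dataset import Dataset
from cosyvoice.dataset import processor


def my_get_tokenizer():                        # hypothetical stand-in for the tokenizer factory
    raise NotImplementedError('plug in the CosyVoice tokenizer factory here')


def my_feat_extractor(waveform):               # hypothetical stand-in for the mel extractor
    raise NotImplementedError('plug in the mel-spectrogram extractor here')


data_pipeline = [
    processor.parquet_opener,                                            # open parquet shards from data.list
    partial(processor.tokenize, get_tokenizer=my_get_tokenizer,
            allowed_special='all'),
    processor.filter,                                                    # drop too short/long utterances
    partial(processor.resample, resample_rate=24000),                    # target rate is an assumption
    partial(processor.compute_fbank, feat_extractor=my_feat_extractor),
    partial(processor.parse_embedding, normalize=True),
    processor.shuffle,
    processor.sort,
    processor.batch,                                                     # static batching by default
    partial(processor.padding, use_spk_embedding=False),                 # Dataset() re-binds gan/dpo here
]

train_dataset = Dataset('data.list', data_pipeline=data_pipeline, mode='train',
                        gan=False, dpo=False, shuffle=True, partition=True)
train_dataset.set_epoch(0)
# batching and padding already happen inside the pipeline, so batch_size=None here
train_loader = DataLoader(train_dataset, batch_size=None, num_workers=2)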
cosyvoice/dataset/processor.py ADDED
@@ -0,0 +1,443 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import logging
15
+ import random
16
+
17
+ import pyarrow.parquet as pq
18
+ from io import BytesIO
19
+ import torch
20
+ import torchaudio
21
+ from torch.nn.utils.rnn import pad_sequence
22
+ import torch.nn.functional as F
23
+ import pyworld as pw
24
+
25
+
26
+ AUDIO_FORMAT_SETS = {'flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma'}
27
+
28
+
29
+ def parquet_opener(data, mode='train', tts_data={}):
30
+ """ Give url or local file, return file descriptor
31
+ Inplace operation.
32
+
33
+ Args:
34
+ data(Iterable[str]): url or local file list
35
+
36
+ Returns:
37
+ Iterable[{src, ...parquet row fields}]
38
+ """
39
+ for sample in data:
40
+ assert 'src' in sample
41
+ url = sample['src']
42
+ try:
43
+ for df in pq.ParquetFile(url).iter_batches(batch_size=64):
44
+ df = df.to_pandas()
45
+ for i in range(len(df)):
46
+ sample.update(dict(df.loc[i]))
47
+ if mode == 'train':
48
+ # NOTE do not return sample directly, must initialize a new dict
49
+ yield {**sample}
50
+ else:
51
+ for index, text in enumerate(tts_data[df.loc[i, 'utt']]):
52
+ yield {**sample, 'tts_index': index, 'tts_text': text}
53
+ except Exception as ex:
54
+ logging.warning('Failed to open {}, ex info {}'.format(url, ex))
55
+
56
+
57
+ def filter(data,
58
+ max_length=10240,
59
+ min_length=10,
60
+ token_max_length=200,
61
+ token_min_length=1,
62
+ min_output_input_ratio=0.0005,
63
+ max_output_input_ratio=1,
64
+ mode='train'):
65
+ """ Filter sample according to feature and label length
66
+ Inplace operation.
67
+
68
+ Args:
69
+ data: Iterable[{key, wav, label, sample_rate}]
70
+ max_length: drop utterance which is greater than max_length(10ms)
71
+ min_length: drop utterance which is less than min_length(10ms)
72
+ token_max_length: drop utterance which is greater than
73
+ token_max_length, especially when using char units for
74
+ English modeling
75
+ token_min_length: drop utterance which is
76
+ less than token_min_length
77
+ min_output_input_ratio: minimal ratio of
78
+ token_length / feats_length(10ms)
79
+ max_output_input_ratio: maximum ratio of
80
+ token_length / feats_length(10ms)
81
+
82
+ Returns:
83
+ Iterable[{key, wav, label, sample_rate}]
84
+ """
85
+ for sample in data:
86
+ sample['speech'], sample['sample_rate'] = torchaudio.load(BytesIO(sample['audio_data']))
87
+ sample['speech'] = sample['speech'].mean(dim=0, keepdim=True)
88
+ del sample['audio_data']
89
+ # sample['speech'] is a torch.Tensor; we count 100 frames per second
90
+ num_frames = sample['speech'].size(1) / sample['sample_rate'] * 100
91
+ if num_frames < min_length:
92
+ continue
93
+ if num_frames > max_length:
94
+ continue
95
+ if len(sample['text_token']) < token_min_length:
96
+ continue
97
+ if len(sample['text_token']) > token_max_length:
98
+ continue
99
+ if len(sample['speech_token']) == 0:
100
+ continue
101
+ if 'reject_speech_token' in sample and len(sample['reject_speech_token']) == 0:
102
+ continue
103
+ if num_frames != 0:
104
+ if len(sample['text_token']) / num_frames < min_output_input_ratio:
105
+ continue
106
+ if len(sample['text_token']) / num_frames > max_output_input_ratio:
107
+ continue
108
+ yield sample
109
+
110
+
111
+ def resample(data, resample_rate=22050, min_sample_rate=16000, mode='train'):
112
+ """ Resample data.
113
+ Inplace operation.
114
+
115
+ Args:
116
+ data: Iterable[{key, wav, label, sample_rate}]
117
+ resample_rate: target resample rate
118
+
119
+ Returns:
120
+ Iterable[{key, wav, label, sample_rate}]
121
+ """
122
+ for sample in data:
123
+ assert 'sample_rate' in sample
124
+ assert 'speech' in sample
125
+ sample_rate = sample['sample_rate']
126
+ waveform = sample['speech']
127
+ if sample_rate != resample_rate:
128
+ if sample_rate < min_sample_rate:
129
+ continue
130
+ sample['sample_rate'] = resample_rate
131
+ sample['speech'] = torchaudio.transforms.Resample(
132
+ orig_freq=sample_rate, new_freq=resample_rate)(waveform)
133
+ max_val = sample['speech'].abs().max()
134
+ if max_val > 1:
135
+ sample['speech'] /= max_val
136
+ yield sample
137
+
138
+
139
+ def truncate(data, truncate_length=24576, mode='train'):
140
+ """ Truncate data.
141
+
142
+ Args:
143
+ data: Iterable[{key, wav, label, sample_rate}]
144
+ truncate_length: truncate length
145
+
146
+ Returns:
147
+ Iterable[{key, wav, label, sample_rate}]
148
+ """
149
+ for sample in data:
150
+ waveform = sample['speech']
151
+ if waveform.shape[1] > truncate_length:
152
+ start = random.randint(0, waveform.shape[1] - truncate_length)
153
+ waveform = waveform[:, start: start + truncate_length]
154
+ else:
155
+ waveform = torch.concat([waveform, torch.zeros(1, truncate_length - waveform.shape[1])], dim=1)
156
+ sample['speech'] = waveform
157
+ yield sample
158
+
159
+
160
+ def compute_fbank(data,
161
+ feat_extractor,
162
+ token_mel_ratio=0,
163
+ mode='train'):
164
+ """ Extract fbank
165
+
166
+ Args:
167
+ data: Iterable[{key, wav, label, sample_rate}]
168
+
169
+ Returns:
170
+ Iterable[{key, feat, label}]
171
+ """
172
+ for sample in data:
173
+ assert 'sample_rate' in sample
174
+ assert 'speech' in sample
175
+ assert 'utt' in sample
176
+ assert 'text_token' in sample
177
+ waveform = sample['speech']
178
+ feat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1)
179
+ if token_mel_ratio != 0:
180
+ # trim to align speech_token and speech_feat
181
+ token_len = int(min(feat.shape[0] / token_mel_ratio, sample["speech_token"].shape[0]))
182
+ feat = feat[:token_mel_ratio * token_len]
183
+ sample["speech_token"] = sample["speech_token"][:token_len]
184
+ sample['speech_feat'] = feat
185
+ yield sample
186
+
187
+
188
+ def compute_f0(data, sample_rate, hop_size, mode='train'):
189
+ """ Extract f0
190
+
191
+ Args:
192
+ data: Iterable[{key, wav, label, sample_rate}]
193
+
194
+ Returns:
195
+ Iterable[{key, feat, label}]
196
+ """
197
+ frame_period = hop_size * 1000 / sample_rate
198
+ for sample in data:
199
+ assert 'sample_rate' in sample
200
+ assert 'speech' in sample
201
+ assert 'utt' in sample
202
+ assert 'text_token' in sample
203
+ waveform = sample['speech']
204
+ _f0, t = pw.harvest(waveform.squeeze(dim=0).numpy().astype('double'), sample_rate, frame_period=frame_period)
205
+ if sum(_f0 != 0) < 5: # this happens when the algorithm fails
206
+ _f0, t = pw.dio(waveform.squeeze(dim=0).numpy().astype('double'), sample_rate, frame_period=frame_period) # if harvest fails, try dio
207
+ f0 = pw.stonemask(waveform.squeeze(dim=0).numpy().astype('double'), _f0, t, sample_rate)
208
+ f0 = F.interpolate(torch.from_numpy(f0).view(1, 1, -1), size=sample['speech_feat'].shape[0], mode='linear').view(-1)
209
+ sample['pitch_feat'] = f0
210
+ yield sample
211
+
212
+
213
+ def parse_embedding(data, normalize, mode='train'):
214
+ """ Parse utt_embedding/spk_embedding
215
+
216
+ Args:
217
+ data: Iterable[{key, wav, label, sample_rate}]
218
+
219
+ Returns:
220
+ Iterable[{key, feat, label}]
221
+ """
222
+ for sample in data:
223
+ sample['utt_embedding'] = torch.tensor(sample['utt_embedding'], dtype=torch.float32)
224
+ sample['spk_embedding'] = torch.tensor(sample['spk_embedding'], dtype=torch.float32)
225
+ if normalize:
226
+ sample['utt_embedding'] = F.normalize(sample['utt_embedding'], dim=0)
227
+ sample['spk_embedding'] = F.normalize(sample['spk_embedding'], dim=0)
228
+ yield sample
229
+
230
+
231
+ def tokenize(data, get_tokenizer, allowed_special, mode='train'):
232
+ """ Decode text to chars or BPE
233
+ Inplace operation
234
+
235
+ Args:
236
+ data: Iterable[{key, wav, txt, sample_rate}]
237
+
238
+ Returns:
239
+ Iterable[{key, wav, txt, tokens, label, sample_rate}]
240
+ """
241
+ tokenizer = get_tokenizer()
242
+ for sample in data:
243
+ assert 'text' in sample
244
+ sample['text_token'] = tokenizer.encode(sample['text'], allowed_special=allowed_special)
245
+ if 'instruct' in sample:
246
+ sample['instruct_token'] = tokenizer.encode(sample['instruct'], allowed_special=allowed_special)
247
+ else:
248
+ sample['instruct_token'] = tokenizer.encode('', allowed_special=allowed_special)
249
+ yield sample
250
+
251
+
252
+ def shuffle(data, shuffle_size=10000, mode='train'):
253
+ """ Local shuffle the data
254
+
255
+ Args:
256
+ data: Iterable[{key, feat, label}]
257
+ shuffle_size: buffer size for shuffle
258
+
259
+ Returns:
260
+ Iterable[{key, feat, label}]
261
+ """
262
+ buf = []
263
+ for sample in data:
264
+ buf.append(sample)
265
+ if len(buf) >= shuffle_size:
266
+ random.shuffle(buf)
267
+ for x in buf:
268
+ yield x
269
+ buf = []
270
+ # The sample left over
271
+ random.shuffle(buf)
272
+ for x in buf:
273
+ yield x
274
+
275
+
276
+ def sort(data, sort_size=500, mode='train'):
277
+ """ Sort the data by feature length.
278
+ Sort is used after shuffle and before batch, so we can group
279
+ utts with similar lengths into a batch, and `sort_size` should
280
+ be less than `shuffle_size`
281
+
282
+ Args:
283
+ data: Iterable[{key, feat, label}]
284
+ sort_size: buffer size for sort
285
+
286
+ Returns:
287
+ Iterable[{key, feat, label}]
288
+ """
289
+
290
+ buf = []
291
+ for sample in data:
292
+ buf.append(sample)
293
+ if len(buf) >= sort_size:
294
+ buf.sort(key=lambda x: x['speech_feat'].size(0))
295
+ for x in buf:
296
+ yield x
297
+ buf = []
298
+ # The sample left over
299
+ buf.sort(key=lambda x: x['speech_feat'].size(0))
300
+ for x in buf:
301
+ yield x
302
+
303
+
304
+ def static_batch(data, batch_size=16):
305
+ """ Static batch the data by `batch_size`
306
+
307
+ Args:
308
+ data: Iterable[{key, feat, label}]
309
+ batch_size: batch size
310
+
311
+ Returns:
312
+ Iterable[List[{key, feat, label}]]
313
+ """
314
+ buf = []
315
+ for sample in data:
316
+ buf.append(sample)
317
+ if len(buf) >= batch_size:
318
+ yield buf
319
+ buf = []
320
+ if len(buf) > 0:
321
+ yield buf
322
+
323
+
324
+ def dynamic_batch(data, max_frames_in_batch=12000, mode='train'):
325
+ """ Dynamic batch the data until the total frames in batch
326
+ reach `max_frames_in_batch`
327
+
328
+ Args:
329
+ data: Iterable[{key, feat, label}]
330
+ max_frames_in_batch: max_frames in one batch
331
+
332
+ Returns:
333
+ Iterable[List[{key, feat, label}]]
334
+ """
335
+ buf = []
336
+ longest_frames = 0
337
+ for sample in data:
338
+ assert 'speech_feat' in sample
339
+ assert isinstance(sample['speech_feat'], torch.Tensor)
340
+ new_sample_frames = sample['speech_feat'].size(0)
341
+ longest_frames = max(longest_frames, new_sample_frames)
342
+ frames_after_padding = longest_frames * (len(buf) + 1)
343
+ if frames_after_padding > max_frames_in_batch:
344
+ yield buf
345
+ buf = [sample]
346
+ longest_frames = new_sample_frames
347
+ else:
348
+ buf.append(sample)
349
+ if len(buf) > 0:
350
+ yield buf
351
+
352
+
353
+ def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000, mode='train'):
354
+ """ Wrapper for static/dynamic batch
355
+ """
356
+ if batch_type == 'static':
357
+ return static_batch(data, batch_size)
358
+ elif batch_type == 'dynamic':
359
+ return dynamic_batch(data, max_frames_in_batch)
360
+ else:
361
+ logging.fatal('Unsupported batch type {}'.format(batch_type))
362
+
363
+
364
+ def padding(data, use_spk_embedding, mode='train', gan=False, dpo=False):
365
+ """ Padding the data into training data
366
+
367
+ Args:
368
+ data: Iterable[List[{key, feat, label}]]
369
+
370
+ Returns:
371
+ Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)]
372
+ """
373
+ for sample in data:
374
+ assert isinstance(sample, list)
375
+ speech_feat_len = torch.tensor([x['speech_feat'].size(1) for x in sample],
376
+ dtype=torch.int32)
377
+ order = torch.argsort(speech_feat_len, descending=True)
378
+
379
+ utts = [sample[i]['utt'] for i in order]
380
+ speech = [sample[i]['speech'].squeeze(dim=0) for i in order]
381
+ speech_len = torch.tensor([i.size(0) for i in speech], dtype=torch.int32)
382
+ speech = pad_sequence(speech, batch_first=True, padding_value=0)
383
+ speech_token = [torch.tensor(sample[i]['speech_token']) for i in order]
384
+ speech_token_len = torch.tensor([i.size(0) for i in speech_token], dtype=torch.int32)
385
+ speech_token = pad_sequence(speech_token,
386
+ batch_first=True,
387
+ padding_value=0)
388
+ speech_feat = [sample[i]['speech_feat'] for i in order]
389
+ speech_feat_len = torch.tensor([i.size(0) for i in speech_feat], dtype=torch.int32)
390
+ speech_feat = pad_sequence(speech_feat,
391
+ batch_first=True,
392
+ padding_value=0)
393
+ text = [sample[i]['text'] for i in order]
394
+ text_token = [torch.tensor(sample[i]['text_token']) for i in order]
395
+ text_token_len = torch.tensor([i.size(0) for i in text_token], dtype=torch.int32)
396
+ text_token = pad_sequence(text_token, batch_first=True, padding_value=0)
397
+ instruct_token = [torch.tensor(sample[i]['instruct_token']) for i in order]
398
+ instruct_token_len = torch.tensor([i.size(0) for i in instruct_token], dtype=torch.int32)
399
+ instruct_token = pad_sequence(instruct_token, batch_first=True, padding_value=0)
400
+ utt_embedding = torch.stack([sample[i]['utt_embedding'] for i in order], dim=0)
401
+ spk_embedding = torch.stack([sample[i]['spk_embedding'] for i in order], dim=0)
402
+ batch = {
403
+ "utts": utts,
404
+ "speech": speech,
405
+ "speech_len": speech_len,
406
+ "speech_token": speech_token,
407
+ "speech_token_len": speech_token_len,
408
+ "speech_feat": speech_feat,
409
+ "speech_feat_len": speech_feat_len,
410
+ "text": text,
411
+ "text_token": text_token,
412
+ "text_token_len": text_token_len,
413
+ "instruct_token": instruct_token,
414
+ "instruct_token_len": instruct_token_len,
415
+ "utt_embedding": utt_embedding,
416
+ "spk_embedding": spk_embedding,
417
+ }
418
+ if gan is True:
419
+ # in gan train, we need pitch_feat
420
+ pitch_feat = [sample[i]['pitch_feat'] for i in order]
421
+ pitch_feat_len = torch.tensor([i.size(0) for i in pitch_feat], dtype=torch.int32)
422
+ pitch_feat = pad_sequence(pitch_feat,
423
+ batch_first=True,
424
+ padding_value=0)
425
+ batch["pitch_feat"] = pitch_feat
426
+ batch["pitch_feat_len"] = pitch_feat_len
427
+ else:
428
+ # only gan train needs speech, delete it to save memory
429
+ del batch["speech"]
430
+ del batch["speech_len"]
431
+ if dpo is True:
432
+ reject_speech_token = [torch.tensor(sample[i]['reject_speech_token']) for i in order]
433
+ reject_speech_token_len = torch.tensor([i.size(0) for i in reject_speech_token], dtype=torch.int32)
434
+ reject_speech_token = pad_sequence(reject_speech_token,
435
+ batch_first=True,
436
+ padding_value=0)
437
+ batch['reject_speech_token'] = reject_speech_token
438
+ batch['reject_speech_token_len'] = reject_speech_token_len
439
+ if use_spk_embedding is True:
440
+ batch["embedding"] = batch["spk_embedding"]
441
+ else:
442
+ batch["embedding"] = batch["utt_embedding"]
443
+ yield batch
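A quick, self-contained check of the dynamic batching rule above (not part of the upload): the budget is applied to the padded size, i.e. longest_frames * (len(buf) + 1), so with an 800-frame budget the dummy lengths below come out as [120, 100], then [400, 380], then [50].

import torch
from cosyvoice.dataset.processor import dynamic_batch

# five dummy samples whose only relevant field is the mel length
fake_samples = [{'speech_feat': torch.zeros(n, 80)} for n in (120, 100, 400, 380, 50)]
for i, batch in enumerate(dynamic_batch(iter(fake_samples), max_frames_in_batch=800)):
    lengths = [s['speech_feat'].size(0) for s in batch]
    print(i, lengths, 'padded frames =', max(lengths) * len(lengths))
# 0 [120, 100] padded frames = 240
# 1 [400, 380] padded frames = 800
# 2 [50] padded frames = 50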
cosyvoice/flow/DiT/dit.py ADDED
@@ -0,0 +1,176 @@
1
+
2
+ """
3
+ ein notation:
4
+ b - batch
5
+ n - sequence
6
+ nt - text sequence
7
+ nw - raw wave length
8
+ d - dimension
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import torch
14
+ from torch import nn
15
+ import torch.nn.functional as F
16
+ from einops import repeat
17
+ from x_transformers.x_transformers import RotaryEmbedding
18
+ from cosyvoice.utils.mask import add_optional_chunk_mask
19
+ from cosyvoice.flow.DiT.modules import (
20
+ TimestepEmbedding,
21
+ ConvNeXtV2Block,
22
+ CausalConvPositionEmbedding,
23
+ DiTBlock,
24
+ AdaLayerNormZero_Final,
25
+ precompute_freqs_cis,
26
+ get_pos_embed_indices,
27
+ )
28
+
29
+
30
+ # Text embedding
31
+
32
+
33
+ class TextEmbedding(nn.Module):
34
+ def __init__(self, text_num_embeds, text_dim, conv_layers=0, conv_mult=2):
35
+ super().__init__()
36
+ self.text_embed = nn.Embedding(text_num_embeds + 1, text_dim) # use 0 as filler token
37
+
38
+ if conv_layers > 0:
39
+ self.extra_modeling = True
40
+ self.precompute_max_pos = 4096 # ~44s of 24khz audio
41
+ self.register_buffer("freqs_cis", precompute_freqs_cis(text_dim, self.precompute_max_pos), persistent=False)
42
+ self.text_blocks = nn.Sequential(
43
+ *[ConvNeXtV2Block(text_dim, text_dim * conv_mult) for _ in range(conv_layers)]
44
+ )
45
+ else:
46
+ self.extra_modeling = False
47
+
48
+ def forward(self, text: int["b nt"], seq_len, drop_text=False): # noqa: F722
49
+ batch, text_len = text.shape[0], text.shape[1]
50
+ text = text + 1 # use 0 as filler token. preprocess of batch pad -1, see list_str_to_idx()
51
+ text = text[:, :seq_len] # curtail if character tokens are more than the mel spec tokens
52
+ text = F.pad(text, (0, seq_len - text_len), value=0)
53
+
54
+ if drop_text: # cfg for text
55
+ text = torch.zeros_like(text)
56
+
57
+ text = self.text_embed(text) # b n -> b n d
58
+
59
+ # possible extra modeling
60
+ if self.extra_modeling:
61
+ # sinus pos emb
62
+ batch_start = torch.zeros((batch,), dtype=torch.long)
63
+ pos_idx = get_pos_embed_indices(batch_start, seq_len, max_pos=self.precompute_max_pos)
64
+ text_pos_embed = self.freqs_cis[pos_idx]
65
+ text = text + text_pos_embed
66
+
67
+ # convnextv2 blocks
68
+ text = self.text_blocks(text)
69
+
70
+ return text
71
+
72
+
73
+ # noised input audio and context mixing embedding
74
+
75
+
76
+ class InputEmbedding(nn.Module):
77
+ def __init__(self, mel_dim, text_dim, out_dim, spk_dim=None):
78
+ super().__init__()
79
+ spk_dim = 0 if spk_dim is None else spk_dim
80
+ self.spk_dim = spk_dim
81
+ self.proj = nn.Linear(mel_dim * 2 + text_dim + spk_dim, out_dim)
82
+ self.conv_pos_embed = CausalConvPositionEmbedding(dim=out_dim)
83
+
84
+ def forward(
85
+ self,
86
+ x: float["b n d"],
87
+ cond: float["b n d"],
88
+ text_embed: float["b n d"],
89
+ spks: float["b d"],
90
+ ):
91
+ to_cat = [x, cond, text_embed]
92
+ if self.spk_dim > 0:
93
+ spks = repeat(spks, "b c -> b t c", t=x.shape[1])
94
+ to_cat.append(spks)
95
+
96
+ x = self.proj(torch.cat(to_cat, dim=-1))
97
+ x = self.conv_pos_embed(x) + x
98
+ return x
99
+
100
+
101
+ # Transformer backbone using DiT blocks
102
+
103
+
104
+ class DiT(nn.Module):
105
+ def __init__(
106
+ self,
107
+ *,
108
+ dim,
109
+ depth=8,
110
+ heads=8,
111
+ dim_head=64,
112
+ dropout=0.1,
113
+ ff_mult=4,
114
+ mel_dim=80,
115
+ mu_dim=None,
116
+ long_skip_connection=False,
117
+ spk_dim=None,
118
+ out_channels=None,
119
+ static_chunk_size=50,
120
+ num_decoding_left_chunks=2
121
+ ):
122
+ super().__init__()
123
+
124
+ self.time_embed = TimestepEmbedding(dim)
125
+ if mu_dim is None:
126
+ mu_dim = mel_dim
127
+ self.input_embed = InputEmbedding(mel_dim, mu_dim, dim, spk_dim)
128
+
129
+ self.rotary_embed = RotaryEmbedding(dim_head)
130
+
131
+ self.dim = dim
132
+ self.depth = depth
133
+
134
+ self.transformer_blocks = nn.ModuleList(
135
+ [DiTBlock(dim=dim, heads=heads, dim_head=dim_head, ff_mult=ff_mult, dropout=dropout) for _ in range(depth)]
136
+ )
137
+ self.long_skip_connection = nn.Linear(dim * 2, dim, bias=False) if long_skip_connection else None
138
+
139
+ self.norm_out = AdaLayerNormZero_Final(dim) # final modulation
140
+ self.proj_out = nn.Linear(dim, mel_dim)
141
+ self.out_channels = out_channels
142
+ self.static_chunk_size = static_chunk_size
143
+ self.num_decoding_left_chunks = num_decoding_left_chunks
144
+
145
+ def forward(self, x, mask, mu, t, spks=None, cond=None, streaming=False):
146
+ x = x.transpose(1, 2)
147
+ mu = mu.transpose(1, 2)
148
+ cond = cond.transpose(1, 2)
149
+ spks = spks.unsqueeze(dim=1)
150
+ batch, seq_len = x.shape[0], x.shape[1]
151
+ if t.ndim == 0:
152
+ t = t.repeat(batch)
153
+
154
+ # t: conditioning time, c: context (text + masked cond audio), x: noised input audio
155
+ t = self.time_embed(t)
156
+ x = self.input_embed(x, cond, mu, spks.squeeze(1))
157
+
158
+ rope = self.rotary_embed.forward_from_seq_len(seq_len)
159
+
160
+ if self.long_skip_connection is not None:
161
+ residual = x
162
+
163
+ if streaming is True:
164
+ attn_mask = add_optional_chunk_mask(x, mask.bool(), False, False, 0, self.static_chunk_size, -1).unsqueeze(dim=1)
165
+ else:
166
+ attn_mask = add_optional_chunk_mask(x, mask.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1).unsqueeze(dim=1)
167
+
168
+ for block in self.transformer_blocks:
169
+ x = block(x, t, mask=attn_mask.bool(), rope=rope)
170
+
171
+ if self.long_skip_connection is not None:
172
+ x = self.long_skip_connection(torch.cat((x, residual), dim=-1))
173
+
174
+ x = self.norm_out(x, t)
175
+ output = self.proj_out(x).transpose(1, 2)
176
+ return output
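For readers who want to sanity-check the tensor layout, here is a hedged smoke test; the hyper-parameters are illustrative, not the released config. It assumes x, mu and cond arrive as (batch, channels, time) and are transposed inside forward(), spks is a per-utterance embedding, and the output keeps the (batch, mel_dim, time) shape of x.

import torch
from cosyvoice.flow.DiT.dit import DiT

# small, illustrative sizes; the shipped model uses its own yaml hyper-parameters
model = DiT(dim=256, depth=2, heads=4, dim_head=64, mel_dim=80, mu_dim=80, spk_dim=80)
b, t = 2, 50
x = torch.randn(b, 80, t)       # noised mel, (batch, mel_dim, time)
mu = torch.randn(b, 80, t)      # upsampled token condition, same length as x (assumption)
cond = torch.zeros(b, 80, t)    # prompt-mel conditioning (zeros when no prompt)
spks = torch.randn(b, 80)       # speaker embedding
mask = torch.ones(b, 1, t)      # frame mask, (batch, 1, time)
timestep = torch.rand(b)        # flow-matching time in [0, 1]
with torch.no_grad():
    out = model(x, mask, mu, timestep, spks=spks, cond=cond, streaming=False)
print(out.shape)                # expected: torch.Size([2, 80, 50])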
cosyvoice/flow/DiT/modules.py ADDED
@@ -0,0 +1,616 @@
1
+
2
+ """
3
+ ein notation:
4
+ b - batch
5
+ n - sequence
6
+ nt - text sequence
7
+ nw - raw wave length
8
+ d - dimension
9
+ """
10
+
11
+ from __future__ import annotations
12
+ from typing import Optional
13
+ import math
14
+
15
+ import torch
16
+ from torch import nn
17
+ import torch.nn.functional as F
18
+ import torchaudio
19
+
20
+ from x_transformers.x_transformers import apply_rotary_pos_emb
21
+
22
+
23
+ # raw wav to mel spec
24
+ class MelSpec(nn.Module):
25
+ def __init__(
26
+ self,
27
+ filter_length=1024,
28
+ hop_length=256,
29
+ win_length=1024,
30
+ n_mel_channels=100,
31
+ target_sample_rate=24_000,
32
+ normalize=False,
33
+ power=1,
34
+ norm=None,
35
+ center=True,
36
+ ):
37
+ super().__init__()
38
+ self.n_mel_channels = n_mel_channels
39
+
40
+ self.mel_stft = torchaudio.transforms.MelSpectrogram(
41
+ sample_rate=target_sample_rate,
42
+ n_fft=filter_length,
43
+ win_length=win_length,
44
+ hop_length=hop_length,
45
+ n_mels=n_mel_channels,
46
+ power=power,
47
+ center=center,
48
+ normalized=normalize,
49
+ norm=norm,
50
+ )
51
+
52
+ self.register_buffer("dummy", torch.tensor(0), persistent=False)
53
+
54
+ def forward(self, inp):
55
+ if len(inp.shape) == 3:
56
+ inp = inp.squeeze(1) # 'b 1 nw -> b nw'
57
+
58
+ assert len(inp.shape) == 2
59
+
60
+ if self.dummy.device != inp.device:
61
+ self.to(inp.device)
62
+
63
+ mel = self.mel_stft(inp)
64
+ mel = mel.clamp(min=1e-5).log()
65
+ return mel
66
+
67
+
68
+ # sinusoidal position embedding
69
+
70
+
71
+ class SinusPositionEmbedding(nn.Module):
72
+ def __init__(self, dim):
73
+ super().__init__()
74
+ self.dim = dim
75
+
76
+ def forward(self, x, scale=1000):
77
+ device = x.device
78
+ half_dim = self.dim // 2
79
+ emb = math.log(10000) / (half_dim - 1)
80
+ emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb)
81
+ emb = scale * x.unsqueeze(1) * emb.unsqueeze(0)
82
+ emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
83
+ return emb
84
+
85
+
86
+ # convolutional position embedding
87
+
88
+
89
+ class ConvPositionEmbedding(nn.Module):
90
+ def __init__(self, dim, kernel_size=31, groups=16):
91
+ super().__init__()
92
+ assert kernel_size % 2 != 0
93
+ self.conv1d = nn.Sequential(
94
+ nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=kernel_size // 2),
95
+ nn.Mish(),
96
+ nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=kernel_size // 2),
97
+ nn.Mish(),
98
+ )
99
+
100
+ def forward(self, x: float["b n d"], mask: bool["b n"] | None = None): # noqa: F722
101
+ if mask is not None:
102
+ mask = mask[..., None]
103
+ x = x.masked_fill(~mask, 0.0)
104
+
105
+ x = x.permute(0, 2, 1)
106
+ x = self.conv1d(x)
107
+ out = x.permute(0, 2, 1)
108
+
109
+ if mask is not None:
110
+ out = out.masked_fill(~mask, 0.0)
111
+
112
+ return out
113
+
114
+
115
+ class CausalConvPositionEmbedding(nn.Module):
116
+ def __init__(self, dim, kernel_size=31, groups=16):
117
+ super().__init__()
118
+ assert kernel_size % 2 != 0
119
+ self.kernel_size = kernel_size
120
+ self.conv1 = nn.Sequential(
121
+ nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=0),
122
+ nn.Mish(),
123
+ )
124
+ self.conv2 = nn.Sequential(
125
+ nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=0),
126
+ nn.Mish(),
127
+ )
128
+
129
+ def forward(self, x: float["b n d"], mask: bool["b n"] | None = None): # noqa: F722
130
+ if mask is not None:
131
+ mask = mask[..., None]
132
+ x = x.masked_fill(~mask, 0.0)
133
+
134
+ x = x.permute(0, 2, 1)
135
+ x = F.pad(x, (self.kernel_size - 1, 0, 0, 0))
136
+ x = self.conv1(x)
137
+ x = F.pad(x, (self.kernel_size - 1, 0, 0, 0))
138
+ x = self.conv2(x)
139
+ out = x.permute(0, 2, 1)
140
+
141
+ if mask is not None:
142
+ out = out.masked_fill(~mask, 0.0)
143
+
144
+ return out
145
+
146
+
147
+ # rotary positional embedding related
148
+
149
+
150
+ def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0, theta_rescale_factor=1.0):
151
+ # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
152
+ # has some connection to NTK literature
153
+ # https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
154
+ # https://github.com/lucidrains/rotary-embedding-torch/blob/main/rotary_embedding_torch/rotary_embedding_torch.py
155
+ theta *= theta_rescale_factor ** (dim / (dim - 2))
156
+ freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
157
+ t = torch.arange(end, device=freqs.device) # type: ignore
158
+ freqs = torch.outer(t, freqs).float() # type: ignore
159
+ freqs_cos = torch.cos(freqs) # real part
160
+ freqs_sin = torch.sin(freqs) # imaginary part
161
+ return torch.cat([freqs_cos, freqs_sin], dim=-1)
162
+
163
+
164
+ def get_pos_embed_indices(start, length, max_pos, scale=1.0):
165
+ # length = length if isinstance(length, int) else length.max()
166
+ scale = scale * torch.ones_like(start, dtype=torch.float32) # in case scale is a scalar
167
+ pos = (
168
+ start.unsqueeze(1)
169
+ + (torch.arange(length, device=start.device, dtype=torch.float32).unsqueeze(0) * scale.unsqueeze(1)).long()
170
+ )
171
+ # avoid extra long error.
172
+ pos = torch.where(pos < max_pos, pos, max_pos - 1)
173
+ return pos
174
+
175
+
176
+ # Global Response Normalization layer (Instance Normalization ?)
177
+
178
+
179
+ class GRN(nn.Module):
180
+ def __init__(self, dim):
181
+ super().__init__()
182
+ self.gamma = nn.Parameter(torch.zeros(1, 1, dim))
183
+ self.beta = nn.Parameter(torch.zeros(1, 1, dim))
184
+
185
+ def forward(self, x):
186
+ Gx = torch.norm(x, p=2, dim=1, keepdim=True)
187
+ Nx = Gx / (Gx.mean(dim=-1, keepdim=True) + 1e-6)
188
+ return self.gamma * (x * Nx) + self.beta + x
189
+
190
+
191
+ # ConvNeXt-V2 Block https://github.com/facebookresearch/ConvNeXt-V2/blob/main/models/convnextv2.py
192
+ # ref: https://github.com/bfs18/e2_tts/blob/main/rfwave/modules.py#L108
193
+
194
+
195
+ class ConvNeXtV2Block(nn.Module):
196
+ def __init__(
197
+ self,
198
+ dim: int,
199
+ intermediate_dim: int,
200
+ dilation: int = 1,
201
+ ):
202
+ super().__init__()
203
+ padding = (dilation * (7 - 1)) // 2
204
+ self.dwconv = nn.Conv1d(
205
+ dim, dim, kernel_size=7, padding=padding, groups=dim, dilation=dilation
206
+ ) # depthwise conv
207
+ self.norm = nn.LayerNorm(dim, eps=1e-6)
208
+ self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers
209
+ self.act = nn.GELU()
210
+ self.grn = GRN(intermediate_dim)
211
+ self.pwconv2 = nn.Linear(intermediate_dim, dim)
212
+
213
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
214
+ residual = x
215
+ x = x.transpose(1, 2) # b n d -> b d n
216
+ x = self.dwconv(x)
217
+ x = x.transpose(1, 2) # b d n -> b n d
218
+ x = self.norm(x)
219
+ x = self.pwconv1(x)
220
+ x = self.act(x)
221
+ x = self.grn(x)
222
+ x = self.pwconv2(x)
223
+ return residual + x
224
+
225
+
226
+ # AdaLayerNormZero
227
+ # return with modulated x for attn input, and params for later mlp modulation
228
+
229
+
230
+ class AdaLayerNormZero(nn.Module):
231
+ def __init__(self, dim):
232
+ super().__init__()
233
+
234
+ self.silu = nn.SiLU()
235
+ self.linear = nn.Linear(dim, dim * 6)
236
+
237
+ self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
238
+
239
+ def forward(self, x, emb=None):
240
+ emb = self.linear(self.silu(emb))
241
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = torch.chunk(emb, 6, dim=1)
242
+
243
+ x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
244
+ return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
245
+
246
+
247
+ # AdaLayerNormZero for final layer
248
+ # return only with modulated x for attn input, cuz no more mlp modulation
249
+
250
+
251
+ class AdaLayerNormZero_Final(nn.Module):
252
+ def __init__(self, dim):
253
+ super().__init__()
254
+
255
+ self.silu = nn.SiLU()
256
+ self.linear = nn.Linear(dim, dim * 2)
257
+
258
+ self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
259
+
260
+ def forward(self, x, emb):
261
+ emb = self.linear(self.silu(emb))
262
+ scale, shift = torch.chunk(emb, 2, dim=1)
263
+
264
+ x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
265
+ return x
266
+
267
+
268
+ # FeedForward
269
+
270
+
271
+ class FeedForward(nn.Module):
272
+ def __init__(self, dim, dim_out=None, mult=4, dropout=0.0, approximate: str = "none"):
273
+ super().__init__()
274
+ inner_dim = int(dim * mult)
275
+ dim_out = dim_out if dim_out is not None else dim
276
+
277
+ activation = nn.GELU(approximate=approximate)
278
+ project_in = nn.Sequential(nn.Linear(dim, inner_dim), activation)
279
+ self.ff = nn.Sequential(project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out))
280
+
281
+ def forward(self, x):
282
+ return self.ff(x)
283
+
284
+
285
+ # Attention with possible joint part
286
+ # modified from diffusers/src/diffusers/models/attention_processor.py
287
+
288
+
289
+ class Attention(nn.Module):
290
+ def __init__(
291
+ self,
292
+ processor: JointAttnProcessor | AttnProcessor,
293
+ dim: int,
294
+ heads: int = 8,
295
+ dim_head: int = 64,
296
+ dropout: float = 0.0,
297
+ context_dim: Optional[int] = None, # if not None -> joint attention
298
+ context_pre_only=None,
299
+ ):
300
+ super().__init__()
301
+
302
+ if not hasattr(F, "scaled_dot_product_attention"):
303
+ raise ImportError("Attention requires PyTorch 2.0; to use it, please upgrade PyTorch to 2.0.")
304
+
305
+ self.processor = processor
306
+
307
+ self.dim = dim
308
+ self.heads = heads
309
+ self.inner_dim = dim_head * heads
310
+ self.dropout = dropout
311
+
312
+ self.context_dim = context_dim
313
+ self.context_pre_only = context_pre_only
314
+
315
+ self.to_q = nn.Linear(dim, self.inner_dim)
316
+ self.to_k = nn.Linear(dim, self.inner_dim)
317
+ self.to_v = nn.Linear(dim, self.inner_dim)
318
+
319
+ if self.context_dim is not None:
320
+ self.to_k_c = nn.Linear(context_dim, self.inner_dim)
321
+ self.to_v_c = nn.Linear(context_dim, self.inner_dim)
322
+ if self.context_pre_only is not None:
323
+ self.to_q_c = nn.Linear(context_dim, self.inner_dim)
324
+
325
+ self.to_out = nn.ModuleList([])
326
+ self.to_out.append(nn.Linear(self.inner_dim, dim))
327
+ self.to_out.append(nn.Dropout(dropout))
328
+
329
+ if self.context_pre_only is not None and not self.context_pre_only:
330
+ self.to_out_c = nn.Linear(self.inner_dim, dim)
331
+
332
+ def forward(
333
+ self,
334
+ x: float["b n d"], # noised input x # noqa: F722
335
+ c: float["b n d"] = None, # context c # noqa: F722
336
+ mask: bool["b n"] | None = None, # noqa: F722
337
+ rope=None, # rotary position embedding for x
338
+ c_rope=None, # rotary position embedding for c
339
+ ) -> torch.Tensor:
340
+ if c is not None:
341
+ return self.processor(self, x, c=c, mask=mask, rope=rope, c_rope=c_rope)
342
+ else:
343
+ return self.processor(self, x, mask=mask, rope=rope)
344
+
345
+
346
+ # Attention processor
347
+
348
+
349
+ class AttnProcessor:
350
+ def __init__(self):
351
+ pass
352
+
353
+ def __call__(
354
+ self,
355
+ attn: Attention,
356
+ x: float["b n d"], # noised input x # noqa: F722
357
+ mask: bool["b n"] | None = None, # noqa: F722
358
+ rope=None, # rotary position embedding
359
+ ) -> torch.FloatTensor:
360
+ batch_size = x.shape[0]
361
+
362
+ # `sample` projections.
363
+ query = attn.to_q(x)
364
+ key = attn.to_k(x)
365
+ value = attn.to_v(x)
366
+
367
+ # apply rotary position embedding
368
+ if rope is not None:
369
+ freqs, xpos_scale = rope
370
+ q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
371
+
372
+ query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
373
+ key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
374
+
375
+ # attention
376
+ inner_dim = key.shape[-1]
377
+ head_dim = inner_dim // attn.heads
378
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
379
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
380
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
381
+
382
+ # mask. e.g. inference got a batch with different target durations, mask out the padding
383
+ if mask is not None:
384
+ attn_mask = mask
385
+ if attn_mask.dim() == 2:
386
+ attn_mask = attn_mask.unsqueeze(1).unsqueeze(1) # 'b n -> b 1 1 n'
387
+ attn_mask = attn_mask.expand(batch_size, attn.heads, query.shape[-2], key.shape[-2])
388
+ else:
389
+ attn_mask = None
390
+
391
+ x = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False)
392
+ x = x.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
393
+ x = x.to(query.dtype)
394
+
395
+ # linear proj
396
+ x = attn.to_out[0](x)
397
+ # dropout
398
+ x = attn.to_out[1](x)
399
+
400
+ if mask is not None:
401
+ if mask.dim() == 2:
402
+ mask = mask.unsqueeze(-1)
403
+ else:
404
+ mask = mask[:, 0, -1].unsqueeze(-1)
405
+ x = x.masked_fill(~mask, 0.0)
406
+
407
+ return x
408
+
409
+
410
+ # Joint Attention processor for MM-DiT
411
+ # modified from diffusers/src/diffusers/models/attention_processor.py
412
+
413
+
414
+ class JointAttnProcessor:
415
+ def __init__(self):
416
+ pass
417
+
418
+ def __call__(
419
+ self,
420
+ attn: Attention,
421
+ x: float["b n d"], # noised input x # noqa: F722
422
+ c: float["b nt d"] = None, # context c, here text # noqa: F722
423
+ mask: bool["b n"] | None = None, # noqa: F722
424
+ rope=None, # rotary position embedding for x
425
+ c_rope=None, # rotary position embedding for c
426
+ ) -> torch.FloatTensor:
427
+ residual = x
428
+
429
+ batch_size = c.shape[0]
430
+
431
+ # `sample` projections.
432
+ query = attn.to_q(x)
433
+ key = attn.to_k(x)
434
+ value = attn.to_v(x)
435
+
436
+ # `context` projections.
437
+ c_query = attn.to_q_c(c)
438
+ c_key = attn.to_k_c(c)
439
+ c_value = attn.to_v_c(c)
440
+
441
+ # apply rope for context and noised input independently
442
+ if rope is not None:
443
+ freqs, xpos_scale = rope
444
+ q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
445
+ query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
446
+ key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
447
+ if c_rope is not None:
448
+ freqs, xpos_scale = c_rope
449
+ q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
450
+ c_query = apply_rotary_pos_emb(c_query, freqs, q_xpos_scale)
451
+ c_key = apply_rotary_pos_emb(c_key, freqs, k_xpos_scale)
452
+
453
+ # attention
454
+ query = torch.cat([query, c_query], dim=1)
455
+ key = torch.cat([key, c_key], dim=1)
456
+ value = torch.cat([value, c_value], dim=1)
457
+
458
+ inner_dim = key.shape[-1]
459
+ head_dim = inner_dim // attn.heads
460
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
461
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
462
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
463
+
464
+ # mask. e.g. inference got a batch with different target durations, mask out the padding
465
+ if mask is not None:
466
+ attn_mask = F.pad(mask, (0, c.shape[1]), value=True) # no mask for c (text)
467
+ attn_mask = attn_mask.unsqueeze(1).unsqueeze(1) # 'b n -> b 1 1 n'
468
+ attn_mask = attn_mask.expand(batch_size, attn.heads, query.shape[-2], key.shape[-2])
469
+ else:
470
+ attn_mask = None
471
+
472
+ x = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False)
473
+ x = x.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
474
+ x = x.to(query.dtype)
475
+
476
+ # Split the attention outputs.
477
+ x, c = (
478
+ x[:, : residual.shape[1]],
479
+ x[:, residual.shape[1]:],
480
+ )
481
+
482
+ # linear proj
483
+ x = attn.to_out[0](x)
484
+ # dropout
485
+ x = attn.to_out[1](x)
486
+ if not attn.context_pre_only:
487
+ c = attn.to_out_c(c)
488
+
489
+ if mask is not None:
490
+ mask = mask.unsqueeze(-1)
491
+ x = x.masked_fill(~mask, 0.0)
492
+ # c = c.masked_fill(~mask, 0.) # no mask for c (text)
493
+
494
+ return x, c
495
+
496
+
497
+ # DiT Block
498
+
499
+
500
+ class DiTBlock(nn.Module):
501
+ def __init__(self, dim, heads, dim_head, ff_mult=4, dropout=0.1):
502
+ super().__init__()
503
+
504
+ self.attn_norm = AdaLayerNormZero(dim)
505
+ self.attn = Attention(
506
+ processor=AttnProcessor(),
507
+ dim=dim,
508
+ heads=heads,
509
+ dim_head=dim_head,
510
+ dropout=dropout,
511
+ )
512
+
513
+ self.ff_norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
514
+ self.ff = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")
515
+
516
+ def forward(self, x, t, mask=None, rope=None): # x: noised input, t: time embedding
517
+ # pre-norm & modulation for attention input
518
+ norm, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.attn_norm(x, emb=t)
519
+
520
+ # attention
521
+ attn_output = self.attn(x=norm, mask=mask, rope=rope)
522
+
523
+ # process attention output for input x
524
+ x = x + gate_msa.unsqueeze(1) * attn_output
525
+
526
+ ff_norm = self.ff_norm(x) * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
527
+ ff_output = self.ff(ff_norm)
528
+ x = x + gate_mlp.unsqueeze(1) * ff_output
529
+
530
+ return x
531
+
532
+
533
+ # MMDiT Block https://arxiv.org/abs/2403.03206
534
+
535
+
536
+ class MMDiTBlock(nn.Module):
537
+ r"""
538
+ modified from diffusers/src/diffusers/models/attention.py
539
+
540
+ notes.
541
+ _c: context related. text, cond, etc. (left part in sd3 fig2.b)
542
+ _x: noised input related. (right part)
543
+ context_pre_only: last layer only do prenorm + modulation cuz no more ffn
544
+ """
545
+
546
+ def __init__(self, dim, heads, dim_head, ff_mult=4, dropout=0.1, context_pre_only=False):
547
+ super().__init__()
548
+
549
+ self.context_pre_only = context_pre_only
550
+
551
+ self.attn_norm_c = AdaLayerNormZero_Final(dim) if context_pre_only else AdaLayerNormZero(dim)
552
+ self.attn_norm_x = AdaLayerNormZero(dim)
553
+ self.attn = Attention(
554
+ processor=JointAttnProcessor(),
555
+ dim=dim,
556
+ heads=heads,
557
+ dim_head=dim_head,
558
+ dropout=dropout,
559
+ context_dim=dim,
560
+ context_pre_only=context_pre_only,
561
+ )
562
+
563
+ if not context_pre_only:
564
+ self.ff_norm_c = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
565
+ self.ff_c = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")
566
+ else:
567
+ self.ff_norm_c = None
568
+ self.ff_c = None
569
+ self.ff_norm_x = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
570
+ self.ff_x = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")
571
+
572
+ def forward(self, x, c, t, mask=None, rope=None, c_rope=None): # x: noised input, c: context, t: time embedding
573
+ # pre-norm & modulation for attention input
574
+ if self.context_pre_only:
575
+ norm_c = self.attn_norm_c(c, t)
576
+ else:
577
+ norm_c, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.attn_norm_c(c, emb=t)
578
+ norm_x, x_gate_msa, x_shift_mlp, x_scale_mlp, x_gate_mlp = self.attn_norm_x(x, emb=t)
579
+
580
+ # attention
581
+ x_attn_output, c_attn_output = self.attn(x=norm_x, c=norm_c, mask=mask, rope=rope, c_rope=c_rope)
582
+
583
+ # process attention output for context c
584
+ if self.context_pre_only:
585
+ c = None
586
+ else: # if not last layer
587
+ c = c + c_gate_msa.unsqueeze(1) * c_attn_output
588
+
589
+ norm_c = self.ff_norm_c(c) * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
590
+ c_ff_output = self.ff_c(norm_c)
591
+ c = c + c_gate_mlp.unsqueeze(1) * c_ff_output
592
+
593
+ # process attention output for input x
594
+ x = x + x_gate_msa.unsqueeze(1) * x_attn_output
595
+
596
+ norm_x = self.ff_norm_x(x) * (1 + x_scale_mlp[:, None]) + x_shift_mlp[:, None]
597
+ x_ff_output = self.ff_x(norm_x)
598
+ x = x + x_gate_mlp.unsqueeze(1) * x_ff_output
599
+
600
+ return c, x
601
+
602
+
603
+ # time step conditioning embedding
604
+
605
+
606
+ class TimestepEmbedding(nn.Module):
607
+ def __init__(self, dim, freq_embed_dim=256):
608
+ super().__init__()
609
+ self.time_embed = SinusPositionEmbedding(freq_embed_dim)
610
+ self.time_mlp = nn.Sequential(nn.Linear(freq_embed_dim, dim), nn.SiLU(), nn.Linear(dim, dim))
611
+
612
+ def forward(self, timestep: float["b"]): # noqa: F821
613
+ time_hidden = self.time_embed(timestep)
614
+ time_hidden = time_hidden.to(timestep.dtype)
615
+ time = self.time_mlp(time_hidden) # b d
616
+ return time
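A few hedged shape checks for the helper modules above (illustrative values only), showing how the timestep embedding, the ConvNeXt-V2 block and the precomputed sinusoidal table are meant to be combined by dit.py.

import torch
from cosyvoice.flow.DiT.modules import (
    TimestepEmbedding, ConvNeXtV2Block, precompute_freqs_cis, get_pos_embed_indices)

t_embed = TimestepEmbedding(dim=256)
print(t_embed(torch.rand(4)).shape)          # torch.Size([4, 256]): one conditioning vector per timestep

block = ConvNeXtV2Block(dim=80, intermediate_dim=160)
print(block(torch.randn(2, 100, 80)).shape)  # torch.Size([2, 100, 80]): residual block keeps (b, n, d)

freqs = precompute_freqs_cis(dim=80, end=4096)                 # (4096, 80), [cos | sin] halves
start = torch.zeros(2, dtype=torch.long)                       # per-sample start positions
idx = get_pos_embed_indices(start, length=100, max_pos=4096)   # (2, 100) clamped position indices
print(freqs[idx].shape)                      # torch.Size([2, 100, 80]): added to text embeddings in dit.py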
cosyvoice/flow/decoder.py ADDED
@@ -0,0 +1,494 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import Tuple
15
+ import torch
16
+ import torch.nn as nn
17
+ import torch.nn.functional as F
18
+ from einops import pack, rearrange, repeat
19
+ from cosyvoice.utils.common import mask_to_bias
20
+ from cosyvoice.utils.mask import add_optional_chunk_mask
21
+ from matcha.models.components.decoder import SinusoidalPosEmb, Block1D, ResnetBlock1D, Downsample1D, TimestepEmbedding, Upsample1D
22
+ from matcha.models.components.transformer import BasicTransformerBlock
23
+
24
+
25
+ class Transpose(torch.nn.Module):
26
+ def __init__(self, dim0: int, dim1: int):
27
+ super().__init__()
28
+ self.dim0 = dim0
29
+ self.dim1 = dim1
30
+
31
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
32
+ x = torch.transpose(x, self.dim0, self.dim1)
33
+ return x
34
+
35
+
36
+ class CausalConv1d(torch.nn.Conv1d):
37
+ def __init__(
38
+ self,
39
+ in_channels: int,
40
+ out_channels: int,
41
+ kernel_size: int,
42
+ stride: int = 1,
43
+ dilation: int = 1,
44
+ groups: int = 1,
45
+ bias: bool = True,
46
+ padding_mode: str = 'zeros',
47
+ device=None,
48
+ dtype=None
49
+ ) -> None:
50
+ super(CausalConv1d, self).__init__(in_channels, out_channels,
51
+ kernel_size, stride,
52
+ padding=0, dilation=dilation,
53
+ groups=groups, bias=bias,
54
+ padding_mode=padding_mode,
55
+ device=device, dtype=dtype)
56
+ assert stride == 1
57
+ self.causal_padding = kernel_size - 1
58
+
59
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
60
+ x = F.pad(x, (self.causal_padding, 0), value=0.0)
61
+ x = super(CausalConv1d, self).forward(x)
62
+ return x
63
+
64
+
65
+ class CausalBlock1D(Block1D):
66
+ def __init__(self, dim: int, dim_out: int):
67
+ super(CausalBlock1D, self).__init__(dim, dim_out)
68
+ self.block = torch.nn.Sequential(
69
+ CausalConv1d(dim, dim_out, 3),
70
+ Transpose(1, 2),
71
+ nn.LayerNorm(dim_out),
72
+ Transpose(1, 2),
73
+ nn.Mish(),
74
+ )
75
+
76
+ def forward(self, x: torch.Tensor, mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
77
+ output = self.block(x * mask)
78
+ return output * mask
79
+
80
+
81
+ class CausalResnetBlock1D(ResnetBlock1D):
82
+ def __init__(self, dim: int, dim_out: int, time_emb_dim: int, groups: int = 8):
83
+ super(CausalResnetBlock1D, self).__init__(dim, dim_out, time_emb_dim, groups)
84
+ self.block1 = CausalBlock1D(dim, dim_out)
85
+ self.block2 = CausalBlock1D(dim_out, dim_out)
86
+
87
+
88
+ class ConditionalDecoder(nn.Module):
89
+ def __init__(
90
+ self,
91
+ in_channels,
92
+ out_channels,
93
+ channels=(256, 256),
94
+ dropout=0.05,
95
+ attention_head_dim=64,
96
+ n_blocks=1,
97
+ num_mid_blocks=2,
98
+ num_heads=4,
99
+ act_fn="snake",
100
+ ):
101
+ """
102
+ This decoder requires an input with the same shape as the target, so if your text content
103
+ is shorter or longer than the output, please resample it before feeding it to the decoder.
104
+ """
105
+ super().__init__()
106
+ channels = tuple(channels)
107
+ self.in_channels = in_channels
108
+ self.out_channels = out_channels
109
+
110
+ self.time_embeddings = SinusoidalPosEmb(in_channels)
111
+ time_embed_dim = channels[0] * 4
112
+ self.time_mlp = TimestepEmbedding(
113
+ in_channels=in_channels,
114
+ time_embed_dim=time_embed_dim,
115
+ act_fn="silu",
116
+ )
117
+ self.down_blocks = nn.ModuleList([])
118
+ self.mid_blocks = nn.ModuleList([])
119
+ self.up_blocks = nn.ModuleList([])
120
+
121
+ output_channel = in_channels
122
+ for i in range(len(channels)): # pylint: disable=consider-using-enumerate
123
+ input_channel = output_channel
124
+ output_channel = channels[i]
125
+ is_last = i == len(channels) - 1
126
+ resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
127
+ transformer_blocks = nn.ModuleList(
128
+ [
129
+ BasicTransformerBlock(
130
+ dim=output_channel,
131
+ num_attention_heads=num_heads,
132
+ attention_head_dim=attention_head_dim,
133
+ dropout=dropout,
134
+ activation_fn=act_fn,
135
+ )
136
+ for _ in range(n_blocks)
137
+ ]
138
+ )
139
+ downsample = (
140
+ Downsample1D(output_channel) if not is_last else nn.Conv1d(output_channel, output_channel, 3, padding=1)
141
+ )
142
+ self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample]))
143
+
144
+ for _ in range(num_mid_blocks):
145
+ input_channel = channels[-1]
146
+ out_channels = channels[-1]
147
+ resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
148
+
149
+ transformer_blocks = nn.ModuleList(
150
+ [
151
+ BasicTransformerBlock(
152
+ dim=output_channel,
153
+ num_attention_heads=num_heads,
154
+ attention_head_dim=attention_head_dim,
155
+ dropout=dropout,
156
+ activation_fn=act_fn,
157
+ )
158
+ for _ in range(n_blocks)
159
+ ]
160
+ )
161
+
162
+ self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks]))
163
+
164
+ channels = channels[::-1] + (channels[0],)
165
+ for i in range(len(channels) - 1):
166
+ input_channel = channels[i] * 2
167
+ output_channel = channels[i + 1]
168
+ is_last = i == len(channels) - 2
169
+ resnet = ResnetBlock1D(
170
+ dim=input_channel,
171
+ dim_out=output_channel,
172
+ time_emb_dim=time_embed_dim,
173
+ )
174
+ transformer_blocks = nn.ModuleList(
175
+ [
176
+ BasicTransformerBlock(
177
+ dim=output_channel,
178
+ num_attention_heads=num_heads,
179
+ attention_head_dim=attention_head_dim,
180
+ dropout=dropout,
181
+ activation_fn=act_fn,
182
+ )
183
+ for _ in range(n_blocks)
184
+ ]
185
+ )
186
+ upsample = (
187
+ Upsample1D(output_channel, use_conv_transpose=True)
188
+ if not is_last
189
+ else nn.Conv1d(output_channel, output_channel, 3, padding=1)
190
+ )
191
+ self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample]))
192
+ self.final_block = Block1D(channels[-1], channels[-1])
193
+ self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1)
194
+ self.initialize_weights()
195
+
196
+ def initialize_weights(self):
197
+ for m in self.modules():
198
+ if isinstance(m, nn.Conv1d):
199
+ nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
200
+ if m.bias is not None:
201
+ nn.init.constant_(m.bias, 0)
202
+ elif isinstance(m, nn.GroupNorm):
203
+ nn.init.constant_(m.weight, 1)
204
+ nn.init.constant_(m.bias, 0)
205
+ elif isinstance(m, nn.Linear):
206
+ nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
207
+ if m.bias is not None:
208
+ nn.init.constant_(m.bias, 0)
209
+
210
+ def forward(self, x, mask, mu, t, spks=None, cond=None, streaming=False):
211
+ """Forward pass of the UNet1DConditional model.
212
+
213
+ Args:
214
+ x (torch.Tensor): shape (batch_size, in_channels, time)
215
+ mask (_type_): shape (batch_size, 1, time)
216
+ t (_type_): shape (batch_size)
217
+ spks (_type_, optional): shape: (batch_size, condition_channels). Defaults to None.
218
+ cond (_type_, optional): placeholder for future use. Defaults to None.
219
+
220
+ Raises:
221
+ ValueError: _description_
222
+ ValueError: _description_
223
+
224
+ Returns:
225
+ _type_: _description_
226
+ """
227
+
228
+ t = self.time_embeddings(t).to(t.dtype)
229
+ t = self.time_mlp(t)
230
+
231
+ x = pack([x, mu], "b * t")[0]
232
+
233
+ if spks is not None:
234
+ spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
235
+ x = pack([x, spks], "b * t")[0]
236
+ if cond is not None:
237
+ x = pack([x, cond], "b * t")[0]
238
+
239
+ hiddens = []
240
+ masks = [mask]
241
+ for resnet, transformer_blocks, downsample in self.down_blocks:
242
+ mask_down = masks[-1]
243
+ x = resnet(x, mask_down, t)
244
+ x = rearrange(x, "b c t -> b t c").contiguous()
245
+ attn_mask = add_optional_chunk_mask(x, mask_down.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1)
246
+ attn_mask = mask_to_bias(attn_mask, x.dtype)
247
+ for transformer_block in transformer_blocks:
248
+ x = transformer_block(
249
+ hidden_states=x,
250
+ attention_mask=attn_mask,
251
+ timestep=t,
252
+ )
253
+ x = rearrange(x, "b t c -> b c t").contiguous()
254
+ hiddens.append(x) # Save hidden states for skip connections
255
+ x = downsample(x * mask_down)
256
+ masks.append(mask_down[:, :, ::2])
257
+ masks = masks[:-1]
258
+ mask_mid = masks[-1]
259
+
260
+ for resnet, transformer_blocks in self.mid_blocks:
261
+ x = resnet(x, mask_mid, t)
262
+ x = rearrange(x, "b c t -> b t c").contiguous()
263
+ attn_mask = add_optional_chunk_mask(x, mask_mid.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1)
264
+ attn_mask = mask_to_bias(attn_mask, x.dtype)
265
+ for transformer_block in transformer_blocks:
266
+ x = transformer_block(
267
+ hidden_states=x,
268
+ attention_mask=attn_mask,
269
+ timestep=t,
270
+ )
271
+ x = rearrange(x, "b t c -> b c t").contiguous()
272
+
273
+ for resnet, transformer_blocks, upsample in self.up_blocks:
274
+ mask_up = masks.pop()
275
+ skip = hiddens.pop()
276
+ x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0]
277
+ x = resnet(x, mask_up, t)
278
+ x = rearrange(x, "b c t -> b t c").contiguous()
279
+ attn_mask = add_optional_chunk_mask(x, mask_up.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1)
280
+ attn_mask = mask_to_bias(attn_mask, x.dtype)
281
+ for transformer_block in transformer_blocks:
282
+ x = transformer_block(
283
+ hidden_states=x,
284
+ attention_mask=attn_mask,
285
+ timestep=t,
286
+ )
287
+ x = rearrange(x, "b t c -> b c t").contiguous()
288
+ x = upsample(x * mask_up)
289
+ x = self.final_block(x, mask_up)
290
+ output = self.final_proj(x * mask_up)
291
+ return output * mask
292
+
293
+
294
+ class CausalConditionalDecoder(ConditionalDecoder):
295
+ def __init__(
296
+ self,
297
+ in_channels,
298
+ out_channels,
299
+ channels=(256, 256),
300
+ dropout=0.05,
301
+ attention_head_dim=64,
302
+ n_blocks=1,
303
+ num_mid_blocks=2,
304
+ num_heads=4,
305
+ act_fn="snake",
306
+ static_chunk_size=50,
307
+ num_decoding_left_chunks=2,
308
+ ):
309
+ """
310
+ This decoder requires an input with the same shape as the target, so if your text content
311
+ is shorter or longer than the output, please resample it before feeding it to the decoder.
312
+ """
313
+ torch.nn.Module.__init__(self)
314
+ channels = tuple(channels)
315
+ self.in_channels = in_channels
316
+ self.out_channels = out_channels
317
+ self.time_embeddings = SinusoidalPosEmb(in_channels)
318
+ time_embed_dim = channels[0] * 4
319
+ self.time_mlp = TimestepEmbedding(
320
+ in_channels=in_channels,
321
+ time_embed_dim=time_embed_dim,
322
+ act_fn="silu",
323
+ )
324
+ self.static_chunk_size = static_chunk_size
325
+ self.num_decoding_left_chunks = num_decoding_left_chunks
326
+ self.down_blocks = nn.ModuleList([])
327
+ self.mid_blocks = nn.ModuleList([])
328
+ self.up_blocks = nn.ModuleList([])
329
+
330
+ output_channel = in_channels
331
+ for i in range(len(channels)): # pylint: disable=consider-using-enumerate
332
+ input_channel = output_channel
333
+ output_channel = channels[i]
334
+ is_last = i == len(channels) - 1
335
+ resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
336
+ transformer_blocks = nn.ModuleList(
337
+ [
338
+ BasicTransformerBlock(
339
+ dim=output_channel,
340
+ num_attention_heads=num_heads,
341
+ attention_head_dim=attention_head_dim,
342
+ dropout=dropout,
343
+ activation_fn=act_fn,
344
+ )
345
+ for _ in range(n_blocks)
346
+ ]
347
+ )
348
+ downsample = (
349
+ Downsample1D(output_channel) if not is_last else CausalConv1d(output_channel, output_channel, 3)
350
+ )
351
+ self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample]))
352
+
353
+ for _ in range(num_mid_blocks):
354
+ input_channel = channels[-1]
355
+ out_channels = channels[-1]
356
+ resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
357
+
358
+ transformer_blocks = nn.ModuleList(
359
+ [
360
+ BasicTransformerBlock(
361
+ dim=output_channel,
362
+ num_attention_heads=num_heads,
363
+ attention_head_dim=attention_head_dim,
364
+ dropout=dropout,
365
+ activation_fn=act_fn,
366
+ )
367
+ for _ in range(n_blocks)
368
+ ]
369
+ )
370
+
371
+ self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks]))
372
+
373
+ channels = channels[::-1] + (channels[0],)
374
+ for i in range(len(channels) - 1):
375
+ input_channel = channels[i] * 2
376
+ output_channel = channels[i + 1]
377
+ is_last = i == len(channels) - 2
378
+ resnet = CausalResnetBlock1D(
379
+ dim=input_channel,
380
+ dim_out=output_channel,
381
+ time_emb_dim=time_embed_dim,
382
+ )
383
+ transformer_blocks = nn.ModuleList(
384
+ [
385
+ BasicTransformerBlock(
386
+ dim=output_channel,
387
+ num_attention_heads=num_heads,
388
+ attention_head_dim=attention_head_dim,
389
+ dropout=dropout,
390
+ activation_fn=act_fn,
391
+ )
392
+ for _ in range(n_blocks)
393
+ ]
394
+ )
395
+ upsample = (
396
+ Upsample1D(output_channel, use_conv_transpose=True)
397
+ if not is_last
398
+ else CausalConv1d(output_channel, output_channel, 3)
399
+ )
400
+ self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample]))
401
+ self.final_block = CausalBlock1D(channels[-1], channels[-1])
402
+ self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1)
403
+ self.initialize_weights()
404
+
405
+ def forward(self, x, mask, mu, t, spks=None, cond=None, streaming=False):
406
+ """Forward pass of the UNet1DConditional model.
407
+
408
+ Args:
409
+ x (torch.Tensor): shape (batch_size, in_channels, time)
410
+ mask (_type_): shape (batch_size, 1, time)
411
+ t (_type_): shape (batch_size)
412
+ spks (_type_, optional): shape: (batch_size, condition_channels). Defaults to None.
413
+ cond (torch.Tensor, optional): conditioning features, shape (batch_size, out_channels, time). Defaults to None.
414
+
415
+ Raises:
416
+ ValueError: _description_
417
+ ValueError: _description_
418
+
419
+ Returns:
420
+ torch.Tensor: output of shape (batch_size, out_channels, time)
421
+ """
422
+ t = self.time_embeddings(t).to(t.dtype)
423
+ t = self.time_mlp(t)
424
+
425
+ x = pack([x, mu], "b * t")[0]
426
+
427
+ if spks is not None:
428
+ spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
429
+ x = pack([x, spks], "b * t")[0]
430
+ if cond is not None:
431
+ x = pack([x, cond], "b * t")[0]
432
+
433
+ hiddens = []
434
+ masks = [mask]
435
+ for resnet, transformer_blocks, downsample in self.down_blocks:
436
+ mask_down = masks[-1]
437
+ x = resnet(x, mask_down, t)
438
+ x = rearrange(x, "b c t -> b t c").contiguous()
439
+ if streaming is True:
440
+ attn_mask = add_optional_chunk_mask(x, mask_down.bool(), False, False, 0, self.static_chunk_size, -1)
441
+ else:
442
+ attn_mask = add_optional_chunk_mask(x, mask_down.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1)
443
+ attn_mask = mask_to_bias(attn_mask, x.dtype)
444
+ for transformer_block in transformer_blocks:
445
+ x = transformer_block(
446
+ hidden_states=x,
447
+ attention_mask=attn_mask,
448
+ timestep=t,
449
+ )
450
+ x = rearrange(x, "b t c -> b c t").contiguous()
451
+ hiddens.append(x) # Save hidden states for skip connections
452
+ x = downsample(x * mask_down)
453
+ masks.append(mask_down[:, :, ::2])
454
+ masks = masks[:-1]
455
+ mask_mid = masks[-1]
456
+
457
+ for resnet, transformer_blocks in self.mid_blocks:
458
+ x = resnet(x, mask_mid, t)
459
+ x = rearrange(x, "b c t -> b t c").contiguous()
460
+ if streaming is True:
461
+ attn_mask = add_optional_chunk_mask(x, mask_mid.bool(), False, False, 0, self.static_chunk_size, -1)
462
+ else:
463
+ attn_mask = add_optional_chunk_mask(x, mask_mid.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1)
464
+ attn_mask = mask_to_bias(attn_mask, x.dtype)
465
+ for transformer_block in transformer_blocks:
466
+ x = transformer_block(
467
+ hidden_states=x,
468
+ attention_mask=attn_mask,
469
+ timestep=t,
470
+ )
471
+ x = rearrange(x, "b t c -> b c t").contiguous()
472
+
473
+ for resnet, transformer_blocks, upsample in self.up_blocks:
474
+ mask_up = masks.pop()
475
+ skip = hiddens.pop()
476
+ x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0]
477
+ x = resnet(x, mask_up, t)
478
+ x = rearrange(x, "b c t -> b t c").contiguous()
479
+ if streaming is True:
480
+ attn_mask = add_optional_chunk_mask(x, mask_up.bool(), False, False, 0, self.static_chunk_size, -1)
481
+ else:
482
+ attn_mask = add_optional_chunk_mask(x, mask_up.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1)
483
+ attn_mask = mask_to_bias(attn_mask, x.dtype)
484
+ for transformer_block in transformer_blocks:
485
+ x = transformer_block(
486
+ hidden_states=x,
487
+ attention_mask=attn_mask,
488
+ timestep=t,
489
+ )
490
+ x = rearrange(x, "b t c -> b c t").contiguous()
491
+ x = upsample(x * mask_up)
492
+ x = self.final_block(x, mask_up)
493
+ output = self.final_proj(x * mask_up)
494
+ return output * mask
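For orientation, below is a minimal smoke-test sketch (not part of the uploaded file) of the CausalConditionalDecoder defined above. It assumes the class is importable from cosyvoice.flow.decoder and that x, mu, spks and cond are all 80-dimensional as in the decoder_conf dictionaries used later in this upload, so the packed input is 240 channels when cond is omitted; the sizes below are illustrative, not the shipped configuration.

import torch
from cosyvoice.flow.decoder import CausalConditionalDecoder  # import path assumed from this upload

# 240 = 80 (x) + 80 (mu) + 80 (spks); cond is left out in this sketch
decoder = CausalConditionalDecoder(in_channels=240, out_channels=80, channels=(256, 256), act_fn="gelu")

B, T = 1, 100                      # T even, so the single Downsample1D/Upsample1D pair round-trips cleanly
x = torch.randn(B, 80, T)          # noisy mel being refined
mu = torch.randn(B, 80, T)         # encoder output already at mel resolution
spks = torch.randn(B, 80)          # projected speaker embedding
mask = torch.ones(B, 1, T)
t = torch.rand(B)                  # flow-matching timestep in [0, 1]

with torch.no_grad():
    out = decoder(x, mask, mu, t, spks=spks, cond=None, streaming=False)
print(out.shape)                   # expected torch.Size([1, 80, 100])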
cosyvoice/flow/flow.py ADDED
@@ -0,0 +1,432 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import logging
15
+ import random
16
+ from typing import Dict, Optional
17
+ import torch
18
+ import torch.nn as nn
19
+ from torch.nn import functional as F
20
+ from omegaconf import DictConfig
21
+ from cosyvoice.utils.mask import make_pad_mask
22
+
23
+
24
+ class MaskedDiffWithXvec(torch.nn.Module):
25
+ def __init__(self,
26
+ input_size: int = 512,
27
+ output_size: int = 80,
28
+ spk_embed_dim: int = 192,
29
+ output_type: str = "mel",
30
+ vocab_size: int = 4096,
31
+ input_frame_rate: int = 50,
32
+ only_mask_loss: bool = True,
33
+ encoder: torch.nn.Module = None,
34
+ length_regulator: torch.nn.Module = None,
35
+ decoder: torch.nn.Module = None,
36
+ decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1,
37
+ 'cfm_params': DictConfig({'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine',
38
+ 'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'}),
39
+ 'decoder_params': {'channels': [256, 256], 'dropout': 0.0, 'attention_head_dim': 64,
40
+ 'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}}):
41
+ super().__init__()
42
+ self.input_size = input_size
43
+ self.output_size = output_size
44
+ self.decoder_conf = decoder_conf
45
+ self.vocab_size = vocab_size
46
+ self.output_type = output_type
47
+ self.input_frame_rate = input_frame_rate
48
+ logging.info(f"input frame rate={self.input_frame_rate}")
49
+ self.input_embedding = nn.Embedding(vocab_size, input_size)
50
+ self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, output_size)
51
+ self.encoder = encoder
52
+ self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size)
53
+ self.decoder = decoder
54
+ self.length_regulator = length_regulator
55
+ self.only_mask_loss = only_mask_loss
56
+
57
+ def forward(
58
+ self,
59
+ batch: dict,
60
+ device: torch.device,
61
+ ) -> Dict[str, Optional[torch.Tensor]]:
62
+ token = batch['speech_token'].to(device)
63
+ token_len = batch['speech_token_len'].to(device)
64
+ feat = batch['speech_feat'].to(device)
65
+ feat_len = batch['speech_feat_len'].to(device)
66
+ embedding = batch['embedding'].to(device)
67
+
68
+ # xvec projection
69
+ embedding = F.normalize(embedding, dim=1)
70
+ embedding = self.spk_embed_affine_layer(embedding)
71
+
72
+ # concat text and prompt_text
73
+ mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(device)
74
+ token = self.input_embedding(torch.clamp(token, min=0)) * mask
75
+
76
+ # text encode
77
+ h, h_lengths = self.encoder(token, token_len)
78
+ h = self.encoder_proj(h)
79
+ h, h_lengths = self.length_regulator(h, feat_len)
80
+
81
+ # get conditions
82
+ conds = torch.zeros(feat.shape, device=token.device)
83
+ for i, j in enumerate(feat_len):
84
+ if random.random() < 0.5:
85
+ continue
86
+ index = random.randint(0, int(0.3 * j))
87
+ conds[i, :index] = feat[i, :index]
88
+ conds = conds.transpose(1, 2)
89
+
90
+ mask = (~make_pad_mask(feat_len)).to(h)
91
+ # NOTE this is unnecessary, feat and h already have the same shape
92
+ loss, _ = self.decoder.compute_loss(
93
+ feat.transpose(1, 2).contiguous(),
94
+ mask.unsqueeze(1),
95
+ h.transpose(1, 2).contiguous(),
96
+ embedding,
97
+ cond=conds
98
+ )
99
+ return {'loss': loss}
100
+
101
+ @torch.inference_mode()
102
+ def inference(self,
103
+ token,
104
+ token_len,
105
+ prompt_token,
106
+ prompt_token_len,
107
+ prompt_feat,
108
+ prompt_feat_len,
109
+ embedding,
110
+ flow_cache):
111
+ assert token.shape[0] == 1
112
+ # xvec projection
113
+ embedding = F.normalize(embedding, dim=1)
114
+ embedding = self.spk_embed_affine_layer(embedding)
115
+
116
+ # concat speech token and prompt speech token
117
+ token_len1, token_len2 = prompt_token.shape[1], token.shape[1]
118
+ token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len
119
+ mask = (~make_pad_mask(token_len)).unsqueeze(-1).to(embedding)
120
+ token = self.input_embedding(torch.clamp(token, min=0)) * mask
121
+
122
+ # text encode
123
+ h, h_lengths = self.encoder(token, token_len)
124
+ h = self.encoder_proj(h)
125
+ mel_len1, mel_len2 = prompt_feat.shape[1], int(token_len2 / self.input_frame_rate * 22050 / 256)
126
+ h, h_lengths = self.length_regulator.inference(h[:, :token_len1], h[:, token_len1:], mel_len1, mel_len2, self.input_frame_rate)
127
+
128
+ # get conditions
129
+ conds = torch.zeros([1, mel_len1 + mel_len2, self.output_size], device=token.device).to(h.dtype)
130
+ conds[:, :mel_len1] = prompt_feat
131
+ conds = conds.transpose(1, 2)
132
+
133
+ mask = (~make_pad_mask(torch.tensor([mel_len1 + mel_len2]))).to(h)
134
+ feat, flow_cache = self.decoder(
135
+ mu=h.transpose(1, 2).contiguous(),
136
+ mask=mask.unsqueeze(1),
137
+ spks=embedding,
138
+ cond=conds,
139
+ n_timesteps=10,
140
+ prompt_len=mel_len1,
141
+ cache=flow_cache
142
+ )
143
+ feat = feat[:, :, mel_len1:]
144
+ assert feat.shape[2] == mel_len2
145
+ return feat.float(), flow_cache
146
+
147
+
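The mel length used in MaskedDiffWithXvec.inference above converts the speech-token rate (input_frame_rate) into the 22050 Hz / 256-hop mel frame rate. A quick worked example of that arithmetic (token count chosen only for illustration):

# mel_len2 = int(token_len2 / self.input_frame_rate * 22050 / 256), as in the method above
token_len2 = 100            # 100 speech tokens
input_frame_rate = 50       # tokens per second -> 2 s of audio
mel_len2 = int(token_len2 / input_frame_rate * 22050 / 256)
print(mel_len2)             # int(172.27) == 172 mel frames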
148
+ class CausalMaskedDiffWithXvec(torch.nn.Module):
149
+ def __init__(self,
150
+ input_size: int = 512,
151
+ output_size: int = 80,
152
+ spk_embed_dim: int = 192,
153
+ output_type: str = "mel",
154
+ vocab_size: int = 4096,
155
+ input_frame_rate: int = 50,
156
+ only_mask_loss: bool = True,
157
+ token_mel_ratio: int = 2,
158
+ pre_lookahead_len: int = 3,
159
+ encoder: torch.nn.Module = None,
160
+ decoder: torch.nn.Module = None,
161
+ decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1,
162
+ 'cfm_params': DictConfig({'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine',
163
+ 'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'}),
164
+ 'decoder_params': {'channels': [256, 256], 'dropout': 0.0, 'attention_head_dim': 64,
165
+ 'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}}):
166
+ super().__init__()
167
+ self.input_size = input_size
168
+ self.output_size = output_size
169
+ self.decoder_conf = decoder_conf
170
+ self.vocab_size = vocab_size
171
+ self.output_type = output_type
172
+ self.input_frame_rate = input_frame_rate
173
+ logging.info(f"input frame rate={self.input_frame_rate}")
174
+ self.input_embedding = nn.Embedding(vocab_size, input_size)
175
+ self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, output_size)
176
+ self.encoder = encoder
177
+ self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size)
178
+ self.decoder = decoder
179
+ self.only_mask_loss = only_mask_loss
180
+ self.token_mel_ratio = token_mel_ratio
181
+ self.pre_lookahead_len = pre_lookahead_len
182
+
183
+ def forward(
184
+ self,
185
+ batch: dict,
186
+ device: torch.device,
187
+ ) -> Dict[str, Optional[torch.Tensor]]:
188
+ token = batch['speech_token'].to(device)
189
+ token_len = batch['speech_token_len'].to(device)
190
+ feat = batch['speech_feat'].to(device)
191
+ feat_len = batch['speech_feat_len'].to(device)
192
+ embedding = batch['embedding'].to(device)
193
+
194
+ # NOTE unified training, static_chunk_size > 0 or = 0
195
+ streaming = True if random.random() < 0.5 else False
196
+
197
+ # xvec projection
198
+ embedding = F.normalize(embedding, dim=1)
199
+ embedding = self.spk_embed_affine_layer(embedding)
200
+
201
+ # concat text and prompt_text
202
+ mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(device)
203
+ token = self.input_embedding(torch.clamp(token, min=0)) * mask
204
+
205
+ # text encode
206
+ h, h_lengths = self.encoder(token, token_len, streaming=streaming)
207
+ h = self.encoder_proj(h)
208
+
209
+ # get conditions
210
+ conds = torch.zeros(feat.shape, device=token.device)
211
+ for i, j in enumerate(feat_len):
212
+ if random.random() < 0.5:
213
+ continue
214
+ index = random.randint(0, int(0.3 * j))
215
+ conds[i, :index] = feat[i, :index]
216
+ conds = conds.transpose(1, 2)
217
+
218
+ mask = (~make_pad_mask(h_lengths.sum(dim=-1).squeeze(dim=1))).to(h)
219
+ loss, _ = self.decoder.compute_loss(
220
+ feat.transpose(1, 2).contiguous(),
221
+ mask.unsqueeze(1),
222
+ h.transpose(1, 2).contiguous(),
223
+ embedding,
224
+ cond=conds,
225
+ streaming=streaming,
226
+ )
227
+ return {'loss': loss}
228
+
229
+ @torch.inference_mode()
230
+ def inference(self,
231
+ token,
232
+ token_len,
233
+ prompt_token,
234
+ prompt_token_len,
235
+ prompt_feat,
236
+ prompt_feat_len,
237
+ embedding,
238
+ streaming,
239
+ finalize):
240
+ assert token.shape[0] == 1
241
+ # xvec projection
242
+ embedding = F.normalize(embedding, dim=1)
243
+ embedding = self.spk_embed_affine_layer(embedding)
244
+
245
+ # concat text and prompt_text
246
+ token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len
247
+ mask = (~make_pad_mask(token_len)).unsqueeze(-1).to(embedding)
248
+ token = self.input_embedding(torch.clamp(token, min=0)) * mask
249
+
250
+ # text encode
251
+ if finalize is True:
252
+ h, h_lengths = self.encoder(token, token_len, streaming=streaming)
253
+ else:
254
+ token, context = token[:, :-self.pre_lookahead_len], token[:, -self.pre_lookahead_len:]
255
+ h, h_lengths = self.encoder(token, token_len, context=context, streaming=streaming)
256
+ mel_len1, mel_len2 = prompt_feat.shape[1], h.shape[1] - prompt_feat.shape[1]
257
+ h = self.encoder_proj(h)
258
+
259
+ # get conditions
260
+ conds = torch.zeros([1, mel_len1 + mel_len2, self.output_size], device=token.device).to(h.dtype)
261
+ conds[:, :mel_len1] = prompt_feat
262
+ conds = conds.transpose(1, 2)
263
+
264
+ mask = (~make_pad_mask(torch.tensor([mel_len1 + mel_len2]))).to(h)
265
+ feat, _ = self.decoder(
266
+ mu=h.transpose(1, 2).contiguous(),
267
+ mask=mask.unsqueeze(1),
268
+ spks=embedding,
269
+ cond=conds,
270
+ n_timesteps=10,
271
+ streaming=streaming
272
+ )
273
+ feat = feat[:, :, mel_len1:]
274
+ assert feat.shape[2] == mel_len2
275
+ return feat.float(), None
276
+
277
+
278
+ class CausalMaskedDiffWithDiT(torch.nn.Module):
279
+ def __init__(self,
280
+ input_size: int = 512,
281
+ output_size: int = 80,
282
+ spk_embed_dim: int = 192,
283
+ output_type: str = "mel",
284
+ vocab_size: int = 4096,
285
+ input_frame_rate: int = 50,
286
+ only_mask_loss: bool = True,
287
+ token_mel_ratio: int = 2,
288
+ pre_lookahead_len: int = 3,
289
+ pre_lookahead_layer: torch.nn.Module = None,
290
+ decoder: torch.nn.Module = None,
291
+ decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1,
292
+ 'cfm_params': DictConfig({'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine',
293
+ 'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'}),
294
+ 'decoder_params': {'channels': [256, 256], 'dropout': 0.0, 'attention_head_dim': 64,
295
+ 'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}}):
296
+ super().__init__()
297
+ self.input_size = input_size
298
+ self.output_size = output_size
299
+ self.decoder_conf = decoder_conf
300
+ self.vocab_size = vocab_size
301
+ self.output_type = output_type
302
+ self.input_frame_rate = input_frame_rate
303
+ logging.info(f"input frame rate={self.input_frame_rate}")
304
+ self.input_embedding = nn.Embedding(vocab_size, input_size)
305
+ self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, output_size)
306
+ self.pre_lookahead_len = pre_lookahead_len
307
+ self.pre_lookahead_layer = pre_lookahead_layer
308
+ self.decoder = decoder
309
+ self.only_mask_loss = only_mask_loss
310
+ self.token_mel_ratio = token_mel_ratio
311
+
312
+ def forward(
313
+ self,
314
+ batch: dict,
315
+ device: torch.device,
316
+ ) -> Dict[str, Optional[torch.Tensor]]:
317
+ token = batch['speech_token'].to(device)
318
+ token_len = batch['speech_token_len'].to(device)
319
+ feat = batch['speech_feat'].to(device)
320
+ feat_len = batch['speech_feat_len'].to(device)
321
+ embedding = batch['embedding'].to(device)
322
+
323
+ # NOTE unified training, static_chunk_size > 0 or = 0
324
+ streaming = True if random.random() < 0.5 else False
325
+
326
+ # xvec projection
327
+ embedding = F.normalize(embedding, dim=1)
328
+ embedding = self.spk_embed_affine_layer(embedding)
329
+
330
+ # concat text and prompt_text
331
+ mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(device)
332
+ token = self.input_embedding(torch.clamp(token, min=0)) * mask
333
+
334
+ # text encode
335
+ h, h_lengths = self.encoder(token, token_len, streaming=streaming)
336
+ h = self.encoder_proj(h)
337
+
338
+ # get conditions
339
+ conds = torch.zeros(feat.shape, device=token.device)
340
+ for i, j in enumerate(feat_len):
341
+ if random.random() < 0.5:
342
+ continue
343
+ index = random.randint(0, int(0.3 * j))
344
+ conds[i, :index] = feat[i, :index]
345
+ conds = conds.transpose(1, 2)
346
+
347
+ mask = (~make_pad_mask(h_lengths.sum(dim=-1).squeeze(dim=1))).to(h)
348
+ loss, _ = self.decoder.compute_loss(
349
+ feat.transpose(1, 2).contiguous(),
350
+ mask.unsqueeze(1),
351
+ h.transpose(1, 2).contiguous(),
352
+ embedding,
353
+ cond=conds,
354
+ streaming=streaming,
355
+ )
356
+ return {'loss': loss}
357
+
358
+ @torch.inference_mode()
359
+ def inference(self,
360
+ token,
361
+ token_len,
362
+ prompt_token,
363
+ prompt_token_len,
364
+ prompt_feat,
365
+ prompt_feat_len,
366
+ embedding,
367
+ streaming,
368
+ finalize):
369
+ assert token.shape[0] == 1
370
+ # xvec projection
371
+ embedding = F.normalize(embedding, dim=1)
372
+ embedding = self.spk_embed_affine_layer(embedding)
373
+
374
+ # concat text and prompt_text
375
+ token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len
376
+ mask = (~make_pad_mask(token_len)).unsqueeze(-1).to(embedding)
377
+ token = self.input_embedding(torch.clamp(token, min=0)) * mask
378
+
379
+ # text encode
380
+ if finalize is True:
381
+ h = self.pre_lookahead_layer(token)
382
+ else:
383
+ h = self.pre_lookahead_layer(token[:, :-self.pre_lookahead_len], context=token[:, -self.pre_lookahead_len:])
384
+ h = h.repeat_interleave(self.token_mel_ratio, dim=1)
385
+ mel_len1, mel_len2 = prompt_feat.shape[1], h.shape[1] - prompt_feat.shape[1]
386
+
387
+ # get conditions
388
+ conds = torch.zeros([1, mel_len1 + mel_len2, self.output_size], device=token.device).to(h.dtype)
389
+ conds[:, :mel_len1] = prompt_feat
390
+ conds = conds.transpose(1, 2)
391
+
392
+ mask = (~make_pad_mask(torch.tensor([mel_len1 + mel_len2]))).to(h)
393
+ feat, _ = self.decoder(
394
+ mu=h.transpose(1, 2).contiguous(),
395
+ mask=mask.unsqueeze(1),
396
+ spks=embedding,
397
+ cond=conds,
398
+ n_timesteps=10,
399
+ streaming=streaming
400
+ )
401
+ feat = feat[:, :, mel_len1:]
402
+ assert feat.shape[2] == mel_len2
403
+ return feat.float(), None
404
+
405
+
406
+ if __name__ == '__main__':
407
+ torch.backends.cudnn.deterministic = True
408
+ torch.backends.cudnn.benchmark = False
409
+ from hyperpyyaml import load_hyperpyyaml
410
+ with open('./pretrained_models/Fun-CosyVoice3-0.5B/cosyvoice3.yaml', 'r') as f:
411
+ configs = load_hyperpyyaml(f, overrides={'llm': None, 'hift': None})
412
+ model = configs['flow']
413
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
414
+ model.to(device)
415
+ model.eval()
416
+ max_len = 10 * model.decoder.estimator.static_chunk_size
417
+ chunk_size = model.decoder.estimator.static_chunk_size
418
+ context_size = model.pre_lookahead_layer.pre_lookahead_len
419
+ token = torch.randint(0, 6561, size=(1, max_len)).to(device)
420
+ token_len = torch.tensor([max_len]).to(device)
421
+ prompt_token = torch.randint(0, 6561, size=(1, chunk_size)).to(device)
422
+ prompt_token_len = torch.tensor([chunk_size]).to(device)
423
+ prompt_feat = torch.rand(1, chunk_size * 2, 80).to(device)
424
+ prompt_feat_len = torch.tensor([chunk_size * 2]).to(device)
425
+ prompt_embedding = torch.rand(1, 192).to(device)
426
+ pred_gt, _ = model.inference(token, token_len, prompt_token, prompt_token_len, prompt_feat, prompt_feat_len, prompt_embedding, streaming=True, finalize=True)
427
+ for i in range(0, max_len, chunk_size):
428
+ finalize = True if i + chunk_size + context_size >= max_len else False
429
+ pred_chunk, _ = model.inference(token[:, :i + chunk_size + context_size], torch.tensor([token[:, :i + chunk_size + context_size].shape[1]]).to(device),
430
+ prompt_token, prompt_token_len, prompt_feat, prompt_feat_len, prompt_embedding, streaming=True, finalize=finalize)
431
+ pred_chunk = pred_chunk[:, :, i * model.token_mel_ratio:]
432
+ print((pred_gt[:, :, i * model.token_mel_ratio: i * model.token_mel_ratio + pred_chunk.shape[2]] - pred_chunk).abs().max().item())
cosyvoice/flow/flow_matching.py ADDED
@@ -0,0 +1,228 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
2
+ # 2025 Alibaba Inc (authors: Xiang Lyu, Bofan Zhou)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import torch
16
+ import torch.nn.functional as F
17
+ from matcha.models.components.flow_matching import BASECFM
18
+ from cosyvoice.utils.common import set_all_random_seed
19
+
20
+
21
+ class ConditionalCFM(BASECFM):
22
+ def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, estimator: torch.nn.Module = None):
23
+ super().__init__(
24
+ n_feats=in_channels,
25
+ cfm_params=cfm_params,
26
+ n_spks=n_spks,
27
+ spk_emb_dim=spk_emb_dim,
28
+ )
29
+ self.t_scheduler = cfm_params.t_scheduler
30
+ self.training_cfg_rate = cfm_params.training_cfg_rate
31
+ self.inference_cfg_rate = cfm_params.inference_cfg_rate
32
+ in_channels = in_channels + (spk_emb_dim if n_spks > 0 else 0)
33
+ # Just change the architecture of the estimator here
34
+ self.estimator = estimator
35
+
36
+ @torch.inference_mode()
37
+ def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None, prompt_len=0, cache=torch.zeros(1, 80, 0, 2)):
38
+ """Forward diffusion
39
+
40
+ Args:
41
+ mu (torch.Tensor): output of encoder
42
+ shape: (batch_size, n_feats, mel_timesteps)
43
+ mask (torch.Tensor): output_mask
44
+ shape: (batch_size, 1, mel_timesteps)
45
+ n_timesteps (int): number of diffusion steps
46
+ temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
47
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
48
+ shape: (batch_size, spk_emb_dim)
49
+ cond (torch.Tensor, optional): conditioning mel features, shape (batch_size, n_feats, mel_timesteps). Defaults to None.
50
+
51
+ Returns:
52
+ sample: generated mel-spectrogram
53
+ shape: (batch_size, n_feats, mel_timesteps)
54
+ """
55
+
56
+ z = torch.randn_like(mu).to(mu.device).to(mu.dtype) * temperature
57
+ cache_size = cache.shape[2]
58
+ # fix prompt and overlap part mu and z
59
+ if cache_size != 0:
60
+ z[:, :, :cache_size] = cache[:, :, :, 0]
61
+ mu[:, :, :cache_size] = cache[:, :, :, 1]
62
+ z_cache = torch.concat([z[:, :, :prompt_len], z[:, :, -34:]], dim=2)
63
+ mu_cache = torch.concat([mu[:, :, :prompt_len], mu[:, :, -34:]], dim=2)
64
+ cache = torch.stack([z_cache, mu_cache], dim=-1)
65
+
66
+ t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype)
67
+ if self.t_scheduler == 'cosine':
68
+ t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
69
+ return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond), cache
70
+
71
+ def solve_euler(self, x, t_span, mu, mask, spks, cond, streaming=False):
72
+ """
73
+ Fixed-step Euler solver for ODEs.
74
+ Args:
75
+ x (torch.Tensor): random noise
76
+ t_span (torch.Tensor): n_timesteps interpolated
77
+ shape: (n_timesteps + 1,)
78
+ mu (torch.Tensor): output of encoder
79
+ shape: (batch_size, n_feats, mel_timesteps)
80
+ mask (torch.Tensor): output_mask
81
+ shape: (batch_size, 1, mel_timesteps)
82
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
83
+ shape: (batch_size, spk_emb_dim)
84
+ cond (torch.Tensor, optional): conditioning mel features, shape (batch_size, n_feats, mel_timesteps). Defaults to None.
85
+ """
86
+ t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
87
+ t = t.unsqueeze(dim=0)
88
+
89
+ # I am storing this because I can later plot it by putting a debugger here and saving it to a file
90
+ # Or in future might add like a return_all_steps flag
91
+ sol = []
92
+
93
+ # Do not use concat, it may change the memory format and cause TRT to infer wrong results!
94
+ # NOTE when flow run in amp mode, x.dtype is float32, which cause nan in trt fp16 inference, so set dtype=spks.dtype
95
+ x_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=spks.dtype)
96
+ mask_in = torch.zeros([2, 1, x.size(2)], device=x.device, dtype=spks.dtype)
97
+ mu_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=spks.dtype)
98
+ t_in = torch.zeros([2], device=x.device, dtype=spks.dtype)
99
+ spks_in = torch.zeros([2, 80], device=x.device, dtype=spks.dtype)
100
+ cond_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=spks.dtype)
101
+ for step in range(1, len(t_span)):
102
+ # Classifier-Free Guidance inference introduced in VoiceBox
103
+ x_in[:] = x
104
+ mask_in[:] = mask
105
+ mu_in[0] = mu
106
+ t_in[:] = t.unsqueeze(0)
107
+ spks_in[0] = spks
108
+ cond_in[0] = cond
109
+ dphi_dt = self.forward_estimator(
110
+ x_in, mask_in,
111
+ mu_in, t_in,
112
+ spks_in,
113
+ cond_in,
114
+ streaming
115
+ )
116
+ dphi_dt, cfg_dphi_dt = torch.split(dphi_dt, [x.size(0), x.size(0)], dim=0)
117
+ dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - self.inference_cfg_rate * cfg_dphi_dt)
118
+ x = x + dt * dphi_dt
119
+ t = t + dt
120
+ sol.append(x)
121
+ if step < len(t_span) - 1:
122
+ dt = t_span[step + 1] - t
123
+
124
+ return sol[-1].float()
125
+
126
+ def forward_estimator(self, x, mask, mu, t, spks, cond, streaming=False):
127
+ if isinstance(self.estimator, torch.nn.Module):
128
+ return self.estimator(x, mask, mu, t, spks, cond, streaming=streaming)
129
+ else:
130
+ [estimator, stream], trt_engine = self.estimator.acquire_estimator()
131
+ # NOTE need to synchronize when switching stream
132
+ torch.cuda.current_stream().synchronize()
133
+ with stream:
134
+ estimator.set_input_shape('x', (2, 80, x.size(2)))
135
+ estimator.set_input_shape('mask', (2, 1, x.size(2)))
136
+ estimator.set_input_shape('mu', (2, 80, x.size(2)))
137
+ estimator.set_input_shape('t', (2,))
138
+ estimator.set_input_shape('spks', (2, 80))
139
+ estimator.set_input_shape('cond', (2, 80, x.size(2)))
140
+ data_ptrs = [x.contiguous().data_ptr(),
141
+ mask.contiguous().data_ptr(),
142
+ mu.contiguous().data_ptr(),
143
+ t.contiguous().data_ptr(),
144
+ spks.contiguous().data_ptr(),
145
+ cond.contiguous().data_ptr(),
146
+ x.data_ptr()]
147
+ for i, j in enumerate(data_ptrs):
148
+ estimator.set_tensor_address(trt_engine.get_tensor_name(i), j)
149
+ # run trt engine
150
+ assert estimator.execute_async_v3(torch.cuda.current_stream().cuda_stream) is True
151
+ torch.cuda.current_stream().synchronize()
152
+ self.estimator.release_estimator(estimator, stream)
153
+ return x
154
+
155
+ def compute_loss(self, x1, mask, mu, spks=None, cond=None, streaming=False):
156
+ """Computes diffusion loss
157
+
158
+ Args:
159
+ x1 (torch.Tensor): Target
160
+ shape: (batch_size, n_feats, mel_timesteps)
161
+ mask (torch.Tensor): target mask
162
+ shape: (batch_size, 1, mel_timesteps)
163
+ mu (torch.Tensor): output of encoder
164
+ shape: (batch_size, n_feats, mel_timesteps)
165
+ spks (torch.Tensor, optional): speaker embedding. Defaults to None.
166
+ shape: (batch_size, spk_emb_dim)
167
+
168
+ Returns:
169
+ loss: conditional flow matching loss
170
+ y: conditional flow
171
+ shape: (batch_size, n_feats, mel_timesteps)
172
+ """
173
+ b, _, t = mu.shape
174
+
175
+ # random timestep
176
+ t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
177
+ if self.t_scheduler == 'cosine':
178
+ t = 1 - torch.cos(t * 0.5 * torch.pi)
179
+ # sample noise p(x_0)
180
+ z = torch.randn_like(x1)
181
+
182
+ y = (1 - (1 - self.sigma_min) * t) * z + t * x1
183
+ u = x1 - (1 - self.sigma_min) * z
184
+
185
+ # during training, we randomly drop condition to trade off mode coverage and sample fidelity
186
+ if self.training_cfg_rate > 0:
187
+ cfg_mask = torch.rand(b, device=x1.device) > self.training_cfg_rate
188
+ mu = mu * cfg_mask.view(-1, 1, 1)
189
+ spks = spks * cfg_mask.view(-1, 1)
190
+ cond = cond * cfg_mask.view(-1, 1, 1)
191
+
192
+ pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond, streaming=streaming)
193
+ loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1])
194
+ return loss, y
195
+
196
+
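As a reading aid for compute_loss above: with noise z ~ N(0, I) and a per-sample timestep t (cosine-warped when t_scheduler == 'cosine'), the interpolant and regression target are

    y_t = \bigl(1 - (1 - \sigma_{\min})\, t\bigr)\, z + t\, x_1, \qquad u = x_1 - (1 - \sigma_{\min})\, z

and the loss is the masked MSE

    \mathcal{L} = \frac{\lVert \bigl(v_\theta(y_t, t, \mu, \mathrm{spks}, \mathrm{cond}) - u\bigr) \odot m \rVert_2^2}{n_{\mathrm{feats}} \sum m}

where m is the frame mask; mu, spks and cond are zeroed jointly per sample with probability training_cfg_rate, so the same estimator supports the classifier-free guidance applied in solve_euler at inference.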
197
+ class CausalConditionalCFM(ConditionalCFM):
198
+ def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, estimator: torch.nn.Module = None):
199
+ super().__init__(in_channels, cfm_params, n_spks, spk_emb_dim, estimator)
200
+ set_all_random_seed(0)
201
+ self.rand_noise = torch.randn([1, 80, 50 * 300])
202
+
203
+ @torch.inference_mode()
204
+ def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None, streaming=False):
205
+ """Forward diffusion
206
+
207
+ Args:
208
+ mu (torch.Tensor): output of encoder
209
+ shape: (batch_size, n_feats, mel_timesteps)
210
+ mask (torch.Tensor): output_mask
211
+ shape: (batch_size, 1, mel_timesteps)
212
+ n_timesteps (int): number of diffusion steps
213
+ temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
214
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
215
+ shape: (batch_size, spk_emb_dim)
216
+ cond (torch.Tensor, optional): conditioning mel features, shape (batch_size, n_feats, mel_timesteps). Defaults to None.
217
+
218
+ Returns:
219
+ sample: generated mel-spectrogram
220
+ shape: (batch_size, n_feats, mel_timesteps)
221
+ """
222
+
223
+ z = self.rand_noise[:, :, :mu.size(2)].to(mu.device).to(mu.dtype) * temperature
224
+ # fix prompt and overlap part mu and z
225
+ t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype)
226
+ if self.t_scheduler == 'cosine':
227
+ t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
228
+ return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond, streaming=streaming), None
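Both CFM variants warp the uniform integration grid when t_scheduler == 'cosine', which makes the Euler steps small near t = 0 (the noise end) and larger near t = 1. A tiny standalone sketch of the warp used above (values approximate):

import torch

n_timesteps = 10
t_span = torch.linspace(0, 1, n_timesteps + 1)
t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)   # same warp as in forward()
print(t_span[:3])   # roughly [0.0000, 0.0123, 0.0489]: early steps are much finer than 1/10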
cosyvoice/flow/length_regulator.py ADDED
@@ -0,0 +1,70 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import Tuple
15
+ import torch.nn as nn
16
+ import torch
17
+ from torch.nn import functional as F
18
+ from cosyvoice.utils.mask import make_pad_mask
19
+
20
+
21
+ class InterpolateRegulator(nn.Module):
22
+ def __init__(
23
+ self,
24
+ channels: int,
25
+ sampling_ratios: Tuple,
26
+ out_channels: int = None,
27
+ groups: int = 1,
28
+ ):
29
+ super().__init__()
30
+ self.sampling_ratios = sampling_ratios
31
+ out_channels = out_channels or channels
32
+ model = nn.ModuleList([])
33
+ if len(sampling_ratios) > 0:
34
+ for _ in sampling_ratios:
35
+ module = nn.Conv1d(channels, channels, 3, 1, 1)
36
+ norm = nn.GroupNorm(groups, channels)
37
+ act = nn.Mish()
38
+ model.extend([module, norm, act])
39
+ model.append(
40
+ nn.Conv1d(channels, out_channels, 1, 1)
41
+ )
42
+ self.model = nn.Sequential(*model)
43
+
44
+ def forward(self, x, ylens=None):
45
+ # x in (B, T, D)
46
+ mask = (~make_pad_mask(ylens)).to(x).unsqueeze(-1)
47
+ x = F.interpolate(x.transpose(1, 2).contiguous(), size=ylens.max(), mode='linear')
48
+ out = self.model(x).transpose(1, 2).contiguous()
49
+ olens = ylens
50
+ return out * mask, olens
51
+
52
+ def inference(self, x1, x2, mel_len1, mel_len2, input_frame_rate=50):
53
+ # in inference mode, interpolate prompt token and token (head/mid/tail) separately, so we can get a clear separation point of mel
54
+ # NOTE 20 corresponds to token_overlap_len in cosyvoice/cli/model.py
55
+ # x in (B, T, D)
56
+ if x2.shape[1] > 40:
57
+ x2_head = F.interpolate(x2[:, :20].transpose(1, 2).contiguous(), size=int(20 / input_frame_rate * 22050 / 256), mode='linear')
58
+ x2_mid = F.interpolate(x2[:, 20:-20].transpose(1, 2).contiguous(), size=mel_len2 - int(20 / input_frame_rate * 22050 / 256) * 2,
59
+ mode='linear')
60
+ x2_tail = F.interpolate(x2[:, -20:].transpose(1, 2).contiguous(), size=int(20 / input_frame_rate * 22050 / 256), mode='linear')
61
+ x2 = torch.concat([x2_head, x2_mid, x2_tail], dim=2)
62
+ else:
63
+ x2 = F.interpolate(x2.transpose(1, 2).contiguous(), size=mel_len2, mode='linear')
64
+ if x1.shape[1] != 0:
65
+ x1 = F.interpolate(x1.transpose(1, 2).contiguous(), size=mel_len1, mode='linear')
66
+ x = torch.concat([x1, x2], dim=2)
67
+ else:
68
+ x = x2
69
+ out = self.model(x).transpose(1, 2).contiguous()
70
+ return out, mel_len1 + mel_len2
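The head/mid/tail split in InterpolateRegulator.inference pins the 20-token overlap regions (token_overlap_len in cosyvoice/cli/model.py, per the comment above) to a fixed mel length so chunk boundaries line up across streaming calls; 20 tokens at 50 Hz map to int(20 / 50 * 22050 / 256) = 34 mel frames, which appears to match the 34-frame cache slice kept in ConditionalCFM.forward. A standalone sketch of the same interpolation arithmetic using only torch (sizes illustrative):

import torch
import torch.nn.functional as F

input_frame_rate = 50
overlap_mel = int(20 / input_frame_rate * 22050 / 256)            # 34 mel frames per 20-token overlap

x2 = torch.randn(1, 60, 80)                                       # 60 encoded tokens, 80-dim features
mel_len2 = int(60 / input_frame_rate * 22050 / 256)               # 103 target mel frames
head = F.interpolate(x2[:, :20].transpose(1, 2), size=overlap_mel, mode='linear')
mid = F.interpolate(x2[:, 20:-20].transpose(1, 2), size=mel_len2 - 2 * overlap_mel, mode='linear')
tail = F.interpolate(x2[:, -20:].transpose(1, 2), size=overlap_mel, mode='linear')
print(torch.cat([head, mid, tail], dim=2).shape)                  # torch.Size([1, 80, 103])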
cosyvoice/hifigan/discriminator.py ADDED
@@ -0,0 +1,230 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ try:
5
+ from torch.nn.utils.parametrizations import weight_norm, spectral_norm
6
+ except ImportError:
7
+ from torch.nn.utils import weight_norm, spectral_norm
8
+ from typing import List, Optional, Tuple
9
+ from einops import rearrange
10
+ from torchaudio.transforms import Spectrogram
11
+
12
+ LRELU_SLOPE = 0.1
13
+
14
+
15
+ class MultipleDiscriminator(nn.Module):
16
+ def __init__(
17
+ self, mpd: nn.Module, mrd: nn.Module
18
+ ):
19
+ super().__init__()
20
+ self.mpd = mpd
21
+ self.mrd = mrd
22
+
23
+ def forward(self, y: torch.Tensor, y_hat: torch.Tensor):
24
+ y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
25
+ this_y_d_rs, this_y_d_gs, this_fmap_rs, this_fmap_gs = self.mpd(y.unsqueeze(dim=1), y_hat.unsqueeze(dim=1))
26
+ y_d_rs += this_y_d_rs
27
+ y_d_gs += this_y_d_gs
28
+ fmap_rs += this_fmap_rs
29
+ fmap_gs += this_fmap_gs
30
+ this_y_d_rs, this_y_d_gs, this_fmap_rs, this_fmap_gs = self.mrd(y, y_hat)
31
+ y_d_rs += this_y_d_rs
32
+ y_d_gs += this_y_d_gs
33
+ fmap_rs += this_fmap_rs
34
+ fmap_gs += this_fmap_gs
35
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
36
+
37
+
38
+ class MultiResolutionDiscriminator(nn.Module):
39
+ def __init__(
40
+ self,
41
+ fft_sizes: Tuple[int, ...] = (2048, 1024, 512),
42
+ num_embeddings: Optional[int] = None,
43
+ ):
44
+ """
45
+ Multi-Resolution Discriminator module adapted from https://github.com/descriptinc/descript-audio-codec.
46
+ Additionally, it allows incorporating conditional information with a learned embeddings table.
47
+
48
+ Args:
49
+ fft_sizes (tuple[int]): Tuple of window lengths for FFT. Defaults to (2048, 1024, 512).
50
+ num_embeddings (int, optional): Number of embeddings. None means non-conditional discriminator.
51
+ Defaults to None.
52
+ """
53
+
54
+ super().__init__()
55
+ self.discriminators = nn.ModuleList(
56
+ [DiscriminatorR(window_length=w, num_embeddings=num_embeddings) for w in fft_sizes]
57
+ )
58
+
59
+ def forward(
60
+ self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: torch.Tensor = None
61
+ ) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]]]:
62
+ y_d_rs = []
63
+ y_d_gs = []
64
+ fmap_rs = []
65
+ fmap_gs = []
66
+
67
+ for d in self.discriminators:
68
+ y_d_r, fmap_r = d(x=y, cond_embedding_id=bandwidth_id)
69
+ y_d_g, fmap_g = d(x=y_hat, cond_embedding_id=bandwidth_id)
70
+ y_d_rs.append(y_d_r)
71
+ fmap_rs.append(fmap_r)
72
+ y_d_gs.append(y_d_g)
73
+ fmap_gs.append(fmap_g)
74
+
75
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
76
+
77
+
78
+ class DiscriminatorR(nn.Module):
79
+ def __init__(
80
+ self,
81
+ window_length: int,
82
+ num_embeddings: Optional[int] = None,
83
+ channels: int = 32,
84
+ hop_factor: float = 0.25,
85
+ bands: Tuple[Tuple[float, float], ...] = ((0.0, 0.1), (0.1, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1.0)),
86
+ ):
87
+ super().__init__()
88
+ self.window_length = window_length
89
+ self.hop_factor = hop_factor
90
+ self.spec_fn = Spectrogram(
91
+ n_fft=window_length, hop_length=int(window_length * hop_factor), win_length=window_length, power=None
92
+ )
93
+ n_fft = window_length // 2 + 1
94
+ bands = [(int(b[0] * n_fft), int(b[1] * n_fft)) for b in bands]
95
+ self.bands = bands
96
+ convs = lambda: nn.ModuleList(
97
+ [
98
+ weight_norm(nn.Conv2d(2, channels, (3, 9), (1, 1), padding=(1, 4))),
99
+ weight_norm(nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))),
100
+ weight_norm(nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))),
101
+ weight_norm(nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))),
102
+ weight_norm(nn.Conv2d(channels, channels, (3, 3), (1, 1), padding=(1, 1))),
103
+ ]
104
+ )
105
+ self.band_convs = nn.ModuleList([convs() for _ in range(len(self.bands))])
106
+
107
+ if num_embeddings is not None:
108
+ self.emb = torch.nn.Embedding(num_embeddings=num_embeddings, embedding_dim=channels)
109
+ torch.nn.init.zeros_(self.emb.weight)
110
+
111
+ self.conv_post = weight_norm(nn.Conv2d(channels, 1, (3, 3), (1, 1), padding=(1, 1)))
112
+
113
+ def spectrogram(self, x):
114
+ # Remove DC offset
115
+ x = x - x.mean(dim=-1, keepdims=True)
116
+ # Peak normalize the volume of input audio
117
+ x = 0.8 * x / (x.abs().max(dim=-1, keepdim=True)[0] + 1e-9)
118
+ x = self.spec_fn(x)
119
+ x = torch.view_as_real(x)
120
+ x = rearrange(x, "b f t c -> b c t f")
121
+ # Split into bands
122
+ x_bands = [x[..., b[0]: b[1]] for b in self.bands]
123
+ return x_bands
124
+
125
+ def forward(self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None):
126
+ x_bands = self.spectrogram(x)
127
+ fmap = []
128
+ x = []
129
+ for band, stack in zip(x_bands, self.band_convs):
130
+ for i, layer in enumerate(stack):
131
+ band = layer(band)
132
+ band = torch.nn.functional.leaky_relu(band, 0.1)
133
+ if i > 0:
134
+ fmap.append(band)
135
+ x.append(band)
136
+ x = torch.cat(x, dim=-1)
137
+ if cond_embedding_id is not None:
138
+ emb = self.emb(cond_embedding_id)
139
+ h = (emb.view(1, -1, 1, 1) * x).sum(dim=1, keepdims=True)
140
+ else:
141
+ h = 0
142
+ x = self.conv_post(x)
143
+ fmap.append(x)
144
+ x += h
145
+
146
+ return x, fmap
147
+
148
+
149
+ class MultiResSpecDiscriminator(torch.nn.Module):
150
+
151
+ def __init__(self,
152
+ fft_sizes=[1024, 2048, 512],
153
+ hop_sizes=[120, 240, 50],
154
+ win_lengths=[600, 1200, 240],
155
+ window="hann_window"):
156
+
157
+ super(MultiResSpecDiscriminator, self).__init__()
158
+ self.discriminators = nn.ModuleList([
159
+ SpecDiscriminator(fft_sizes[0], hop_sizes[0], win_lengths[0], window),
160
+ SpecDiscriminator(fft_sizes[1], hop_sizes[1], win_lengths[1], window),
161
+ SpecDiscriminator(fft_sizes[2], hop_sizes[2], win_lengths[2], window)])
162
+
163
+ def forward(self, y, y_hat):
164
+ y_d_rs = []
165
+ y_d_gs = []
166
+ fmap_rs = []
167
+ fmap_gs = []
168
+ for _, d in enumerate(self.discriminators):
169
+ y_d_r, fmap_r = d(y)
170
+ y_d_g, fmap_g = d(y_hat)
171
+ y_d_rs.append(y_d_r)
172
+ fmap_rs.append(fmap_r)
173
+ y_d_gs.append(y_d_g)
174
+ fmap_gs.append(fmap_g)
175
+
176
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
177
+
178
+
179
+ def stft(x, fft_size, hop_size, win_length, window):
180
+ """Perform STFT and convert to magnitude spectrogram.
181
+ Args:
182
+ x (Tensor): Input signal tensor (B, T).
183
+ fft_size (int): FFT size.
184
+ hop_size (int): Hop size.
185
+ win_length (int): Window length.
186
+ window (str): Window function type.
187
+ Returns:
188
+ Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
189
+ """
190
+ x_stft = torch.stft(x, fft_size, hop_size, win_length, window, return_complex=True)
191
+
192
+ # NOTE(kan-bayashi): clamp is needed to avoid nan or inf
193
+ return torch.abs(x_stft).transpose(2, 1)
194
+
195
+
196
+ class SpecDiscriminator(nn.Module):
197
+ """docstring for Discriminator."""
198
+
199
+ def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window", use_spectral_norm=False):
200
+ super(SpecDiscriminator, self).__init__()
201
+ norm_f = weight_norm if use_spectral_norm is False else spectral_norm
202
+ self.fft_size = fft_size
203
+ self.shift_size = shift_size
204
+ self.win_length = win_length
205
+ self.window = getattr(torch, window)(win_length)
206
+ self.discriminators = nn.ModuleList([
207
+ norm_f(nn.Conv2d(1, 32, kernel_size=(3, 9), padding=(1, 4))),
208
+ norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))),
209
+ norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))),
210
+ norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))),
211
+ norm_f(nn.Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))),
212
+ ])
213
+
214
+ self.out = norm_f(nn.Conv2d(32, 1, 3, 1, 1))
215
+
216
+ def forward(self, y):
217
+
218
+ fmap = []
219
+ y = y.squeeze(1)
220
+ y = stft(y, self.fft_size, self.shift_size, self.win_length, self.window.to(y.device))
221
+ y = y.unsqueeze(1)
222
+ for _, d in enumerate(self.discriminators):
223
+ y = d(y)
224
+ y = F.leaky_relu(y, LRELU_SLOPE)
225
+ fmap.append(y)
226
+
227
+ y = self.out(y)
228
+ fmap.append(y)
229
+
230
+ return torch.flatten(y, 1, -1), fmap
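A minimal shape check for the multi-resolution spectral discriminator above (batch size and sample count are arbitrary; the import path follows this upload's layout):

import torch
from cosyvoice.hifigan.discriminator import MultiResSpecDiscriminator

disc = MultiResSpecDiscriminator()
y = torch.randn(2, 16000)            # real waveforms
y_hat = torch.randn(2, 16000)        # generated waveforms
y_d_rs, y_d_gs, fmap_rs, fmap_gs = disc(y, y_hat)
print(len(y_d_rs), len(fmap_rs))     # 3 resolutions each for real and generated inputs
print(y_d_rs[0].shape)               # flattened logits for the first resolution, shape (2, N)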
cosyvoice/hifigan/f0_predictor.py ADDED
@@ -0,0 +1,103 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import torch
15
+ import torch.nn as nn
16
+ try:
17
+ from torch.nn.utils.parametrizations import weight_norm
18
+ except ImportError:
19
+ from torch.nn.utils import weight_norm
20
+ from cosyvoice.transformer.convolution import CausalConv1d
21
+
22
+
23
+ class ConvRNNF0Predictor(nn.Module):
24
+ def __init__(self,
25
+ num_class: int = 1,
26
+ in_channels: int = 80,
27
+ cond_channels: int = 512
28
+ ):
29
+ super().__init__()
30
+
31
+ self.num_class = num_class
32
+ self.condnet = nn.Sequential(
33
+ weight_norm(
34
+ nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1)
35
+ ),
36
+ nn.ELU(),
37
+ weight_norm(
38
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
39
+ ),
40
+ nn.ELU(),
41
+ weight_norm(
42
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
43
+ ),
44
+ nn.ELU(),
45
+ weight_norm(
46
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
47
+ ),
48
+ nn.ELU(),
49
+ weight_norm(
50
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
51
+ ),
52
+ nn.ELU(),
53
+ )
54
+ self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class)
55
+
56
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
57
+ x = self.condnet(x)
58
+ x = x.transpose(1, 2)
59
+ return torch.abs(self.classifier(x).squeeze(-1))
60
+
61
+
62
+ class CausalConvRNNF0Predictor(nn.Module):
63
+ def __init__(self,
64
+ num_class: int = 1,
65
+ in_channels: int = 80,
66
+ cond_channels: int = 512
67
+ ):
68
+ super().__init__()
69
+
70
+ self.num_class = num_class
71
+ self.condnet = nn.Sequential(
72
+ weight_norm(
73
+ CausalConv1d(in_channels, cond_channels, kernel_size=4, causal_type='right')
74
+ ),
75
+ nn.ELU(),
76
+ weight_norm(
77
+ CausalConv1d(cond_channels, cond_channels, kernel_size=3, causal_type='left')
78
+ ),
79
+ nn.ELU(),
80
+ weight_norm(
81
+ CausalConv1d(cond_channels, cond_channels, kernel_size=3, causal_type='left')
82
+ ),
83
+ nn.ELU(),
84
+ weight_norm(
85
+ CausalConv1d(cond_channels, cond_channels, kernel_size=3, causal_type='left')
86
+ ),
87
+ nn.ELU(),
88
+ weight_norm(
89
+ CausalConv1d(cond_channels, cond_channels, kernel_size=3, causal_type='left')
90
+ ),
91
+ nn.ELU(),
92
+ )
93
+ self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class)
94
+
95
+ def forward(self, x: torch.Tensor, finalize: bool = True) -> torch.Tensor:
96
+ if finalize is True:
97
+ x = self.condnet[0](x)
98
+ else:
99
+ x = self.condnet[0](x[:, :, :-self.condnet[0].causal_padding], x[:, :, -self.condnet[0].causal_padding:])
100
+ for i in range(1, len(self.condnet)):
101
+ x = self.condnet[i](x)
102
+ x = x.transpose(1, 2)
103
+ return torch.abs(self.classifier(x).squeeze(-1))
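And a corresponding shape check for the non-causal F0 predictor above; the input is an 80-bin mel of arbitrary length:

import torch
from cosyvoice.hifigan.f0_predictor import ConvRNNF0Predictor

predictor = ConvRNNF0Predictor()     # defaults: in_channels=80, cond_channels=512
mel = torch.randn(1, 80, 200)        # (batch, mel_bins, frames)
f0 = predictor(mel)
print(f0.shape)                      # torch.Size([1, 200]): one non-negative F0 value per frame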
cosyvoice/hifigan/generator.py ADDED
@@ -0,0 +1,746 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """HIFI-GAN"""
16
+
17
+ from typing import Dict, Optional, List
18
+ import numpy as np
19
+ from scipy.signal import get_window
20
+ import torch
21
+ import torch.nn as nn
22
+ import torch.nn.functional as F
23
+ from torch.nn import Conv1d
24
+ from torch.nn import ConvTranspose1d
25
+ from torch.nn.utils import remove_weight_norm
26
+ try:
27
+ from torch.nn.utils.parametrizations import weight_norm
28
+ except ImportError:
29
+ from torch.nn.utils import weight_norm
30
+ from torch.distributions.uniform import Uniform
31
+ from cosyvoice.transformer.convolution import CausalConv1d, CausalConv1dDownSample, CausalConv1dUpsample
32
+ from cosyvoice.transformer.activation import Snake
33
+ from cosyvoice.utils.common import get_padding
34
+ from cosyvoice.utils.common import init_weights
35
+
36
+
37
+ """hifigan based generator implementation.
38
+
39
+ This code is modified from https://github.com/jik876/hifi-gan
40
+ ,https://github.com/kan-bayashi/ParallelWaveGAN and
41
+ https://github.com/NVIDIA/BigVGAN
42
+
43
+ """
44
+
45
+
46
+ class ResBlock(torch.nn.Module):
47
+ """Residual block module in HiFiGAN/BigVGAN."""
48
+ def __init__(
49
+ self,
50
+ channels: int = 512,
51
+ kernel_size: int = 3,
52
+ dilations: List[int] = [1, 3, 5],
53
+ causal: bool = False,
54
+ ):
55
+ super(ResBlock, self).__init__()
56
+ self.causal = causal
57
+ self.convs1 = nn.ModuleList()
58
+ self.convs2 = nn.ModuleList()
59
+
60
+ for dilation in dilations:
61
+ self.convs1.append(
62
+ weight_norm(
63
+ Conv1d(
64
+ channels,
65
+ channels,
66
+ kernel_size,
67
+ 1,
68
+ dilation=dilation,
69
+ padding=get_padding(kernel_size, dilation)) if causal is False else
70
+ CausalConv1d(
71
+ channels,
72
+ channels,
73
+ kernel_size,
74
+ 1,
75
+ dilation=dilation,
76
+ causal_type='left'
77
+ )
78
+ )
79
+ )
80
+ self.convs2.append(
81
+ weight_norm(
82
+ Conv1d(
83
+ channels,
84
+ channels,
85
+ kernel_size,
86
+ 1,
87
+ dilation=1,
88
+ padding=get_padding(kernel_size, 1)) if causal is False else
89
+ CausalConv1d(
90
+ channels,
91
+ channels,
92
+ kernel_size,
93
+ 1,
94
+ dilation=1,
95
+ causal_type='left'
96
+ )
97
+ )
98
+ )
99
+ self.convs1.apply(init_weights)
100
+ self.convs2.apply(init_weights)
101
+ self.activations1 = nn.ModuleList([
102
+ Snake(channels, alpha_logscale=False)
103
+ for _ in range(len(self.convs1))
104
+ ])
105
+ self.activations2 = nn.ModuleList([
106
+ Snake(channels, alpha_logscale=False)
107
+ for _ in range(len(self.convs2))
108
+ ])
109
+
110
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
111
+ for idx in range(len(self.convs1)):
112
+ xt = self.activations1[idx](x)
113
+ xt = self.convs1[idx](xt)
114
+ xt = self.activations2[idx](xt)
115
+ xt = self.convs2[idx](xt)
116
+ x = xt + x
117
+ return x
118
+
119
+ def remove_weight_norm(self):
120
+ for idx in range(len(self.convs1)):
121
+ remove_weight_norm(self.convs1[idx])
122
+ remove_weight_norm(self.convs2[idx])
123
+
124
+
125
+ class SineGen(torch.nn.Module):
126
+ """ Definition of sine generator
127
+ SineGen(samp_rate, harmonic_num = 0,
128
+ sine_amp = 0.1, noise_std = 0.003,
129
+ voiced_threshold = 0,
130
+ flag_for_pulse=False)
131
+ samp_rate: sampling rate in Hz
132
+ harmonic_num: number of harmonic overtones (default 0)
133
+ sine_amp: amplitude of sine-waveform (default 0.1)
134
+ noise_std: std of Gaussian noise (default 0.003)
135
+ voiced_threshold: F0 threshold for U/V classification (default 0)
136
+ flag_for_pulse: this SineGen is used inside PulseGen (default False)
137
+ Note: when flag_for_pulse is True, the first time step of a voiced
138
+ segment is always sin(np.pi) or cos(0)
139
+ """
140
+
141
+ def __init__(self, samp_rate, harmonic_num=0,
142
+ sine_amp=0.1, noise_std=0.003,
143
+ voiced_threshold=0):
144
+ super(SineGen, self).__init__()
145
+ self.sine_amp = sine_amp
146
+ self.noise_std = noise_std
147
+ self.harmonic_num = harmonic_num
148
+ self.sampling_rate = samp_rate
149
+ self.voiced_threshold = voiced_threshold
150
+
151
+ def _f02uv(self, f0):
152
+ # generate uv signal
153
+ uv = (f0 > self.voiced_threshold).type(torch.float32)
154
+ return uv
155
+
156
+ @torch.no_grad()
157
+ def forward(self, f0):
158
+ """ sine_tensor, uv = forward(f0)
159
+ input F0: tensor(batchsize=1, dim=1, length)
160
+ f0 for unvoiced steps should be 0
161
+ output sine_tensor: tensor(batchsize=1, length, dim)
162
+ output uv: tensor(batchsize=1, length, 1)
163
+ """
164
+ f0 = f0.transpose(1, 2)
165
+ F_mat = torch.zeros((f0.size(0), self.harmonic_num + 1, f0.size(-1))).to(f0.device)
166
+ for i in range(self.harmonic_num + 1):
167
+ F_mat[:, i: i + 1, :] = f0 * (i + 1) / self.sampling_rate
168
+
169
+ theta_mat = 2 * np.pi * (torch.cumsum(F_mat, dim=-1) % 1)
170
+ u_dist = Uniform(low=-np.pi, high=np.pi)
171
+ phase_vec = u_dist.sample(sample_shape=(f0.size(0), self.harmonic_num + 1, 1)).to(F_mat.device)
172
+ phase_vec[:, 0, :] = 0
173
+
174
+ # generate sine waveforms
175
+ sine_waves = self.sine_amp * torch.sin(theta_mat + phase_vec)
176
+
177
+ # generate uv signal
178
+ uv = self._f02uv(f0)
179
+
180
+ # noise: for unvoiced should be similar to sine_amp
181
+ # std = self.sine_amp/3 -> max value ~ self.sine_amp
182
+ # for voiced regions the std is self.noise_std
183
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
184
+ noise = noise_amp * torch.randn_like(sine_waves)
185
+
186
+ # first: set the unvoiced part to 0 by uv
187
+ # then: additive noise
188
+ sine_waves = sine_waves * uv + noise
189
+ return sine_waves.transpose(1, 2), uv.transpose(1, 2), noise
190
+
191
+
192
+ class SineGen2(torch.nn.Module):
193
+ """ Definition of sine generator
194
+ SineGen(samp_rate, harmonic_num = 0,
195
+ sine_amp = 0.1, noise_std = 0.003,
196
+ voiced_threshold = 0,
197
+ flag_for_pulse=False)
198
+ samp_rate: sampling rate in Hz
199
+ harmonic_num: number of harmonic overtones (default 0)
200
+ sine_amp: amplitude of sine waveform (default 0.1)
201
+ noise_std: std of Gaussian noise (default 0.003)
202
+ voiced_threshold: F0 threshold for U/V classification (default 0)
203
+ flag_for_pulse: this SineGen is used inside PulseGen (default False)
204
+ Note: when flag_for_pulse is True, the first time step of a voiced
205
+ segment is always sin(np.pi) or cos(0)
206
+ """
207
+
208
+ def __init__(self, samp_rate, upsample_scale, harmonic_num=0,
209
+ sine_amp=0.1, noise_std=0.003,
210
+ voiced_threshold=0,
211
+ flag_for_pulse=False,
212
+ causal=False):
213
+ super(SineGen2, self).__init__()
214
+ self.sine_amp = sine_amp
215
+ self.noise_std = noise_std
216
+ self.harmonic_num = harmonic_num
217
+ self.dim = self.harmonic_num + 1
218
+ self.sampling_rate = samp_rate
219
+ self.voiced_threshold = voiced_threshold
220
+ self.flag_for_pulse = flag_for_pulse
221
+ self.upsample_scale = upsample_scale
222
+ self.causal = causal
223
+ if causal is True:
224
+ self.rand_ini = torch.rand(1, 9)
225
+ self.rand_ini[:, 0] = 0
226
+ self.sine_waves = torch.rand(1, 300 * 24000, 9)
227
+
228
+ def _f02uv(self, f0):
229
+ # generate uv signal
230
+ uv = (f0 > self.voiced_threshold).type(torch.float32)
231
+ return uv
232
+
233
+ def _f02sine(self, f0_values):
234
+ """ f0_values: (batchsize, length, dim)
235
+ where dim indicates fundamental tone and overtones
236
+ """
237
+ # convert to F0 in rad. The integer part n can be ignored
238
+ # because 2 * np.pi * n doesn't affect phase
239
+ rad_values = (f0_values / self.sampling_rate) % 1
240
+
241
+ # initial phase noise (no noise for fundamental component)
242
+ if self.training is False and self.causal is True:
243
+ rad_values[:, 0, :] = rad_values[:, 0, :] + self.rand_ini.to(rad_values.device)
244
+ else:
245
+ rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], device=f0_values.device)
246
+ rand_ini[:, 0] = 0
247
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
248
+
249
+ # instantaneous phase sine[t] = sin(2 * pi * \sum_{i=1}^{t} rad)
250
+ if not self.flag_for_pulse:
251
+ rad_values = torch.nn.functional.interpolate(rad_values.transpose(1, 2),
252
+ scale_factor=1 / self.upsample_scale,
253
+ mode="linear").transpose(1, 2)
254
+
255
+ phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi
256
+ phase = torch.nn.functional.interpolate(phase.transpose(1, 2) * self.upsample_scale,
257
+ scale_factor=self.upsample_scale, mode="nearest" if self.causal is True else 'linear').transpose(1, 2)
258
+ sines = torch.sin(phase)
259
+ else:
260
+ # If necessary, make sure that the first time step of every
261
+ # voiced segment is sin(pi) or cos(0)
262
+ # This is used for pulse-train generation
263
+
264
+ # identify the last time step in unvoiced segments
265
+ uv = self._f02uv(f0_values)
266
+ uv_1 = torch.roll(uv, shifts=-1, dims=1)
267
+ uv_1[:, -1, :] = 1
268
+ u_loc = (uv < 1) * (uv_1 > 0)
269
+
270
+ # get the instantaneous phase
271
+ tmp_cumsum = torch.cumsum(rad_values, dim=1)
272
+ # different batch needs to be processed differently
273
+ for idx in range(f0_values.shape[0]):
274
+ temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
275
+ temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
276
+ # stores the accumulation of i.phase within
277
+ # each voiced segment
278
+ tmp_cumsum[idx, :, :] = 0
279
+ tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
280
+
281
+ # rad_values - tmp_cumsum: remove the accumulation of i.phase
282
+ # within the previous voiced segment.
283
+ i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
284
+
285
+ # get the sines
286
+ sines = torch.cos(i_phase * 2 * np.pi)
287
+ return sines
288
+
289
+ def forward(self, f0):
290
+ """ sine_tensor, uv = forward(f0)
291
+ input F0: tensor(batchsize=1, length, dim=1)
292
+ f0 for unvoiced steps should be 0
293
+ output sine_tensor: tensor(batchsize=1, length, dim)
294
+ output uv: tensor(batchsize=1, length, 1)
295
+ """
296
+ # fundamental component
297
+ fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
298
+
299
+ # generate sine waveforms
300
+ sine_waves = self._f02sine(fn) * self.sine_amp
301
+
302
+ # generate uv signal
303
+ uv = self._f02uv(f0)
304
+
305
+ # noise: for unvoiced should be similar to sine_amp
306
+ # std = self.sine_amp/3 -> max value ~ self.sine_amp
307
+ # for voiced regions the std is self.noise_std
308
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
309
+ if self.training is False and self.causal is True:
310
+ noise = noise_amp * self.sine_waves[:, :sine_waves.shape[1]].to(sine_waves.device)
311
+ else:
312
+ noise = noise_amp * torch.randn_like(sine_waves)
313
+
314
+ # first: set the unvoiced part to 0 by uv
315
+ # then: additive noise
316
+ sine_waves = sine_waves * uv + noise
317
+ return sine_waves, uv, noise
318
+
319
+
320
+ class SourceModuleHnNSF(torch.nn.Module):
321
+ """ SourceModule for hn-nsf
322
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
323
+ add_noise_std=0.003, voiced_threshod=0)
324
+ sampling_rate: sampling_rate in Hz
325
+ harmonic_num: number of harmonic above F0 (default: 0)
326
+ sine_amp: amplitude of sine source signal (default: 0.1)
327
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
328
+ note that amplitude of noise in unvoiced is decided
329
+ by sine_amp
330
+ voiced_threshold: threshold to set U/V given F0 (default: 0)
331
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
332
+ F0_sampled (batchsize, length, 1)
333
+ Sine_source (batchsize, length, 1)
334
+ noise_source (batchsize, length, 1)
335
+ uv (batchsize, length, 1)
336
+ """
337
+
338
+ def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
339
+ add_noise_std=0.003, voiced_threshod=0, sinegen_type='1', causal=False):
340
+ super(SourceModuleHnNSF, self).__init__()
341
+
342
+ self.sine_amp = sine_amp
343
+ self.noise_std = add_noise_std
344
+
345
+ # to produce sine waveforms
346
+ if sinegen_type == '1':
347
+ self.l_sin_gen = SineGen(sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod)
348
+ else:
349
+ self.l_sin_gen = SineGen2(sampling_rate, upsample_scale, harmonic_num, sine_amp, add_noise_std, voiced_threshod, causal=causal)
350
+
351
+ # to merge source harmonics into a single excitation
352
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
353
+ self.l_tanh = torch.nn.Tanh()
354
+ self.causal = causal
355
+ if causal is True:
356
+ self.uv = torch.rand(1, 300 * 24000, 1)
357
+
358
+ def forward(self, x):
359
+ """
360
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
361
+ F0_sampled (batchsize, length, 1)
362
+ Sine_source (batchsize, length, 1)
363
+ noise_source (batchsize, length, 1)
364
+ """
365
+ # source for harmonic branch
366
+ with torch.no_grad():
367
+ sine_wavs, uv, _ = self.l_sin_gen(x)
368
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
369
+
370
+ # source for noise branch, in the same shape as uv
371
+ if self.training is False and self.causal is True:
372
+ noise = self.uv[:, :uv.shape[1]] * self.sine_amp / 3
373
+ else:
374
+ noise = torch.randn_like(uv) * self.sine_amp / 3
375
+ return sine_merge, noise, uv
376
+
377
+
378
+ class HiFTGenerator(nn.Module):
379
+ """
380
+ HiFTNet Generator: Neural Source Filter + ISTFTNet
381
+ https://arxiv.org/abs/2309.09493
382
+ """
383
+ def __init__(
384
+ self,
385
+ in_channels: int = 80,
386
+ base_channels: int = 512,
387
+ nb_harmonics: int = 8,
388
+ sampling_rate: int = 22050,
389
+ nsf_alpha: float = 0.1,
390
+ nsf_sigma: float = 0.003,
391
+ nsf_voiced_threshold: float = 10,
392
+ upsample_rates: List[int] = [8, 8],
393
+ upsample_kernel_sizes: List[int] = [16, 16],
394
+ istft_params: Dict[str, int] = {"n_fft": 16, "hop_len": 4},
395
+ resblock_kernel_sizes: List[int] = [3, 7, 11],
396
+ resblock_dilation_sizes: List[List[int]] = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
397
+ source_resblock_kernel_sizes: List[int] = [7, 11],
398
+ source_resblock_dilation_sizes: List[List[int]] = [[1, 3, 5], [1, 3, 5]],
399
+ lrelu_slope: float = 0.1,
400
+ audio_limit: float = 0.99,
401
+ f0_predictor: torch.nn.Module = None,
402
+ ):
403
+ super(HiFTGenerator, self).__init__()
404
+
405
+ self.out_channels = 1
406
+ self.nb_harmonics = nb_harmonics
407
+ self.sampling_rate = sampling_rate
408
+ self.istft_params = istft_params
409
+ self.lrelu_slope = lrelu_slope
410
+ self.audio_limit = audio_limit
411
+
412
+ self.num_kernels = len(resblock_kernel_sizes)
413
+ self.num_upsamples = len(upsample_rates)
414
+ # NOTE in CosyVoice2, we use the original SineGen implementation
415
+ self.m_source = SourceModuleHnNSF(
416
+ sampling_rate=sampling_rate,
417
+ upsample_scale=np.prod(upsample_rates) * istft_params["hop_len"],
418
+ harmonic_num=nb_harmonics,
419
+ sine_amp=nsf_alpha,
420
+ add_noise_std=nsf_sigma,
421
+ voiced_threshod=nsf_voiced_threshold,
422
+ sinegen_type='1' if self.sampling_rate == 22050 else '2',
423
+ causal=False)
424
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates) * istft_params["hop_len"])
425
+
426
+ self.conv_pre = weight_norm(
427
+ Conv1d(in_channels, base_channels, 7, 1, padding=3)
428
+ )
429
+
430
+ # Up
431
+ self.ups = nn.ModuleList()
432
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
433
+ self.ups.append(
434
+ weight_norm(
435
+ ConvTranspose1d(
436
+ base_channels // (2**i),
437
+ base_channels // (2**(i + 1)),
438
+ k,
439
+ u,
440
+ padding=(k - u) // 2,
441
+ )
442
+ )
443
+ )
444
+
445
+ # Down
446
+ self.source_downs = nn.ModuleList()
447
+ self.source_resblocks = nn.ModuleList()
448
+ downsample_rates = [1] + upsample_rates[::-1][:-1]
449
+ downsample_cum_rates = np.cumprod(downsample_rates)
450
+ for i, (u, k, d) in enumerate(zip(downsample_cum_rates[::-1], source_resblock_kernel_sizes, source_resblock_dilation_sizes)):
451
+ if u == 1:
452
+ self.source_downs.append(
453
+ Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), 1, 1)
454
+ )
455
+ else:
456
+ self.source_downs.append(
457
+ Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), u * 2, u, padding=(u // 2))
458
+ )
459
+
460
+ self.source_resblocks.append(
461
+ ResBlock(base_channels // (2 ** (i + 1)), k, d)
462
+ )
463
+
464
+ self.resblocks = nn.ModuleList()
465
+ for i in range(len(self.ups)):
466
+ ch = base_channels // (2**(i + 1))
467
+ for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
468
+ self.resblocks.append(ResBlock(ch, k, d))
469
+
470
+ self.conv_post = weight_norm(Conv1d(ch, istft_params["n_fft"] + 2, 7, 1, padding=3))
471
+ self.ups.apply(init_weights)
472
+ self.conv_post.apply(init_weights)
473
+ self.reflection_pad = nn.ReflectionPad1d((1, 0))
474
+ self.stft_window = torch.from_numpy(get_window("hann", istft_params["n_fft"], fftbins=True).astype(np.float32))
475
+ self.f0_predictor = f0_predictor
476
+
477
+ def remove_weight_norm(self):
478
+ print('Removing weight norm...')
479
+ for l in self.ups:
480
+ remove_weight_norm(l)
481
+ for l in self.resblocks:
482
+ l.remove_weight_norm()
483
+ remove_weight_norm(self.conv_pre)
484
+ remove_weight_norm(self.conv_post)
485
+ self.m_source.remove_weight_norm()
486
+ for l in self.source_downs:
487
+ remove_weight_norm(l)
488
+ for l in self.source_resblocks:
489
+ l.remove_weight_norm()
490
+
491
+ def _stft(self, x):
492
+ spec = torch.stft(
493
+ x,
494
+ self.istft_params["n_fft"], self.istft_params["hop_len"], self.istft_params["n_fft"], window=self.stft_window.to(x.device),
495
+ return_complex=True)
496
+ spec = torch.view_as_real(spec) # [B, F, TT, 2]
497
+ return spec[..., 0], spec[..., 1]
498
+
499
+ def _istft(self, magnitude, phase):
500
+ magnitude = torch.clip(magnitude, max=1e2)
501
+ real = magnitude * torch.cos(phase)
502
+ img = magnitude * torch.sin(phase)
503
+ inverse_transform = torch.istft(torch.complex(real, img), self.istft_params["n_fft"], self.istft_params["hop_len"],
504
+ self.istft_params["n_fft"], window=self.stft_window.to(magnitude.device))
505
+ return inverse_transform
506
+
507
+ def decode(self, x: torch.Tensor, s: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
508
+ s_stft_real, s_stft_imag = self._stft(s.squeeze(1))
509
+ s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1)
510
+
511
+ x = self.conv_pre(x)
512
+ for i in range(self.num_upsamples):
513
+ x = F.leaky_relu(x, self.lrelu_slope)
514
+ x = self.ups[i](x)
515
+
516
+ if i == self.num_upsamples - 1:
517
+ x = self.reflection_pad(x)
518
+
519
+ # fusion
520
+ si = self.source_downs[i](s_stft)
521
+ si = self.source_resblocks[i](si)
522
+ x = x + si
523
+
524
+ xs = None
525
+ for j in range(self.num_kernels):
526
+ if xs is None:
527
+ xs = self.resblocks[i * self.num_kernels + j](x)
528
+ else:
529
+ xs += self.resblocks[i * self.num_kernels + j](x)
530
+ x = xs / self.num_kernels
531
+
532
+ x = F.leaky_relu(x)
533
+ x = self.conv_post(x)
534
+ magnitude = torch.exp(x[:, :self.istft_params["n_fft"] // 2 + 1, :])
535
+ phase = torch.sin(x[:, self.istft_params["n_fft"] // 2 + 1:, :]) # actually, the sin here is redundant
536
+
537
+ x = self._istft(magnitude, phase)
538
+ x = torch.clamp(x, -self.audio_limit, self.audio_limit)
539
+ return x
540
+
541
+ def forward(
542
+ self,
543
+ batch: dict,
544
+ device: torch.device,
545
+ ) -> Dict[str, Optional[torch.Tensor]]:
546
+ speech_feat = batch['speech_feat'].transpose(1, 2).to(device)
547
+ # mel->f0
548
+ f0 = self.f0_predictor(speech_feat)
549
+ # f0->source
550
+ s = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
551
+ s, _, _ = self.m_source(s)
552
+ s = s.transpose(1, 2)
553
+ # mel+source->speech
554
+ generated_speech = self.decode(x=speech_feat, s=s)
555
+ return generated_speech, f0
556
+
557
+ @torch.inference_mode()
558
+ def inference(self, speech_feat: torch.Tensor, cache_source: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
559
+ # mel->f0
560
+ f0 = self.f0_predictor(speech_feat)
561
+ # f0->source
562
+ s = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
563
+ s, _, _ = self.m_source(s)
564
+ s = s.transpose(1, 2)
565
+ # use cache_source to avoid glitch
566
+ if cache_source.shape[2] != 0:
567
+ s[:, :, :cache_source.shape[2]] = cache_source
568
+ generated_speech = self.decode(x=speech_feat, s=s)
569
+ return generated_speech, s
570
+
571
+
572
+ class CausalHiFTGenerator(HiFTGenerator):
573
+ """
574
+ HiFTNet Generator: Neural Source Filter + ISTFTNet
575
+ https://arxiv.org/abs/2309.09493
576
+ """
577
+ def __init__(
578
+ self,
579
+ in_channels: int = 80,
580
+ base_channels: int = 512,
581
+ nb_harmonics: int = 8,
582
+ sampling_rate: int = 22050,
583
+ nsf_alpha: float = 0.1,
584
+ nsf_sigma: float = 0.003,
585
+ nsf_voiced_threshold: float = 10,
586
+ upsample_rates: List[int] = [8, 8],
587
+ upsample_kernel_sizes: List[int] = [16, 16],
588
+ istft_params: Dict[str, int] = {"n_fft": 16, "hop_len": 4},
589
+ resblock_kernel_sizes: List[int] = [3, 7, 11],
590
+ resblock_dilation_sizes: List[List[int]] = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
591
+ source_resblock_kernel_sizes: List[int] = [7, 11],
592
+ source_resblock_dilation_sizes: List[List[int]] = [[1, 3, 5], [1, 3, 5]],
593
+ lrelu_slope: float = 0.1,
594
+ audio_limit: float = 0.99,
595
+ conv_pre_look_right: int = 4,
596
+ f0_predictor: torch.nn.Module = None,
597
+ ):
598
+ torch.nn.Module.__init__(self)
599
+
600
+ self.out_channels = 1
601
+ self.nb_harmonics = nb_harmonics
602
+ self.sampling_rate = sampling_rate
603
+ self.istft_params = istft_params
604
+ self.lrelu_slope = lrelu_slope
605
+ self.audio_limit = audio_limit
606
+
607
+ self.num_kernels = len(resblock_kernel_sizes)
608
+ self.num_upsamples = len(upsample_rates)
609
+ self.m_source = SourceModuleHnNSF(
610
+ sampling_rate=sampling_rate,
611
+ upsample_scale=np.prod(upsample_rates) * istft_params["hop_len"],
612
+ harmonic_num=nb_harmonics,
613
+ sine_amp=nsf_alpha,
614
+ add_noise_std=nsf_sigma,
615
+ voiced_threshod=nsf_voiced_threshold,
616
+ sinegen_type='1' if self.sampling_rate == 22050 else '2',
617
+ causal=True)
618
+ self.upsample_rates = upsample_rates
619
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates) * istft_params["hop_len"])
620
+
621
+ self.conv_pre = weight_norm(
622
+ CausalConv1d(in_channels, base_channels, conv_pre_look_right + 1, 1, causal_type='right')
623
+ )
624
+
625
+ # Up
626
+ self.ups = nn.ModuleList()
627
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
628
+ self.ups.append(
629
+ weight_norm(
630
+ CausalConv1dUpsample(
631
+ base_channels // (2**i),
632
+ base_channels // (2**(i + 1)),
633
+ k,
634
+ u,
635
+ )
636
+ )
637
+ )
638
+
639
+ # Down
640
+ self.source_downs = nn.ModuleList()
641
+ self.source_resblocks = nn.ModuleList()
642
+ downsample_rates = [1] + upsample_rates[::-1][:-1]
643
+ downsample_cum_rates = np.cumprod(downsample_rates)
644
+ for i, (u, k, d) in enumerate(zip(downsample_cum_rates[::-1], source_resblock_kernel_sizes, source_resblock_dilation_sizes)):
645
+ if u == 1:
646
+ self.source_downs.append(
647
+ CausalConv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), 1, 1, causal_type='left')
648
+ )
649
+ else:
650
+ self.source_downs.append(
651
+ CausalConv1dDownSample(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), u * 2, u)
652
+ )
653
+
654
+ self.source_resblocks.append(
655
+ ResBlock(base_channels // (2 ** (i + 1)), k, d, causal=True)
656
+ )
657
+
658
+ self.resblocks = nn.ModuleList()
659
+ for i in range(len(self.ups)):
660
+ ch = base_channels // (2**(i + 1))
661
+ for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
662
+ self.resblocks.append(ResBlock(ch, k, d, causal=True))
663
+
664
+ self.conv_post = weight_norm(CausalConv1d(ch, istft_params["n_fft"] + 2, 7, 1, causal_type='left'))
665
+ self.ups.apply(init_weights)
666
+ self.conv_post.apply(init_weights)
667
+ self.reflection_pad = nn.ReflectionPad1d((1, 0))
668
+ self.stft_window = torch.from_numpy(get_window("hann", istft_params["n_fft"], fftbins=True).astype(np.float32))
669
+ self.conv_pre_look_right = conv_pre_look_right
670
+ self.f0_predictor = f0_predictor
671
+
672
+ def decode(self, x: torch.Tensor, s: torch.Tensor = torch.zeros(1, 1, 0), finalize: bool = True) -> torch.Tensor:
673
+ s_stft_real, s_stft_imag = self._stft(s.squeeze(1))
674
+ if finalize is True:
675
+ x = self.conv_pre(x)
676
+ else:
677
+ x = self.conv_pre(x[:, :, :-self.conv_pre_look_right], x[:, :, -self.conv_pre_look_right:])
678
+ s_stft_real = s_stft_real[:, :, :-int(np.prod(self.upsample_rates) * self.conv_pre_look_right)]
679
+ s_stft_imag = s_stft_imag[:, :, :-int(np.prod(self.upsample_rates) * self.conv_pre_look_right)]
680
+ s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1)
681
+
682
+ for i in range(self.num_upsamples):
683
+ x = F.leaky_relu(x, self.lrelu_slope)
684
+ x = self.ups[i](x)
685
+
686
+ if i == self.num_upsamples - 1:
687
+ x = self.reflection_pad(x)
688
+
689
+ # fusion
690
+ si = self.source_downs[i](s_stft)
691
+ si = self.source_resblocks[i](si)
692
+ x = x + si
693
+
694
+ xs = None
695
+ for j in range(self.num_kernels):
696
+ if xs is None:
697
+ xs = self.resblocks[i * self.num_kernels + j](x)
698
+ else:
699
+ xs += self.resblocks[i * self.num_kernels + j](x)
700
+ x = xs / self.num_kernels
701
+
702
+ x = F.leaky_relu(x)
703
+ x = self.conv_post(x)
704
+ magnitude = torch.exp(x[:, :self.istft_params["n_fft"] // 2 + 1, :])
705
+ phase = torch.sin(x[:, self.istft_params["n_fft"] // 2 + 1:, :]) # actually, the sin here is redundant
706
+
707
+ x = self._istft(magnitude, phase)
708
+ if finalize is False:
709
+ x = x[:, :-int(np.prod(self.upsample_rates) * self.istft_params['hop_len'])]
710
+ x = torch.clamp(x, -self.audio_limit, self.audio_limit)
711
+ return x
712
+
713
+ @torch.inference_mode()
714
+ def inference(self, speech_feat: torch.Tensor, finalize: bool = True) -> torch.Tensor:
715
+ # mel->f0 NOTE f0_predictor precision is crucial for causal inference, move self.f0_predictor to cpu if necessary
716
+ self.f0_predictor.to('cpu')
717
+ f0 = self.f0_predictor(speech_feat.cpu(), finalize=finalize).to(speech_feat)
718
+ # f0->source
719
+ s = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
720
+ s, _, _ = self.m_source(s)
721
+ s = s.transpose(1, 2)
722
+ if finalize is True:
723
+ generated_speech = self.decode(x=speech_feat, s=s, finalize=finalize)
724
+ else:
725
+ generated_speech = self.decode(x=speech_feat[:, :, :-self.f0_predictor.condnet[0].causal_padding], s=s, finalize=finalize)
726
+ return generated_speech, s
727
+
728
+
729
+ if __name__ == '__main__':
730
+ torch.backends.cudnn.deterministic = True
731
+ torch.backends.cudnn.benchmark = False
732
+ from hyperpyyaml import load_hyperpyyaml
733
+ with open('./pretrained_models/Fun-CosyVoice3-0.5B/cosyvoice3.yaml', 'r') as f:
734
+ configs = load_hyperpyyaml(f, overrides={'llm': None, 'flow': None})
735
+ model = configs['hift']
736
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
737
+ model.to(device)
738
+ model.eval()
739
+ max_len, chunk_size, context_size = 300, 30, 8
740
+ mel = torch.rand(1, 80, max_len).to(device)
741
+ pred_gt, _ = model.inference(mel)
742
+ for i in range(0, max_len, chunk_size):
743
+ finalize = True if i + chunk_size + context_size >= max_len else False
744
+ pred_chunk, _ = model.inference(mel[:, :, : i + chunk_size + context_size], finalize=finalize)
745
+ pred_chunk = pred_chunk[:, i * 480:]
746
+ print((pred_gt[:, i * 480:i * 480 + pred_chunk.shape[1]] - pred_chunk).abs().max().item())
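The non-causal HiFTGenerator above streams by reusing its excitation: inference() accepts a cache_source tensor and overwrites the already-vocoded portion of the newly derived source with it, which is what avoids glitches at chunk boundaries. A minimal sketch of that pattern, assuming an already-loaded HiFTGenerator hift, a mel tensor of shape (1, 80, T), and the 480 samples-per-mel-frame factor used in the __main__ check above (the exact factor depends on the configured upsample rates and iSTFT hop length):

import torch

def stream_vocode(hift, mel, chunk_size=30):
    # Sketch only: grow the mel context chunk by chunk and pass the cached
    # excitation (source) back in so chunk boundaries stay consistent.
    cache_source = torch.zeros(1, 1, 0)
    audio = []
    for start in range(0, mel.shape[2], chunk_size):
        feat = mel[:, :, :start + chunk_size]
        speech, source = hift.inference(speech_feat=feat, cache_source=cache_source)
        cache_source = source                  # excitation covering the frames seen so far
        audio.append(speech[:, start * 480:])  # keep only the newly generated samples
    return torch.cat(audio, dim=1)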
cosyvoice/hifigan/hifigan.py ADDED
@@ -0,0 +1,67 @@
1
+ from typing import Dict, Optional
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from matcha.hifigan.models import feature_loss, generator_loss, discriminator_loss
6
+ from cosyvoice.utils.losses import tpr_loss, mel_loss
7
+
8
+
9
+ class HiFiGan(nn.Module):
10
+ def __init__(self, generator, discriminator, mel_spec_transform,
11
+ multi_mel_spectral_recon_loss_weight=45, feat_match_loss_weight=2.0,
12
+ tpr_loss_weight=1.0, tpr_loss_tau=0.04):
13
+ super(HiFiGan, self).__init__()
14
+ self.generator = generator
15
+ self.discriminator = discriminator
16
+ self.mel_spec_transform = mel_spec_transform
17
+ self.multi_mel_spectral_recon_loss_weight = multi_mel_spectral_recon_loss_weight
18
+ self.feat_match_loss_weight = feat_match_loss_weight
19
+ self.tpr_loss_weight = tpr_loss_weight
20
+ self.tpr_loss_tau = tpr_loss_tau
21
+
22
+ def forward(
23
+ self,
24
+ batch: dict,
25
+ device: torch.device,
26
+ ) -> Dict[str, Optional[torch.Tensor]]:
27
+ if batch['turn'] == 'generator':
28
+ return self.forward_generator(batch, device)
29
+ else:
30
+ return self.forward_discriminator(batch, device)
31
+
32
+ def forward_generator(self, batch, device):
33
+ real_speech = batch['speech'].to(device)
34
+ pitch_feat = batch['pitch_feat'].to(device)
35
+ # 1. calculate generator outputs
36
+ generated_speech, generated_f0 = self.generator(batch, device)
37
+ # 2. calculate discriminator outputs
38
+ y_d_rs, y_d_gs, fmap_rs, fmap_gs = self.discriminator(real_speech, generated_speech)
39
+ # 3. calculate generator losses, feature loss, mel loss, tpr losses [Optional]
40
+ loss_gen, _ = generator_loss(y_d_gs)
41
+ loss_fm = feature_loss(fmap_rs, fmap_gs)
42
+ loss_mel = mel_loss(real_speech, generated_speech, self.mel_spec_transform)
43
+ if self.tpr_loss_weight != 0:
44
+ loss_tpr = tpr_loss(y_d_gs, y_d_rs, self.tpr_loss_tau)
45
+ else:
46
+ loss_tpr = torch.zeros(1).to(device)
47
+ loss_f0 = F.l1_loss(generated_f0, pitch_feat)
48
+ loss = loss_gen + self.feat_match_loss_weight * loss_fm + \
49
+ self.multi_mel_spectral_recon_loss_weight * loss_mel + \
50
+ self.tpr_loss_weight * loss_tpr + loss_f0
51
+ return {'loss': loss, 'loss_gen': loss_gen, 'loss_fm': loss_fm, 'loss_mel': loss_mel, 'loss_tpr': loss_tpr, 'loss_f0': loss_f0}
52
+
53
+ def forward_discriminator(self, batch, device):
54
+ real_speech = batch['speech'].to(device)
55
+ # 1. calculate generator outputs
56
+ with torch.no_grad():
57
+ generated_speech, generated_f0 = self.generator(batch, device)
58
+ # 2. calculate discriminator outputs
59
+ y_d_rs, y_d_gs, fmap_rs, fmap_gs = self.discriminator(real_speech, generated_speech.detach())
60
+ # 3. calculate discriminator losses, tpr losses [Optional]
61
+ loss_disc, _, _ = discriminator_loss(y_d_rs, y_d_gs)
62
+ if self.tpr_loss_weight != 0:
63
+ loss_tpr = tpr_loss(y_d_rs, y_d_gs, self.tpr_loss_tau)
64
+ else:
65
+ loss_tpr = torch.zeros(1).to(device)
66
+ loss = loss_disc + self.tpr_loss_weight * loss_tpr
67
+ return {'loss': loss, 'loss_disc': loss_disc, 'loss_tpr': loss_tpr}
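Since HiFiGan.forward dispatches on batch['turn'], a GAN training step runs the wrapper twice per batch, once per turn. A rough sketch of that alternation, assuming hifigan is a constructed HiFiGan module and optim_g / optim_d are hypothetical optimizers over generator and discriminator parameters:

# Hypothetical alternating GAN step; batch already holds 'speech', 'speech_feat', 'pitch_feat', ...
batch['turn'] = 'discriminator'
info_d = hifigan(batch, device)    # forward_discriminator: real vs. detached generated speech
optim_d.zero_grad()
info_d['loss'].backward()
optim_d.step()

batch['turn'] = 'generator'
info_g = hifigan(batch, device)    # forward_generator: adversarial + feature-matching + mel + f0 losses
optim_g.zero_grad()
info_g['loss'].backward()
optim_g.step()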
cosyvoice/llm/llm.py ADDED
@@ -0,0 +1,739 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
2
+ # 2025 Alibaba Inc (authors: Xiang Lyu, Yabin Li, Qihua, Shengqiang Li)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import queue
16
+ import random
17
+ import time
18
+ import threading
19
+ from typing import Dict, Optional, Callable, List, Generator
20
+ import numpy as np
21
+ import torch
22
+ from torch import nn
23
+ import torch.nn.functional as F
24
+ from transformers import Qwen2ForCausalLM
25
+ from torch.nn.utils.rnn import pad_sequence, unpad_sequence
26
+ from cosyvoice.utils.common import IGNORE_ID
27
+ from cosyvoice.transformer.label_smoothing_loss import LabelSmoothingLoss
28
+ from cosyvoice.utils.common import th_accuracy
29
+ from cosyvoice.utils.file_utils import logging
30
+ from cosyvoice.utils.mask import make_pad_mask
31
+
32
+
33
+ class TransformerLM(torch.nn.Module):
34
+ def __init__(
35
+ self,
36
+ text_encoder_input_size: int,
37
+ llm_input_size: int,
38
+ llm_output_size: int,
39
+ text_token_size: int,
40
+ speech_token_size: int,
41
+ text_encoder: torch.nn.Module,
42
+ llm: torch.nn.Module,
43
+ sampling: Callable,
44
+ length_normalized_loss: bool = True,
45
+ lsm_weight: float = 0.0,
46
+ spk_embed_dim: int = 192,
47
+ ):
48
+ super().__init__()
49
+ self.llm_input_size = llm_input_size
50
+ self.speech_token_size = speech_token_size
51
+ # 1. build text token inputs related modules
52
+ self.text_embedding = torch.nn.Embedding(text_token_size, text_encoder_input_size)
53
+ self.text_encoder = text_encoder
54
+ self.text_encoder_affine_layer = nn.Linear(
55
+ self.text_encoder.output_size(),
56
+ llm_input_size
57
+ )
58
+
59
+ # 2. build speech token language model related modules
60
+ self.sos = 0
61
+ self.task_id = 1
62
+ self.eos_token = self.speech_token_size
63
+ self.llm_embedding = torch.nn.Embedding(2, llm_input_size)
64
+ self.llm = llm
65
+ self.llm_decoder = nn.Linear(llm_output_size, speech_token_size + 1)
66
+ self.criterion_ce = LabelSmoothingLoss(
67
+ size=speech_token_size + 1,
68
+ padding_idx=IGNORE_ID,
69
+ smoothing=lsm_weight,
70
+ normalize_length=length_normalized_loss,
71
+ )
72
+
73
+ # 3. [Optional] build speech token related modules
74
+ self.speech_embedding = torch.nn.Embedding(speech_token_size, llm_input_size)
75
+ self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, llm_input_size)
76
+
77
+ # 4. sampling method
78
+ self.sampling = sampling
79
+
80
+ def encode(
81
+ self,
82
+ text: torch.Tensor,
83
+ text_lengths: torch.Tensor,
84
+ ):
85
+ encoder_out, encoder_mask = self.text_encoder(text, text_lengths, decoding_chunk_size=1, num_decoding_left_chunks=-1)
86
+ encoder_out_lens = encoder_mask.squeeze(1).sum(1)
87
+ encoder_out = self.text_encoder_affine_layer(encoder_out)
88
+ return encoder_out, encoder_out_lens
89
+
90
+ def pad_unpad_sequence(self, sos_emb, embedding, text_token, text_token_len, task_id_emb, speech_token, speech_token_len):
91
+ text_token = unpad_sequence(text_token, text_token_len.cpu(), batch_first=True)
92
+ speech_token = unpad_sequence(speech_token, speech_token_len.cpu(), batch_first=True)
93
+ lm_input = [torch.concat([sos_emb.squeeze(dim=0), embedding[i], text_token[i], task_id_emb.squeeze(dim=0), speech_token[i]], dim=0)
94
+ for i in range(len(text_token))]
95
+ lm_input_len = torch.tensor([i.size(0) for i in lm_input], dtype=torch.int32)
96
+ lm_input = pad_sequence(lm_input, batch_first=True, padding_value=IGNORE_ID)
97
+ return lm_input, lm_input_len
98
+
99
+ def forward(
100
+ self,
101
+ batch: dict,
102
+ device: torch.device,
103
+ ) -> Dict[str, Optional[torch.Tensor]]:
104
+ """
105
+ Args:
106
+ text: (B, L, D)
107
+ text_lengths: (B,)
108
+ audio: (B, T, N) or (B, T)
109
+ audio_lengths: (B,)
110
+ """
111
+ text_token = batch['text_token'].to(device)
112
+ text_token_len = batch['text_token_len'].to(device)
113
+ speech_token = batch['speech_token'].to(device)
114
+ speech_token_len = batch['speech_token_len'].to(device)
115
+ embedding = batch['embedding'].to(device)
116
+
117
+ # 1. prepare llm_target
118
+ lm_target = [torch.tensor([IGNORE_ID] * (2 + text_token_len[i]) + speech_token[i, :speech_token_len[i]].tolist() +
119
+ [self.speech_token_size]) for i in range(text_token.size(0))]
120
+ lm_target = pad_sequence(lm_target, batch_first=True, padding_value=IGNORE_ID).to(device)
121
+
122
+ # 1. encode text_token
123
+ text_token = self.text_embedding(text_token)
124
+ text_token, text_token_len = self.encode(text_token, text_token_len)
125
+
126
+ # 2. embedding projection
127
+ embedding = F.normalize(embedding, dim=1)
128
+ embedding = self.spk_embed_affine_layer(embedding)
129
+ embedding = embedding.unsqueeze(1)
130
+
131
+ # 3. sos and task_id
132
+ sos_emb = self.llm_embedding.weight[self.sos].reshape(1, 1, -1)
133
+ task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
134
+
135
+ # 4. encode speech_token
136
+ speech_token = self.speech_embedding(speech_token)
137
+
138
+ # 5. unpad and pad
139
+ lm_input, lm_input_len = self.pad_unpad_sequence(sos_emb, embedding, text_token, text_token_len,
140
+ task_id_emb, speech_token, speech_token_len)
141
+
142
+ # 6. run lm forward
143
+ lm_output, lm_output_mask = self.llm(lm_input, lm_input_len.to(device))
144
+ logits = self.llm_decoder(lm_output)
145
+ loss = self.criterion_ce(logits, lm_target)
146
+ acc = th_accuracy(logits.view(-1, self.speech_token_size + 1), lm_target, ignore_label=IGNORE_ID)
147
+ return {'loss': loss, 'acc': acc}
148
+
149
+ def sampling_ids(
150
+ self,
151
+ weighted_scores: torch.Tensor,
152
+ decoded_tokens: List,
153
+ sampling: int,
154
+ ignore_eos: bool = True,
155
+ ):
156
+ num_trials, max_trials = 0, 100
157
+ while True:
158
+ top_ids = self.sampling(weighted_scores, decoded_tokens, sampling)
159
+ if (not ignore_eos) or (top_ids < self.speech_token_size):
160
+ break
161
+ num_trials += 1
162
+ if num_trials > max_trials:
163
+ raise RuntimeError('sampling reaches max_trials {} and still get eos when ignore_eos is True, check your input!'.format(max_trials))
164
+ return top_ids
165
+
166
+ @torch.inference_mode()
167
+ def inference(
168
+ self,
169
+ text: torch.Tensor,
170
+ text_len: torch.Tensor,
171
+ prompt_text: torch.Tensor,
172
+ prompt_text_len: torch.Tensor,
173
+ prompt_speech_token: torch.Tensor,
174
+ prompt_speech_token_len: torch.Tensor,
175
+ embedding: torch.Tensor,
176
+ sampling: int = 25,
177
+ max_token_text_ratio: float = 20,
178
+ min_token_text_ratio: float = 2,
179
+ uuid: str = '',
180
+ ) -> Generator[torch.Tensor, None, None]:
181
+ device = text.device
182
+ text = torch.concat([prompt_text, text], dim=1)
183
+ text_len += prompt_text_len
184
+ text = self.text_embedding(text)
185
+
186
+ # 1. encode text
187
+ text, text_len = self.encode(text, text_len)
188
+
189
+ # 2. encode embedding
190
+ if embedding.shape[0] != 0:
191
+ embedding = F.normalize(embedding, dim=1)
192
+ embedding = self.spk_embed_affine_layer(embedding)
193
+ embedding = embedding.unsqueeze(dim=1)
194
+ else:
195
+ embedding = torch.zeros(1, 0, self.llm_input_size, dtype=text.dtype).to(device).to(text.dtype)
196
+
197
+ # 3. concat llm_input
198
+ sos_emb = self.llm_embedding.weight[self.sos].reshape(1, 1, -1)
199
+ task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
200
+ if prompt_speech_token_len != 0:
201
+ prompt_speech_token_emb = self.speech_embedding(prompt_speech_token)
202
+ else:
203
+ prompt_speech_token_emb = torch.zeros(1, 0, self.llm_input_size, dtype=text.dtype).to(device)
204
+ lm_input = torch.concat([sos_emb, embedding, text, task_id_emb, prompt_speech_token_emb], dim=1)
205
+
206
+ # 4. cal min/max_length
207
+ min_len = int((text_len - prompt_text_len) * min_token_text_ratio)
208
+ max_len = int((text_len - prompt_text_len) * max_token_text_ratio)
209
+
210
+ # 5. step by step decode
211
+ out_tokens = []
212
+ offset = 0
213
+ att_cache, cnn_cache = torch.zeros((0, 0, 0, 0), device=lm_input.device), torch.zeros((0, 0, 0, 0), device=lm_input.device)
214
+ for i in range(max_len):
215
+ y_pred, att_cache, cnn_cache = self.llm.forward_chunk(lm_input, offset=offset, required_cache_size=-1,
216
+ att_cache=att_cache, cnn_cache=cnn_cache,
217
+ att_mask=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]),
218
+ device=lm_input.device)).to(torch.bool))
219
+ logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
220
+ top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False)
221
+ if top_ids == self.eos_token:
222
+ break
223
+ # in stream mode, yield token one by one
224
+ yield top_ids
225
+ out_tokens.append(top_ids)
226
+ offset += lm_input.size(1)
227
+ lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
228
+
229
+
230
+ class Qwen2Encoder(torch.nn.Module):
231
+ def __init__(self, pretrain_path):
232
+ super().__init__()
233
+ self.model = Qwen2ForCausalLM.from_pretrained(pretrain_path)
234
+
235
+ def forward(self, xs: torch.Tensor, xs_lens: torch.Tensor):
236
+ T = xs.size(1)
237
+ masks = ~make_pad_mask(xs_lens, T)
238
+ outs = self.model(
239
+ inputs_embeds=xs,
240
+ attention_mask=masks,
241
+ output_hidden_states=True,
242
+ return_dict=True,
243
+ )
244
+ return outs.hidden_states[-1], masks.unsqueeze(1)
245
+
246
+ def forward_one_step(self, xs, masks, cache=None):
247
+ input_masks = masks[:, -1, :]
248
+ outs = self.model(
249
+ inputs_embeds=xs,
250
+ attention_mask=input_masks,
251
+ output_hidden_states=True,
252
+ return_dict=True,
253
+ use_cache=True,
254
+ past_key_values=cache,
255
+ )
256
+ xs = outs.hidden_states[-1]
257
+ new_cache = outs.past_key_values
258
+ return xs, new_cache
259
+
260
+
261
+ class Qwen2LM(TransformerLM):
262
+ def __init__(
263
+ self,
264
+ llm_input_size: int,
265
+ llm_output_size: int,
266
+ speech_token_size: int,
267
+ llm: torch.nn.Module,
268
+ sampling: Callable,
269
+ length_normalized_loss: bool = True,
270
+ lsm_weight: float = 0.0,
271
+ mix_ratio: List[int] = [5, 15],
272
+ ):
273
+ torch.nn.Module.__init__(self)
274
+ self.llm_input_size = llm_input_size
275
+ self.llm_output_size = llm_output_size
276
+ self.speech_token_size = speech_token_size
277
+ # 2. build speech token language model related modules
278
+ self.sos = 0
279
+ self.task_id = 1
280
+ self.eos_token = speech_token_size
281
+ self.fill_token = speech_token_size + 2
282
+
283
+ self.llm_embedding = torch.nn.Embedding(2, llm_input_size)
284
+ self.llm = llm
285
+ self.llm_decoder = nn.Linear(llm_output_size, speech_token_size + 3)
286
+ self.criterion_ce = LabelSmoothingLoss(
287
+ size=speech_token_size + 3,
288
+ padding_idx=IGNORE_ID,
289
+ smoothing=lsm_weight,
290
+ normalize_length=length_normalized_loss,
291
+ )
292
+
293
+ # 3. [Optional] build speech token related modules
294
+ self.speech_embedding = torch.nn.Embedding(speech_token_size + 3, llm_input_size)
295
+
296
+ # 4. sampling method
297
+ self.sampling = sampling
298
+ self.mix_ratio = mix_ratio
299
+
300
+ # 5. vllm related
301
+ self.stop_token_ids = [speech_token_size + i for i in range(3)]
302
+ self.vllm_output_queue = {}
303
+
304
+ def prepare_lm_input_target(self, sos_emb, text_token, text_token_emb, text_token_len, task_id_emb, speech_token, speech_token_emb, speech_token_len):
305
+ lm_target, lm_input = [], []
306
+ text_token = unpad_sequence(text_token, text_token_len.cpu(), batch_first=True)
307
+ speech_token = unpad_sequence(speech_token, speech_token_len.cpu(), batch_first=True)
308
+ text_token_emb = unpad_sequence(text_token_emb, text_token_len.cpu(), batch_first=True)
309
+ speech_token_emb = unpad_sequence(speech_token_emb, speech_token_len.cpu(), batch_first=True)
310
+ for i in range(len(text_token)):
311
+ # bistream sequence
312
+ if random.random() < 0.5 and speech_token_len[i] / text_token_len[i] > self.mix_ratio[1] / self.mix_ratio[0]:
313
+ this_lm_target, this_lm_input = [], []
314
+ this_lm_target.append(IGNORE_ID)
315
+ this_lm_input.append(sos_emb.squeeze(dim=0))
316
+ for j in range(((text_token_len[i] + 1) / self.mix_ratio[0]).ceil().int().item()):
317
+ this_text_token = text_token[i][j * self.mix_ratio[0]: (j + 1) * self.mix_ratio[0]].tolist()
318
+ this_speech_token = speech_token[i][j * self.mix_ratio[1]: (j + 1) * self.mix_ratio[1]].tolist()
319
+ if len(this_text_token) == self.mix_ratio[0]:
320
+ assert len(this_speech_token) == self.mix_ratio[1]
321
+ this_lm_target += [IGNORE_ID] * (self.mix_ratio[0] - 1)
322
+ this_lm_target += this_speech_token
323
+ this_lm_target.append(self.fill_token)
324
+ this_lm_input.append(text_token_emb[i][j * self.mix_ratio[0]: (j + 1) * self.mix_ratio[0]])
325
+ this_lm_input.append(speech_token_emb[i][j * self.mix_ratio[1]: (j + 1) * self.mix_ratio[1]])
326
+ else:
327
+ this_lm_target += [-1] * len(this_text_token)
328
+ this_lm_target += speech_token[i][j * self.mix_ratio[1]:].tolist()
329
+ this_lm_target.append(self.eos_token)
330
+ this_lm_input.append(text_token_emb[i][j * self.mix_ratio[0]:])
331
+ this_lm_input.append(task_id_emb.squeeze(dim=0))
332
+ this_lm_input.append(speech_token_emb[i][j * self.mix_ratio[1]:])
333
+ this_lm_target, this_lm_input = torch.tensor(this_lm_target), torch.concat(this_lm_input, dim=0)
334
+ # unistream sequence
335
+ else:
336
+ this_lm_target = torch.tensor([IGNORE_ID] * (1 + text_token_len[i]) + speech_token[i].tolist() + [self.eos_token])
337
+ this_lm_input = torch.concat([sos_emb.squeeze(dim=0), text_token_emb[i], task_id_emb.squeeze(dim=0), speech_token_emb[i]], dim=0)
338
+ lm_target.append(this_lm_target)
339
+ lm_input.append(this_lm_input)
340
+ lm_input_len = torch.tensor([i.size(0) for i in lm_input], dtype=torch.int32)
341
+ lm_input = pad_sequence(lm_input, batch_first=True, padding_value=IGNORE_ID)
342
+ lm_target = pad_sequence(lm_target, batch_first=True, padding_value=IGNORE_ID)
343
+ return lm_target, lm_input, lm_input_len
344
+
345
+ def forward(
346
+ self,
347
+ batch: dict,
348
+ device: torch.device,
349
+ ) -> Dict[str, Optional[torch.Tensor]]:
350
+ """
351
+ Args:
352
+ text: (B, L, D)
353
+ text_lengths: (B,)
354
+ audio: (B, T, N) or (B, T)
355
+ audio_lengths: (B,)
356
+ """
357
+ text_token = batch['text_token'].to(device)
358
+ text_token_len = batch['text_token_len'].to(device)
359
+ speech_token = batch['speech_token'].to(device)
360
+ speech_token_len = batch['speech_token_len'].to(device)
361
+
362
+ # 1. encode text_token
363
+ text_token_emb = self.llm.model.model.embed_tokens(text_token)
364
+
365
+ # 3. sos and task_id
366
+ sos_emb = self.llm_embedding.weight[self.sos].reshape(1, 1, -1)
367
+ task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
368
+
369
+ # 2. encode speech_token
370
+ speech_token_emb = self.speech_embedding(speech_token)
371
+
372
+ # 3. prepare llm_input/target
373
+ lm_target, lm_input, lm_input_len = self.prepare_lm_input_target(sos_emb, text_token, text_token_emb, text_token_len, task_id_emb,
374
+ speech_token, speech_token_emb, speech_token_len)
375
+ lm_target = lm_target.to(device)
376
+
377
+ # 4. run lm forward
378
+ lm_output, lm_output_mask = self.llm(lm_input, lm_input_len.to(device))
379
+ logits = self.llm_decoder(lm_output)
380
+ loss = self.criterion_ce(logits, lm_target.to(device))
381
+ acc = th_accuracy(logits.view(-1, self.speech_token_size + 3), lm_target, ignore_label=IGNORE_ID)
382
+ return {'loss': loss, 'acc': acc}
383
+
384
+ def forward_dpo(
385
+ self,
386
+ batch: dict,
387
+ device: torch.device,
388
+ ) -> Dict[str, Optional[torch.Tensor]]:
389
+ text_token = batch['text_token'].to(device)
390
+ text_token_len = batch['text_token_len'].to(device)
391
+ speech_token = batch['speech_token'].to(device)
392
+ speech_token_len = batch['speech_token_len'].to(device)
393
+ reject_speech_token = batch['reject_speech_token'].to(device)
394
+ reject_speech_token_len = batch['reject_speech_token_len'].to(device)
395
+
396
+ # 1. encode text_token
397
+ text_token_emb = self.llm.model.model.embed_tokens(text_token)
398
+
399
+ # 3. sos and task_id
400
+ sos_emb = self.llm_embedding.weight[self.sos].reshape(1, 1, -1)
401
+ task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
402
+
403
+ # 2. encode speech_token
404
+ speech_token = unpad_sequence(speech_token, speech_token_len.cpu(), batch_first=True)
405
+ reject_speech_token = unpad_sequence(reject_speech_token, reject_speech_token_len.cpu(), batch_first=True)
406
+ speech_token_combined = speech_token + reject_speech_token
407
+ speech_token_combined = pad_sequence(speech_token_combined, batch_first=True, padding_value=0)
408
+ speech_token_combined_len = torch.concat([speech_token_len, reject_speech_token_len], dim=0)
409
+ speech_token_combined_emb = self.speech_embedding(speech_token_combined)
410
+
411
+ # 3. prepare llm_input/target
412
+ lm_target, lm_input, lm_input_len = self.prepare_lm_input_target(sos_emb, text_token.repeat(2, 1), text_token_emb.repeat(2, 1, 1), text_token_len.repeat(2),
413
+ task_id_emb, speech_token_combined, speech_token_combined_emb, speech_token_combined_len)
414
+ lm_target = lm_target.to(device)
415
+
416
+ # 4. run lm forward
417
+ lm_output, lm_output_mask = self.llm(lm_input, lm_input_len.to(device))
418
+ logits = self.llm_decoder(lm_output)
419
+ chosen_logits = logits[:text_token.shape[0]]
420
+ rejected_logits = logits[text_token.shape[0]:]
421
+ chosen_lm_target = lm_target[:text_token.shape[0]]
422
+ rejected_lm_target = lm_target[text_token.shape[0]:]
423
+ loss = self.criterion_ce(chosen_logits, chosen_lm_target.to(device))
424
+ acc = th_accuracy(chosen_logits.view(-1, self.speech_token_size + 3), chosen_lm_target, ignore_label=IGNORE_ID)
425
+
426
+ # 5. calculate dpo logits
427
+ chosen_lm_mask = chosen_lm_target == IGNORE_ID
428
+ rejected_lm_mask = rejected_lm_target == IGNORE_ID
429
+ chosen_logps = torch.gather(chosen_logits.log_softmax(dim=-1), dim=2, index=chosen_lm_target.masked_fill(chosen_lm_mask, 0).unsqueeze(dim=-1)).squeeze(dim=-1)
430
+ rejected_logps = torch.gather(rejected_logits.log_softmax(dim=-1), dim=2, index=rejected_lm_target.masked_fill(rejected_lm_mask, 0).unsqueeze(dim=-1)).squeeze(dim=-1)
431
+ chosen_logps = (chosen_logps * chosen_lm_mask).sum(dim=-1) / chosen_lm_mask.sum(dim=-1)
432
+ rejected_logps = (rejected_logps * rejected_lm_mask).sum(dim=-1) / rejected_lm_mask.sum(dim=-1)
433
+ return {'loss': loss, 'acc': acc, 'chosen_logps': chosen_logps, 'rejected_logps': rejected_logps}
434
+
435
+ @torch.inference_mode()
436
+ def inference(
437
+ self,
438
+ text: torch.Tensor,
439
+ text_len: torch.Tensor,
440
+ prompt_text: torch.Tensor,
441
+ prompt_text_len: torch.Tensor,
442
+ prompt_speech_token: torch.Tensor,
443
+ prompt_speech_token_len: torch.Tensor,
444
+ embedding: torch.Tensor,
445
+ sampling: int = 25,
446
+ max_token_text_ratio: float = 20,
447
+ min_token_text_ratio: float = 2,
448
+ uuid: str = '',
449
+ ) -> Generator[torch.Tensor, None, None]:
450
+ device = text.device
451
+ text = torch.concat([prompt_text, text], dim=1)
452
+ text_len += prompt_text_len
453
+ text = self.llm.model.model.embed_tokens(text)
454
+
455
+ # 3. concat llm_input
456
+ sos_emb = self.llm_embedding.weight[self.sos].reshape(1, 1, -1)
457
+ task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
458
+ if prompt_speech_token_len != 0:
459
+ prompt_speech_token_emb = self.speech_embedding(prompt_speech_token)
460
+ else:
461
+ prompt_speech_token_emb = torch.zeros(1, 0, self.llm_input_size, dtype=text.dtype).to(device)
462
+ lm_input = torch.concat([sos_emb, text, task_id_emb, prompt_speech_token_emb], dim=1)
463
+
464
+ # 4. cal min/max_length
465
+ min_len = int((text_len - prompt_text_len) * min_token_text_ratio)
466
+ max_len = int((text_len - prompt_text_len) * max_token_text_ratio)
467
+
468
+ # 5. step by step decode
469
+ for token in self.inference_wrapper(lm_input, sampling, min_len, max_len, uuid):
470
+ yield token
471
+
472
+ @torch.inference_mode()
473
+ def inference_wrapper(self, lm_input, sampling, min_len, max_len, uuid):
474
+ if hasattr(self, 'vllm'):
475
+ from vllm import SamplingParams, RequestOutput
476
+ sampling_params = SamplingParams(top_k=sampling,
477
+ stop_token_ids=self.stop_token_ids,
478
+ min_tokens=min_len,
479
+ max_tokens=max_len)
480
+ with self.lock:
481
+ self.vllm.add_request(uuid, {"prompt_embeds": lm_input.squeeze(0).to(torch.bfloat16).to(lm_input.device)}, sampling_params)
482
+ self.vllm_output_queue[uuid] = queue.Queue()
483
+ out_tokens = []
484
+ while True:
485
+ with self.lock:
486
+ if self.vllm_output_queue[uuid].empty() is True:
487
+ request_outputs: List[RequestOutput] = self.vllm.step()
488
+ for request_output in request_outputs:
489
+ top_ids = list(request_output.outputs[0].token_ids)[-1]
490
+ self.vllm_output_queue[request_output.request_id].put(top_ids)
491
+ if self.vllm_output_queue[uuid].empty() is False:
492
+ top_ids = self.vllm_output_queue[uuid].get()
493
+ if top_ids in self.stop_token_ids:
494
+ break
495
+ # in stream mode, yield token one by one
496
+ yield top_ids
497
+ out_tokens.append(top_ids)
498
+ if len(out_tokens) == max_len:
499
+ break
500
+ time.sleep(0.001)
501
+ with self.lock:
502
+ self.vllm_output_queue.pop(uuid)
503
+ else:
504
+ out_tokens = []
505
+ cache = None
506
+ for i in range(max_len):
507
+ y_pred, cache = self.llm.forward_one_step(lm_input,
508
+ masks=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]), device=lm_input.device)).to(torch.bool),
509
+ cache=cache)
510
+ logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
511
+ top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False)
512
+ if top_ids in self.stop_token_ids:
513
+ break
514
+ # in stream mode, yield token one by one
515
+ yield top_ids
516
+ out_tokens.append(top_ids)
517
+ lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
518
+
519
+ @torch.inference_mode()
520
+ def inference_bistream(
521
+ self,
522
+ text: Generator,
523
+ prompt_text: torch.Tensor,
524
+ prompt_text_len: torch.Tensor,
525
+ prompt_speech_token: torch.Tensor,
526
+ prompt_speech_token_len: torch.Tensor,
527
+ embedding: torch.Tensor,
528
+ sampling: int = 25,
529
+ max_token_text_ratio: float = 20,
530
+ min_token_text_ratio: float = 2,
531
+ ) -> Generator[torch.Tensor, None, None]:
532
+
533
+ device = prompt_text.device
534
+ # 1. prepare input
535
+ sos_emb = self.llm_embedding.weight[self.sos].reshape(1, 1, -1)
536
+ task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
537
+ if prompt_speech_token_len != 0:
538
+ prompt_speech_token_emb = self.speech_embedding(prompt_speech_token)
539
+ else:
540
+ prompt_speech_token_emb = torch.zeros(1, 0, self.llm_input_size, dtype=prompt_text.dtype).to(device)
541
+ lm_input = torch.concat([sos_emb], dim=1)
542
+
543
+ # 2. iterate text
544
+ out_tokens = []
545
+ cache = None
546
+ # NOTE initialize text_cache with prompt_text, since prompt_speech_token/prompt_text < 15/5 is basically impossible
547
+ text_cache = self.llm.model.model.embed_tokens(prompt_text)
548
+ next_fill_index = (int(prompt_speech_token.shape[1] / self.mix_ratio[1]) + 1) * self.mix_ratio[1] - prompt_speech_token.shape[1]
549
+ for this_text in text:
550
+ text_cache = torch.concat([text_cache, self.llm.model.model.embed_tokens(this_text)], dim=1)
551
+ # prompt_speech_token_emb not empty, try append to lm_input
552
+ while prompt_speech_token_emb.size(1) != 0:
553
+ if text_cache.size(1) >= self.mix_ratio[0]:
554
+ lm_input_text, lm_input_speech = text_cache[:, :self.mix_ratio[0]], prompt_speech_token_emb[:, :self.mix_ratio[1]]
555
+ logging.info('append {} text token {} speech token'.format(lm_input_text.size(1), lm_input_speech.size(1)))
556
+ lm_input = torch.concat([lm_input, lm_input_text, lm_input_speech], dim=1)
557
+ text_cache, prompt_speech_token_emb = text_cache[:, self.mix_ratio[0]:], prompt_speech_token_emb[:, self.mix_ratio[1]:]
558
+ else:
559
+ logging.info('not enough text token to decode, wait for more')
560
+ break
561
+ # no prompt_speech_token_emb remain, can decode some speech token
562
+ if prompt_speech_token_emb.size(1) == 0:
563
+ if (len(out_tokens) != 0 and out_tokens[-1] == self.fill_token) or (len(out_tokens) == 0 and lm_input.size(1) == 1):
564
+ logging.info('get fill token, need to append more text token')
565
+ if text_cache.size(1) >= self.mix_ratio[0]:
566
+ lm_input_text = text_cache[:, :self.mix_ratio[0]]
567
+ logging.info('append {} text token'.format(lm_input_text.size(1)))
568
+ if len(out_tokens) != 0 and out_tokens[-1] == self.fill_token:
569
+ lm_input = lm_input_text
570
+ else:
571
+ lm_input = torch.concat([lm_input, lm_input_text], dim=1)
572
+ text_cache = text_cache[:, self.mix_ratio[0]:]
573
+ else:
574
+ logging.info('not enough text token to decode, wait for more')
575
+ continue
576
+ while True:
577
+ seq_len = lm_input.shape[1] if cache is None else lm_input.shape[1] + cache[0][0].size(2)
578
+ y_pred, cache = self.llm.forward_one_step(lm_input,
579
+ masks=torch.tril(torch.ones((1, seq_len, seq_len), device=lm_input.device)).to(torch.bool),
580
+ cache=cache)
581
+ logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
582
+ if next_fill_index != -1 and len(out_tokens) == next_fill_index:
583
+ top_ids = self.fill_token
584
+ next_fill_index += (self.mix_ratio[1] + 1)
585
+ else:
586
+ top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True)
587
+ if top_ids == self.fill_token:
588
+ next_fill_index = len(out_tokens) + self.mix_ratio[1] + 1
589
+ logging.info('fill_token index {} next fill_token index {}'.format(len(out_tokens), next_fill_index))
590
+ out_tokens.append(top_ids)
591
+ if top_ids >= self.speech_token_size:
592
+ if top_ids == self.fill_token:
593
+ break
594
+ else:
595
+ raise ValueError('should not get token {}'.format(top_ids))
596
+ yield top_ids
597
+ lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
598
+
599
+ # 3. final decode
600
+ lm_input = torch.concat([lm_input, text_cache, task_id_emb], dim=1)
601
+ logging.info('no more text token, decode until met eos')
602
+ while True:
603
+ seq_len = lm_input.shape[1] if cache is None else lm_input.shape[1] + cache[0][0].size(2)
604
+ y_pred, cache = self.llm.forward_one_step(lm_input,
605
+ masks=torch.tril(torch.ones((1, seq_len, seq_len), device=lm_input.device)).to(torch.bool),
606
+ cache=cache)
607
+ logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
608
+ top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=False)
609
+ out_tokens.append(top_ids)
610
+ if top_ids >= self.speech_token_size:
611
+ if top_ids == self.eos_token:
612
+ break
613
+ else:
614
+ raise ValueError('should not get token {}'.format(top_ids))
615
+ # in stream mode, yield token one by one
616
+ yield top_ids
617
+ lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
618
+
619
+
620
+ class CosyVoice3LM(Qwen2LM):
621
+ def __init__(
622
+ self,
623
+ llm_input_size: int,
624
+ llm_output_size: int,
625
+ speech_token_size: int,
626
+ llm: torch.nn.Module,
627
+ sampling: Callable,
628
+ length_normalized_loss: bool = True,
629
+ lsm_weight: float = 0.0,
630
+ mix_ratio: List[int] = [5, 15],
631
+ ):
632
+ torch.nn.Module.__init__(self)
633
+ self.llm_input_size = llm_input_size
634
+ self.llm_output_size = llm_output_size
635
+ self.speech_token_size = speech_token_size
636
+ # 2. build speech token language model related modules
637
+ self.sos = speech_token_size + 0
638
+ self.eos_token = speech_token_size + 1
639
+ self.task_id = speech_token_size + 2
640
+ self.fill_token = speech_token_size + 3
641
+
642
+ self.llm = llm
643
+ self.llm_decoder = nn.Linear(llm_output_size, speech_token_size + 200, bias=False)
644
+ self.criterion_ce = LabelSmoothingLoss(
645
+ size=speech_token_size + 200,
646
+ padding_idx=IGNORE_ID,
647
+ smoothing=lsm_weight,
648
+ normalize_length=length_normalized_loss,
649
+ )
650
+
651
+ # 3. [Optional] build speech token related modules
652
+ self.speech_embedding = torch.nn.Embedding(speech_token_size + 200, llm_input_size)
653
+
654
+ # 4. sampling method
655
+ self.sampling = sampling
656
+ self.mix_ratio = mix_ratio
657
+
658
+ # 5. vllm related
659
+ self.stop_token_ids = [speech_token_size + i for i in range(200)]
660
+ self.vllm_output_queue = {}
661
+
662
+ def forward(
663
+ self,
664
+ batch: dict,
665
+ device: torch.device,
666
+ ) -> Dict[str, Optional[torch.Tensor]]:
667
+ """
668
+ Args:
669
+ text: (B, L, D)
670
+ text_lengths: (B,)
671
+ audio: (B, T, N) or (B, T)
672
+ audio_lengths: (B,)
673
+ """
674
+ text_token = batch['text_token'].to(device)
675
+ text_token_len = batch['text_token_len'].to(device)
676
+ speech_token = batch['speech_token'].to(device)
677
+ speech_token_len = batch['speech_token_len'].to(device)
678
+ # NOTE should append instruct_token to sequence, not implemented yet
679
+ instruct_token = batch['instruct_token'].to(device)
680
+ instruct_token_len = batch['instruct_token_len'].to(device)
681
+
682
+ # 1. encode text_token
683
+ text_token_emb = self.llm.model.model.embed_tokens(text_token)
684
+
685
+ # 2. sos and task_id
686
+ sos_emb = self.speech_embedding.weight[self.sos].reshape(1, 1, -1)
687
+ task_id_emb = self.speech_embedding.weight[self.task_id].reshape(1, 1, -1)
688
+
689
+ # 3. encode speech_token
690
+ speech_token_emb = self.speech_embedding(speech_token)
691
+
692
+ # 4. prepare llm_input/target
693
+ lm_target, lm_input, lm_input_len = self.prepare_lm_input_target(sos_emb, text_token, text_token_emb, text_token_len, task_id_emb,
694
+ speech_token, speech_token_emb, speech_token_len)
695
+ lm_target = lm_target.to(device)
696
+
697
+ # 5. run lm forward
698
+ lm_output, lm_output_mask = self.llm(lm_input, lm_input_len.to(device))
699
+ logits = self.llm_decoder(lm_output)
700
+ loss = self.criterion_ce(logits, lm_target.to(device))
701
+ acc = th_accuracy(logits.view(-1, self.speech_token_size + 200), lm_target, ignore_label=IGNORE_ID)  # class count must match llm_decoder output dim (speech_token_size + 200)
702
+ return {'loss': loss, 'acc': acc}
703
+
704
+ @torch.inference_mode()
705
+ def inference(
706
+ self,
707
+ text: torch.Tensor,
708
+ text_len: torch.Tensor,
709
+ prompt_text: torch.Tensor,
710
+ prompt_text_len: torch.Tensor,
711
+ prompt_speech_token: torch.Tensor,
712
+ prompt_speech_token_len: torch.Tensor,
713
+ embedding: torch.Tensor,
714
+ sampling: int = 25,
715
+ max_token_text_ratio: float = 20,
716
+ min_token_text_ratio: float = 2,
717
+ uuid: str = '',
718
+ ) -> Generator[torch.Tensor, None, None]:
719
+ device = text.device
720
+ text = torch.concat([prompt_text, text], dim=1)
721
+ text_len += prompt_text_len
722
+ text = self.llm.model.model.embed_tokens(text)
723
+
724
+ # 3. concat llm_input
725
+ sos_emb = self.speech_embedding.weight[self.sos].reshape(1, 1, -1)
726
+ task_id_emb = self.speech_embedding.weight[self.task_id].reshape(1, 1, -1)
727
+ if prompt_speech_token_len != 0:
728
+ prompt_speech_token_emb = self.speech_embedding(prompt_speech_token)
729
+ else:
730
+ prompt_speech_token_emb = torch.zeros(1, 0, self.llm_input_size, dtype=text.dtype).to(device)
731
+ lm_input = torch.concat([sos_emb, text, task_id_emb, prompt_speech_token_emb], dim=1)
732
+
733
+ # 4. cal min/max_length
734
+ min_len = int((text_len - prompt_text_len) * min_token_text_ratio)
735
+ max_len = int((text_len - prompt_text_len) * max_token_text_ratio)
736
+
737
+ # 5. step by step decode
738
+ for token in self.inference_wrapper(lm_input, sampling, min_len, max_len, uuid):
739
+ yield token
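
For illustration, a minimal sketch of driving `CosyVoice3LM.inference` as a streaming generator. It assumes an already-constructed, checkpoint-loaded `lm` and token tensors produced by the frontend; the helper name is hypothetical, and `embedding` is passed only to satisfy the signature since the inference body above does not use it.

import torch

def stream_speech_tokens(lm, text_ids, prompt_text_ids, prompt_speech_ids):
    # text_ids / prompt_text_ids: 1-D LongTensors from the text tokenizer
    # prompt_speech_ids: 1-D LongTensor of prompt speech tokens
    device = next(lm.parameters()).device
    text = text_ids.unsqueeze(0).to(device)
    prompt_text = prompt_text_ids.unsqueeze(0).to(device)
    prompt_speech = prompt_speech_ids.unsqueeze(0).to(device)
    for token in lm.inference(
            text=text,
            text_len=torch.tensor([text.shape[1]], device=device),
            prompt_text=prompt_text,
            prompt_text_len=torch.tensor([prompt_text.shape[1]], device=device),
            prompt_speech_token=prompt_speech,
            prompt_speech_token_len=torch.tensor([prompt_speech.shape[1]], device=device),
            embedding=torch.zeros(1, 0, device=device),  # unused by the inference body shown above
            sampling=25):
        yield token  # speech tokens arrive one by one, suitable for streaming synthesis
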
cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken ADDED
The diff for this file is too large to render. See raw diff
 
cosyvoice/tokenizer/tokenizer.py ADDED
@@ -0,0 +1,327 @@
1
+ import base64
2
+ import os
3
+ from functools import lru_cache
4
+ from typing import Optional
5
+ import torch
6
+ from transformers import AutoTokenizer
7
+ from whisper.tokenizer import Tokenizer
8
+
9
+ import tiktoken
10
+
11
+ LANGUAGES = {
12
+ "en": "english",
13
+ "zh": "chinese",
14
+ "de": "german",
15
+ "es": "spanish",
16
+ "ru": "russian",
17
+ "ko": "korean",
18
+ "fr": "french",
19
+ "ja": "japanese",
20
+ "pt": "portuguese",
21
+ "tr": "turkish",
22
+ "pl": "polish",
23
+ "ca": "catalan",
24
+ "nl": "dutch",
25
+ "ar": "arabic",
26
+ "sv": "swedish",
27
+ "it": "italian",
28
+ "id": "indonesian",
29
+ "hi": "hindi",
30
+ "fi": "finnish",
31
+ "vi": "vietnamese",
32
+ "he": "hebrew",
33
+ "uk": "ukrainian",
34
+ "el": "greek",
35
+ "ms": "malay",
36
+ "cs": "czech",
37
+ "ro": "romanian",
38
+ "da": "danish",
39
+ "hu": "hungarian",
40
+ "ta": "tamil",
41
+ "no": "norwegian",
42
+ "th": "thai",
43
+ "ur": "urdu",
44
+ "hr": "croatian",
45
+ "bg": "bulgarian",
46
+ "lt": "lithuanian",
47
+ "la": "latin",
48
+ "mi": "maori",
49
+ "ml": "malayalam",
50
+ "cy": "welsh",
51
+ "sk": "slovak",
52
+ "te": "telugu",
53
+ "fa": "persian",
54
+ "lv": "latvian",
55
+ "bn": "bengali",
56
+ "sr": "serbian",
57
+ "az": "azerbaijani",
58
+ "sl": "slovenian",
59
+ "kn": "kannada",
60
+ "et": "estonian",
61
+ "mk": "macedonian",
62
+ "br": "breton",
63
+ "eu": "basque",
64
+ "is": "icelandic",
65
+ "hy": "armenian",
66
+ "ne": "nepali",
67
+ "mn": "mongolian",
68
+ "bs": "bosnian",
69
+ "kk": "kazakh",
70
+ "sq": "albanian",
71
+ "sw": "swahili",
72
+ "gl": "galician",
73
+ "mr": "marathi",
74
+ "pa": "punjabi",
75
+ "si": "sinhala",
76
+ "km": "khmer",
77
+ "sn": "shona",
78
+ "yo": "yoruba",
79
+ "so": "somali",
80
+ "af": "afrikaans",
81
+ "oc": "occitan",
82
+ "ka": "georgian",
83
+ "be": "belarusian",
84
+ "tg": "tajik",
85
+ "sd": "sindhi",
86
+ "gu": "gujarati",
87
+ "am": "amharic",
88
+ "yi": "yiddish",
89
+ "lo": "lao",
90
+ "uz": "uzbek",
91
+ "fo": "faroese",
92
+ "ht": "haitian creole",
93
+ "ps": "pashto",
94
+ "tk": "turkmen",
95
+ "nn": "nynorsk",
96
+ "mt": "maltese",
97
+ "sa": "sanskrit",
98
+ "lb": "luxembourgish",
99
+ "my": "myanmar",
100
+ "bo": "tibetan",
101
+ "tl": "tagalog",
102
+ "mg": "malagasy",
103
+ "as": "assamese",
104
+ "tt": "tatar",
105
+ "haw": "hawaiian",
106
+ "ln": "lingala",
107
+ "ha": "hausa",
108
+ "ba": "bashkir",
109
+ "jw": "javanese",
110
+ "su": "sundanese",
111
+ "yue": "cantonese",
112
+ "minnan": "minnan",
113
+ "wuyu": "wuyu",
114
+ "dialect": "dialect",
115
+ "zh/en": "zh/en",
116
+ "en/zh": "en/zh",
117
+ }
118
+
119
+ # language code lookup by name, with a few language aliases
120
+ TO_LANGUAGE_CODE = {
121
+ **{language: code for code, language in LANGUAGES.items()},
122
+ "burmese": "my",
123
+ "valencian": "ca",
124
+ "flemish": "nl",
125
+ "haitian": "ht",
126
+ "letzeburgesch": "lb",
127
+ "pushto": "ps",
128
+ "panjabi": "pa",
129
+ "moldavian": "ro",
130
+ "moldovan": "ro",
131
+ "sinhalese": "si",
132
+ "castilian": "es",
133
+ "mandarin": "zh",
134
+ }
135
+
136
+ AUDIO_EVENT = {
137
+ "ASR": "ASR",
138
+ "AED": "AED",
139
+ "SER": "SER",
140
+ "Speech": "Speech",
141
+ "/Speech": "/Speech",
142
+ "BGM": "BGM",
143
+ "/BGM": "/BGM",
144
+ "Laughter": "Laughter",
145
+ "/Laughter": "/Laughter",
146
+ "Applause": "Applause",
147
+ "/Applause": "/Applause",
148
+ }
149
+
150
+ EMOTION = {
151
+ "HAPPY": "HAPPY",
152
+ "SAD": "SAD",
153
+ "ANGRY": "ANGRY",
154
+ "NEUTRAL": "NEUTRAL",
155
+ }
156
+
157
+ TTS_Vocal_Token = {
158
+ "TTS/B": "TTS/B",
159
+ "TTS/O": "TTS/O",
160
+ "TTS/Q": "TTS/Q",
161
+ "TTS/A": "TTS/A",
162
+ "TTS/CO": "TTS/CO",
163
+ "TTS/CL": "TTS/CL",
164
+ "TTS/H": "TTS/H",
165
+ **{f"TTS/SP{i:02d}": f"TTS/SP{i:02d}" for i in range(1, 14)}
166
+ }
167
+
168
+
169
+ @lru_cache(maxsize=None)
170
+ def get_encoding(name: str = "gpt2", num_languages: int = 99):
171
+ vocab_path = os.path.join(os.path.dirname(__file__), "assets", f"{name}.tiktoken")
172
+ ranks = {
173
+ base64.b64decode(token): int(rank)
174
+ for token, rank in (line.split() for line in open(vocab_path) if line)
175
+ }
176
+ n_vocab = len(ranks)
177
+ special_tokens = {}
178
+
179
+ specials = [
180
+ "<|endoftext|>",
181
+ "<|startoftranscript|>",
182
+ *[f"<|{lang}|>" for lang in list(LANGUAGES.keys())[:num_languages]],
183
+ *[f"<|{audio_event}|>" for audio_event in list(AUDIO_EVENT.keys())],
184
+ *[f"<|{emotion}|>" for emotion in list(EMOTION.keys())],
185
+ "<|translate|>",
186
+ "<|transcribe|>",
187
+ "<|startoflm|>",
188
+ "<|startofprev|>",
189
+ "<|nospeech|>",
190
+ "<|notimestamps|>",
191
+ *[f"<|SPECIAL_TOKEN_{i}|>" for i in range(1, 31)], # register special tokens for ASR
192
+ *[f"<|{tts}|>" for tts in list(TTS_Vocal_Token.keys())], # register special tokens for TTS
193
+ *[f"<|{i * 0.02:.2f}|>" for i in range(1501)],
194
+ ]
195
+
196
+ for token in specials:
197
+ special_tokens[token] = n_vocab
198
+ n_vocab += 1
199
+
200
+ return tiktoken.Encoding(
201
+ name=os.path.basename(vocab_path),
202
+ explicit_n_vocab=n_vocab,
203
+ pat_str=r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
204
+ mergeable_ranks=ranks,
205
+ special_tokens=special_tokens,
206
+ )
207
+
208
+
209
+ @lru_cache(maxsize=None)
210
+ def get_tokenizer(
211
+ multilingual: bool,
212
+ *,
213
+ num_languages: int = 99,
214
+ language: Optional[str] = None,
215
+ task: Optional[str] = None, # Literal["transcribe", "translate", None]
216
+ ) -> Tokenizer:
217
+ if language is not None:
218
+ language = language.lower()
219
+ if language not in LANGUAGES:
220
+ if language in TO_LANGUAGE_CODE:
221
+ language = TO_LANGUAGE_CODE[language]
222
+ else:
223
+ raise ValueError(f"Unsupported language: {language}")
224
+
225
+ if multilingual:
226
+ encoding_name = "multilingual_zh_ja_yue_char_del"
227
+ language = language or "en"
228
+ task = task or "transcribe"
229
+ else:
230
+ encoding_name = "gpt2"
231
+ language = None
232
+ task = None
233
+
234
+ encoding = get_encoding(name=encoding_name, num_languages=num_languages)
235
+
236
+ return Tokenizer(
237
+ encoding=encoding, num_languages=num_languages, language=language, task=task
238
+ )
239
+
240
+
241
+ class CosyVoice2Tokenizer():
242
+ def __init__(self, token_path, skip_special_tokens=True):
243
+ super().__init__()
244
+ # NOTE: non-chat model, all these special tokens remain randomly initialized.
245
+ special_tokens = {
246
+ 'eos_token': '<|endoftext|>',
247
+ 'pad_token': '<|endoftext|>',
248
+ 'additional_special_tokens': [
249
+ '<|im_start|>', '<|im_end|>', '<|endofprompt|>',
250
+ '[breath]', '<strong>', '</strong>', '[noise]',
251
+ '[laughter]', '[cough]', '[clucking]', '[accent]',
252
+ '[quick_breath]',
253
+ "<laughter>", "</laughter>",
254
+ "[hissing]", "[sigh]", "[vocalized-noise]",
255
+ "[lipsmack]", "[mn]"
256
+ ]
257
+ }
258
+ self.special_tokens = special_tokens
259
+ self.tokenizer = AutoTokenizer.from_pretrained(token_path)
260
+ self.tokenizer.add_special_tokens(special_tokens)
261
+ self.skip_special_tokens = skip_special_tokens
262
+
263
+ def encode(self, text, **kwargs):
264
+ tokens = self.tokenizer([text], return_tensors="pt")
265
+ tokens = tokens["input_ids"][0].cpu().tolist()
266
+ return tokens
267
+
268
+ def decode(self, tokens):
269
+ tokens = torch.tensor(tokens, dtype=torch.int64)
270
+ text = self.tokenizer.batch_decode([tokens], skip_special_tokens=self.skip_special_tokens)[0]
271
+ return text
272
+
273
+
274
+ class CosyVoice3Tokenizer(CosyVoice2Tokenizer):
275
+ def __init__(self, token_path, skip_special_tokens=True):
276
+ # NOTE: non-chat model, all these special tokens remain randomly initialized.
277
+ special_tokens = {
278
+ 'eos_token': '<|endoftext|>',
279
+ 'pad_token': '<|endoftext|>',
280
+ 'additional_special_tokens': [
281
+ '<|im_start|>', '<|im_end|>', '<|endofprompt|>',
282
+ '[breath]', '<strong>', '</strong>', '[noise]',
283
+ '[laughter]', '[cough]', '[clucking]', '[accent]',
284
+ '[quick_breath]',
285
+ "<laughter>", "</laughter>",
286
+ "[hissing]", "[sigh]", "[vocalized-noise]",
287
+ "[lipsmack]", "[mn]", "<|endofsystem|>",
288
+ "[AA]", "[AA0]", "[AA1]", "[AA2]", "[AE]", "[AE0]", "[AE1]", "[AE2]", "[AH]", "[AH0]", "[AH1]", "[AH2]",
289
+ "[AO]", "[AO0]", "[AO1]", "[AO2]", "[AW]", "[AW0]", "[AW1]", "[AW2]", "[AY]", "[AY0]", "[AY1]", "[AY2]",
290
+ "[B]", "[CH]", "[D]", "[DH]", "[EH]", "[EH0]", "[EH1]", "[EH2]", "[ER]", "[ER0]", "[ER1]", "[ER2]", "[EY]",
291
+ "[EY0]", "[EY1]", "[EY2]", "[F]", "[G]", "[HH]", "[IH]", "[IH0]", "[IH1]", "[IH2]", "[IY]", "[IY0]", "[IY1]",
292
+ "[IY2]", "[JH]", "[K]", "[L]", "[M]", "[N]", "[NG]", "[OW]", "[OW0]", "[OW1]", "[OW2]", "[OY]", "[OY0]",
293
+ "[OY1]", "[OY2]", "[P]", "[R]", "[S]", "[SH]", "[T]", "[TH]", "[UH]", "[UH0]", "[UH1]", "[UH2]", "[UW]",
294
+ "[UW0]", "[UW1]", "[UW2]", "[V]", "[W]", "[Y]", "[Z]", "[ZH]",
295
+ "[a]", "[ai]", "[an]", "[ang]", "[ao]", "[b]", "[c]", "[ch]", "[d]", "[e]", "[ei]", "[en]", "[eng]", "[f]",
296
+ "[g]", "[h]", "[i]", "[ian]", "[in]", "[ing]", "[iu]", "[ià]", "[iàn]", "[iàng]", "[iào]", "[iá]", "[ián]",
297
+ "[iáng]", "[iáo]", "[iè]", "[ié]", "[iòng]", "[ióng]", "[iù]", "[iú]", "[iā]", "[iān]", "[iāng]", "[iāo]",
298
+ "[iē]", "[iě]", "[iōng]", "[iū]", "[iǎ]", "[iǎn]", "[iǎng]", "[iǎo]", "[iǒng]", "[iǔ]", "[j]", "[k]", "[l]",
299
+ "[m]", "[n]", "[o]", "[ong]", "[ou]", "[p]", "[q]", "[r]", "[s]", "[sh]", "[t]", "[u]", "[uang]", "[ue]",
300
+ "[un]", "[uo]", "[uà]", "[uài]", "[uàn]", "[uàng]", "[uá]", "[uái]", "[uán]", "[uáng]", "[uè]", "[ué]", "[uì]",
301
+ "[uí]", "[uò]", "[uó]", "[uā]", "[uāi]", "[uān]", "[uāng]", "[uē]", "[uě]", "[uī]", "[uō]", "[uǎ]", "[uǎi]",
302
+ "[uǎn]", "[uǎng]", "[uǐ]", "[uǒ]", "[vè]", "[w]", "[x]", "[y]", "[z]", "[zh]", "[à]", "[ài]", "[àn]", "[àng]",
303
+ "[ào]", "[á]", "[ái]", "[án]", "[áng]", "[áo]", "[è]", "[èi]", "[èn]", "[èng]", "[èr]", "[é]", "[éi]", "[én]",
304
+ "[éng]", "[ér]", "[ì]", "[ìn]", "[ìng]", "[í]", "[ín]", "[íng]", "[ò]", "[òng]", "[òu]", "[ó]", "[óng]", "[óu]",
305
+ "[ù]", "[ùn]", "[ú]", "[ún]", "[ā]", "[āi]", "[ān]", "[āng]", "[āo]", "[ē]", "[ēi]", "[ēn]", "[ēng]", "[ě]",
306
+ "[ěi]", "[ěn]", "[ěng]", "[ěr]", "[ī]", "[īn]", "[īng]", "[ō]", "[ōng]", "[ōu]", "[ū]", "[ūn]", "[ǎ]", "[ǎi]",
307
+ "[ǎn]", "[ǎng]", "[ǎo]", "[ǐ]", "[ǐn]", "[ǐng]", "[ǒ]", "[ǒng]", "[ǒu]", "[ǔ]", "[ǔn]", "[ǘ]", "[ǚ]", "[ǜ]"
308
+ ]
309
+ }
310
+ self.special_tokens = special_tokens
311
+ self.tokenizer = AutoTokenizer.from_pretrained(token_path)
312
+ self.tokenizer.add_special_tokens(special_tokens)
313
+ self.skip_special_tokens = skip_special_tokens
314
+
315
+
316
+ @lru_cache(maxsize=None)
317
+ def get_qwen_tokenizer(
318
+ token_path: str,
319
+ skip_special_tokens: bool,
320
+ version: str = 'cosyvoice2'
321
+ ):
322
+ if version == 'cosyvoice2':
323
+ return CosyVoice2Tokenizer(token_path=token_path, skip_special_tokens=skip_special_tokens)
324
+ elif version == 'cosyvoice3':
325
+ return CosyVoice3Tokenizer(token_path=token_path, skip_special_tokens=skip_special_tokens)
326
+ else:
327
+ raise ValueError
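
As a quick usage sketch of the Qwen-based tokenizer above, an encode/decode round trip looks as follows; the `token_path` is a hypothetical placeholder for the Qwen tokenizer directory bundled with the pretrained model.

from cosyvoice.tokenizer.tokenizer import get_qwen_tokenizer

tokenizer = get_qwen_tokenizer(
    token_path='pretrained_models/CosyVoice-BlankEN',  # placeholder path, adjust to your checkpoint layout
    skip_special_tokens=True,
    version='cosyvoice3',
)
ids = tokenizer.encode('你好,欢迎使用CosyVoice。[breath]')  # '[breath]' maps to a single special-token id
text = tokenizer.decode(ids)   # special tokens are dropped because skip_special_tokens=True
print(len(ids), text)
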
cosyvoice/transformer/__init__.py ADDED
File without changes
cosyvoice/transformer/activation.py ADDED
@@ -0,0 +1,84 @@
1
+ # Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe)
2
+ # 2020 Northwestern Polytechnical University (Pengcheng Guo)
3
+ # 2020 Mobvoi Inc (Binbin Zhang)
4
+ # 2024 Alibaba Inc (Xiang Lyu)
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+ """Swish() activation function for Conformer."""
18
+
19
+ import torch
20
+ from torch import nn, sin, pow
21
+ from torch.nn import Parameter
22
+
23
+
24
+ class Swish(torch.nn.Module):
25
+ """Construct an Swish object."""
26
+
27
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
28
+ """Return Swish activation function."""
29
+ return x * torch.sigmoid(x)
30
+
31
+
32
+ # Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
33
+ # LICENSE is in incl_licenses directory.
34
+ class Snake(nn.Module):
35
+ '''
36
+ Implementation of a sine-based periodic activation function
37
+ Shape:
38
+ - Input: (B, C, T)
39
+ - Output: (B, C, T), same shape as the input
40
+ Parameters:
41
+ - alpha - trainable parameter
42
+ References:
43
+ - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
44
+ https://arxiv.org/abs/2006.08195
45
+ Examples:
46
+ >>> a1 = snake(256)
47
+ >>> x = torch.randn(256)
48
+ >>> x = a1(x)
49
+ '''
50
+ def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
51
+ '''
52
+ Initialization.
53
+ INPUT:
54
+ - in_features: shape of the input
55
+ - alpha: trainable parameter
56
+ alpha is initialized to 1 by default, higher values = higher-frequency.
57
+ alpha will be trained along with the rest of your model.
58
+ '''
59
+ super(Snake, self).__init__()
60
+ self.in_features = in_features
61
+
62
+ # initialize alpha
63
+ self.alpha_logscale = alpha_logscale
64
+ if self.alpha_logscale: # log scale alphas initialized to zeros
65
+ self.alpha = Parameter(torch.zeros(in_features) * alpha)
66
+ else: # linear scale alphas initialized to ones
67
+ self.alpha = Parameter(torch.ones(in_features) * alpha)
68
+
69
+ self.alpha.requires_grad = alpha_trainable
70
+
71
+ self.no_div_by_zero = 0.000000001
72
+
73
+ def forward(self, x):
74
+ '''
75
+ Forward pass of the function.
76
+ Applies the function to the input elementwise.
77
+ Snake ∶= x + 1/a * sin^2 (xa)
78
+ '''
79
+ alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T]
80
+ if self.alpha_logscale:
81
+ alpha = torch.exp(alpha)
82
+ x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
83
+
84
+ return x
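
A tiny numeric check (a sketch under the defaults above) that `Snake` implements x + (1/α)·sin²(αx): with the default linear-scale α = 1 it should coincide with x + sin²(x) up to the 1e-9 stabiliser.

import torch
from cosyvoice.transformer.activation import Snake

snake = Snake(in_features=4)        # alpha initialised to ones, expects (B, C, T) input
x = torch.randn(2, 4, 8)
expected = x + torch.sin(x) ** 2    # alpha == 1 for every channel
assert torch.allclose(snake(x), expected, atol=1e-6)
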
cosyvoice/transformer/attention.py ADDED
@@ -0,0 +1,330 @@
1
+ # Copyright (c) 2019 Shigeki Karita
2
+ # 2020 Mobvoi Inc (Binbin Zhang)
3
+ # 2022 Xingchen Song ([email protected])
4
+ # 2024 Alibaba Inc (Xiang Lyu)
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+ """Multi-Head Attention layer definition."""
18
+
19
+ import math
20
+ from typing import Tuple
21
+
22
+ import torch
23
+ from torch import nn
24
+
25
+
26
+ class MultiHeadedAttention(nn.Module):
27
+ """Multi-Head Attention layer.
28
+
29
+ Args:
30
+ n_head (int): The number of heads.
31
+ n_feat (int): The number of features.
32
+ dropout_rate (float): Dropout rate.
33
+
34
+ """
35
+
36
+ def __init__(self,
37
+ n_head: int,
38
+ n_feat: int,
39
+ dropout_rate: float,
40
+ key_bias: bool = True):
41
+ """Construct an MultiHeadedAttention object."""
42
+ super().__init__()
43
+ assert n_feat % n_head == 0
44
+ # We assume d_v always equals d_k
45
+ self.d_k = n_feat // n_head
46
+ self.h = n_head
47
+ self.linear_q = nn.Linear(n_feat, n_feat)
48
+ self.linear_k = nn.Linear(n_feat, n_feat, bias=key_bias)
49
+ self.linear_v = nn.Linear(n_feat, n_feat)
50
+ self.linear_out = nn.Linear(n_feat, n_feat)
51
+ self.dropout = nn.Dropout(p=dropout_rate)
52
+
53
+ def forward_qkv(
54
+ self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
55
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
56
+ """Transform query, key and value.
57
+
58
+ Args:
59
+ query (torch.Tensor): Query tensor (#batch, time1, size).
60
+ key (torch.Tensor): Key tensor (#batch, time2, size).
61
+ value (torch.Tensor): Value tensor (#batch, time2, size).
62
+
63
+ Returns:
64
+ torch.Tensor: Transformed query tensor, size
65
+ (#batch, n_head, time1, d_k).
66
+ torch.Tensor: Transformed key tensor, size
67
+ (#batch, n_head, time2, d_k).
68
+ torch.Tensor: Transformed value tensor, size
69
+ (#batch, n_head, time2, d_k).
70
+
71
+ """
72
+ n_batch = query.size(0)
73
+ q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
74
+ k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
75
+ v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
76
+ q = q.transpose(1, 2) # (batch, head, time1, d_k)
77
+ k = k.transpose(1, 2) # (batch, head, time2, d_k)
78
+ v = v.transpose(1, 2) # (batch, head, time2, d_k)
79
+
80
+ return q, k, v
81
+
82
+ def forward_attention(
83
+ self,
84
+ value: torch.Tensor,
85
+ scores: torch.Tensor,
86
+ mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool)
87
+ ) -> torch.Tensor:
88
+ """Compute attention context vector.
89
+
90
+ Args:
91
+ value (torch.Tensor): Transformed value, size
92
+ (#batch, n_head, time2, d_k).
93
+ scores (torch.Tensor): Attention score, size
94
+ (#batch, n_head, time1, time2).
95
+ mask (torch.Tensor): Mask, size (#batch, 1, time2) or
96
+ (#batch, time1, time2), (0, 0, 0) means fake mask.
97
+
98
+ Returns:
99
+ torch.Tensor: Transformed value (#batch, time1, d_model)
100
+ weighted by the attention score (#batch, time1, time2).
101
+
102
+ """
103
+ n_batch = value.size(0)
104
+ # NOTE(xcsong): When will `if mask.size(2) > 0` be True?
105
+ # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the
106
+ # 1st chunk to ease the onnx export.]
107
+ # 2. pytorch training
108
+ if mask.size(2) > 0: # time2 > 0
109
+ mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2)
110
+ # For last chunk, time2 might be larger than scores.size(-1)
111
+ mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2)
112
+ scores = scores.masked_fill(mask, -float('inf'))
113
+ attn = torch.softmax(scores, dim=-1).masked_fill(
114
+ mask, 0.0) # (batch, head, time1, time2)
115
+ # NOTE(xcsong): When will `if mask.size(2) > 0` be False?
116
+ # 1. onnx(16/-1, -1/-1, 16/0)
117
+ # 2. jit (16/-1, -1/-1, 16/0, 16/4)
118
+ else:
119
+ attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2)
120
+
121
+ p_attn = self.dropout(attn)
122
+ x = torch.matmul(p_attn, value) # (batch, head, time1, d_k)
123
+ x = (x.transpose(1, 2).contiguous().view(n_batch, -1,
124
+ self.h * self.d_k)
125
+ ) # (batch, time1, d_model)
126
+
127
+ return self.linear_out(x) # (batch, time1, d_model)
128
+
129
+ def forward(
130
+ self,
131
+ query: torch.Tensor,
132
+ key: torch.Tensor,
133
+ value: torch.Tensor,
134
+ mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
135
+ pos_emb: torch.Tensor = torch.empty(0),
136
+ cache: torch.Tensor = torch.zeros((0, 0, 0, 0))
137
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
138
+ """Compute scaled dot product attention.
139
+
140
+ Args:
141
+ query (torch.Tensor): Query tensor (#batch, time1, size).
142
+ key (torch.Tensor): Key tensor (#batch, time2, size).
143
+ value (torch.Tensor): Value tensor (#batch, time2, size).
144
+ mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
145
+ (#batch, time1, time2).
146
+ 1.When applying cross attention between decoder and encoder,
147
+ the batch padding mask for input is in (#batch, 1, T) shape.
148
+ 2.When applying self attention of encoder,
149
+ the mask is in (#batch, T, T) shape.
150
+ 3.When applying self attention of decoder,
151
+ the mask is in (#batch, L, L) shape.
152
+ 4.If the different position in decoder see different block
153
+ of the encoder, such as Mocha, the passed in mask could be
154
+ in (#batch, L, T) shape. But there is no such case in current
155
+ CosyVoice.
156
+ cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
157
+ where `cache_t == chunk_size * num_decoding_left_chunks`
158
+ and `head * d_k == size`
159
+
160
+
161
+ Returns:
162
+ torch.Tensor: Output tensor (#batch, time1, d_model).
163
+ torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
164
+ where `cache_t == chunk_size * num_decoding_left_chunks`
165
+ and `head * d_k == size`
166
+
167
+ """
168
+ q, k, v = self.forward_qkv(query, key, value)
169
+
170
+ # NOTE(xcsong):
171
+ # when export onnx model, for 1st chunk, we feed
172
+ # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode)
173
+ # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode).
174
+ # In all modes, `if cache.size(0) > 0` will always be `True`
175
+ # and we will always do splitting and
176
+ # concatenation (this will simplify onnx export). Note that
177
+ # it's OK to concat & split zero-shaped tensors(see code below).
178
+ # when export jit model, for 1st chunk, we always feed
179
+ # cache(0, 0, 0, 0) since jit supports dynamic if-branch.
180
+ # >>> a = torch.ones((1, 2, 0, 4))
181
+ # >>> b = torch.ones((1, 2, 3, 4))
182
+ # >>> c = torch.cat((a, b), dim=2)
183
+ # >>> torch.equal(b, c) # True
184
+ # >>> d = torch.split(a, 2, dim=-1)
185
+ # >>> torch.equal(d[0], d[1]) # True
186
+ if cache.size(0) > 0:
187
+ key_cache, value_cache = torch.split(cache,
188
+ cache.size(-1) // 2,
189
+ dim=-1)
190
+ k = torch.cat([key_cache, k], dim=2)
191
+ v = torch.cat([value_cache, v], dim=2)
192
+ # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's
193
+ # non-trivial to calculate `next_cache_start` here.
194
+ new_cache = torch.cat((k, v), dim=-1)
195
+
196
+ scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
197
+ return self.forward_attention(v, scores, mask), new_cache
198
+
199
+
200
+ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
201
+ """Multi-Head Attention layer with relative position encoding.
202
+ Paper: https://arxiv.org/abs/1901.02860
203
+ Args:
204
+ n_head (int): The number of heads.
205
+ n_feat (int): The number of features.
206
+ dropout_rate (float): Dropout rate.
207
+ """
208
+
209
+ def __init__(self,
210
+ n_head: int,
211
+ n_feat: int,
212
+ dropout_rate: float,
213
+ key_bias: bool = True):
214
+ """Construct an RelPositionMultiHeadedAttention object."""
215
+ super().__init__(n_head, n_feat, dropout_rate, key_bias)
216
+ # linear transformation for positional encoding
217
+ self.linear_pos = nn.Linear(n_feat, n_feat, bias=False)
218
+ # these two learnable bias are used in matrix c and matrix d
219
+ # as described in https://arxiv.org/abs/1901.02860 Section 3.3
220
+ self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k))
221
+ self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k))
222
+ torch.nn.init.xavier_uniform_(self.pos_bias_u)
223
+ torch.nn.init.xavier_uniform_(self.pos_bias_v)
224
+
225
+ def rel_shift(self, x: torch.Tensor) -> torch.Tensor:
226
+ """Compute relative positional encoding.
227
+
228
+ Args:
229
+ x (torch.Tensor): Input tensor (batch, head, time1, 2*time1-1).
230
+ time1 means the length of query vector.
231
+
232
+ Returns:
233
+ torch.Tensor: Output tensor.
234
+
235
+ """
236
+ zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1),
237
+ device=x.device,
238
+ dtype=x.dtype)
239
+ x_padded = torch.cat([zero_pad, x], dim=-1)
240
+
241
+ x_padded = x_padded.view(x.size()[0],
242
+ x.size()[1],
243
+ x.size(3) + 1, x.size(2))
244
+ x = x_padded[:, :, 1:].view_as(x)[
245
+ :, :, :, : x.size(-1) // 2 + 1
246
+ ] # only keep the positions from 0 to time2
247
+ return x
248
+
249
+ def forward(
250
+ self,
251
+ query: torch.Tensor,
252
+ key: torch.Tensor,
253
+ value: torch.Tensor,
254
+ mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
255
+ pos_emb: torch.Tensor = torch.empty(0),
256
+ cache: torch.Tensor = torch.zeros((0, 0, 0, 0))
257
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
258
+ """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
259
+ Args:
260
+ query (torch.Tensor): Query tensor (#batch, time1, size).
261
+ key (torch.Tensor): Key tensor (#batch, time2, size).
262
+ value (torch.Tensor): Value tensor (#batch, time2, size).
263
+ mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
264
+ (#batch, time1, time2), (0, 0, 0) means fake mask.
265
+ pos_emb (torch.Tensor): Positional embedding tensor
266
+ (#batch, time2, size).
267
+ cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
268
+ where `cache_t == chunk_size * num_decoding_left_chunks`
269
+ and `head * d_k == size`
270
+ Returns:
271
+ torch.Tensor: Output tensor (#batch, time1, d_model).
272
+ torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
273
+ where `cache_t == chunk_size * num_decoding_left_chunks`
274
+ and `head * d_k == size`
275
+ """
276
+ q, k, v = self.forward_qkv(query, key, value)
277
+ q = q.transpose(1, 2) # (batch, time1, head, d_k)
278
+
279
+ # NOTE(xcsong):
280
+ # when export onnx model, for 1st chunk, we feed
281
+ # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode)
282
+ # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode).
283
+ # In all modes, `if cache.size(0) > 0` will always be `True`
284
+ # and we will always do splitting and
285
+ # concatenation (this will simplify onnx export). Note that
286
+ # it's OK to concat & split zero-shaped tensors(see code below).
287
+ # when export jit model, for 1st chunk, we always feed
288
+ # cache(0, 0, 0, 0) since jit supports dynamic if-branch.
289
+ # >>> a = torch.ones((1, 2, 0, 4))
290
+ # >>> b = torch.ones((1, 2, 3, 4))
291
+ # >>> c = torch.cat((a, b), dim=2)
292
+ # >>> torch.equal(b, c) # True
293
+ # >>> d = torch.split(a, 2, dim=-1)
294
+ # >>> torch.equal(d[0], d[1]) # True
295
+ if cache.size(0) > 0:
296
+ key_cache, value_cache = torch.split(cache,
297
+ cache.size(-1) // 2,
298
+ dim=-1)
299
+ k = torch.cat([key_cache, k], dim=2)
300
+ v = torch.cat([value_cache, v], dim=2)
301
+ # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's
302
+ # non-trivial to calculate `next_cache_start` here.
303
+ new_cache = torch.cat((k, v), dim=-1)
304
+
305
+ n_batch_pos = pos_emb.size(0)
306
+ p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
307
+ p = p.transpose(1, 2) # (batch, head, time1, d_k)
308
+
309
+ # (batch, head, time1, d_k)
310
+ q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2)
311
+ # (batch, head, time1, d_k)
312
+ q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2)
313
+
314
+ # compute attention score
315
+ # first compute matrix a and matrix c
316
+ # as described in https://arxiv.org/abs/1901.02860 Section 3.3
317
+ # (batch, head, time1, time2)
318
+ matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1))
319
+
320
+ # compute matrix b and matrix d
321
+ # (batch, head, time1, time2)
322
+ matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1))
323
+ # NOTE(Xiang Lyu): Keep rel_shift since espnet rel_pos_emb is used
324
+ if matrix_ac.shape != matrix_bd.shape:
325
+ matrix_bd = self.rel_shift(matrix_bd)
326
+
327
+ scores = (matrix_ac + matrix_bd) / math.sqrt(
328
+ self.d_k) # (batch, head, time1, time2)
329
+
330
+ return self.forward_attention(v, scores, mask), new_cache
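
The KV-cache contract described in the NOTE comments above can be exercised with a small standalone loop (a sketch with arbitrary sizes): start from a zero-shaped cache, feed each step's returned cache back in, and the cached time axis grows by one frame per step.

import torch
from cosyvoice.transformer.attention import MultiHeadedAttention

mha = MultiHeadedAttention(n_head=2, n_feat=8, dropout_rate=0.0).eval()
cache = torch.zeros(0, 0, 0, 0)              # zero-shaped cache for the first step
xs = torch.randn(1, 5, 8)
with torch.no_grad():
    for t in range(xs.size(1)):
        frame = xs[:, t:t + 1]               # one new frame used as query/key/value
        out, cache = mha(frame, frame, frame,
                         mask=torch.ones(0, 0, 0, dtype=torch.bool),
                         cache=cache)
        assert cache.shape == (1, 2, t + 1, 8)   # (1, head, cache_t + 1, d_k * 2)
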
cosyvoice/transformer/convolution.py ADDED
@@ -0,0 +1,258 @@
1
+ # Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
2
+ # 2024 Alibaba Inc (Xiang Lyu)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # Modified from ESPnet(https://github.com/espnet/espnet)
16
+ """ConvolutionModule definition."""
17
+
18
+ from typing import Tuple
19
+
20
+ import torch
21
+ from torch import nn
22
+ import torch.nn.functional as F
23
+
24
+
25
+ class ConvolutionModule(nn.Module):
26
+ """ConvolutionModule in Conformer model."""
27
+
28
+ def __init__(self,
29
+ channels: int,
30
+ kernel_size: int = 15,
31
+ activation: nn.Module = nn.ReLU(),
32
+ norm: str = "batch_norm",
33
+ causal: bool = False,
34
+ bias: bool = True):
35
+ """Construct an ConvolutionModule object.
36
+ Args:
37
+ channels (int): The number of channels of conv layers.
38
+ kernel_size (int): Kernel size of conv layers.
39
+ causal (int): Whether use causal convolution or not
40
+ """
41
+ super().__init__()
42
+
43
+ self.pointwise_conv1 = nn.Conv1d(
44
+ channels,
45
+ 2 * channels,
46
+ kernel_size=1,
47
+ stride=1,
48
+ padding=0,
49
+ bias=bias,
50
+ )
51
+ # self.lorder is used to distinguish if it's a causal convolution,
52
+ # if self.lorder > 0: it's a causal convolution, the input will be
53
+ # padded with self.lorder frames on the left in forward.
54
+ # else: it's a symmetrical convolution
55
+ if causal:
56
+ padding = 0
57
+ self.lorder = kernel_size - 1
58
+ else:
59
+ # kernel_size should be an odd number for non-causal convolution
60
+ assert (kernel_size - 1) % 2 == 0
61
+ padding = (kernel_size - 1) // 2
62
+ self.lorder = 0
63
+ self.depthwise_conv = nn.Conv1d(
64
+ channels,
65
+ channels,
66
+ kernel_size,
67
+ stride=1,
68
+ padding=padding,
69
+ groups=channels,
70
+ bias=bias,
71
+ )
72
+
73
+ assert norm in ['batch_norm', 'layer_norm']
74
+ if norm == "batch_norm":
75
+ self.use_layer_norm = False
76
+ self.norm = nn.BatchNorm1d(channels)
77
+ else:
78
+ self.use_layer_norm = True
79
+ self.norm = nn.LayerNorm(channels)
80
+
81
+ self.pointwise_conv2 = nn.Conv1d(
82
+ channels,
83
+ channels,
84
+ kernel_size=1,
85
+ stride=1,
86
+ padding=0,
87
+ bias=bias,
88
+ )
89
+ self.activation = activation
90
+
91
+ def forward(
92
+ self,
93
+ x: torch.Tensor,
94
+ mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
95
+ cache: torch.Tensor = torch.zeros((0, 0, 0)),
96
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
97
+ """Compute convolution module.
98
+ Args:
99
+ x (torch.Tensor): Input tensor (#batch, time, channels).
100
+ mask_pad (torch.Tensor): used for batch padding (#batch, 1, time),
101
+ (0, 0, 0) means fake mask.
102
+ cache (torch.Tensor): left context cache, it is only
103
+ used in causal convolution (#batch, channels, cache_t),
104
+ (0, 0, 0) means fake cache.
105
+ Returns:
106
+ torch.Tensor: Output tensor (#batch, time, channels).
107
+ """
108
+ # exchange the temporal dimension and the feature dimension
109
+ x = x.transpose(1, 2) # (#batch, channels, time)
110
+
111
+ # mask batch padding
112
+ if mask_pad.size(2) > 0: # time > 0
113
+ x.masked_fill_(~mask_pad, 0.0)
114
+
115
+ if self.lorder > 0:
116
+ if cache.size(2) == 0: # cache_t == 0
117
+ x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0)
118
+ else:
119
+ assert cache.size(0) == x.size(0) # equal batch
120
+ assert cache.size(1) == x.size(1) # equal channel
121
+ x = torch.cat((cache, x), dim=2)
122
+ assert (x.size(2) > self.lorder)
123
+ new_cache = x[:, :, -self.lorder:]
124
+ else:
125
+ # It's better we just return None if no cache is required,
126
+ # However, for JIT export, here we just fake one tensor instead of
127
+ # None.
128
+ new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)
129
+
130
+ # GLU mechanism
131
+ x = self.pointwise_conv1(x) # (batch, 2*channel, dim)
132
+ x = nn.functional.glu(x, dim=1) # (batch, channel, dim)
133
+
134
+ # 1D Depthwise Conv
135
+ x = self.depthwise_conv(x)
136
+ if self.use_layer_norm:
137
+ x = x.transpose(1, 2)
138
+ x = self.activation(self.norm(x))
139
+ if self.use_layer_norm:
140
+ x = x.transpose(1, 2)
141
+ x = self.pointwise_conv2(x)
142
+ # mask batch padding
143
+ if mask_pad.size(2) > 0: # time > 0
144
+ x.masked_fill_(~mask_pad, 0.0)
145
+
146
+ return x.transpose(1, 2), new_cache
147
+
148
+
149
+ # NOTE(Xiang Lyu) causal conv module used in convolution-based vocoder
150
+ class CausalConv1d(torch.nn.Conv1d):
151
+ def __init__(
152
+ self,
153
+ in_channels: int,
154
+ out_channels: int,
155
+ kernel_size: int,
156
+ stride: int = 1,
157
+ dilation: int = 1,
158
+ groups: int = 1,
159
+ bias: bool = True,
160
+ padding_mode: str = 'zeros',
161
+ causal_type: str = 'left',
162
+ device=None,
163
+ dtype=None
164
+ ) -> None:
165
+ super(CausalConv1d, self).__init__(in_channels, out_channels,
166
+ kernel_size, stride=1,
167
+ padding=0, dilation=dilation,
168
+ groups=groups, bias=bias,
169
+ padding_mode=padding_mode,
170
+ device=device, dtype=dtype)
171
+ assert stride == 1
172
+ self.causal_padding = int((kernel_size * dilation - dilation) / 2) * 2 + (kernel_size + 1) % 2
173
+ assert causal_type in ['left', 'right']
174
+ self.causal_type = causal_type
175
+
176
+ def forward(self, x: torch.Tensor, cache: torch.Tensor = torch.zeros(0, 0, 0)) -> Tuple[torch.Tensor]:
177
+ input_timestep = x.shape[2]
178
+ if cache.size(2) == 0:
179
+ cache = torch.zeros(x.shape[0], x.shape[1], self.causal_padding).to(x)
180
+ assert cache.size(2) == self.causal_padding
181
+ if self.causal_type == 'left':
182
+ x = torch.concat([cache, x], dim=2)
183
+ else:
184
+ x = torch.concat([x, cache], dim=2)
185
+ x = super(CausalConv1d, self).forward(x)
186
+ assert x.shape[2] == input_timestep
187
+ return x
188
+
189
+
190
+ class CausalConv1dDownSample(torch.nn.Conv1d):
191
+ def __init__(
192
+ self,
193
+ in_channels: int,
194
+ out_channels: int,
195
+ kernel_size: int,
196
+ stride: int = 1,
197
+ dilation: int = 1,
198
+ groups: int = 1,
199
+ bias: bool = True,
200
+ padding_mode: str = 'zeros',
201
+ device=None,
202
+ dtype=None
203
+ ) -> None:
204
+ super(CausalConv1dDownSample, self).__init__(in_channels, out_channels,
205
+ kernel_size, stride,
206
+ padding=0, dilation=dilation,
207
+ groups=groups, bias=bias,
208
+ padding_mode=padding_mode,
209
+ device=device, dtype=dtype)
210
+ assert stride != 1 and dilation == 1
211
+ assert kernel_size % stride == 0
212
+ self.causal_padding = stride - 1
213
+
214
+ def forward(self, x: torch.Tensor, cache: torch.Tensor = torch.zeros(0, 0, 0)) -> Tuple[torch.Tensor, torch.Tensor]:
215
+ if cache.size(2) == 0:
216
+ x = F.pad(x, (self.causal_padding, 0), value=0.0)
217
+ else:
218
+ assert cache.size(2) == self.causal_padding
219
+ x = torch.concat([cache, x], dim=2)
220
+ x = super(CausalConv1dDownSample, self).forward(x)
221
+ return x
222
+
223
+
224
+ class CausalConv1dUpsample(torch.nn.Conv1d):
225
+ def __init__(
226
+ self,
227
+ in_channels: int,
228
+ out_channels: int,
229
+ kernel_size: int,
230
+ stride: int = 1,
231
+ dilation: int = 1,
232
+ groups: int = 1,
233
+ bias: bool = True,
234
+ padding_mode: str = 'zeros',
235
+ device=None,
236
+ dtype=None
237
+ ) -> None:
238
+ super(CausalConv1dUpsample, self).__init__(in_channels, out_channels,
239
+ kernel_size, 1,
240
+ padding=0, dilation=dilation,
241
+ groups=groups, bias=bias,
242
+ padding_mode=padding_mode,
243
+ device=device, dtype=dtype)
244
+ assert dilation == 1
245
+ self.causal_padding = kernel_size - 1
246
+ self.upsample = torch.nn.Upsample(scale_factor=stride, mode='nearest')
247
+
248
+ def forward(self, x: torch.Tensor, cache: torch.Tensor = torch.zeros(0, 0, 0)) -> Tuple[torch.Tensor, torch.Tensor]:
249
+ x = self.upsample(x)
250
+ input_timestep = x.shape[2]
251
+ if cache.size(2) == 0:
252
+ x = F.pad(x, (self.causal_padding, 0), value=0.0)
253
+ else:
254
+ assert cache.size(2) == self.causal_padding
255
+ x = torch.concat([cache, x], dim=2)
256
+ x = super(CausalConv1dUpsample, self).forward(x)
257
+ assert input_timestep == x.shape[2]
258
+ return x
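
A short sketch of the left-padding behaviour of `CausalConv1d` defined above: the output keeps the input's time length, and chunked inference matches full-sequence inference when the tail of the previous chunk is passed as the next chunk's cache (channel count, kernel size and lengths below are arbitrary).

import torch
from cosyvoice.transformer.convolution import CausalConv1d

conv = CausalConv1d(in_channels=4, out_channels=4, kernel_size=3)
x = torch.randn(1, 4, 10)                    # (batch, channels, time)
with torch.no_grad():
    y = conv(x)                              # first call pads with a zero cache internally
    assert y.shape == (1, 4, 10)             # time dimension is preserved
    # streaming: carry the tail of chunk 1 as the cache of chunk 2
    y1 = conv(x[:, :, :6])
    y2 = conv(x[:, :, 6:], cache=x[:, :, 6 - conv.causal_padding:6])
    assert torch.allclose(torch.cat([y1, y2], dim=2), y, atol=1e-6)
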
cosyvoice/transformer/decoder.py ADDED
@@ -0,0 +1,396 @@
1
+ # Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
2
+ # 2024 Alibaba Inc (Xiang Lyu)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # Modified from ESPnet(https://github.com/espnet/espnet)
16
+ """Decoder definition."""
17
+ from typing import Tuple, List, Optional
18
+
19
+ import torch
20
+ import torch.utils.checkpoint as ckpt
21
+ import logging
22
+
23
+ from cosyvoice.transformer.decoder_layer import DecoderLayer
24
+ from cosyvoice.transformer.positionwise_feed_forward import PositionwiseFeedForward
25
+ from cosyvoice.utils.class_utils import (
26
+ COSYVOICE_EMB_CLASSES,
27
+ COSYVOICE_ATTENTION_CLASSES,
28
+ COSYVOICE_ACTIVATION_CLASSES,
29
+ )
30
+ from cosyvoice.utils.mask import (subsequent_mask, make_pad_mask)
31
+
32
+
33
+ class TransformerDecoder(torch.nn.Module):
34
+ """Base class of Transformer decoder module.
35
+ Args:
36
+ vocab_size: output dim
37
+ encoder_output_size: dimension of attention
38
+ attention_heads: the number of heads of multi head attention
39
+ linear_units: the hidden units number of position-wise feedforward
40
+ num_blocks: the number of decoder blocks
41
+ dropout_rate: dropout rate
42
+ self_attention_dropout_rate: dropout rate for attention
43
+ input_layer: input layer type
44
+ use_output_layer: whether to use output layer
45
+ pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
46
+ normalize_before:
47
+ True: use layer_norm before each sub-block of a layer.
48
+ False: use layer_norm after each sub-block of a layer.
49
+ src_attention: if false, encoder-decoder cross attention is not
50
+ applied, such as CIF model
51
+ key_bias: whether use bias in attention.linear_k, False for whisper models.
52
+ gradient_checkpointing: rerunning a forward-pass segment for each
53
+ checkpointed segment during backward.
54
+ tie_word_embedding: Tie or clone module weights depending of whether we are
55
+ using TorchScript or not
56
+ """
57
+
58
+ def __init__(
59
+ self,
60
+ vocab_size: int,
61
+ encoder_output_size: int,
62
+ attention_heads: int = 4,
63
+ linear_units: int = 2048,
64
+ num_blocks: int = 6,
65
+ dropout_rate: float = 0.1,
66
+ positional_dropout_rate: float = 0.1,
67
+ self_attention_dropout_rate: float = 0.0,
68
+ src_attention_dropout_rate: float = 0.0,
69
+ input_layer: str = "embed",
70
+ use_output_layer: bool = True,
71
+ normalize_before: bool = True,
72
+ src_attention: bool = True,
73
+ key_bias: bool = True,
74
+ activation_type: str = "relu",
75
+ gradient_checkpointing: bool = False,
76
+ tie_word_embedding: bool = False,
77
+ ):
78
+ super().__init__()
79
+ attention_dim = encoder_output_size
80
+ activation = COSYVOICE_ACTIVATION_CLASSES[activation_type]()
81
+
82
+ self.embed = torch.nn.Sequential(
83
+ torch.nn.Identity() if input_layer == "no_pos" else
84
+ torch.nn.Embedding(vocab_size, attention_dim),
85
+ COSYVOICE_EMB_CLASSES[input_layer](attention_dim,
86
+ positional_dropout_rate),
87
+ )
88
+
89
+ self.normalize_before = normalize_before
90
+ self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-5)
91
+ self.use_output_layer = use_output_layer
92
+ if use_output_layer:
93
+ self.output_layer = torch.nn.Linear(attention_dim, vocab_size)
94
+ else:
95
+ self.output_layer = torch.nn.Identity()
96
+ self.num_blocks = num_blocks
97
+ self.decoders = torch.nn.ModuleList([
98
+ DecoderLayer(
99
+ attention_dim,
100
+ COSYVOICE_ATTENTION_CLASSES["selfattn"](
101
+ attention_heads, attention_dim,
102
+ self_attention_dropout_rate, key_bias),
103
+ COSYVOICE_ATTENTION_CLASSES["selfattn"](
104
+ attention_heads, attention_dim, src_attention_dropout_rate,
105
+ key_bias) if src_attention else None,
106
+ PositionwiseFeedForward(attention_dim, linear_units,
107
+ dropout_rate, activation),
108
+ dropout_rate,
109
+ normalize_before,
110
+ ) for _ in range(self.num_blocks)
111
+ ])
112
+
113
+ self.gradient_checkpointing = gradient_checkpointing
114
+ self.tie_word_embedding = tie_word_embedding
115
+
116
+ def forward(
117
+ self,
118
+ memory: torch.Tensor,
119
+ memory_mask: torch.Tensor,
120
+ ys_in_pad: torch.Tensor,
121
+ ys_in_lens: torch.Tensor,
122
+ r_ys_in_pad: torch.Tensor = torch.empty(0),
123
+ reverse_weight: float = 0.0,
124
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
125
+ """Forward decoder.
126
+ Args:
127
+ memory: encoded memory, float32 (batch, maxlen_in, feat)
128
+ memory_mask: encoder memory mask, (batch, 1, maxlen_in)
129
+ ys_in_pad: padded input token ids, int64 (batch, maxlen_out)
130
+ ys_in_lens: input lengths of this batch (batch)
131
+ r_ys_in_pad: not used in transformer decoder, in order to unify api
132
+ with bidirectional decoder
133
+ reverse_weight: not used in transformer decoder, in order to unify
134
+ api with bidirectional decode
135
+ Returns:
136
+ (tuple): tuple containing:
137
+ x: decoded token score before softmax (batch, maxlen_out,
138
+ vocab_size) if use_output_layer is True,
139
+ torch.tensor(0.0), in order to unify api with bidirectional decoder
140
+ olens: (batch, )
141
+ NOTE(xcsong):
142
+ We pass the `__call__` method of the modules instead of `forward` to the
143
+ checkpointing API because `__call__` attaches all the hooks of the module.
144
+ https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
145
+ """
146
+ tgt = ys_in_pad
147
+ maxlen = tgt.size(1)
148
+ # tgt_mask: (B, 1, L)
149
+ tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1)
150
+ tgt_mask = tgt_mask.to(tgt.device)
151
+ # m: (1, L, L)
152
+ m = subsequent_mask(tgt_mask.size(-1),
153
+ device=tgt_mask.device).unsqueeze(0)
154
+ # tgt_mask: (B, L, L)
155
+ tgt_mask = tgt_mask & m
156
+ x, _ = self.embed(tgt)
157
+ if self.gradient_checkpointing and self.training:
158
+ x = self.forward_layers_checkpointed(x, tgt_mask, memory,
159
+ memory_mask)
160
+ else:
161
+ x = self.forward_layers(x, tgt_mask, memory, memory_mask)
162
+ if self.normalize_before:
163
+ x = self.after_norm(x)
164
+ if self.use_output_layer:
165
+ x = self.output_layer(x)
166
+ olens = tgt_mask.sum(1)
167
+ return x, torch.tensor(0.0), olens
168
+
169
+ def forward_layers(self, x: torch.Tensor, tgt_mask: torch.Tensor,
170
+ memory: torch.Tensor,
171
+ memory_mask: torch.Tensor) -> torch.Tensor:
172
+ for layer in self.decoders:
173
+ x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory,
174
+ memory_mask)
175
+ return x
176
+
177
+ @torch.jit.unused
178
+ def forward_layers_checkpointed(self, x: torch.Tensor,
179
+ tgt_mask: torch.Tensor,
180
+ memory: torch.Tensor,
181
+ memory_mask: torch.Tensor) -> torch.Tensor:
182
+ for layer in self.decoders:
183
+ x, tgt_mask, memory, memory_mask = ckpt.checkpoint(
184
+ layer.__call__, x, tgt_mask, memory, memory_mask)
185
+ return x
186
+
187
+ def forward_one_step(
188
+ self,
189
+ memory: torch.Tensor,
190
+ memory_mask: torch.Tensor,
191
+ tgt: torch.Tensor,
192
+ tgt_mask: torch.Tensor,
193
+ cache: Optional[List[torch.Tensor]] = None,
194
+ ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
195
+ """Forward one step.
196
+ This is only used for decoding.
197
+ Args:
198
+ memory: encoded memory, float32 (batch, maxlen_in, feat)
199
+ memory_mask: encoded memory mask, (batch, 1, maxlen_in)
200
+ tgt: input token ids, int64 (batch, maxlen_out)
201
+ tgt_mask: input token mask, (batch, maxlen_out)
202
+ dtype=torch.uint8 in PyTorch 1.2-
203
+ dtype=torch.bool in PyTorch 1.2+ (include 1.2)
204
+ cache: cached output list of (batch, max_time_out-1, size)
205
+ Returns:
206
+ y, cache: NN output value and cache per `self.decoders`.
207
+ `y.shape` is (batch, maxlen_out, token)
208
+ """
209
+ x, _ = self.embed(tgt)
210
+ new_cache = []
211
+ for i, decoder in enumerate(self.decoders):
212
+ if cache is None:
213
+ c = None
214
+ else:
215
+ c = cache[i]
216
+ x, tgt_mask, memory, memory_mask = decoder(x,
217
+ tgt_mask,
218
+ memory,
219
+ memory_mask,
220
+ cache=c)
221
+ new_cache.append(x)
222
+ if self.normalize_before:
223
+ y = self.after_norm(x[:, -1])
224
+ else:
225
+ y = x[:, -1]
226
+ if self.use_output_layer:
227
+ y = torch.log_softmax(self.output_layer(y), dim=-1)
228
+ return y, new_cache
229
+
230
+ def tie_or_clone_weights(self, jit_mode: bool = True):
231
+ """Tie or clone module weights (between word_emb and output_layer)
232
+ depending of whether we are using TorchScript or not"""
233
+ if not self.use_output_layer:
234
+ return
235
+ if jit_mode:
236
+ logging.info("clone emb.weight to output.weight")
237
+ self.output_layer.weight = torch.nn.Parameter(
238
+ self.embed[0].weight.clone())
239
+ else:
240
+ logging.info("tie emb.weight with output.weight")
241
+ self.output_layer.weight = self.embed[0].weight
242
+
243
+ if getattr(self.output_layer, "bias", None) is not None:
244
+ self.output_layer.bias.data = torch.nn.functional.pad(
245
+ self.output_layer.bias.data,
246
+ (
247
+ 0,
248
+ self.output_layer.weight.shape[0] -
249
+ self.output_layer.bias.shape[0],
250
+ ),
251
+ "constant",
252
+ 0,
253
+ )
254
+
255
+
256
+ class BiTransformerDecoder(torch.nn.Module):
257
+ """Base class of Transformer decoder module.
258
+ Args:
259
+ vocab_size: output dim
260
+ encoder_output_size: dimension of attention
261
+ attention_heads: the number of heads of multi head attention
262
+ linear_units: the hidden units number of position-wise feedforward
263
+ num_blocks: the number of decoder blocks
264
+ r_num_blocks: the number of right to left decoder blocks
265
+ dropout_rate: dropout rate
266
+ self_attention_dropout_rate: dropout rate for attention
267
+ input_layer: input layer type
268
+ use_output_layer: whether to use output layer
269
+ pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
270
+ normalize_before:
271
+ True: use layer_norm before each sub-block of a layer.
272
+ False: use layer_norm after each sub-block of a layer.
273
+ key_bias: whether use bias in attention.linear_k, False for whisper models.
274
+ """
275
+
276
+ def __init__(
277
+ self,
278
+ vocab_size: int,
279
+ encoder_output_size: int,
280
+ attention_heads: int = 4,
281
+ linear_units: int = 2048,
282
+ num_blocks: int = 6,
283
+ r_num_blocks: int = 0,
284
+ dropout_rate: float = 0.1,
285
+ positional_dropout_rate: float = 0.1,
286
+ self_attention_dropout_rate: float = 0.0,
287
+ src_attention_dropout_rate: float = 0.0,
288
+ input_layer: str = "embed",
289
+ use_output_layer: bool = True,
290
+ normalize_before: bool = True,
291
+ key_bias: bool = True,
292
+ gradient_checkpointing: bool = False,
293
+ tie_word_embedding: bool = False,
294
+ ):
295
+
296
+ super().__init__()
297
+ self.tie_word_embedding = tie_word_embedding
298
+ self.left_decoder = TransformerDecoder(
299
+ vocab_size,
300
+ encoder_output_size,
301
+ attention_heads,
302
+ linear_units,
303
+ num_blocks,
304
+ dropout_rate,
305
+ positional_dropout_rate,
306
+ self_attention_dropout_rate,
307
+ src_attention_dropout_rate,
308
+ input_layer,
309
+ use_output_layer,
310
+ normalize_before,
311
+ key_bias=key_bias,
312
+ gradient_checkpointing=gradient_checkpointing,
313
+ tie_word_embedding=tie_word_embedding)
314
+
315
+ self.right_decoder = TransformerDecoder(
316
+ vocab_size,
317
+ encoder_output_size,
318
+ attention_heads,
319
+ linear_units,
320
+ r_num_blocks,
321
+ dropout_rate,
322
+ positional_dropout_rate,
323
+ self_attention_dropout_rate,
324
+ src_attention_dropout_rate,
325
+ input_layer,
326
+ use_output_layer,
327
+ normalize_before,
328
+ key_bias=key_bias,
329
+ gradient_checkpointing=gradient_checkpointing,
330
+ tie_word_embedding=tie_word_embedding)
331
+
332
+ def forward(
333
+ self,
334
+ memory: torch.Tensor,
335
+ memory_mask: torch.Tensor,
336
+ ys_in_pad: torch.Tensor,
337
+ ys_in_lens: torch.Tensor,
338
+ r_ys_in_pad: torch.Tensor,
339
+ reverse_weight: float = 0.0,
340
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
341
+ """Forward decoder.
342
+ Args:
343
+ memory: encoded memory, float32 (batch, maxlen_in, feat)
344
+ memory_mask: encoder memory mask, (batch, 1, maxlen_in)
345
+ ys_in_pad: padded input token ids, int64 (batch, maxlen_out)
346
+ ys_in_lens: input lengths of this batch (batch)
347
+ r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out),
348
+ used for right to left decoder
349
+ reverse_weight: used for right to left decoder
350
+ Returns:
351
+ (tuple): tuple containing:
352
+ x: decoded token score before softmax (batch, maxlen_out,
353
+ vocab_size) if use_output_layer is True,
354
+ r_x: x: decoded token score (right to left decoder)
355
+ before softmax (batch, maxlen_out, vocab_size)
356
+ if use_output_layer is True,
357
+ olens: (batch, )
358
+ """
359
+ l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad,
360
+ ys_in_lens)
361
+ r_x = torch.tensor(0.0)
362
+ if reverse_weight > 0.0:
363
+ r_x, _, olens = self.right_decoder(memory, memory_mask,
364
+ r_ys_in_pad, ys_in_lens)
365
+ return l_x, r_x, olens
366
+
367
+ def forward_one_step(
368
+ self,
369
+ memory: torch.Tensor,
370
+ memory_mask: torch.Tensor,
371
+ tgt: torch.Tensor,
372
+ tgt_mask: torch.Tensor,
373
+ cache: Optional[List[torch.Tensor]] = None,
374
+ ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
375
+ """Forward one step.
376
+ This is only used for decoding.
377
+ Args:
378
+ memory: encoded memory, float32 (batch, maxlen_in, feat)
379
+ memory_mask: encoded memory mask, (batch, 1, maxlen_in)
380
+ tgt: input token ids, int64 (batch, maxlen_out)
381
+ tgt_mask: input token mask, (batch, maxlen_out)
382
+ dtype=torch.uint8 in PyTorch 1.2-
383
+ dtype=torch.bool in PyTorch 1.2+ (include 1.2)
384
+ cache: cached output list of (batch, max_time_out-1, size)
385
+ Returns:
386
+ y, cache: NN output value and cache per `self.decoders`.
387
+ `y.shape` is (batch, maxlen_out, token)
388
+ """
389
+ return self.left_decoder.forward_one_step(memory, memory_mask, tgt,
390
+ tgt_mask, cache)
391
+
392
+ def tie_or_clone_weights(self, jit_mode: bool = True):
393
+ """Tie or clone module weights (between word_emb and output_layer)
394
+ depending of whether we are using TorchScript or not"""
395
+ self.left_decoder.tie_or_clone_weights(jit_mode)
396
+ self.right_decoder.tie_or_clone_weights(jit_mode)
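
A minimal left-to-right pass through the TransformerDecoder that both branches above are built from (a sketch: the constructor keywords mirror the call above, the remaining defaults of the wenet-style decoder are assumed, and all shapes are illustrative only):

import torch
from cosyvoice.transformer.decoder import TransformerDecoder

decoder = TransformerDecoder(vocab_size=100, encoder_output_size=256,
                             attention_heads=4, linear_units=1024, num_blocks=2)
memory = torch.randn(2, 20, 256)                      # encoder output (B, T_in, D)
memory_mask = torch.ones(2, 1, 20, dtype=torch.bool)  # (B, 1, T_in)
ys_in_pad = torch.randint(0, 100, (2, 6))             # sos + target tokens (B, T_out)
ys_in_lens = torch.tensor([6, 4])
logits, _, olens = decoder(memory, memory_mask, ys_in_pad, ys_in_lens)
print(logits.shape)  # (2, 6, 100): token scores before softmax
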
cosyvoice/transformer/decoder_layer.py ADDED
@@ -0,0 +1,132 @@
1
+ # Copyright (c) 2019 Shigeki Karita
2
+ # 2020 Mobvoi Inc (Binbin Zhang)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Decoder self-attention layer definition."""
16
+ from typing import Optional, Tuple
17
+
18
+ import torch
19
+ from torch import nn
20
+
21
+
22
+ class DecoderLayer(nn.Module):
23
+ """Single decoder layer module.
24
+
25
+ Args:
26
+ size (int): Input dimension.
27
+ self_attn (torch.nn.Module): Self-attention module instance.
28
+ `MultiHeadedAttention` instance can be used as the argument.
29
+ src_attn (torch.nn.Module): Inter-attention module instance.
30
+ `MultiHeadedAttention` instance can be used as the argument.
31
+ If `None` is passed, Inter-attention is not used, such as
32
+ CIF, GPT, and other decoder-only models.
33
+ feed_forward (torch.nn.Module): Feed-forward module instance.
34
+ `PositionwiseFeedForward` instance can be used as the argument.
35
+ dropout_rate (float): Dropout rate.
36
+ normalize_before (bool):
37
+ True: use layer_norm before each sub-block.
38
+ False: use layer_norm after each sub-block.
39
+ """
40
+
41
+ def __init__(
42
+ self,
43
+ size: int,
44
+ self_attn: nn.Module,
45
+ src_attn: Optional[nn.Module],
46
+ feed_forward: nn.Module,
47
+ dropout_rate: float,
48
+ normalize_before: bool = True,
49
+ ):
50
+ """Construct a DecoderLayer object."""
51
+ super().__init__()
52
+ self.size = size
53
+ self.self_attn = self_attn
54
+ self.src_attn = src_attn
55
+ self.feed_forward = feed_forward
56
+ self.norm1 = nn.LayerNorm(size, eps=1e-5)
57
+ self.norm2 = nn.LayerNorm(size, eps=1e-5)
58
+ self.norm3 = nn.LayerNorm(size, eps=1e-5)
59
+ self.dropout = nn.Dropout(dropout_rate)
60
+ self.normalize_before = normalize_before
61
+
62
+ def forward(
63
+ self,
64
+ tgt: torch.Tensor,
65
+ tgt_mask: torch.Tensor,
66
+ memory: torch.Tensor,
67
+ memory_mask: torch.Tensor,
68
+ cache: Optional[torch.Tensor] = None
69
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
70
+ """Compute decoded features.
71
+
72
+ Args:
73
+ tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size).
74
+ tgt_mask (torch.Tensor): Mask for input tensor
75
+ (#batch, maxlen_out).
76
+ memory (torch.Tensor): Encoded memory
77
+ (#batch, maxlen_in, size).
78
+ memory_mask (torch.Tensor): Encoded memory mask
79
+ (#batch, maxlen_in).
80
+ cache (torch.Tensor): cached tensors.
81
+ (#batch, maxlen_out - 1, size).
82
+
83
+ Returns:
84
+ torch.Tensor: Output tensor (#batch, maxlen_out, size).
85
+ torch.Tensor: Mask for output tensor (#batch, maxlen_out).
86
+ torch.Tensor: Encoded memory (#batch, maxlen_in, size).
87
+ torch.Tensor: Encoded memory mask (#batch, maxlen_in).
88
+
89
+ """
90
+ residual = tgt
91
+ if self.normalize_before:
92
+ tgt = self.norm1(tgt)
93
+
94
+ if cache is None:
95
+ tgt_q = tgt
96
+ tgt_q_mask = tgt_mask
97
+ else:
98
+ # compute only the last frame query keeping dim: max_time_out -> 1
99
+ assert cache.shape == (
100
+ tgt.shape[0],
101
+ tgt.shape[1] - 1,
102
+ self.size,
103
+ ), f"{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}"
104
+ tgt_q = tgt[:, -1:, :]
105
+ residual = residual[:, -1:, :]
106
+ tgt_q_mask = tgt_mask[:, -1:, :]
107
+
108
+ x = residual + self.dropout(
109
+ self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0])
110
+ if not self.normalize_before:
111
+ x = self.norm1(x)
112
+
113
+ if self.src_attn is not None:
114
+ residual = x
115
+ if self.normalize_before:
116
+ x = self.norm2(x)
117
+ x = residual + self.dropout(
118
+ self.src_attn(x, memory, memory, memory_mask)[0])
119
+ if not self.normalize_before:
120
+ x = self.norm2(x)
121
+
122
+ residual = x
123
+ if self.normalize_before:
124
+ x = self.norm3(x)
125
+ x = residual + self.dropout(self.feed_forward(x))
126
+ if not self.normalize_before:
127
+ x = self.norm3(x)
128
+
129
+ if cache is not None:
130
+ x = torch.cat([cache, x], dim=1)
131
+
132
+ return x, tgt_mask, memory, memory_mask
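
A smoke test for DecoderLayer (a sketch: the stand-in attention module below is hypothetical and only mimics the (output, extras) tuple return convention expected by the layer; it is not the project's MultiHeadedAttention):

import torch
from torch import nn
from cosyvoice.transformer.decoder_layer import DecoderLayer


class StubAttention(nn.Module):
    """Hypothetical stand-in that returns an (output, weights) tuple."""

    def __init__(self, size: int):
        super().__init__()
        self.proj = nn.Linear(size, size)

    def forward(self, query, key, value, mask, **kwargs):
        return self.proj(query), None


size = 8
layer = DecoderLayer(size,
                     self_attn=StubAttention(size),
                     src_attn=StubAttention(size),
                     feed_forward=nn.Sequential(nn.Linear(size, 16), nn.ReLU(),
                                                nn.Linear(16, size)),
                     dropout_rate=0.1)
tgt = torch.randn(2, 5, size)
tgt_mask = torch.ones(2, 5, 5, dtype=torch.bool)
memory = torch.randn(2, 7, size)
memory_mask = torch.ones(2, 1, 7, dtype=torch.bool)
x, _, _, _ = layer(tgt, tgt_mask, memory, memory_mask)
print(x.shape)  # torch.Size([2, 5, 8])
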
cosyvoice/transformer/embedding.py ADDED
@@ -0,0 +1,302 @@
1
+ # Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
2
+ # 2024 Alibaba Inc (Xiang Lyu)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # Modified from ESPnet(https://github.com/espnet/espnet)
16
+ """Positional Encoding Module."""
17
+
18
+ import math
19
+ from typing import Tuple, Union
20
+
21
+ import torch
22
+ import torch.nn.functional as F
23
+ import numpy as np
24
+
25
+
26
+ class PositionalEncoding(torch.nn.Module):
27
+ """Positional encoding.
28
+
29
+ :param int d_model: embedding dim
30
+ :param float dropout_rate: dropout rate
31
+ :param int max_len: maximum input length
32
+
33
+ PE(pos, 2i) = sin(pos/(10000^(2i/dmodel)))
34
+ PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel)))
35
+ """
36
+
37
+ def __init__(self,
38
+ d_model: int,
39
+ dropout_rate: float,
40
+ max_len: int = 5000,
41
+ reverse: bool = False):
42
+ """Construct a PositionalEncoding object."""
43
+ super().__init__()
44
+ self.d_model = d_model
45
+ self.xscale = math.sqrt(self.d_model)
46
+ self.dropout = torch.nn.Dropout(p=dropout_rate)
47
+ self.max_len = max_len
48
+
49
+ self.pe = torch.zeros(self.max_len, self.d_model)
50
+ position = torch.arange(0, self.max_len,
51
+ dtype=torch.float32).unsqueeze(1)
52
+ div_term = torch.exp(
53
+ torch.arange(0, self.d_model, 2, dtype=torch.float32) *
54
+ -(math.log(10000.0) / self.d_model))
55
+ self.pe[:, 0::2] = torch.sin(position * div_term)
56
+ self.pe[:, 1::2] = torch.cos(position * div_term)
57
+ self.pe = self.pe.unsqueeze(0)
58
+
59
+ def forward(self,
60
+ x: torch.Tensor,
61
+ offset: Union[int, torch.Tensor] = 0) \
62
+ -> Tuple[torch.Tensor, torch.Tensor]:
63
+ """Add positional encoding.
64
+
65
+ Args:
66
+ x (torch.Tensor): Input. Its shape is (batch, time, ...)
67
+ offset (int, torch.tensor): position offset
68
+
69
+ Returns:
70
+ torch.Tensor: Encoded tensor. Its shape is (batch, time, ...)
71
+ torch.Tensor: for compatibility to RelPositionalEncoding
72
+ """
73
+
74
+ self.pe = self.pe.to(x.device)
75
+ pos_emb = self.position_encoding(offset, x.size(1), False)
76
+ x = x * self.xscale + pos_emb
77
+ return self.dropout(x), self.dropout(pos_emb)
78
+
79
+ def position_encoding(self,
80
+ offset: Union[int, torch.Tensor],
81
+ size: int,
82
+ apply_dropout: bool = True) -> torch.Tensor:
83
+ """ For getting encoding in a streaming fashion
84
+
85
+ Attention!!!!!
86
+ we apply dropout only once at the whole utterance level in a
87
+ non-streaming way, but will call this function several times with
88
+ increasing input size in a streaming scenario, so the dropout will
89
+ be applied several times.
90
+
91
+ Args:
92
+ offset (int or torch.tensor): start offset
93
+ size (int): required size of position encoding
94
+
95
+ Returns:
96
+ torch.Tensor: Corresponding encoding
97
+ """
98
+ # How to subscript a Union type:
99
+ # https://github.com/pytorch/pytorch/issues/69434
100
+ if isinstance(offset, int):
101
+ assert offset + size <= self.max_len
102
+ pos_emb = self.pe[:, offset:offset + size]
103
+ elif isinstance(offset, torch.Tensor) and offset.dim() == 0: # scalar
104
+ assert offset + size <= self.max_len
105
+ pos_emb = self.pe[:, offset:offset + size]
106
+ else: # for batched streaming decoding on GPU
107
+ assert torch.max(offset) + size <= self.max_len
108
+ index = offset.unsqueeze(1) + \
109
+ torch.arange(0, size).to(offset.device) # B X T
110
+ flag = index > 0
111
+ # remove negative offset
112
+ index = index * flag
113
+ pos_emb = F.embedding(index, self.pe[0]) # B X T X d_model
114
+
115
+ if apply_dropout:
116
+ pos_emb = self.dropout(pos_emb)
117
+ return pos_emb
118
+
119
+
120
+ class RelPositionalEncoding(PositionalEncoding):
121
+ """Relative positional encoding module.
122
+ See : Appendix B in https://arxiv.org/abs/1901.02860
123
+ Args:
124
+ d_model (int): Embedding dimension.
125
+ dropout_rate (float): Dropout rate.
126
+ max_len (int): Maximum input length.
127
+ """
128
+
129
+ def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000):
130
+ """Initialize class."""
131
+ super().__init__(d_model, dropout_rate, max_len, reverse=True)
132
+
133
+ def forward(self,
134
+ x: torch.Tensor,
135
+ offset: Union[int, torch.Tensor] = 0) \
136
+ -> Tuple[torch.Tensor, torch.Tensor]:
137
+ """Compute positional encoding.
138
+ Args:
139
+ x (torch.Tensor): Input tensor (batch, time, `*`).
140
+ Returns:
141
+ torch.Tensor: Encoded tensor (batch, time, `*`).
142
+ torch.Tensor: Positional embedding tensor (1, time, `*`).
143
+ """
144
+ self.pe = self.pe.to(x.device)
145
+ x = x * self.xscale
146
+ pos_emb = self.position_encoding(offset, x.size(1), False)
147
+ return self.dropout(x), self.dropout(pos_emb)
148
+
149
+
150
+ class WhisperPositionalEncoding(PositionalEncoding):
151
+ """ Sinusoids position encoding used in openai-whisper.encoder
152
+ """
153
+
154
+ def __init__(self, d_model: int, dropout_rate: float, max_len: int = 1500):
155
+ super().__init__(d_model, dropout_rate, max_len)
156
+ self.xscale = 1.0
157
+ log_timescale_increment = np.log(10000) / (d_model // 2 - 1)
158
+ inv_timescales = torch.exp(-log_timescale_increment *
159
+ torch.arange(d_model // 2))
160
+ scaled_time = torch.arange(max_len)[:, np.newaxis] * \
161
+ inv_timescales[np.newaxis, :]
162
+ pe = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
163
+ delattr(self, "pe")
164
+ self.register_buffer("pe", pe.unsqueeze(0))
165
+
166
+
167
+ class LearnablePositionalEncoding(PositionalEncoding):
168
+ """ Learnable position encoding used in openai-whisper.decoder
169
+ """
170
+
171
+ def __init__(self, d_model: int, dropout_rate: float, max_len: int = 448):
172
+ super().__init__(d_model, dropout_rate, max_len)
173
+ # NOTE(xcsong): overwrite self.pe & self.xscale
174
+ self.pe = torch.nn.Parameter(torch.empty(1, max_len, d_model))
175
+ self.xscale = 1.0
176
+
177
+
178
+ class NoPositionalEncoding(torch.nn.Module):
179
+ """ No position encoding
180
+ """
181
+
182
+ def __init__(self, d_model: int, dropout_rate: float):
183
+ super().__init__()
184
+ self.d_model = d_model
185
+ self.dropout = torch.nn.Dropout(p=dropout_rate)
186
+
187
+ def forward(self,
188
+ x: torch.Tensor,
189
+ offset: Union[int, torch.Tensor] = 0) \
190
+ -> Tuple[torch.Tensor, torch.Tensor]:
191
+ """ Just return zero vector for interface compatibility
192
+ """
193
+ pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device)
194
+ return self.dropout(x), pos_emb
195
+
196
+ def position_encoding(self, offset: Union[int, torch.Tensor],
197
+ size: int) -> torch.Tensor:
198
+ return torch.zeros(1, size, self.d_model)
199
+
200
+
201
+ class EspnetRelPositionalEncoding(torch.nn.Module):
202
+ """Relative positional encoding module (new implementation).
203
+
204
+ Details can be found in https://github.com/espnet/espnet/pull/2816.
205
+
206
+ See : Appendix B in https://arxiv.org/abs/1901.02860
207
+
208
+ Args:
209
+ d_model (int): Embedding dimension.
210
+ dropout_rate (float): Dropout rate.
211
+ max_len (int): Maximum input length.
212
+
213
+ """
214
+
215
+ def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000):
216
+ """Construct an EspnetRelPositionalEncoding object."""
217
+ super(EspnetRelPositionalEncoding, self).__init__()
218
+ self.d_model = d_model
219
+ self.xscale = math.sqrt(self.d_model)
220
+ self.dropout = torch.nn.Dropout(p=dropout_rate)
221
+ self.pe = None
222
+ self.extend_pe(torch.tensor(0.0).expand(1, max_len))
223
+
224
+ def extend_pe(self, x: torch.Tensor):
225
+ """Reset the positional encodings."""
226
+ if self.pe is not None:
227
+ # self.pe contains both positive and negative parts
228
+ # the length of self.pe is 2 * input_len - 1
229
+ if self.pe.size(1) >= x.size(1) * 2 - 1:
230
+ if self.pe.dtype != x.dtype or self.pe.device != x.device:
231
+ self.pe = self.pe.to(dtype=x.dtype, device=x.device)
232
+ return
233
+ # Suppose `i` means the position of the query vector and `j` means the
234
+ # position of the key vector. We use positive relative positions when keys
235
+ # are to the left (i>j) and negative relative positions otherwise (i<j).
236
+ pe_positive = torch.zeros(x.size(1), self.d_model)
237
+ pe_negative = torch.zeros(x.size(1), self.d_model)
238
+ position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
239
+ div_term = torch.exp(
240
+ torch.arange(0, self.d_model, 2, dtype=torch.float32)
241
+ * -(math.log(10000.0) / self.d_model)
242
+ )
243
+ pe_positive[:, 0::2] = torch.sin(position * div_term)
244
+ pe_positive[:, 1::2] = torch.cos(position * div_term)
245
+ pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
246
+ pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)
247
+
248
+ # Reverse the order of positive indices and concat both positive and
249
+ # negative indices. This is used to support the shifting trick
250
+ # as in https://arxiv.org/abs/1901.02860
251
+ pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
252
+ pe_negative = pe_negative[1:].unsqueeze(0)
253
+ pe = torch.cat([pe_positive, pe_negative], dim=1)
254
+ self.pe = pe.to(device=x.device, dtype=x.dtype)
255
+
256
+ def forward(self, x: torch.Tensor, offset: Union[int, torch.Tensor] = 0) \
257
+ -> Tuple[torch.Tensor, torch.Tensor]:
258
+ """Add positional encoding.
259
+
260
+ Args:
261
+ x (torch.Tensor): Input tensor (batch, time, `*`).
262
+
263
+ Returns:
264
+ torch.Tensor: Encoded tensor (batch, time, `*`).
265
+
266
+ """
267
+ self.extend_pe(x)
268
+ x = x * self.xscale
269
+ pos_emb = self.position_encoding(size=x.size(1), offset=offset)
270
+ return self.dropout(x), self.dropout(pos_emb)
271
+
272
+ def position_encoding(self,
273
+ offset: Union[int, torch.Tensor],
274
+ size: int) -> torch.Tensor:
275
+ """ For getting encoding in a streaming fashion
276
+
277
+ Attention!!!!!
278
+ we apply dropout only once at the whole utterance level in a
279
+ non-streaming way, but will call this function several times with
280
+ increasing input size in a streaming scenario, so the dropout will
281
+ be applied several times.
282
+
283
+ Args:
284
+ offset (int or torch.tensor): start offset
285
+ size (int): required size of position encoding
286
+
287
+ Returns:
288
+ torch.Tensor: Corresponding encoding
289
+ """
290
+ # How to subscript a Union type:
291
+ # https://github.com/pytorch/pytorch/issues/69434
292
+ if isinstance(offset, int):
293
+ pos_emb = self.pe[
294
+ :,
295
+ self.pe.size(1) // 2 - size - offset + 1: self.pe.size(1) // 2 + size + offset,
296
+ ]
297
+ elif isinstance(offset, torch.Tensor):
298
+ pos_emb = self.pe[
299
+ :,
300
+ self.pe.size(1) // 2 - size - offset + 1: self.pe.size(1) // 2 + size + offset,
301
+ ]
302
+ return pos_emb
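
A quick shape check for the relative positional encoding above (a sketch, assuming the cosyvoice package layout shown in this upload is importable):

import torch
from cosyvoice.transformer.embedding import EspnetRelPositionalEncoding

pos_enc = EspnetRelPositionalEncoding(d_model=256, dropout_rate=0.0)
x = torch.randn(2, 50, 256)
x_scaled, pos_emb = pos_enc(x)
print(x_scaled.shape)  # torch.Size([2, 50, 256])
print(pos_emb.shape)   # (1, 2 * 50 - 1, 256): covers positive and negative offsets
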
cosyvoice/transformer/encoder.py ADDED
@@ -0,0 +1,474 @@
1
+ # Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
2
+ # 2022 Xingchen Song ([email protected])
3
+ # 2024 Alibaba Inc (Xiang Lyu)
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ # Modified from ESPnet(https://github.com/espnet/espnet)
17
+ """Encoder definition."""
18
+ from typing import Tuple
19
+
20
+ import torch
21
+ import torch.utils.checkpoint as ckpt
22
+
23
+ from cosyvoice.transformer.convolution import ConvolutionModule
24
+ from cosyvoice.transformer.encoder_layer import TransformerEncoderLayer
25
+ from cosyvoice.transformer.encoder_layer import ConformerEncoderLayer
26
+ from cosyvoice.transformer.positionwise_feed_forward import PositionwiseFeedForward
27
+ from cosyvoice.utils.class_utils import (
28
+ COSYVOICE_EMB_CLASSES,
29
+ COSYVOICE_SUBSAMPLE_CLASSES,
30
+ COSYVOICE_ATTENTION_CLASSES,
31
+ COSYVOICE_ACTIVATION_CLASSES,
32
+ )
33
+ from cosyvoice.utils.mask import make_pad_mask
34
+ from cosyvoice.utils.mask import add_optional_chunk_mask
35
+
36
+
37
+ class BaseEncoder(torch.nn.Module):
38
+
39
+ def __init__(
40
+ self,
41
+ input_size: int,
42
+ output_size: int = 256,
43
+ attention_heads: int = 4,
44
+ linear_units: int = 2048,
45
+ num_blocks: int = 6,
46
+ dropout_rate: float = 0.1,
47
+ positional_dropout_rate: float = 0.1,
48
+ attention_dropout_rate: float = 0.0,
49
+ input_layer: str = "conv2d",
50
+ pos_enc_layer_type: str = "abs_pos",
51
+ normalize_before: bool = True,
52
+ static_chunk_size: int = 0,
53
+ use_dynamic_chunk: bool = False,
54
+ global_cmvn: torch.nn.Module = None,
55
+ use_dynamic_left_chunk: bool = False,
56
+ gradient_checkpointing: bool = False,
57
+ ):
58
+ """
59
+ Args:
60
+ input_size (int): input dim
61
+ output_size (int): dimension of attention
62
+ attention_heads (int): the number of heads of multi head attention
63
+ linear_units (int): the number of hidden units of the position-wise feed
64
+ forward
65
+ num_blocks (int): the number of encoder blocks
66
+ dropout_rate (float): dropout rate
67
+ attention_dropout_rate (float): dropout rate in attention
68
+ positional_dropout_rate (float): dropout rate after adding
69
+ positional encoding
70
+ input_layer (str): input layer type.
71
+ optional [linear, conv2d, conv2d6, conv2d8]
72
+ pos_enc_layer_type (str): Encoder positional encoding layer type.
73
+ optional [abs_pos, scaled_abs_pos, rel_pos, no_pos]
74
+ normalize_before (bool):
75
+ True: use layer_norm before each sub-block of a layer.
76
+ False: use layer_norm after each sub-block of a layer.
77
+ static_chunk_size (int): chunk size for static chunk training and
78
+ decoding
79
+ use_dynamic_chunk (bool): whether to use dynamic chunk size for
80
+ training or not. You can only use a fixed chunk (chunk_size > 0)
81
+ or a dynamic chunk size (use_dynamic_chunk = True)
82
+ global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module
83
+ use_dynamic_left_chunk (bool): whether use dynamic left chunk in
84
+ dynamic chunk training
85
+ key_bias: whether to use bias in attention.linear_k, False for whisper models.
86
+ gradient_checkpointing: rerunning a forward-pass segment for each
87
+ checkpointed segment during backward.
88
+ """
89
+ super().__init__()
90
+ self._output_size = output_size
91
+
92
+ self.global_cmvn = global_cmvn
93
+ self.embed = COSYVOICE_SUBSAMPLE_CLASSES[input_layer](
94
+ input_size,
95
+ output_size,
96
+ dropout_rate,
97
+ COSYVOICE_EMB_CLASSES[pos_enc_layer_type](output_size,
98
+ positional_dropout_rate),
99
+ )
100
+
101
+ self.normalize_before = normalize_before
102
+ self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5)
103
+ self.static_chunk_size = static_chunk_size
104
+ self.use_dynamic_chunk = use_dynamic_chunk
105
+ self.use_dynamic_left_chunk = use_dynamic_left_chunk
106
+ self.gradient_checkpointing = gradient_checkpointing
107
+
108
+ def output_size(self) -> int:
109
+ return self._output_size
110
+
111
+ def forward(
112
+ self,
113
+ xs: torch.Tensor,
114
+ xs_lens: torch.Tensor,
115
+ decoding_chunk_size: int = 0,
116
+ num_decoding_left_chunks: int = -1,
117
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
118
+ """Embed positions in tensor.
119
+
120
+ Args:
121
+ xs: padded input tensor (B, T, D)
122
+ xs_lens: input length (B)
123
+ decoding_chunk_size: decoding chunk size for dynamic chunk
124
+ 0: default for training, use random dynamic chunk.
125
+ <0: for decoding, use full chunk.
126
+ >0: for decoding, use fixed chunk size as set.
127
+ num_decoding_left_chunks: number of left chunks, this is for decoding,
128
+ the chunk size is decoding_chunk_size.
129
+ >=0: use num_decoding_left_chunks
130
+ <0: use all left chunks
131
+ Returns:
132
+ encoder output tensor xs, and subsampled masks
133
+ xs: padded output tensor (B, T' ~= T/subsample_rate, D)
134
+ masks: torch.Tensor batch padding mask after subsample
135
+ (B, 1, T' ~= T/subsample_rate)
136
+ NOTE(xcsong):
137
+ We pass the `__call__` method of the modules instead of `forward` to the
138
+ checkpointing API because `__call__` attaches all the hooks of the module.
139
+ https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
140
+ """
141
+ T = xs.size(1)
142
+ masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T)
143
+ if self.global_cmvn is not None:
144
+ xs = self.global_cmvn(xs)
145
+ xs, pos_emb, masks = self.embed(xs, masks)
146
+ mask_pad = masks # (B, 1, T/subsample_rate)
147
+ chunk_masks = add_optional_chunk_mask(xs, masks,
148
+ self.use_dynamic_chunk,
149
+ self.use_dynamic_left_chunk,
150
+ decoding_chunk_size,
151
+ self.static_chunk_size,
152
+ num_decoding_left_chunks)
153
+ if self.gradient_checkpointing and self.training:
154
+ xs = self.forward_layers_checkpointed(xs, chunk_masks, pos_emb,
155
+ mask_pad)
156
+ else:
157
+ xs = self.forward_layers(xs, chunk_masks, pos_emb, mask_pad)
158
+ if self.normalize_before:
159
+ xs = self.after_norm(xs)
160
+ # Here we assume the mask is not changed in encoder layers, so just
161
+ # return the masks before encoder layers, and the masks will be used
162
+ # for cross attention with decoder later
163
+ return xs, masks
164
+
165
+ def forward_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor,
166
+ pos_emb: torch.Tensor,
167
+ mask_pad: torch.Tensor) -> torch.Tensor:
168
+ for layer in self.encoders:
169
+ xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
170
+ return xs
171
+
172
+ @torch.jit.unused
173
+ def forward_layers_checkpointed(self, xs: torch.Tensor,
174
+ chunk_masks: torch.Tensor,
175
+ pos_emb: torch.Tensor,
176
+ mask_pad: torch.Tensor) -> torch.Tensor:
177
+ for layer in self.encoders:
178
+ xs, chunk_masks, _, _ = ckpt.checkpoint(layer.__call__, xs,
179
+ chunk_masks, pos_emb,
180
+ mask_pad)
181
+ return xs
182
+
183
+ @torch.jit.export
184
+ def forward_chunk(
185
+ self,
186
+ xs: torch.Tensor,
187
+ offset: int,
188
+ required_cache_size: int,
189
+ att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0),
190
+ cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0),
191
+ att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
192
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
193
+ """ Forward just one chunk
194
+
195
+ Args:
196
+ xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim),
197
+ where `time == (chunk_size - 1) * subsample_rate + \
198
+ subsample.right_context + 1`
199
+ offset (int): current offset in encoder output time stamp
200
+ required_cache_size (int): cache size required for next chunk
201
+ computation
202
+ >=0: actual cache size
203
+ <0: means all history cache is required
204
+ att_cache (torch.Tensor): cache tensor for KEY & VALUE in
205
+ transformer/conformer attention, with shape
206
+ (elayers, head, cache_t1, d_k * 2), where
207
+ `head * d_k == hidden-dim` and
208
+ `cache_t1 == chunk_size * num_decoding_left_chunks`.
209
+ cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer,
210
+ (elayers, b=1, hidden-dim, cache_t2), where
211
+ `cache_t2 == cnn.lorder - 1`
212
+
213
+ Returns:
214
+ torch.Tensor: output of current input xs,
215
+ with shape (b=1, chunk_size, hidden-dim).
216
+ torch.Tensor: new attention cache required for next chunk, with
217
+ dynamic shape (elayers, head, ?, d_k * 2)
218
+ depending on required_cache_size.
219
+ torch.Tensor: new conformer cnn cache required for next chunk, with
220
+ same shape as the original cnn_cache.
221
+
222
+ """
223
+ assert xs.size(0) == 1
224
+ # tmp_masks is just for interface compatibility
225
+ tmp_masks = torch.ones(1,
226
+ xs.size(1),
227
+ device=xs.device,
228
+ dtype=torch.bool)
229
+ tmp_masks = tmp_masks.unsqueeze(1)
230
+ if self.global_cmvn is not None:
231
+ xs = self.global_cmvn(xs)
232
+ # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim)
233
+ xs, pos_emb, _ = self.embed(xs, tmp_masks, offset)
234
+ # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim)
235
+ elayers, cache_t1 = att_cache.size(0), att_cache.size(2)
236
+ chunk_size = xs.size(1)
237
+ attention_key_size = cache_t1 + chunk_size
238
+ pos_emb = self.embed.position_encoding(offset=offset - cache_t1,
239
+ size=attention_key_size)
240
+ if required_cache_size < 0:
241
+ next_cache_start = 0
242
+ elif required_cache_size == 0:
243
+ next_cache_start = attention_key_size
244
+ else:
245
+ next_cache_start = max(attention_key_size - required_cache_size, 0)
246
+ r_att_cache = []
247
+ r_cnn_cache = []
248
+ for i, layer in enumerate(self.encoders):
249
+ # NOTE(xcsong): Before layer.forward
250
+ # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2),
251
+ # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2)
252
+ xs, _, new_att_cache, new_cnn_cache = layer(
253
+ xs,
254
+ att_mask,
255
+ pos_emb,
256
+ att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache,
257
+ cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache)
258
+ # NOTE(xcsong): After layer.forward
259
+ # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2),
260
+ # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2)
261
+ r_att_cache.append(new_att_cache[:, :, next_cache_start:, :])
262
+ r_cnn_cache.append(new_cnn_cache.unsqueeze(0))
263
+ if self.normalize_before:
264
+ xs = self.after_norm(xs)
265
+
266
+ # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2),
267
+ # ? may be larger than cache_t1, it depends on required_cache_size
268
+ r_att_cache = torch.cat(r_att_cache, dim=0)
269
+ # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2)
270
+ r_cnn_cache = torch.cat(r_cnn_cache, dim=0)
271
+
272
+ return (xs, r_att_cache, r_cnn_cache)
273
+
274
+ @torch.jit.unused
275
+ def forward_chunk_by_chunk(
276
+ self,
277
+ xs: torch.Tensor,
278
+ decoding_chunk_size: int,
279
+ num_decoding_left_chunks: int = -1,
280
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
281
+ """ Forward input chunk by chunk with chunk_size like a streaming
282
+ fashion
283
+
284
+ Here we should pay special attention to computation cache in the
285
+ streaming style forward chunk by chunk. Three things should be taken
286
+ into account for computation in the current network:
287
+ 1. transformer/conformer encoder layers output cache
288
+ 2. convolution in conformer
289
+ 3. convolution in subsampling
290
+
291
+ However, we don't implement subsampling cache for:
292
+ 1. We can control subsampling module to output the right result by
293
+ overlapping input instead of caching left context, even though it
294
+ wastes some computation, but subsampling only takes a very
295
+ small fraction of computation in the whole model.
296
+ 2. Typically, there are several convolution layers with subsampling
297
+ in subsampling module, it is tricky and complicated to do cache
298
+ with different convolution layers with different subsampling
299
+ rate.
300
+ 3. Currently, nn.Sequential is used to stack all the convolution
301
+ layers in subsampling, we need to rewrite it to make it work
302
+ with cache, which is not preferred.
303
+ Args:
304
+ xs (torch.Tensor): (1, max_len, dim)
305
+ decoding_chunk_size (int): decoding chunk size
306
+ """
307
+ assert decoding_chunk_size > 0
308
+ # The model is trained by static or dynamic chunk
309
+ assert self.static_chunk_size > 0 or self.use_dynamic_chunk
310
+ subsampling = self.embed.subsampling_rate
311
+ context = self.embed.right_context + 1 # Add current frame
312
+ stride = subsampling * decoding_chunk_size
313
+ decoding_window = (decoding_chunk_size - 1) * subsampling + context
314
+ num_frames = xs.size(1)
315
+ att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device)
316
+ cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device)
317
+ outputs = []
318
+ offset = 0
319
+ required_cache_size = decoding_chunk_size * num_decoding_left_chunks
320
+
321
+ # Feed forward overlap input step by step
322
+ for cur in range(0, num_frames - context + 1, stride):
323
+ end = min(cur + decoding_window, num_frames)
324
+ chunk_xs = xs[:, cur:end, :]
325
+ (y, att_cache,
326
+ cnn_cache) = self.forward_chunk(chunk_xs, offset,
327
+ required_cache_size, att_cache,
328
+ cnn_cache)
329
+ outputs.append(y)
330
+ offset += y.size(1)
331
+ ys = torch.cat(outputs, 1)
332
+ masks = torch.ones((1, 1, ys.size(1)),
333
+ device=ys.device,
334
+ dtype=torch.bool)
335
+ return ys, masks
336
+
337
+
338
+ class TransformerEncoder(BaseEncoder):
339
+ """Transformer encoder module."""
340
+
341
+ def __init__(
342
+ self,
343
+ input_size: int,
344
+ output_size: int = 256,
345
+ attention_heads: int = 4,
346
+ linear_units: int = 2048,
347
+ num_blocks: int = 6,
348
+ dropout_rate: float = 0.1,
349
+ positional_dropout_rate: float = 0.1,
350
+ attention_dropout_rate: float = 0.0,
351
+ input_layer: str = "conv2d",
352
+ pos_enc_layer_type: str = "abs_pos",
353
+ normalize_before: bool = True,
354
+ static_chunk_size: int = 0,
355
+ use_dynamic_chunk: bool = False,
356
+ global_cmvn: torch.nn.Module = None,
357
+ use_dynamic_left_chunk: bool = False,
358
+ key_bias: bool = True,
359
+ selfattention_layer_type: str = "selfattn",
360
+ activation_type: str = "relu",
361
+ gradient_checkpointing: bool = False,
362
+ ):
363
+ """ Construct TransformerEncoder
364
+
365
+ See Encoder for the meaning of each parameter.
366
+ """
367
+ super().__init__(input_size, output_size, attention_heads,
368
+ linear_units, num_blocks, dropout_rate,
369
+ positional_dropout_rate, attention_dropout_rate,
370
+ input_layer, pos_enc_layer_type, normalize_before,
371
+ static_chunk_size, use_dynamic_chunk, global_cmvn,
372
+ use_dynamic_left_chunk, gradient_checkpointing)
373
+ activation = COSYVOICE_ACTIVATION_CLASSES[activation_type]()
374
+ self.encoders = torch.nn.ModuleList([
375
+ TransformerEncoderLayer(
376
+ output_size,
377
+ COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type](attention_heads,
378
+ output_size,
379
+ attention_dropout_rate,
380
+ key_bias),
381
+ PositionwiseFeedForward(output_size, linear_units,
382
+ dropout_rate, activation),
383
+ dropout_rate, normalize_before) for _ in range(num_blocks)
384
+ ])
385
+
386
+
387
+ class ConformerEncoder(BaseEncoder):
388
+ """Conformer encoder module."""
389
+
390
+ def __init__(
391
+ self,
392
+ input_size: int,
393
+ output_size: int = 256,
394
+ attention_heads: int = 4,
395
+ linear_units: int = 2048,
396
+ num_blocks: int = 6,
397
+ dropout_rate: float = 0.1,
398
+ positional_dropout_rate: float = 0.1,
399
+ attention_dropout_rate: float = 0.0,
400
+ input_layer: str = "conv2d",
401
+ pos_enc_layer_type: str = "rel_pos",
402
+ normalize_before: bool = True,
403
+ static_chunk_size: int = 0,
404
+ use_dynamic_chunk: bool = False,
405
+ global_cmvn: torch.nn.Module = None,
406
+ use_dynamic_left_chunk: bool = False,
407
+ positionwise_conv_kernel_size: int = 1,
408
+ macaron_style: bool = True,
409
+ selfattention_layer_type: str = "rel_selfattn",
410
+ activation_type: str = "swish",
411
+ use_cnn_module: bool = True,
412
+ cnn_module_kernel: int = 15,
413
+ causal: bool = False,
414
+ cnn_module_norm: str = "batch_norm",
415
+ key_bias: bool = True,
416
+ gradient_checkpointing: bool = False,
417
+ ):
418
+ """Construct ConformerEncoder
419
+
420
+ Args:
421
+ input_size to use_dynamic_chunk, see in BaseEncoder
422
+ positionwise_conv_kernel_size (int): Kernel size of positionwise
423
+ conv1d layer.
424
+ macaron_style (bool): Whether to use macaron style for
425
+ positionwise layer.
426
+ selfattention_layer_type (str): Encoder attention layer type,
427
+ the parameter has no effect now, it's just for configuration
428
+ compatibility.
429
+ activation_type (str): Encoder activation function type.
430
+ use_cnn_module (bool): Whether to use convolution module.
431
+ cnn_module_kernel (int): Kernel size of convolution module.
432
+ causal (bool): whether to use causal convolution or not.
433
+ key_bias: whether to use bias in attention.linear_k, False for whisper models.
434
+ """
435
+ super().__init__(input_size, output_size, attention_heads,
436
+ linear_units, num_blocks, dropout_rate,
437
+ positional_dropout_rate, attention_dropout_rate,
438
+ input_layer, pos_enc_layer_type, normalize_before,
439
+ static_chunk_size, use_dynamic_chunk, global_cmvn,
440
+ use_dynamic_left_chunk, gradient_checkpointing)
441
+ activation = COSYVOICE_ACTIVATION_CLASSES[activation_type]()
442
+
443
+ # self-attention module definition
444
+ encoder_selfattn_layer_args = (
445
+ attention_heads,
446
+ output_size,
447
+ attention_dropout_rate,
448
+ key_bias,
449
+ )
450
+ # feed-forward module definition
451
+ positionwise_layer_args = (
452
+ output_size,
453
+ linear_units,
454
+ dropout_rate,
455
+ activation,
456
+ )
457
+ # convolution module definition
458
+ convolution_layer_args = (output_size, cnn_module_kernel, activation,
459
+ cnn_module_norm, causal)
460
+
461
+ self.encoders = torch.nn.ModuleList([
462
+ ConformerEncoderLayer(
463
+ output_size,
464
+ COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type](
465
+ *encoder_selfattn_layer_args),
466
+ PositionwiseFeedForward(*positionwise_layer_args),
467
+ PositionwiseFeedForward(
468
+ *positionwise_layer_args) if macaron_style else None,
469
+ ConvolutionModule(
470
+ *convolution_layer_args) if use_cnn_module else None,
471
+ dropout_rate,
472
+ normalize_before,
473
+ ) for _ in range(num_blocks)
474
+ ])
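
A minimal forward pass through the ConformerEncoder defined above (a sketch: it relies on the sibling subsampling, attention, and convolution modules included in this upload and assumes the constructor defaults otherwise):

import torch
from cosyvoice.transformer.encoder import ConformerEncoder

encoder = ConformerEncoder(input_size=80, output_size=256, attention_heads=4,
                           linear_units=1024, num_blocks=2)
xs = torch.randn(2, 100, 80)          # (batch, frames, mel bins)
xs_lens = torch.tensor([100, 80])
ys, masks = encoder(xs, xs_lens)
print(ys.shape, masks.shape)          # the conv2d front end subsamples time by roughly 4
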
cosyvoice/transformer/encoder_layer.py ADDED
@@ -0,0 +1,236 @@
1
+ # Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
2
+ # 2022 Xingchen Song ([email protected])
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # Modified from ESPnet(https://github.com/espnet/espnet)
16
+ """Encoder self-attention layer definition."""
17
+
18
+ from typing import Optional, Tuple
19
+
20
+ import torch
21
+ from torch import nn
22
+
23
+
24
+ class TransformerEncoderLayer(nn.Module):
25
+ """Encoder layer module.
26
+
27
+ Args:
28
+ size (int): Input dimension.
29
+ self_attn (torch.nn.Module): Self-attention module instance.
30
+ `MultiHeadedAttention` or `RelPositionMultiHeadedAttention`
31
+ instance can be used as the argument.
32
+ feed_forward (torch.nn.Module): Feed-forward module instance.
33
+ `PositionwiseFeedForward`, instance can be used as the argument.
34
+ dropout_rate (float): Dropout rate.
35
+ normalize_before (bool):
36
+ True: use layer_norm before each sub-block.
37
+ False: to use layer_norm after each sub-block.
38
+ """
39
+
40
+ def __init__(
41
+ self,
42
+ size: int,
43
+ self_attn: torch.nn.Module,
44
+ feed_forward: torch.nn.Module,
45
+ dropout_rate: float,
46
+ normalize_before: bool = True,
47
+ ):
48
+ """Construct an EncoderLayer object."""
49
+ super().__init__()
50
+ self.self_attn = self_attn
51
+ self.feed_forward = feed_forward
52
+ self.norm1 = nn.LayerNorm(size, eps=1e-12)
53
+ self.norm2 = nn.LayerNorm(size, eps=1e-12)
54
+ self.dropout = nn.Dropout(dropout_rate)
55
+ self.size = size
56
+ self.normalize_before = normalize_before
57
+
58
+ def forward(
59
+ self,
60
+ x: torch.Tensor,
61
+ mask: torch.Tensor,
62
+ pos_emb: torch.Tensor,
63
+ mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
64
+ att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
65
+ cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
66
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
67
+ """Compute encoded features.
68
+
69
+ Args:
70
+ x (torch.Tensor): (#batch, time, size)
71
+ mask (torch.Tensor): Mask tensor for the input (#batch, time,time),
72
+ (0, 0, 0) means fake mask.
73
+ pos_emb (torch.Tensor): just for interface compatibility
74
+ to ConformerEncoderLayer
75
+ mask_pad (torch.Tensor): does not used in transformer layer,
76
+ just for unified api with conformer.
77
+ att_cache (torch.Tensor): Cache tensor of the KEY & VALUE
78
+ (#batch=1, head, cache_t1, d_k * 2), head * d_k == size.
79
+ cnn_cache (torch.Tensor): Convolution cache in conformer layer
80
+ (#batch=1, size, cache_t2), not used here, it's for interface
81
+ compatibility to ConformerEncoderLayer.
82
+ Returns:
83
+ torch.Tensor: Output tensor (#batch, time, size).
84
+ torch.Tensor: Mask tensor (#batch, time, time).
85
+ torch.Tensor: att_cache tensor,
86
+ (#batch=1, head, cache_t1 + time, d_k * 2).
87
+ torch.Tensor: cnn_cahce tensor (#batch=1, size, cache_t2).
88
+
89
+ """
90
+ residual = x
91
+ if self.normalize_before:
92
+ x = self.norm1(x)
93
+ x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb=pos_emb, cache=att_cache)
94
+ x = residual + self.dropout(x_att)
95
+ if not self.normalize_before:
96
+ x = self.norm1(x)
97
+
98
+ residual = x
99
+ if self.normalize_before:
100
+ x = self.norm2(x)
101
+ x = residual + self.dropout(self.feed_forward(x))
102
+ if not self.normalize_before:
103
+ x = self.norm2(x)
104
+
105
+ fake_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)
106
+ return x, mask, new_att_cache, fake_cnn_cache
107
+
108
+
109
+ class ConformerEncoderLayer(nn.Module):
110
+ """Encoder layer module.
111
+ Args:
112
+ size (int): Input dimension.
113
+ self_attn (torch.nn.Module): Self-attention module instance.
114
+ `MultiHeadedAttention` or `RelPositionMultiHeadedAttention`
115
+ instance can be used as the argument.
116
+ feed_forward (torch.nn.Module): Feed-forward module instance.
117
+ `PositionwiseFeedForward` instance can be used as the argument.
118
+ feed_forward_macaron (torch.nn.Module): Additional feed-forward module
119
+ instance.
120
+ `PositionwiseFeedForward` instance can be used as the argument.
121
+ conv_module (torch.nn.Module): Convolution module instance.
122
+ `ConvlutionModule` instance can be used as the argument.
123
+ dropout_rate (float): Dropout rate.
124
+ normalize_before (bool):
125
+ True: use layer_norm before each sub-block.
126
+ False: use layer_norm after each sub-block.
127
+ """
128
+
129
+ def __init__(
130
+ self,
131
+ size: int,
132
+ self_attn: torch.nn.Module,
133
+ feed_forward: Optional[nn.Module] = None,
134
+ feed_forward_macaron: Optional[nn.Module] = None,
135
+ conv_module: Optional[nn.Module] = None,
136
+ dropout_rate: float = 0.1,
137
+ normalize_before: bool = True,
138
+ ):
139
+ """Construct an EncoderLayer object."""
140
+ super().__init__()
141
+ self.self_attn = self_attn
142
+ self.feed_forward = feed_forward
143
+ self.feed_forward_macaron = feed_forward_macaron
144
+ self.conv_module = conv_module
145
+ self.norm_ff = nn.LayerNorm(size, eps=1e-12) # for the FNN module
146
+ self.norm_mha = nn.LayerNorm(size, eps=1e-12) # for the MHA module
147
+ if feed_forward_macaron is not None:
148
+ self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-12)
149
+ self.ff_scale = 0.5
150
+ else:
151
+ self.ff_scale = 1.0
152
+ if self.conv_module is not None:
153
+ self.norm_conv = nn.LayerNorm(size, eps=1e-12) # for the CNN module
154
+ self.norm_final = nn.LayerNorm(
155
+ size, eps=1e-12) # for the final output of the block
156
+ self.dropout = nn.Dropout(dropout_rate)
157
+ self.size = size
158
+ self.normalize_before = normalize_before
159
+
160
+ def forward(
161
+ self,
162
+ x: torch.Tensor,
163
+ mask: torch.Tensor,
164
+ pos_emb: torch.Tensor,
165
+ mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
166
+ att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
167
+ cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
168
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
169
+ """Compute encoded features.
170
+
171
+ Args:
172
+ x (torch.Tensor): (#batch, time, size)
173
+ mask (torch.Tensor): Mask tensor for the input (#batch, time,time),
174
+ (0, 0, 0) means fake mask.
175
+ pos_emb (torch.Tensor): positional encoding, must not be None
176
+ for ConformerEncoderLayer.
177
+ mask_pad (torch.Tensor): batch padding mask used for conv module.
178
+ (#batch, 1,time), (0, 0, 0) means fake mask.
179
+ att_cache (torch.Tensor): Cache tensor of the KEY & VALUE
180
+ (#batch=1, head, cache_t1, d_k * 2), head * d_k == size.
181
+ cnn_cache (torch.Tensor): Convolution cache in conformer layer
182
+ (#batch=1, size, cache_t2)
183
+ Returns:
184
+ torch.Tensor: Output tensor (#batch, time, size).
185
+ torch.Tensor: Mask tensor (#batch, time, time).
186
+ torch.Tensor: att_cache tensor,
187
+ (#batch=1, head, cache_t1 + time, d_k * 2).
188
+ torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2).
189
+ """
190
+
191
+ # whether to use macaron style
192
+ if self.feed_forward_macaron is not None:
193
+ residual = x
194
+ if self.normalize_before:
195
+ x = self.norm_ff_macaron(x)
196
+ x = residual + self.ff_scale * self.dropout(
197
+ self.feed_forward_macaron(x))
198
+ if not self.normalize_before:
199
+ x = self.norm_ff_macaron(x)
200
+
201
+ # multi-headed self-attention module
202
+ residual = x
203
+ if self.normalize_before:
204
+ x = self.norm_mha(x)
205
+ x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb,
206
+ att_cache)
207
+ x = residual + self.dropout(x_att)
208
+ if not self.normalize_before:
209
+ x = self.norm_mha(x)
210
+
211
+ # convolution module
212
+ # Fake new cnn cache here, and then change it in conv_module
213
+ new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)
214
+ if self.conv_module is not None:
215
+ residual = x
216
+ if self.normalize_before:
217
+ x = self.norm_conv(x)
218
+ x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache)
219
+ x = residual + self.dropout(x)
220
+
221
+ if not self.normalize_before:
222
+ x = self.norm_conv(x)
223
+
224
+ # feed forward module
225
+ residual = x
226
+ if self.normalize_before:
227
+ x = self.norm_ff(x)
228
+
229
+ x = residual + self.ff_scale * self.dropout(self.feed_forward(x))
230
+ if not self.normalize_before:
231
+ x = self.norm_ff(x)
232
+
233
+ if self.conv_module is not None:
234
+ x = self.norm_final(x)
235
+
236
+ return x, mask, new_att_cache, new_cnn_cache
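
A smoke test for TransformerEncoderLayer (a sketch: it assumes MultiHeadedAttention from cosyvoice.transformer.attention accepts the same (heads, size, dropout) constructor arguments that TransformerEncoder above passes to it):

import torch
from cosyvoice.transformer.attention import MultiHeadedAttention
from cosyvoice.transformer.encoder_layer import TransformerEncoderLayer
from cosyvoice.transformer.positionwise_feed_forward import PositionwiseFeedForward

size, heads = 256, 4
layer = TransformerEncoderLayer(size,
                                MultiHeadedAttention(heads, size, 0.0),
                                PositionwiseFeedForward(size, 1024, 0.0),
                                dropout_rate=0.0)
x = torch.randn(2, 10, size)
mask = torch.ones(2, 10, 10, dtype=torch.bool)
pos_emb = torch.zeros(1, 10, size)  # ignored by plain (non-relative) self-attention
y, mask, _, _ = layer(x, mask, pos_emb)
print(y.shape)  # torch.Size([2, 10, 256])
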
cosyvoice/transformer/label_smoothing_loss.py ADDED
@@ -0,0 +1,96 @@
1
+ # Copyright (c) 2019 Shigeki Karita
2
+ # 2020 Mobvoi Inc (Binbin Zhang)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Label smoothing module."""
16
+
17
+ import torch
18
+ from torch import nn
19
+
20
+
21
+ class LabelSmoothingLoss(nn.Module):
22
+ """Label-smoothing loss.
23
+
24
+ In a standard CE loss, the label's data distribution is:
25
+ [0,1,2] ->
26
+ [
27
+ [1.0, 0.0, 0.0],
28
+ [0.0, 1.0, 0.0],
29
+ [0.0, 0.0, 1.0],
30
+ ]
31
+
32
+ In the label-smoothed version of the CE loss, some probability
33
+ is taken from the true label prob (1.0) and is divided
34
+ among other labels.
35
+
36
+ e.g.
37
+ smoothing=0.1
38
+ [0,1,2] ->
39
+ [
40
+ [0.9, 0.05, 0.05],
41
+ [0.05, 0.9, 0.05],
42
+ [0.05, 0.05, 0.9],
43
+ ]
44
+
45
+ Args:
46
+ size (int): the number of classes
47
+ padding_idx (int): padding class id which will be ignored for loss
48
+ smoothing (float): smoothing rate (0.0 means the conventional CE)
49
+ normalize_length (bool):
50
+ normalize loss by sequence length if True
51
+ normalize loss by batch size if False
52
+ """
53
+
54
+ def __init__(self,
55
+ size: int,
56
+ padding_idx: int,
57
+ smoothing: float,
58
+ normalize_length: bool = False):
59
+ """Construct a LabelSmoothingLoss object."""
60
+ super(LabelSmoothingLoss, self).__init__()
61
+ self.criterion = nn.KLDivLoss(reduction="none")
62
+ self.padding_idx = padding_idx
63
+ self.confidence = 1.0 - smoothing
64
+ self.smoothing = smoothing
65
+ self.size = size
66
+ self.normalize_length = normalize_length
67
+
68
+ def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
69
+ """Compute loss between x and target.
70
+
71
+ The model output and data label tensors are flattened to
72
+ (batch*seqlen, class) shape and a mask is applied to the
73
+ padding part, which should not contribute to the loss.
74
+
75
+ Args:
76
+ x (torch.Tensor): prediction (batch, seqlen, class)
77
+ target (torch.Tensor):
78
+ target signal masked with self.padding_id (batch, seqlen)
79
+ Returns:
80
+ loss (torch.Tensor) : The KL loss, scalar float value
81
+ """
82
+ assert x.size(2) == self.size
83
+ batch_size = x.size(0)
84
+ x = x.view(-1, self.size)
85
+ target = target.view(-1)
86
+ # use zeros_like instead of torch.no_grad() for true_dist,
87
+ # since no_grad() can not be exported by JIT
88
+ true_dist = torch.zeros_like(x)
89
+ true_dist.fill_(self.smoothing / (self.size - 1))
90
+ ignore = target == self.padding_idx # (B,)
91
+ total = len(target) - ignore.sum().item()
92
+ target = target.masked_fill(ignore, 0) # avoid -1 index
93
+ true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
94
+ kl = self.criterion(torch.log_softmax(x, dim=1), true_dist)
95
+ denom = total if self.normalize_length else batch_size
96
+ return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom
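
A worked example for LabelSmoothingLoss with the 3-class, smoothing=0.1 setup from the docstring above; positions marked with the padding id (-1 here) are excluded from the loss:

import torch
from cosyvoice.transformer.label_smoothing_loss import LabelSmoothingLoss

criterion = LabelSmoothingLoss(size=3, padding_idx=-1, smoothing=0.1)
logits = torch.randn(2, 4, 3)          # (batch, seqlen, class)
target = torch.tensor([[0, 1, 2, -1],  # -1 marks padded positions
                       [2, 2, -1, -1]])
loss = criterion(logits, target)
print(loss.item())  # scalar KL loss, normalized by batch size by default
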
cosyvoice/transformer/positionwise_feed_forward.py ADDED
@@ -0,0 +1,115 @@
1
+ # Copyright (c) 2019 Shigeki Karita
2
+ # 2020 Mobvoi Inc (Binbin Zhang)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Positionwise feed forward layer definition."""
16
+
17
+ import torch
18
+
19
+
20
+ class PositionwiseFeedForward(torch.nn.Module):
21
+ """Positionwise feed forward layer.
22
+
23
+ FeedForward are appied on each position of the sequence.
24
+ The output dim is same with the input dim.
25
+
26
+ Args:
27
+ idim (int): Input dimension.
28
+ hidden_units (int): The number of hidden units.
29
+ dropout_rate (float): Dropout rate.
30
+ activation (torch.nn.Module): Activation function
31
+ """
32
+
33
+ def __init__(
34
+ self,
35
+ idim: int,
36
+ hidden_units: int,
37
+ dropout_rate: float,
38
+ activation: torch.nn.Module = torch.nn.ReLU(),
39
+ ):
40
+ """Construct a PositionwiseFeedForward object."""
41
+ super(PositionwiseFeedForward, self).__init__()
42
+ self.w_1 = torch.nn.Linear(idim, hidden_units)
43
+ self.activation = activation
44
+ self.dropout = torch.nn.Dropout(dropout_rate)
45
+ self.w_2 = torch.nn.Linear(hidden_units, idim)
46
+
47
+ def forward(self, xs: torch.Tensor) -> torch.Tensor:
48
+ """Forward function.
49
+
50
+ Args:
51
+ xs: input tensor (B, L, D)
52
+ Returns:
53
+ output tensor, (B, L, D)
54
+ """
55
+ return self.w_2(self.dropout(self.activation(self.w_1(xs))))
56
+
57
+
58
+ class MoEFFNLayer(torch.nn.Module):
59
+ """
60
+ Mixture of expert with Positionwise feed forward layer
61
+ See also figure 1 in https://arxiv.org/pdf/2305.15663.pdf
62
+ The output dim is the same as the input dim.
63
+
64
+ Modified from https://github.com/Lightning-AI/lit-gpt/pull/823
65
+ https://github.com/mistralai/mistral-src/blob/b46d6/moe_one_file_ref.py#L203-L219
66
+ Args:
67
+ n_expert: number of expert.
68
+ n_expert_per_token: The actual number of experts used for each frame
69
+ idim (int): Input dimension.
70
+ hidden_units (int): The number of hidden units.
71
+ dropout_rate (float): Dropout rate.
72
+ activation (torch.nn.Module): Activation function
73
+ """
74
+
75
+ def __init__(
76
+ self,
77
+ n_expert: int,
78
+ n_expert_per_token: int,
79
+ idim: int,
80
+ hidden_units: int,
81
+ dropout_rate: float,
82
+ activation: torch.nn.Module = torch.nn.ReLU(),
83
+ ):
84
+ super(MoEFFNLayer, self).__init__()
85
+ self.gate = torch.nn.Linear(idim, n_expert, bias=False)
86
+ self.experts = torch.nn.ModuleList(
87
+ PositionwiseFeedForward(idim, hidden_units, dropout_rate,
88
+ activation) for _ in range(n_expert))
89
+ self.n_expert_per_token = n_expert_per_token
90
+
91
+ def forward(self, xs: torch.Tensor) -> torch.Tensor:
92
+ """Forward function.
93
+ Args:
94
+ xs: input tensor (B, L, D)
95
+ Returns:
96
+ output tensor, (B, L, D)
97
+
98
+ """
99
+ B, L, D = xs.size(
100
+ ) # batch size, sequence length, embedding dimension (idim)
101
+ xs = xs.view(-1, D) # (B*L, D)
102
+ router = self.gate(xs) # (B*L, n_expert)
103
+ logits, indices = torch.topk(
104
+ router, self.n_expert_per_token
105
+ ) # logits, indices: (B*L, n_expert_per_token)
106
+ weights = torch.nn.functional.softmax(
107
+ logits, dim=1,
108
+ dtype=torch.float).to(dtype=xs.dtype) # (B*L, n_expert_per_token)
109
+ output = torch.zeros_like(xs) # (B*L, D)
110
+ for i, expert in enumerate(self.experts):
111
+ mask = indices == i
112
+ batch_idx, ith_expert = torch.where(mask)
113
+ output[batch_idx] += weights[batch_idx, ith_expert, None] * expert(
114
+ xs[batch_idx])
115
+ return output.view(B, L, D)
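
A shape check for the MoE feed-forward variant above (a sketch): 4 experts with 2 routed per frame, and the output keeps the input dimension, so it can stand in where PositionwiseFeedForward is used:

import torch
from cosyvoice.transformer.positionwise_feed_forward import MoEFFNLayer

moe = MoEFFNLayer(n_expert=4, n_expert_per_token=2, idim=64,
                  hidden_units=256, dropout_rate=0.0)
xs = torch.randn(3, 10, 64)
ys = moe(xs)
print(ys.shape)  # torch.Size([3, 10, 64])
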
cosyvoice/transformer/subsampling.py ADDED
@@ -0,0 +1,383 @@
1
+ # Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
2
+ # 2024 Alibaba Inc (Xiang Lyu)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # Modified from ESPnet(https://github.com/espnet/espnet)
16
+ """Subsampling layer definition."""
17
+
18
+ from typing import Tuple, Union
19
+
20
+ import torch
21
+
22
+
23
+ class BaseSubsampling(torch.nn.Module):
24
+
25
+ def __init__(self):
26
+ super().__init__()
27
+ self.right_context = 0
28
+ self.subsampling_rate = 1
29
+
30
+ def position_encoding(self, offset: Union[int, torch.Tensor],
31
+ size: int) -> torch.Tensor:
32
+ return self.pos_enc.position_encoding(offset, size)
33
+
34
+
35
+ class EmbedinigNoSubsampling(BaseSubsampling):
36
+ """Embedding input without subsampling
37
+ """
38
+
39
+ def __init__(self, idim: int, odim: int, dropout_rate: float,
40
+ pos_enc_class: torch.nn.Module):
41
+ super().__init__()
42
+ self.embed = torch.nn.Embedding(idim, odim)
43
+ self.pos_enc = pos_enc_class
44
+
45
+ def forward(
46
+ self,
47
+ x: torch.Tensor,
48
+ x_mask: torch.Tensor,
49
+ offset: Union[int, torch.Tensor] = 0
50
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
51
+ """Input x.
52
+
53
+ Args:
54
+ x (torch.Tensor): Input tensor (#batch, time, idim).
55
+ x_mask (torch.Tensor): Input mask (#batch, 1, time).
56
+
57
+ Returns:
58
+ torch.Tensor: linear input tensor (#batch, time', odim),
59
+ where time' = time .
60
+ torch.Tensor: linear input mask (#batch, 1, time'),
61
+ where time' = time .
62
+
63
+ """
64
+ x = self.embed(x)
65
+ x, pos_emb = self.pos_enc(x, offset)
66
+ return x, pos_emb, x_mask
67
+
68
+
69
+ class LinearNoSubsampling(BaseSubsampling):
70
+ """Linear transform the input without subsampling
71
+
72
+ Args:
73
+ idim (int): Input dimension.
74
+ odim (int): Output dimension.
75
+ dropout_rate (float): Dropout rate.
76
+
77
+ """
78
+
79
+ def __init__(self, idim: int, odim: int, dropout_rate: float,
80
+ pos_enc_class: torch.nn.Module):
81
+ """Construct an linear object."""
82
+ super().__init__()
83
+ self.out = torch.nn.Sequential(
84
+ torch.nn.Linear(idim, odim),
85
+ torch.nn.LayerNorm(odim, eps=1e-5),
86
+ torch.nn.Dropout(dropout_rate),
87
+ )
88
+ self.pos_enc = pos_enc_class
89
+ self.right_context = 0
90
+ self.subsampling_rate = 1
91
+
92
+ def forward(
93
+ self,
94
+ x: torch.Tensor,
95
+ x_mask: torch.Tensor,
96
+ offset: Union[int, torch.Tensor] = 0
97
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
98
+ """Input x.
99
+
100
+ Args:
101
+ x (torch.Tensor): Input tensor (#batch, time, idim).
102
+ x_mask (torch.Tensor): Input mask (#batch, 1, time).
103
+
104
+ Returns:
105
+ torch.Tensor: linear input tensor (#batch, time', odim),
106
+ where time' = time .
107
+ torch.Tensor: linear input mask (#batch, 1, time'),
108
+ where time' = time .
109
+
110
+ """
111
+ x = self.out(x)
112
+ x, pos_emb = self.pos_enc(x, offset)
113
+ return x, pos_emb, x_mask
114
+
115
+
116
+ class Conv1dSubsampling2(BaseSubsampling):
117
+ """Convolutional 1D subsampling (to 1/2 length).
118
+ It is designed for Whisper, ref:
119
+ https://github.com/openai/whisper/blob/main/whisper/model.py
120
+
121
+ Args:
122
+ idim (int): Input dimension.
123
+ odim (int): Output dimension.
124
+ dropout_rate (float): Dropout rate.
125
+
126
+ """
127
+
128
+ def __init__(self, idim: int, odim: int, dropout_rate: float,
129
+ pos_enc_class: torch.nn.Module):
130
+ """Construct an Conv1dSubsampling2 object."""
131
+ super().__init__()
132
+ self.conv = torch.nn.Sequential(
133
+ torch.nn.Conv1d(idim, odim, kernel_size=3, padding=1),
134
+ torch.nn.GELU(),
135
+ torch.nn.Conv1d(odim, odim, kernel_size=3, stride=2, padding=1),
136
+ torch.nn.GELU(),
137
+ )
138
+ self.pos_enc = pos_enc_class
139
+ # The right context for every conv layer is computed by:
140
+ # (kernel_size - 1) * frame_rate_of_this_layer
141
+ self.subsampling_rate = 2
142
+ # 4 = (3 - 1) * 1 + (3 - 1) * 1
143
+ self.right_context = 4
144
+
145
+ def forward(
146
+ self,
147
+ x: torch.Tensor,
148
+ x_mask: torch.Tensor,
149
+ offset: Union[int, torch.Tensor] = 0
150
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
151
+ """Subsample x.
152
+
153
+ Args:
154
+ x (torch.Tensor): Input tensor (#batch, time, idim).
155
+ x_mask (torch.Tensor): Input mask (#batch, 1, time).
156
+
157
+ Returns:
158
+ torch.Tensor: Subsampled tensor (#batch, time', odim),
159
+ where time' = time // 2.
160
+ torch.Tensor: Subsampled mask (#batch, 1, time'),
161
+ where time' = time // 2.
162
+ torch.Tensor: positional encoding
163
+
164
+ """
165
+ time = x.size(1)
166
+ x = x.transpose(1, 2) # (b, f, t)
167
+ x = self.conv(x)
168
+ x = x.transpose(1, 2) # (b, t, f)
169
+ x, pos_emb = self.pos_enc(x, offset)
170
+ return x, pos_emb, x_mask[:, :, (time + 1) % 2::2]
171
+
172
+
173
+ class Conv2dSubsampling4(BaseSubsampling):
174
+ """Convolutional 2D subsampling (to 1/4 length).
175
+
176
+ Args:
177
+ idim (int): Input dimension.
178
+ odim (int): Output dimension.
179
+ dropout_rate (float): Dropout rate.
180
+
181
+ """
182
+
183
+ def __init__(self, idim: int, odim: int, dropout_rate: float,
184
+ pos_enc_class: torch.nn.Module):
185
+ """Construct an Conv2dSubsampling4 object."""
186
+ super().__init__()
187
+ self.conv = torch.nn.Sequential(
188
+ torch.nn.Conv2d(1, odim, 3, 2),
189
+ torch.nn.ReLU(),
190
+ torch.nn.Conv2d(odim, odim, 3, 2),
191
+ torch.nn.ReLU(),
192
+ )
193
+ self.out = torch.nn.Sequential(
194
+ torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim))
195
+ self.pos_enc = pos_enc_class
196
+ # The right context for every conv layer is computed by:
197
+ # (kernel_size - 1) * frame_rate_of_this_layer
198
+ self.subsampling_rate = 4
199
+ # 6 = (3 - 1) * 1 + (3 - 1) * 2
200
+ self.right_context = 6
201
+
202
+ def forward(
203
+ self,
204
+ x: torch.Tensor,
205
+ x_mask: torch.Tensor,
206
+ offset: Union[int, torch.Tensor] = 0
207
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
208
+ """Subsample x.
209
+
210
+ Args:
211
+ x (torch.Tensor): Input tensor (#batch, time, idim).
212
+ x_mask (torch.Tensor): Input mask (#batch, 1, time).
213
+
214
+ Returns:
215
+ torch.Tensor: Subsampled tensor (#batch, time', odim),
216
+ where time' = time // 4.
217
+ torch.Tensor: Subsampled mask (#batch, 1, time'),
218
+ where time' = time // 4.
219
+ torch.Tensor: positional encoding
220
+
221
+ """
222
+ x = x.unsqueeze(1) # (b, c=1, t, f)
223
+ x = self.conv(x)
224
+ b, c, t, f = x.size()
225
+ x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
226
+ x, pos_emb = self.pos_enc(x, offset)
227
+ return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2]
228
+
229
+
230
+ class Conv2dSubsampling6(BaseSubsampling):
231
+ """Convolutional 2D subsampling (to 1/6 length).
232
+ Args:
233
+ idim (int): Input dimension.
234
+ odim (int): Output dimension.
235
+ dropout_rate (float): Dropout rate.
236
+ pos_enc (torch.nn.Module): Custom position encoding layer.
237
+ """
238
+
239
+ def __init__(self, idim: int, odim: int, dropout_rate: float,
240
+ pos_enc_class: torch.nn.Module):
241
+ """Construct an Conv2dSubsampling6 object."""
242
+ super().__init__()
243
+ self.conv = torch.nn.Sequential(
244
+ torch.nn.Conv2d(1, odim, 3, 2),
245
+ torch.nn.ReLU(),
246
+ torch.nn.Conv2d(odim, odim, 5, 3),
247
+ torch.nn.ReLU(),
248
+ )
249
+ self.linear = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3),
250
+ odim)
251
+ self.pos_enc = pos_enc_class
252
+ # 10 = (3 - 1) * 1 + (5 - 1) * 2
253
+ self.subsampling_rate = 6
254
+ self.right_context = 10
255
+
256
+ def forward(
257
+ self,
258
+ x: torch.Tensor,
259
+ x_mask: torch.Tensor,
260
+ offset: Union[int, torch.Tensor] = 0
261
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
262
+ """Subsample x.
263
+ Args:
264
+ x (torch.Tensor): Input tensor (#batch, time, idim).
265
+ x_mask (torch.Tensor): Input mask (#batch, 1, time).
266
+
267
+ Returns:
268
+ torch.Tensor: Subsampled tensor (#batch, time', odim),
269
+ where time' = time // 6.
270
+ torch.Tensor: Subsampled mask (#batch, 1, time'),
271
+ where time' = time // 6.
272
+ torch.Tensor: positional encoding
273
+ """
274
+ x = x.unsqueeze(1) # (b, c, t, f)
275
+ x = self.conv(x)
276
+ b, c, t, f = x.size()
277
+ x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f))
278
+ x, pos_emb = self.pos_enc(x, offset)
279
+ return x, pos_emb, x_mask[:, :, 2::2][:, :, 4::3]
280
+
281
+
282
+ class Conv2dSubsampling8(BaseSubsampling):
283
+ """Convolutional 2D subsampling (to 1/8 length).
284
+
285
+ Args:
286
+ idim (int): Input dimension.
287
+ odim (int): Output dimension.
288
+ dropout_rate (float): Dropout rate.
289
+
290
+ """
291
+
292
+ def __init__(self, idim: int, odim: int, dropout_rate: float,
293
+ pos_enc_class: torch.nn.Module):
294
+ """Construct an Conv2dSubsampling8 object."""
295
+ super().__init__()
296
+ self.conv = torch.nn.Sequential(
297
+ torch.nn.Conv2d(1, odim, 3, 2),
298
+ torch.nn.ReLU(),
299
+ torch.nn.Conv2d(odim, odim, 3, 2),
300
+ torch.nn.ReLU(),
301
+ torch.nn.Conv2d(odim, odim, 3, 2),
302
+ torch.nn.ReLU(),
303
+ )
304
+ self.linear = torch.nn.Linear(
305
+ odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim)
306
+ self.pos_enc = pos_enc_class
307
+ self.subsampling_rate = 8
308
+ # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4
309
+ self.right_context = 14
310
+
311
+ def forward(
312
+ self,
313
+ x: torch.Tensor,
314
+ x_mask: torch.Tensor,
315
+ offset: Union[int, torch.Tensor] = 0
316
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
317
+ """Subsample x.
318
+
319
+ Args:
320
+ x (torch.Tensor): Input tensor (#batch, time, idim).
321
+ x_mask (torch.Tensor): Input mask (#batch, 1, time).
322
+
323
+ Returns:
324
+ torch.Tensor: Subsampled tensor (#batch, time', odim),
325
+ where time' = time // 8.
326
+ torch.Tensor: Subsampled mask (#batch, 1, time'),
327
+ where time' = time // 8.
328
+ torch.Tensor: positional encoding
329
+ """
330
+ x = x.unsqueeze(1) # (b, c, t, f)
331
+ x = self.conv(x)
332
+ b, c, t, f = x.size()
333
+ x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f))
334
+ x, pos_emb = self.pos_enc(x, offset)
335
+ return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2][:, :, 2::2]
336
+
337
+
338
+ class LegacyLinearNoSubsampling(BaseSubsampling):
339
+ """Linear transform the input without subsampling
340
+
341
+ Args:
342
+ idim (int): Input dimension.
343
+ odim (int): Output dimension.
344
+ dropout_rate (float): Dropout rate.
345
+
346
+ """
347
+
348
+ def __init__(self, idim: int, odim: int, dropout_rate: float,
349
+ pos_enc_class: torch.nn.Module):
350
+ """Construct an linear object."""
351
+ super().__init__()
352
+ self.out = torch.nn.Sequential(
353
+ torch.nn.Linear(idim, odim),
354
+ torch.nn.LayerNorm(odim, eps=1e-5),
355
+ torch.nn.Dropout(dropout_rate),
356
+ torch.nn.ReLU(),
357
+ )
358
+ self.pos_enc = pos_enc_class
359
+ self.right_context = 0
360
+ self.subsampling_rate = 1
361
+
362
+ def forward(
363
+ self,
364
+ x: torch.Tensor,
365
+ x_mask: torch.Tensor,
366
+ offset: Union[int, torch.Tensor] = 0
367
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
368
+ """Input x.
369
+
370
+ Args:
371
+ x (torch.Tensor): Input tensor (#batch, time, idim).
372
+ x_mask (torch.Tensor): Input mask (#batch, 1, time).
373
+
374
+ Returns:
375
+ torch.Tensor: linear input tensor (#batch, time', odim),
376
+ where time' = time .
377
+ torch.Tensor: linear input mask (#batch, 1, time'),
378
+ where time' = time .
379
+
380
+ """
381
+ x = self.out(x)
382
+ x, pos_emb = self.pos_enc(x, offset)
383
+ return x, pos_emb, x_mask
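A quick shape check for `Conv2dSubsampling4` from the file above. The `DummyPosEnc` module is only an illustration of the interface the subsampling layers expect; the real positional-encoding classes live in `cosyvoice/transformer/embedding.py`:

```python
import torch
from cosyvoice.transformer.subsampling import Conv2dSubsampling4

class DummyPosEnc(torch.nn.Module):
    # minimal stand-in: returns the input and a zero positional embedding
    def forward(self, x, offset=0):
        return x, torch.zeros(1, x.size(1), x.size(2))

sub = Conv2dSubsampling4(idim=80, odim=256, dropout_rate=0.1, pos_enc_class=DummyPosEnc())
x = torch.randn(2, 100, 80)                     # (batch, time, feature)
mask = torch.ones(2, 1, 100, dtype=torch.bool)  # (batch, 1, time)
y, pos_emb, y_mask = sub(x, mask)
print(y.shape, y_mask.shape)                    # (2, 24, 256) and (2, 1, 24), i.e. roughly time // 4
```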
cosyvoice/transformer/upsample_encoder.py ADDED
@@ -0,0 +1,321 @@
1
+ # Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
2
+ # 2022 Xingchen Song ([email protected])
3
+ # 2024 Alibaba Inc (Xiang Lyu)
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ # Modified from ESPnet(https://github.com/espnet/espnet)
17
+ """Encoder definition."""
18
+ from typing import Tuple
19
+
20
+ import torch
21
+ from torch import nn
22
+ from torch.nn import functional as F
23
+
24
+ from cosyvoice.transformer.convolution import ConvolutionModule
25
+ from cosyvoice.transformer.encoder_layer import ConformerEncoderLayer
26
+ from cosyvoice.transformer.positionwise_feed_forward import PositionwiseFeedForward
27
+ from cosyvoice.utils.class_utils import (
28
+ COSYVOICE_EMB_CLASSES,
29
+ COSYVOICE_SUBSAMPLE_CLASSES,
30
+ COSYVOICE_ATTENTION_CLASSES,
31
+ COSYVOICE_ACTIVATION_CLASSES,
32
+ )
33
+ from cosyvoice.utils.mask import make_pad_mask
34
+ from cosyvoice.utils.mask import add_optional_chunk_mask
35
+
36
+
37
+ class Upsample1D(nn.Module):
38
+ """A 1D upsampling layer with an optional convolution.
39
+
40
+ Parameters:
41
+ channels (`int`):
42
+ number of channels in the inputs and outputs.
43
+ use_conv (`bool`, default `False`):
44
+ option to use a convolution.
45
+ use_conv_transpose (`bool`, default `False`):
46
+ option to use a convolution transpose.
47
+ out_channels (`int`, optional):
48
+ number of output channels. Defaults to `channels`.
49
+ """
50
+
51
+ def __init__(self, channels: int, out_channels: int, stride: int = 2):
52
+ super().__init__()
53
+ self.channels = channels
54
+ self.out_channels = out_channels
55
+ self.stride = stride
56
+ # In this mode, first repeat-interpolate, then conv with stride=1
57
+ self.conv = nn.Conv1d(self.channels, self.out_channels, stride * 2 + 1, stride=1, padding=0)
58
+
59
+ def forward(self, inputs: torch.Tensor, input_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
60
+ outputs = F.interpolate(inputs, scale_factor=float(self.stride), mode="nearest")
61
+ outputs = F.pad(outputs, (self.stride * 2, 0), value=0.0)
62
+ outputs = self.conv(outputs)
63
+ return outputs, input_lengths * self.stride
64
+
65
+
66
+ class PreLookaheadLayer(nn.Module):
67
+ def __init__(self, in_channels: int, channels: int, pre_lookahead_len: int = 1):
68
+ super().__init__()
69
+ self.in_channels = in_channels
70
+ self.channels = channels
71
+ self.pre_lookahead_len = pre_lookahead_len
72
+ self.conv1 = nn.Conv1d(
73
+ in_channels, channels,
74
+ kernel_size=pre_lookahead_len + 1,
75
+ stride=1, padding=0,
76
+ )
77
+ self.conv2 = nn.Conv1d(
78
+ channels, in_channels,
79
+ kernel_size=3, stride=1, padding=0,
80
+ )
81
+
82
+ def forward(self, inputs: torch.Tensor, context: torch.Tensor = torch.zeros(0, 0, 0)) -> torch.Tensor:
83
+ """
84
+ inputs: (batch_size, seq_len, channels)
85
+ """
86
+ outputs = inputs.transpose(1, 2).contiguous()
87
+ context = context.transpose(1, 2).contiguous()
88
+ # look ahead
89
+ if context.size(2) == 0:
90
+ outputs = F.pad(outputs, (0, self.pre_lookahead_len), mode='constant', value=0.0)
91
+ else:
92
+ assert self.training is False, 'you have passed context, make sure that you are running in inference mode'
93
+ assert context.size(2) == self.pre_lookahead_len
94
+ outputs = F.pad(torch.concat([outputs, context], dim=2), (0, self.pre_lookahead_len - context.size(2)), mode='constant', value=0.0)
95
+ outputs = F.leaky_relu(self.conv1(outputs))
96
+ # outputs
97
+ outputs = F.pad(outputs, (self.conv2.kernel_size[0] - 1, 0), mode='constant', value=0.0)
98
+ outputs = self.conv2(outputs)
99
+ outputs = outputs.transpose(1, 2).contiguous()
100
+
101
+ # residual connection
102
+ outputs = outputs + inputs
103
+ return outputs
104
+
105
+
106
+ class UpsampleConformerEncoder(torch.nn.Module):
107
+
108
+ def __init__(
109
+ self,
110
+ input_size: int,
111
+ output_size: int = 256,
112
+ attention_heads: int = 4,
113
+ linear_units: int = 2048,
114
+ num_blocks: int = 6,
115
+ dropout_rate: float = 0.1,
116
+ positional_dropout_rate: float = 0.1,
117
+ attention_dropout_rate: float = 0.0,
118
+ input_layer: str = "conv2d",
119
+ pos_enc_layer_type: str = "rel_pos",
120
+ normalize_before: bool = True,
121
+ static_chunk_size: int = 0,
122
+ use_dynamic_chunk: bool = False,
123
+ global_cmvn: torch.nn.Module = None,
124
+ use_dynamic_left_chunk: bool = False,
125
+ positionwise_conv_kernel_size: int = 1,
126
+ macaron_style: bool = True,
127
+ selfattention_layer_type: str = "rel_selfattn",
128
+ activation_type: str = "swish",
129
+ use_cnn_module: bool = True,
130
+ cnn_module_kernel: int = 15,
131
+ causal: bool = False,
132
+ cnn_module_norm: str = "batch_norm",
133
+ key_bias: bool = True,
134
+ gradient_checkpointing: bool = False,
135
+ ):
136
+ """
137
+ Args:
138
+ input_size (int): input dim
139
+ output_size (int): dimension of attention
140
+ attention_heads (int): the number of heads of multi head attention
141
+ linear_units (int): the hidden units number of position-wise feed
142
+ forward
143
+ num_blocks (int): the number of decoder blocks
144
+ dropout_rate (float): dropout rate
145
+ attention_dropout_rate (float): dropout rate in attention
146
+ positional_dropout_rate (float): dropout rate after adding
147
+ positional encoding
148
+ input_layer (str): input layer type.
149
+ optional [linear, conv2d, conv2d6, conv2d8]
150
+ pos_enc_layer_type (str): Encoder positional encoding layer type.
151
+ optional [abs_pos, scaled_abs_pos, rel_pos, no_pos]
152
+ normalize_before (bool):
153
+ True: use layer_norm before each sub-block of a layer.
154
+ False: use layer_norm after each sub-block of a layer.
155
+ static_chunk_size (int): chunk size for static chunk training and
156
+ decoding
157
+ use_dynamic_chunk (bool): whether use dynamic chunk size for
158
+ training or not. You can only use fixed chunk(chunk_size > 0)
159
+ or dynamic chunk size(use_dynamic_chunk = True)
160
+ global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module
161
+ use_dynamic_left_chunk (bool): whether use dynamic left chunk in
162
+ dynamic chunk training
163
+ key_bias: whether use bias in attention.linear_k, False for whisper models.
164
+ gradient_checkpointing: rerunning a forward-pass segment for each
165
+ checkpointed segment during backward.
166
+ """
167
+ super().__init__()
168
+ self._output_size = output_size
169
+
170
+ self.global_cmvn = global_cmvn
171
+ self.embed = COSYVOICE_SUBSAMPLE_CLASSES[input_layer](
172
+ input_size,
173
+ output_size,
174
+ dropout_rate,
175
+ COSYVOICE_EMB_CLASSES[pos_enc_layer_type](output_size,
176
+ positional_dropout_rate),
177
+ )
178
+
179
+ self.normalize_before = normalize_before
180
+ self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5)
181
+ self.static_chunk_size = static_chunk_size
182
+ self.use_dynamic_chunk = use_dynamic_chunk
183
+ self.use_dynamic_left_chunk = use_dynamic_left_chunk
184
+ self.gradient_checkpointing = gradient_checkpointing
185
+ activation = COSYVOICE_ACTIVATION_CLASSES[activation_type]()
186
+ # self-attention module definition
187
+ encoder_selfattn_layer_args = (
188
+ attention_heads,
189
+ output_size,
190
+ attention_dropout_rate,
191
+ key_bias,
192
+ )
193
+ # feed-forward module definition
194
+ positionwise_layer_args = (
195
+ output_size,
196
+ linear_units,
197
+ dropout_rate,
198
+ activation,
199
+ )
200
+ # convolution module definition
201
+ convolution_layer_args = (output_size, cnn_module_kernel, activation,
202
+ cnn_module_norm, causal)
203
+ self.pre_lookahead_layer = PreLookaheadLayer(in_channels=512, channels=512, pre_lookahead_len=3)
204
+ self.encoders = torch.nn.ModuleList([
205
+ ConformerEncoderLayer(
206
+ output_size,
207
+ COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type](
208
+ *encoder_selfattn_layer_args),
209
+ PositionwiseFeedForward(*positionwise_layer_args),
210
+ PositionwiseFeedForward(
211
+ *positionwise_layer_args) if macaron_style else None,
212
+ ConvolutionModule(
213
+ *convolution_layer_args) if use_cnn_module else None,
214
+ dropout_rate,
215
+ normalize_before,
216
+ ) for _ in range(num_blocks)
217
+ ])
218
+ self.up_layer = Upsample1D(channels=512, out_channels=512, stride=2)
219
+ self.up_embed = COSYVOICE_SUBSAMPLE_CLASSES[input_layer](
220
+ input_size,
221
+ output_size,
222
+ dropout_rate,
223
+ COSYVOICE_EMB_CLASSES[pos_enc_layer_type](output_size,
224
+ positional_dropout_rate),
225
+ )
226
+ self.up_encoders = torch.nn.ModuleList([
227
+ ConformerEncoderLayer(
228
+ output_size,
229
+ COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type](
230
+ *encoder_selfattn_layer_args),
231
+ PositionwiseFeedForward(*positionwise_layer_args),
232
+ PositionwiseFeedForward(
233
+ *positionwise_layer_args) if macaron_style else None,
234
+ ConvolutionModule(
235
+ *convolution_layer_args) if use_cnn_module else None,
236
+ dropout_rate,
237
+ normalize_before,
238
+ ) for _ in range(4)
239
+ ])
240
+
241
+ def output_size(self) -> int:
242
+ return self._output_size
243
+
244
+ def forward(
245
+ self,
246
+ xs: torch.Tensor,
247
+ xs_lens: torch.Tensor,
248
+ context: torch.Tensor = torch.zeros(0, 0, 0),
249
+ decoding_chunk_size: int = 0,
250
+ num_decoding_left_chunks: int = -1,
251
+ streaming: bool = False,
252
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
253
+ """Embed positions in tensor.
254
+
255
+ Args:
256
+ xs: padded input tensor (B, T, D)
257
+ xs_lens: input length (B)
258
+ decoding_chunk_size: decoding chunk size for dynamic chunk
259
+ 0: default for training, use random dynamic chunk.
260
+ <0: for decoding, use full chunk.
261
+ >0: for decoding, use fixed chunk size as set.
262
+ num_decoding_left_chunks: number of left chunks, this is for decoding,
263
+ the chunk size is decoding_chunk_size.
264
+ >=0: use num_decoding_left_chunks
265
+ <0: use all left chunks
266
+ Returns:
267
+ encoder output tensor xs, and subsampled masks
268
+ xs: padded output tensor (B, T' ~= T/subsample_rate, D)
269
+ masks: torch.Tensor batch padding mask after subsample
270
+ (B, 1, T' ~= T/subsample_rate)
271
+ NOTE(xcsong):
272
+ We pass the `__call__` method of the modules instead of `forward` to the
273
+ checkpointing API because `__call__` attaches all the hooks of the module.
274
+ https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
275
+ """
276
+ T = xs.size(1)
277
+ masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T)
278
+ if self.global_cmvn is not None:
279
+ xs = self.global_cmvn(xs)
280
+ xs, pos_emb, masks = self.embed(xs, masks)
281
+ if context.size(1) != 0:
282
+ assert self.training is False, 'you have passed context, make sure that you are running in inference mode'
283
+ context_masks = torch.ones(1, 1, context.size(1)).to(masks)
284
+ context, _, _ = self.embed(context, context_masks, offset=xs.size(1))
285
+ mask_pad = masks # (B, 1, T/subsample_rate)
286
+ chunk_masks = add_optional_chunk_mask(xs, masks, False, False, 0, self.static_chunk_size if streaming is True else 0, -1)
287
+ # lookahead + conformer encoder
288
+ xs = self.pre_lookahead_layer(xs, context=context)
289
+ xs = self.forward_layers(xs, chunk_masks, pos_emb, mask_pad)
290
+
291
+ # upsample + conformer encoder
292
+ xs = xs.transpose(1, 2).contiguous()
293
+ xs, xs_lens = self.up_layer(xs, xs_lens)
294
+ xs = xs.transpose(1, 2).contiguous()
295
+ T = xs.size(1)
296
+ masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T)
297
+ xs, pos_emb, masks = self.up_embed(xs, masks)
298
+ mask_pad = masks # (B, 1, T/subsample_rate)
299
+ chunk_masks = add_optional_chunk_mask(xs, masks, False, False, 0, self.static_chunk_size * self.up_layer.stride if streaming is True else 0, -1)
300
+ xs = self.forward_up_layers(xs, chunk_masks, pos_emb, mask_pad)
301
+
302
+ if self.normalize_before:
303
+ xs = self.after_norm(xs)
304
+ # Here we assume the mask is not changed in encoder layers, so just
305
+ # return the masks before encoder layers, and the masks will be used
306
+ # for cross attention with decoder later
307
+ return xs, masks
308
+
309
+ def forward_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor,
310
+ pos_emb: torch.Tensor,
311
+ mask_pad: torch.Tensor) -> torch.Tensor:
312
+ for layer in self.encoders:
313
+ xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
314
+ return xs
315
+
316
+ def forward_up_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor,
317
+ pos_emb: torch.Tensor,
318
+ mask_pad: torch.Tensor) -> torch.Tensor:
319
+ for layer in self.up_encoders:
320
+ xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
321
+ return xs
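As a small illustration of the length bookkeeping in `Upsample1D` above (the channel count and input length are arbitrary; importing this module assumes the full CosyVoice dependency stack is installed):

```python
import torch
from cosyvoice.transformer.upsample_encoder import Upsample1D

up = Upsample1D(channels=512, out_channels=512, stride=2)
x = torch.randn(1, 512, 50)          # (batch, channels, time)
lengths = torch.tensor([50])
y, y_lengths = up(x, lengths)
print(y.shape, y_lengths)            # torch.Size([1, 512, 100]) tensor([100])
```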
cosyvoice/utils/__init__.py ADDED
File without changes
cosyvoice/utils/class_utils.py ADDED
@@ -0,0 +1,85 @@
1
+ # Copyright [2023-11-28] <[email protected], Xingchen Song>
2
+ # 2024 Alibaba Inc (authors: Xiang Lyu)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import torch
16
+
17
+ from cosyvoice.transformer.activation import Swish
18
+ from cosyvoice.transformer.subsampling import (
19
+ LinearNoSubsampling,
20
+ EmbedinigNoSubsampling,
21
+ Conv1dSubsampling2,
22
+ Conv2dSubsampling4,
23
+ Conv2dSubsampling6,
24
+ Conv2dSubsampling8,
25
+ )
26
+ from cosyvoice.transformer.embedding import (PositionalEncoding,
27
+ RelPositionalEncoding,
28
+ WhisperPositionalEncoding,
29
+ LearnablePositionalEncoding,
30
+ NoPositionalEncoding)
31
+ from cosyvoice.transformer.attention import (MultiHeadedAttention,
32
+ RelPositionMultiHeadedAttention)
33
+ from cosyvoice.transformer.embedding import EspnetRelPositionalEncoding
34
+ from cosyvoice.transformer.subsampling import LegacyLinearNoSubsampling
35
+ from cosyvoice.llm.llm import TransformerLM, Qwen2LM, CosyVoice3LM
36
+ from cosyvoice.flow.flow import MaskedDiffWithXvec, CausalMaskedDiffWithXvec, CausalMaskedDiffWithDiT
37
+ from cosyvoice.hifigan.generator import HiFTGenerator, CausalHiFTGenerator
38
+ from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model, CosyVoice3Model
39
+
40
+
41
+ COSYVOICE_ACTIVATION_CLASSES = {
42
+ "hardtanh": torch.nn.Hardtanh,
43
+ "tanh": torch.nn.Tanh,
44
+ "relu": torch.nn.ReLU,
45
+ "selu": torch.nn.SELU,
46
+ "swish": getattr(torch.nn, "SiLU", Swish),
47
+ "gelu": torch.nn.GELU,
48
+ }
49
+
50
+ COSYVOICE_SUBSAMPLE_CLASSES = {
51
+ "linear": LinearNoSubsampling,
52
+ "linear_legacy": LegacyLinearNoSubsampling,
53
+ "embed": EmbedinigNoSubsampling,
54
+ "conv1d2": Conv1dSubsampling2,
55
+ "conv2d": Conv2dSubsampling4,
56
+ "conv2d6": Conv2dSubsampling6,
57
+ "conv2d8": Conv2dSubsampling8,
58
+ 'paraformer_dummy': torch.nn.Identity
59
+ }
60
+
61
+ COSYVOICE_EMB_CLASSES = {
62
+ "embed": PositionalEncoding,
63
+ "abs_pos": PositionalEncoding,
64
+ "rel_pos": RelPositionalEncoding,
65
+ "rel_pos_espnet": EspnetRelPositionalEncoding,
66
+ "no_pos": NoPositionalEncoding,
67
+ "abs_pos_whisper": WhisperPositionalEncoding,
68
+ "embed_learnable_pe": LearnablePositionalEncoding,
69
+ }
70
+
71
+ COSYVOICE_ATTENTION_CLASSES = {
72
+ "selfattn": MultiHeadedAttention,
73
+ "rel_selfattn": RelPositionMultiHeadedAttention,
74
+ }
75
+
76
+
77
+ def get_model_type(configs):
78
+ # NOTE CosyVoice2Model inherits CosyVoiceModel
79
+ if isinstance(configs['llm'], TransformerLM) and isinstance(configs['flow'], MaskedDiffWithXvec) and isinstance(configs['hift'], HiFTGenerator):
80
+ return CosyVoiceModel
81
+ if isinstance(configs['llm'], Qwen2LM) and isinstance(configs['flow'], CausalMaskedDiffWithXvec) and isinstance(configs['hift'], HiFTGenerator):
82
+ return CosyVoice2Model
83
+ if isinstance(configs['llm'], CosyVoice3LM) and isinstance(configs['flow'], CausalMaskedDiffWithDiT) and isinstance(configs['hift'], CausalHiFTGenerator):
84
+ return CosyVoice3Model
85
+ raise TypeError('No valid model type found!')
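The dictionaries above act as small registries indexed by the strings that appear in the YAML configs. A sketch of how they are typically resolved (the hyperparameter values here are illustrative):

```python
from cosyvoice.utils.class_utils import (
    COSYVOICE_ACTIVATION_CLASSES,
    COSYVOICE_EMB_CLASSES,
    COSYVOICE_SUBSAMPLE_CLASSES,
)

act = COSYVOICE_ACTIVATION_CLASSES["swish"]()          # SiLU on recent torch, Swish fallback otherwise
pos_enc = COSYVOICE_EMB_CLASSES["rel_pos"](256, 0.1)   # (d_model, dropout_rate)
embed = COSYVOICE_SUBSAMPLE_CLASSES["linear"](80, 256, 0.1, pos_enc)
print(type(act).__name__, type(embed).__name__)
```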
cosyvoice/utils/common.py ADDED
@@ -0,0 +1,213 @@
1
+ # Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)
2
+ # 2024 Alibaba Inc (authors: Xiang Lyu)
3
+ # 2025 Alibaba Inc (authors: Xiang Lyu, Bofan Zhou)
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ # Modified from ESPnet(https://github.com/espnet/espnet)
17
+ """Unility functions for Transformer."""
18
+
19
+ import queue
20
+ import random
21
+ from typing import List
22
+
23
+ import numpy as np
24
+ import torch
25
+
26
+ IGNORE_ID = -1
27
+
28
+ instruct_list = ["You are a helpful assistant. 请用广东话表达。<|endofprompt|>",
29
+ "You are a helpful assistant. 请用东北话表达。<|endofprompt|>",
30
+ "You are a helpful assistant. 请用甘肃话表达。<|endofprompt|>",
31
+ "You are a helpful assistant. 请用贵州话表达。<|endofprompt|>",
32
+ "You are a helpful assistant. 请用河南话表达。<|endofprompt|>",
33
+ "You are a helpful assistant. 请用湖北话表达。<|endofprompt|>",
34
+ "You are a helpful assistant. 请用湖南话表达。<|endofprompt|>",
35
+ "You are a helpful assistant. 请用江西话表达。<|endofprompt|>",
36
+ "You are a helpful assistant. 请用闽南话表达。<|endofprompt|>",
37
+ "You are a helpful assistant. 请用宁夏话表达。<|endofprompt|>",
38
+ "You are a helpful assistant. 请用山西话表达。<|endofprompt|>",
39
+ "You are a helpful assistant. 请用陕西话表达。<|endofprompt|>",
40
+ "You are a helpful assistant. 请用山东话表达。<|endofprompt|>",
41
+ "You are a helpful assistant. 请用上海话表达。<|endofprompt|>",
42
+ "You are a helpful assistant. 请用四川话表达。<|endofprompt|>",
43
+ "You are a helpful assistant. 请用天津话表达。<|endofprompt|>",
44
+ "You are a helpful assistant. 请用云南话表达。<|endofprompt|>",
45
+ "You are a helpful assistant. Please say a sentence as loudly as possible.<|endofprompt|>",
46
+ "You are a helpful assistant. Please say a sentence in a very soft voice.<|endofprompt|>",
47
+ "You are a helpful assistant. 请用尽可能慢地语速说一句话。<|endofprompt|>",
48
+ "You are a helpful assistant. 请用尽可能快地语速说一句话。<|endofprompt|>",
49
+ "You are a helpful assistant. 请非常开心地说一句话。<|endofprompt|>",
50
+ "You are a helpful assistant. 请非常伤心地说一句话。<|endofprompt|>",
51
+ "You are a helpful assistant. 请非常生气地说一句话。<|endofprompt|>",
52
+ "You are a helpful assistant. 我想体验一下小猪佩奇风格,可以吗?<|endofprompt|>",
53
+ "You are a helpful assistant. 你可以尝试用机器人的方式解答吗?<|endofprompt|>"]
54
+
55
+
56
+ def pad_list(xs: List[torch.Tensor], pad_value: int):
57
+ """Perform padding for the list of tensors.
58
+
59
+ Args:
60
+ xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
61
+ pad_value (float): Value for padding.
62
+
63
+ Returns:
64
+ Tensor: Padded tensor (B, Tmax, `*`).
65
+
66
+ Examples:
67
+ >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)]
68
+ >>> x
69
+ [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
70
+ >>> pad_list(x, 0)
71
+ tensor([[1., 1., 1., 1.],
72
+ [1., 1., 0., 0.],
73
+ [1., 0., 0., 0.]])
74
+
75
+ """
76
+ max_len = max([len(item) for item in xs])
77
+ batchs = len(xs)
78
+ ndim = xs[0].ndim
79
+ if ndim == 1:
80
+ pad_res = torch.zeros(batchs,
81
+ max_len,
82
+ dtype=xs[0].dtype,
83
+ device=xs[0].device)
84
+ elif ndim == 2:
85
+ pad_res = torch.zeros(batchs,
86
+ max_len,
87
+ xs[0].shape[1],
88
+ dtype=xs[0].dtype,
89
+ device=xs[0].device)
90
+ elif ndim == 3:
91
+ pad_res = torch.zeros(batchs,
92
+ max_len,
93
+ xs[0].shape[1],
94
+ xs[0].shape[2],
95
+ dtype=xs[0].dtype,
96
+ device=xs[0].device)
97
+ else:
98
+ raise ValueError(f"Unsupported ndim: {ndim}")
99
+ pad_res.fill_(pad_value)
100
+ for i in range(batchs):
101
+ pad_res[i, :len(xs[i])] = xs[i]
102
+ return pad_res
103
+
104
+
105
+ def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor,
106
+ ignore_label: int) -> torch.Tensor:
107
+ """Calculate accuracy.
108
+
109
+ Args:
110
+ pad_outputs (Tensor): Prediction tensors (B * Lmax, D).
111
+ pad_targets (LongTensor): Target label tensors (B, Lmax).
112
+ ignore_label (int): Ignore label id.
113
+
114
+ Returns:
115
+ torch.Tensor: Accuracy value (0.0 - 1.0).
116
+
117
+ """
118
+ pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1),
119
+ pad_outputs.size(1)).argmax(2)
120
+ mask = pad_targets != ignore_label
121
+ numerator = torch.sum(
122
+ pad_pred.masked_select(mask) == pad_targets.masked_select(mask))
123
+ denominator = torch.sum(mask)
124
+ return (numerator / denominator).detach()
125
+
126
+
127
+ def get_padding(kernel_size, dilation=1):
128
+ return int((kernel_size * dilation - dilation) / 2)
129
+
130
+
131
+ def init_weights(m, mean=0.0, std=0.01):
132
+ classname = m.__class__.__name__
133
+ if classname.find("Conv") != -1:
134
+ m.weight.data.normal_(mean, std)
135
+
136
+
137
+ # Repetition Aware Sampling in VALL-E 2
138
+ def ras_sampling(weighted_scores, decoded_tokens, sampling, top_p=0.8, top_k=25, win_size=10, tau_r=0.1):
139
+ top_ids = nucleus_sampling(weighted_scores, top_p=top_p, top_k=top_k)
140
+ rep_num = (torch.tensor(decoded_tokens[-win_size:]).to(weighted_scores.device) == top_ids).sum().item()
141
+ if rep_num >= win_size * tau_r:
142
+ top_ids = random_sampling(weighted_scores, decoded_tokens, sampling)
143
+ return top_ids
144
+
145
+
146
+ def nucleus_sampling(weighted_scores, top_p=0.8, top_k=25):
147
+ prob, indices = [], []
148
+ cum_prob = 0.0
149
+ sorted_value, sorted_idx = weighted_scores.softmax(dim=0).sort(descending=True, stable=True)
150
+ for i in range(len(sorted_idx)):
151
+ # sampling both top-p and numbers.
152
+ if cum_prob < top_p and len(prob) < top_k:
153
+ cum_prob += sorted_value[i]
154
+ prob.append(sorted_value[i])
155
+ indices.append(sorted_idx[i])
156
+ else:
157
+ break
158
+ prob = torch.tensor(prob).to(weighted_scores)
159
+ indices = torch.tensor(indices, dtype=torch.long).to(weighted_scores.device)
160
+ top_ids = indices[prob.multinomial(1, replacement=True)].item()
161
+ return top_ids
162
+
163
+
164
+ def random_sampling(weighted_scores, decoded_tokens, sampling):
165
+ top_ids = weighted_scores.softmax(dim=0).multinomial(1, replacement=True).item()
166
+ return top_ids
167
+
168
+
169
+ def fade_in_out(fade_in_mel, fade_out_mel, window):
170
+ device = fade_in_mel.device
171
+ fade_in_mel, fade_out_mel = fade_in_mel.cpu(), fade_out_mel.cpu()
172
+ mel_overlap_len = int(window.shape[0] / 2)
173
+ if fade_in_mel.device == torch.device('cpu'):
174
+ fade_in_mel = fade_in_mel.clone()
175
+ fade_in_mel[..., :mel_overlap_len] = fade_in_mel[..., :mel_overlap_len] * window[:mel_overlap_len] + \
176
+ fade_out_mel[..., -mel_overlap_len:] * window[mel_overlap_len:]
177
+ return fade_in_mel.to(device)
178
+
179
+
180
+ def set_all_random_seed(seed):
181
+ random.seed(seed)
182
+ np.random.seed(seed)
183
+ torch.manual_seed(seed)
184
+ torch.cuda.manual_seed_all(seed)
185
+
186
+
187
+ def mask_to_bias(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
188
+ assert mask.dtype == torch.bool
189
+ assert dtype in [torch.float32, torch.bfloat16, torch.float16]
190
+ mask = mask.to(dtype)
191
+ # attention mask bias
192
+ # NOTE(Mddct): torch.finfo jit issues
193
+ # chunk_masks = (1.0 - chunk_masks) * torch.finfo(dtype).min
194
+ mask = (1.0 - mask) * -1.0e+10
195
+ return mask
196
+
197
+
198
+ class TrtContextWrapper:
199
+ def __init__(self, trt_engine, trt_concurrent=1, device='cuda:0'):
200
+ self.trt_context_pool = queue.Queue(maxsize=trt_concurrent)
201
+ self.trt_engine = trt_engine
202
+ for _ in range(trt_concurrent):
203
+ trt_context = trt_engine.create_execution_context()
204
+ trt_stream = torch.cuda.stream(torch.cuda.Stream(device))
205
+ assert trt_context is not None, 'failed to create trt context, maybe not enough CUDA memory, try reducing trt_concurrent {}'.format(trt_concurrent)
206
+ self.trt_context_pool.put([trt_context, trt_stream])
207
+ assert self.trt_context_pool.empty() is False, 'no available estimator context'
208
+
209
+ def acquire_estimator(self):
210
+ return self.trt_context_pool.get(), self.trt_engine
211
+
212
+ def release_estimator(self, context, stream):
213
+ self.trt_context_pool.put([context, stream])
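A toy run of the repetition-aware sampling defined above; the scores and the token history are fabricated purely for illustration:

```python
import torch
from cosyvoice.utils.common import ras_sampling, set_all_random_seed

set_all_random_seed(0)
weighted_scores = torch.randn(100)   # unnormalized scores over 100 speech tokens
decoded_tokens = [7] * 10            # degenerate history that keeps repeating token 7
token = ras_sampling(weighted_scores, decoded_tokens, sampling=None,
                     top_p=0.8, top_k=25, win_size=10, tau_r=0.1)
print(token)                         # falls back to random sampling whenever token 7 is drawn again
```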
cosyvoice/utils/executor.py ADDED
@@ -0,0 +1,176 @@
1
+ # Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)
2
+ # 2024 Alibaba Inc (authors: Xiang Lyu)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import logging
17
+ from contextlib import nullcontext
18
+ import os
19
+
20
+ import torch
21
+ import torch.distributed as dist
22
+
23
+ from cosyvoice.utils.train_utils import update_parameter_and_lr, log_per_step, log_per_save, batch_forward, batch_backward, save_model, cosyvoice_join
24
+
25
+
26
+ class Executor:
27
+
28
+ def __init__(self, gan: bool = False, ref_model: torch.nn.Module = None, dpo_loss: torch.nn.Module = None):
29
+ self.gan = gan
30
+ self.ref_model = ref_model
31
+ self.dpo_loss = dpo_loss
32
+ self.step = 0
33
+ self.epoch = 0
34
+ self.rank = int(os.environ.get('RANK', 0))
35
+ self.device = torch.device('cuda:{}'.format(self.rank))
36
+
37
+ def train_one_epoc(self, model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, scaler, group_join, ref_model=None):
38
+ ''' Train one epoch
39
+ '''
40
+
41
+ lr = optimizer.param_groups[0]['lr']
42
+ logging.info('Epoch {} TRAIN info lr {} rank {}'.format(self.epoch, lr, self.rank))
43
+ logging.info('using accumulate grad, new batch size is {} times'
44
+ ' larger than before'.format(info_dict['accum_grad']))
45
+ # A context manager to be used in conjunction with an instance of
46
+ # torch.nn.parallel.DistributedDataParallel to be able to train
47
+ # with uneven inputs across participating processes.
48
+ model.train()
49
+ if self.ref_model is not None:
50
+ self.ref_model.eval()
51
+ model_context = model.join if info_dict['train_engine'] == 'torch_ddp' else nullcontext
52
+ with model_context():
53
+ for batch_idx, batch_dict in enumerate(train_data_loader):
54
+ info_dict["tag"] = "TRAIN"
55
+ info_dict["step"] = self.step
56
+ info_dict["epoch"] = self.epoch
57
+ info_dict["batch_idx"] = batch_idx
58
+ if cosyvoice_join(group_join, info_dict):
59
+ break
60
+
61
+ # Disable gradient synchronizations across DDP processes.
62
+ # Within this context, gradients will be accumulated on module
63
+ # variables, which will later be synchronized.
64
+ if info_dict['train_engine'] == 'torch_ddp' and (batch_idx + 1) % info_dict["accum_grad"] != 0:
65
+ context = model.no_sync
66
+ # Used for single gpu training and DDP gradient synchronization
67
+ # processes.
68
+ else:
69
+ context = nullcontext
70
+
71
+ with context():
72
+ info_dict = batch_forward(model, batch_dict, scaler, info_dict, ref_model=self.ref_model, dpo_loss=self.dpo_loss)
73
+ info_dict = batch_backward(model, scaler, info_dict)
74
+
75
+ info_dict = update_parameter_and_lr(model, optimizer, scheduler, scaler, info_dict)
76
+ log_per_step(writer, info_dict)
77
+ # NOTE specify save_per_step in cosyvoice.yaml if you want to enable step save
78
+ if info_dict['save_per_step'] > 0 and (self.step + 1) % info_dict['save_per_step'] == 0 and \
79
+ (batch_idx + 1) % info_dict["accum_grad"] == 0:
80
+ dist.barrier()
81
+ self.cv(model, cv_data_loader, writer, info_dict, on_batch_end=False)
82
+ model.train()
83
+ if (batch_idx + 1) % info_dict["accum_grad"] == 0:
84
+ self.step += 1
85
+ dist.barrier()
86
+ self.cv(model, cv_data_loader, writer, info_dict, on_batch_end=True)
87
+
88
+ def train_one_epoc_gan(self, model, optimizer, scheduler, optimizer_d, scheduler_d, train_data_loader, cv_data_loader,
89
+ writer, info_dict, scaler, group_join):
90
+ ''' Train one epoch
91
+ '''
92
+
93
+ lr = optimizer.param_groups[0]['lr']
94
+ logging.info('Epoch {} TRAIN info lr {} rank {}'.format(self.epoch, lr, self.rank))
95
+ logging.info('using accumulate grad, new batch size is {} times'
96
+ ' larger than before'.format(info_dict['accum_grad']))
97
+ # A context manager to be used in conjunction with an instance of
98
+ # torch.nn.parallel.DistributedDataParallel to be able to train
99
+ # with uneven inputs across participating processes.
100
+ model.train()
101
+ model_context = model.join if info_dict['train_engine'] == 'torch_ddp' else nullcontext
102
+ with model_context():
103
+ for batch_idx, batch_dict in enumerate(train_data_loader):
104
+ info_dict["tag"] = "TRAIN"
105
+ info_dict["step"] = self.step
106
+ info_dict["epoch"] = self.epoch
107
+ info_dict["batch_idx"] = batch_idx
108
+ if cosyvoice_join(group_join, info_dict):
109
+ break
110
+
111
+ # Disable gradient synchronizations across DDP processes.
112
+ # Within this context, gradients will be accumulated on module
113
+ # variables, which will later be synchronized.
114
+ if info_dict['train_engine'] == 'torch_ddp' and (batch_idx + 1) % info_dict["accum_grad"] != 0:
115
+ context = model.no_sync
116
+ # Used for single gpu training and DDP gradient synchronization
117
+ # processes.
118
+ else:
119
+ context = nullcontext
120
+
121
+ with context():
122
+ batch_dict['turn'] = 'discriminator'
123
+ info_dict = batch_forward(model, batch_dict, scaler, info_dict)
124
+ info_dict = batch_backward(model, scaler, info_dict)
125
+ info_dict = update_parameter_and_lr(model, optimizer_d, scheduler_d, scaler, info_dict)
126
+ optimizer.zero_grad()
127
+ log_per_step(writer, info_dict)
128
+ with context():
129
+ batch_dict['turn'] = 'generator'
130
+ info_dict = batch_forward(model, batch_dict, scaler, info_dict)
131
+ info_dict = batch_backward(model, scaler, info_dict)
132
+ info_dict = update_parameter_and_lr(model, optimizer, scheduler, scaler, info_dict)
133
+ optimizer_d.zero_grad()
134
+ log_per_step(writer, info_dict)
135
+ # NOTE specify save_per_step in cosyvoice.yaml if you want to enable step save
136
+ if info_dict['save_per_step'] > 0 and (self.step + 1) % info_dict['save_per_step'] == 0 and \
137
+ (batch_idx + 1) % info_dict["accum_grad"] == 0:
138
+ dist.barrier()
139
+ self.cv(model, cv_data_loader, writer, info_dict, on_batch_end=False)
140
+ model.train()
141
+ if (batch_idx + 1) % info_dict["accum_grad"] == 0:
142
+ self.step += 1
143
+ dist.barrier()
144
+ self.cv(model, cv_data_loader, writer, info_dict, on_batch_end=True)
145
+
146
+ @torch.inference_mode()
147
+ def cv(self, model, cv_data_loader, writer, info_dict, on_batch_end=True):
148
+ ''' Cross validation
149
+ '''
150
+ logging.info('Epoch {} Step {} on_batch_end {} CV rank {}'.format(self.epoch, self.step + 1, on_batch_end, self.rank))
151
+ model.eval()
152
+ total_num_utts, total_loss_dict = 0, {} # avoid division by 0
153
+ for batch_idx, batch_dict in enumerate(cv_data_loader):
154
+ info_dict["tag"] = "CV"
155
+ info_dict["step"] = self.step
156
+ info_dict["epoch"] = self.epoch
157
+ info_dict["batch_idx"] = batch_idx
158
+
159
+ num_utts = len(batch_dict["utts"])
160
+ total_num_utts += num_utts
161
+
162
+ if self.gan is True:
163
+ batch_dict['turn'] = 'generator'
164
+ info_dict = batch_forward(model, batch_dict, None, info_dict)
165
+
166
+ for k, v in info_dict['loss_dict'].items():
167
+ if k not in total_loss_dict:
168
+ total_loss_dict[k] = []
169
+ total_loss_dict[k].append(v.mean().item() * num_utts)
170
+ log_per_step(None, info_dict)
171
+ for k, v in total_loss_dict.items():
172
+ total_loss_dict[k] = sum(v) / total_num_utts
173
+ info_dict['loss_dict'] = total_loss_dict
174
+ log_per_save(writer, info_dict)
175
+ model_name = 'epoch_{}_whole'.format(self.epoch) if on_batch_end else 'epoch_{}_step_{}'.format(self.epoch, self.step + 1)
176
+ save_model(model, model_name, info_dict)
cosyvoice/utils/file_utils.py ADDED
@@ -0,0 +1,118 @@
1
+ # Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
2
+ # 2024 Alibaba Inc (authors: Xiang Lyu, Zetao Hu)
3
+ # 2025 Alibaba Inc (authors: Xiang Lyu, Yabin Li)
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import os
18
+ import json
19
+ import torch
20
+ import torchaudio
21
+ import logging
22
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
23
+ logging.basicConfig(level=logging.DEBUG,
24
+ format='%(asctime)s %(levelname)s %(message)s')
25
+
26
+
27
+ def read_lists(list_file):
28
+ lists = []
29
+ with open(list_file, 'r', encoding='utf8') as fin:
30
+ for line in fin:
31
+ lists.append(line.strip())
32
+ return lists
33
+
34
+
35
+ def read_json_lists(list_file):
36
+ lists = read_lists(list_file)
37
+ results = {}
38
+ for fn in lists:
39
+ with open(fn, 'r', encoding='utf8') as fin:
40
+ results.update(json.load(fin))
41
+ return results
42
+
43
+
44
+ def load_wav(wav, target_sr, min_sr=16000):
45
+ speech, sample_rate = torchaudio.load(wav, backend='soundfile')
46
+ speech = speech.mean(dim=0, keepdim=True)
47
+ if sample_rate != target_sr:
48
+ assert sample_rate >= min_sr, 'wav sample rate {} must be at least {}'.format(sample_rate, min_sr)
49
+ speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech)
50
+ return speech
51
+
52
+
53
+ def convert_onnx_to_trt(trt_model, trt_kwargs, onnx_model, fp16):
54
+ import tensorrt as trt
55
+ logging.info("Converting onnx to trt...")
56
+ network_flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
57
+ logger = trt.Logger(trt.Logger.INFO)
58
+ builder = trt.Builder(logger)
59
+ network = builder.create_network(network_flags)
60
+ parser = trt.OnnxParser(network, logger)
61
+ config = builder.create_builder_config()
62
+ config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 32) # 4GB
63
+ if fp16:
64
+ config.set_flag(trt.BuilderFlag.FP16)
65
+ profile = builder.create_optimization_profile()
66
+ # load onnx model
67
+ with open(onnx_model, "rb") as f:
68
+ if not parser.parse(f.read()):
69
+ for error in range(parser.num_errors):
70
+ print(parser.get_error(error))
71
+ raise ValueError('failed to parse {}'.format(onnx_model))
72
+ # set input shapes
73
+ for i in range(len(trt_kwargs['input_names'])):
74
+ profile.set_shape(trt_kwargs['input_names'][i], trt_kwargs['min_shape'][i], trt_kwargs['opt_shape'][i], trt_kwargs['max_shape'][i])
75
+ tensor_dtype = trt.DataType.HALF if fp16 else trt.DataType.FLOAT
76
+ # set input and output data type
77
+ for i in range(network.num_inputs):
78
+ input_tensor = network.get_input(i)
79
+ input_tensor.dtype = tensor_dtype
80
+ for i in range(network.num_outputs):
81
+ output_tensor = network.get_output(i)
82
+ output_tensor.dtype = tensor_dtype
83
+ config.add_optimization_profile(profile)
84
+ engine_bytes = builder.build_serialized_network(network, config)
85
+ # save trt engine
86
+ with open(trt_model, "wb") as f:
87
+ f.write(engine_bytes)
88
+ logging.info("Succesfully convert onnx to trt...")
89
+
90
+
91
+ # NOTE do not support bistream inference as only speech token embedding/head is kept
92
+ def export_cosyvoice2_vllm(model, model_path, device):
93
+ if os.path.exists(model_path):
94
+ return
95
+
96
+ dtype = torch.bfloat16
97
+ # lm_head
98
+ use_bias = True if model.llm_decoder.bias is not None else False
99
+ model.llm.model.lm_head = model.llm_decoder
100
+ # embed_tokens
101
+ embed_tokens = model.llm.model.model.embed_tokens
102
+ model.llm.model.set_input_embeddings(model.speech_embedding)
103
+ model.llm.model.to(device)
104
+ model.llm.model.to(dtype)
105
+ tmp_vocab_size = model.llm.model.config.vocab_size
106
+ tmp_tie_embedding = model.llm.model.config.tie_word_embeddings
107
+ del model.llm.model.generation_config.eos_token_id
108
+ del model.llm.model.config.bos_token_id
109
+ del model.llm.model.config.eos_token_id
110
+ model.llm.model.config.vocab_size = model.speech_embedding.num_embeddings
111
+ model.llm.model.config.tie_word_embeddings = False
112
+ model.llm.model.config.use_bias = use_bias
113
+ model.llm.model.save_pretrained(model_path)
114
+ if use_bias is True:
115
+ os.system('sed -i s@Qwen2ForCausalLM@CosyVoice2ForCausalLM@g {}/config.json'.format(os.path.abspath(model_path)))
116
+ model.llm.model.config.vocab_size = tmp_vocab_size
117
+ model.llm.model.config.tie_word_embeddings = tmp_tie_embedding
118
+ model.llm.model.set_input_embeddings(embed_tokens)
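For example, loading a prompt wav at the 16 kHz rate expected by the frontend; the path below is a placeholder for any mono or stereo wav recorded at 16 kHz or higher:

```python
from cosyvoice.utils.file_utils import load_wav

speech = load_wav('zero_shot_prompt.wav', target_sr=16000)   # placeholder path
print(speech.shape)   # (1, num_samples) after channel averaging and optional resampling
```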
cosyvoice/utils/frontend_utils.py ADDED
@@ -0,0 +1,136 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import re
16
+ import regex
17
+ chinese_char_pattern = re.compile(r'[\u4e00-\u9fff]+')
18
+
19
+
20
+ # whether the text contains Chinese characters
21
+ def contains_chinese(text):
22
+ return bool(chinese_char_pattern.search(text))
23
+
24
+
25
+ # replace special symbol
26
+ def replace_corner_mark(text):
27
+ text = text.replace('²', '平方')
28
+ text = text.replace('³', '立方')
29
+ return text
30
+
31
+
32
+ # remove meaningless symbol
33
+ def remove_bracket(text):
34
+ text = text.replace('(', '').replace(')', '')
35
+ text = text.replace('【', '').replace('】', '')
36
+ text = text.replace('`', '').replace('`', '')
37
+ text = text.replace("——", " ")
38
+ return text
39
+
40
+
41
+ # spell Arabic numerals
42
+ def spell_out_number(text: str, inflect_parser):
43
+ new_text = []
44
+ st = None
45
+ for i, c in enumerate(text):
46
+ if not c.isdigit():
47
+ if st is not None:
48
+ num_str = inflect_parser.number_to_words(text[st: i])
49
+ new_text.append(num_str)
50
+ st = None
51
+ new_text.append(c)
52
+ else:
53
+ if st is None:
54
+ st = i
55
+ if st is not None and st < len(text):
56
+ num_str = inflect_parser.number_to_words(text[st:])
57
+ new_text.append(num_str)
58
+ return ''.join(new_text)
59
+
60
+
61
+ # split paragrah logic:
62
+ # 1. per sentence max len token_max_n, min len token_min_n, merge if last sentence len less than merge_len
63
+ # 2. cal sentence len according to lang
64
+ # 3. split sentence according to puncatation
65
+ def split_paragraph(text: str, tokenize, lang="zh", token_max_n=80, token_min_n=60, merge_len=20, comma_split=False):
66
+ def calc_utt_length(_text: str):
67
+ if lang == "zh":
68
+ return len(_text)
69
+ else:
70
+ return len(tokenize(_text))
71
+
72
+ def should_merge(_text: str):
73
+ if lang == "zh":
74
+ return len(_text) < merge_len
75
+ else:
76
+ return len(tokenize(_text)) < merge_len
77
+
78
+ if lang == "zh":
79
+ pounc = ['。', '?', '!', ';', ':', '、', '.', '?', '!', ';']
80
+ else:
81
+ pounc = ['.', '?', '!', ';', ':']
82
+ if comma_split:
83
+ pounc.extend([',', ','])
84
+
85
+ if text[-1] not in pounc:
86
+ if lang == "zh":
87
+ text += "。"
88
+ else:
89
+ text += "."
90
+
91
+ st = 0
92
+ utts = []
93
+ for i, c in enumerate(text):
94
+ if c in pounc:
95
+ if len(text[st: i]) > 0:
96
+ utts.append(text[st: i] + c)
97
+ if i + 1 < len(text) and text[i + 1] in ['"', '”']:
98
+ tmp = utts.pop(-1)
99
+ utts.append(tmp + text[i + 1])
100
+ st = i + 2
101
+ else:
102
+ st = i + 1
103
+
104
+ final_utts = []
105
+ cur_utt = ""
106
+ for utt in utts:
107
+ if calc_utt_length(cur_utt + utt) > token_max_n and calc_utt_length(cur_utt) > token_min_n:
108
+ final_utts.append(cur_utt)
109
+ cur_utt = ""
110
+ cur_utt = cur_utt + utt
111
+ if len(cur_utt) > 0:
112
+ if should_merge(cur_utt) and len(final_utts) != 0:
113
+ final_utts[-1] = final_utts[-1] + cur_utt
114
+ else:
115
+ final_utts.append(cur_utt)
116
+
117
+ return final_utts
118
+
119
+
120
+ # remove blank between chinese character
121
+ def replace_blank(text: str):
122
+ out_str = []
123
+ for i, c in enumerate(text):
124
+ if c == " ":
125
+ if ((text[i + 1].isascii() and text[i + 1] != " ") and
126
+ (text[i - 1].isascii() and text[i - 1] != " ")):
127
+ out_str.append(c)
128
+ else:
129
+ out_str.append(c)
130
+ return "".join(out_str)
131
+
132
+
133
+ def is_only_punctuation(text):
134
+ # Regular expression: Match strings that consist only of punctuation marks or are empty.
135
+ punctuation_pattern = r'^[\p{P}\p{S}]*$'
136
+ return bool(regex.fullmatch(punctuation_pattern, text))
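A quick check of the paragraph splitting above; the thresholds are deliberately tiny so that the toy text splits, whereas the real frontend passes the model tokenizer and the default lengths:

```python
from cosyvoice.utils.frontend_utils import contains_chinese, split_paragraph

text = "今天天气很好。我们去公园散步。然后再去喝咖啡。"
pieces = split_paragraph(text, tokenize=list, lang="zh",
                         token_max_n=10, token_min_n=5, merge_len=2)
print(contains_chinese(text), pieces)   # True ['今天天气很好。', '我们去公园散步。', '然后再去喝咖啡。']
```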
cosyvoice/utils/losses.py ADDED
@@ -0,0 +1,57 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from typing import Tuple
4
+
5
+
6
+ def tpr_loss(disc_real_outputs, disc_generated_outputs, tau):
7
+ loss = 0
8
+ for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
9
+ m_DG = torch.median((dr - dg))
10
+ L_rel = torch.mean((((dr - dg) - m_DG) ** 2)[dr < dg + m_DG])
11
+ loss += tau - F.relu(tau - L_rel)
12
+ return loss
13
+
14
+
15
+ def mel_loss(real_speech, generated_speech, mel_transforms):
16
+ loss = 0
17
+ for transform in mel_transforms:
18
+ mel_r = transform(real_speech)
19
+ mel_g = transform(generated_speech)
20
+ loss += F.l1_loss(mel_g, mel_r)
21
+ return loss
22
+
23
+
24
+ class DPOLoss(torch.nn.Module):
25
+ """
26
+ DPO Loss
27
+ """
28
+
29
+ def __init__(self, beta: float, label_smoothing: float = 0.0, ipo: bool = False) -> None:
30
+ super().__init__()
31
+ self.beta = beta
32
+ self.label_smoothing = label_smoothing
33
+ self.ipo = ipo
34
+
35
+ def forward(
36
+ self,
37
+ policy_chosen_logps: torch.Tensor,
38
+ policy_rejected_logps: torch.Tensor,
39
+ reference_chosen_logps: torch.Tensor,
40
+ reference_rejected_logps: torch.Tensor,
41
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
42
+ pi_logratios = policy_chosen_logps - policy_rejected_logps
43
+ ref_logratios = reference_chosen_logps - reference_rejected_logps
44
+ logits = pi_logratios - ref_logratios
45
+ if self.ipo:
46
+ losses = (logits - 1 / (2 * self.beta)) ** 2 # Eq. 17 of https://arxiv.org/pdf/2310.12036v2.pdf
47
+ else:
48
+ # Eq. 3 https://ericmitchell.ai/cdpo.pdf; label_smoothing=0 gives original DPO (Eq. 7 of https://arxiv.org/pdf/2305.18290.pdf)
49
+ losses = (
50
+ -F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing)
51
+ - F.logsigmoid(-self.beta * logits) * self.label_smoothing
52
+ )
53
+ loss = losses.mean()
54
+ chosen_rewards = self.beta * (policy_chosen_logps - reference_chosen_logps).detach()
55
+ rejected_rewards = self.beta * (policy_rejected_logps - reference_rejected_logps).detach()
56
+
57
+ return loss, chosen_rewards, rejected_rewards
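A minimal numeric check of `DPOLoss`; the log-probabilities below are fabricated, not produced by any model:

```python
import torch
from cosyvoice.utils.losses import DPOLoss

dpo = DPOLoss(beta=0.1)
policy_chosen = torch.tensor([-5.0]); policy_rejected = torch.tensor([-9.0])
ref_chosen = torch.tensor([-6.0]);    ref_rejected = torch.tensor([-8.0])
loss, chosen_r, rejected_r = dpo(policy_chosen, policy_rejected, ref_chosen, ref_rejected)
print(loss.item(), chosen_r.item(), rejected_r.item())   # chosen reward 0.1, rejected reward -0.1
```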