import torch
import torch.distributed as dist
import torch.multiprocessing as mp
+import torchaudio
import tqdm
from sklearn.cluster import MiniBatchKMeans
from torch.cuda.amp import GradScaler, autocast
from . import commons, utils
from .checkpoints import save
from .config import DatasetMetadata, TrainConfig
-from .data_utils import (
-    DistributedBucketSampler,
-    TextAudioCollate,
-    TextAudioCollateMultiNSFsid,
-    TextAudioLoader,
-    TextAudioLoaderMultiNSFsid,
-)
+from .data_utils import (DistributedBucketSampler, TextAudioCollate,
+                         TextAudioCollateMultiNSFsid, TextAudioLoader,
+                         TextAudioLoaderMultiNSFsid)
from .losses import discriminator_loss, feature_loss, generator_loss, kl_loss
from .mel_processing import mel_spectrogram_torch, spec_to_mel_torch
-from .models import (
-    MultiPeriodDiscriminator,
-    SynthesizerTrnMs256NSFSid,
-    SynthesizerTrnMs256NSFSidNono,
-)
+from .models import (MultiPeriodDiscriminator, SynthesizerTrnMs256NSFSid,
+                     SynthesizerTrnMs256NSFSidNono)
+from .preprocessing.extract_feature import (MODELS_DIR, get_embedder,
+                                            load_embedder)


def is_audio_file(file: str):
@@ -149,6 +145,91 @@ def list_data(dir: str):
        json.dump(meta, f, indent=2)


+def change_speaker(net_g, speaker_info, embedder, embedding_output_layer, phone, phone_lengths, pitch, pitchf, spec_lengths):
+    """
+    Randomly change the formant (speaker timbre) of a batch.
+    Inspired by https://github.com/auspicious3000/contentvec/blob/d746688a32940f4bee410ed7c87ec9cf8ff04f74/contentvec/data/audio/audio_utils_1.py#L179
+    """
+    N = phone.shape[0]
+    device = phone.device
+    dtype = phone.dtype
+
+    f0_bin = 256
+    f0_max = 1100.0
+    f0_min = 50.0
+    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+    f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+
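+    # clamp each utterance's median f0 to a plausible range:
+    # [75, 250] Hz for lower-pitched voices, [100, 400] Hz for higher-pitched ones (split at 200 Hz)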
+    pitch_median = torch.median(pitchf, 1).values
+    lo = 75. + 25. * (pitch_median >= 200).to(dtype=dtype)
+    hi = 250. + 150. * (pitch_median >= 200).to(dtype=dtype)
+    pitch_median = torch.clip(pitch_median, lo, hi).unsqueeze(1)
+
+    shift_pitch = torch.exp2((1. - 2. * torch.rand(N)) / 2).unsqueeze(1).to(device, dtype)  # shift pitch randomly within a one-octave range (±half an octave)
+
+    new_sid = np.random.choice(np.arange(len(speaker_info))[speaker_info > 0], size=N)
+    rel_pitch = pitchf / pitch_median
+    new_pitch_median = torch.from_numpy(speaker_info[new_sid]).to(device, dtype).unsqueeze(1) * shift_pitch
+    new_pitchf = new_pitch_median * rel_pitch
+    new_sid = torch.from_numpy(new_sid).to(device)
+
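+    # convert the new f0 curve to the coarse mel-scale bins (1..255) used for `pitch`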
+    new_pitch = 1127. * torch.log(1. + new_pitchf / 700.)
+    new_pitch = (new_pitch - f0_mel_min) * (f0_bin - 2.) / (f0_mel_max - f0_mel_min) + 1.
+    new_pitch = torch.clip(new_pitch, 1, f0_bin - 1).to(dtype=torch.int)
+
+    new_wave = net_g.infer(phone, phone_lengths, new_pitch, new_pitchf, new_sid)[0]
+    new_wave_16k = torchaudio.functional.resample(new_wave, net_g.sr, 16000, rolloff=0.99).squeeze(1)
+    padding_mask = torch.arange(new_wave_16k.shape[1]).unsqueeze(0).to(device) > (spec_lengths.unsqueeze(1) * 160).to(device)
+
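+    # re-extract phone features from the re-synthesized 16 kHz audio with the feature embedder;
+    # samples beyond each utterance's valid length (spec_lengths * hop 160) are masked out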
+    inputs = {
+        "source": new_wave_16k.to(device, dtype),
+        "padding_mask": padding_mask.to(device),
+        "output_layer": embedding_output_layer
+    }
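+    # features are repeated 2x along time to match the spectrogram frame rate;
+    # 768-dim phone features use the embedder output directly, otherwise final_proj maps it down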
+    logits = embedder.extract_features(**inputs)
+    if phone.shape[-1] == 768:
+        feats = logits[0]
+    else:
+        feats = embedder.final_proj(logits[0])
+    feats = torch.repeat_interleave(feats, 2, 1)
+    new_phone = torch.zeros(phone.shape).to(device, dtype)
+    new_phone[:, :feats.shape[1]] = feats[:, :phone.shape[1]]
+    return new_phone.to(device)
+
+
+def change_speaker_nono(net_g, embedder, embedding_output_layer, phone, phone_lengths, spec_lengths):
+    """
+    Randomly change the formant (speaker timbre) of a batch (no-f0 variant).
+    Inspired by https://github.com/auspicious3000/contentvec/blob/d746688a32940f4bee410ed7c87ec9cf8ff04f74/contentvec/data/audio/audio_utils_1.py#L179
+    """
+    N = phone.shape[0]
+    device = phone.device
+    dtype = phone.dtype
+
+    new_sid = np.random.randint(net_g.spk_embed_dim, size=N)
+    new_sid = torch.from_numpy(new_sid).to(device)
+
+    new_wave = net_g.infer(phone, phone_lengths, new_sid)[0]
+    new_wave_16k = torchaudio.functional.resample(new_wave, net_g.sr, 16000, rolloff=0.99).squeeze(1)
+    padding_mask = torch.arange(new_wave_16k.shape[1]).unsqueeze(0).to(device) > (spec_lengths.unsqueeze(1) * 160).to(device)
+
+    inputs = {
+        "source": new_wave_16k.to(device, dtype),
+        "padding_mask": padding_mask.to(device),
+        "output_layer": embedding_output_layer
+    }
+
+    logits = embedder.extract_features(**inputs)
+    if phone.shape[-1] == 768:
+        feats = logits[0]
+    else:
+        feats = embedder.final_proj(logits[0])
+    feats = torch.repeat_interleave(feats, 2, 1)
+    new_phone = torch.zeros(phone.shape).to(device, dtype)
+    new_phone[:, :feats.shape[1]] = feats[:, :phone.shape[1]]
+    return new_phone.to(device)
+
+
def train_index(
    training_dir: str,
    model_name: str,
@@ -225,6 +306,9 @@ def train_model(
    sample_rate: int,
    f0: bool,
    batch_size: int,
+    augment: bool,
+    augment_path: Optional[str],
+    speaker_info_path: Optional[str],
    cache_batch: bool,
    total_epoch: int,
    save_every_epoch: int,
@@ -261,6 +345,9 @@ def train_model(
                sample_rate,
                f0,
                batch_size,
+                augment,
+                augment_path,
+                speaker_info_path,
                cache_batch,
                total_epoch,
                save_every_epoch,
@@ -284,6 +371,9 @@ def train_model(
            sample_rate,
            f0,
            batch_size,
+            augment,
+            augment_path,
+            speaker_info_path,
            cache_batch,
            total_epoch,
            save_every_epoch,
@@ -319,6 +409,9 @@ def training_runner(
    sample_rate: int,
    f0: bool,
    batch_size: int,
+    augment: bool,
+    augment_path: Optional[str],
+    speaker_info_path: Optional[str],
    cache_in_gpu: bool,
    total_epoch: int,
    save_every_epoch: int,
@@ -395,14 +488,15 @@ def training_runner(
            config.train.segment_size // config.data.hop_length,
            **config.model.dict(),
            is_half=config.train.fp16_run,
-            sr=sample_rate,
+            sr=int(sample_rate[:-1] + "000"),  # sample_rate like "40k" -> 40000
        )
    else:
        net_g = SynthesizerTrnMs256NSFSidNono(
            config.data.filter_length // 2 + 1,
            config.train.segment_size // config.data.hop_length,
            **config.model.dict(),
            is_half=config.train.fp16_run,
+            sr=int(sample_rate[:-1] + "000"),  # sample_rate like "40k" -> 40000
        )

    if is_multi_process:
@@ -440,6 +534,48 @@ def training_runner(
    last_d_state = utils.latest_checkpoint_path(state_dir, "D_*.pth")
    last_g_state = utils.latest_checkpoint_path(state_dir, "G_*.pth")

+    if augment:
+        # load embedder
+        embedder_filepath, _, embedder_load_from = get_embedder(embedder_name)
+
+        if embedder_load_from == "local":
+            embedder_filepath = os.path.join(
+                MODELS_DIR, "embeddings", embedder_filepath
+            )
+        embedder, _ = load_embedder(embedder_filepath, device)
+        if not config.train.fp16_run:
+            embedder = embedder.float()
+
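+        # if an external augmentation checkpoint was given, use it;
+        # otherwise fall back to the model currently being trained (else branch below)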
+        if augment_path is not None:
+            state_dict = torch.load(augment_path, map_location="cpu")
+            if state_dict["f0"] == 1:
+                augment_net_g = SynthesizerTrnMs256NSFSid(
+                    **state_dict["params"], is_half=config.train.fp16_run
+                )
+                augment_speaker_info = np.load(speaker_info_path)
+            else:
+                augment_net_g = SynthesizerTrnMs256NSFSidNono(
+                    **state_dict["params"], is_half=config.train.fp16_run
+                )
+
+            augment_net_g.load_state_dict(state_dict["weight"], strict=False)
+            augment_net_g.eval().to(device)
+
+            if config.train.fp16_run:
+                augment_net_g = augment_net_g.half()
+            else:
+                augment_net_g = augment_net_g.float()
+        else:
+            augment_net_g = net_g
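+            # compute each speaker's median f0 over voiced frames from the dataset;
+            # speakers with a median of 0 (no voiced frames) are skipped when sampling targets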
+            if f0:
+                medians = [[] for _ in range(augment_net_g.spk_embed_dim)]
+                for file in training_meta.files.values():
+                    f0f = np.load(file.f0nsf)
+                    if np.any(f0f > 0):
+                        medians[file.speaker_id].append(np.median(f0f[f0f > 0]))
+                augment_speaker_info = np.array([np.median(x) if len(x) else 0. for x in medians])
+                np.save(os.path.join(training_dir, "speaker_info.npy"), augment_speaker_info)
+
    if last_d_state is None or last_g_state is None:
        epoch = 1
        global_step = 0
@@ -520,6 +656,7 @@ def training_runner(
    cache = []
    progress_bar = tqdm.tqdm(range((total_epoch - epoch + 1) * len(train_loader)))
    progress_bar.set_postfix(epoch=epoch)
+    step = -1
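+    # global batch counter; used to anneal the augmentation mix-in weight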
    for epoch in range(epoch, total_epoch + 1):
        train_loader.batch_sampler.set_epoch(epoch)

@@ -536,6 +673,7 @@ def training_runner(
            shuffle(cache)

        for batch_idx, batch in data:
+            step += 1
            progress_bar.update(1)
            if f0:
                (
@@ -614,6 +752,15 @@ def training_runner(
                )

            with autocast(enabled=config.train.fp16_run):
+                if augment:
+                    with torch.no_grad():
+                        if type(augment_net_g) == SynthesizerTrnMs256NSFSid:
+                            new_phone = change_speaker(augment_net_g, augment_speaker_info, embedder, embedding_output_layer, phone, phone_lengths, pitch, pitchf, spec_lengths)
+                        else:
+                            new_phone = change_speaker_nono(augment_net_g, embedder, embedding_output_layer, phone, phone_lengths, spec_lengths)
+                    weight = np.power(.5, step / len(train_dataset))  # keep mostly the original phone embedding early in training; the weight decays with step
+                    phone = phone * weight + new_phone * (1. - weight)
+
                if f0:
                    (
                        y_hat,