import torch
import torch.distributed as dist
import torch.multiprocessing as mp
+ import torchaudio
import tqdm
from sklearn.cluster import MiniBatchKMeans
from torch.cuda.amp import GradScaler, autocast

from . import commons, utils
from .checkpoints import save
from .config import DatasetMetadata, TrainConfig
- from .data_utils import (
-     DistributedBucketSampler,
-     TextAudioCollate,
-     TextAudioCollateMultiNSFsid,
-     TextAudioLoader,
-     TextAudioLoaderMultiNSFsid,
- )
+ from .data_utils import (DistributedBucketSampler, TextAudioCollate,
+                          TextAudioCollateMultiNSFsid, TextAudioLoader,
+                          TextAudioLoaderMultiNSFsid)
from .losses import discriminator_loss, feature_loss, generator_loss, kl_loss
from .mel_processing import mel_spectrogram_torch, spec_to_mel_torch
- from .models import (
-     MultiPeriodDiscriminator,
-     SynthesizerTrnMs256NSFSid,
-     SynthesizerTrnMs256NSFSidNono,
- )
+ from .models import (MultiPeriodDiscriminator, SynthesizerTrnMs256NSFSid,
+                      SynthesizerTrnMs256NSFSidNono)
+ from .preprocessing.extract_feature import (MODELS_DIR, get_embedder,
+                                             load_embedder)


def is_audio_file(file: str):
@@ -149,6 +145,60 @@ def list_data(dir: str):
        json.dump(meta, f, indent=2)


+ def change_speaker(net_g, embedder, embedding_output_layer, phone, phone_lengths, pitch, pitchf, spec_lengths, sid):
+     """
+     Randomly change the formant: shift each item's pitch toward another (shuffled) speaker's range
+     and re-extract its phone embedding from the re-synthesized audio.
+     Inspired by https://github.com/auspicious3000/contentvec/blob/d746688a32940f4bee410ed7c87ec9cf8ff04f74/contentvec/data/audio/audio_utils_1.py#L179
+     """
+     N = pitchf.shape[0]
+     device = pitchf.device
+     dtype = pitchf.dtype
+
+     f0_bin = 256
+     f0_max = 1100.0
+     f0_min = 50.0
+     f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+     f0_mel_max = 1127 * np.log(1 + f0_max / 700)
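+     # mel scale: 1127 * ln(1 + f / 700); these bounds are used below to quantize the shifted f0 into coarse bins (1 .. f0_bin - 1)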
+
+     pitch_median = torch.median(pitchf, 1).values
+     lo = 75. + 25. * (pitch_median >= 200).to(dtype=dtype)
+     hi = 250. + 150. * (pitch_median >= 200).to(dtype=dtype)
+     pitch_median = torch.clip(pitch_median, lo, hi).unsqueeze(1)
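+     # keep the median in a plausible vocal range: [75, 250] Hz for voices with a median below 200 Hz, [100, 400] Hz otherwise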
+
+     ratio_speaker = torch.pow(.5, 2. * torch.rand(N)).unsqueeze(1).to(device, dtype)  # blend the pitch median toward the target speaker's at a ratio in [0.25, 1]
+     shift_pitch = torch.exp2((1. - 2. * torch.rand(N)) / 2).unsqueeze(1).to(device, dtype)  # additionally shift the pitch within a one-octave range
+
+     shuffle_ixs = np.arange(N)
+     np.random.shuffle(shuffle_ixs)
+     rel_pitch = pitchf / pitch_median
+     new_pitch_median = torch.exp2(torch.log2(pitch_median[shuffle_ixs]) * ratio_speaker + torch.log2(pitch_median) * (1. - ratio_speaker)) * shift_pitch
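+     # new median = geometric interpolation (in log2 space) between the shuffled item's median and this item's own median, times the random octave shift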
+     new_pitchf = new_pitch_median * rel_pitch
+     new_sid = sid[shuffle_ixs]
+
+     new_pitch = 1127. * torch.log(1. + new_pitchf / 700.)
+     new_pitch = (new_pitch - f0_mel_min) * (f0_bin - 2.) / (f0_mel_max - f0_mel_min) + 1.
+     new_pitch = torch.clip(new_pitch, 1, f0_bin - 1).to(dtype=torch.int)
+
+     new_wave = net_g.infer(phone, phone_lengths, new_pitch, new_pitchf, new_sid)[0]
+     new_wave_16k = torchaudio.functional.resample(new_wave, net_g.sr, 16000, rolloff=0.99).squeeze(1)
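+     # mask out samples beyond the valid length; spec_lengths * 160 assumes 10 ms spectrogram frames (160 samples at 16 kHz)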
+     padding_mask = torch.arange(new_wave_16k.shape[1]).unsqueeze(0).to(device) > (spec_lengths.unsqueeze(1) * 160).to(device)
+
+     inputs = {
+         "source": new_wave_16k.to(device, dtype),
+         "padding_mask": padding_mask.to(device),
+         "output_layer": embedding_output_layer
+     }
+     logits = embedder.extract_features(**inputs)
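+     # use the unprojected features when the model expects 768-dim phones; otherwise apply the embedder's final_proj to match the smaller feature size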
+     if phone.shape[-1] == 768:
+         feats = logits[0]
+     else:
+         feats = embedder.final_proj(logits[0])
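+     # the embedder emits roughly one frame per 20 ms; repeating each frame twice matches the 10 ms phone frame rate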
+     feats = torch.repeat_interleave(feats, 2, 1)
+     new_phone = torch.zeros(phone.shape).to(device, dtype)
+     new_phone[:, :feats.shape[1]] = feats[:, :phone.shape[1]]
+     return new_phone.to(device)
+
+
def train_index(
    training_dir: str,
    model_name: str,
@@ -225,6 +275,7 @@ def train_model(
    sample_rate: int,
    f0: bool,
    batch_size: int,
+     augment: bool,
    cache_batch: bool,
    total_epoch: int,
    save_every_epoch: int,
@@ -261,6 +312,7 @@ def train_model(
        sample_rate,
        f0,
        batch_size,
+         augment,
        cache_batch,
        total_epoch,
        save_every_epoch,
@@ -284,6 +336,7 @@ def train_model(
        sample_rate,
        f0,
        batch_size,
+         augment,
        cache_batch,
        total_epoch,
        save_every_epoch,
@@ -319,6 +372,7 @@ def training_runner(
    sample_rate: int,
    f0: bool,
    batch_size: int,
+     augment: bool,
    cache_in_gpu: bool,
    total_epoch: int,
    save_every_epoch: int,
@@ -359,6 +413,17 @@ def training_runner(

    torch.manual_seed(config.train.seed)

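+     # load the same embedder used during preprocessing so augmented audio can be re-embedded on the fly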
+     if augment:
+         embedder_filepath, _, embedder_load_from = get_embedder(embedder_name)
+
+         if embedder_load_from == "local":
+             embedder_filepath = os.path.join(
+                 MODELS_DIR, "embeddings", embedder_filepath
+             )
+         embedder, _ = load_embedder(embedder_filepath, device)
+         if not config.train.fp16_run:
+             embedder = embedder.float()
+
    if f0:
        train_dataset = TextAudioLoaderMultiNSFsid(training_meta, config.data)
    else:
@@ -520,6 +585,7 @@ def training_runner(
    cache = []
    progress_bar = tqdm.tqdm(range((total_epoch - epoch + 1) * len(train_loader)))
    progress_bar.set_postfix(epoch=epoch)
+     step = -1
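+     # global step counter; used below to anneal the augmentation mixing weight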
    for epoch in range(epoch, total_epoch + 1):
        train_loader.batch_sampler.set_epoch(epoch)
@@ -536,6 +602,7 @@ def training_runner(
            shuffle(cache)

        for batch_idx, batch in data:
+             step += 1
            progress_bar.update(1)
            if f0:
                (
@@ -614,6 +681,12 @@ def training_runner(
                )

            with autocast(enabled=config.train.fp16_run):
+                 if f0 and augment:
+                     with torch.no_grad():
+                         new_phone = change_speaker(net_g, embedder, embedding_output_layer, phone, phone_lengths, pitch, pitchf, spec_lengths, sid)
+                     weight = np.power(.5, step / len(train_dataset))  # early in training, keep the original phone embedding as-is
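+                     # the weight halves every len(train_dataset) steps, so augmented embeddings contribute more as training progresses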
+                     phone = phone * weight + new_phone * (1. - weight)
+
                if f0:
                    (
                        y_hat,