Commit 65c37b2

Merge pull request #55 from Iamgoofball/multispeaker_adjustments
Crepe Support + Changed Multispeaker Training
2 parents b2c22cd + 6bf1c33 commit 65c37b2

File tree

6 files changed: +204 -28 lines changed

lib/rvc/pipeline.py

Lines changed: 85 additions & 1 deletion
@@ -8,7 +8,8 @@
 import scipy.signal as signal
 import torch
 import torch.nn.functional as F
-
+import torchcrepe
+from torch import Tensor
 # from faiss.swigfaiss_avx2 import IndexIVFFlat # cause crash on windows' faiss-cpu installed from pip
 from fairseq.models.hubert import HubertModel

@@ -51,6 +52,85 @@ def __init__(self, tgt_sr: int, device: Union[str, torch.device], is_half: bool)
         self.device = device
         self.is_half = is_half

+    def get_optimal_torch_device(self, index: int = 0) -> torch.device:
+        # Get cuda device
+        if torch.cuda.is_available():
+            return torch.device(f"cuda:{index % torch.cuda.device_count()}")  # Very fast
+        elif torch.backends.mps.is_available():
+            return torch.device("mps")
+        # Insert an else here to grab "xla" devices if available. TO DO later. Requires the torch_xla.core.xla_model library
+        # Else wise return the "cpu" as a torch device
+        return torch.device("cpu")
+
+    def get_f0_crepe_computation(
+        self,
+        x,
+        f0_min,
+        f0_max,
+        p_len,
+        hop_length=64,  # 512 before. Hop length changes the speed that the voice jumps to a different dramatic pitch. Lower hop lengths means more pitch accuracy but longer inference time.
+        model="full",  # Either use crepe-tiny "tiny" or crepe "full". Default is full
+    ):
+        x = x.astype(np.float32)  # fixes the F.conv2D exception. We needed to convert double to float.
+        x /= np.quantile(np.abs(x), 0.999)
+        torch_device = self.get_optimal_torch_device()
+        audio = torch.from_numpy(x).to(torch_device, copy=True)
+        audio = torch.unsqueeze(audio, dim=0)
+        if audio.ndim == 2 and audio.shape[0] > 1:
+            audio = torch.mean(audio, dim=0, keepdim=True).detach()
+        audio = audio.detach()
+        print("Initiating prediction with a crepe_hop_length of: " + str(hop_length))
+        pitch: Tensor = torchcrepe.predict(
+            audio,
+            self.sr,
+            hop_length,
+            f0_min,
+            f0_max,
+            model,
+            batch_size=hop_length * 2,
+            device=torch_device,
+            pad=True
+        )
+        p_len = p_len or x.shape[0] // hop_length
+        # Resize the pitch for final f0
+        source = np.array(pitch.squeeze(0).cpu().float().numpy())
+        source[source < 0.001] = np.nan
+        target = np.interp(
+            np.arange(0, len(source) * p_len, len(source)) / p_len,
+            np.arange(0, len(source)),
+            source
+        )
+        f0 = np.nan_to_num(target)
+        return f0  # Resized f0
+
+    def get_f0_official_crepe_computation(
+        self,
+        x,
+        f0_min,
+        f0_max,
+        model="full",
+    ):
+        # Pick a batch size that doesn't cause memory errors on your gpu
+        batch_size = 512
+        # Compute pitch using first gpu
+        audio = torch.tensor(np.copy(x))[None].float()
+        f0, pd = torchcrepe.predict(
+            audio,
+            self.sr,
+            self.window,
+            f0_min,
+            f0_max,
+            model,
+            batch_size=batch_size,
+            device=self.device,
+            return_periodicity=True,
+        )
+        pd = torchcrepe.filter.median(pd, 3)
+        f0 = torchcrepe.filter.mean(f0, 3)
+        f0[pd < 0.1] = 0
+        f0 = f0[0].cpu().numpy()
+        return f0
+
     def get_f0(
         self,
         x: np.ndarray,

@@ -84,6 +164,10 @@ def get_f0(
             )
             f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
             f0 = signal.medfilt(f0, 3)
+        elif f0_method == "mangio-crepe":
+            f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, 160, "full")
+        elif f0_method == "crepe":
+            f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "full")

         f0 *= pow(2, f0_up_key / 12)
         tf0 = self.sr // self.window  # f0 points per second
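
For context on the resize step in get_f0_crepe_computation: torchcrepe produces one pitch value per hop, so the raw track has roughly len(x) // hop_length frames and is stretched to the p_len frames the rest of the pipeline expects. A minimal standalone sketch of that interpolation, with made-up frame counts (100 source frames, a p_len of 150) purely for illustration:

import numpy as np

# Pretend crepe produced 100 f0 frames but the pipeline expects 150 (p_len).
source = np.abs(np.random.randn(100)) * 200.0 + 50.0  # fake f0 values in Hz
source[source < 0.001] = np.nan  # values under the threshold would become NaN (none do in this fake data)
p_len = 150

# Same resize as in get_f0_crepe_computation: evaluate p_len evenly spaced
# positions over the source index range and linearly interpolate.
target = np.interp(
    np.arange(0, len(source) * p_len, len(source)) / p_len,
    np.arange(0, len(source)),
    source,
)
f0 = np.nan_to_num(target)  # NaN (unvoiced) frames back to 0
print(source.shape, target.shape)  # (100,) (150,)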

lib/rvc/preprocessing/extract_f0.py

Lines changed: 86 additions & 0 deletions
@@ -5,10 +5,92 @@

 import numpy as np
 import pyworld
+import torch
+import torchcrepe
+from torch import Tensor
 from tqdm import tqdm

 from lib.rvc.utils import load_audio

+def get_optimal_torch_device(index: int = 0) -> torch.device:
+    # Get cuda device
+    if torch.cuda.is_available():
+        return torch.device(f"cuda:{index % torch.cuda.device_count()}")  # Very fast
+    elif torch.backends.mps.is_available():
+        return torch.device("mps")
+    # Insert an else here to grab "xla" devices if available. TO DO later. Requires the torch_xla.core.xla_model library
+    # Else wise return the "cpu" as a torch device
+    return torch.device("cpu")
+
+def get_f0_official_crepe_computation(
+    x,
+    sr,
+    f0_min,
+    f0_max,
+    model="full",
+):
+    batch_size = 512
+    torch_device = get_optimal_torch_device()
+    audio = torch.tensor(np.copy(x))[None].float()
+    f0, pd = torchcrepe.predict(
+        audio,
+        sr,
+        160,
+        f0_min,
+        f0_max,
+        model,
+        batch_size=batch_size,
+        device=torch_device,
+        return_periodicity=True,
+    )
+    pd = torchcrepe.filter.median(pd, 3)
+    f0 = torchcrepe.filter.mean(f0, 3)
+    f0[pd < 0.1] = 0
+    f0 = f0[0].cpu().numpy()
+    f0 = f0[1:]  # Get rid of extra first frame
+    return f0
+
+def get_f0_crepe_computation(
+    x,
+    sr,
+    f0_min,
+    f0_max,
+    hop_length=160,  # 512 before. Hop length changes the speed that the voice jumps to a different dramatic pitch. Lower hop lengths means more pitch accuracy but longer inference time.
+    model="full",  # Either use crepe-tiny "tiny" or crepe "full". Default is full
+):
+    x = x.astype(np.float32)  # fixes the F.conv2D exception. We needed to convert double to float.
+    x /= np.quantile(np.abs(x), 0.999)
+    torch_device = get_optimal_torch_device()
+    audio = torch.from_numpy(x).to(torch_device, copy=True)
+    audio = torch.unsqueeze(audio, dim=0)
+    if audio.ndim == 2 and audio.shape[0] > 1:
+        audio = torch.mean(audio, dim=0, keepdim=True).detach()
+    audio = audio.detach()
+    print("Initiating prediction with a crepe_hop_length of: " + str(hop_length))
+    pitch: Tensor = torchcrepe.predict(
+        audio,
+        sr,
+        hop_length,
+        f0_min,
+        f0_max,
+        model,
+        batch_size=hop_length * 2,
+        device=torch_device,
+        pad=True
+    )
+    p_len = x.shape[0] // hop_length
+    # Resize the pitch for final f0
+    source = np.array(pitch.squeeze(0).cpu().float().numpy())
+    source[source < 0.001] = np.nan
+    target = np.interp(
+        np.arange(0, len(source) * p_len, len(source)) / p_len,
+        np.arange(0, len(source)),
+        source
+    )
+    f0 = np.nan_to_num(target)
+    f0 = f0[1:]  # Get rid of extra first frame
+    return f0  # Resized f0
+

 def compute_f0(
     path: str,

@@ -37,6 +119,10 @@ def compute_f0(
             frame_period=1000 * hop / fs,
         )
         f0 = pyworld.stonemask(x.astype(np.double), f0, t, fs)
+    elif f0_method == "mangio-crepe":
+        f0 = get_f0_crepe_computation(x, fs, f0_min, f0_max, 160, "full")
+    elif f0_method == "crepe":
+        f0 = get_f0_official_crepe_computation(x.astype(np.double), fs, f0_min, f0_max, "full")
     return f0
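
A quick note on the numbers above, as a rough sanity check rather than anything stated in the commit: with the hop of 160 samples passed to both crepe branches, each f0 frame covers 160 / fs seconds, so at an assumed 16 kHz dataset rate that is 10 ms per frame.

# Rough frame-count arithmetic for the crepe extractors above.
# The 16 kHz rate and 3-second clip are assumptions for illustration only.
fs = 16000
hop = 160
clip_seconds = 3.0

samples = int(fs * clip_seconds)
frames = samples // hop  # one f0 value per hop
print(frames, "frames,", 1000 * hop / fs, "ms per frame")  # 300 frames, 10.0 ms per frame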

lib/rvc/train.py

Lines changed: 27 additions & 22 deletions
@@ -13,6 +13,7 @@
 import torch.multiprocessing as mp
 import torchaudio
 import tqdm
+import json
 from sklearn.cluster import MiniBatchKMeans
 from torch.cuda.amp import GradScaler, autocast
 from torch.nn import functional as F

@@ -56,33 +57,37 @@ def glob_dataset(
     recursive: bool = True,
 ):
     globs = glob_str.split(",")
+    speaker_count = 0
     datasets_speakers = []
+    speaker_to_id_mapping = {}
     for glob_str in globs:
         if os.path.isdir(glob_str):
-            files = os.listdir(glob_str)
             if multiple_speakers:
-                # pattern: {glob_str}/{decimal}[_]* and isdir
-                multi_speakers_dir = [
-                    (os.path.join(glob_str, f), int(f.split("_")[0]))
-                    for f in files
-                    if os.path.isdir(os.path.join(glob_str, f))
-                    and f.split("_")[0].isdecimal()
-                ]
-
-                if len(multi_speakers_dir) > 0:
-                    # multi speakers at once train
-                    datasets_speakers = [
-                        (file, dir[1])
-                        for dir in multi_speakers_dir
-                        for file in glob.iglob(
-                            os.path.join(dir[0], "*"), recursive=recursive
-                        )
-                        if is_audio_file(file)
-                    ]
-                    continue
+                # Multispeaker format:
+                # dataset_path/
+                #   - speakername/
+                #     - {wav name here}.wav
+                #     - ...
+                #   - next_speakername/
+                #     - {wav name here}.wav
+                #     - ...
+                #   - ...
+                print("Multispeaker dataset enabled; Processing speakers.")
+                for dir in tqdm.tqdm(os.listdir(glob_str)):
+                    print("Speaker ID " + str(speaker_count) + ": " + dir)
+                    speaker_to_id_mapping[dir] = speaker_count
+                    speaker_path = glob_str + "/" + dir
+                    for audio in tqdm.tqdm(os.listdir(speaker_path)):
+                        if is_audio_file(glob_str + "/" + dir + "/" + audio):
+                            datasets_speakers.append((glob_str + "/" + dir + "/" + audio, speaker_count))
+                    speaker_count += 1
+                with open("./speaker_info.json", "w") as outfile:
+                    print("Dumped speaker info to ./speaker_info.json")
+                    json.dump(speaker_to_id_mapping, outfile)
+                continue  # Skip the normal speaker extend

         glob_str = os.path.join(glob_str, "**", "*")
-
+        print("Single speaker dataset enabled; Processing speaker as ID " + str(speaker_id) + ".")
         datasets_speakers.extend(
             [
                 (file, speaker_id)

@@ -91,7 +96,7 @@ def glob_dataset(
             ]
         )

-    return sorted(datasets_speakers, key=operator.itemgetter(0))
+    return sorted(datasets_speakers)


 def create_dataset_meta(training_dir: str, f0: bool):
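
To make the new multispeaker bookkeeping concrete, here is a simplified standalone sketch of the ID assignment and the speaker_info.json dump. The speaker names are invented, and unlike the code above this sketch skips non-directory entries rather than assuming every entry under the dataset root is a speaker folder:

import json
import os

def build_speaker_mapping(dataset_path: str) -> dict:
    # Every sub-directory of the dataset root becomes one speaker,
    # numbered in the order os.listdir returns them.
    mapping = {}
    speaker_count = 0
    for name in os.listdir(dataset_path):
        if not os.path.isdir(os.path.join(dataset_path, name)):
            continue
        mapping[name] = speaker_count
        speaker_count += 1
    with open("./speaker_info.json", "w") as outfile:
        json.dump(mapping, outfile)
    return mapping

# For a layout like dataset/alice/*.wav and dataset/bob/*.wav this would
# write {"alice": 0, "bob": 1} to ./speaker_info.json.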

modules/tabs/inference.py

Lines changed: 2 additions & 2 deletions
@@ -22,8 +22,8 @@ def inference_options_ui(show_out_dir=True):
             minimum=-20, maximum=20, value=0, step=1, label="Transpose"
         )
         pitch_extraction_algo = gr.Radio(
-            choices=["dio", "harvest"],
-            value="dio",
+            choices=["dio", "harvest", "mangio-crepe", "crepe"],
+            value="crepe",
             label="Pitch Extraction Algorithm",
         )
         embedding_model = gr.Radio(
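
The new radio choices are the same strings matched against f0_method in lib/rvc/pipeline.py and lib/rvc/preprocessing/extract_f0.py. A minimal sketch of how such a choice can be read back out of a Gradio radio component; the callback wiring here is illustrative only, not the repo's actual UI code:

import gradio as gr

with gr.Blocks() as demo:
    algo = gr.Radio(
        choices=["dio", "harvest", "mangio-crepe", "crepe"],
        value="crepe",
        label="Pitch Extraction Algorithm",
    )
    out = gr.Textbox(label="Selected f0_method")
    # The selected string is what would be passed along as f0_method.
    algo.change(fn=lambda v: v, inputs=algo, outputs=out)

# demo.launch()  # uncomment to try it locally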

modules/tabs/training.py

Lines changed: 2 additions & 2 deletions
@@ -354,8 +354,8 @@ def train_all(
             label="Normalize audio volume when preprocess",
         )
         pitch_extraction_algo = gr.Radio(
-            choices=["dio", "harvest"],
-            value="harvest",
+            choices=["dio", "harvest", "mangio-crepe", "crepe"],
+            value="crepe",
             label="Pitch extraction algorithm",
         )
         with gr.Row().style(equal_height=False):

requirements/main.txt

Lines changed: 2 additions & 1 deletion
@@ -5,13 +5,14 @@ faiss-cpu==1.7.3
 fairseq==0.12.2
 matplotlib==3.7.1
 scipy==1.9.3
-librosa==0.9.2
+librosa==0.9.1
 pyworld==0.3.2
 soundfile==0.12.1
 ffmpeg-python==0.2.0
 pydub==0.25.1
 soxr==0.3.5
 transformers==4.28.1
+torchcrepe==0.0.20

 tensorboard
 tensorboardX
