jsut.py (forked from r9y9/wavenet_vocoder)
"""Preprocessing script for the JSUT corpus (WaveNet vocoder features)."""
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import numpy as np
import os
import audio
from nnmnkwii import preprocessing as P  # provides mulaw / mulaw_quantize used below
from nnmnkwii.datasets import jsut
from nnmnkwii.io import hts
from hparams import hparams
from os.path import exists
import librosa
from wavenet_vocoder.util import is_mulaw_quantize, is_mulaw, is_raw


def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
    """Preprocess the JSUT corpus in `in_dir` and write features to `out_dir`.

    Returns a list of (audio_filename, mel_filename, timesteps, text) tuples.
    """
    executor = ProcessPoolExecutor(max_workers=num_workers)
    futures = []
    transcriptions = jsut.TranscriptionDataSource(
        in_dir, subsets=jsut.available_subsets).collect_files()
    wav_paths = jsut.WavFileDataSource(
        in_dir, subsets=jsut.available_subsets).collect_files()
    for index, (text, wav_path) in enumerate(zip(transcriptions, wav_paths)):
        futures.append(executor.submit(
            partial(_process_utterance, out_dir, index + 1, wav_path, text)))
    return [future.result() for future in tqdm(futures)]


def _process_utterance(out_dir, index, wav_path, text):
    """Preprocess a single utterance: trim silence, encode the waveform,
    compute a mel-spectrogram, and save both to disk as .npy files.
    """
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    sr = hparams.sample_rate

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Trim silence from hts labels if available
    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")
    if exists(lab_path):
        labels = hts.load(lab_path)
        assert "sil" in labels[0][-1]
        assert "sil" in labels[-1][-1]
        # HTS label times are in units of 100 ns; keep the segment between the
        # end of the leading silence and the start of the trailing silence.
        b = int(labels[0][1] * 1e-7 * sr)
        e = int(labels[-1][0] * 1e-7 * sr)
        wav = wav[b:e]
    else:
        wav, _ = librosa.effects.trim(wav, top_db=30)

    if is_mulaw_quantize(hparams.input_type):
        # Mu-law quantize: [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # Mu-law compress (no quantization): [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # Raw waveform: [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft;
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment:
    # ensure length of raw audio is a multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    # Write the spectrograms to disk:
    audio_filename = 'jsut-audio-%05d.npy' % index
    mel_filename = 'jsut-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text)
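

# ---------------------------------------------------------------------------
# Illustrative usage only (not part of the original file): in the upstream
# wavenet_vocoder project this module is driven by a separate preprocessing
# script, so the entry point below is a minimal, hypothetical sketch of how
# build_from_path could be invoked directly. The argument names and the
# metadata file format are assumptions for illustration.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import argparse
    from tqdm import tqdm

    parser = argparse.ArgumentParser(
        description="Preprocess the JSUT corpus into WaveNet training features")
    parser.add_argument("in_dir", help="Path to the root of the downloaded JSUT corpus")
    parser.add_argument("out_dir", help="Directory to write .npy features and metadata")
    parser.add_argument("--num-workers", type=int, default=1)
    args = parser.parse_args()

    os.makedirs(args.out_dir, exist_ok=True)
    metadata = build_from_path(args.in_dir, args.out_dir,
                               num_workers=args.num_workers, tqdm=tqdm)

    # Write one line of metadata per utterance (assumed format):
    # <audio_filename>|<mel_filename>|<timesteps>|<text>
    with open(os.path.join(args.out_dir, "train.txt"), "w", encoding="utf-8") as f:
        for audio_filename, mel_filename, timesteps, text in metadata:
            f.write("|".join([audio_filename, mel_filename, str(timesteps), text]) + "\n")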