mirror of
https://github.com/hykilpikonna/SpeechGenderAnalysis.git
synced 2026-04-25 07:53:08 -05:00
[+] Runtime testing
This commit is contained in:
parent
3a83aa9a11
commit
33fada11c3
101
experiment/test_runtime.py
Normal file
101
experiment/test_runtime.py
Normal file
|
|
@ -0,0 +1,101 @@
|
|||
import librosa as librosa
|
||||
import numpy as np
|
||||
import parselmouth
|
||||
import torchaudio
|
||||
from inaSpeechSegmenter import Segmenter
|
||||
from inaSpeechSegmenter.features import to_wav, _wav2feats
|
||||
from inaSpeechSegmenter.sidekit_mfcc import read_wav
|
||||
|
||||
from server.utils import Timer
|
||||
|
||||
# Pitch extraction produces NaNs for unvoiced frames; silence the warnings.
np.seterr(invalid='ignore')


if __name__ == '__main__':
    # Benchmark script: times audio decoding and feature extraction across
    # several backends (Parselmouth, librosa, sidekit read_wav, torchaudio,
    # inaSpeechSegmenter). Each step is timed twice where cache warm-up
    # matters (cold vs. warm run).
    f = '/workspace/EECS 6414/voice_cnn/VT 150hz baseline example.mp3'
    timer = Timer()

    # Load the segmenter once and run it on a throwaway file so model
    # loading is excluded from the timed sections below.
    seg = Segmenter()
    seg('/workspace/EECS 6414/voice_cnn/test.wav')
    timer.log('ML engine loaded (One time expense, not counted in running time).')

    print()

    # --- Conversion: mp3 -> wav via ffmpeg ---
    # NOTE: the second conversion overwrites fp, so all reads below use the
    # original-sample-rate wav.
    fp = str(to_wav(f).absolute())
    timer.log('FFMPEG Convert file to WAV 16000Hz')
    fp = str(to_wav(f, sr=None).absolute())
    timer.log('FFMPEG Convert file to WAV (original sr kept)')

    print()

    # --- Read file (cold, then warm) ---
    parselmouth.Sound(fp)
    timer.log('Parselmouth: Read file.')
    sound = parselmouth.Sound(fp)
    timer.log('Parselmouth: Read file.')

    # librosa.load(fp)
    # timer.log('Librosa: Read file')
    # librosa.load(fp)
    # timer.log('Librosa: Read file')

    read_wav(fp)
    timer.log('Read file with read_wav')
    y, sr, _ = read_wav(fp)
    timer.log(f'Read file with read_wav (decoded sr = {sr})')

    torchaudio.load(fp)
    timer.log('Read file with torchaudio')
    torchaudio.load(fp)
    timer.log('Read file with torchaudio')

    print()

    # --- Parselmouth pitch (time steps in seconds) ---
    sound.to_pitch(0.01)
    timer.log('Parselmouth: Pitch calculated (0.01)')
    sound.to_pitch(0.01)
    timer.log('Parselmouth: Pitch calculated again (0.01)')
    sound.to_pitch(0.032)
    timer.log('Parselmouth: Pitch calculated again (0.032)')

    print()

    # --- Parselmouth formants ---
    sound.to_formant_burg(0.01)
    timer.log('Parselmouth: Formant calculated (0.01)')
    sound.to_formant_burg(0.01)
    timer.log('Parselmouth: Formant calculated again (0.01)')
    sound.to_formant_burg(0.032)
    timer.log('Parselmouth: Formant calculated again (0.032)')

    print()

    # --- Parselmouth spectrogram (window/step in seconds) ---
    sound.to_spectrogram(window_length=0.128, time_step=0.032)
    timer.log('Parselmouth: Spectrogram calculated (n_fft=0.128, step=0.032)')

    print()

    # --- librosa features (results discarded; only timing matters) ---
    librosa.core.piptrack(y=y, sr=sr)
    timer.log('Librosa: piptrack')
    np.abs(librosa.stft(y, n_fft=1024, hop_length=512))
    timer.log('Librosa: STFT')
    librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=512, htk=True)
    timer.log('Librosa: Mel spectrogram')

    print()

    # --- inaSpeechSegmenter pipeline, stage by stage ---
    _wav2feats(fp)
    timer.log('ML: Calculate mspect feats')
    mspect, loge, diff_len = _wav2feats(fp)
    timer.log('ML: Calculate mspect feats')

    seg.segment_feats(mspect, loge, diff_len, 0)
    timer.log('ML: Segment feats')
    seg.segment_feats(mspect, loge, diff_len, 0)
    timer.log('ML: Segment feats')

    # Calculate ML
    # seg(f)
    # timer.log('ML Segmented')
    # seg(f)
    # timer.log('ML Segmented again')
|
||||
229
experiment/test_runtime_2.py
Normal file
229
experiment/test_runtime_2.py
Normal file
|
|
@ -0,0 +1,229 @@
|
|||
import base64
|
||||
import zlib
|
||||
from multiprocessing import cpu_count
|
||||
from subprocess import Popen, PIPE
|
||||
|
||||
import librosa
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import parselmouth
|
||||
import tensorflow
|
||||
import torch
|
||||
import torchaudio
|
||||
from inaSpeechSegmenter.features import to_wav
|
||||
from inaSpeechSegmenter.sidekit_mfcc import read_wav, mfcc
|
||||
import tensorflow as tf
|
||||
import tensorflow_io as tfio
|
||||
from tqdm.contrib.concurrent import process_map
|
||||
|
||||
from server.utils import Timer
|
||||
|
||||
|
||||
def test_readfile(file: str, iterations: int):
    """Benchmark four audio-loading backends on the same file.

    Runs each loader `iterations` times and records the elapsed time of
    every call. Returns a DataFrame with one row per iteration and one
    column per backend.
    """
    timer = Timer()
    loaders = (parselmouth.Sound, librosa.load, read_wav, torchaudio.load)
    rows = []
    for _ in range(iterations):
        row = []
        for load in loaders:
            load(file)  # result discarded; only the load time matters
            row.append(timer.elapsed())
        rows.append(row)
    return pd.DataFrame(rows, columns=['Parselmouth', 'librosa', 'read_wav', 'torchaudio'])
|
||||
|
||||
|
||||
def _run_checked(args: list) -> None:
    """Run an external command; assert success, surfacing stderr on failure."""
    p = Popen(args, stdout=PIPE, stderr=PIPE)
    _, error = p.communicate()
    assert p.returncode == 0, error


def test_resampling(file: str, iterations: int, resample: bool):
    """Benchmark audio-to-WAV conversion with ffmpeg, sox and mplayer.

    When `resample` is True the output is resampled to 16000 Hz, otherwise
    the original sample rate is kept. Writes throwaway files
    (output-sox.wav / output-mplayer.wav) into the working directory.
    Returns a DataFrame with one row per iteration and one column per tool.
    """
    results = []
    timer = Timer()
    sr = 16000 if resample else None
    for _ in range(iterations):
        result = []
        results.append(result)

        # FFMPEG (via inaSpeechSegmenter's to_wav wrapper)
        to_wav(file, sr=sr)
        result.append(timer.elapsed())

        # SOX: mono, floating-point samples, optional resample
        args = ['sox', file, '-c', '1', '-e', 'floating-point']
        if sr:
            args += ['-r', str(sr)]
        args += ['output-sox.wav']
        _run_checked(args)
        result.append(timer.elapsed())

        # MPlayer: decode to PCM wav; the pan filter downmixes stereo to mono
        args = ['mplayer', '-ao', 'pcm:fast:waveheader:file=output-mplayer.wav', '-vo', 'null', '-vc', 'null']
        if sr:
            args += ['-af', f'resample={sr},pan=1:0.5:0.5']
        else:
            args += ['-af', 'pan=1:0.5:0.5']
        args += [file]
        _run_checked(args)
        result.append(timer.elapsed())

    return pd.DataFrame(results, columns=['ffmpeg', 'sox', 'mplayer'])
|
||||
|
||||
|
||||
def test_spectrogram(y: np.ndarray, sr: int, iterations: int, n_fft=2048, hop_length=512):
    """Benchmark mel-spectrogram computation across five libraries.

    `n_fft`/`hop_length` are given in samples; Parselmouth and sidekit take
    the equivalent values in seconds. Returns a DataFrame with one row per
    iteration and one column per library.
    """
    timer = Timer()
    win_sec = n_fft / sr
    hop_sec = hop_length / sr
    rows = []
    for _ in range(iterations):
        row = []
        rows.append(row)

        # Parselmouth (Praat)
        snd = parselmouth.Sound(y, float(sr))
        snd.to_spectrogram(window_length=win_sec, time_step=hop_sec)
        row.append(timer.elapsed())

        # librosa
        librosa.feature.melspectrogram(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, htk=True)
        row.append(timer.elapsed())

        # sidekit mfcc (mel spectrogram mode)
        mfcc(y.astype(np.float32), get_mspec=True, nwin=win_sec, shift=hop_sec, fs=sr)
        row.append(timer.elapsed())

        # torchaudio (transform construction included in the timing)
        transform = torchaudio.transforms.MelSpectrogram(sample_rate=sr, n_fft=n_fft, hop_length=hop_length)
        transform(torch.from_numpy(y))
        row.append(timer.elapsed())

        # tensorflow-io
        spec = tfio.audio.spectrogram(y, n_fft, n_fft, hop_length)
        tfio.audio.melscale(spec, rate=sr, mels=128, fmin=0, fmax=8000)
        row.append(timer.elapsed())

    return pd.DataFrame(rows, columns=['Parselmouth', 'librosa', 'sidekit', 'torchaudio', 'tensorflow-io'])
|
||||
|
||||
|
||||
def test_pitch(y: np.ndarray, sr: int, iterations: int, n_fft=2048, hop_length=512):
    """Benchmark pitch (f0) estimation across three algorithms.

    `n_fft`/`hop_length` are given in samples; Parselmouth takes its step
    in seconds. Returns a DataFrame with one row per iteration and one
    column per algorithm.
    """
    results = []
    timer = Timer()
    step_s = hop_length / sr  # hop size in seconds for Parselmouth
    for _ in range(iterations):
        result = []
        results.append(result)

        # Parselmouth autocorrelation method (Boersma 1993)
        sound = parselmouth.Sound(y, float(sr))
        sound.to_pitch(time_step=step_s)
        result.append(timer.elapsed())

        # YIN
        librosa.yin(y=y, sr=sr, frame_length=n_fft, hop_length=hop_length, fmin=75, fmax=600)
        result.append(timer.elapsed())

        # probabilistic YIN
        librosa.pyin(y=y, sr=sr, frame_length=n_fft, hop_length=hop_length, fmin=75, fmax=600)
        result.append(timer.elapsed())

        # TODO: essentia (yin, pyin), in-formant (yin, mpm, rapt, irapt)

    return pd.DataFrame(results, columns=['Parselmouth (Boersma 1993)', 'librosa.yin (Kawahara 2002)',
                                          'librosa.pyin (Mauch 2014)'])
|
||||
|
||||
|
||||
def test_formant(y: np.ndarray, sr: int, iterations: int, n_fft=2048, hop_length=512):
    """Benchmark formant estimation (currently Parselmouth's Burg method only).

    `hop_length` is given in samples and converted to seconds for
    Parselmouth; `n_fft` is accepted for signature parity with the other
    test_* benchmarks but unused here. Returns a DataFrame with one row per
    iteration.
    """
    results = []
    timer = Timer()
    step_s = hop_length / sr  # analysis step in seconds
    for _ in range(iterations):
        result = []
        results.append(result)

        sound = parselmouth.Sound(y, float(sr))
        sound.to_formant_burg(time_step=step_s)
        result.append(timer.elapsed())

        # TODO: in-formant (deepformants, filteredlp, simplelp, karma)

    return pd.DataFrame(results, columns=['Parselmouth (Marple 1980)'])
|
||||
|
||||
|
||||
def _formant(args: tuple[np.ndarray, float]):
    """Worker for process_map: extract formants F1-F3 from an audio chunk.

    Takes a single `(samples, sample_rate)` tuple so it can be mapped over
    a list of chunks. Returns an (n_frames, 3) float32 array where column
    j holds formant F(j+1) sampled every 512 samples; frames without a
    measurable formant come back as NaN from Parselmouth.
    """
    y, sr = args
    sound = parselmouth.Sound(y, sr)
    step = 512 / sr  # analysis hop in seconds
    formant = sound.to_formant_burg(time_step=step)
    # np.empty: uninitialized on purpose — every cell is written below.
    result = np.empty((len(formant), 3), dtype='float32')
    for i in range(len(formant)):
        for f in range(1, 4):
            result[i][f - 1] = formant.get_value_at_time(f, i * step)
    return result
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Benchmark driver. Earlier experiments (readfile / resampling /
    # spectrogram / pitch / formant / multiprocessing formants) are kept as
    # commented-out toggles; only the tensorflow-io spectrogram + payload
    # size comparison at the bottom is active.
    f = '/workspace/EECS 6414/voice_cnn/VT 150hz baseline example.mp3'
    # Convert to a 16 kHz wav once; all loaders below read this file.
    fp = str(to_wav(f, sr=16000).absolute())

    # print(read_wav(f))

    # Test readfile
    # df = test_readfile(fp, 10)
    # print(df)
    # print(df.mean())

    # Test resampling
    # df = test_resampling(f, 10, True)
    # print(df)
    # print(df.mean())

    # Decode once; y/sr feed every feature benchmark below.
    y, sr, _ = read_wav(fp)
    #
    # # Tensorflow warm-up
    # t = tfio.audio.spectrogram(y, 1, 1, 2048)
    # tfio.audio.melscale(t, rate=sr, mels=128, fmin=0, fmax=8000)
    # print('Warmup done')
    #
    # # Test mel spect
    # df = test_spectrogram(y, sr, 10)
    # print(df)
    # print(df.mean())

    # Test pitch
    # df = test_pitch(y, sr, 10)
    # print(df)
    # print(df.mean())

    # Test formant
    # df = test_formant(y, sr, 10)
    # print(df)
    # print(df.mean())
    # timer = Timer()
    # split = [(y, float(sr)) for y in np.array_split(y, 512 * 30)]
    # print(split)
    # print(len(split))
    # formants = process_map(_formant, split, max_workers=cpu_count(), chunksize=1)
    # timer.log('Done')
    # print(formants)
    # sound = parselmouth.Sound(y, float(sr))
    # formant = sound.to_formant_burg(time_step=512 / sr)

    # sound.to_formant_burg()

    # Active section: compute a mel spectrogram with tensorflow-io, then
    # compare serialized payload sizes (raw ndarray bytes vs. zlib level-9
    # vs. base64) to estimate transfer cost over the wire.
    n_fft = 2048
    hop_length = 512
    t = tfio.audio.spectrogram(y, n_fft, n_fft, hop_length)
    mel_spectrogram: tf.Tensor = tfio.audio.melscale(t, rate=sr, mels=128, fmin=0, fmax=8000)
    nd: np.ndarray = mel_spectrogram.numpy()
    print(nd)
    print(nd.shape)
    print(nd.dtype)
    by = nd.tobytes()
    print('Raw Numpy bytes:', type(by), f'{len(by) / 1024 / 1024:.2f}mb')
    zl = zlib.compress(by, 9)
    print('zlib compressed (level 9):', type(zl), f'{len(zl) / 1024 / 1024:.2f}mb')
    b6 = base64.b64encode(by)
    print('base64 encoded utf-8:', type(b6), f'{len(b6) / 1024 / 1024:.2f}mb')
|
||||
|
||||
Loading…
Reference in New Issue
Block a user