# SpeechGenderAnalysis/experiment/test_runtime_2.py

import base64
import zlib
from multiprocessing import cpu_count
from subprocess import Popen, PIPE

import librosa
import numpy as np
import pandas as pd
import parselmouth
import tensorflow
import torch
import torchaudio
from inaSpeechSegmenter.features import to_wav
from inaSpeechSegmenter.sidekit_mfcc import read_wav, mfcc
import tensorflow as tf
import tensorflow_io as tfio
from tqdm.contrib.concurrent import process_map

from server.utils import Timer


def test_readfile(file: str, iterations: int):
    """Benchmark several audio libraries loading the same file.

    Each round loads ``file`` with Parselmouth, librosa, sidekit's
    ``read_wav`` and torchaudio, in that order, and records one
    ``timer.elapsed()`` reading right after each loader returns.

    Args:
        file: Path of the audio file handed to every loader.
        iterations: Number of benchmark rounds to run.

    Returns:
        A DataFrame with one row per round and one column per loader.
    """
    # NOTE(review): presumably Timer.elapsed() reports the time since the
    # previous reading -- confirm against server.utils.Timer.
    loaders = (parselmouth.Sound, librosa.load, read_wav, torchaudio.load)
    stopwatch = Timer()
    rows = []
    for _ in range(iterations):
        timings = []
        for load in loaders:
            load(file)
            timings.append(stopwatch.elapsed())
        rows.append(timings)
    return pd.DataFrame(rows, columns=['Parselmouth', 'librosa', 'read_wav', 'torchaudio'])


def _run_checked(args: list[str]) -> None:
    """Run an external command to completion and fail on a non-zero exit.

    stdout and stderr are captured through pipes; the captured stderr is used
    as the error payload.

    Args:
        args: Command line as an argument list (no shell is involved).

    Raises:
        AssertionError: If the command exits with a non-zero status.
    """
    process = Popen(args, stdout=PIPE, stderr=PIPE)
    _, error = process.communicate()
    if process.returncode != 0:
        # Raised explicitly instead of via `assert` so the check is not
        # stripped under `python -O`; the exception type stays the same.
        raise AssertionError(error)


def test_resampling(file: str, iterations: int, resample: bool):
    """Benchmark audio conversion with ffmpeg (``to_wav``), sox and mplayer.

    Each round converts ``file`` with the three tools in turn and records one
    ``timer.elapsed()`` reading after each tool finishes. sox writes to
    ``output-sox.wav`` and mplayer to ``output-mplayer.wav``, both relative to
    the current working directory.

    Args:
        file: Path of the input audio file.
        iterations: Number of benchmark rounds to run.
        resample: When true, every tool is asked for a 16000 Hz output;
            otherwise no target rate is passed.

    Returns:
        A DataFrame with one row per round and the columns ``ffmpeg``,
        ``sox`` and ``mplayer``.

    Raises:
        AssertionError: If sox or mplayer exits with a non-zero status.
    """
    results = []
    timer = Timer()
    sr = 16000 if resample else None

    # The command lines do not change between rounds, so build them once.
    sox_args = ['sox', file, '-c', '1', '-e', 'floating-point']
    if sr:
        sox_args += ['-r', str(sr)]
    sox_args += ['output-sox.wav']

    mplayer_args = ['mplayer', '-ao', 'pcm:fast:waveheader:file=output-mplayer.wav', '-vo', 'null', '-vc', 'null']
    if sr:
        mplayer_args += ['-af', f'resample={sr},pan=1:0.5:0.5']
    else:
        mplayer_args += ['-af', 'pan=1:0.5:0.5']
    mplayer_args += [file]

    for _ in range(iterations):
        result = []

        # FFMPEG
        to_wav(file, sr=sr)
        result.append(timer.elapsed())

        # SOX
        _run_checked(sox_args)
        result.append(timer.elapsed())

        # MPlayer
        _run_checked(mplayer_args)
        result.append(timer.elapsed())

        results.append(result)

    return pd.DataFrame(results, columns=['ffmpeg', 'sox', 'mplayer'])


def test_spectrogram(y: np.ndarray, sr: int, iterations: int, n_fft=2048, hop_length=512):
    """Benchmark spectrogram computation across five libraries.

    Each round runs Parselmouth, librosa, sidekit's ``mfcc``, torchaudio and
    tensorflow-io on the same signal, in that order, and records one
    ``timer.elapsed()`` reading after each library finishes.

    Args:
        y: Audio samples passed to every library.
        sr: Sample rate of ``y``.
        iterations: Number of benchmark rounds to run.
        n_fft: Analysis window size in samples.
        hop_length: Step between consecutive windows in samples.

    Returns:
        A DataFrame with one row per round and one column per library.
    """
    stopwatch = Timer()
    # Parselmouth and sidekit take window and step sizes in seconds.
    window_s = n_fft / sr
    hop_s = hop_length / sr
    rows = []
    for _ in range(iterations):
        timings = []

        # Parselmouth
        snd = parselmouth.Sound(y, float(sr))
        snd.to_spectrogram(window_length=window_s, time_step=hop_s)
        timings.append(stopwatch.elapsed())

        # librosa
        librosa.feature.melspectrogram(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, htk=True)
        timings.append(stopwatch.elapsed())

        # sidekit
        mfcc(y.astype(np.float32), get_mspec=True, nwin=window_s, shift=hop_s, fs=sr)
        timings.append(stopwatch.elapsed())

        # torchaudio: building the transform is part of the timed section.
        to_mel = torchaudio.transforms.MelSpectrogram(sample_rate=sr, n_fft=n_fft, hop_length=hop_length)
        to_mel(torch.from_numpy(y))
        timings.append(stopwatch.elapsed())

        # tensorflow-io: linear spectrogram first, then the mel projection.
        linear = tfio.audio.spectrogram(y, n_fft, n_fft, hop_length)
        tfio.audio.melscale(linear, rate=sr, mels=128, fmin=0, fmax=8000)
        timings.append(stopwatch.elapsed())

        rows.append(timings)

    return pd.DataFrame(rows, columns=['Parselmouth', 'librosa', 'sidekit', 'torchaudio', 'tensorflow-io'])


def test_pitch(y: np.ndarray, sr: int, iterations: int, n_fft=2048, hop_length=512):
    """Benchmark pitch tracking with Parselmouth, ``librosa.yin`` and ``librosa.pyin``.

    Each round runs the three trackers on the same signal, in that order, and
    records one ``timer.elapsed()`` reading after each tracker finishes.

    Args:
        y: Audio samples passed to every tracker.
        sr: Sample rate of ``y``.
        iterations: Number of benchmark rounds to run.
        n_fft: Frame length in samples, used by the librosa trackers.
        hop_length: Step between frames in samples.

    Returns:
        A DataFrame with one row per round and one column per tracker.
    """
    stopwatch = Timer()
    # Parselmouth takes its time step in seconds.
    hop_s = hop_length / sr
    rows = []
    for _ in range(iterations):
        timings = []

        snd = parselmouth.Sound(y, float(sr))
        snd.to_pitch(time_step=hop_s)
        timings.append(stopwatch.elapsed())

        librosa.yin(y=y, sr=sr, frame_length=n_fft, hop_length=hop_length, fmin=75, fmax=600)
        timings.append(stopwatch.elapsed())

        librosa.pyin(y=y, sr=sr, frame_length=n_fft, hop_length=hop_length, fmin=75, fmax=600)
        timings.append(stopwatch.elapsed())

        # TODO: essentia (yin, pyin), in-formant (yin, mpm, rapt, irapt)

        rows.append(timings)

    column_names = [
        'Parselmouth (Boersma 1993)',
        'librosa.yin (Kawahara 2002)',
        'librosa.pyin (Mauch 2014)',
    ]
    return pd.DataFrame(rows, columns=column_names)


def test_formant(y: np.ndarray, sr: int, iterations: int, n_fft=2048, hop_length=512):
    """Benchmark formant tracking with Parselmouth's Burg method.

    Each round builds a ``parselmouth.Sound`` from the signal, runs
    ``to_formant_burg`` on it and records one ``timer.elapsed()`` reading.

    Args:
        y: Audio samples.
        sr: Sample rate of ``y``.
        iterations: Number of benchmark rounds to run.
        n_fft: Not used by this benchmark; kept for a signature parallel to
            the other ``test_*`` functions.
        hop_length: Step between analysis frames in samples.

    Returns:
        A DataFrame with one row per round and a single Parselmouth column.
    """
    stopwatch = Timer()
    # Parselmouth takes its time step in seconds.
    hop_s = hop_length / sr
    rows = []
    for _ in range(iterations):
        snd = parselmouth.Sound(y, float(sr))
        snd.to_formant_burg(time_step=hop_s)
        rows.append([stopwatch.elapsed()])

        # TODO: in-formant (deepformants, filteredlp, simplelp, karma)

    return pd.DataFrame(rows, columns=['Parselmouth (Marple 1980)'])


def _formant(args: tuple[np.ndarray, float]):
    """Extract formants 1-3 from one chunk of audio.

    Takes a single tuple argument so it can be mapped over a list of
    ``(samples, sample_rate)`` pairs.

    Args:
        args: ``(samples, sample_rate)`` pair.

    Returns:
        A float32 array with one row per formant frame and three columns
        holding the values of formants 1, 2 and 3.
    """
    samples, rate = args
    hop_s = 512 / rate
    snd = parselmouth.Sound(samples, rate)
    track = snd.to_formant_burg(time_step=hop_s)

    n_frames = len(track)
    values = np.empty((n_frames, 3), dtype='float32')
    for frame in range(n_frames):
        # NOTE(review): values are queried at frame * hop_s measured from
        # time 0, which may not coincide with the analysis frame centres --
        # confirm this is the intended sampling.
        at_time = frame * hop_s
        for number in (1, 2, 3):
            values[frame, number - 1] = track.get_value_at_time(number, at_time)
    return values


if __name__ == '__main__':
    # Input recording, plus the path of the WAV file that to_wav produces
    # from it when asked for sr=16000.
    f = '/workspace/EECS 6414/voice_cnn/VT 150hz baseline example.mp3'
    fp = str(to_wav(f, sr=16000).absolute())

    # The benchmarks below are toggled by (un)commenting them.

    # print(read_wav(f))

    # --- File reading ---
    # df = test_readfile(fp, 10)
    # print(df)
    # print(df.mean())

    # --- Resampling ---
    # df = test_resampling(f, 10, True)
    # print(df)
    # print(df.mean())

    y, sr, _ = read_wav(fp)

    # --- Mel spectrogram (with a Tensorflow warm-up first) ---
    # t = tfio.audio.spectrogram(y, 1, 1, 2048)
    # tfio.audio.melscale(t, rate=sr, mels=128, fmin=0, fmax=8000)
    # print('Warmup done')
    #
    # df = test_spectrogram(y, sr, 10)
    # print(df)
    # print(df.mean())

    # --- Pitch ---
    # df = test_pitch(y, sr, 10)
    # print(df)
    # print(df.mean())

    # --- Formant ---
    # df = test_formant(y, sr, 10)
    # print(df)
    # print(df.mean())
    # timer = Timer()
    # split = [(y, float(sr)) for y in np.array_split(y, 512 * 30)]
    # print(split)
    # print(len(split))
    # formants = process_map(_formant, split, max_workers=cpu_count(), chunksize=1)
    # timer.log('Done')
    # print(formants)
    # sound = parselmouth.Sound(y, float(sr))
    # formant = sound.to_formant_burg(time_step=512 / sr)

    # sound.to_formant_burg()

    # --- Serialized size of a mel spectrogram ---
    n_fft, hop_length = 2048, 512
    t = tfio.audio.spectrogram(y, n_fft, n_fft, hop_length)
    mel_spectrogram: tf.Tensor = tfio.audio.melscale(t, rate=sr, mels=128, fmin=0, fmax=8000)
    nd: np.ndarray = mel_spectrogram.numpy()
    for item in (nd, nd.shape, nd.dtype):
        print(item)

    def _print_size(label, blob):
        # Report the type of the payload and its length in mebibytes.
        print(label, type(blob), f'{len(blob) / 1024 / 1024:.2f}mb')

    by = nd.tobytes()
    _print_size('Raw Numpy bytes:', by)

    zl = zlib.compress(by, 9)
    _print_size('zlib compressed (level 9):', zl)

    # Note: base64 is applied to the raw bytes, not to the zlib output.
    b6 = base64.b64encode(by)
    _print_size('base64 encoded utf-8:', b6)