Source code for lingvo.tools.audio_lib

# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Audio library."""

import subprocess
import lingvo.compat as tf
from lingvo.core import py_utils
from lingvo.tasks.asr import frontend as asr_frontend

from tensorflow.python.ops import gen_audio_ops as audio_ops  # pylint: disable=g-direct-tensorflow-import


# There are two ways to decode a wav in tensorflow: through the
# tensorflow native audio decoder, exported via framework, or via
# tf.contrib.ffmpeg.decode_audio. While the latter could technically
# support FLAC, it does not, and it adds an extra dependency on ffmpeg.


def DecodeFlacToWav(input_bytes):
  """Decode a FLAC byte string to WAV."""
  p = subprocess.Popen(
      ['sox', '-t', 'flac', '-', '-t', 'wav', '-'],
      stdin=subprocess.PIPE,
      stdout=subprocess.PIPE,
      stderr=subprocess.PIPE)
  out, err = p.communicate(input=input_bytes)
  assert p.returncode == 0, err
  return out
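

# A minimal usage sketch, not part of the original module: converts the
# contents of a FLAC file to WAV bytes. The path 'sample.flac' is a
# hypothetical placeholder, and sox must be installed on the host.
def _ExampleDecodeFlacToWav():
  """Illustrative only: FLAC bytes to WAV bytes via sox."""
  with open('sample.flac', 'rb') as f:  # hypothetical input file
    flac_bytes = f.read()
  return DecodeFlacToWav(flac_bytes)

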
def DecodeWav(input_bytes):
  """Decode a wav file from its contents.

  Args:
    input_bytes: a byte array or Tensor with the wav file contents.

  Returns:
    A pair of Tensor for sample rate, decoded samples.
  """
  result = tf.audio.decode_wav(input_bytes)
  return result.sample_rate, result.audio
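

# A minimal usage sketch, not part of the original module: reads a WAV file
# from a hypothetical path and decodes it into tensors.
def _ExampleDecodeWav():
  """Illustrative only: decodes WAV contents into sample rate and samples."""
  wav_bytes = tf.io.read_file('sample.wav')  # hypothetical input file
  sample_rate, audio = DecodeWav(wav_bytes)
  # audio has shape [num_samples, num_channels], with values in [-1.0, 1.0].
  return sample_rate, audio

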
def AudioToMfcc(sample_rate, audio, window_size_ms, window_stride_ms,
                num_coefficients):
  """Computes MFCCs from raw audio samples."""
  window_size_samples = sample_rate * window_size_ms // 1000
  window_stride_samples = sample_rate * window_stride_ms // 1000
  spectrogram = audio_ops.audio_spectrogram(
      audio,
      window_size=window_size_samples,
      stride=window_stride_samples,
      magnitude_squared=True)
  mfcc = audio_ops.mfcc(
      spectrogram, sample_rate, dct_coefficient_count=num_coefficients)
  return mfcc
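

# A minimal usage sketch, not part of the original module: computes 13 MFCCs
# over 25ms windows with a 10ms stride. The path 'sample.wav' is a
# hypothetical placeholder, and the 16000 sample rate is an assumption that
# must match the actual file.
def _ExampleAudioToMfcc():
  """Illustrative only: MFCC extraction from decoded WAV audio."""
  _, audio = DecodeWav(tf.io.read_file('sample.wav'))
  # The decoded audio is already the rank-2 [num_samples, num_channels]
  # float tensor that audio_spectrogram expects.
  return AudioToMfcc(
      sample_rate=16000,  # assumed; pass the file's true rate in practice.
      audio=audio,
      window_size_ms=25,
      window_stride_ms=10,
      num_coefficients=13)

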
def ExtractLogMelFeatures(wav_bytes_t):
  """Create Log-Mel Filterbank Features from raw bytes.

  Args:
    wav_bytes_t: Tensor representing raw wav file as a string of bytes. It is
      currently assumed that the wav file is encoded at 16kHz (see DecodeWav,
      above).

  Returns:
    A Tensor representing three stacked log-Mel filterbank energies,
    sub-sampled every three frames.
  """

  def _CreateAsrFrontend():
    """Parameters corresponding to default ASR frontend."""
    p = asr_frontend.MelAsrFrontend.Params()
    p.sample_rate = 16000.
    p.frame_size_ms = 25.
    p.frame_step_ms = 10.
    p.num_bins = 80
    p.lower_edge_hertz = 125.
    p.upper_edge_hertz = 7600.
    p.preemph = 0.97
    p.noise_scale = 0.
    p.pad_end = False
    return p.Instantiate()

  sample_rate, audio = DecodeWav(wav_bytes_t)
  # decode_wav yields floats in [-1.0, 1.0]; scale to the int16 sample range
  # expected by the frontend.
  audio *= 32768
  # Remove channel dimension, since we have a single channel.
  audio = tf.squeeze(audio, axis=1)
  # TODO(drpng): make batches.
  audio = tf.expand_dims(audio, axis=0)
  static_sample_rate = 16000
  mel_frontend = _CreateAsrFrontend()
  with tf.control_dependencies(
      [tf.assert_equal(sample_rate, static_sample_rate)]):
    outputs = mel_frontend.FPropDefaultTheta(
        py_utils.NestedMap(src_inputs=audio, paddings=tf.zeros_like(audio)))
    log_mel = outputs.src_inputs
  return log_mel
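

# A minimal end-to-end sketch, not part of the original module: reads a WAV
# file (which must be encoded at 16kHz, per the assumption above) and extracts
# stacked log-Mel filterbank features. The path 'sample.wav' is a hypothetical
# placeholder.
def _ExampleExtractLogMelFeatures():
  """Illustrative only: log-Mel features from a 16kHz WAV file."""
  wav_bytes = tf.io.read_file('sample.wav')  # hypothetical 16kHz input file
  # Returns a batch-major Tensor of stacked log-Mel filterbank energies.
  return ExtractLogMelFeatures(wav_bytes)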