diff --git a/python-server/audiosegment_wrapper.py b/python-server/audiosegment_wrapper.py new file mode 100644 index 0000000..6f8f949 --- /dev/null +++ b/python-server/audiosegment_wrapper.py @@ -0,0 +1,1060 @@ +""" +This module simply exposes a wrapper of a pydub.AudioSegment object. +""" +from __future__ import division +from __future__ import print_function + +import collections +import functools +import itertools +import math +import numpy as np +import pickle +import platform +import pydub +import os +import random +import scipy.signal as signal +import string +import subprocess +import sys +import tempfile +import warnings +import webrtcvad + +MS_PER_S = 1000 +S_PER_MIN = 60 +MS_PER_MIN = MS_PER_S * S_PER_MIN + +def deprecated(func): + """ + Deprecator decorator. + """ + + @functools.wraps(func) + def new_func(*args, **kwargs): + warnings.warn("Call to deprecated function {}.".format(func.__name__), category=DeprecationWarning, stacklevel=2) + return func(*args, **kwargs) + + return new_func + +class AudioSegment: + """ + This class is a wrapper for a pydub.AudioSegment that provides additional methods. + """ + + def __init__(self, pydubseg, name): + self.seg = pydubseg + self.name = name + + def __getattr__(self, attr): + orig_attr = self.seg.__getattribute__(attr) + if callable(orig_attr): + def hooked(*args, **kwargs): + result = orig_attr(*args, **kwargs) + if result == self.seg: + return self + elif type(result) == pydub.AudioSegment: + return AudioSegment(result, self.name) + else: + return result + return hooked + else: + return orig_attr + + def __len__(self): + return len(self.seg) + + def __eq__(self, other): + return self.seg == other + + def __ne__(self, other): + return self.seg != other + + def __iter__(self): + return (x for x in self.seg) + + def __getitem__(self, millisecond): + return AudioSegment(self.seg[millisecond], self.name) + + def __add__(self, arg): + if type(arg) == AudioSegment: + self.seg._data = self.seg._data + arg.seg._data + else: + self.seg = self.seg + arg + return self + + def __radd__(self, rarg): + return self.seg.__radd__(rarg) + + def __repr__(self): + return str(self) + + def __str__(self): + s = "%s: %s channels, %s bit, sampled @ %s kHz, %.3fs long" %\ + (self.name, str(self.channels), str(self.sample_width * 8),\ + str(self.frame_rate / 1000.0), self.duration_seconds) + return s + + def __sub__(self, arg): + if type(arg) == AudioSegment: + self.seg = self.seg - arg.seg + else: + self.seg = self.seg - arg + return self + + def __mul__(self, arg): + if type(arg) == AudioSegment: + self.seg = self.seg * arg.seg + else: + self.seg = self.seg * arg + return self + + @property + def spl(self): + """ + Sound Pressure Level - defined as 20 * log10(abs(value)). + + Returns a numpy array of SPL dB values. + """ + return 20.0 * np.log10(np.abs(self.to_numpy_array() + 1E-9)) + + @staticmethod + def _bandpass_filter(data, low, high, fs, order=5): + """ + :param data: The data (numpy array) to be filtered. + :param low: The low cutoff in Hz. + :param high: The high cutoff in Hz. + :param fs: The sample rate (in Hz) of the data. + :param order: The order of the filter. The higher the order, the tighter the roll-off. + :returns: Filtered data (numpy array). + """ + nyq = 0.5 * fs + low = low / nyq + high = high / nyq + b, a = signal.butter(order, [low, high], btype='band') + y = signal.lfilter(b, a, data) + return y + + @staticmethod + def lowpass_filter(data, cutoff, fs, order=5): + """ + :param data: The data (numpy array) to be filtered. 
+        :param cutoff: The high cutoff in Hz.
+        :param fs: The sample rate in Hz of the data.
+        :param order: The order of the filter. The higher the order, the tighter the roll-off.
+        :returns: Filtered data (numpy array).
+        """
+        nyq = 0.5 * fs
+        normal_cutoff = cutoff / nyq
+        b, a = signal.butter(order, normal_cutoff, btype='low', analog=False)
+        y = signal.lfilter(b, a, data)
+        return y
+
+    def auditory_scene_analysis(self):
+        """
+        Algorithm based on the paper: Auditory Segmentation Based on Onset and Offset Analysis,
+        by Hu and Wang, 2007.
+        """
+
+        import matplotlib.pyplot as plt
+
+        def visualize_time_domain(seg, title=""):
+            plt.plot(seg)
+            plt.title(title)
+            plt.show()
+            plt.clf()
+
+        def visualize(spect, frequencies, title=""):
+            i = 0
+            for freq, (index, row) in zip(frequencies[::-1], enumerate(spect[::-1, :])):
+                plt.subplot(spect.shape[0], 1, index + 1)
+                if i == 0:
+                    plt.title(title)
+                    i += 1
+                plt.ylabel("{0:.0f}".format(freq))
+                plt.plot(row)
+            plt.show()
+            plt.clf()
+
+        # Normalize self into 25 dB average SPL
+        normalized = self.normalize_spl_by_average(db=25)
+        visualize_time_domain(normalized.to_numpy_array(), "Normalized")
+        # Do a band-pass filter in each frequency
+        data = normalized.to_numpy_array()
+        start_frequency = 50
+        stop_frequency = 8000
+        start = np.log10(start_frequency)
+        stop = np.log10(stop_frequency)
+        frequencies = np.logspace(start, stop, num=10, endpoint=True, base=10.0)
+        print("Dealing with the following frequencies:", frequencies)
+        rows = [AudioSegment._bandpass_filter(data, freq * 0.8, freq * 1.2, self.frame_rate) for freq in frequencies]
+        rows = np.array(rows)
+        spect = np.vstack(rows)
+        visualize(spect, frequencies, "After bandpass filtering (cochlear model)")
+
+        # Half-wave rectify each frequency channel
+        spect[spect < 0] = 0
+        visualize(spect, frequencies, "After half-wave rectification in each frequency")
+
+        # Low-pass filter each frequency channel
+        spect = np.apply_along_axis(AudioSegment.lowpass_filter, 1, spect, 30, self.frame_rate, 6)
+        visualize(spect, frequencies, "After low-pass filtering in each frequency")
+
+        # Downsample each frequency to 400 Hz
+        downsample_freq_hz = 400
+        if self.frame_rate > downsample_freq_hz:
+            step = int(round(self.frame_rate / downsample_freq_hz))
+            spect = spect[:, ::step]
+        visualize(spect, frequencies, "After downsampling in each frequency")
+
+        # Now you have the temporal envelope of each frequency channel
+
+        # Smoothing
+        scales = [(6, 1/4), (6, 1/14), (1/2, 1/14)]
+        thetas = [0.95, 0.95, 0.85]
+        ## For each (sc, st) scale, smooth across time using st, then across frequency using sc
+        gaussian = lambda x, mu, sig: np.exp(-np.power(x - mu, 2.0) / (2 * np.power(sig, 2.0)))
+        # The number of kernel taps must be an integer, hence the floor division
+        gaussian_kernel = lambda sig: gaussian(np.linspace(-10, 10, len(frequencies) // 2), 0, sig)
+        spectrograms = []
+        for sc, st in scales:
+            time_smoothed = np.apply_along_axis(AudioSegment.lowpass_filter, 1, spect, 1/st, downsample_freq_hz, 6)
+            visualize(time_smoothed, frequencies, "After time smoothing with scale: " + str(st))
+            # Smooth across frequency (axis 0) on top of the time-smoothed result
+            freq_smoothed = np.apply_along_axis(np.convolve, 0, time_smoothed, gaussian_kernel(sc))
+            spectrograms.append(freq_smoothed)
+            visualize(freq_smoothed, frequencies, "After time and frequency smoothing with scales (freq) " + str(sc) + " and (time) " + str(st))
+        ## Now we have a set of scale-space spectrograms of different scales (sc, st)
+
+        # Onset/Offset Detection and Matching
+        def theta_on(spect):
+            return np.nanmean(spect) + np.nanstd(spect)
+
+        def compute_peaks_or_valleys_of_first_derivative(s, do_peaks=True):
+            """
+ Takes a spectrogram and returns a 2D array of the form: + + 0 0 0 1 0 0 1 0 0 0 1 <-- Frequency 0 + 0 0 1 0 0 0 0 0 0 1 0 <-- Frequency 1 + 0 0 0 0 0 0 1 0 1 0 0 <-- Frequency 2 + *** Time axis ******* + + Where a 1 means that the value in that time bin in the spectrogram corresponds to + a peak/valley in the first derivative. + """ + gradient = np.nan_to_num(np.apply_along_axis(np.gradient, 1, s), copy=False) + half_window = 4 + if do_peaks: + indexes = [signal.argrelextrema(gradient[i, :], np.greater, order=half_window) for i in range(gradient.shape[0])] + else: + indexes = [signal.argrelextrema(gradient[i, :], np.less, order=half_window) for i in range(gradient.shape[0])] + extrema = np.zeros(s.shape) + for row_index, index_array in enumerate(indexes): + # Each index_array is a list of indexes corresponding to all the extrema in a given row + for col_index in index_array: + extrema[row_index, col_index] = 1 + return extrema + + for spect, (sc, st) in zip(spectrograms, scales): + # Compute sudden upward changes in spect, these are onsets of events + onsets = compute_peaks_or_valleys_of_first_derivative(spect) + # Compute sudden downward changes in spect, these are offsets of events + offsets = compute_peaks_or_valleys_of_first_derivative(spect, do_peaks=False) + print("TOTAL ONSETS:", np.sum(onsets, axis=1)) + print("TOTAL OFFSETS:", np.sum(offsets, axis=1)) + exit() + + # onsets and offsets are 2D arrays + + ## Determine the offset time for each onset: + ### If t_on[c, i] represents the time of the ith onset in frequency channel c, the corresponding offset + ### must occur between t_on[c, i] and t_on[c, i+1] + ### If there are more than one offsets candidates in this range, choose the one with largest intensity decrease. + ## Create onset/offset fronts by connecting onsets across frequency channels (connect two onsets + ## if they occur within 20ms of each other). Start over whenever a frequency band does not contain an offset + ## in this range. Do the same procedure for offsets. Now you have onset and offset fronts. + ## Now hook up the onsets with the offsets to form segments: + ## For each onset front, (t_on[c, i1, t_on[c + 1, i2], ..., t_on[c + m - 1, im]): + ## matching_offsets = (t_off[c, i1], t_off[c + 1, i2], ..., t_off[c + m - 1, im]) + ## Get all offset fronts which contain at least one of offset time found in matching_offsets + ## Among these offset fronts, the one that crosses the most of matching_offsets is chosen, + ## - call this offset front: matching_offset_front + ## Update all t_offs in matching_offsets whose 'c's are in matching_offset_front to be 'matched', and + ## - update their times to the corresponding channel offset in matching_offset_front. + ## If all t_offs in matching_offsets are 'matched', continue to next onset front + ## Now go through all the segments you have created and break them up along frequencies if the temporal + ## envelopes don't match well enough. That is, if we have two adjacent channels c and c+1, and they + ## are part of the same segment as determined above, break this segment into two along these lines + ## if the correlation between them is below theta_c. Theta_c is thetas[i] where i depends on the scale. + + # Multiscale Integration + ## + ## TODO + + def detect_voice(self, prob_detect_voice=0.5): + """ + Returns self as a list of tuples: + [('v', voiced segment), ('u', unvoiced segment), (etc.)] + + The overall order of the AudioSegment is preserved. 
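+
+        A minimal usage sketch (the file name is illustrative; the segment must already
+        satisfy the sample rate/width/channel assertions below):
+
+        .. code-block:: python
+
+            import audiosegment
+
+            seg = audiosegment.from_file("speech.wav").resample(sample_rate_Hz=16000, sample_width=2, channels=1)
+            results = seg.detect_voice(prob_detect_voice=0.5)
+            voiced = [segment for label, segment in results if label == 'v']
+            unvoiced = [segment for label, segment in results if label == 'u']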
+
+        :param prob_detect_voice: The raw probability that any random 20 ms window of the audio file
+                                  contains voice.
+        :returns: The described list.
+        """
+        assert self.frame_rate in (48000, 32000, 16000, 8000), "Try resampling to one of the allowed frame rates."
+        assert self.sample_width == 2, "Try resampling to 16 bit."
+        assert self.channels == 1, "Try resampling to one channel."
+
+        class model_class:
+            def __init__(self, aggressiveness):
+                self.v = webrtcvad.Vad(int(aggressiveness))
+
+            def predict(self, vector):
+                if self.v.is_speech(vector.raw_data, vector.frame_rate):
+                    return 1
+                else:
+                    return 0
+
+        model = model_class(aggressiveness=2)
+        pyesno = 0.3               # Probability of the next 20 ms being unvoiced given that this 20 ms was voiced
+        pnoyes = 0.2               # Probability of the next 20 ms being voiced given that this 20 ms was unvoiced
+        p_realyes_outputyes = 0.4  # WebRTCVAD has a very high FP rate - just because it says yes, doesn't mean much
+        p_realyes_outputno = 0.05  # If it says no, we can be very certain that it really is a no
+        p_yes_raw = prob_detect_voice
+        filtered = self.detect_event(model=model,
+                                     ms_per_input=20,
+                                     transition_matrix=(pyesno, pnoyes),
+                                     model_stats=(p_realyes_outputyes, p_realyes_outputno),
+                                     event_length_s=0.25,
+                                     prob_raw_yes=p_yes_raw)
+        ret = []
+        for tup in filtered:
+            t = ('v', tup[1]) if tup[0] == 'y' else ('u', tup[1])
+            ret.append(t)
+        return ret
+
+    def dice(self, seconds, zero_pad=False):
+        """
+        Cuts the AudioSegment into segments that are each (at most) `seconds` seconds long.
+        So, for example, if seconds=10, this will return a list of AudioSegments, in order,
+        where each one is at most 10 seconds long. If `zero_pad` is True, the last
+        AudioSegment will be zero padded so that it comes out to exactly `seconds` seconds.
+
+        :param seconds: The length of each segment in seconds. Can be either a float/int, in which case
+                        roughly `self.duration_seconds / seconds` segments are made, each of `seconds`
+                        length, or a list-like can be given, in which case the given list must sum to
+                        `self.duration_seconds` and each segment is specified by the list - e.g.
+                        the 9th AudioSegment in the returned list will be `seconds[8]` seconds long.
+        :param zero_pad: Whether to zero_pad the final segment if necessary. Ignored if `seconds` is
+                         a list-like.
+        :returns: A list of AudioSegments, each of which is the appropriate number of seconds long.
+        :raises: ValueError if a list-like is given for `seconds` and the list's durations do not sum
+                 to `self.duration_seconds`.
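+
+        A short sketch of both calling conventions (values are illustrative and assume the
+        segment is longer than five seconds):
+
+        .. code-block:: python
+
+            # Fixed-length dicing: each piece is at most 10 s long
+            pieces = seg.dice(seconds=10, zero_pad=True)
+            # Explicit-length dicing: the list must sum to the segment's duration
+            first, rest = seg.dice(seconds=[5, seg.duration_seconds - 5])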
+ """ + try: + total_s = sum(seconds) + if not (self.duration_seconds <= total_s + 1 and self.duration_seconds >= total_s - 1): + raise ValueError("`seconds` does not sum to within one second of the duration of this AudioSegment.\ + given total seconds: %s and self.duration_seconds: %s" % (total_s, self.duration_seconds)) + starts = [] + stops = [] + time_ms = 0 + for dur in seconds: + starts.append(time_ms) + time_ms += dur * MS_PER_S + stops.append(time_ms) + zero_pad = False + except TypeError: + # `seconds` is not a list + starts = range(0, int(round(self.duration_seconds * MS_PER_S)), int(round(seconds * MS_PER_S))) + stops = (min(self.duration_seconds * MS_PER_S, start + seconds * MS_PER_S) for start in starts) + outs = [self[start:stop] for start, stop in zip(starts, stops)] + out_lens = [out.duration_seconds for out in outs] + # Check if our last slice is within one ms of expected - if so, we don't need to zero pad + if zero_pad and not (out_lens[-1] <= seconds * MS_PER_S + 1 and out_lens[-1] >= seconds * MS_PER_S - 1): + num_zeros = self.frame_rate * (seconds * MS_PER_S - out_lens[-1]) + outs[-1] = outs[-1].zero_extend(num_samples=num_zeros) + return outs + + def detect_event(self, model, ms_per_input, transition_matrix, model_stats, event_length_s, + start_as_yes=False, prob_raw_yes=0.5): + """ + A list of tuples of the form [('n', AudioSegment), ('y', AudioSegment), etc.] is returned, where tuples + of the form ('n', AudioSegment) are the segments of sound where the event was not detected, + while ('y', AudioSegment) tuples were the segments of sound where the event was detected. + + .. code-block:: python + + # Example usage + import audiosegment + import keras + import keras.models + import numpy as np + import sys + + class Model: + def __init__(self, modelpath): + self.model = keras.models.load_model(modelpath) + + def predict(self, seg): + _bins, fft_vals = seg.fft() + fft_vals = np.abs(fft_vals) / len(fft_vals) + predicted_np_form = self.model.predict(np.array([fft_vals]), batch_size=1) + prediction_as_int = int(round(predicted_np_form[0][0])) + return prediction_as_int + + modelpath = sys.argv[1] + wavpath = sys.argv[2] + model = Model(modelpath) + seg = audiosegment.from_file(wavpath).resample(sample_rate_Hz=32000, sample_width=2, channels=1) + pyes_to_no = 0.3 # The probability of one 30 ms sample being an event, and the next one not + pno_to_yes = 0.2 # The probability of one 30 ms sample not being an event, and the next one yes + ptrue_pos_rate = 0.8 # The true positive rate (probability of a predicted yes being right) + pfalse_neg_rate = 0.3 # The false negative rate (probability of a predicted no being wrong) + raw_prob = 0.7 # The raw probability of seeing the event in any random 30 ms slice of this file + events = seg.detect_event(model, ms_per_input=30, transition_matrix=[pyes_to_no, pno_to_yes], + model_stats=[ptrue_pos_rate, pfalse_neg_rate], event_length_s=0.25, + prob_raw_yes=raw_prob) + nos = [event[1] for event in events if event[0] == 'n'] + yeses = [event[1] for event in events if event[0] == 'y'] + if len(nos) > 1: + notdetected = nos[0].reduce(nos[1:]) + notdetected.export("notdetected.wav", format="WAV") + if len(yeses) > 1: + detected = yeses[0].reduce(yeses[1:]) + detected.export("detected.wav", format="WAV") + + + :param model: The model. The model must have a predict() function which takes an AudioSegment + of `ms_per_input` number of ms and which outputs 1 if the audio event is detected + in that input, and 0 if not. 
Make sure to resample the AudioSegment to the right + values before calling this function on it. + + :param ms_per_input: The number of ms of AudioSegment to be fed into the model at a time. If this does not + come out even, the last AudioSegment will be zero-padded. + + :param transition_matrix: An iterable of the form: [p(yes->no), p(no->yes)]. That is, the probability of moving + from a 'yes' state to a 'no' state and the probability of vice versa. + + :param model_stats: An iterable of the form: [p(reality=1|output=1), p(reality=1|output=0)]. That is, + the probability of the ground truth really being a 1, given that the model output a 1, + and the probability of the ground truth being a 1, given that the model output a 0. + + :param event_length_s: The typical duration of the event you are looking for in seconds (can be a float). + + :param start_as_yes: If True, the first `ms_per_input` will be in the 'y' category. Otherwise it will be + in the 'n' category. + + :param prob_raw_yes: The raw probability of finding the event in any given `ms_per_input` vector. + + :returns: A list of tuples of the form [('n', AudioSegment), ('y', AudioSegment), etc.], + where over the course of the list, the AudioSegment in tuple 3 picks up + where the one in tuple 2 left off. + + :raises: ValueError if `ms_per_input` is negative or larger than the number of ms in this + AudioSegment; if `transition_matrix` or `model_stats` do not have a __len__ attribute + or are not length 2; if the values in `transition_matrix` or `model_stats` are not + in the closed interval [0.0, 1.0]. + """ + if ms_per_input < 0 or ms_per_input / MS_PER_S > self.duration_seconds: + raise ValueError("ms_per_input cannot be negative and cannot be longer than the duration of the AudioSegment."\ + " The given value was " + str(ms_per_input)) + elif not hasattr(transition_matrix, "__len__") or len(transition_matrix) != 2: + raise ValueError("transition_matrix must be an iterable of length 2.") + elif not hasattr(model_stats, "__len__") or len(model_stats) != 2: + raise ValueError("model_stats must be an iterable of length 2.") + elif any([True for prob in transition_matrix if prob > 1.0 or prob < 0.0]): + raise ValueError("Values in transition_matrix are probabilities, and so must be in the range [0.0, 1.0].") + elif any([True for prob in model_stats if prob > 1.0 or prob < 0.0]): + raise ValueError("Values in model_stats are probabilities, and so must be in the range [0.0, 1.0].") + elif prob_raw_yes > 1.0 or prob_raw_yes < 0.0: + raise ValueError("`prob_raw_yes` is a probability, and so must be in the range [0.0, 1.0]") + + # Get the yeses or nos for when the filter is triggered (when the event is on/off) + filter_indices = [yes_or_no for yes_or_no in self._get_filter_indices(start_as_yes, + prob_raw_yes, + ms_per_input, + model, + transition_matrix, + model_stats)] + # Run a homogeneity filter over the values to make local regions more self-similar (reduce noise) + ret = self._homogeneity_filter(filter_indices, window_size=int(round(0.25 * MS_PER_S / ms_per_input))) + # Group the consecutive ones together + ret = self._group_filter_values(ret, ms_per_input) + # Take the groups and turn them into AudioSegment objects + real_ret = self._reduce_filtered_segments(ret) + + return real_ret + + def _get_filter_indices(self, start_as_yes, prob_raw_yes, ms_per_input, model, transition_matrix, model_stats): + """ + This has been broken out of the `filter` function to reduce cognitive load. 
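+
+        For each `ms_per_input` window, this generator first yields the current filter state
+        (1 for "event present", 0 for "event absent") and then updates that state with an
+        approximate Bayesian step: each hypothesis is scored as
+        prior * p(hypothesis | previous state) * p(hypothesis | model output),
+        and the next state is drawn at random with probability proportional to the two scores.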
+ """ + filter_triggered = 1 if start_as_yes else 0 + prob_raw_no = 1.0 - prob_raw_yes + for segment, _timestamp in self.generate_frames_as_segments(ms_per_input): + yield filter_triggered + observation = int(round(model.predict(segment))) + assert observation == 1 or observation == 0, "The given model did not output a 1 or a 0, output: "\ + + str(observation) + prob_hyp_yes_given_last_hyp = 1.0 - transition_matrix[0] if filter_triggered else transition_matrix[1] + prob_hyp_no_given_last_hyp = transition_matrix[0] if filter_triggered else 1.0 - transition_matrix[1] + prob_hyp_yes_given_data = model_stats[0] if observation == 1 else model_stats[1] + prob_hyp_no_given_data = 1.0 - model_stats[0] if observation == 1 else 1.0 - model_stats[1] + hypothesis_yes = prob_raw_yes * prob_hyp_yes_given_last_hyp * prob_hyp_yes_given_data + hypothesis_no = prob_raw_no * prob_hyp_no_given_last_hyp * prob_hyp_no_given_data + # make a list of ints - each is 0 or 1. The number of 1s is hypotheis_yes * 100 + # the number of 0s is hypothesis_no * 100 + distribution = [1 for i in range(int(round(hypothesis_yes * 100)))] + distribution.extend([0 for i in range(int(round(hypothesis_no * 100)))]) + # shuffle + random.shuffle(distribution) + filter_triggered = random.choice(distribution) + + def _group_filter_values(self, filter_indices, ms_per_input): + """ + This has been broken out of the `filter` function to reduce cognitive load. + """ + ret = [] + for filter_value, (_segment, timestamp) in zip(filter_indices, self.generate_frames_as_segments(ms_per_input)): + if filter_value == 1: + if len(ret) > 0 and ret[-1][0] == 'n': + ret.append(['y', timestamp]) # The last one was different, so we create a new one + elif len(ret) > 0 and ret[-1][0] == 'y': + ret[-1][1] = timestamp # The last one was the same as this one, so just update the timestamp + else: + ret.append(['y', timestamp]) # This is the first one + else: + if len(ret) > 0 and ret[-1][0] == 'n': + ret[-1][1] = timestamp + elif len(ret) > 0 and ret[-1][0] == 'y': + ret.append(['n', timestamp]) + else: + ret.append(['n', timestamp]) + return ret + + def _homogeneity_filter(self, ls, window_size): + """ + This has been broken out of the `filter` function to reduce cognitive load. + + ls is a list of 1s or 0s for when the filter is on or off + """ + k = window_size + i = k + while i <= len(ls) - k: + # Get a window of k items + window = [ls[i + j] for j in range(k)] + # Change the items in the window to be more like the mode of that window + mode = 1 if sum(window) >= k / 2 else 0 + for j in range(k): + ls[i+j] = mode + i += k + return ls + + def _reduce_filtered_segments(self, ret): + """ + This has been broken out of the `filter` function to reduce cognitive load. + """ + real_ret = [] + for i, (this_yesno, next_timestamp) in enumerate(ret): + if i > 0: + _next_yesno, timestamp = ret[i - 1] + else: + timestamp = 0 + + data = self[timestamp * MS_PER_S:next_timestamp * MS_PER_S].raw_data + seg = AudioSegment(pydub.AudioSegment(data=data, sample_width=self.sample_width, + frame_rate=self.frame_rate, channels=self.channels), self.name) + real_ret.append((this_yesno, seg)) + return real_ret + + def _execute_sox_cmd(self, cmd, console_output=False): + """ + Executes a Sox command in a platform-independent manner. + + `cmd` must be a format string that includes {inputfile} and {outputfile}. 
+ """ + on_windows = platform.system().lower() == "windows" + + # On Windows, a temporary file cannot be shared outside the process that creates it + # so we need to create a "permanent" file that we will use and delete afterwards + def _get_random_tmp_file(): + if on_windows: + rand_string = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(8)) + tmp = self.name + "_" + rand_string + WinTempFile = collections.namedtuple("WinTempFile", "name") + tmp = WinTempFile(tmp) + else: + tmp = tempfile.NamedTemporaryFile() + return tmp + + # Get a temp file to put our data and a temp file to store the result + tmp = _get_random_tmp_file() + othertmp = _get_random_tmp_file() + + # Store our data in the temp file + self.export(tmp.name, format="WAV") + + # Write the command to sox + stdout = stderr = subprocess.PIPE if console_output else subprocess.DEVNULL + command = cmd.format(inputfile=tmp.name, outputfile=othertmp.name) + res = subprocess.run(command.split(' '), stdout=stdout, stderr=stderr) + assert res.returncode == 0, "Sox did not work as intended, or perhaps you don't have Sox installed?" + + # Create a new AudioSegment from the other temp file (where Sox put the result) + other = AudioSegment(pydub.AudioSegment.from_wav(othertmp.name), self.name) + + # Clean up the temp files + if on_windows: + os.remove(tmp.name) + os.remove(othertmp.name) + else: + tmp.close() + othertmp.close() + + return other + + def filter_silence(self, duration_s=1, threshold_percentage=1, console_output=False): + """ + Returns a copy of this AudioSegment, but whose silence has been removed. + + .. note:: This method requires that you have the program 'sox' installed. + + .. warning:: This method uses the program 'sox' to perform the task. While this is very fast for a single + function call, the IO may add up for large numbers of AudioSegment objects. + + :param duration_s: The number of seconds of "silence" that must be present in a row to + be stripped. + :param threshold_percentage: Silence is defined as any samples whose absolute value is below + `threshold_percentage * max(abs(samples in this segment))`. + :param console_output: If True, will pipe all sox output to the console. + :returns: A copy of this AudioSegment, but whose silence has been removed. + """ + command = "sox {inputfile} -t wav {outputfile} silence -l 1 0.1 "\ + + str(threshold_percentage) + "% -1 " + str(float(duration_s)) + " " + str(threshold_percentage) + "%" + return self._execute_sox_cmd(command) + + def fft(self, start_s=None, duration_s=None, start_sample=None, num_samples=None, zero_pad=False): + """ + Transforms the indicated slice of the AudioSegment into the frequency domain and returns the bins + and the values. + + If neither `start_s` or `start_sample` is specified, the first sample of the slice will be the first sample + of the AudioSegment. + + If neither `duration_s` or `num_samples` is specified, the slice will be from the specified start + to the end of the segment. + + .. code-block:: python + + # Example for plotting the FFT using this function + import matplotlib.pyplot as plt + import numpy as np + + seg = audiosegment.from_file("furelise.wav") + # Just take the first 3 seconds + hist_bins, hist_vals = seg[1:3000].fft() + hist_vals_real_normed = np.abs(hist_vals) / len(hist_vals) + plt.plot(hist_bins / 1000, hist_vals_real_normed) + plt.xlabel("kHz") + plt.ylabel("dB") + plt.show() + + .. image:: images/fft.png + + :param start_s: The start time in seconds. 
If this is specified, you cannot specify `start_sample`. + :param duration_s: The duration of the slice in seconds. If this is specified, you cannot specify `num_samples`. + :param start_sample: The zero-based index of the first sample to include in the slice. + If this is specified, you cannot specify `start_s`. + :param num_samples: The number of samples to include in the slice. If this is specified, you cannot + specify `duration_s`. + :param zero_pad: If True and the combination of start and duration result in running off the end of + the AudioSegment, the end is zero padded to prevent this. + :returns: np.ndarray of frequencies, np.ndarray of amount of each frequency + :raises: ValueError If `start_s` and `start_sample` are both specified and/or if both `duration_s` and + `num_samples` are specified. + """ + if start_s is not None and start_sample is not None: + raise ValueError("Only one of start_s and start_sample can be specified.") + if duration_s is not None and num_samples is not None: + raise ValueError("Only one of duration_s and num_samples can be specified.") + if start_s is None and start_sample is None: + start_sample = 0 + if duration_s is None and num_samples is None: + num_samples = len(self.get_array_of_samples()) - int(start_sample) + + if duration_s is not None: + num_samples = int(round(duration_s * self.frame_rate)) + if start_s is not None: + start_sample = int(round(start_s * self.frame_rate)) + + end_sample = start_sample + num_samples # end_sample is excluded + if end_sample > len(self.get_array_of_samples()) and not zero_pad: + raise ValueError("The combination of start and duration will run off the end of the AudioSegment object.") + elif end_sample > len(self.get_array_of_samples()) and zero_pad: + arr = np.array(self.get_array_of_samples()) + zeros = np.zeros(end_sample - len(arr)) + arr = np.append(arr, zeros) + else: + arr = np.array(self.get_array_of_samples()) + + audioslice = np.array(arr[start_sample:end_sample]) + fft_result = np.fft.fft(audioslice)[range(int(round(num_samples/2)) + 1)] + step_size = self.frame_rate / num_samples + bins = np.arange(0, int(round(num_samples/2)) + 1, 1.0) * step_size + return bins, fft_result + + def generate_frames(self, frame_duration_ms, zero_pad=True): + """ + Yields self's data in chunks of frame_duration_ms. + + This function adapted from pywebrtc's example [https://github.com/wiseman/py-webrtcvad/blob/master/example.py]. + + :param frame_duration_ms: The length of each frame in ms. + :param zero_pad: Whether or not to zero pad the end of the AudioSegment object to get all + the audio data out as frames. If not, there may be a part at the end + of the Segment that is cut off (the part will be <= `frame_duration_ms` in length). + :returns: A Frame object with properties 'bytes (the data)', 'timestamp (start time)', and 'duration'. 
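+
+        A minimal usage sketch (the frame length is illustrative):
+
+        .. code-block:: python
+
+            for frame in seg.generate_frames(frame_duration_ms=20):
+                print(frame.timestamp, frame.duration, len(frame.bytes))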
+ """ + Frame = collections.namedtuple("Frame", "bytes timestamp duration") + + # (samples/sec) * (seconds in a frame) * (bytes/sample) + bytes_per_frame = int(self.frame_rate * (frame_duration_ms / 1000) * self.sample_width) + offset = 0 # where we are so far in self's data (in bytes) + timestamp = 0.0 # where we are so far in self (in seconds) + # (bytes/frame) * (sample/bytes) * (sec/samples) + frame_duration_s = (bytes_per_frame / self.frame_rate) / self.sample_width + while offset + bytes_per_frame < len(self.raw_data): + yield Frame(self.raw_data[offset:offset + bytes_per_frame], timestamp, frame_duration_s) + timestamp += frame_duration_s + offset += bytes_per_frame + + if zero_pad: + rest = self.raw_data[offset:] + zeros = bytes(bytes_per_frame - len(rest)) + yield Frame(rest + zeros, timestamp, frame_duration_s) + + def generate_frames_as_segments(self, frame_duration_ms, zero_pad=True): + """ + Does the same thing as `generate_frames`, but yields tuples of (AudioSegment, timestamp) instead of Frames. + """ + for frame in self.generate_frames(frame_duration_ms, zero_pad=zero_pad): + seg = AudioSegment(pydub.AudioSegment(data=frame.bytes, sample_width=self.sample_width, + frame_rate=self.frame_rate, channels=self.channels), self.name) + yield seg, frame.timestamp + + def normalize_spl_by_average(self, db): + """ + Normalize the values in the AudioSegment so that its average dB value + is `db`. + + The dB of a value is calculated as 20 * log10(abs(value + 1E-9)). + + :param db: The decibels to normalize average to. + :returns: A new AudioSegment object whose values are changed so that their + average is `db`. + """ + def inverse_spl(val): + """Calculates the (positive) 'PCM' value for the given SPl val""" + return 10 ** (val / 20.0) + + # Convert dB into 'PCM' + db_pcm = inverse_spl(db) + # Calculate current 'PCM' average + curavg = np.abs(np.mean(self.to_numpy_array())) + # Calculate ratio of dB_pcm / curavg_pcm + ratio = db_pcm / curavg + # Multiply all values by ratio + dtype_dict = {1: np.int8, 2: np.int16, 4: np.int32} + dtype = dtype_dict[self.sample_width] + new_seg = from_numpy_array(np.array(self.to_numpy_array() * ratio, dtype=dtype), self.frame_rate) + # Check SPL average to see if we are right + #assert math.isclose(np.mean(new_seg.spl), db), "new = " + str(np.mean(new_seg.spl)) + " != " + str(db) + return new_seg + + def reduce(self, others): + """ + Reduces others into this one by concatenating all the others onto this one and + returning the result. Does not modify self, instead, makes a copy and returns that. + + :param others: The other AudioSegment objects to append to this one. + :returns: The concatenated result. + """ + ret = AudioSegment(self.seg, self.name) + selfdata = [self.seg._data] + otherdata = [o.seg._data for o in others] + ret.seg._data = b''.join(selfdata + otherdata) + + return ret + + def resample(self, sample_rate_Hz=None, sample_width=None, channels=None, console_output=False): + """ + Returns a new AudioSegment whose data is the same as this one, but which has been resampled to the + specified characteristics. Any parameter left None will be unchanged. + + .. note:: This method requires that you have the program 'sox' installed. + + .. warning:: This method uses the program 'sox' to perform the task. While this is very fast for a single + function call, the IO may add up for large numbers of AudioSegment objects. + + :param sample_rate_Hz: The new sample rate in Hz. 
+ :param sample_width: The new sample width in bytes, so sample_width=2 would correspond to 16 bit (2 byte) width. + :param channels: The new number of channels. + :param console_output: Will print the output of sox to the console if True. + :returns: The newly sampled AudioSegment. + """ + if sample_rate_Hz is None: + sample_rate_Hz = self.frame_rate + if sample_width is None: + sample_width = self.sample_width + if channels is None: + channels = self.channels + + command = "sox {inputfile} -b " + str(sample_width * 8) + " -r " + str(sample_rate_Hz) \ + + " -t wav {outputfile} channels " + str(channels) + + return self._execute_sox_cmd(command, console_output=console_output) + + def __getstate__(self): + """ + Serializes into a dict for the pickle protocol. + + :returns: The dict to pickle. + """ + return {'name': self.name, 'seg': self.seg} + + def __setstate__(self, d): + """ + Deserializes from a dict for the pickle protocol. + + :param d: The dict to unpickle from. + """ + self.__dict__.update(d) + + def serialize(self): + """ + Serializes into a bytestring. + + :returns: An object of type Bytes. + """ + d = self.__getstate__() + return pickle.dumps({ + 'name': d['name'], + 'seg': pickle.dumps(d['seg'], protocol=-1), + }, protocol=-1) + + def spectrogram(self, start_s=None, duration_s=None, start_sample=None, num_samples=None, + window_length_s=None, window_length_samples=None, overlap=0.5): + """ + Does a series of FFTs from `start_s` or `start_sample` for `duration_s` or `num_samples`. + Effectively, transforms a slice of the AudioSegment into the frequency domain across different + time bins. + + .. code-block:: python + + # Example for plotting a spectrogram using this function + import audiosegment + import matplotlib.pyplot as plt + + #... + seg = audiosegment.from_file("somebodytalking.wav") + freqs, times, amplitudes = seg.spectrogram(window_length_s=0.03, overlap=0.5) + amplitudes = 10 * np.log10(amplitudes + 1e-9) + + # Plot + plt.pcolormesh(times, freqs, amplitudes) + plt.xlabel("Time in Seconds") + plt.ylabel("Frequency in Hz") + plt.show() + + .. image:: images/spectrogram.png + + :param start_s: The start time. Starts at the beginning if neither this nor `start_sample` is specified. + :param duration_s: The duration of the spectrogram in seconds. Goes to the end if neither this nor + `num_samples` is specified. + :param start_sample: The index of the first sample to use. Starts at the beginning if neither this nor + `start_s` is specified. + :param num_samples: The number of samples in the spectrogram. Goes to the end if neither this nor + `duration_s` is specified. + :param window_length_s: The length of each FFT in seconds. If the total number of samples in the spectrogram + is not a multiple of the window length in samples, the last window will be zero-padded. + :param window_length_samples: The length of each FFT in number of samples. If the total number of samples in the + spectrogram is not a multiple of the window length in samples, the last window will + be zero-padded. + :param overlap: The fraction of each window to overlap. + :returns: Three np.ndarrays: The frequency values in Hz (the y-axis in a spectrogram), the time values starting + at start time and then increasing by `duration_s` each step (the x-axis in a spectrogram), and + the dB of each time/frequency bin as a 2D array of shape [len(frequency values), len(duration)]. 
+ :raises ValueError: If `start_s` and `start_sample` are both specified, if `duration_s` and `num_samples` are both + specified, if the first window's duration plus start time lead to running off the end + of the AudioSegment, or if `window_length_s` and `window_length_samples` are either + both specified or if they are both not specified. + """ + if start_s is not None and start_sample is not None: + raise ValueError("Only one of start_s and start_sample may be specified.") + if duration_s is not None and num_samples is not None: + raise ValueError("Only one of duration_s and num_samples may be specified.") + if window_length_s is not None and window_length_samples is not None: + raise ValueError("Only one of window_length_s and window_length_samples may be specified.") + if window_length_s is None and window_length_samples is None: + raise ValueError("You must specify a window length, either in window_length_s or in window_length_samples.") + + if start_s is None and start_sample is None: + start_sample = 0 + if duration_s is None and num_samples is None: + num_samples = len(self.get_array_of_samples()) - int(start_sample) + + if duration_s is not None: + num_samples = int(round(duration_s * self.frame_rate)) + if start_s is not None: + start_sample = int(round(start_s * self.frame_rate)) + + if window_length_s is not None: + window_length_samples = int(round(window_length_s * self.frame_rate)) + + if start_sample + num_samples > len(self.get_array_of_samples()): + raise ValueError("The combination of start and duration will run off the end of the AudioSegment object.") + + f, t, sxx = signal.spectrogram(self.to_numpy_array(), self.frame_rate, scaling='spectrum', nperseg=window_length_samples, + noverlap=int(round(overlap * window_length_samples)), + mode='magnitude') + return f, t, sxx + + def to_numpy_array(self): + """ + Convenience function for `np.array(self.get_array_of_samples())` while + keeping the appropriate dtype. + """ + dtype_dict = { + 1: np.int8, + 2: np.int16, + 4: np.int32 + } + dtype = dtype_dict[self.sample_width] + return np.array(self.get_array_of_samples(), dtype=dtype) + + @deprecated + def trim_to_minutes(self, strip_last_seconds=False): + """ + Returns a list of minute-long (at most) Segment objects. + + .. note:: This function has been deprecated. Use the `dice` function instead. + + :param strip_last_seconds: If True, this method will return minute-long segments, + but the last three seconds of this AudioSegment won't be returned. + This is useful for removing the microphone artifact at the end of the recording. + :returns: A list of AudioSegment objects, each of which is one minute long at most + (and only the last one - if any - will be less than one minute). + """ + outs = self.dice(seconds=60, zero_pad=False) + + # Now cut out the last three seconds of the last item in outs (it will just be microphone artifact) + # or, if the last item is less than three seconds, just get rid of it + if strip_last_seconds: + if outs[-1].duration_seconds > 3: + outs[-1] = outs[-1][:-MS_PER_S * 3] + else: + outs = outs[:-1] + + return outs + + def zero_extend(self, duration_s=None, num_samples=None): + """ + Adds a number of zeros (digital silence) to the AudioSegment (returning a new one). + + :param duration_s: The number of seconds of zeros to add. If this is specified, `num_samples` must be None. + :param num_samples: The number of zeros to add. If this is specified, `duration_s` must be None. + :returns: A new AudioSegment object that has been zero extended. 
+        :raises: ValueError if duration_s and num_samples are both specified.
+        """
+        if duration_s is not None and num_samples is not None:
+            raise ValueError("`duration_s` and `num_samples` cannot both be specified.")
+        elif duration_s is not None:
+            num_samples = self.frame_rate * duration_s
+        seg = AudioSegment(self.seg, self.name)
+        # silent() takes its duration in ms, and the silence must be long enough to hold
+        # the original segment plus the requested number of zero samples
+        total_ms = len(seg) + (num_samples / self.frame_rate) * MS_PER_S
+        zeros = silent(duration=total_ms, frame_rate=self.frame_rate)
+        return zeros.overlay(seg)
+
+def deserialize(bstr):
+    """
+    Attempts to deserialize a bytestring into an audiosegment.
+
+    :param bstr: The bytestring serialized via an audiosegment's serialize() method.
+    :returns: An AudioSegment object deserialized from `bstr`.
+    """
+    d = pickle.loads(bstr)
+    seg = pickle.loads(d['seg'])
+    return AudioSegment(seg, d['name'])
+
+def empty():
+    """
+    Creates a zero-duration AudioSegment object.
+
+    :returns: An empty AudioSegment object.
+    """
+    dubseg = pydub.AudioSegment.empty()
+    return AudioSegment(dubseg, "")
+
+def from_file(path):
+    """
+    Returns an AudioSegment object from the given file based on its file extension.
+    If the extension is wrong, this will throw some sort of error.
+
+    :param path: The path to the file, including the file extension.
+    :returns: An AudioSegment instance from the file.
+    """
+    _name, ext = os.path.splitext(path)
+    ext = ext.lower()[1:]
+    # Fall back to a substring check so that files whose names merely contain the format
+    # (rather than ending in a proper extension) still get a usable format hint
+    if "m4a" in path:
+        ext = "m4a"
+    elif "wav" in path:
+        ext = "wav"
+    seg = pydub.AudioSegment.from_file(path, format=ext)
+    return AudioSegment(seg, path)
+
+def from_mono_audiosegments(*args):
+    """
+    Creates a multi-channel AudioSegment out of multiple mono AudioSegments (two or more). Each mono
+    AudioSegment passed in should have exactly the same number of samples.
+
+    :returns: An AudioSegment of multiple channels formed from the given mono AudioSegments.
+    """
+    return AudioSegment(pydub.AudioSegment.from_mono_audiosegments(*args), "")
+
+def from_numpy_array(nparr, framerate):
+    """
+    Returns an AudioSegment created from the given numpy array.
+
+    The numpy array must have shape = (num_samples, num_channels).
+
+    :param nparr: The numpy array to create an AudioSegment from.
+    :param framerate: The sample rate (Hz) of the resulting AudioSegment.
+    :returns: An AudioSegment created from the given array.
+    """
+    # interleave the audio across all channels and collapse
+    if nparr.dtype.itemsize not in (1, 2, 4):
+        raise ValueError("Numpy Array must contain 8, 16, or 32 bit values.")
+    if len(nparr.shape) == 1:
+        arrays = [nparr]
+    elif len(nparr.shape) == 2:
+        # One column per channel, per the shape contract in the docstring
+        arrays = [nparr[:, i] for i in range(nparr.shape[1])]
+    else:
+        raise ValueError("Numpy Array must be one or two dimensional. Shape must be: (num_samples, num_channels).")
+    interleaved = np.vstack(arrays).reshape((-1,), order='F')
+    dubseg = pydub.AudioSegment(interleaved.tobytes(),
+                                frame_rate=framerate,
+                                sample_width=interleaved.dtype.itemsize,
+                                channels=len(arrays)
+                               )
+    return AudioSegment(dubseg, "")
+
+def silent(duration=1000, frame_rate=11025):
+    """
+    Creates an AudioSegment object of the specified duration/frame_rate filled with digital silence.
+
+    :param duration: The duration of the returned object in ms.
+    :param frame_rate: The samples per second of the returned object.
+    :returns: AudioSegment object filled with pure digital silence.
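+
+    A minimal usage sketch (values are illustrative):
+
+    .. code-block:: python
+
+        # Half a second of digital silence at 16 kHz
+        pad = silent(duration=500, frame_rate=16000)
+        assert pad.duration_seconds == 0.5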
+ """ + seg = pydub.AudioSegment.silent(duration=duration, frame_rate=frame_rate) + return AudioSegment(seg, "") + diff --git a/python-server/filesystem.py b/python-server/filesystem.py index 1c9d512..d7a6ad6 100644 --- a/python-server/filesystem.py +++ b/python-server/filesystem.py @@ -1,6 +1,6 @@ import base64 import os.path -from pydub import AudioSegment +import audiosegment_wrapper as AudioSegment def save_audio(filename, base64_string): decoded = None @@ -15,7 +15,9 @@ def save_audio(filename, base64_string): # return b"ERROR_FILE_EXISTS" with open(orig_filename,"wb") as f: f.write(decoded) - AudioSegment.from_file(orig_filename).export(filename,format="wav") + seg = AudioSegment.from_file(orig_filename) + seg = seg.resample(sample_rate_Hz=32000, sample_width=2, channels=1) + seg.export(filename,format="wav") return b"SUCCESS" def save_audio_chain(file_str_tupels): @@ -37,6 +39,7 @@ def save_audio_chain(file_str_tupels): if not completeAudio: return b"ERROR_AUDIO_CONCAT_FAILED" else: + completeAudio = completeAudio.resample(sample_rate_Hz=32000, sample_width=2, channels=1) completeAudio.export(file_str_tupels[0][0],format="wav") return b"SUCCESS" diff --git a/python-server/speech.py b/python-server/speech.py index 397e7d6..9d91069 100644 --- a/python-server/speech.py +++ b/python-server/speech.py @@ -3,6 +3,7 @@ import multiprocessing as mp import os.path import filesystem import log +import transcribe_async USE_FREE=False USE_PAID=True @@ -17,26 +18,24 @@ def create_and_save_transcript(filename): def analyse(filename): ''' returns the transcripted audio, or None if the analysis fails ''' - recognizer = spr.Recognizer() - with spr.AudioFile(filename) as source: - audio = recognizer.record(source) - try: if USE_FREE: + recognizer = spr.Recognizer() + with spr.AudioFile(filename) as source: + audio = recognizer.record(source) string = free_google_backend(recognizer, audio) elif USE_PAID: - string = paid_google_backend(recognizer,audio) + string = paid_google_backend(filename) except spr.UnknownValueError: log.log("Audio file is broken or not an audio file") return "ERROR_AUDIO_FILE_INVALID" except spr.RequestError as e: log.log("Could not connect to google API: {}".format(e)) return "ERROR_API_FAILURE" - return string def free_google_backend(recognizer, audio): return recognizer.recognize_google(audio,language="de-DE") -def paid_google_backend(recognizer, audio): - pass +def paid_google_backend(filename): + return transcribe_async.transcribe_file(filename) diff --git a/python-server/transcribe_async.py b/python-server/transcribe_async.py index 0a35005..1e4b643 100644 --- a/python-server/transcribe_async.py +++ b/python-server/transcribe_async.py @@ -24,68 +24,50 @@ Example usage: import argparse import io +from gcloud import storage +from google.cloud import speech +from google.cloud.speech import enums +from google.cloud.speech import types # [START speech_transcribe_async] def transcribe_file(speech_file): - """Transcribe the given audio file asynchronously.""" - from google.cloud import speech - from google.cloud.speech import enums - from google.cloud.speech import types - client = speech.SpeechClient() + url = upload_file(speech_file) + print(url) + return transcribe_gcs("gs://"+url) - # [START speech_python_migration_async_request] - with io.open(speech_file, 'rb') as audio_file: - content = audio_file.read() +def upload_file(filename): + bukket = "ths-speech-audio/" + client = storage.Client() + cb = client.get_bucket("ths-speech-audio") + blob = cb.blob(filename) + 
blob.upload_from_filename(filename)
+    return bukket + filename
 
-    audio = types.RecognitionAudio(content=content)
-    config = types.RecognitionConfig(
-        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
-        sample_rate_hertz=16000,
-        language_code='en-US')
-
-    # [START speech_python_migration_async_response]
-    operation = client.long_running_recognize(config, audio)
-    # [END speech_python_migration_async_request]
-
-    print('Waiting for operation to complete...')
-    response = operation.result(timeout=90)
-
-    # Each result is for a consecutive portion of the audio. Iterate through
-    # them to get the transcripts for the entire audio file.
-    for result in response.results:
-        # The first alternative is the most likely one for this portion.
-        print(u'Transcript: {}'.format(result.alternatives[0].transcript))
-        print('Confidence: {}'.format(result.alternatives[0].confidence))
-    # [END speech_python_migration_async_response]
-# [END speech_transcribe_async]
-
-
-# [START speech_transcribe_async_gcs]
 def transcribe_gcs(gcs_uri):
     """Asynchronously transcribes the audio file specified by the gcs_uri."""
-    from google.cloud import speech
-    from google.cloud.speech import enums
-    from google.cloud.speech import types
     client = speech.SpeechClient()
 
     audio = types.RecognitionAudio(uri=gcs_uri)
     config = types.RecognitionConfig(
-        #encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
-        #sample_rate_hertz=16000,
+        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
+        sample_rate_hertz=32000,
         language_code='de-DE')
 
     operation = client.long_running_recognize(config, audio)
 
     print('Waiting for operation to complete...')
-    response = operation.result(timeout=90)
+    # Allow up to 15 minutes for long recordings
+    response = operation.result(timeout=900)
 
     # Each result is for a consecutive portion of the audio. Iterate through
     # them to get the transcripts for the entire audio file.
+    ret = ""
     for result in response.results:
        # The first alternative is the most likely one for this portion.
+        # Separate consecutive portions with a space so words don't run together
+        ret += result.alternatives[0].transcript + " "
        print(u'Transcript: {}'.format(result.alternatives[0].transcript))
        print('Confidence: {}'.format(result.alternatives[0].confidence))
+    return ret.strip()
 # [END speech_transcribe_async_gcs]