diff --git a/ITU_P56.py b/ITU_P56.py new file mode 100644 index 0000000..f597127 --- /dev/null +++ b/ITU_P56.py @@ -0,0 +1,150 @@ +# this implements ITU P.56 method B. +# 'x' is the speech file to calculate active speech level for, +# 'actfact' is the activity factor (between 0 and 1) +# This is the proportion of the time that the speech is deemed "active" +# 'asl_msq' is the active speech level mean square energy. +# This is the mean square value in uPa^2 if x is in uPa. +# For active speech with x in uPa, +# the Leq in dB re 20 uPa is 10log10[asl_msq/20^2] +# +# 'c0' is the active speech level threshold. +# thi is the level in uPa above which the speech is deemed active +# +# Coded by Fred Commented by BL 16/6/2012. +# Adapted for python by Sam Perry 11/01/2020 +# +# # x is the column vector of floating point speech data + +# function [asl_msq, actfact, c0]= asl_P56_Fred_v2 ( x, fs, nbits) + +import numpy as np +import scipy.signal as signal +import matplotlib.pyplot as plt + +def asl_P56(x, fs, nbits): + eps = np.finfo(float).eps + x = x[:] # make sure x is column vector + if len(x.shape) < 2: + x = x[:, np.newaxis] + T = 0.03 # time constant of smoothing, in seconds + H = 0.2 # hangover time in seconds + M = 15.9 # margin in dB of the difference between threshold and ASL + thres_no = nbits-1 # number of thresholds, for 16 bit, it's 15 + + I = np.ceil(fs*H) # hangover in samples + g = np.exp(-1/(fs*T)) # smoothing factor in envelop detection + c = 2**np.arange(-15, thres_no-15, dtype=float) + + # vector with thresholds from one quantizing level up to half the maximum + # code, at a step of 2, in the case of 16bit samples, from 2^-15 to 0.5 + a = np.full(thres_no, -1) # activity counter for each level threshold + hang = np.full(thres_no, I) # hangover counter for each level threshold + + sq = x.T@x # long-term level square energy of x + x_len = x.shape[0] # length of x + + # use a 2nd order IIR filter to detect the envelope q + x_abs = np.abs( x) + p = signal.lfilter([1-g, 0], [1, -g], np.squeeze(x_abs)) + q = signal.lfilter([1-g, 0], [1, -g], np.squeeze(p)) # q is the envelope, obtained from moving average of abs(x) (with slight "hangover"). + + for k in range(x_len): + for j in range(thres_no): + if q[k] >= c[j]: + a[j] = a[j]+1 + hang[j] = 0 + elif hang[j] < I: + a[j] = a[j]+1 + hang[j] = hang[j]+1 + else: + break + actfact = 0 + asl_msq = 0 + if a[0] == -1: + return asl_msq, actfact, c0 + else: + a+=2 + AdB1=10*np.log10(sq/a[0]+eps) + + CdB1 = 20*np.log10(c[0]+eps) + if (AdB1-CdB1 < M): + return asl_msq, actfact, c0 + + AdB = np.zeros(thres_no) + CdB = np.zeros(thres_no) + Delta = np.zeros(thres_no) + AdB[0] = AdB1 + CdB[0] = CdB1 + Delta[0] = AdB1-CdB1 + + for j in range(1, thres_no): + AdB[j] = 10*np.log10(sq/(a[j]+eps)+eps) + CdB[j] = 20*np.log10(c[j]+eps) + + for j in range(1, thres_no): + if a[j] != 0: + Delta[j]= AdB[j]- CdB[j] + if Delta[j] <= M: + # interpolate to find the actfact + asl_ms_log, cl0 = bin_interp(AdB[j], + AdB[j-1], CdB[j], CdB[j-1], M, 0.5) + asl_msq = 10**(asl_ms_log/10) # this is the mean square value NOT the rms + actfact = (sq/x_len)/asl_msq # this is the proportion of the time that the speech is deemed "active" + c0= 10**(cl0/20) # this is the threshold above which the speech is deemed "active". + break + return asl_msq, actfact, c0 + +# -------------------------------------------------------------------------- + +def bin_interp(upcount, lwcount, upthr, lwthr, Margin, tol): + + if tol < 0: + tol = -tol + + # Check if extreme counts are not already the true active value + iterno = 1 + if np.abs(upcount - upthr - Margin) < tol: + asl_ms_log = upcount + cc = upthr + return asl_ms_log, cc + + if np.abs(lwcount - lwthr - Margin) < tol: + asl_ms_log= lwcount + cc= lwthr + return asl_ms_log, cc + + # Initialize first middle for given (initial) bounds + midcount = (upcount + lwcount) / 2.0 + midthr = (upthr + lwthr) / 2.0 + + # Repeats loop until `diff' falls inside the tolerance (-tol<=diff<=tol) + while 1: + diff = midcount-midthr-Margin + if abs(diff) <= tol: + break + + # if tolerance is not met up to 20 iteractions, then relax the + # tolerance by 10# + iterno += 1 + + if iterno>20: + tol = tol*1.1 + + if diff > tol: # then new bounds are... + midcount = (upcount+midcount)/2.0 + # upper and middle activities + midthr = (upthr+midthr)/2.0 + # ... and thresholds + elif diff < -tol: # then new bounds are... + midcount = (midcount+lwcount)/2.0 + # middle and lower activities + midthr = (midthr+lwthr)/2.0 + # ... and thresholds + + # Since the tolerance has been satisfied, midcount is selected + # as the interpolated value with a tol [dB] tolerance. + + asl_ms_log = midcount + cc = midthr + return asl_ms_log, cc + diff --git a/ITU_test.py b/ITU_test.py new file mode 100755 index 0000000..f0b9d8b --- /dev/null +++ b/ITU_test.py @@ -0,0 +1,14 @@ +#!/usr/bin/env python3 + +from ITU_P56 import asl_P56 +import numpy as np + +def main(): + fs = 44100 + x = np.sin(2*np.pi*440*(np.arange(fs)/fs)) + asl_msq, actfact, c0 = asl_P56(x, fs, 16) + breakpoint() + + +if __name__ == '__main__': + main()