Files
BPLabs/ITU_P56.py
T

153 lines
5.1 KiB
Python

# this implements ITU P.56 method B.
# 'x' is the speech file to calculate active speech level for,
# 'actfact' is the activity factor (between 0 and 1)
# This is the proportion of the time that the speech is deemed "active"
# 'asl_msq' is the active speech level mean square energy.
# This is the mean square value in uPa^2 if x is in uPa.
# For active speech with x in uPa,
# the Leq in dB re 20 uPa is 10log10[asl_msq/20^2]
#
# 'c0' is the active speech level threshold.
# thi is the level in uPa above which the speech is deemed active
#
# Coded by Fred Commented by BL 16/6/2012.
# Adapted for python by Sam Perry 11/01/2020
#
# # x is the column vector of floating point speech data
# function [asl_msq, actfact, c0]= asl_P56_Fred_v2 ( x, fs, nbits)
import numpy as np
import scipy.signal as signal
import matplotlib.pyplot as plt
def asl_P56(x, fs, nbits):
nbits = int(nbits)
eps = np.finfo(float).eps
x = x[:] # make sure x is column vector
if len(x.shape) < 2:
x = x[:, np.newaxis]
T = 0.03 # time constant of smoothing, in seconds
H = 0.2 # hangover time in seconds
M = 15.9 # margin in dB of the difference between threshold and ASL
thres_no = nbits-1 # number of thresholds, for 16 bit, it's 15
c0 = None
I = np.ceil(fs*H) # hangover in samples
g = np.exp(-1/(fs*T)) # smoothing factor in envelop detection
c = 2**np.arange(-15, thres_no-15, dtype=float)
# vector with thresholds from one quantizing level up to half the maximum
# code, at a step of 2, in the case of 16bit samples, from 2^-15 to 0.5
a = np.full(thres_no, -1) # activity counter for each level threshold
hang = np.full(thres_no, I) # hangover counter for each level threshold
sq = x.T@x # long-term level square energy of x
x_len = x.shape[0] # length of x
# use a 2nd order IIR filter to detect the envelope q
x_abs = np.abs( x)
p = signal.lfilter([1-g, 0], [1, -g], np.squeeze(x_abs))
q = signal.lfilter([1-g, 0], [1, -g], np.squeeze(p)) # q is the envelope, obtained from moving average of abs(x) (with slight "hangover").
for k in range(x_len):
for j in range(thres_no):
if q[k] >= c[j]:
a[j] = a[j]+1
hang[j] = 0
elif hang[j] < I:
a[j] = a[j]+1
hang[j] = hang[j]+1
else:
break
actfact = 0
asl_msq = 0
if a[0] == -1:
return asl_msq, actfact, c0
else:
a+=2
AdB1=10*np.log10(sq/a[0]+eps)
CdB1 = 20*np.log10(c[0]+eps)
if (AdB1-CdB1 < M):
return asl_msq, actfact, c0
AdB = np.zeros(thres_no)
CdB = np.zeros(thres_no)
Delta = np.zeros(thres_no)
AdB[0] = AdB1
CdB[0] = CdB1
Delta[0] = AdB1-CdB1
for j in range(1, thres_no):
AdB[j] = 10*np.log10(sq/(a[j]+eps)+eps)
CdB[j] = 20*np.log10(c[j]+eps)
for j in range(1, thres_no):
if a[j] != 0:
Delta[j]= AdB[j]- CdB[j]
if Delta[j] <= M:
# interpolate to find the actfact
asl_ms_log, cl0 = bin_interp(AdB[j],
AdB[j-1], CdB[j], CdB[j-1], M, 0.5)
asl_msq = 10**(asl_ms_log/10) # this is the mean square value NOT the rms
actfact = (sq/x_len)/asl_msq # this is the proportion of the time that the speech is deemed "active"
c0= 10**(cl0/20) # this is the threshold above which the speech is deemed "active".
break
return asl_msq, actfact, c0
# --------------------------------------------------------------------------
def bin_interp(upcount, lwcount, upthr, lwthr, Margin, tol):
if tol < 0:
tol = -tol
# Check if extreme counts are not already the true active value
iterno = 1
if np.abs(upcount - upthr - Margin) < tol:
asl_ms_log = upcount
cc = upthr
return asl_ms_log, cc
if np.abs(lwcount - lwthr - Margin) < tol:
asl_ms_log= lwcount
cc= lwthr
return asl_ms_log, cc
# Initialize first middle for given (initial) bounds
midcount = (upcount + lwcount) / 2.0
midthr = (upthr + lwthr) / 2.0
# Repeats loop until `diff' falls inside the tolerance (-tol<=diff<=tol)
while 1:
diff = midcount-midthr-Margin
if abs(diff) <= tol:
break
# if tolerance is not met up to 20 iteractions, then relax the
# tolerance by 10#
iterno += 1
if iterno>20:
tol = tol*1.1
if diff > tol: # then new bounds are...
midcount = (upcount+midcount)/2.0
# upper and middle activities
midthr = (upthr+midthr)/2.0
# ... and thresholds
elif diff < -tol: # then new bounds are...
midcount = (midcount+lwcount)/2.0
# middle and lower activities
midthr = (midthr+lwthr)/2.0
# ... and thresholds
# Since the tolerance has been satisfied, midcount is selected
# as the interpolated value with a tol [dB] tolerance.
asl_ms_log = midcount
cc = midthr
return asl_ms_log, cc