Implemented ITU loudness estimation
This commit is contained in:
+150
@@ -0,0 +1,150 @@
|
||||
# this implements ITU P.56 method B.
|
||||
# 'x' is the speech file to calculate active speech level for,
|
||||
# 'actfact' is the activity factor (between 0 and 1)
|
||||
# This is the proportion of the time that the speech is deemed "active"
|
||||
# 'asl_msq' is the active speech level mean square energy.
|
||||
# This is the mean square value in uPa^2 if x is in uPa.
|
||||
# For active speech with x in uPa,
|
||||
# the Leq in dB re 20 uPa is 10log10[asl_msq/20^2]
|
||||
#
|
||||
# 'c0' is the active speech level threshold.
|
||||
# thi is the level in uPa above which the speech is deemed active
|
||||
#
|
||||
# Coded by Fred Commented by BL 16/6/2012.
|
||||
# Adapted for python by Sam Perry 11/01/2020
|
||||
#
|
||||
# # x is the column vector of floating point speech data
|
||||
|
||||
# function [asl_msq, actfact, c0]= asl_P56_Fred_v2 ( x, fs, nbits)
|
||||
|
||||
import numpy as np
|
||||
import scipy.signal as signal
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
def asl_P56(x, fs, nbits):
|
||||
eps = np.finfo(float).eps
|
||||
x = x[:] # make sure x is column vector
|
||||
if len(x.shape) < 2:
|
||||
x = x[:, np.newaxis]
|
||||
T = 0.03 # time constant of smoothing, in seconds
|
||||
H = 0.2 # hangover time in seconds
|
||||
M = 15.9 # margin in dB of the difference between threshold and ASL
|
||||
thres_no = nbits-1 # number of thresholds, for 16 bit, it's 15
|
||||
|
||||
I = np.ceil(fs*H) # hangover in samples
|
||||
g = np.exp(-1/(fs*T)) # smoothing factor in envelop detection
|
||||
c = 2**np.arange(-15, thres_no-15, dtype=float)
|
||||
|
||||
# vector with thresholds from one quantizing level up to half the maximum
|
||||
# code, at a step of 2, in the case of 16bit samples, from 2^-15 to 0.5
|
||||
a = np.full(thres_no, -1) # activity counter for each level threshold
|
||||
hang = np.full(thres_no, I) # hangover counter for each level threshold
|
||||
|
||||
sq = x.T@x # long-term level square energy of x
|
||||
x_len = x.shape[0] # length of x
|
||||
|
||||
# use a 2nd order IIR filter to detect the envelope q
|
||||
x_abs = np.abs( x)
|
||||
p = signal.lfilter([1-g, 0], [1, -g], np.squeeze(x_abs))
|
||||
q = signal.lfilter([1-g, 0], [1, -g], np.squeeze(p)) # q is the envelope, obtained from moving average of abs(x) (with slight "hangover").
|
||||
|
||||
for k in range(x_len):
|
||||
for j in range(thres_no):
|
||||
if q[k] >= c[j]:
|
||||
a[j] = a[j]+1
|
||||
hang[j] = 0
|
||||
elif hang[j] < I:
|
||||
a[j] = a[j]+1
|
||||
hang[j] = hang[j]+1
|
||||
else:
|
||||
break
|
||||
actfact = 0
|
||||
asl_msq = 0
|
||||
if a[0] == -1:
|
||||
return asl_msq, actfact, c0
|
||||
else:
|
||||
a+=2
|
||||
AdB1=10*np.log10(sq/a[0]+eps)
|
||||
|
||||
CdB1 = 20*np.log10(c[0]+eps)
|
||||
if (AdB1-CdB1 < M):
|
||||
return asl_msq, actfact, c0
|
||||
|
||||
AdB = np.zeros(thres_no)
|
||||
CdB = np.zeros(thres_no)
|
||||
Delta = np.zeros(thres_no)
|
||||
AdB[0] = AdB1
|
||||
CdB[0] = CdB1
|
||||
Delta[0] = AdB1-CdB1
|
||||
|
||||
for j in range(1, thres_no):
|
||||
AdB[j] = 10*np.log10(sq/(a[j]+eps)+eps)
|
||||
CdB[j] = 20*np.log10(c[j]+eps)
|
||||
|
||||
for j in range(1, thres_no):
|
||||
if a[j] != 0:
|
||||
Delta[j]= AdB[j]- CdB[j]
|
||||
if Delta[j] <= M:
|
||||
# interpolate to find the actfact
|
||||
asl_ms_log, cl0 = bin_interp(AdB[j],
|
||||
AdB[j-1], CdB[j], CdB[j-1], M, 0.5)
|
||||
asl_msq = 10**(asl_ms_log/10) # this is the mean square value NOT the rms
|
||||
actfact = (sq/x_len)/asl_msq # this is the proportion of the time that the speech is deemed "active"
|
||||
c0= 10**(cl0/20) # this is the threshold above which the speech is deemed "active".
|
||||
break
|
||||
return asl_msq, actfact, c0
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
def bin_interp(upcount, lwcount, upthr, lwthr, Margin, tol):
|
||||
|
||||
if tol < 0:
|
||||
tol = -tol
|
||||
|
||||
# Check if extreme counts are not already the true active value
|
||||
iterno = 1
|
||||
if np.abs(upcount - upthr - Margin) < tol:
|
||||
asl_ms_log = upcount
|
||||
cc = upthr
|
||||
return asl_ms_log, cc
|
||||
|
||||
if np.abs(lwcount - lwthr - Margin) < tol:
|
||||
asl_ms_log= lwcount
|
||||
cc= lwthr
|
||||
return asl_ms_log, cc
|
||||
|
||||
# Initialize first middle for given (initial) bounds
|
||||
midcount = (upcount + lwcount) / 2.0
|
||||
midthr = (upthr + lwthr) / 2.0
|
||||
|
||||
# Repeats loop until `diff' falls inside the tolerance (-tol<=diff<=tol)
|
||||
while 1:
|
||||
diff = midcount-midthr-Margin
|
||||
if abs(diff) <= tol:
|
||||
break
|
||||
|
||||
# if tolerance is not met up to 20 iteractions, then relax the
|
||||
# tolerance by 10#
|
||||
iterno += 1
|
||||
|
||||
if iterno>20:
|
||||
tol = tol*1.1
|
||||
|
||||
if diff > tol: # then new bounds are...
|
||||
midcount = (upcount+midcount)/2.0
|
||||
# upper and middle activities
|
||||
midthr = (upthr+midthr)/2.0
|
||||
# ... and thresholds
|
||||
elif diff < -tol: # then new bounds are...
|
||||
midcount = (midcount+lwcount)/2.0
|
||||
# middle and lower activities
|
||||
midthr = (midthr+lwthr)/2.0
|
||||
# ... and thresholds
|
||||
|
||||
# Since the tolerance has been satisfied, midcount is selected
|
||||
# as the interpolated value with a tol [dB] tolerance.
|
||||
|
||||
asl_ms_log = midcount
|
||||
cc = midthr
|
||||
return asl_ms_log, cc
|
||||
|
||||
Executable
+14
@@ -0,0 +1,14 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
from ITU_P56 import asl_P56
|
||||
import numpy as np
|
||||
|
||||
def main():
|
||||
fs = 44100
|
||||
x = np.sin(2*np.pi*440*(np.arange(fs)/fs))
|
||||
asl_msq, actfact, c0 = asl_P56(x, fs, 16)
|
||||
breakpoint()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Reference in New Issue
Block a user