BPLabs/ITU_P56.py

# this implements ITU P.56 method B.
# 'x' is the speech file to calculate active speech level for,
# 'actfact' is the activity factor (between 0 and 1)
#        This is the proportion of the time that the speech is deemed "active"
# 'asl_msq' is the active speech level mean square energy.
#        This is the mean square value in uPa^2 if x is in uPa.
#        For active speech with x in uPa,
#        the Leq in dB re 20 uPa is 10log10[asl_msq/20^2]
#
# 'c0' is the active speech level threshold.
#     thi is the level in uPa above which the speech is deemed active
#
# Coded by Fred Commented by BL 16/6/2012.
# Adapted for python by Sam Perry 11/01/2020
#
# # x is the column vector of floating point speech data

# function [asl_msq, actfact, c0]= asl_P56_Fred_v2 ( x, fs, nbits)

import numpy as np
import scipy.signal as signal
import matplotlib.pyplot as plt

def asl_P56(x, fs, nbits):
    nbits = int(nbits)
    eps = np.finfo(float).eps
    x = x[:]  # make sure x is column vector
    if len(x.shape) < 2:
        x = x[:, np.newaxis]
    T = 0.03  # time constant of smoothing, in seconds
    H = 0.2  # hangover time in seconds
    M = 15.9  # margin in dB of the difference between threshold and ASL
    thres_no = nbits-1  # number of thresholds, for 16 bit, it's 15
    c0 = None

    I = np.ceil(fs*H)  # hangover in samples
    g = np.exp(-1/(fs*T))  # smoothing factor in envelop detection
    c = 2**np.arange(-15, thres_no-15, dtype=float)

    # vector with thresholds from one quantizing level up to half the maximum
    # code, at a step of 2, in the case of 16bit samples, from 2^-15 to 0.5
    a = np.full(thres_no, -1)  # activity counter for each level threshold
    hang = np.full(thres_no, I)  # hangover counter for each level threshold

    sq = x.T@x  # long-term level square energy of x
    x_len = x.shape[0] # length of x

    # use a 2nd order IIR filter to detect the envelope q
    x_abs = np.abs( x)
    p = signal.lfilter([1-g, 0], [1, -g], np.squeeze(x_abs))
    q = signal.lfilter([1-g, 0], [1, -g], np.squeeze(p))  # q is the envelope, obtained from moving average of abs(x) (with slight "hangover").

    for k in range(x_len):
        for j in range(thres_no):
            if q[k] >= c[j]:
                a[j] = a[j]+1
                hang[j] = 0
            elif hang[j] < I:
                a[j] = a[j]+1
                hang[j] = hang[j]+1
            else:
                break
    actfact = 0
    asl_msq = 0
    if a[0] == -1:
        return asl_msq, actfact, c0
    else:
        a+=2
        AdB1=10*np.log10(sq/a[0]+eps)

    CdB1 = 20*np.log10(c[0]+eps)
    if (AdB1-CdB1 < M):
        return asl_msq, actfact, c0

    AdB = np.zeros(thres_no)
    CdB = np.zeros(thres_no)
    Delta = np.zeros(thres_no)
    AdB[0] = AdB1
    CdB[0] = CdB1
    Delta[0] = AdB1-CdB1

    for j in range(1, thres_no):
        AdB[j] = 10*np.log10(sq/(a[j]+eps)+eps)
        CdB[j] = 20*np.log10(c[j]+eps)

    for j in range(1, thres_no):
        if a[j] != 0:
            Delta[j]= AdB[j]- CdB[j]
            if Delta[j] <= M:
                # interpolate to find the actfact
                asl_ms_log, cl0 = bin_interp(AdB[j],
                    AdB[j-1], CdB[j], CdB[j-1], M, 0.5)
                asl_msq = 10**(asl_ms_log/10) # this is the mean square value NOT the rms
                actfact = (sq/x_len)/asl_msq # this is the proportion of the time that the speech is deemed "active"
                c0= 10**(cl0/20) # this is the threshold above which the speech is deemed "active".
                break
    return asl_msq, actfact, c0

# --------------------------------------------------------------------------

def bin_interp(upcount, lwcount, upthr, lwthr, Margin, tol):

    if tol < 0:
        tol = -tol

    # Check if extreme counts are not already the true active value
    iterno = 1
    if np.abs(upcount - upthr - Margin) < tol:
        asl_ms_log = upcount
        cc = upthr
        return asl_ms_log, cc

    if np.abs(lwcount - lwthr - Margin) < tol:
        asl_ms_log= lwcount
        cc= lwthr
        return asl_ms_log, cc

    # Initialize first middle for given (initial) bounds
    midcount = (upcount + lwcount) / 2.0
    midthr = (upthr + lwthr) / 2.0

    # Repeats loop until `diff' falls inside the tolerance (-tol<=diff<=tol)
    while 1:
        diff = midcount-midthr-Margin
        if abs(diff) <= tol:
            break

        # if tolerance is not met up to 20 iteractions, then relax the
        # tolerance by 10#
        iterno += 1

        if iterno>20:
            tol = tol*1.1

        if diff > tol:   # then new bounds are...
            midcount = (upcount+midcount)/2.0
            # upper and middle activities
            midthr = (upthr+midthr)/2.0
            # ... and thresholds
        elif diff < -tol:  # then new bounds are...
            midcount = (midcount+lwcount)/2.0
            # middle and lower activities
            midthr = (midthr+lwthr)/2.0
            # ... and thresholds

    #   Since the tolerance has been satisfied, midcount is selected
    #   as the interpolated value with a tol [dB] tolerance.

    asl_ms_log = midcount
    cc = midthr
    return asl_ms_log, cc