FrameLib/FrameLib_Dependencies/KernelSmoother.hpp


#ifndef KERNELSMOOTHER_HPP
#define KERNELSMOOTHER_HPP

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <limits>
#include <type_traits>

#include "Allocator.hpp"
#include "SIMDSupport.hpp"
#include "SpectralProcessor.hpp"

template <typename T, typename Allocator = aligned_allocator, bool auto_resize_fft = false>
class kernel_smoother : private spectral_processor<T, Allocator>
{
    using Split = typename FFTTypes<T>::Split;

    using processor = spectral_processor<T, Allocator>;
    using binary_sizes = typename processor::binary_sizes;
    using zipped_pointer = typename processor::zipped_pointer;
    using in_ptr = typename processor::in_ptr;

    template <bool B>
    using enable_if_t = typename std::enable_if<B>::type;

public:

    enum SmoothMode { kSmoothZeroPad, kSmoothWrap, kSmoothFold };

    template <typename U = Allocator, typename = enable_if_t<std::is_default_constructible<U>::value>>
    kernel_smoother()
    {
        set_max_fft_size(1 << 18);
    }

    template <typename U = Allocator, typename = enable_if_t<std::is_copy_constructible<U>::value>>
    kernel_smoother(const Allocator& allocator) : spectral_processor<T, Allocator>(allocator)
    {
        set_max_fft_size(1 << 18);
    }

    template <typename U = Allocator, typename = enable_if_t<std::is_move_constructible<U>::value>>
    kernel_smoother(Allocator&& allocator) : spectral_processor<T, Allocator>(allocator)
    {
        set_max_fft_size(1 << 18);
    }

    void set_max_fft_size(uintptr_t size) { processor::set_max_fft_size(size); }

    void smooth(T *out, const T *in, const T *kernel, uintptr_t length, uintptr_t kernel_length, double width_lo, double width_hi, SmoothMode mode)
    {
        Allocator& allocator = processor::m_allocator;

        const int N = SIMDLimits<T>::max_size;

        width_lo = std::min(static_cast<double>(length), std::max(1.0, width_lo));
        width_hi = std::min(static_cast<double>(length), std::max(1.0, width_hi));

        double width_mul = (width_hi - width_lo) / (length - 1);

        auto half_width_calc = [&](uintptr_t a)
        {
            return static_cast<uintptr_t>(std::round((width_lo + a * width_mul) * 0.5));
        };

        uintptr_t filter_size = std::ceil(std::max(width_lo, width_hi) * 0.5);
        uintptr_t filter_full = filter_size * 2 - 1;
        uintptr_t max_per_filter = width_mul ? (2.0 / width_mul) + 1.0 : length;
        uintptr_t data_width = max_per_filter + (filter_size - 1) * 2;

        binary_sizes sizes(filter_full, data_width);

        if (auto_resize_fft && processor::max_fft_size() < sizes.fft())
            set_max_fft_size(sizes.fft());

        uintptr_t fft_size = processor::max_fft_size() >= sizes.fft() ? sizes.fft() : 0;

        T *ptr = allocator.template allocate<T>(fft_size * 2 + filter_full + length + filter_size * 2);
        Split io { ptr, ptr + (fft_size >> 1) };
        Split st { io.realp + fft_size, io.imagp + fft_size };
        T *filter = ptr + (fft_size << 1) + filter_size - 1;
        T *temp = filter + filter_size;

        bool non_zero_end = true;

        if (kernel_length)
        {
            const T max_value = *std::max_element(kernel, kernel + kernel_length);
            const T test_value = kernel[kernel_length - 1] / max_value;
            const T epsilon = std::numeric_limits<T>::epsilon();

            if (test_value < epsilon)
                non_zero_end = false;
        }

        // Copy data

        switch (mode)
        {
            case kSmoothZeroPad:
                std::fill_n(temp, filter_size, 0.0);
                std::copy_n(in, length, temp + filter_size);
                std::fill_n(temp + filter_size + length, filter_size, 0.0);
                break;

            case kSmoothWrap:
                std::copy_n(in + length - filter_size, filter_size, temp);
                std::copy_n(in, length, temp + filter_size);
                std::copy_n(in, filter_size, temp + filter_size + length);
                break;

            case kSmoothFold:
                std::reverse_copy(in + 1, in + 1 + filter_size, temp);
                std::copy(in, in + length, temp + filter_size);
                std::reverse_copy(in + length - (filter_size + 1), in + length - 1, temp + filter_size + length);
                break;
        }

        const double *data = temp + filter_size;

        for (uintptr_t i = 0, j = 0; i < length; i = j)
        {
            uintptr_t half_width = static_cast<uintptr_t>(half_width_calc(i));
            const T filter_normalise = make_filter(filter, kernel, kernel_length, half_width, non_zero_end);

            for (j = i; (j < length) && half_width == half_width_calc(j); j++);

            //uintptr_t optimal_fft = 1 << processor::calc_fft_size_log2(half_width * 4);
            uintptr_t n = j - i;
            uintptr_t k = 0;
            uintptr_t m = n;//std::min(optimal_fft / 2, n);

            m = use_fft(n, half_width, fft_size) ? m : 0;

            for (; k + (m - 1) < n; k += m)
                apply_filter_fft(out + i + k, data + i + k, filter, io, st, half_width, m, filter_normalise);

            for (; k + (N - 1) < n; k += N)
                apply_filter<N>(out + i + k, data + i + k, filter, half_width, filter_normalise);

            for (; k < n; k++)
                apply_filter<1>(out + i + k, data + i + k, filter, half_width, filter_normalise);
        }
        /*
         for (uintptr_t i = 0; i < length; i++)
         {
         // FIX - not safe
         double width = width_lo + i * width_mul;
         double width_normalise = 2.0 / width;
         uintptr_t half_width = width * 0.5;

         T filter_sum = kernel[0];
         T filter_val = data[i] * filter_sum;

         for (uintptr_t j = 1; j < half_width; j++)
         {
         T filter = filter_kernel(kernel, j * (kernel_length - 1) * width_normalise);
         filter_val += filter * (data[i - j] + data[i + j]);
         filter_sum += filter + filter;
         }

         out[i] = filter_val / filter_sum;
         }*/

        allocator.deallocate(ptr);
    }

private:

    bool use_fft(uintptr_t n, uintptr_t half_width, uintptr_t fft_size)
    {
        return fft_size && n > 2 && half_width > 2 && (32 * n > half_width);
    }

    T filter_kernel(const T *kernel, double position)
    {
        uintptr_t index = static_cast<uintptr_t>(position);

        const T lo = kernel[index];
        const T hi = kernel[index + 1];

        return static_cast<T>(lo + (position - index) * (hi - lo));
    }

    T make_filter(T *filter, const T *kernel, uintptr_t kernel_length, uintptr_t half_width, bool non_zero_end)
    {
        const double width_normalise = 1.0 / std::max(uintptr_t(1), half_width - (non_zero_end ? 1 : 0));
        T filter_sum = 0.0;

        uintptr_t loop_size = non_zero_end ? half_width - 1 : half_width;

        for (uintptr_t j = 0; j < loop_size; j++)
        {
            filter[j] = filter_kernel(kernel, j * (kernel_length - 1) * width_normalise);
            filter_sum += filter[j];
        }

        if (non_zero_end)
        {
            filter[half_width - 1] = kernel[kernel_length - 1];
            filter_sum += filter[half_width - 1];
        }

        return T(1) / (filter_sum * T(2) - filter[0]);
    }

    template <int N>
    void apply_filter(T *out, const T *data, const T *filter, uintptr_t half_width, T gain)
    {
        using VecType = SIMDType<double, N>;

        VecType filter_val = SIMDType<double, N>(data) * filter[0];

        for (uintptr_t j = 1; j < half_width; j++)
            filter_val += filter[j] * (VecType(data - j) + VecType(data + j));

        filter_val *= gain;
        filter_val.store(out);
    }

    void apply_filter_fft(T *out, const T *data, T *filter, Split& io, Split& temp, uintptr_t half_width, uintptr_t n, T gain)
    {
        uintptr_t filter_width = half_width * 2 - 1;
        uintptr_t data_width = n + (half_width - 1) * 2;
        binary_sizes sizes(data_width, filter_width);
        in_ptr data_in(data - (half_width - 1), data_width);
        in_ptr filter_in(filter - (half_width - 1), filter_width);

        // Mirror the filter

        for (intptr_t i = 1; i < (intptr_t) half_width; i++)
            filter[-i] = filter[i];

        // Process

        processor::template binary_op<ir_convolve_real>(io, temp, sizes, data_in, filter_in);

        // Copy output with scaling

        zipped_pointer p(io, half_width - 1);

        for (uintptr_t i = 0; i < n; i++)
            out[i] = *p++ * gain;
    }
};


#endif