Updates from HISSTools_Library

2019-08-14 13:57:21 +01:00
parent 50474b2985
commit 1f84220a10
3 changed files with 246 additions and 144 deletions
@@ -0,0 +1,41 @@
+
+#ifndef ALLOCATOR_HPP
+#define ALLOCATOR_HPP
+
+#include <cstdlib>
+
+#include "SIMDSupport.hpp"
+
+namespace impl
+{
+    typedef void *(*allocate_function)(size_t);
+    typedef void (*free_function)(void *);
+};
+
+// Atemplate for wrapping functions as an allocator
+
+template<impl::allocate_function alloc, impl::free_function dealloc>
+struct function_allocator
+{
+    template <typename T>
+    T* allocate(size_t size) { return reinterpret_cast<T*>(alloc(size * sizeof(T))); }
+    
+    template <typename T>
+    void deallocate(T *ptr) { dealloc(ptr); }
+};
+
+using malloc_allocator = function_allocator<malloc, free>;
+
+// Aligned allocator
+
+struct aligned_allocator
+{
+    
+    template <typename T>
+    T* allocate(size_t size) { return allocate_aligned<T>(size); }
+    
+    template <typename T>
+    void deallocate(T *ptr) { deallocate_aligned(ptr); }
+};
+
+#endif
@@ -8,168 +8,227 @@
 #include <cstdlib>
 #include <limits>

+#include "Allocator.hpp"
 #include "SIMDSupport.hpp"
+#include "SpectralProcessor.hpp"

-enum SmoothMode { kSmoothZeroPad, kSmoothWrap, kSmoothFold };
-
-template <typename T>
-T filter_kernel(const T *kernel, double position)
+template <typename T, typename Allocator = aligned_allocator, bool auto_resize_fft = false>
+class kernel_smoother : private spectral_processor<T, Allocator>
 {
-    uintptr_t index = static_cast<uintptr_t>(position);
+    using Split = typename FFTTypes<T>::Split;
    
-    const T lo = kernel[index];
-    const T hi = kernel[index + 1];
+    using processor = spectral_processor<T, Allocator>;
+    using binary_sizes = typename processor::binary_sizes;
+    using zipped_pointer = typename processor::zipped_pointer;
+    using in_ptr = typename processor::in_ptr;
    
-    return static_cast<T>(lo + (position - index) * (hi - lo));
-}
-
-template <typename T>
-T make_filter(T *filter, const T *kernel, uintptr_t kernel_length, uintptr_t half_width, bool non_zero_end)
-{
-    const double width_normalise = 1.0 / std::max(uintptr_t(1), half_width - (non_zero_end ? 1 : 0));
-    T filter_sum = 0.0;
+public:
    
-    uintptr_t loop_size = non_zero_end ? half_width - 1 : half_width;
-
-    for (uintptr_t j = 0; j < loop_size; j++)
+    enum SmoothMode { kSmoothZeroPad, kSmoothWrap, kSmoothFold };
+    
+    kernel_smoother() : spectral_processor<T, Allocator>(aligned_allocator())
    {
-        filter[j] = filter_kernel(kernel, j * (kernel_length - 1) * width_normalise);
-        filter_sum += filter[j];
+        set_max_fft_size(1 << 18);
    }
-    
-    if (non_zero_end)
-    {
-        filter[half_width - 1] = kernel[kernel_length - 1];
-        filter_sum += filter[half_width - 1];
-    }
-    
-    return T(1) / (filter_sum * T(2) - filter[0]);
-}

-template <int N, typename T>
-void apply_filter(T *out, const T *data, const T *filter, uintptr_t half_width, T gain)
-{
-    using VecType = SIMDType<double, N>;
+    void set_max_fft_size(uintptr_t size) { processor::set_max_fft_size(size); }
    
-    VecType filter_val = SIMDType<double, N>(data) * filter[0];
-    
-    for (uintptr_t j = 1; j < half_width; j++)
-        filter_val += filter[j] * (VecType(data -j) + VecType(data + j));
-    
-    filter_val *= gain;
-    filter_val.store(out);
-}
-
-template <typename T, typename Allocator>
-void kernel_smooth(T *out, const T *in, const T *kernel, uintptr_t length, uintptr_t kernel_length, double width_lo, double width_hi, SmoothMode mode, Allocator allocator)
-{
-    const int N = SIMDLimits<T>::max_size;
-
-    width_lo = std::min(static_cast<double>(length), std::max(1.0, width_lo));
-    width_hi = std::min(static_cast<double>(length), std::max(1.0, width_hi));
-    
-    double width_mul = (width_hi - width_lo) / (length - 1);
-    
-    auto half_width_calc = [&](uintptr_t a)
+    void smooth(T *out, const T *in, const T *kernel, uintptr_t length, uintptr_t kernel_length, double width_lo, double width_hi, SmoothMode mode)
    {
-        return static_cast<uintptr_t>(std::round((width_lo + a * width_mul) * 0.5));
-    };
-                                                                
-    uintptr_t filter_size = std::ceil(std::max(width_lo, width_hi) * 0.5);
-    
-    T *filter = allocator.template alloc<T>(filter_size);
-    T *temp = allocator.template alloc<T>(length + filter_size * 2);
-    
-    bool non_zero_end = true;
-    
-    if (kernel_length)
-    {
-        const T max_value = *std::max_element(kernel, kernel + kernel_length);
-        const T test_value = kernel[kernel_length - 1] / max_value;
-        const T epsilon = std::numeric_limits<T>::epsilon();
+        Allocator& allocator = processor::m_allocator;
        
-        if (test_value < epsilon)
-            non_zero_end = false;
-    }
-    
-    // Copy data
-    
-    switch (mode)
-    {
-        case kSmoothZeroPad:
-            std::fill_n(temp, filter_size, 0.0);
-            std::copy_n(in, length, temp + filter_size);
-            std::fill_n(temp + filter_size + length, filter_size, 0.0);
-            break;
-    
-        case kSmoothWrap:
-            std::copy_n(in + length - filter_size, filter_size, temp);
-            std::copy_n(in, length, temp + filter_size);
-            std::copy_n(in, filter_size, temp + filter_size + length);
-            break;
-            
-        case kSmoothFold:
-            std::reverse_copy(in + 1, in + 1 + filter_size, temp);
-            std::copy(in, in + length, temp + filter_size);
-            std::reverse_copy(in, in + filter_size, temp + filter_size + length);
-            break;
-    }
-    
-    const double *data = temp + filter_size;
-    
-    for (uintptr_t i = 0, j = 0; i < length; i = j)
-    {
-        uintptr_t half_width = static_cast<uintptr_t>(half_width_calc(i));
-        const T filter_normalise = make_filter(filter, kernel, kernel_length, half_width, non_zero_end);
-                
-        for (j = i; (j < length) && half_width == half_width_calc(j); j++);
+        const int N = SIMDLimits<T>::max_size;
        
-        uintptr_t n = j - i;
-        uintptr_t k = 0;
+        width_lo = std::min(static_cast<double>(length), std::max(1.0, width_lo));
+        width_hi = std::min(static_cast<double>(length), std::max(1.0, width_hi));
        
-        for (; k + (N - 1) < n; k += N)
-            apply_filter<N>(out + i + k, data + i + k, filter, half_width, filter_normalise);
+        double width_mul = (width_hi - width_lo) / (length - 1);
        
-        for (; k < n; k++)
-            apply_filter<1>(out + i + k, data + i + k, filter, half_width, filter_normalise);
-    }
-    /*
-    for (uintptr_t i = 0; i < length; i++)
-    {
-     // FIX - not safe
-        double width = width_lo + i * width_mul;
-        double width_normalise = 2.0 / width;
-        uintptr_t half_width = width * 0.5;
-        
-        T filter_sum = kernel[0];
-        T filter_val = data[i] * filter_sum;
-        
-        for (uintptr_t j = 1; j < half_width; j++)
+        auto half_width_calc = [&](uintptr_t a)
        {
-            T filter = filter_kernel(kernel, j * (kernel_length - 1) * width_normalise);
-            filter_val += filter * (data[i - j] + data[i + j]);
-            filter_sum += filter + filter;
+            return static_cast<uintptr_t>(std::round((width_lo + a * width_mul) * 0.5));
+        };
+        
+        uintptr_t filter_size = std::ceil(std::max(width_lo, width_hi) * 0.5);
+        uintptr_t filter_full = filter_size * 2 - 1;
+        uintptr_t max_per_filter = (2.0 / width_mul) + 1.5;
+        uintptr_t data_width = max_per_filter + (filter_size - 1) * 2;
+        
+        binary_sizes sizes(filter_full, data_width);
+        
+        if (auto_resize_fft && processor::max_fft_size() < sizes.fft())
+            set_max_fft_size(sizes.fft());
+        
+        uintptr_t fft_size = processor::max_fft_size() >= sizes.fft() ? sizes.fft() : 0;
+        
+        T *ptr = allocator.template allocate<T>(fft_size * 2 + filter_full + length + filter_size * 2);
+        Split io { ptr, ptr + fft_size / 2};
+        Split st { ptr + fft_size, ptr + 3 * fft_size / 2};
+        T *filter = ptr + fft_size * 2 + filter_size - 1;
+        T *temp = filter + filter_size;
+        
+        bool non_zero_end = true;
+        
+        if (kernel_length)
+        {
+            const T max_value = *std::max_element(kernel, kernel + kernel_length);
+            const T test_value = kernel[kernel_length - 1] / max_value;
+            const T epsilon = std::numeric_limits<T>::epsilon();
+            
+            if (test_value < epsilon)
+                non_zero_end = false;
        }
        
-        out[i] = filter_val / filter_sum;
-    }*/
-    
-    allocator.dealloc(filter);
-    allocator.dealloc(temp);
-}
+        // Copy data
+        
+        switch (mode)
+        {
+            case kSmoothZeroPad:
+                std::fill_n(temp, filter_size, 0.0);
+                std::copy_n(in, length, temp + filter_size);
+                std::fill_n(temp + filter_size + length, filter_size, 0.0);
+                break;
+                
+            case kSmoothWrap:
+                std::copy_n(in + length - filter_size, filter_size, temp);
+                std::copy_n(in, length, temp + filter_size);
+                std::copy_n(in, filter_size, temp + filter_size + length);
+                break;
+                
+            case kSmoothFold:
+                std::reverse_copy(in + 1, in + 1 + filter_size, temp);
+                std::copy(in, in + length, temp + filter_size);
+                std::reverse_copy(in, in + filter_size, temp + filter_size + length);
+                break;
+        }
+        
+        const double *data = temp + filter_size;
+        
+        for (uintptr_t i = 0, j = 0; i < length; i = j)
+        {
+            uintptr_t half_width = static_cast<uintptr_t>(half_width_calc(i));
+            const T filter_normalise = make_filter(filter, kernel, kernel_length, half_width, non_zero_end);
+            
+            for (j = i; (j < length) && half_width == half_width_calc(j); j++);
+            
+            uintptr_t optimal_fft = 1 << processor::calc_fft_size_log2(half_width * 4);
+            uintptr_t n = j - i;
+            uintptr_t k = 0;
+            uintptr_t m = std::min(optimal_fft / 2, n);
+                
+            m = use_fft(n, half_width, fft_size) ? m : 0;

-struct malloc_allocator
-{
-    template <typename T>
-    T* alloc(size_t size) { return reinterpret_cast<T*>(malloc(size * sizeof(T))); }
+            for (; k + (m - 1) < n; k += m)
+                apply_filter_fft(out + i + k, data + i + k, filter, io, st, half_width, n, filter_normalise);
+                
+            for (; k + (N - 1) < n; k += N)
+                apply_filter<N>(out + i + k, data + i + k, filter, half_width, filter_normalise);
+            
+            for (; k < n; k++)
+                apply_filter<1>(out + i + k, data + i + k, filter, half_width, filter_normalise);
+        }
+        /*
+         for (uintptr_t i = 0; i < length; i++)
+         {
+         // FIX - not safe
+         double width = width_lo + i * width_mul;
+         double width_normalise = 2.0 / width;
+         uintptr_t half_width = width * 0.5;
+         
+         T filter_sum = kernel[0];
+         T filter_val = data[i] * filter_sum;
+         
+         for (uintptr_t j = 1; j < half_width; j++)
+         {
+         T filter = filter_kernel(kernel, j * (kernel_length - 1) * width_normalise);
+         filter_val += filter * (data[i - j] + data[i + j]);
+         filter_sum += filter + filter;
+         }
+         
+         out[i] = filter_val / filter_sum;
+         }*/
+        
+        allocator.deallocate(ptr);
+    }
    
-    template <typename T>
-    void dealloc(T *ptr) { free(ptr); }
+private:
+    
+    bool use_fft(uintptr_t n, uintptr_t half_width, uintptr_t fft_size)
+    {
+        return fft_size && n > 2 && half_width > 2 && (32 * n > half_width);
+    }
+    
+    T filter_kernel(const T *kernel, double position)
+    {
+        uintptr_t index = static_cast<uintptr_t>(position);
+        
+        const T lo = kernel[index];
+        const T hi = kernel[index + 1];
+        
+        return static_cast<T>(lo + (position - index) * (hi - lo));
+    }
+    
+    T make_filter(T *filter, const T *kernel, uintptr_t kernel_length, uintptr_t half_width, bool non_zero_end)
+    {
+        const double width_normalise = 1.0 / std::max(uintptr_t(1), half_width - (non_zero_end ? 1 : 0));
+        T filter_sum = 0.0;
+        
+        uintptr_t loop_size = non_zero_end ? half_width - 1 : half_width;
+        
+        for (uintptr_t j = 0; j < loop_size; j++)
+        {
+            filter[j] = filter_kernel(kernel, j * (kernel_length - 1) * width_normalise);
+            filter_sum += filter[j];
+        }
+        
+        if (non_zero_end)
+        {
+            filter[half_width - 1] = kernel[kernel_length - 1];
+            filter_sum += filter[half_width - 1];
+        }
+        
+        return T(1) / (filter_sum * T(2) - filter[0]);
+    }
+    
+    template <int N>
+    void apply_filter(T *out, const T *data, const T *filter, uintptr_t half_width, T gain)
+    {
+        using VecType = SIMDType<double, N>;
+        
+        VecType filter_val = SIMDType<double, N>(data) * filter[0];
+        
+        for (uintptr_t j = 1; j < half_width; j++)
+            filter_val += filter[j] * (VecType(data -j) + VecType(data + j));
+        
+        filter_val *= gain;
+        filter_val.store(out);
+    }
+    
+    void apply_filter_fft(T *out, const T *data, T *filter, Split& io, Split& temp, uintptr_t half_width, uintptr_t n, T gain)
+    {
+        uintptr_t filter_width = half_width * 2 - 1;
+        uintptr_t data_width = n + (half_width - 1) * 2;
+        binary_sizes sizes(data_width, filter_width);
+        in_ptr data_in(data - (half_width - 1), data_width);
+        in_ptr filter_in(filter - (half_width - 1), filter_width);
+        
+        // Mirror the filter
+        
+        for (uintptr_t i = 1; i < half_width; i++)
+            filter[-i] = filter[i];
+        
+        // Process
+        
+        processor::template binary_op<ir_convolve_real>(io, temp, sizes, data_in, filter_in);
+        
+        // Copy output with scaling
+        
+        zipped_pointer p(io, half_width - 1);
+        
+        for (uintptr_t i = 0; i < n; i++)
+            out[i] = *p++ * gain;
+    }
 };
-
-template <typename T>
-void kernel_smooth(T *out, const T *in, const T *kernel, uintptr_t length, uintptr_t kernel_length, double width_lo, double width_hi, SmoothMode mode)
-{
-    kernel_smooth(out, in, kernel, length, kernel_length, width_lo, width_hi, mode, malloc_allocator());
-}
+    
+    
 #endif
@@ -3,10 +3,12 @@
 #define SPECTRALPROCESSOR_H

 #include <algorithm>
+
+#include "Allocator.hpp"
 #include "HISSTools_FFT/HISSTools_FFT.h"
 #include "SpectralFunctions.hpp"

-template <typename T, typename Allocator>
+template <typename T, typename Allocator = aligned_allocator>
 class spectral_processor
 {
    using Split = typename FFTTypes<T>::Split;