408 lines
16 KiB
Python
408 lines
16 KiB
Python
# Authors: Eric Larson <larson.eric.d@gmail.com>
|
|
#
|
|
# License: BSD (3-clause)
|
|
|
|
import numpy as np
|
|
from scipy.fftpack import fft, ifft, rfft, irfft
|
|
|
|
from .utils import sizeof_fmt, logger, get_config, warn, _explain_exception
|
|
|
|
|
|
# Support CUDA for FFTs; requires scikits.cuda and pycuda
|
|
_cuda_capable = False
|
|
_multiply_inplace_c128 = _halve_c128 = _real_c128 = None
|
|
|
|
|
|
def _get_cudafft():
|
|
"""Deal with scikit-cuda namespace change."""
|
|
try:
|
|
from skcuda import fft
|
|
except ImportError:
|
|
try:
|
|
from scikits.cuda import fft
|
|
except ImportError:
|
|
fft = None
|
|
return fft
|
|
|
|
|
|
def get_cuda_memory():
|
|
"""Get the amount of free memory for CUDA operations.
|
|
|
|
Returns
|
|
-------
|
|
memory : str
|
|
The amount of available memory as a human-readable string.
|
|
"""
|
|
if not _cuda_capable:
|
|
warn('CUDA not enabled, returning zero for memory')
|
|
mem = 0
|
|
else:
|
|
from pycuda.driver import mem_get_info
|
|
mem = mem_get_info()[0]
|
|
return sizeof_fmt(mem)
|
|
|
|
|
|
def init_cuda(ignore_config=False):
|
|
"""Initialize CUDA functionality.
|
|
|
|
This function attempts to load the necessary interfaces
|
|
(hardware connectivity) to run CUDA-based filtering. This
|
|
function should only need to be run once per session.
|
|
|
|
If the config var (set via mne.set_config or in ENV)
|
|
MNE_USE_CUDA == 'true', this function will be executed when
|
|
the first CUDA setup is performed. If this variable is not
|
|
set, this function can be manually executed.
|
|
"""
|
|
global _cuda_capable, _multiply_inplace_c128, _halve_c128, _real_c128
|
|
if _cuda_capable:
|
|
return
|
|
if not ignore_config and (get_config('MNE_USE_CUDA', 'false').lower() !=
|
|
'true'):
|
|
logger.info('CUDA not enabled in config, skipping initialization')
|
|
return
|
|
# Triage possible errors for informative messaging
|
|
_cuda_capable = False
|
|
try:
|
|
from pycuda import gpuarray, driver # noqa: F401
|
|
from pycuda.elementwise import ElementwiseKernel
|
|
except ImportError:
|
|
warn('module pycuda not found, CUDA not enabled')
|
|
return
|
|
try:
|
|
# Initialize CUDA; happens with importing autoinit
|
|
import pycuda.autoinit # noqa: F401
|
|
except ImportError:
|
|
warn('pycuda.autoinit could not be imported, likely a hardware error, '
|
|
'CUDA not enabled%s' % _explain_exception())
|
|
return
|
|
# Make sure scikit-cuda is installed
|
|
cudafft = _get_cudafft()
|
|
if cudafft is None:
|
|
warn('module scikit-cuda not found, CUDA not enabled')
|
|
return
|
|
|
|
# let's construct our own CUDA multiply in-place function
|
|
_multiply_inplace_c128 = ElementwiseKernel(
|
|
'pycuda::complex<double> *a, pycuda::complex<double> *b',
|
|
'b[i] *= a[i]', 'multiply_inplace')
|
|
_halve_c128 = ElementwiseKernel(
|
|
'pycuda::complex<double> *a', 'a[i] /= 2.0', 'halve_value')
|
|
_real_c128 = ElementwiseKernel(
|
|
'pycuda::complex<double> *a', 'a[i] = real(a[i])', 'real_value')
|
|
|
|
# Make sure we can use 64-bit FFTs
|
|
try:
|
|
cudafft.Plan(16, np.float64, np.complex128) # will get auto-GC'ed
|
|
except Exception:
|
|
warn('Device does not appear to support 64-bit FFTs, CUDA not '
|
|
'enabled%s' % _explain_exception())
|
|
return
|
|
_cuda_capable = True
|
|
# Figure out limit for CUDA FFT calculations
|
|
logger.info('Enabling CUDA with %s available memory' % get_cuda_memory())
|
|
|
|
|
|
###############################################################################
|
|
# Repeated FFT multiplication
|
|
|
|
def setup_cuda_fft_multiply_repeated(n_jobs, h_fft):
|
|
"""Set up repeated CUDA FFT multiplication with a given filter.
|
|
|
|
Parameters
|
|
----------
|
|
n_jobs : int | str
|
|
If n_jobs == 'cuda', the function will attempt to set up for CUDA
|
|
FFT multiplication.
|
|
h_fft : array
|
|
The filtering function that will be used repeatedly.
|
|
If n_jobs='cuda', this function will be shortened (since CUDA
|
|
assumes FFTs of real signals are half the length of the signal)
|
|
and turned into a gpuarray.
|
|
|
|
Returns
|
|
-------
|
|
n_jobs : int
|
|
Sets n_jobs = 1 if n_jobs == 'cuda' was passed in, otherwise
|
|
original n_jobs is passed.
|
|
cuda_dict : dict
|
|
Dictionary with the following CUDA-related variables:
|
|
use_cuda : bool
|
|
Whether CUDA should be used.
|
|
fft_plan : instance of FFTPlan
|
|
FFT plan to use in calculating the FFT.
|
|
ifft_plan : instance of FFTPlan
|
|
FFT plan to use in calculating the IFFT.
|
|
x_fft : instance of gpuarray
|
|
Empty allocated GPU space for storing the result of the
|
|
frequency-domain multiplication.
|
|
x : instance of gpuarray
|
|
Empty allocated GPU space for the data to filter.
|
|
h_fft : array | instance of gpuarray
|
|
This will either be a gpuarray (if CUDA enabled) or np.ndarray.
|
|
If CUDA is enabled, h_fft will be modified appropriately for use
|
|
with filter.fft_multiply().
|
|
|
|
Notes
|
|
-----
|
|
This function is designed to be used with fft_multiply_repeated().
|
|
"""
|
|
cuda_dict = dict(use_cuda=False, fft_plan=None, ifft_plan=None,
|
|
x_fft=None, x=None)
|
|
n_fft = len(h_fft)
|
|
cuda_fft_len = int((n_fft - (n_fft % 2)) / 2 + 1)
|
|
if n_jobs == 'cuda':
|
|
n_jobs = 1
|
|
init_cuda()
|
|
if _cuda_capable:
|
|
from pycuda import gpuarray
|
|
cudafft = _get_cudafft()
|
|
# set up all arrays necessary for CUDA
|
|
# try setting up for float64
|
|
try:
|
|
# do the IFFT normalization now so we don't have to later
|
|
h_fft = gpuarray.to_gpu(h_fft[:cuda_fft_len]
|
|
.astype('complex_') / len(h_fft))
|
|
cuda_dict.update(
|
|
use_cuda=True,
|
|
fft_plan=cudafft.Plan(n_fft, np.float64, np.complex128),
|
|
ifft_plan=cudafft.Plan(n_fft, np.complex128, np.float64),
|
|
x_fft=gpuarray.empty(cuda_fft_len, np.complex128),
|
|
x=gpuarray.empty(int(n_fft), np.float64))
|
|
logger.info('Using CUDA for FFT FIR filtering')
|
|
except Exception as exp:
|
|
logger.info('CUDA not used, could not instantiate memory '
|
|
'(arrays may be too large: "%s"), falling back to '
|
|
'n_jobs=1' % str(exp))
|
|
else:
|
|
logger.info('CUDA not used, CUDA could not be initialized, '
|
|
'falling back to n_jobs=1')
|
|
return n_jobs, cuda_dict, h_fft
|
|
|
|
|
|
def fft_multiply_repeated(h_fft, x, cuda_dict=dict(use_cuda=False)):
|
|
"""Do FFT multiplication by a filter function (possibly using CUDA).
|
|
|
|
Parameters
|
|
----------
|
|
h_fft : 1-d array or gpuarray
|
|
The filtering array to apply.
|
|
x : 1-d array
|
|
The array to filter.
|
|
cuda_dict : dict
|
|
Dictionary constructed using setup_cuda_multiply_repeated().
|
|
|
|
Returns
|
|
-------
|
|
x : 1-d array
|
|
Filtered version of x.
|
|
"""
|
|
if not cuda_dict['use_cuda']:
|
|
# do the fourier-domain operations
|
|
x = np.real(ifft(h_fft * fft(x), overwrite_x=True)).ravel()
|
|
else:
|
|
cudafft = _get_cudafft()
|
|
# do the fourier-domain operations, results in second param
|
|
cuda_dict['x'].set(x.astype(np.float64))
|
|
cudafft.fft(cuda_dict['x'], cuda_dict['x_fft'], cuda_dict['fft_plan'])
|
|
_multiply_inplace_c128(h_fft, cuda_dict['x_fft'])
|
|
# If we wanted to do it locally instead of using our own kernel:
|
|
# cuda_seg_fft.set(cuda_seg_fft.get() * h_fft)
|
|
cudafft.ifft(cuda_dict['x_fft'], cuda_dict['x'],
|
|
cuda_dict['ifft_plan'], False)
|
|
x = np.array(cuda_dict['x'].get(), dtype=x.dtype, subok=True,
|
|
copy=False)
|
|
return x
|
|
|
|
|
|
###############################################################################
|
|
# FFT Resampling
|
|
|
|
def setup_cuda_fft_resample(n_jobs, W, new_len):
|
|
"""Set up CUDA FFT resampling.
|
|
|
|
Parameters
|
|
----------
|
|
n_jobs : int | str
|
|
If n_jobs == 'cuda', the function will attempt to set up for CUDA
|
|
FFT resampling.
|
|
W : array
|
|
The filtering function to be used during resampling.
|
|
If n_jobs='cuda', this function will be shortened (since CUDA
|
|
assumes FFTs of real signals are half the length of the signal)
|
|
and turned into a gpuarray.
|
|
new_len : int
|
|
The size of the array following resampling.
|
|
|
|
Returns
|
|
-------
|
|
n_jobs : int
|
|
Sets n_jobs = 1 if n_jobs == 'cuda' was passed in, otherwise
|
|
original n_jobs is passed.
|
|
cuda_dict : dict
|
|
Dictionary with the following CUDA-related variables:
|
|
use_cuda : bool
|
|
Whether CUDA should be used.
|
|
fft_plan : instance of FFTPlan
|
|
FFT plan to use in calculating the FFT.
|
|
ifft_plan : instance of FFTPlan
|
|
FFT plan to use in calculating the IFFT.
|
|
x_fft : instance of gpuarray
|
|
Empty allocated GPU space for storing the result of the
|
|
frequency-domain multiplication.
|
|
x : instance of gpuarray
|
|
Empty allocated GPU space for the data to resample.
|
|
W : array | instance of gpuarray
|
|
This will either be a gpuarray (if CUDA enabled) or np.ndarray.
|
|
If CUDA is enabled, W will be modified appropriately for use
|
|
with filter.fft_multiply().
|
|
|
|
Notes
|
|
-----
|
|
This function is designed to be used with fft_resample().
|
|
"""
|
|
cuda_dict = dict(use_cuda=False, fft_plan=None, ifft_plan=None,
|
|
x_fft=None, x=None, y_fft=None, y=None)
|
|
n_fft_x, n_fft_y = len(W), new_len
|
|
cuda_fft_len_x = int((n_fft_x - (n_fft_x % 2)) // 2 + 1)
|
|
cuda_fft_len_y = int((n_fft_y - (n_fft_y % 2)) // 2 + 1)
|
|
if n_jobs == 'cuda':
|
|
n_jobs = 1
|
|
init_cuda()
|
|
if _cuda_capable:
|
|
# try setting up for float64
|
|
from pycuda import gpuarray
|
|
cudafft = _get_cudafft()
|
|
try:
|
|
# do the IFFT normalization now so we don't have to later
|
|
W = gpuarray.to_gpu(W[:cuda_fft_len_x]
|
|
.astype('complex_') / n_fft_y)
|
|
cuda_dict.update(
|
|
use_cuda=True,
|
|
fft_plan=cudafft.Plan(n_fft_x, np.float64, np.complex128),
|
|
ifft_plan=cudafft.Plan(n_fft_y, np.complex128, np.float64),
|
|
x_fft=gpuarray.zeros(max(cuda_fft_len_x,
|
|
cuda_fft_len_y), np.complex128),
|
|
x=gpuarray.empty(max(int(n_fft_x),
|
|
int(n_fft_y)), np.float64))
|
|
logger.info('Using CUDA for FFT resampling')
|
|
except Exception:
|
|
logger.info('CUDA not used, could not instantiate memory '
|
|
'(arrays may be too large), falling back to '
|
|
'n_jobs=1')
|
|
else:
|
|
logger.info('CUDA not used, CUDA could not be initialized, '
|
|
'falling back to n_jobs=1')
|
|
return n_jobs, cuda_dict, W
|
|
|
|
|
|
def fft_resample(x, W, new_len, npads, to_removes, cuda_dict=None,
|
|
pad='reflect_limited'):
|
|
"""Do FFT resampling with a filter function (possibly using CUDA).
|
|
|
|
Parameters
|
|
----------
|
|
x : 1-d array
|
|
The array to resample. Will be converted to float64 if necessary.
|
|
W : 1-d array or gpuarray
|
|
The filtering function to apply.
|
|
new_len : int
|
|
The size of the output array (before removing padding).
|
|
npads : tuple of int
|
|
Amount of padding to apply to the start and end of the
|
|
signal before resampling.
|
|
to_removes : tuple of int
|
|
Number of samples to remove after resampling.
|
|
cuda_dict : dict
|
|
Dictionary constructed using setup_cuda_multiply_repeated().
|
|
pad : str
|
|
The type of padding to use. Supports all :func:`np.pad` ``mode``
|
|
options. Can also be "reflect_limited" (default), which pads with a
|
|
reflected version of each vector mirrored on the first and last values
|
|
of the vector, followed by zeros.
|
|
|
|
.. versionadded:: 0.15
|
|
|
|
Returns
|
|
-------
|
|
x : 1-d array
|
|
Filtered version of x.
|
|
"""
|
|
cuda_dict = dict(use_cuda=False) if cuda_dict is None else cuda_dict
|
|
# add some padding at beginning and end to make this work a little cleaner
|
|
if x.dtype != np.float64:
|
|
x = x.astype(np.float64)
|
|
x = _smart_pad(x, npads, pad)
|
|
old_len = len(x)
|
|
shorter = new_len < old_len
|
|
if not cuda_dict['use_cuda']:
|
|
N = int(min(new_len, old_len))
|
|
# The below is equivalent to this, but faster
|
|
# sl_1 = slice((N + 1) // 2)
|
|
# y_fft = np.zeros(new_len, np.complex128)
|
|
# x_fft = fft(x).ravel() * W
|
|
# y_fft[sl_1] = x_fft[sl_1]
|
|
# sl_2 = slice(-(N - 1) // 2, None)
|
|
# y_fft[sl_2] = x_fft[sl_2]
|
|
# y = np.real(ifft(y_fft, overwrite_x=True)).ravel()
|
|
x_fft = rfft(x).ravel()
|
|
x_fft *= W[np.arange(1, len(x) + 1) // 2].real
|
|
y_fft = np.zeros(new_len, np.float64)
|
|
sl_1 = slice(N)
|
|
y_fft[sl_1] = x_fft[sl_1]
|
|
if min(new_len, old_len) % 2 == 0:
|
|
if new_len > old_len:
|
|
y_fft[N - 1] /= 2.
|
|
y = irfft(y_fft, overwrite_x=True).ravel()
|
|
else:
|
|
cudafft = _get_cudafft()
|
|
cuda_dict['x'].set(np.concatenate((x, np.zeros(max(new_len - old_len,
|
|
0), x.dtype))))
|
|
# do the fourier-domain operations, results put in second param
|
|
cudafft.fft(cuda_dict['x'], cuda_dict['x_fft'], cuda_dict['fft_plan'])
|
|
_multiply_inplace_c128(W, cuda_dict['x_fft'])
|
|
# This is not straightforward, but because x_fft and y_fft share
|
|
# the same data (and only one half of the full DFT is stored), we
|
|
# don't have to transfer the slice like we do in scipy. All we
|
|
# need to worry about is the Nyquist component, either halving it
|
|
# or taking just the real component...
|
|
use_len = new_len if shorter else old_len
|
|
func = _real_c128 if shorter else _halve_c128
|
|
if use_len % 2 == 0:
|
|
nyq = int((use_len - (use_len % 2)) // 2)
|
|
func(cuda_dict['x_fft'], slice=slice(nyq, nyq + 1))
|
|
cudafft.ifft(cuda_dict['x_fft'], cuda_dict['x'],
|
|
cuda_dict['ifft_plan'], scale=False)
|
|
y = cuda_dict['x'].get()[:new_len if shorter else None]
|
|
|
|
# now let's trim it back to the correct size (if there was padding)
|
|
if (to_removes > 0).any():
|
|
keep = np.ones((new_len), dtype='bool')
|
|
keep[:to_removes[0]] = False
|
|
keep[-to_removes[1]:] = False
|
|
y = np.compress(keep, y)
|
|
|
|
return y
|
|
|
|
|
|
###############################################################################
|
|
# Misc
|
|
|
|
# this has to go in mne.cuda instead of mne.filter to avoid import errors
|
|
def _smart_pad(x, n_pad, pad='reflect_limited'):
|
|
"""Pad vector x."""
|
|
n_pad = np.asarray(n_pad)
|
|
assert n_pad.shape == (2,)
|
|
if (n_pad == 0).all():
|
|
return x
|
|
elif (n_pad < 0).any():
|
|
raise RuntimeError('n_pad must be non-negative')
|
|
if pad == 'reflect_limited':
|
|
# need to pad with zeros if len(x) <= npad
|
|
l_z_pad = np.zeros(max(n_pad[0] - len(x) + 1, 0), dtype=x.dtype)
|
|
r_z_pad = np.zeros(max(n_pad[0] - len(x) + 1, 0), dtype=x.dtype)
|
|
return np.concatenate([l_z_pad, 2 * x[0] - x[n_pad[0]:0:-1], x,
|
|
2 * x[-1] - x[-2:-n_pad[1] - 2:-1], r_z_pad])
|
|
else:
|
|
return np.pad(x, (tuple(n_pad),), pad)
|