#!/usr/bin/env python
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from builtins import object
import sys
import resource
import numpy as np
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
# import pycuda.autoinit
import skcuda.fft as cufft
from .core import GPUAsyncProcess
from .utils import find_kernel, _module_reader
class NFFTMemory(object):
    """
    Container for the GPU and pinned-CPU buffers needed by one
    adjoint-NFFT computation.

    Parameters
    ----------
    sigma: float
        Grid oversampling factor; the internal grid has
        ``n = int(sigma * nf)`` points.
    stream: ``pycuda.driver.Stream``
        CUDA stream on which all transfers/kernels for this dataset run.
    m: int
        Filter (truncation) radius of the Gaussian gridding kernel,
        in grid points.
    use_double: bool, optional (default: False)
        Use 64-bit floats / 128-bit complex values.
    precomp_psi: bool, optional (default: True)
        Whether the fast-gridding precomputation buffers (``q1``, ``q2``,
        ``q3``) will be allocated and used.
    **kwargs
        Optional pre-set attributes (``t``, ``y``, ``n0``, ``nf``,
        ``t_g``, ``y_g``, ``ghat_g``, ``ghat_c``, ``q1``-``q3``,
        ``cu_plan``, ``f0``); anything else is stashed in
        ``other_settings``.
    """
    def __init__(self, sigma, stream, m, use_double=False,
                 precomp_psi=True, **kwargs):
        self.sigma = sigma
        self.stream = stream
        self.m = m
        self.use_double = use_double
        self.precomp_psi = precomp_psi

        # set datatypes
        self.real_type = np.float32 if not self.use_double \
            else np.float64
        self.complex_type = np.complex64 if not self.use_double \
            else np.complex128

        self.other_settings = {}
        self.other_settings.update(kwargs)

        # CPU-side data and sizes (may be filled in later by fromdata())
        self.t = kwargs.get('t', None)
        self.y = kwargs.get('y', None)
        self.f0 = kwargs.get('f0', 0.)
        self.n0 = kwargs.get('n0', None)
        self.nf = kwargs.get('nf', None)

        # GPU buffers, pinned host buffer, precomputation buffers, FFT plan
        self.t_g = kwargs.get('t_g', None)
        self.y_g = kwargs.get('y_g', None)
        self.ghat_g = kwargs.get('ghat_g', None)
        self.ghat_c = kwargs.get('ghat_c', None)
        self.q1 = kwargs.get('q1', None)
        self.q2 = kwargs.get('q2', None)
        self.q3 = kwargs.get('q3', None)
        self.cu_plan = kwargs.get('cu_plan', None)

        # Shape parameter of the Gaussian gridding kernel
        # (standard NFFT choice: b = 2*sigma*m / ((2*sigma - 1) * pi))
        D = (2 * self.sigma - 1) * np.pi
        self.b = float(2 * self.sigma * self.m) / D

    def allocate_data(self, **kwargs):
        """Allocate GPU buffers for the observation times and values."""
        self.n0 = kwargs.get('n0', self.n0)
        self.nf = kwargs.get('nf', self.nf)

        assert(self.n0 is not None)
        assert(self.nf is not None)

        self.t_g = gpuarray.zeros(self.n0, dtype=self.real_type)
        self.y_g = gpuarray.zeros(self.n0, dtype=self.real_type)

        return self

    def allocate_precomp_psi(self, **kwargs):
        """Allocate GPU buffers for the fast-gridding precomputation."""
        self.n0 = kwargs.get('n0', self.n0)

        assert(self.n0 is not None)

        self.q1 = gpuarray.zeros(self.n0, dtype=self.real_type)
        self.q2 = gpuarray.zeros(self.n0, dtype=self.real_type)
        # q3 covers the (2m + 1)-point filter support
        self.q3 = gpuarray.zeros(2 * self.m + 1, dtype=self.real_type)

        return self

    def allocate_grid(self, **kwargs):
        """Allocate the oversampled GPU grid and the cuFFT plan for it."""
        self.nf = kwargs.get('nf', self.nf)

        assert(self.nf is not None)

        # oversampled grid size
        self.n = int(self.sigma * self.nf)
        self.ghat_g = gpuarray.zeros(self.n,
                                     dtype=self.complex_type)
        self.cu_plan = cufft.Plan(self.n, self.complex_type, self.complex_type,
                                  stream=self.stream)

        return self

    def allocate_pinned_cpu(self, **kwargs):
        """Allocate a page-aligned, pinned host buffer for the result."""
        self.nf = kwargs.get('nf', self.nf)

        assert(self.nf is not None)

        # page alignment + registration enables async device->host copies
        self.ghat_c = cuda.aligned_zeros(shape=(self.nf,),
                                         dtype=self.complex_type,
                                         alignment=resource.getpagesize())
        self.ghat_c = cuda.register_host_memory(self.ghat_c)

        return self

    def is_ready(self):
        """Sanity-check that all allocated buffers match the stored sizes."""
        assert(self.n0 == len(self.t_g))
        assert(self.n0 == len(self.y_g))
        assert(self.n == len(self.ghat_g))

        if self.ghat_c is not None:
            assert(self.nf == len(self.ghat_c))

        if self.precomp_psi:
            assert(self.n0 == len(self.q1))
            assert(self.n0 == len(self.q2))
            assert(2 * self.m + 1 == len(self.q3))

    def allocate(self, **kwargs):
        """Allocate all GPU/CPU buffers needed for an adjoint NFFT."""
        self.n0 = kwargs.get('n0', self.n0)
        self.nf = kwargs.get('nf', self.nf)

        assert(self.n0 is not None)
        assert(self.nf is not None)

        self.n = int(self.sigma * self.nf)

        self.allocate_data(**kwargs)
        self.allocate_grid(**kwargs)
        self.allocate_pinned_cpu(**kwargs)
        if self.precomp_psi:
            self.allocate_precomp_psi(**kwargs)

        return self

    def transfer_data_to_gpu(self, **kwargs):
        """Asynchronously copy ``t`` and ``y`` to their GPU buffers."""
        t = kwargs.get('t', self.t)
        y = kwargs.get('y', self.y)

        assert(t is not None)
        assert(y is not None)

        self.t_g.set_async(t, stream=self.stream)
        self.y_g.set_async(y, stream=self.stream)

    def transfer_nfft_to_cpu(self, **kwargs):
        """Asynchronously copy the NFFT result to the pinned host buffer."""
        cuda.memcpy_dtoh_async(self.ghat_c, self.ghat_g.ptr,
                               stream=self.stream)

    def fromdata(self, t, y, allocate=True, **kwargs):
        """
        Set the observation data and (optionally) allocate buffers.

        Parameters
        ----------
        t: array_like
            Observation times.
        y: array_like
            Observations.
        allocate: bool, optional (default: True)
            Allocate all buffers now, provided ``nf`` is known.
        **kwargs
            May supply ``n0`` (defaults to ``len(t)``) and ``nf``.

        Returns
        -------
        self
        """
        self.tmin = min(t)
        self.tmax = max(t)

        self.t = np.asarray(t).astype(self.real_type)
        self.y = np.asarray(y).astype(self.real_type)

        self.n0 = kwargs.get('n0', len(t))
        self.nf = kwargs.get('nf', self.nf)

        # only allocate if we know the transform size
        if self.nf is not None and allocate:
            self.allocate(**kwargs)

        return self
def nfft_adjoint_async(memory, functions,
                       minimum_frequency=0., block_size=256,
                       just_return_gridded_data=False, use_grid=None,
                       fast_grid=True, transfer_to_device=True,
                       transfer_to_host=True, precomp_psi=True,
                       samples_per_peak=1, **kwargs):
    """
    Asynchronous NFFT adjoint operation.

    Use the ``NFFTAsyncProcess`` class and related subroutines when possible.

    Parameters
    ----------
    memory: ``NFFTMemory``
        Allocated memory, must have data already set (see, e.g.,
        ``NFFTAsyncProcess.allocate()``)
    functions: tuple, length 5
        Tuple of compiled functions from `SourceModule`. Must be prepared with
        their appropriate dtype.
    minimum_frequency: float, optional (default: 0)
        First frequency of transform
    block_size: int, optional
        Number of CUDA threads per block
    just_return_gridded_data: bool, optional
        If True, returns grid via `grid_g.get()` after gridding
    use_grid: ``GPUArray``, optional
        If specified, will skip gridding procedure and use the `GPUArray`
        provided
    fast_grid: bool, optional, default: True
        Whether or not to use the "fast" gridding procedure
    transfer_to_device: bool, optional, (default: True)
        If the data is already on the gpu, set as False
    transfer_to_host: bool, optional, (default: True)
        If False, will not transfer the resulting nfft to CPU memory
    precomp_psi: bool, optional, (default: True)
        Only relevant if ``fast`` is True. Will precompute values for the
        fast gridding procedure.
    samples_per_peak: float, optional (default: 1)
        Frequency spacing is reduced by this factor, but number of frequencies
        is kept the same

    Returns
    -------
    ghat_cpu: ``np.array``
        The resulting NFFT
    """
    precompute_psi, fast_gaussian_grid, slow_gaussian_grid, \
        nfft_shift, normalize = functions

    stream = memory.stream

    block = (block_size, 1, 1)
    batch_size = 1

    def grid_size(nthreads):
        # number of blocks needed to cover `nthreads` threads
        return int(np.ceil(float(nthreads) / block_size))

    minimum_frequency = memory.real_type(minimum_frequency)

    # transfer data -> gpu
    if transfer_to_device:
        memory.transfer_data_to_gpu()

    # smooth data onto uniform grid
    if fast_grid:
        if memory.precomp_psi:
            # precompute the Gaussian filter values (q1, q2, q3)
            grid = (grid_size(memory.n0 + 2 * memory.m + 1), 1)
            args = (grid, block, stream)
            args += (memory.t_g.ptr,)
            args += (memory.q1.ptr, memory.q2.ptr, memory.q3.ptr)
            args += (np.int32(memory.n0), np.int32(memory.n),
                     np.int32(memory.m), memory.real_type(memory.b))
            args += (memory.real_type(memory.tmin),
                     memory.real_type(memory.tmax),
                     memory.real_type(samples_per_peak))
            precompute_psi.prepared_async_call(*args)

        grid = (grid_size(memory.n0), 1)
        args = (grid, block, stream)
        args += (memory.t_g.ptr, memory.y_g.ptr, memory.ghat_g.ptr)
        args += (memory.q1.ptr, memory.q2.ptr, memory.q3.ptr)
        args += (np.int32(memory.n0), np.int32(memory.n),
                 np.int32(batch_size), np.int32(memory.m))
        args += (memory.real_type(memory.tmin),
                 memory.real_type(memory.tmax),
                 memory.real_type(samples_per_peak))
        fast_gaussian_grid.prepared_async_call(*args)
    else:
        # direct (slow) gridding: one thread per grid point
        grid = (grid_size(memory.n), 1)
        args = (grid, block, stream)
        args += (memory.t_g.ptr, memory.y_g.ptr, memory.ghat_g.ptr)
        args += (np.int32(memory.n0), np.int32(memory.n),
                 np.int32(batch_size), np.int32(memory.m),
                 memory.real_type(memory.b))
        args += (memory.real_type(memory.tmin),
                 memory.real_type(memory.tmax),
                 memory.real_type(samples_per_peak))
        slow_gaussian_grid.prepared_async_call(*args)

    # Stop if user wants the grid
    if just_return_gridded_data:
        stream.synchronize()
        return np.real(memory.ghat_g.get())

    # Set the grid manually if the user wants to
    # (only for debugging)
    if use_grid is not None:
        memory.ghat_g.set(use_grid)

    # for a non-zero minimum frequency, do a shift
    if abs(minimum_frequency) > 1E-9:
        grid = (grid_size(memory.n), 1)
        args = (grid, block, stream)
        args += (memory.ghat_g.ptr, memory.ghat_g.ptr)
        args += (np.int32(memory.n), np.int32(batch_size))
        args += (memory.real_type(memory.tmin),
                 memory.real_type(memory.tmax),
                 memory.real_type(samples_per_peak),
                 memory.real_type(minimum_frequency))
        nfft_shift.prepared_async_call(*args)

    # Run IFFT on grid (in place)
    cufft.ifft(memory.ghat_g, memory.ghat_g, memory.cu_plan)

    # Normalize result (deconvolve smoothing kernel)
    grid = (grid_size(memory.nf), 1)
    args = (grid, block, stream)
    args += (memory.ghat_g.ptr, memory.ghat_g.ptr)
    args += (np.int32(memory.n),
             np.int32(memory.nf),
             np.int32(batch_size),
             memory.real_type(memory.b))
    args += (memory.real_type(memory.tmin),
             memory.real_type(memory.tmax),
             memory.real_type(samples_per_peak),
             memory.real_type(minimum_frequency))
    normalize.prepared_async_call(*args)

    # Transfer result!
    if transfer_to_host:
        memory.transfer_nfft_to_cpu()

    return memory.ghat_c
class NFFTAsyncProcess(GPUAsyncProcess):
    """
    `GPUAsyncProcess` for the adjoint NFFT.

    Parameters
    ----------
    sigma: float, optional (default: 4)
        Size of NFFT grid will be NFFT_SIZE * sigma
    m: int, optional (default: 8)
        Maximum radius for grid contributions (by default,
        this value will automatically be set based on a specified
        error tolerance)
    autoset_m: bool, optional (default: False)
        Automatically set the ``m`` parameter based on the
        error tolerance given by the ``tol`` parameter
    tol: float, optional (default: 1E-8)
        Error tolerance for the NFFT (used to auto set ``m``)
    block_size: int, optional (default: 256)
        CUDA block size.
    use_double: bool, optional (default: False)
        Use double precision. On non-Tesla cards this will
        make things ~24 times slower.
    use_fast_math: bool, optional (default: True)
        Compile kernel with the ``--use_fast_math`` option
        supplied to ``nvcc``.

    Example
    -------
    >>> import numpy as np
    >>> t = np.random.rand(100)
    >>> y = np.cos(10 * t - 0.4) + 0.1 * np.random.randn(len(t))
    >>> proc = NFFTAsyncProcess()
    >>> data = [(t, y, 2 * len(t))]
    >>> nfft_adjoint = proc.run(data)
    """

    def __init__(self, *args, **kwargs):
        super(NFFTAsyncProcess, self).__init__(*args, **kwargs)

        self.sigma = kwargs.get('sigma', 4)
        self.m = kwargs.get('m', 8)
        self.autoset_m = kwargs.get('autoset_m', False)
        self.block_size = kwargs.get('block_size', 256)
        self.use_double = kwargs.get('use_double', False)
        self.m_tol = kwargs.get('tol', 1E-8)

        self.module_options = []
        if kwargs.get('use_fast_math', True):
            self.module_options.append('--use_fast_math')

        self.real_type = np.float64 if self.use_double \
            else np.float32
        self.complex_type = np.complex128 if self.use_double \
            else np.complex64

        # C preprocessor definitions injected into the kernel source
        self._cpp_defs = dict(BLOCK_SIZE=self.block_size)
        if self.use_double:
            self._cpp_defs['DOUBLE_PRECISION'] = None

        # order here must match the `functions` tuple expected by
        # nfft_adjoint_async
        self.function_names = ['precompute_psi',
                               'fast_gaussian_grid',
                               'slow_gaussian_grid', 'nfft_shift',
                               'normalize']

        self.allocated_memory = []

    def m_from_C(self, C, sigma):
        """
        Returns an estimate for what ``m`` value to use from ``C``,
        where ``C`` is something like ``err_tolerance/N_freq``.

        Pulled from <https://github.com/jakevdp/nfft>_
        """
        D = (np.pi * (1. - 1. / (2. * sigma - 1.)))
        return int(np.ceil(-np.log(0.25 * C) / D))

    def estimate_m(self, N):
        """
        Estimate ``m`` based on the error tolerance ``self.m_tol``.

        Parameters
        ----------
        N: int
            size of NFFT

        Returns
        -------
        m: int
            Maximum grid radius

        Notes
        -----
        Pulled from <https://github.com/jakevdp/nfft>_.
        """
        # TODO: this should be computed in terms of the L1-norm of the true
        # Fourier coefficients... see p. 11 of
        # https://www-user.tu-chemnitz.de/~potts/nfft/guide/nfft3.pdf
        # Need to think about how to estimate the value of m more accurately
        return self.m_from_C(self.m_tol / N, self.sigma)

    def get_m(self, N=None):
        """
        Returns the ``m`` value for ``N`` frequencies.

        Parameters
        ----------
        N: int
            Number of frequencies, only needed if ``autoset_m`` is ``True``.

        Returns
        -------
        m: int
            The filter radius (in grid points)
        """
        if self.autoset_m:
            return self.estimate_m(N)
        else:
            return self.m

    def _compile_and_prepare_functions(self, **kwargs):
        # read kernel source, apply C preprocessor definitions, and compile
        module_txt = _module_reader(find_kernel('cunfft'), self._cpp_defs)
        self.module = SourceModule(module_txt, options=self.module_options)

        # argument signatures for prepared (fast) kernel launches;
        # np.intp entries are device pointers
        self.dtypes = dict(
            precompute_psi=[np.intp, np.intp, np.intp, np.intp, np.int32,
                            np.int32, np.int32, self.real_type,
                            self.real_type, self.real_type, self.real_type],

            fast_gaussian_grid=[np.intp, np.intp, np.intp, np.intp,
                                np.intp, np.intp, np.int32, np.int32,
                                np.int32, np.int32, self.real_type,
                                self.real_type, self.real_type],

            slow_gaussian_grid=[np.intp, np.intp, np.intp, np.int32,
                                np.int32, np.int32, np.int32, self.real_type,
                                self.real_type, self.real_type,
                                self.real_type],

            normalize=[np.intp, np.intp, np.int32, np.int32, np.int32,
                       self.real_type, self.real_type, self.real_type,
                       self.real_type, self.real_type],

            nfft_shift=[np.intp, np.intp, np.int32, np.int32, self.real_type,
                        self.real_type, self.real_type, self.real_type]
        )

        for function, dtype in self.dtypes.items():
            func = self.module.get_function(function)
            self.prepared_functions[function] = func.prepare(dtype)

        self.function_tuple = tuple([self.prepared_functions[f]
                                     for f in self.function_names])

    def allocate(self, data, **kwargs):
        """
        Allocate GPU memory for NFFT-related computations

        Parameters
        ----------
        data: list of (t, y, nf) tuples
            List of data, ``[(t_1, y_1, nf_1), ...]``
            * ``t``: Observation times.
            * ``y``: Observations.
            * ``nf``: int, FFT size
        **kwargs

        Returns
        -------
        allocated_memory: list of ``NFFTMemory`` objects
            List of allocated memory for each dataset
        """
        allocated_memory = []

        # make sure we have a CUDA stream per dataset
        if len(data) > len(self.streams):
            self._create_streams(len(data) - len(self.streams))

        for i, (t, y, nf) in enumerate(data):
            m = self.get_m(nf)

            mem = NFFTMemory(self.sigma, self.streams[i], m,
                             use_double=self.use_double, **kwargs)

            allocated_memory.append(mem.fromdata(t, y, nf=nf,
                                                 allocate=True,
                                                 **kwargs))

        return allocated_memory

    def run(self, data, memory=None, **kwargs):
        """
        Run the adjoint NFFT on a batch of data

        Parameters
        ----------
        data: list of tuples
            list of [(t, y, nf), ...] containing
            * ``t``: observation times
            * ``y``: observations
            * ``nf``: int, size of NFFT
        memory: list of ``NFFTMemory`` objects, optional
            Previously-allocated memory (see ``allocate``); if None,
            memory is allocated from ``data``.
        **kwargs

        Returns
        -------
        powers: list of np.ndarrays
            List of adjoint NFFTs
        """
        # (re)compile kernels on first use
        if not hasattr(self, 'prepared_functions') or \
                not all([func in self.prepared_functions
                         for func in self.function_names]):
            self._compile_and_prepare_functions(**kwargs)

        if memory is None:
            memory = self.allocate(data, **kwargs)

        nfft_kwargs = dict(block_size=self.block_size)
        nfft_kwargs.update(kwargs)

        results = [nfft_adjoint_async(mem, self.function_tuple,
                                      **nfft_kwargs)
                   for mem in memory]

        return results