"""
Implementation of the box-least squares periodogram [K2002]_
and variants.
.. [K2002] `Kovacs et al. 2002 <http://adsabs.harvard.edu/abs/2002A%26A...391..369K>`_
"""
from __future__ import print_function, division
from builtins import zip
from builtins import range
import sys
import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
from .core import GPUAsyncProcess
from .utils import find_kernel, _module_reader
import resource
import numpy as np
_default_block_size = 256
_all_function_names = ['full_bls_no_sol',
'bin_and_phase_fold_custom',
'reduction_max',
'store_best_sols',
'store_best_sols_custom',
'bin_and_phase_fold_bst_multifreq',
'binned_bls_bst']
_function_signatures = {
'full_bls_no_sol': [np.intp, np.intp, np.intp,
np.intp, np.intp, np.intp,
np.intp, np.uint32, np.uint32,
np.uint32, np.uint32, np.uint32,
np.float32, np.float32],
'bin_and_phase_fold_custom': [np.intp, np.intp, np.intp,
np.intp, np.intp, np.intp,
np.intp, np.intp, np.int32,
np.uint32, np.uint32, np.uint32,
np.uint32],
'reduction_max': [np.intp, np.intp, np.uint32, np.uint32, np.uint32,
np.intp, np.intp, np.uint32, np.uint32],
'store_best_sols': [np.intp, np.intp, np.intp, np.uint32,
np.uint32, np.uint32, np.float32, np.uint32,
np.uint32],
'store_best_sols_custom': [np.intp, np.intp, np.intp,
np.intp, np.intp, np.uint32,
np.uint32, np.uint32, np.uint32],
'bin_and_phase_fold_bst_multifreq':
[np.intp, np.intp, np.intp, np.intp,
np.intp, np.intp, np.uint32, np.uint32,
np.uint32, np.uint32, np.uint32, np.uint32,
np.float32, np.uint32],
'binned_bls_bst': [np.intp, np.intp, np.intp, np.uint32]
}
def _reduction_max(max_func, arr, arr_args, nfreq, nbins,
stream, final_arr, final_argmax_arr,
final_index, block_size):
# assert power of 2
assert(block_size - 2 * (block_size / 2) == 0)
block = (block_size, 1, 1)
grid_size = int(np.ceil(float(nbins) / block_size)) * nfreq
grid = (grid_size, 1)
nbins0 = nbins
init = np.uint32(1)
while (grid_size > nfreq):
max_func.prepared_async_call(grid, block, stream,
arr.ptr, arr_args.ptr,
np.uint32(nfreq), np.uint32(nbins0),
np.uint32(nbins),
arr.ptr, arr_args.ptr, np.uint32(0), init)
init = np.uint32(0)
nbins0 = grid_size / nfreq
grid_size = int(np.ceil(float(nbins0) / block_size)) * nfreq
grid = (grid_size, 1)
max_func.prepared_async_call(grid, block, stream,
arr.ptr, arr_args.ptr,
np.uint32(nfreq), np.uint32(nbins0),
np.uint32(nbins),
final_arr.ptr, final_argmax_arr.ptr,
np.uint32(final_index), init)
[docs]def fmin_transit(t, rho=1., min_obs_per_transit=5, **kwargs):
T = max(t) - min(t)
qmin = float(min_obs_per_transit) / len(t)
fmin1 = freq_transit(qmin, rho=rho)
fmin2 = 2./(max(t) - min(t))
return max([fmin1, fmin2])
[docs]def fmax_transit0(rho=1., **kwargs):
return 8.6307 * np.sqrt(rho)
[docs]def q_transit(freq, rho=1., **kwargs):
fmax0 = fmax_transit0(rho=rho)
f23 = np.power(freq / fmax0, 2./3.)
f23 = np.minimum(1., f23)
return np.arcsin(f23) / np.pi
[docs]def freq_transit(q, rho=1., **kwargs):
fmax0 = fmax_transit0(rho=rho)
return fmax0 * (np.sin(np.pi * q) ** 1.5)
[docs]def fmax_transit(rho=1., qmax=0.5, **kwargs):
fmax0 = fmax_transit0(rho=rho)
return min([fmax0, freq_transit(qmax, rho=rho, **kwargs)])
[docs]def transit_autofreq(t, fmin=None, fmax=None, samples_per_peak=2,
rho=1., qmin_fac=0.2, qmax_fac=None, **kwargs):
"""
Produce list of frequencies for a given frequency range
suitable for performing Keplerian BLS.
Parameters
----------
t: array_like, float
Observation times.
fmin: float, optional (default: ``None``)
Minimum frequency. By default this is determined by ``fmin_transit``.
fmax: float, optional (default: ``None``)
Maximum frequency. By default this is determined by ``fmax_transit``.
samples_per_peak: float, optional (default: 2)
Oversampling factor. Frequency spacing is multiplied by
``1/samples_per_peak``.
rho: float, optional (default: 1)
Mean stellar density of host star in solar units
:math:`\\rho=\\rho_{\\star} / \\rho_{\\odot}`, where
:math:`\\rho_{\\odot}`
is the mean density of the sun
qmin_fac: float, optional (default: 0.2)
The minimum :math:`q` value to search in units of the Keplerian
:math:`q` value
qmax_fac: float, optional (default: None)
The maximum :math:`q` value to search in units of the Keplerian
:math:`q` value. If ``None``, this defaults to ``1/qmin_fac``.
Returns
-------
freqs: array_like
The frequency grid
q0vals: array_like
The list of Keplerian :math:`q` values.
"""
if qmax_fac is None:
qmax_fac = 1./qmin_fac
if fmin is None:
fmin = fmin_transit(t, rho=rho, samples_per_peak=samples_per_peak,
**kwargs)
if fmax is None:
fmax = fmax_transit(rho=rho, **kwargs)
T = max(t) - min(t)
freqs = [fmin]
while freqs[-1] < fmax:
df = qmin_fac * q_transit(freqs[-1], rho=rho) / (samples_per_peak * T)
freqs.append(freqs[-1] + df)
freqs = np.array(freqs)
q0vals = q_transit(freqs, rho=rho)
return freqs, q0vals
[docs]def compile_bls(block_size=_default_block_size,
function_names=_all_function_names,
prepare=True,
**kwargs):
"""
Compile BLS kernel
Parameters
----------
block_size: int, optional (default: _default_block_size)
CUDA threads per CUDA block.
function_names: list, optional (default: _all_function_names)
Function names to load and prepare
prepare: bool, optional (default: True)
Whether or not to prepare functions (for slightly faster
kernel launching)
Returns
-------
functions: dict
Dictionary of (function name, PyCUDA function object) pairs
"""
# Read kernel
cppd = dict(BLOCK_SIZE=block_size)
kernel_txt = _module_reader(find_kernel('bls'),
cpp_defs=cppd)
# compile kernel
module = SourceModule(kernel_txt, options=['--use_fast_math'])
functions = {name: module.get_function(name) for name in function_names}
# prepare functions
if prepare:
for name in functions.keys():
sig = _function_signatures[name]
functions[name] = functions[name].prepare(sig)
return functions
[docs]class BLSMemory(object):
def __init__(self, max_ndata, max_nfreqs, stream=None, **kwargs):
self.max_ndata = max_ndata
self.max_nfreqs = max_nfreqs
self.t = None
self.yw = None
self.w = None
self.t_g = None
self.yw_g = None
self.w_g = None
self.freqs = None
self.freqs_g = None
self.qmin = None
self.nbins0_g = None
self.qmax = None
self.nbinsf_g = None
self.bls = None
self.bls_g = None
self.rtype = np.float32
self.stream = stream
self.allocate_pinned_arrays(nfreqs=max_nfreqs, ndata=max_ndata)
[docs] def allocate_pinned_arrays(self, nfreqs=None, ndata=None):
if nfreqs is None:
nfreqs = int(self.max_nfreqs)
if ndata is None:
ndata = int(self.max_ndata)
self.bls = cuda.aligned_zeros(shape=(nfreqs,),
dtype=self.rtype,
alignment=resource.getpagesize())
self.bls = cuda.register_host_memory(self.bls)
self.nbins0 = cuda.aligned_zeros(shape=(nfreqs,),
dtype=np.int32,
alignment=resource.getpagesize())
self.nbins0 = cuda.register_host_memory(self.nbins0)
self.nbinsf = cuda.aligned_zeros(shape=(nfreqs,),
dtype=np.int32,
alignment=resource.getpagesize())
self.nbinsf = cuda.register_host_memory(self.nbinsf)
self.t = cuda.aligned_zeros(shape=(ndata,),
dtype=self.rtype,
alignment=resource.getpagesize())
self.t = cuda.register_host_memory(self.t)
self.yw = cuda.aligned_zeros(shape=(ndata,),
dtype=self.rtype,
alignment=resource.getpagesize())
self.yw = cuda.register_host_memory(self.yw)
self.w = cuda.aligned_zeros(shape=(ndata,),
dtype=self.rtype,
alignment=resource.getpagesize())
self.w = cuda.register_host_memory(self.w)
[docs] def allocate_freqs(self, nfreqs=None):
if nfreqs is None:
nfreqs = self.max_nfreqs
self.freqs_g = gpuarray.zeros(nfreqs, dtype=self.rtype)
self.bls_g = gpuarray.zeros(nfreqs, dtype=self.rtype)
self.nbins0_g = gpuarray.zeros(nfreqs, dtype=np.uint32)
self.nbinsf_g = gpuarray.zeros(nfreqs, dtype=np.uint32)
[docs] def allocate_data(self, ndata=None):
if ndata is None:
ndata = len(self.t)
self.t_g = gpuarray.zeros(ndata, dtype=self.rtype)
self.yw_g = gpuarray.zeros(ndata, dtype=self.rtype)
self.w_g = gpuarray.zeros(ndata, dtype=self.rtype)
[docs] def transfer_data_to_gpu(self, transfer_freqs=True):
self.t_g.set_async(self.t, stream=self.stream)
self.yw_g.set_async(self.yw, stream=self.stream)
self.w_g.set_async(self.w, stream=self.stream)
if transfer_freqs:
self.freqs_g.set_async(self.freqs, stream=self.stream)
self.nbins0_g.set_async(self.nbins0, stream=self.stream)
self.nbinsf_g.set_async(self.nbinsf, stream=self.stream)
[docs] def transfer_data_to_cpu(self):
# self.bls_g.get_async(ary=self.bls, stream=self.stream)
if self.stream is None:
self.bls = self.bls_g.get() / self.yy
else:
self.bls_g.get_async(ary=self.bls, stream=self.stream)
self.bls /= self.yy
# return self.bls
[docs] def setdata(self, t, y, dy, qmin=None, qmax=None,
freqs=None, nf=None, transfer=True,
**kwargs):
if freqs is not None:
self.freqs = np.asarray(freqs).astype(self.rtype)
self.nbinsf = (np.ones_like(self.freqs)/qmin).astype(np.uint32)
self.nbins0 = (np.ones_like(self.freqs)/qmax).astype(np.uint32)
self.t[:len(t)] = np.asarray(t).astype(self.rtype)[:]
w = np.power(dy, -2)
w /= sum(w)
self.w[:len(t)] = np.asarray(w).astype(self.rtype)[:]
self.ybar = sum(y * w)
self.yy = np.dot(w, np.power(y - self.ybar, 2))
u = (y - self.ybar) * w
self.yw[:len(t)] = np.asarray(u).astype(self.rtype)[:]
if any([x is None for x in [self.t_g, self.yw_g, self.w_g]]):
self.allocate_data()
if self.freqs_g is None:
if nf is None:
nf = len(freqs)
self.allocate_freqs(nfreqs=nf)
if transfer:
self.transfer_data_to_gpu(transfer_freqs=(freqs is not None))
return self
[docs] @classmethod
def fromdata(cls, t, y, dy, qmin=None, qmax=None,
freqs=None, nf=None, transfer=True,
**kwargs):
max_ndata = kwargs.get('max_ndata', len(t))
max_nfreqs = kwargs.get('max_nfreqs', nf if freqs is None
else len(freqs))
c = cls(max_ndata, max_nfreqs, **kwargs)
return c.setdata(t, y, dy, qmin=qmin, qmax=qmax,
freqs=freqs, nf=nf, transfer=transfer,
**kwargs)
[docs]def eebls_gpu_fast(t, y, dy, freqs, qmin=1e-2, qmax=0.5,
functions=None, stream=None, dlogq=0.3,
memory=None, noverlap=2, max_nblocks=5000,
force_nblocks=None, dphi=0.0,
shmem_lim=None, freq_batch_size=None,
transfer_to_device=True,
transfer_to_host=True, **kwargs):
"""
Box-Least Squares with PyCUDA but about 2-3 orders of magnitude
faster than eebls_gpu. Uses shared memory for the binned data,
which means that there is a lower limit on the q values that
this function can handle.
To save memory and improve speed, the best solution is not
kept. To get the best solution, run ``eebls_gpu`` at the
optimal frequency.
.. warning::
If you are running on a single-GPU machine, there may be a
kernel time limit set by your OS. If running this function
produces a timeout error, try setting ``freq_batch_size`` to a
reasonable number (~10). That will split up the computations by
frequency.
.. note::
No extra global memory is needed, meaning you likely do *not* need
to use ``large_run`` with this function.
.. note::
There is no ``noverlap`` parameter here yet. This is only a problem
if the optimal ``q`` value is close to ``qmin``. To alleviate this,
you can run this function ``noverlap`` times with
``dphi = i/noverlap`` for the ``i``-th run. Then take the best solution
of all runs.
Parameters
----------
t: array_like, float
Observation times
y: array_like, float
Observations
dy: array_like, float
Observation uncertainties
freqs: array_like, float
Frequencies
qmin: float or array_like, optional (default: 1e-2)
minimum q values to search at each frequency
qmax: float or array_like (default: 0.5)
maximum q values to search at each frequency
dphi: float, optional (default: 0.)
Phase offset (in units of the finest grid spacing). If you
want ``noverlap`` bins at the smallest ``q`` value, run this
function ``noverlap`` times, with ``dphi = i / noverlap``
for the ``i``-th run and take the best solution for all the runs.
dlogq: float
The logarithmic spacing of the q values to use. If negative,
the q values increase by ``dq = qmin``.
functions: dict
Dictionary of compiled functions (see :func:`compile_bls`)
freq_batch_size: int, optional (default: None)
Number of frequencies to compute in a single batch; if
``None`` this will run a single batch for all frequencies
simultaneously
shmem_lim: int, optional (default: None)
Maximum amount of shared memory to use per block in bytes.
This is GPU-dependent but usually around 48KB. If ``None``,
uses device information provided by PyCUDA (recommended).
max_nblocks: int, optional (default: 200)
Maximum grid size to use
force_nblocks: int, optional (default: None)
If this is set the gridsize is forced to be this value
memory: :class:`BLSMemory` instance, optional (default: None)
See :class:`BLSMemory`.
transfer_to_host: bool, optional (default: True)
Transfer BLS back to CPU.
transfer_to_device: bool, optional (default: True)
Transfer data to GPU
**kwargs:
passed to `compile_bls`
Returns
-------
bls: array_like, float
BLS periodogram, normalized to
:math:`1 - \chi_2(\omega) / \chi_2(constant)`
"""
fname = 'full_bls_no_sol'
if functions is None:
functions = compile_bls(function_names=[fname], **kwargs)
func = functions[fname]
if shmem_lim is None:
dev = pycuda.autoinit.device
att = cuda.device_attribute.MAX_SHARED_MEMORY_PER_BLOCK
shmem_lim = pycuda.autoinit.device.get_attribute(att)
if memory is None:
memory = BLSMemory.fromdata(t, y, dy, qmin=qmin, qmax=qmax,
freqs=freqs, stream=stream,
transfer=True,
**kwargs)
elif transfer_to_device:
memory.setdata(t, y, dy, qmin=qmin, qmax=qmax,
freqs=freqs, transfer=True,
**kwargs)
float_size = np.float32(1).nbytes
block_size = kwargs.get('block_size', _default_block_size)
if freq_batch_size is None:
freq_batch_size = len(freqs)
nbatches = int(np.ceil(len(freqs) / freq_batch_size))
block = (block_size, 1, 1)
# minimum q value that we can handle with the shared memory limit
qmin_min = 2 * float_size / (shmem_lim - float_size * block_size)
i_freq = 0
while(i_freq < len(freqs)):
j_freq = min([i_freq + freq_batch_size, len(freqs)])
nfreqs = j_freq - i_freq
max_nbins = max(memory.nbinsf[i_freq:j_freq])
mem_req = (block_size + 2 * max_nbins) * float_size
if mem_req > shmem_lim:
s = "qmin = %.2e requires too much shared memory." % (1./max_nbins)
s += " Either try a larger value of qmin (> %e)" % (qmin_min)
s += " or avoid using eebls_gpu_fast."
raise Exception(s)
# nblocks = int((2 * max_shmem / (mem_req + 4 * float_size)))
nblocks = min([nfreqs, max_nblocks])
if force_nblocks is not None:
nblocks = force_nblocks
grid = (nblocks, 1)
args = (grid, block)
if stream is not None:
args += (stream,)
args += (memory.t_g.ptr, memory.yw_g.ptr, memory.w_g.ptr)
args += (memory.bls_g.ptr, memory.freqs_g.ptr)
args += (memory.nbins0_g.ptr, memory.nbinsf_g.ptr)
args += (np.uint32(len(t)), np.uint32(nfreqs),
np.uint32(i_freq))
args += (np.uint32(max_nbins), np.uint32(noverlap))
args += (np.float32(dlogq), np.float32(dphi))
if stream is not None:
func.prepared_async_call(*args, shared_size=int(mem_req))
else:
func.prepared_call(*args, shared_size=int(mem_req))
i_freq = j_freq
if transfer_to_host:
memory.transfer_data_to_cpu()
if stream is not None:
stream.synchronize()
return memory.bls
[docs]def eebls_gpu_custom(t, y, dy, freqs, q_values, phi_values,
freq_batch_size=None, nstreams=5, max_memory=None,
functions=None, **kwargs):
"""
Box-Least Squares, with custom q and phi values. Useful
if you're honing the initial solution or testing between
a relatively small number of possible solutions.
Parameters
----------
t: array_like, float
Observation times
y: array_like, float
Observations
dy: array_like, float
Observation uncertainties
freqs: array_like, float
Frequencies
q_values: array_like
Set of q values to search at each trial frequency
phi_values: float or array_like
Set of phi values to search at each trial frequency
nstreams: int, optional (default: 5)
Number of CUDA streams to utilize.
freq_batch_size: int, optional (default: None)
Number of frequencies to compute in a single batch; determines
this automatically by default based on ``max_memory``
max_memory: float, optional (default: None)
Maximum memory to use in bytes. Will ignore this if
``freq_batch_size`` is specified. If ``None``, will use the
free memory given by ``pycuda.driver.mem_get_info()``
functions: tuple of CUDA functions
Dictionary of prepared functions from :func:`compile_bls`.
**kwargs:
passed to :func:`compile_bls`
Returns
-------
bls: array_like, float
BLS periodogram, normalized to 1 - chi2(best_fit) / chi2(constant)
qphi_sols: list of (q, phi) tuples
Best (q, phi) solution at each frequency
"""
functions = functions if functions is not None \
else compile_bls(**kwargs)
block_size = kwargs.get('block_size', _default_block_size)
ndata = len(t)
# read max_memory as total free memory available from driver
if max_memory is None:
free, total = cuda.mem_get_info()
max_memory = int(0.9 * free)
if freq_batch_size is None:
# compute memory
real_type_size = 4
# data
mem0 = ndata * 3 * real_type_size
nq = len(q_values)
nphi = len(phi_values)
# q_values and phi_values
mem0 += nq + nphi
# freqs + bls + best_phi + best_q + best_sol (int32)
mem0 += len(freqs) * 5 * real_type_size
# yw_g_bins, w_g_bins, bls_tmp_gs, bls_tmp_sol_gs (int32)
mem_per_f = 4 * nstreams * nq * nphi * real_type_size
freq_batch_size = int(float(max_memory - mem0) / (mem_per_f))
if freq_batch_size == 0:
raise Exception("Not enough memory (freq_batch_size = 0)")
nbtot = len(q_values) * len(phi_values) * freq_batch_size
grid_size = int(np.ceil(float(nbtot) / block_size))
# move data to GPU
w = np.power(dy, -2)
w /= sum(w)
ybar = np.dot(w, y)
YY = np.dot(w, np.power(np.array(y) - ybar, 2))
yw = (np.array(y) - ybar) * np.array(w)
t_g = gpuarray.to_gpu(np.array(t).astype(np.float32))
yw_g = gpuarray.to_gpu(yw.astype(np.float32))
w_g = gpuarray.to_gpu(np.array(w).astype(np.float32))
freqs_g = gpuarray.to_gpu(np.array(freqs).astype(np.float32))
yw_g_bins, w_g_bins, bls_tmp_gs, bls_tmp_sol_gs, streams \
= [], [], [], [], []
for i in range(nstreams):
streams.append(cuda.Stream())
yw_g_bins.append(gpuarray.zeros(nbtot, dtype=np.float32))
w_g_bins.append(gpuarray.zeros(nbtot, dtype=np.float32))
bls_tmp_gs.append(gpuarray.zeros(nbtot, dtype=np.float32))
bls_tmp_sol_gs.append(gpuarray.zeros(nbtot, dtype=np.uint32))
bls_g = gpuarray.zeros(len(freqs), dtype=np.float32)
bls_sol_g = gpuarray.zeros(len(freqs), dtype=np.uint32)
bls_best_phi = gpuarray.zeros(len(freqs), dtype=np.float32)
bls_best_q = gpuarray.zeros(len(freqs), dtype=np.float32)
q_values_g = gpuarray.to_gpu(np.asarray(q_values).astype(np.float32))
phi_values_g = gpuarray.to_gpu(np.asarray(phi_values).astype(np.float32))
block = (block_size, 1, 1)
grid = (grid_size, 1)
nbatches = int(np.ceil(float(len(freqs)) / freq_batch_size))
bls = np.zeros(len(freqs))
bin_func = functions['bin_and_phase_fold_custom']
bls_func = functions['binned_bls_bst']
max_func = functions['reduction_max']
store_func = functions['store_best_sols_custom']
for batch in range(nbatches):
imin = freq_batch_size * batch
imax = min([len(freqs), freq_batch_size * (batch + 1)])
nf = imax - imin
j = batch % nstreams
yw_g_bin = yw_g_bins[j]
w_g_bin = w_g_bins[j]
bls_tmp_g = bls_tmp_gs[j]
bls_tmp_sol_g = bls_tmp_sol_gs[j]
stream = streams[j]
yw_g_bin.fill(np.float32(0), stream=stream)
w_g_bin.fill(np.float32(0), stream=stream)
bls_tmp_g.fill(np.float32(0), stream=stream)
bls_tmp_sol_g.fill(np.int32(0), stream=stream)
bin_grid = (int(np.ceil(float(len(t) * nf) / block_size)), 1)
args = (bin_grid, block, stream)
args += (t_g.ptr, yw_g.ptr, w_g.ptr)
args += (yw_g_bin.ptr, w_g_bin.ptr, freqs_g.ptr)
args += (q_values_g.ptr, phi_values_g.ptr)
args += (np.uint32(len(q_values)), np.uint32(len(phi_values)))
args += (np.uint32(len(t)), np.uint32(nf))
args += (np.uint32(freq_batch_size * batch),)
bin_func.prepared_async_call(*args)
nb = len(q_values) * len(phi_values)
bls_grid = (int(np.ceil(float(nf * nb) / block_size)), 1)
args = (bls_grid, block, stream)
args += (yw_g_bin.ptr, w_g_bin.ptr)
args += (bls_tmp_g.ptr, np.uint32(nf * nb))
bls_func.prepared_async_call(*args)
args = (max_func, bls_tmp_g, bls_tmp_sol_g)
args += (nf, nb, stream, bls_g, bls_sol_g)
args += (batch * freq_batch_size, block_size)
_reduction_max(*args)
store_grid = (int(np.ceil(float(nf) / block_size)), 1)
args = (store_grid, block, stream)
args += (bls_sol_g.ptr, bls_best_phi.ptr, bls_best_q.ptr)
args += (q_values_g.ptr, phi_values_g.ptr)
args += (np.uint32(len(q_values)), np.uint32(len(phi_values)))
args += (np.uint32(nf), np.uint32(batch * freq_batch_size))
store_func.prepared_async_call(*args)
best_q = bls_best_q.get()
best_phi = bls_best_phi.get()
qphi_sols = list(zip(best_q, best_phi))
return bls_g.get()/YY, qphi_sols
[docs]def dnbins(nbins, dlogq):
if (dlogq < 0):
return 1
n = int(np.floor(dlogq * nbins))
return n if n > 0 else 1
[docs]def nbins_iter(i, nb0, dlogq):
nb = nb0
for j in range(i):
nb += dnbins(nb, dlogq)
return nb
[docs]def count_tot_nbins(nbins0, nbinsf, dlogq):
ntot = 0
i = 0
while nbins_iter(i, nbins0, dlogq) <= nbinsf:
ntot += nbins_iter(i, nbins0, dlogq)
i += 1
return ntot
[docs]def eebls_gpu(t, y, dy, freqs, qmin=1e-2, qmax=0.5,
nstreams=5, noverlap=3, dlogq=0.2, max_memory=None,
freq_batch_size=None, functions=None, **kwargs):
"""
Box-Least Squares, accelerated with PyCUDA
Parameters
----------
t: array_like, float
Observation times
y: array_like, float
Observations
dy: array_like, float
Observation uncertainties
freqs: array_like, float
Frequencies
qmin: float or array_like
Minimum q value(s) to test for each frequency
qmax: float or array_like
Maximum q value(s) to test for each frequency
nstreams: int, optional (default: 5)
Number of CUDA streams to utilize.
noverlap: int, optional (default: 3)
Number of overlapping q bins to use
dlogq: float, optional, (default: 0.5)
logarithmic spacing of :math:`q` values, where :math:`d\log q = dq / q`
freq_batch_size: int, optional (default: None)
Number of frequencies to compute in a single batch; determines
this automatically based on ``max_memory``
max_memory: float, optional (default: None)
Maximum memory to use in bytes. Will ignore this if
``freq_batch_size`` is specified, and will use the total free memory
as returned by ``pycuda.driver.mem_get_info`` if this is ``None``.
functions: tuple of CUDA functions
returned by ``compile_bls``
Returns
-------
bls: array_like, float
BLS periodogram, normalized to :math:`1 - \chi^2(f) / \chi^2_0`
qphi_sols: list of ``(q, phi)`` tuples
Best ``(q, phi)`` solution at each frequency
"""
def locext(ext, arr, imin=None, imax=None):
if isinstance(arr, float) or isinstance(arr, int):
return arr
return ext(arr[slice(imin, imax)])
functions = functions if functions is not None \
else compile_bls(**kwargs)
if max_memory is None:
free, total = cuda.mem_get_info()
max_memory = int(0.9 * free)
# smallest and largest number of bins
nbins0_max = 1
nbinsf_max = 1
block_size = kwargs.get('block_size', _default_block_size)
max_q_vals = locext(max, qmax)
min_q_vals = locext(min, qmin)
nbins0_max = int(np.floor(1./max_q_vals))
nbinsf_max = int(np.ceil(1./min_q_vals))
ndata = len(t)
nbins_tot_max = count_tot_nbins(nbins0_max, nbinsf_max, dlogq)
if freq_batch_size is None:
# compute memory
real_type_size = np.float32(1).nbytes
# data
mem0 = ndata * 3 * real_type_size
# freqs + bls + best_phi + best_q + best_sol (int32)
mem0 += len(freqs) * 5 * real_type_size
# yw_g_bins, w_g_bins, bls_tmp_gs, bls_tmp_sol_gs (int32)
mem_per_f = 4 * nstreams * nbins_tot_max * noverlap * real_type_size
freq_batch_size = int(float(max_memory - mem0) / (mem_per_f))
if freq_batch_size == 0:
raise Exception("Not enough memory (freq_batch_size = 0)")
gs = freq_batch_size * nbins_tot_max * noverlap
grid_size = int(np.ceil(float(gs) / block_size))
# move data to GPU
w = np.power(dy, -2)
w /= sum(w)
ybar = np.dot(w, y)
YY = np.dot(w, np.power(np.array(y) - ybar, 2))
yw = (np.array(y) - ybar) * np.array(w)
t_g = gpuarray.to_gpu(np.array(t).astype(np.float32))
yw_g = gpuarray.to_gpu(yw.astype(np.float32))
w_g = gpuarray.to_gpu(np.array(w).astype(np.float32))
freqs_g = gpuarray.to_gpu(np.array(freqs).astype(np.float32))
yw_g_bins, w_g_bins, bls_tmp_gs, bls_tmp_sol_gs, streams \
= [], [], [], [], []
for i in range(nstreams):
streams.append(cuda.Stream())
yw_g_bins.append(gpuarray.zeros(gs, dtype=np.float32))
w_g_bins.append(gpuarray.zeros(gs, dtype=np.float32))
bls_tmp_gs.append(gpuarray.zeros(gs, dtype=np.float32))
bls_tmp_sol_gs.append(gpuarray.zeros(gs, dtype=np.int32))
bls_g = gpuarray.zeros(len(freqs), dtype=np.float32)
bls_sol_g = gpuarray.zeros(len(freqs), dtype=np.int32)
bls_best_phi = gpuarray.zeros(len(freqs), dtype=np.float32)
bls_best_q = gpuarray.zeros(len(freqs), dtype=np.float32)
block = (block_size, 1, 1)
grid = (grid_size, 1)
nbatches = int(np.ceil(float(len(freqs)) / freq_batch_size))
bls = np.zeros(len(freqs))
bin_func = functions['bin_and_phase_fold_bst_multifreq']
bls_func = functions['binned_bls_bst']
max_func = functions['reduction_max']
store_func = functions['store_best_sols']
for batch in range(nbatches):
imin = freq_batch_size * batch
imax = min([len(freqs), freq_batch_size * (batch + 1)])
minq = locext(min, qmin, imin, imax)
maxq = locext(max, qmax, imin, imax)
nbins0 = int(np.floor(1./maxq))
nbinsf = int(np.ceil(1./minq))
nbins_tot = count_tot_nbins(nbins0, nbinsf, dlogq)
nf = imax - imin
j = batch % nstreams
yw_g_bin = yw_g_bins[j]
w_g_bin = w_g_bins[j]
bls_tmp_g = bls_tmp_gs[j]
bls_tmp_sol_g = bls_tmp_sol_gs[j]
stream = streams[j]
# stream.synchronize()
yw_g_bin.fill(np.float32(0), stream=stream)
w_g_bin.fill(np.float32(0), stream=stream)
bls_tmp_g.fill(np.float32(0), stream=stream)
bls_tmp_sol_g.fill(np.int32(0), stream=stream)
bin_grid = (int(np.ceil(float(ndata * nf) / block_size)), 1)
args = (bin_grid, block, stream)
args += (t_g.ptr, yw_g.ptr, w_g.ptr)
args += (yw_g_bin.ptr, w_g_bin.ptr, freqs_g.ptr)
args += (np.int32(ndata), np.int32(nf))
args += (np.int32(nbins0), np.int32(nbinsf))
args += (np.int32(freq_batch_size * batch), np.int32(noverlap))
args += (np.float32(dlogq), np.int32(nbins_tot))
bin_func.prepared_async_call(*args)
all_bins = nf * nbins_tot * noverlap
bls_grid = (int(np.ceil(float(all_bins) / block_size)), 1)
args = (bls_grid, block, stream)
args += (yw_g_bin.ptr, w_g_bin.ptr)
args += (bls_tmp_g.ptr, np.int32(all_bins))
bls_func.prepared_async_call(*args)
args = (max_func, bls_tmp_g, bls_tmp_sol_g)
args += (nf, nbins_tot * noverlap, stream, bls_g, bls_sol_g)
args += (batch * freq_batch_size, block_size)
_reduction_max(*args)
store_grid = (int(np.ceil(float(nf) / block_size)), 1)
args = (store_grid, block, stream)
args += (bls_sol_g.ptr, bls_best_phi.ptr, bls_best_q.ptr)
args += (np.uint32(nbins0), np.uint32(nbinsf), np.uint32(noverlap))
args += (np.float32(dlogq), np.uint32(nf))
args += (np.uint32(batch * freq_batch_size),)
store_func.prepared_async_call(*args)
best_q = bls_best_q.get()
best_phi = bls_best_phi.get()
qphi_sols = list(zip(best_q, best_phi))
return bls_g.get()/YY, qphi_sols
[docs]def single_bls(t, y, dy, freq, q, phi0):
"""
Evaluate BLS power for a single set of (freq, q, phi0)
parameters.
Parameters
----------
t: array_like, float
Observation times
y: array_like, float
Observations
dy: array_like, float
Observation uncertainties
freq: float
Frequency of the signal
q: float
Transit duration in phase
phi0: float
Phase offset of transit
Returns
-------
bls: float
BLS power for this set of parameters
"""
phi = np.asarray(t).astype(np.float32) * np.float32(freq)
phi -= np.float32(phi0)
phi -= np.floor(phi)
mask = phi < np.float32(q)
w = np.power(dy, -2)
w /= np.sum(w.astype(np.float32))
ybar = np.dot(w, np.asarray(y).astype(np.float32))
YY = np.dot(w, np.power(np.asarray(y).astype(np.float32) - ybar, 2))
W = np.sum(w[mask])
YW = np.dot(w[mask], np.asarray(y).astype(np.float32)[mask]) - ybar * W
return 0 if W < 1e-9 else (YW ** 2) / (W * (1 - W)) / YY
[docs]def hone_solution(t, y, dy, f0, df0, q0, dlogq0, phi0, stop=1e-5,
samples_per_peak=5, max_iter=50, noverlap=3, **kwargs):
"""
Experimental!
"""
p0 = single_bls(t, y, dy, f0, q0, phi0)
pn = None
df = df0
dlogq = dlogq0
q = q0
phi = phi0
f = f0
nol = noverlap
baseline = max(t) - min(t)
functions = compile_bls(**kwargs)
i = 0
while pn is None or i < 5 or ((pn - p0) / p0 > stop and i < max_iter):
if pn is not None:
p0 = pn
fmin, fmax = f - 25 * df, f + 25 * df
qmin = q / (1 + 5 * dlogq)
qmax = q * (1 + 5 * dlogq)
df *= 0.1
dlogq *= 0.25
nq = int(np.ceil(np.log(qmax/qmin)/dlogq))
q_values = np.logspace(np.log(qmin), np.log(qmax), num=nq, base=np.e)
dphi = 2 * ((fmax - fmin) * baseline * samples_per_peak + dlogq * q)
phimin = 0.
phimax = 1.
if dphi < 0.25:
phimin = phi - dphi
phimax = phi + dphi
nphi = max([10, int(np.ceil(10 * min([2 * dphi, 1.]) / qmin))])
phi_values = np.linspace(phimin, phimax, nphi)
nf = int((fmax - fmin)/df)
freqs = np.linspace(fmin, fmax + df, nf)
powers, sols = eebls_gpu_custom(t, y, dy, freqs, q_values, phi_values,
freq_batch_size=5, nstreams=5,
functions=functions, **kwargs)
ibest = np.argmax(powers)
f = freqs[ibest]
q, phi = sols[ibest]
pn = powers[ibest]
i += 1
return f, pn, i, (q, phi)
[docs]def eebls_transit_gpu(t, y, dy, fmax_frac=1.0, fmin_frac=1.0,
qmin_fac=0.5, qmax_fac=2.0, fmin=None,
fmax=None, freqs=None, qvals=None, use_fast=False,
**kwargs):
"""
Compute BLS for timeseries assuming edge-on keplerian
orbit of a planet with Mp/Ms << 1, Rp/Rs < 1, Lp/Ls << 1 and
negligible eccentricity.
Parameters
----------
t: array_like, float
Observation times
y: array_like, float
Observations
dy: array_like, float
Observation uncertainties
fmax_frac: float, optional (default: 1.0)
Maximum frequency is `fmax_frac * fmax`, where
`fmax` is automatically selected by `fmax_transit`.
fmin_frac: float, optional (default: 1.5)
Minimum frequency is `fmin_frac * fmin`, where
`fmin` is automatically selected by `fmin_transit`.
fmin: float, optional (default: None)
Overrides automatic frequency minimum with this value
fmax: float, optional (default: None)
Overrides automatic frequency maximum with this value
qmin_fac: float, optional (default: 0.5)
Fraction of the fiducial q value to search
at each frequency (minimum)
qmax_fac: float, optional (default: 2.0)
Fraction of the fiducial q value to search
at each frequency (maximum)
freqs: array_like, optional (default: None)
Overrides the auto-generated frequency grid
qvals: array_like, optional (default: None)
Overrides the keplerian q values
functions: tuple, optional (default=None)
result of ``compile_bls(**kwargs)``.
use_fast: bool, optional (default: False)
**kwargs:
passed to `eebls_gpu`, `compile_bls`, `fmax_transit`,
`fmin_transit`, and `transit_autofreq`
Returns
-------
freqs: array_like, float
Frequencies where BLS is evaluated
bls: array_like, float
BLS periodogram, normalized to :math:`1 - \chi^2(f) / \chi^2_0`
solutions: list of ``(q, phi)`` tuples
Best ``(q, phi)`` solution at each frequency
.. note::
Only returned when ``use_fast=False``.
"""
if freqs is None:
if qvals is not None:
raise Exception("qvals must be None if freqs is None")
if fmin is None:
fmin = fmin_transit(t, **kwargs) * fmin_frac
if fmax is None:
fmax = fmax_transit(qmax=0.5 / qmax_fac, **kwargs) * fmax_frac
freqs, qvals = transit_autofreq(t, fmin=fmin, fmax=fmax,
qmin_fac=qmin_fac, **kwargs)
if qvals is None:
qvals = q_transit(freqs, **kwargs)
qmins = qvals * qmin_fac
qmaxes = qvals * qmax_fac
if use_fast:
powers = eebls_gpu_fast(t, y, dy, freqs,
qmin=qmins, qmax=qmaxes,
**kwargs)
return freqs, powers
powers, sols = eebls_gpu(t, y, dy, freqs,
qmin=qmins, qmax=qmaxes,
**kwargs)
return freqs, powers, sols