Source code for cuvarbase.ce

"""
Implementation of Graham et al. 2013's Conditional Entropy
period finding algorithm
"""
from __future__ import print_function, division

from builtins import zip
from builtins import range
from builtins import object

import numpy as np

import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
from pycuda.compiler import SourceModule

from .core import GPUAsyncProcess
from .utils import _module_reader, find_kernel
from .utils import autofrequency as utils_autofreq

import resource
import warnings


class ConditionalEntropyMemory(object):
    def __init__(self, **kwargs):
        self.phase_bins = kwargs.get('phase_bins', 10)
        self.mag_bins = kwargs.get('mag_bins', 5)
        self.phase_overlap = kwargs.get('phase_overlap', 0)
        self.mag_overlap = kwargs.get('mag_overlap', 0)
        self.max_phi = kwargs.get('max_phi', 3.)
        self.stream = kwargs.get('stream', None)
        self.weighted = kwargs.get('weighted', False)
        self.widen_mag_range = kwargs.get('widen_mag_range', False)
        self.n0 = kwargs.get('n0', None)
        self.nf = kwargs.get('nf', None)
        self.compute_log_prob = kwargs.get('compute_log_prob', False)
        self.balanced_magbins = kwargs.get('balanced_magbins', False)

        if self.weighted and self.balanced_magbins:
            raise Exception("simultaneous balanced_magbins and weighted"
                            " options is not currently supported")

        if self.weighted and self.compute_log_prob:
            raise Exception("simultaneous compute_log_prob and weighted"
                            " options is not currently supported")

        self.n0_buffer = kwargs.get('n0_buffer', None)
        self.buffered_transfer = kwargs.get('buffered_transfer', False)

        self.t = None
        self.y = None
        self.dy = None

        self.t_g = None
        self.y_g = None
        self.dy_g = None

        self.bins_g = None
        self.ce_c = None
        self.ce_g = None

        self.mag_bwf = None
        self.mag_bwf_g = None

        self.real_type = np.float32
        if kwargs.get('use_double', False):
            self.real_type = np.float64

        self.freqs = kwargs.get('freqs', None)
        self.freqs_g = None

        self.mag_bin_fracs = None
        self.mag_bin_fracs_g = None

        self.ytype = np.uint32 if not self.weighted else self.real_type
    def allocate_buffered_data_arrays(self, **kwargs):
        n0 = kwargs.get('n0', self.n0)
        if self.buffered_transfer:
            n0 = kwargs.get('n0_buffer', self.n0_buffer)
        assert(n0 is not None)

        kw = dict(dtype=self.real_type,
                  alignment=resource.getpagesize())

        self.t = cuda.aligned_zeros(shape=(n0,), **kw)
        self.t = cuda.register_host_memory(self.t)

        self.y = cuda.aligned_zeros(shape=(n0,), dtype=self.ytype,
                                    alignment=resource.getpagesize())
        self.y = cuda.register_host_memory(self.y)

        if self.weighted:
            self.dy = cuda.aligned_zeros(shape=(n0,), **kw)
            self.dy = cuda.register_host_memory(self.dy)

        if self.balanced_magbins:
            self.mag_bwf = cuda.aligned_zeros(shape=(self.mag_bins,), **kw)
            self.mag_bwf = cuda.register_host_memory(self.mag_bwf)

        if self.compute_log_prob:
            self.mag_bin_fracs = cuda.aligned_zeros(shape=(self.mag_bins,),
                                                    **kw)
            self.mag_bin_fracs = cuda.register_host_memory(self.mag_bin_fracs)

        return self
    def allocate_pinned_cpu(self, **kwargs):
        nf = kwargs.get('nf', self.nf)
        assert(nf is not None)

        self.ce_c = cuda.aligned_zeros(shape=(nf,), dtype=self.real_type,
                                       alignment=resource.getpagesize())
        self.ce_c = cuda.register_host_memory(self.ce_c)

        return self
    def allocate_data(self, **kwargs):
        n0 = kwargs.get('n0', self.n0)
        if self.buffered_transfer:
            n0 = kwargs.get('n0_buffer', self.n0_buffer)
        assert(n0 is not None)

        self.t_g = gpuarray.zeros(n0, dtype=self.real_type)
        self.y_g = gpuarray.zeros(n0, dtype=self.ytype)
        if self.weighted:
            self.dy_g = gpuarray.zeros(n0, dtype=self.real_type)
    def allocate_bins(self, **kwargs):
        nf = kwargs.get('nf', self.nf)
        assert(nf is not None)

        self.nbins = nf * self.phase_bins * self.mag_bins

        if self.weighted:
            self.bins_g = gpuarray.zeros(self.nbins, dtype=self.real_type)
        else:
            self.bins_g = gpuarray.zeros(self.nbins, dtype=np.uint32)

        if self.balanced_magbins:
            self.mag_bwf_g = gpuarray.zeros(self.mag_bins,
                                            dtype=self.real_type)

        if self.compute_log_prob:
            self.mag_bin_fracs_g = gpuarray.zeros(self.mag_bins,
                                                  dtype=self.real_type)
    def allocate_freqs(self, **kwargs):
        nf = kwargs.get('nf', self.nf)
        assert(nf is not None)

        self.freqs_g = gpuarray.zeros(nf, dtype=self.real_type)
        if self.ce_g is None:
            self.ce_g = gpuarray.zeros(nf, dtype=self.real_type)
    def allocate(self, **kwargs):
        self.freqs = kwargs.get('freqs', self.freqs)
        self.nf = kwargs.get('nf', len(self.freqs))

        if self.freqs is not None:
            self.freqs = np.asarray(self.freqs).astype(self.real_type)

        assert(self.nf is not None)

        self.allocate_data(**kwargs)
        self.allocate_bins(**kwargs)
        self.allocate_freqs(**kwargs)
        self.allocate_pinned_cpu(**kwargs)

        if self.buffered_transfer:
            self.allocate_buffered_data_arrays(**kwargs)

        return self
    def transfer_data_to_gpu(self, **kwargs):
        assert(not any([x is None for x in [self.t, self.y]]))

        self.t_g.set_async(self.t, stream=self.stream)
        self.y_g.set_async(self.y, stream=self.stream)

        if self.weighted:
            assert(self.dy is not None)
            self.dy_g.set_async(self.dy, stream=self.stream)

        if self.balanced_magbins:
            self.mag_bwf_g.set_async(self.mag_bwf, stream=self.stream)

        if self.compute_log_prob:
            self.mag_bin_fracs_g.set_async(self.mag_bin_fracs,
                                           stream=self.stream)
    def transfer_freqs_to_gpu(self, **kwargs):
        freqs = kwargs.get('freqs', self.freqs)
        assert(freqs is not None)

        self.freqs_g.set_async(freqs, stream=self.stream)
    def transfer_ce_to_cpu(self, **kwargs):
        self.ce_g.get_async(stream=self.stream, ary=self.ce_c)
    def compute_mag_bin_fracs(self, y, **kwargs):
        N = float(len(y))
        mbf = np.array([np.sum(y == i) / N for i in range(self.mag_bins)])

        if self.mag_bin_fracs is None:
            self.mag_bin_fracs = np.zeros(self.mag_bins, dtype=self.real_type)
        self.mag_bin_fracs[:self.mag_bins] = mbf[:]
    def balance_magbins(self, y, **kwargs):
        yinds = np.argsort(y)
        ybins = np.zeros(len(y))
        assert len(y) >= self.mag_bins

        di = len(y) / self.mag_bins

        mag_bwf = np.zeros(self.mag_bins)
        for i in range(self.mag_bins):
            imin = max([0, int(i * di)])
            imax = min([len(y), int((i + 1) * di)])

            inds = yinds[imin:imax]
            ybins[inds] = i

            mag_bwf[i] = y[inds[-1]] - y[inds[0]]

        mag_bwf /= (max(y) - min(y))

        return ybins, mag_bwf.astype(self.real_type)
    def setdata(self, t, y, **kwargs):
        dy = kwargs.get('dy', self.dy)

        self.n0 = kwargs.get('n0', len(t))

        t = np.asarray(t).astype(self.real_type)
        y = np.asarray(y).astype(self.real_type)

        yscale = max(y[:self.n0]) - min(y[:self.n0])
        y0 = min(y[:self.n0])
        if self.weighted:
            dy = np.asarray(dy).astype(self.real_type)
            if self.widen_mag_range:
                med_sigma = np.median(dy[:self.n0])
                yscale += 2 * self.max_phi * med_sigma
                y0 -= self.max_phi * med_sigma

            dy /= yscale

        y = (y - y0) / yscale

        if not self.weighted:
            if self.balanced_magbins:
                y, self.mag_bwf = self.balance_magbins(y)
                y = y.astype(self.ytype)
            else:
                y = np.floor(y * self.mag_bins).astype(self.ytype)

            if self.compute_log_prob:
                self.compute_mag_bin_fracs(y)

        if self.buffered_transfer:
            arrs = [self.t, self.y]
            if self.weighted:
                arrs.append(self.dy)

            if any([arr is None for arr in arrs]):
                if self.buffered_transfer:
                    self.allocate_buffered_data_arrays(**kwargs)

            assert(self.n0 <= len(self.t))

            self.t[:self.n0] = t[:self.n0]
            self.y[:self.n0] = y[:self.n0]

            if self.weighted:
                self.dy[:self.n0] = dy[:self.n0]
        else:
            self.t = t
            self.y = y
            if self.weighted:
                self.dy = dy

        return self
    def set_gpu_arrays_to_zero(self, **kwargs):
        self.t_g.fill(self.real_type(0), stream=self.stream)
        self.y_g.fill(self.ytype(0), stream=self.stream)
        if self.weighted:
            self.bins_g.fill(self.real_type(0), stream=self.stream)
            self.dy_g.fill(self.real_type(0), stream=self.stream)
        else:
            self.bins_g.fill(np.uint32(0), stream=self.stream)
    def fromdata(self, t, y, **kwargs):
        self.setdata(t, y, **kwargs)

        if kwargs.get('allocate', True):
            self.allocate(**kwargs)

        return self
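
An illustrative sketch of how a ``ConditionalEntropyMemory`` object is typically driven on its own; the data and frequency grid below are made up, and in practice the ``ConditionalEntropyAsyncProcess`` class further down handles these steps for you. It assumes a working CUDA device (importing ``cuvarbase.ce`` pulls in ``pycuda.autoinit``):

# Hypothetical usage sketch -- not part of the module.
import numpy as np
import pycuda.driver as cuda
from cuvarbase.ce import ConditionalEntropyMemory

t = np.sort(100 * np.random.rand(300))
y = np.cos(2 * np.pi * t / 0.77) + 0.1 * np.random.randn(len(t))
freqs = np.linspace(0.1, 10., 10000)

stream = cuda.Stream()
mem = ConditionalEntropyMemory(phase_bins=10, mag_bins=5, stream=stream)
mem.fromdata(t, y, freqs=freqs, allocate=True)  # bin the mags, allocate GPU arrays
mem.transfer_freqs_to_gpu()
# ``conditional_entropy`` (defined next) then launches the kernels using
# ``mem`` and the prepared-function tuple built by the async process.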
def conditional_entropy(memory, functions, block_size=256,
                        transfer_to_host=True,
                        transfer_to_device=True,
                        **kwargs):
    block = (block_size, 1, 1)
    grid = (int(np.ceil((memory.n0 * memory.nf) / float(block_size))), 1)

    fast_ce, faster_ce, ce_dpdm, hist_count, hist_weight,\
        ce_logp, ce_std, ce_wt = functions

    if transfer_to_device:
        memory.transfer_data_to_gpu()

    if memory.weighted:
        args = (grid, block, memory.stream)
        args += (memory.t_g.ptr, memory.y_g.ptr, memory.dy_g.ptr)
        args += (memory.bins_g.ptr, memory.freqs_g.ptr)
        args += (np.uint32(memory.nf), np.uint32(memory.n0))
        args += (memory.real_type(memory.max_phi),)
        hist_weight.prepared_async_call(*args)

        grid = (int(np.ceil(memory.nf / float(block_size))), 1)
        args = (grid, block, memory.stream)
        args += (memory.bins_g.ptr, np.uint32(memory.nf), memory.ce_g.ptr)
        ce_wt.prepared_async_call(*args)

        if transfer_to_host:
            memory.transfer_ce_to_cpu()

        return memory.ce_c

    args = (grid, block, memory.stream)
    args += (memory.t_g.ptr, memory.y_g.ptr)
    args += (memory.bins_g.ptr, memory.freqs_g.ptr)
    args += (np.uint32(memory.nf), np.uint32(memory.n0))
    hist_count.prepared_async_call(*args)

    grid = (int(np.ceil(memory.nf / float(block_size))), 1)
    args = (grid, block, memory.stream)
    args += (memory.bins_g.ptr, np.uint32(memory.nf), memory.ce_g.ptr)

    if memory.balanced_magbins:
        args += (memory.mag_bwf_g.ptr,)
        ce_dpdm.prepared_async_call(*args)
    elif memory.compute_log_prob:
        args += (memory.mag_bin_fracs_g.ptr,)
        ce_logp.prepared_async_call(*args)
    else:
        ce_std.prepared_async_call(*args)

    if transfer_to_host:
        memory.transfer_ce_to_cpu()

    return memory.ce_c
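
For reference, the unweighted kernels launched above accumulate the Graham et al. (2013) conditional entropy, H(m|phi) = sum_ij p_ij * ln(p(phi_i) / p_ij), over the phase/mag histogram. A minimal NumPy cross-check is sketched below; ``_ce_reference`` is a hypothetical helper, not part of the module, and it ignores bin overlaps, the balanced-magbin and log-prob variants, and the weighted path:

def _ce_reference(t, y, freqs, phase_bins=10, mag_bins=5):
    # Hypothetical CPU sanity check for the unweighted kernels above.
    y = (y - y.min()) / (y.max() - y.min())
    ce = np.zeros(len(freqs))
    for k, f in enumerate(freqs):
        phi = (t * f) % 1.0
        counts, _, _ = np.histogram2d(phi, y, bins=[phase_bins, mag_bins],
                                      range=[[0, 1], [0, 1]])
        p = counts / counts.sum()
        p_phi = np.broadcast_to(p.sum(axis=1, keepdims=True), p.shape)
        nz = p > 0
        ce[k] = np.sum(p[nz] * np.log(p_phi[nz] / p[nz]))
    return ce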
def conditional_entropy_fast(memory, functions, block_size=256,
                             transfer_to_host=True,
                             transfer_to_device=True,
                             freq_batch_size=None,
                             shmem_lc=True,
                             shmem_lim=None,
                             max_nblocks=200,
                             force_nblocks=None,
                             stream=None,
                             **kwargs):
    fast_ce, faster_ce, ce_dpdm, hist_count, hist_weight,\
        ce_logp, ce_std, ce_wt = functions

    if shmem_lim is None:
        dev = pycuda.autoinit.device
        att = cuda.device_attribute.MAX_SHARED_MEMORY_PER_BLOCK
        shmem_lim = dev.get_attribute(att)

    if transfer_to_device:
        memory.transfer_data_to_gpu()

    if freq_batch_size is None:
        freq_batch_size = int(memory.nf)

    block = (block_size, 1, 1)

    # Get the shared memory requirement
    r = memory.real_type(1).nbytes
    u = np.uint32(1).nbytes

    shmem = (r + u) * memory.phase_bins * memory.mag_bins
    shmem += u * memory.phase_bins

    data_mem = (r + u) * len(memory.t)

    func = fast_ce

    # Decide whether or not to use shared memory for
    # loading the lightcurve. Only if the user
    # wants and we have enough memory
    data_in_shared_mem = False
    if shmem_lc:
        data_in_shared_mem = shmem + data_mem < shmem_lim

    if data_in_shared_mem:
        shmem += data_mem
        func = faster_ce

    # Make sure we have extra memory for alignment
    shmem += shmem % r

    i_freq = 0
    while (i_freq < memory.nf):
        j_freq = min([i_freq + freq_batch_size, memory.nf])

        grid = (min([int(np.ceil((j_freq - i_freq) / block_size)),
                     max_nblocks]), 1)

        if data_in_shared_mem:
            grid = (int(np.floor(2 * float(shmem_lim) / shmem)), 1)

        if force_nblocks is not None:
            grid = (force_nblocks, 1)

        assert(grid[0] > 0)

        args = (grid, block, stream)
        args += (memory.t_g.ptr, memory.y_g.ptr)
        args += (memory.freqs_g.ptr, memory.ce_g.ptr)
        args += (np.uint32(j_freq - i_freq), np.uint32(i_freq),
                 np.uint32(memory.n0))
        args += (np.uint32(memory.phase_bins), np.uint32(memory.mag_bins))
        args += (np.uint32(memory.phase_overlap),
                 np.uint32(memory.mag_overlap))
        func.prepared_async_call(*args, shared_size=shmem)

        i_freq += j_freq - i_freq

    if transfer_to_host:
        memory.transfer_ce_to_cpu()

    return memory.ce_c
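
To make the shared-memory bookkeeping above concrete: with float32 data and the defaults ``phase_bins=10``, ``mag_bins=5``, the per-block histogram costs (4 + 4) * 10 * 5 + 4 * 10 = 440 bytes, and caching a 1000-point lightcurve adds (4 + 4) * 1000 = 8000 bytes, well under a typical 48 kB per-block shared-memory limit, so ``faster_ce`` would be selected when ``shmem_lc=True``. A hedged sketch of the same sizing logic; ``_uses_shared_lightcurve`` is hypothetical and the 48 kB figure is only a common default (query the device, as done above, in practice):

def _uses_shared_lightcurve(n0, phase_bins=10, mag_bins=5,
                            real_nbytes=4, shmem_lim=48 * 1024):
    # Mirrors the sizing arithmetic in ``conditional_entropy_fast``.
    u = np.uint32(1).nbytes
    shmem = (real_nbytes + u) * phase_bins * mag_bins + u * phase_bins
    data_mem = (real_nbytes + u) * n0
    return shmem + data_mem < shmem_lim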
class ConditionalEntropyAsyncProcess(GPUAsyncProcess):
    """
    GPUAsyncProcess for the Conditional Entropy period finder

    Parameters
    ----------
    phase_bins: int, optional (default: 10)
        Number of phase bins to use.
    mag_bins: int, optional (default: 5)
        Number of mag bins to use.
    max_phi: float, optional (default: 3.)
        For weighted CE; skips contributions to bins that are more than
        ``max_phi`` sigma away.
    weighted: bool, optional (default: False)
        If true, uses the weighted version of the CE periodogram. Slower,
        but accounts for data uncertainties.
    block_size: int, optional (default: 256)
        Number of CUDA threads per CUDA block.
    phase_overlap: int, optional (default: 0)
        If > 0, the phase bins are overlapped with each other
    mag_overlap: int, optional (default: 0)
        If > 0, the mag bins are overlapped with each other
    use_fast: bool, optional (default: False)
        Use a somewhat experimental function to speed up computations.
        This is best suited for large ``Nfreqs`` and ``nobs <~ 2000``.
        If True, use :func:`run` and not :func:`large_run`, and set
        ``nstreams = 1``.

    Example
    -------

    >>> proc = ConditionalEntropyAsyncProcess()
    >>> Ndata = 1000
    >>> t = np.sort(365 * np.random.rand(Ndata))
    >>> y = 12 + 0.01 * np.cos(2 * np.pi * t / 5.0)
    >>> y += 0.01 * np.random.randn(len(t))
    >>> dy = 0.01 * np.ones_like(y)
    >>> results = proc.run([(t, y, dy)])
    >>> proc.finish()
    >>> ce_freqs, ce_powers = results[0]

    """
    def __init__(self, *args, **kwargs):
        super(ConditionalEntropyAsyncProcess, self).__init__(*args, **kwargs)
        self.phase_bins = kwargs.get('phase_bins', 10)
        self.mag_bins = kwargs.get('mag_bins', 5)
        self.max_phi = kwargs.get('max_phi', 3.)
        self.weighted = kwargs.get('weighted', False)
        self.block_size = kwargs.get('block_size', 256)

        self.phase_overlap = kwargs.get('phase_overlap', 0)
        self.mag_overlap = kwargs.get('mag_overlap', 0)

        if self.mag_overlap > 0:
            if kwargs.get('balanced_magbins', False):
                raise Exception("mag_overlap must be zero "
                                "if balanced_magbins is True")

        self.use_double = kwargs.get('use_double', False)

        self.real_type = np.float32
        if self.use_double:
            self.real_type = np.float64

        self.call_func = conditional_entropy
        if kwargs.get('use_fast', False):
            self.call_func = conditional_entropy_fast

        self.memory = kwargs.get('memory', None)

        self.shmem_lc = kwargs.get('shmem_lc', True)

    def _compile_and_prepare_functions(self, **kwargs):

        cpp_defs = dict(NPHASE=self.phase_bins,
                        NMAG=self.mag_bins,
                        PHASE_OVERLAP=self.phase_overlap,
                        MAG_OVERLAP=self.mag_overlap)

        if self.use_double:
            cpp_defs['DOUBLE_PRECISION'] = None

        # Read kernel & apply the compile-time definitions
        kernel_txt = _module_reader(find_kernel('ce'),
                                    cpp_defs=cpp_defs)

        # compile kernel
        self.module = SourceModule(kernel_txt, options=['--use_fast_math'])

        self.dtypes = dict(
            constdpdm_ce=[np.intp, np.int32, np.intp, np.intp],
            histogram_data_weighted=[np.intp, np.intp, np.intp, np.intp,
                                     np.intp, np.uint32, np.uint32,
                                     self.real_type],
            histogram_data_count=[np.intp, np.intp, np.intp, np.intp,
                                  np.uint32, np.uint32],
            log_prob=[np.intp, np.uint32, np.intp, np.intp],
            standard_ce=[np.intp, np.uint32, np.intp],
            weighted_ce=[np.intp, np.uint32, np.intp],
            ce_classical_fast=[np.intp, np.intp, np.intp, np.intp,
                               np.uint32, np.uint32, np.uint32,
                               np.uint32, np.uint32, np.uint32,
                               np.uint32],
            ce_classical_faster=[np.intp, np.intp, np.intp, np.intp,
                                 np.uint32, np.uint32, np.uint32,
                                 np.uint32, np.uint32, np.uint32,
                                 np.uint32]
        )
        for fname, dtype in self.dtypes.items():
            func = self.module.get_function(fname)
            self.prepared_functions[fname] = func.prepare(dtype)

        # ``conditional_entropy`` and ``conditional_entropy_fast`` unpack
        # this tuple positionally, so the (alphabetical) ordering here must
        # match the unpacking order in those functions.
        self.function_tuple = tuple(self.prepared_functions[fname]
                                    for fname in sorted(self.dtypes.keys()))
    def memory_requirement(self, data, **kwargs):
        """
        Return an approximate GPU memory requirement in bytes.
        Not yet implemented: raises ``NotImplementedError`` if called.
        """
        raise NotImplementedError()
    def allocate_for_single_lc(self, t, y, freqs, dy=None,
                               stream=None, **kwargs):
        """
        Allocate GPU (and possibly CPU) memory for a single lightcurve

        Parameters
        ----------
        t: array_like
            Observation times
        y: array_like
            Observations
        freqs: array_like
            Frequencies
        dy: array_like, optional
            Observation uncertainties
        stream: pycuda.driver.Stream, optional
            CUDA stream you want this to run on
        **kwargs

        Returns
        -------
        mem: ConditionalEntropyMemory
            Memory object.
        """
        kw = dict(phase_bins=self.phase_bins,
                  mag_bins=self.mag_bins,
                  mag_overlap=self.mag_overlap,
                  phase_overlap=self.phase_overlap,
                  max_phi=self.max_phi,
                  stream=stream,
                  weighted=self.weighted,
                  use_double=self.use_double)

        kw.update(kwargs)
        mem = ConditionalEntropyMemory(**kw)

        mem.fromdata(t, y, dy=dy, freqs=freqs, allocate=True, **kwargs)

        return mem
    def autofrequency(self, *args, **kwargs):
        """ calls :func:`cuvarbase.utils.autofrequency` """
        return utils_autofreq(*args, **kwargs)
    def _nfreqs(self, *args, **kwargs):
        return len(self.autofrequency(*args, **kwargs))
    def allocate(self, data, freqs=None, **kwargs):
        """
        Allocate GPU memory for Conditional Entropy computations

        Parameters
        ----------
        data: list of (t, y, dy) tuples
            List of data, ``[(t_1, y_1, w_1), ...]``
            * ``t``: Observation times
            * ``y``: Observations
            * ``dy``: Observation uncertainties
        freqs: list, optional
            Either a list of floats (same frequencies for all data),
            or a list of length ``n=len(data)``, with element ``i`` of
            the list being a list of frequencies for the ``i``-th
            lightcurve.
        **kwargs

        Returns
        -------
        allocated_memory: list of ``ConditionalEntropyMemory``
            list of allocated memory objects for each lightcurve
        """

        if len(data) > len(self.streams):
            self._create_streams(len(data) - len(self.streams))

        allocated_memory = []

        frqs = freqs
        if frqs is None:
            frqs = [self.autofrequency(t, **kwargs) for (t, y, dy) in data]
        elif isinstance(freqs[0], float):
            frqs = [freqs] * len(data)

        for i, ((t, y, dy), f) in enumerate(zip(data, frqs)):
            mem = self.allocate_for_single_lc(t, y, dy=dy, freqs=f,
                                              stream=self.streams[i],
                                              **kwargs)
            allocated_memory.append(mem)

        return allocated_memory
    def preallocate(self, max_nobs, freqs, nlcs=1, streams=None, **kwargs):
        """
        Preallocate memory for future runs.

        Parameters
        ----------
        max_nobs: int
            Upper limit for the number of observations
        freqs: array_like
            Frequency array to be used by future ``run`` calls
        nlcs: int, optional (default: 1)
            Maximum batch size for ``run`` calls
        streams: list of ``pycuda.driver.Stream``
            Length of list must be ``>= nlcs``

        Returns
        -------
        self.memory: list
            List of ``ConditionalEntropyMemory`` objects
        """
        kw = dict(phase_bins=self.phase_bins,
                  mag_bins=self.mag_bins,
                  mag_overlap=self.mag_overlap,
                  phase_overlap=self.phase_overlap,
                  max_phi=self.max_phi,
                  weighted=self.weighted,
                  use_double=self.use_double,
                  n0_buffer=max_nobs,
                  buffered_transfer=True,
                  allocate=True,
                  freqs=freqs)
        kw.update(kwargs)

        self.memory = []
        for i in range(nlcs):
            stream = None if streams is None else streams[i]
            kw.update(dict(stream=stream))
            mem = ConditionalEntropyMemory(**kw)
            mem.allocate(**kwargs)
            self.memory.append(mem)

        return self.memory
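
    # Hypothetical sketch (not part of the module) of pairing ``preallocate``
    # with ``run`` (below) so GPU memory is not reallocated per lightcurve.
    # ``lightcurves`` is a made-up iterable of (t, y, dy) tuples, and the
    # explicit frequency transfer is purely defensive:
    #
    #     proc = ConditionalEntropyAsyncProcess(phase_bins=10, mag_bins=5)
    #     freqs = proc.autofrequency(t)               # ``t`` from one lightcurve
    #     proc.preallocate(max_nobs=max_nobs, freqs=freqs, nlcs=1)
    #     for mem in proc.memory:
    #         mem.transfer_freqs_to_gpu()             # defensive
    #     for t_i, y_i, dy_i in lightcurves:
    #         (f, ce), = proc.run([(t_i, y_i, dy_i)], freqs=freqs)
    #         proc.finish()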
    def run(self, data, memory=None, freqs=None,
            set_data=True, **kwargs):
        """
        Run Conditional Entropy on a batch of data.

        Parameters
        ----------
        data: list of tuples
            list of [(t, y, dy), ...] containing
            * ``t``: observation times
            * ``y``: observations
            * ``dy``: observation uncertainties
        freqs: optional, list of ``np.ndarray`` frequencies
            List of custom frequencies. If not specified, calls
            ``autofrequency`` with default arguments
        memory: optional, list of ``ConditionalEntropyMemory`` objects
            List of memory objects, length of list must be ``>= len(data)``
        set_data: boolean, optional (default: True)
            Transfers data to gpu if memory is provided
        **kwargs

        Returns
        -------
        results: list of lists
            list of (freqs, ce) corresponding to CE for each element of
            the ``data`` array
        """
        # compile module if not compiled already; the key checked here must
        # be one of the prepared CUDA kernel names
        if not hasattr(self, 'prepared_functions') or \
            not all([func in self.prepared_functions
                     for func in ['weighted_ce']]):
            self._compile_and_prepare_functions(**kwargs)

        # create and/or check frequencies
        frqs = freqs
        if frqs is None:
            frqs = [self.autofrequency(d[0], **kwargs) for d in data]
        elif isinstance(frqs[0], float):
            frqs = [frqs] * len(data)

        assert(len(frqs) == len(data))

        memory = memory if memory is not None else self.memory
        if memory is None:
            memory = self.allocate(data, freqs=frqs, **kwargs)
            for mem in memory:
                mem.transfer_freqs_to_gpu()

        elif set_data:
            for i, (t, y, dy) in enumerate(data):
                memory[i].set_gpu_arrays_to_zero(**kwargs)
                memory[i].setdata(t, y, dy=dy, **kwargs)

        kw = dict(block_size=self.block_size,
                  shmem_lc=self.shmem_lc)
        kw.update(kwargs)
        results = [self.call_func(memory[i], self.function_tuple, **kw)
                   for i in range(len(data))]

        results = [(f, r) for f, r in zip(frqs, results)]
        return results
    def large_run(self, data, freqs=None, max_memory=None, **kwargs):
        """
        Run Conditional Entropy on a large frequency grid

        Parameters
        ----------
        data: list of tuples
            list of [(t, y, dy), ...] containing
            * ``t``: observation times
            * ``y``: observations
            * ``dy``: observation uncertainties
        freqs: optional, list of ``np.ndarray`` frequencies
            List of custom frequencies. If not specified, calls
            ``autofrequency`` with default arguments
        max_memory: float, optional (default: None)
            Maximum memory per batch in bytes. If ``None``, it will use
            90% of the total free memory available as specified by
            ``pycuda.driver.mem_get_info()``
        **kwargs

        Returns
        -------
        results: list of lists
            list of (freqs, ce) corresponding to CE for each element of
            the ``data`` array
        """
        # compile module if not compiled already; the key checked here must
        # be one of the prepared CUDA kernel names
        if not hasattr(self, 'prepared_functions') or \
            not all([func in self.prepared_functions
                     for func in ['weighted_ce']]):
            self._compile_and_prepare_functions(**kwargs)

        if max_memory is None:
            free, total = cuda.mem_get_info()
            max_memory = 0.9 * free

        # create and/or check frequencies
        frqs = freqs
        if frqs is None:
            frqs = [self.autofrequency(d[0], **kwargs) for d in data]
        elif isinstance(frqs[0], float):
            frqs = [frqs] * len(data)

        assert(len(frqs) == len(data))

        cpers = []
        for d, f in zip(data, frqs):
            size_of_real = self.real_type(1).nbytes

            # subtract off the lightcurve memory
            fmem = max_memory - len(d[0]) * size_of_real * 3

            tot_bins = self.phase_bins * self.mag_bins

            batch_size = int(np.floor(fmem / (size_of_real * (tot_bins + 2))))
            nbatches = int(np.ceil(len(f) / float(batch_size)))

            cper = np.zeros(len(f))
            for i in range(nbatches):
                imin = i * batch_size
                imax = min([len(f), (i + 1) * batch_size])

                r = self.run([d], freqs=f[slice(imin, imax)], **kwargs)
                self.finish()
                cper[imin:imax] = r[0][1][:]

            cpers.append(cper)

        results = [(f, cper) for f, cper in zip(frqs, cpers)]
        return results
    def batched_run_const_nfreq(self, data, batch_size=10,
                                freqs=None, **kwargs):
        """
        Same as ``batched_run`` but is more efficient when the frequencies
        are the same for each lightcurve. Doesn't reallocate memory for
        each batch.

        .. note::

            To get the best efficiency, make sure the maximum number of
            observations is not much larger than the typical number of
            observations.
        """
        # create streams if needed
        bsize = min([len(data), batch_size])
        if len(self.streams) < bsize:
            self._create_streams(bsize - len(self.streams))

        streams = [self.streams[i] for i in range(bsize)]
        max_ndata = max([len(t) for t, y, dy in data])

        if freqs is None:
            data_with_max_baseline = max(data,
                                         key=lambda d: max(d[0]) - min(d[0]))
            freqs = self.autofrequency(data_with_max_baseline[0], **kwargs)

        df = freqs[1] - freqs[0]
        nf = len(freqs)

        ces = []

        # make data batches
        batches = []
        while len(batches) * batch_size < len(data):
            start = len(batches) * batch_size
            finish = start + min([batch_size, len(data) - start])
            batches.append([data[i] for i in range(start, finish)])

        # set up memory containers for gpu and cpu (pinned) memory
        kwargs_mem = dict(buffered_transfer=True,
                          n0_buffer=max_ndata,
                          mag_overlap=self.mag_overlap,
                          phase_overlap=self.phase_overlap,
                          phase_bins=self.phase_bins,
                          mag_bins=self.mag_bins,
                          weighted=self.weighted,
                          max_phi=self.max_phi,
                          use_double=self.use_double)
        kwargs_mem.update(kwargs)
        memory = [ConditionalEntropyMemory(stream=stream, **kwargs_mem)
                  for stream in streams]

        # allocate memory
        [mem.allocate(freqs=freqs, **kwargs) for mem in memory]
        [mem.transfer_freqs_to_gpu(**kwargs) for mem in memory]

        for b, batch in enumerate(batches):
            results = self.run(batch, memory=memory, freqs=freqs, **kwargs)
            self.finish()

            for i, (f, ce) in enumerate(results):
                ces.append(np.copy(ce))

        return [(freqs, ce) for ce in ces]
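
Finally, a hedged end-to-end sketch of the batched interface above. The lightcurves are synthetic, the 0.77-day period is arbitrary, and the ``__main__`` guard only keeps the example from running on import:

if __name__ == '__main__':
    proc = ConditionalEntropyAsyncProcess(phase_bins=10, mag_bins=5)

    # A small batch of synthetic lightcurves with a common 0.77-day period.
    rand = np.random.RandomState(42)
    data = []
    for _ in range(5):
        t = np.sort(365 * rand.rand(500))
        y = 12 + 0.05 * np.cos(2 * np.pi * t / 0.77)
        y += 0.01 * rand.randn(len(t))
        dy = 0.01 * np.ones_like(y)
        data.append((t, y, dy))

    results = proc.batched_run_const_nfreq(data, batch_size=5)
    proc.finish()

    freqs, ce = results[0]
    # The conditional entropy is *minimized* near the true period.
    print("best period: %.4f d" % (1.0 / freqs[np.argmin(ce)]))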