# Source code for cuvarbase.pdm

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from builtins import zip
from builtins import range

import numpy as np
import resource
import warnings

import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
# import pycuda.autoinit

from .core import GPUAsyncProcess
from .utils import weights, find_kernel


def var_tophat(t, y, w, freq, dphi):
    """Weighted variance of each point about its local (tophat) mean.

    For every observation, a local weighted mean is taken over all points
    whose phase separation (as measured by ``dphase``) at frequency
    ``freq`` is below ``dphi``; the squared residual about that mean,
    weighted by ``w``, is accumulated.
    """
    total = 0.
    for T, Y, W in zip(t, y, w):
        mbar = 0.
        wtot = 0.
        for T2, Y2, W2 in zip(t, y, w):
            # point contributes to the local mean if within the window
            if dphase(abs(T2 - T), freq) < dphi:
                mbar += W2 * Y2
                wtot += W2
        total += W * (Y - mbar / wtot) ** 2
    return total
def binned_pdm_model(t, y, w, freq, nbins, linterp=True):
    """Return a callable phase-folded, binned model of the data.

    Folds ``t`` at frequency ``freq`` into ``nbins`` phase bins, computes
    the weighted mean of ``y`` per bin, and returns a function that
    evaluates the model at arbitrary phases. If ``linterp`` is True the
    model linearly interpolates between (circularly wrapped) bin centers;
    otherwise it is a step function of the bin means.
    """
    if len(t) == 0:
        # no data: the model is identically zero
        return lambda p, **kwargs: np.zeros_like(p)

    phase = (t * freq) % 1.0
    bin_of = [int(ph * nbins) % nbins for ph in phase]

    # accumulate weighted sums per bin in a single pass over the data
    wsums = np.zeros(nbins)
    ysums = np.zeros(nbins)
    for b, Y, W in zip(bin_of, y, w):
        ysums[b] += W * Y
        wsums[b] += W
    # floor the weights to avoid division by zero in empty bins
    bin_means = ysums / np.maximum(wsums, 1E-10)

    def pred_y(p, nbins=nbins, linterp=linterp, bin_means=bin_means):
        whole_bins = np.array([int(P * nbins) % nbins for P in p])
        if not linterp:
            return bin_means[whole_bins]
        # signed offset from the center of the containing bin
        alphas = p * nbins - np.floor(p * nbins) - 0.5
        lower = whole_bins + np.floor(alphas).astype(np.int32)
        upper = lower + 1
        # wrap interpolation weights and bin indices circularly
        alphas = np.where(alphas < 0, alphas + 1, alphas)
        lower = np.where(lower < 0, lower + nbins, lower)
        upper = np.where(upper >= nbins, upper - nbins, upper)
        return (1 - alphas) * bin_means[lower] + alphas * bin_means[upper]

    return pred_y
def var_binned(t, y, w, freq, nbins, linterp=True):
    """Weighted variance of the data about the binned phase-folded model."""
    model = binned_pdm_model(t, y, w, freq, nbins, linterp=linterp)
    resid = y - model((t * freq) % 1.0)
    return np.dot(w, resid * resid)
def binless_pdm_cpu(t, y, w, freqs, dphi=0.05):
    """CPU reference for binless (tophat-window) PDM power at each frequency.

    Power is ``1 - var_local / var_total``; values near 1 indicate that
    phase-folding at that frequency strongly reduces the scatter.
    """
    weighted_mean = np.dot(w, y)
    total_var = np.dot(w, (y - weighted_mean) ** 2)
    return [1 - var_tophat(t, y, w, f, dphi) / total_var for f in freqs]
def pdm2_cpu(t, y, w, freqs, nbins=30, linterp=True):
    """CPU reference for binned PDM power at each frequency in ``freqs``."""
    weighted_mean = np.dot(w, y)
    total_var = np.dot(w, (y - weighted_mean) ** 2)
    powers = []
    for f in freqs:
        v = var_binned(t, y, w, f, nbins=nbins, linterp=linterp)
        powers.append(1 - v / total_var)
    return powers
def pdm2_single_freq(t, y, w, freq, nbins=30, linterp=True):
    """CPU reference for binned PDM power at a single frequency."""
    weighted_mean = np.dot(w, y)
    total_var = np.dot(w, (y - weighted_mean) ** 2)
    v = var_binned(t, y, w, freq, nbins=nbins, linterp=linterp)
    return 1 - v / total_var
def pdm_async(stream, data_cpu, data_gpu, pow_cpu, function,
              dphi=0.05, block_size=256):
    """Queue one PDM computation on a CUDA stream.

    Transfers the lightcurve to the GPU, launches the prepared kernel
    (one thread per trial frequency), and queues an async copy of the
    powers back into the pinned host array ``pow_cpu``, which is
    returned. If the GPU buffers are absent (empty lightcurve),
    ``pow_cpu`` is returned untouched.
    """
    t, y, w, freqs = data_cpu
    t_g, y_g, w_g, freqs_g, pow_g = data_gpu

    # nothing was allocated for an empty lightcurve
    if t_g is None:
        return pow_cpu

    # kernel scalar arguments
    nfreqs = np.int32(len(freqs))
    ndata = np.int32(len(t))
    dphi = np.float32(dphi)

    # launch configuration: one thread per frequency
    nblocks = int(np.ceil(float(nfreqs) / block_size))
    grid = (nblocks, 1)
    block = (block_size, 1, 1)

    # weighted variance of the data about its weighted mean
    ybar = np.dot(w, y)
    var = np.float32(np.dot(w, np.power(y - ybar, 2)))

    # asynchronous host -> device transfers (single precision)
    for host_arr, dev_arr in ((w, w_g), (t, t_g), (y, y_g)):
        dev_arr.set_async(np.asarray(host_arr).astype(np.float32),
                          stream=stream)

    function.prepared_async_call(grid, block, stream,
                                 t_g.ptr, y_g.ptr, w_g.ptr,
                                 freqs_g.ptr, pow_g.ptr,
                                 ndata, nfreqs, dphi, var)

    # queue device -> host copy of the powers
    pow_g.get_async(stream=stream, ary=pow_cpu)
    return pow_cpu
class PDMAsyncProcess(GPUAsyncProcess):
    """GPUAsyncProcess for phase dispersion minimization (PDM).

    Compiles the PDM CUDA kernels (with the number of phase bins baked in
    at compile time) and runs them asynchronously, one stream per
    lightcurve.
    """

    def __init__(self, *args, **kwargs):
        super(PDMAsyncProcess, self).__init__(*args, **kwargs)
        warnings.warn("PDM is experimental at this point. "
                      "Use with great caution.")

    def _compile_and_prepare_functions(self, nbins=10):
        # Read the kernel template and bake in the number of phase bins.
        # (Context manager fixes the file-handle leak in the original.)
        with open(find_kernel('pdm'), 'r') as f:
            pdm2_txt = f.read()
        pdm2_txt = pdm2_txt.replace('//INSERT_NBINS_HERE',
                                    '#define NBINS %d' % (nbins))

        self.module = SourceModule(pdm2_txt, options=['--use_fast_math'])

        # argument layout: t, y, w, freqs, pow pointers; ndata, nfreqs;
        # dphi; var
        self.dtypes = [np.intp, np.intp, np.intp, np.intp, np.intp,
                       np.int32, np.int32, np.float32, np.float32]

        # Binned kernels are cached under an nbins-suffixed key (they are
        # nbins-specific), but the compiled CUDA function name has no
        # suffix; binless kernels are cached under their plain name.
        for function in ['pdm_binless_tophat', 'pdm_binless_gauss',
                         'pdm_binned_linterp_%dbins' % (nbins),
                         'pdm_binned_step_%dbins' % (nbins)]:
            kernel_name = function.replace('_%dbins' % (nbins), '')
            func = self.module.get_function(kernel_name)
            self.prepared_functions[function] = func.prepare(self.dtypes)

    def allocate(self, data):
        """Allocate pinned host memory and GPU buffers for each lightcurve.

        Parameters
        ----------
        data: list of (t, y, w, freqs) tuples

        Returns
        -------
        gpu_data: list of (t_g, y_g, w_g, freqs_g, pow_g) tuples
        pow_cpus: list of pinned host arrays for the returned powers
        """
        if len(data) > len(self.streams):
            self._create_streams(len(data) - len(self.streams))

        gpu_data, pow_cpus = [], []
        for t, y, w, freqs in data:
            # page-aligned, pinned host buffer so get_async can copy
            # into it directly
            pow_cpu = cuda.aligned_zeros(shape=(len(freqs),),
                                         dtype=np.float32,
                                         alignment=resource.getpagesize())
            pow_cpu = cuda.register_host_memory(pow_cpu)

            # empty lightcurves get no data buffers (pdm_async skips them)
            t_g, y_g, w_g = None, None, None
            if len(t) > 0:
                t_g, y_g, w_g = tuple([gpuarray.zeros(len(t),
                                                      dtype=np.float32)
                                       for i in range(3)])

            pow_g = gpuarray.zeros(len(pow_cpu), dtype=pow_cpu.dtype)
            freqs_g = gpuarray.to_gpu(np.asarray(freqs).astype(np.float32))

            gpu_data.append((t_g, y_g, w_g, freqs_g, pow_g))
            pow_cpus.append(pow_cpu)
        return gpu_data, pow_cpus

    def run(self, data, gpu_data=None, pow_cpus=None,
            kind='binned_linterp', nbins=10, **pdm_kwargs):
        """Run PDM on a list of lightcurves.

        Parameters
        ----------
        data: list of (t, y, w, freqs) tuples
        gpu_data, pow_cpus: optional pre-allocated buffers from
            ``allocate``; allocated here if not given.
        kind: which kernel to use ('binned_linterp', 'binned_step',
            'binless_tophat', 'binless_gauss')
        nbins: number of phase bins (binned kernels only)

        Returns
        -------
        list of pinned host arrays with the PDM powers (filled once the
        corresponding stream has synchronized).
        """
        # Only binned kernels are cached per-nbins; using the suffixed
        # key for binless kinds raised a KeyError in the original.
        function = 'pdm_%s' % (kind)
        if 'binned' in kind:
            function += '_%dbins' % (nbins)
        if function not in self.prepared_functions:
            self._compile_and_prepare_functions(nbins=nbins)

        if pow_cpus is None or gpu_data is None:
            gpu_data, pow_cpus = self.allocate(data)
        streams = [s for i, s in enumerate(self.streams) if i < len(data)]
        func = self.prepared_functions[function]

        results = [pdm_async(stream, cdat, gdat, pcpu, func, **pdm_kwargs)
                   for stream, cdat, gdat, pcpu
                   in zip(streams, data, gpu_data, pow_cpus)]
        return results