Optimization with Numba¶
This notebook discusses how to achieve high performance for vectorized operations on large chunks of data with numba. $\newcommand{\d}{\mathrm{d}}\newcommand{\vect}[1]{\vec{#1}}\newcommand{\abs}[1]{\lvert#1\rvert}\newcommand{\I}{\mathrm{i}}\newcommand{\braket}[1]{\langle#1\rangle}\newcommand{\ket}[1]{\left|#1\right\rangle}\newcommand{\bra}[1]{\left\langle#1\right|}\newcommand{\pdiff}[2]{\frac{\partial #1}{\partial #2}}$
%pylab inline
import itertools
import sympy
sympy.init_printing(use_latex=True)
%connect_info
Example¶
The example we shall start with is motivated by a nuclear physics problem. We need to compute the following term in a Hamiltonian representing the application of a two-body potential:
$$ \Psi(\vect{x}_i) = \sum_{p}V_{p}(\vect{r}_{p})\Psi(\vect{x}_i) $$where $\vect{x}_i$ are a set of internal (Jacobi) coordinates, and $\vect{r}_{p}$ are linear combinations of these representing the various relative coordinates in the problem. Here we define the potential:
class Potential(object):
def V(self, p, R):
"""Dummy potential for the p'th relative coordinate"""
r2 = np.linalg.norm(R)**2
return np.exp(-r2+(p[0]+1)**p[1])
class Basis(object):
def __init__(self, N=3, L=10.0):
x = y = z = np.arange(N, dtype=double)*L/N - L/2.0
self.X = [x[:, None, None, None, None, None],
y[None, :, None, None, None, None],
z[None, None, :, None, None, None],
x[None, None, None, :, None, None],
y[None, None, None, None, :, None],
z[None, None, None, None, None, :]]
self.shape = (N,)*6
# This is the change of basis matrix. We store
# only the first two relative coordinates, not
# the final center-of-mass variable
self.A = np.array([[1, -1, 0 ],
[0.5, 0.5, -1. ],
[1./3, 1./3, 1./3]])
self.Ainv = np.linalg.inv(self.A)
def get_rel_coeffs(self, a, b):
"""Return the coefficients defining the relative
coordinate from `a` to `b`
"""
return self.Ainv[a,:] - self.Ainv[b,:]
basis = Basis()
pot = Potential()
basis.get_rel_coeffs(1,2)
This tells us that the relative coordinate between particle 1 and particle 2 is -0.5*x[0] + x[1]; the coefficient of the center-of-mass coordinate x[2] vanishes, as it should.
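As a quick sanity check (purely illustrative), note that $\vect{r}_a - \vect{r}_b = \sum_i\bigl[(A^{-1})_{ai} - (A^{-1})_{bi}\bigr]\vect{x}_i$, so the coefficients can be verified numerically for a random configuration:
# Illustrative check: for a random set of particle coordinates, the difference
# r_1 - r_2 should equal the returned linear combination of Jacobi coordinates.
r = np.random.random(3)            # one coordinate per particle (1D for brevity)
x = basis.A.dot(r)                 # the corresponding Jacobi coordinates
c = basis.get_rel_coeffs(1, 2)
assert np.allclose(r[1] - r[2], c.dot(x))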
Now we make a mock wavefunction, and then apply the potential. This serves as the benchmark (but will be slow):
def apply_V(HPsi, Psi):
for p in [(0,1), (1,2), (0,2)]:
for i0, i1, i2, j0, j1, j2 in itertools.product(*[range(_x.size) for _x in basis.X]):
_ri = [basis.X[0].ravel()[i0],
basis.X[1].ravel()[i1],
basis.X[2].ravel()[i2]] # First Jacobi coordinate as a vector
_rj = [basis.X[3].ravel()[j0],
basis.X[4].ravel()[j1],
basis.X[5].ravel()[j2]] # Second Jacobi coordinate as a vector
r = np.dot(basis.get_rel_coeffs(*p)[:-1], [_ri, _rj])
HPsi[i0, i1, i2, j0, j1, j2] += pot.V(p, r)*Psi[i0, i1, i2, j0, j1, j2]
np.random.seed(1)
Psi = np.random.random(basis.shape) + 0j
HPsi = 0*Psi
HPsi_ = 0*Psi # Used for timing, which requires a global
apply_V(HPsi, Psi)
def check(apply_V):
"""Check that an implementation of apply_V works, and time it."""
HPsi0 = 0*Psi
apply_V(HPsi0, Psi)
assert np.allclose(HPsi0, HPsi)
check(apply_V)
%timeit apply_V(HPsi_, Psi)
To vectorize this loop, one would generally evaluate the potential at all of the required points, then perform the multiplication and update. The problem is that such arrays quickly become too large to store: for example, with 32 points per axis a single complex array of shape (32,)*6 already requires 32**6 × 16 bytes ≈ 17 GB. Thus, some sort of explicit loop is really required. Our first step is to consider numba:
Numba¶
Start by perusing the Numba Language Specification. This will show what things one can do.
We start with a simplified loop (assuming all abscissae are the same, etc.) in order to ensure an easy transition to a compiled version.
def apply_V_numba(HPsi, Psi):
"""These outer parts do not work well in numba
so we do them first, then call the core."""
x = basis.X[0].ravel()
for p in [(0,1), (1,2), (0,2)]:
c = basis.get_rel_coeffs(p[0], p[1])[:-1]
#apply_V_core(HPsi=HPsi, Psi=Psi, p=p, c=c, x=x) # kwargs not yet supported
# Before calling this, we must make sure everything has the correct type!
p, c, x = [np.asarray(_x, dtype=float) for _x in [p, c, x]]
apply_V_core(HPsi, Psi, p, c, x)
def apply_V_core(HPsi, Psi, p, c, x):
for i0 in range(x.size):
for i1 in range(x.size):
for i2 in range(x.size):
for j0 in range(x.size):
for j1 in range(x.size):
for j2 in range(x.size):
_ri = x[[i0, i1, i2]] # First Jacobi coordinate as a vector
_rj = x[[j0, j1, j2]] # Second Jacobi coordinate as a vector
r = np.dot(c, [_ri, _rj])
HPsi[i0, i1, i2, j0, j1, j2] += pot.V(p, r)*Psi[i0, i1, i2, j0, j1, j2]
check(apply_V_numba)
%timeit apply_V_numba(HPsi_, Psi)
Now we try applying the numba.autojit decorator:
import numba
for n in [#numba.pipeline,
#numba.control_flow.debug,
#numba.closures,
#numba.codegen.debug
]:
n.logger.setLevel(numba.pipeline.logging.ERROR)
x = basis.X[0].ravel()
@numba.autojit
def apply_V_core(HPsi, Psi, p, c, x):
for i0 in range(x.size):
for i1 in range(x.size):
for i2 in range(x.size):
for j0 in range(x.size):
for j1 in range(x.size):
for j2 in range(x.size):
_ri = x[[i0, i1, i2]] # First Jacobi coordinate as a vector
_rj = x[[j0, j1, j2]] # Second Jacobi coordinate as a vector
r = np.dot(c, [_ri, _rj])
HPsi[i0, i1, i2, j0, j1, j2] += pot.V(p, r)*Psi[i0, i1, i2, j0, j1, j2]
check(apply_V_numba)
%timeit apply_V_numba(HPsi_, Psi)
One way to see what is taking time is to use the annotate feature of numba. Unfortunately, this does not yet work very well in a notebook, so we shall first write a version of this code in an external file:
%%file apply_V_core_numba.py
import numpy as np
import numba
def V(p, R):
"""Dummy potential for the p'th relative coordinate"""
r2 = np.linalg.norm(R)**2
return np.exp(-r2+(p[0]+1)**p[1])
@numba.autojit
#@numba.jit('void(c16[:,:,:,:,:,::1], c16[:,:,:,:,:,::1], f8[:], f8[:], f8[:])',
# annotate=True)
def apply_V_core(HPsi, Psi, p, c, x):
for i0 in range(x.size):
for i1 in range(x.size):
for i2 in range(x.size):
for j0 in range(x.size):
for j1 in range(x.size):
for j2 in range(x.size):
_ri = x[[i0, i1, i2]] # First Jacobi coordinate as a vector
_rj = x[[j0, j1, j2]] # Second Jacobi coordinate as a vector
r = np.dot(c, [_ri, _rj])
v = V(p, r)
HPsi[i0, i1, i2, j0, j1, j2] += v*Psi[i0, i1, i2, j0, j1, j2]
if __name__ == "":
# This is executed when called with numba to generate annotations
N = 3
x = np.linspace(-10,10,N)
Psi = np.random.random((N,)*6) + 1j
HPsi = 0*Psi
p = np.zeros(2, dtype=float)
c = np.zeros(2, dtype=float)
apply_V_core(HPsi, Psi, p, c, x)
from importlib import reload
import apply_V_core_numba
reload(apply_V_core_numba)
from apply_V_core_numba import apply_V_core
check(apply_V_numba)
#%timeit apply_V_numba(HPsi_, Psi)
!. /data/apps/anaconda/etc/profile.d/conda.sh; conda activate _tov_test; numba --annotate apply_V_core_numba.py
!ls
!open apply_V_core_numba.html # On Macs this will open the annotated source in another window
#from IPython.display import HTML
#with open('apply_V_core_numba.html') as _f:
# code = '<iframe> %s </iframe>' % (_f.read() )
#HTML(code)
The output looks something like this:
Note that almost every line has an "*". This means that the "object layer" is being used, which falls back on the Python interpreter and so is slow. Now we can try to learn how to improve this so that the compiler can generate efficient code. Here is a list of the problems (a compact sketch of the workarounds follows the list):
- x.size is not recognized, but one can use len(x).
- Functions like np.dot and np.linalg.norm are not supported outside of the object layer.
- Slicing for assignment like ri[:] = ... seems not to be supported outside of the object layer.
- Ensure that objects have the correct type. (Apparently numba does not properly cast arguments, even if the type is explicitly specified.) I was having problems with the argument p, which was being passed as a list of integers, for example.
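In compact form, the workarounds look like this – just the idioms, not a complete kernel; the full corrected version follows in the next cell:
import numpy as np

x = np.linspace(-10, 10, 3)
i0, i1, i2 = 0, 1, 2
ri = np.zeros(3, dtype=float)

n = len(x)                                  # instead of x.size
ri[0], ri[1], ri[2] = x[i0], x[i1], x[i2]   # instead of ri[:] = x[[i0, i1, i2]]
r2 = ri[0]**2 + ri[1]**2 + ri[2]**2         # instead of np.linalg.norm(ri)**2
p = np.asarray([0, 1], dtype=float)         # cast arguments explicitly before calling a jitted core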
%%file apply_V_core_numba.py
import numpy as np
import numba
@numba.autojit(inline=True)
#@numba.jit('f8(f8[:])')
def norm2(x):
res = abs(x[0])**2
for _i in xrange(1,len(x)):
res += x[_i]**2
return res
@numba.autojit(inline=True)
#@numba.jit('f8(f8[:], f8[:])')
def V(p, R):
"""Dummy potential for the p'th relative coordinate"""
#r2 = np.linalg.norm(R)**2
r2 = norm2(R)
#r2 = R[0]**2 + R[1]**2 + R[2]**2
return np.exp(-r2+(p[0]+1)**p[1])
@numba.autojit
#@numba.jit('void(c16[:,:,:,:,:,::1], c16[:,:,:,:,:,::1], f8[:], f8[:], f8[:])',
# annotate=True)
def apply_V_core(HPsi, Psi, p, c, x):
ri = np.zeros(3, dtype=float)
rj = np.zeros(3, dtype=float)
for i0 in xrange(len(x)):
for i1 in xrange(len(x)):
for i2 in xrange(len(x)):
for j0 in xrange(len(x)):
for j1 in xrange(len(x)):
for j2 in xrange(len(x)):
ri[0], ri[1], ri[2] = x[i0], x[i1], x[i2] # This works, but is ugly...
rj[0], rj[1], rj[2] = x[j0], x[j1], x[j2] # This works, but is ugly...
#ri[:] = x[i0], x[i1], x[i2] # This does not compile
#rj[:] = x[[j0, j1, j2]] # Neither does this
r = c[0] * ri + c[1] * rj # This works, but crashes annotate...
v = V(p, r)
HPsi[i0, i1, i2, j0, j1, j2] += v*Psi[i0, i1, i2, j0, j1, j2]
if __name__ in ["__main__", ""]:
# This is executed when called with numba to generate annotations
N = 3
x = np.linspace(-10,10,N)
Psi = np.random.random((N,)*6) + 1j
HPsi = 0*Psi
p = np.zeros(2, dtype=float)
c = np.zeros(2, dtype=float)
apply_V_core(HPsi, Psi, p, c, x)
import time
N = 10
tic = time.time()
for n in xrange(N):
apply_V_core(HPsi, Psi, p, c, x)
print("%gms" % ((time.time() - tic)/N*1000,))
import apply_V_core_numba
reload(apply_V_core_numba)
from apply_V_core_numba import apply_V_core
check(apply_V_numba)
%timeit apply_V_numba(HPsi_, Psi)
Now we see that everything happens in under a microsecond. Unfortunately, annotate no longer works because of the line r = c[0] * ri + c[1] * rj... I am not sure why.
Static Compilation¶
One problem with using JIT compilation is that it can take a while to get going since the compiler needs to be called. To alleviate this, it would be great if the compilation results could be cached between runs, but this feature is not yet available in numba. (See Issue #224.) Instead, we must resort to static compilation. Here we test this with both functions and classes (the latter can be useful for maintaining state).
%%file static_compilation.py
from numba import *
import numpy as np
def norm2(x):
    res = x[0]**2
    for _i in xrange(1, len(x)):
        res += x[_i]**2
    return res
export('norm2 f8(f8[:])')(norm2)
!pycc --python static_compilation.py -o sc.so
import sc
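Assuming the build above produced sc.so, the statically compiled function can be sanity-checked directly (a quick illustrative call):
import numpy as np
# Expect 0**2 + 1**2 + 2**2 + 3**2 = 14.0 from the statically compiled norm2.
print(sc.norm2(np.arange(4.0)))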
%%file static_compilation2.py
from numba import *
import numpy as np
@autojit
def norm2(x):
    res = x[0]**2
    for _i in xrange(1, len(x)):
        res += x[_i]**2
    return res
@autojit
class C(object):
def __init__(self, offset=1.0):
self.offset = offset
@void(c16[:,:,:,:,:,:],c16[:,:,:,:,:,:], f8[:])
def apply_V(self, VPsi, Psi, x):
for i0 in xrange(len(x)):
for i1 in xrange(len(x)):
for i2 in xrange(len(x)):
for i3 in xrange(len(x)):
for i4 in xrange(len(x)):
for i5 in xrange(len(x)):
VPsi[i0,i1,i2,i3,i4,i5] = (norm2(x) + self.offset)*Psi[i0,i1,i2,i3,i4,i5]
export('norm2 f8(f8[:])')(norm2)
if __name__ in ["__main__", ""]:
# This is executed when called with numba to generate annotations
N = 3
x = np.linspace(-10,10,N)
Psi = np.random.random((N,)*6) + 1j
HPsi = 0*Psi
c = C(0.1)
c.apply_V(HPsi, Psi, x)
import time
N = 10
tic = time.time()
for n in xrange(N):
c.apply_V(HPsi, Psi, x)
print("%gms" % ((time.time() - tic)/N*1000,))
!pycc static_compilation.py
import numba
@numba.autojit # This compiles when the function is called the first time.
#@jit(numba.f8(numba.f8)) # This compiles when the definition is loaded
def sq(x):
for i in xrange(100000):
x += x*x
return x
import time
tic = time.time()
sq(6.0); print(time.time() - tic)
SMP: prange¶
Here we demonstrate how to use multiple cores on a single node using numba.prange. A couple of points to note (a minimal sketch of the first point follows the list):
- prange cannot be nested, so you must unravel the loops you want to parallelize.
- One cannot use if statements in a prange loop, but they can be hidden inside a function call.
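A minimal sketch of unravelling two nested loops into a single flat prange loop (scale_rows is a made-up example, and whether this compiles cleanly depends on the numba version):
import numpy as np
import numba

@numba.jit(numba.void(numba.f8[:, :], numba.f8[:, :]))
def scale_rows(X, Y):
    Ni, Nj = X.shape
    for ij in numba.prange(Ni * Nj):   # one flat parallel loop instead of two nested ones
        i = ij // Nj                   # recover the original indices from the flat index
        j = ij - i * Nj
        Y[i, j] = 2.0 * X[i, j]

X = np.random.random((100, 50))
Y = 0 * X
scale_rows(X, Y)
assert np.allclose(Y, 2.0 * X)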
import numpy as np
import numba
for n in [numba.pipeline,
numba.control_flow.debug,
numba.closures,
numba.codegen.debug]:
n.logger.setLevel(numba.pipeline.logging.ERROR)
@numba.jit(numba.f8(numba.f8[:]), inline=True)
def norm2(x):
res = abs(x[0])**2
for i in xrange(1,len(x)):
res += abs(x[i])**2
return res
@numba.jit(numba.f8(numba.int_, numba.f8), inline=True)
def g(i2, x):
if i2 % 2 == 0:
return np.sin(x)
else:
return np.cos(x)
def g_np(i2, x):
return np.where(i2 % 2 == 0, np.sin(x), np.cos(x))
@numba.jit(numba.void(numba.f8[:,:,:],numba.f8[:,:,:]))
def f(X, Y):
Nx, Ny, Nz = X.shape
for i0 in xrange(Nx):
for i1 in xrange(Ny):
for i2 in xrange(Nz):
Y[i0, i1, i2] = g(i2, X[i0, i1, i2])
@numba.jit(numba.void(numba.f8[:,:,:],numba.f8[:,:,:]))
def fp(X, Y):
Nx, Ny, Nz = X.shape
_r = np.zeros(Nx, dtype=float)
for i0 in numba.prange(Nx):
for i1 in xrange(Ny):
for i2 in xrange(Nz):
Y[i0, i1, i2] = norm2(_r) * g(i2, X[i0, i1, i2])
np.random.seed(1)
shape = (8*2, 20**2,20**2)
i2 = np.arange(shape[-1])[None,None,:]
X = np.random.random(shape)
Y = 0*X
f(X,Y)
assert np.allclose(Y, g_np(i2,X))
%timeit Y[::] = np.sin(X)
%timeit f(X,Y)
%timeit fp(X,Y)
import numpy as np
import numba
@numba.jit(numba.f8(numba.f8[:]), inline=True)
def norm2(x):
res = abs(x[0])**2
for i in xrange(1,len(x)):
res += abs(x[i])**2
return res
@numba.jit(numba.f8(numba.int_, numba.f8), inline=True)
def g(i2, x):
if i2 % 2 == 0:
return np.sin(x)
else:
return np.cos(x)
def g_np(i2, x):
return np.where(i2 % 2 == 0, np.sin(x), np.cos(x))
@numba.jit(numba.void(numba.f8[:,:,:],numba.f8[:,:,:]))
def f(X, Y):
Nx, Ny, Nz = X.shape
_r = np.zeros(Nx, dtype=float)
for i0 in xrange(Nx):
for i1 in xrange(Ny):
for i2 in xrange(Nz):
Y[i0, i1, i2] = norm2(_r) * g(i2, X[i0, i1, i2])
@numba.jit(numba.void(numba.f8[:,:,:],numba.f8[:,:,:]))
def fp(X, Y):
Nx, Ny, Nz = X.shape
_r = np.zeros(Nx, dtype=float)
for i0 in numba.prange(Nx):
for i1 in xrange(Ny):
for i2 in xrange(Nz):
Y[i0, i1, i2] = norm2(_r) * g(i2, X[i0, i1, i2])
import math
import numexpr

def f(x):
    return math.sin(x)

A = np.random.random((10,10))
B = 0*A
# Note: numexpr only understands its own built-in functions, so passing an
# arbitrary Python function like this is not supported and raises an error.
numexpr.evaluate('f(A)', dict(f=f, A=A))
Benchmarking Vectorization¶
Here we compare several techniques for vectorization. We shall apply the sin operator to a complex array and sum the result at the end to ensure that the calculation proceeded correctly. Note that the choice of sin may not be optimal, as the time it takes depends on the argument, but since we use the same initial array, we should get a reasonable comparison. As a benchmark, we start with a straightforward C++ version.
Single Thread C++¶
%%file sin.cpp
#include <iostream>
#include <complex>
#include <ctime>
typedef std::complex<double> complex;
int main(int argc, char ** argv) {
const int N=16;
complex *X = new complex[N*N*N*N*N*N];
complex *Y = new complex[N*N*N*N*N*N];
int i,j;
for (i=0;i<N*N*N*N*N*N;++i)
X[i] = complex(static_cast<double>(i),1.0);
time_t start = time(0);
for (j=0;j<10;++j) {
#pragma omp parallel for private(i)
for (i=0;i<N*N*N*N*N*N;++i) {
Y[i] = std::sin(X[i]);
}
}
time_t end = time(0);
double time = difftime(end, start);
std::cout << "Loop took " << time << "s\n";
complex res = 0.0;
for (i=0;i<N*N*N*N*N*N;++i) {
res += Y[i];
}
std::cout << "res = " << res << "\n";
  delete [] X;
  delete [] Y;
};
!rm -f sin
!clang++ -Wall -O4 sin.cpp -o sin # clang++ is the Apple compiler based on LLVM
!time ./sin
!rm -f sin_omp
!g++ -fopenmp -lgomp -Wall -O4 sin.cpp -o sin_omp # g++ is used here because the Apple clang++ does not support OpenMP
!time ./sin_omp
Python Numpy¶
This is the canonical way of performing such calculations in python. It would be extremely inefficient to use a for loop, as python does not handle them well. If you can structure your calculation to use numpy in this manner – applying a single function to an array – then things will be much faster, as the loop over the elements of the array is performed inside the compiled functions. Unfortunately, to realize this speedup, the function must be compiled with the looping inside, so you cannot easily use this functionality for custom functions. (There is a numpy.vectorize function that appears to do this, but it is only for convenience, not for performance, as illustrated below.)
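To see that numpy.vectorize is only a convenience, compare it with the compiled ufunc (a small illustrative timing; the function and array here are arbitrary):
import math
import numpy as np

vsin = np.vectorize(math.sin)   # still loops over the elements at the Python level
x = np.random.random(10**5)

%timeit vsin(x)     # roughly as slow as an explicit Python loop
%timeit np.sin(x)   # the loop runs inside the compiled ufunc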
import numpy
N = 16
shape = (N,)*6
X = (np.arange(N**6) + 1.0j).reshape(shape)
Y = 0*X
sinX = np.sin(X)
res = sinX.sum()
print(repr(res))
%%time
for i in xrange(10):
Y = np.sin(X)
NumExpr¶
Numexpr evaluates compiled expressions on a virtual machine, and pays careful attention to memory bandwidth. The first time a function is called, it will be compiled – subsequent calls will be fast. One can also use an explicit instantiation of a function via the NumExpr class (but this is not very well documented – check the numexpr sources). One strategy that I often use is to generate the expression symbolically using sympy. One can then perform some symbolic optimizations (or compute derivatives and other symbolic manipulations) and compile the final expressions into an efficient function. This approach is also one of the easiest ways to leverage multiple cores (the threading is done using the ctypes interface, which allows numexpr to release the GIL).
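Here is a minimal sketch of that sympy-to-numexpr strategy (the expression is just a toy example): build and simplify the expression symbolically, then hand the resulting string to numexpr.
import numpy as np
import sympy
import numexpr

x = sympy.Symbol('x')
expr = sympy.simplify(sympy.sin(x)**2 + sympy.cos(x)**2 + sympy.sin(x))  # -> sin(x) + 1

a = np.linspace(0, 1, 100)
res = numexpr.evaluate(str(expr), local_dict=dict(x=a))
assert np.allclose(res, np.sin(a) + 1)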
import numexpr
numexpr.set_num_threads(8)
apply_sin_numexpr = numexpr.NumExpr('sin(X)', signature=[('X', complex)], optimization='aggressive')
print("numexpr")
Y = apply_sin_numexpr(X)
assert np.allclose(Y, sinX)
print(Y.sum())
%%timeit
for i in xrange(10):
Y = apply_sin_numexpr(X)
scipy.weave.inline¶
The scipy.weave.inline function can be used to compile code on the fly. The advantage here is that one can allocate and prepare the arrays in python, then just drop into C++ for the core loop. Another less obvious benefit is that one can use python as a meta-language to dynamically generate the C++ code. (One use is to generate multiply nested loops to run over N dimensions where N changes at runtime.)
The generated boilerplate provides both 1D and multi-dimensional access to the array elements, size, etc. through various macros. Unfortunately, there is no easy way that I know of to utilize multiple cores with this approach (though, of course, one could use the full power of C++, but this is not simple).
NOTE: Weave only works with Python 2.
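As an illustration of the meta-programming point above, one can generate the nested C++ loops as a Python string and hand the result to inline; the helper below (nested_loop_src and its INDEX placeholder are made up for this sketch) writes an ndim-fold loop over a flattened array.
def nested_loop_src(ndim, N, body):
    """Generate C++ source for an ndim-fold nested loop over an N**ndim array.

    Occurrences of "INDEX" in `body` are replaced by the flattened (row-major) index.
    """
    lines = []
    for d in range(ndim):
        lines.append("  " * d + "for (int i%d=0; i%d<%d; ++i%d) {" % (d, d, N, d))
    index = " + ".join("i%d*%d" % (d, N**(ndim - 1 - d)) if d < ndim - 1 else "i%d" % d
                       for d in range(ndim))
    lines.append("  " * ndim + body.replace("INDEX", index))
    for d in reversed(range(ndim)):
        lines.append("  " * d + "}")
    return "\n".join(lines)

print(nested_loop_src(3, 16, "Y[INDEX] = std::sin(X[INDEX]);"))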
from weave import inline
src = """
const int N=16;
int i,j;
for (j=0;j<10;++j){
for (i=0;i<N*N*N*N*N*N;++i) {
Y[i] = std::sin(X[i]);
}
}
"""
_args = dict(code=src,
arg_names=['X', 'Y'],
extra_compile_args=["-O3",],
local_dict=dict(N=N, X=X, Y=Y))
def apply_sin_inline(X, Y):
inline(code=src, arg_names=['X', 'Y'])
Y = 0*X
%time apply_sin_inline(X,Y)
assert np.allclose(Y, sinX)
print(Y.sum())
Numba¶
The numba package provides a way to convert a subset of python code into LLVM code, which then gets compiled. There are two options – the simplest is to use the autojit decorator, which will determine the required types from the arguments when the function is called. The second approach is to use the jit decorator, which allows one to specify the types explicitly (and which will perform the compilation when the definition is loaded).
import time
import numpy as np
import numba
for n in [numba.pipeline, # Ignore some debug info
numba.control_flow.debug,
numba.closures,
numba.codegen.debug]:
n.logger.setLevel(numba.pipeline.logging.ERROR)
@numba.autojit
def apply_sin_numba(X, Y):
x0, x1, x2, y0, y1, y2 = X.shape
for i0 in xrange(x0):
for i1 in xrange(x1):
for i2 in xrange(x2):
for j0 in xrange(y0):
for j1 in xrange(y1):
for j2 in xrange(y2):
Y[i0,i1,i2,j0,j1,j2] = np.sin(X[i0,i1,i2,j0,j1,j2])
# Note that the following also work as "array expressions" but take a lot longer to compile...
@numba.autojit
def _apply_sin_numba(X, Y):
for i in xrange(len(X)):
Y[i] = np.sin(X[i])
@numba.autojit
def _apply_sin_numba(X, Y):
Y[...] = np.sin(X[...])
Y = 0*X
%time apply_sin_numba(X, Y) # This will include the compilation time.
assert np.allclose(Y, sinX)
%%time
for n in range(10):
apply_sin_numba(X, Y)
Now we use explicit types with jit:
%%time
X_type = numba.c16[:,:,:,:,:,::1]
_jit = numba.jit(numba.void(X_type, X_type))
@_jit
def apply_sin_numba_jit(X, Y):
x0, x1, x2, y0, y1, y2 = X.shape
for i0 in xrange(x0):
for i1 in xrange(x1):
for i2 in xrange(x2):
for j0 in xrange(y0):
for j1 in xrange(y1):
for j2 in xrange(y2):
Y[i0,i1,i2,j0,j1,j2] = np.sin(X[i0,i1,i2,j0,j1,j2])
Y = 0*X
%time apply_sin_numba_jit(X, Y) # This will include the compilation time.
assert np.allclose(Y, sinX)
%%time
for n in range(10):
apply_sin_numba_jit(X, Y)
Numba allows one to use multiple threads by using numba.prange, but this places even more restrictions on what can appear in the loop. For example, one cannot use conditionals (though they can be used in nested function calls). This takes a LOT longer to compile, though.
%%time
@_jit
def apply_sin_numba_p(X, Y):
"""Multi-threaded version of numba sin"""
x0, x1, x2, y0, y1, y2 = X.shape
for i0 in numba.prange(x0):
for i1 in xrange(x1):
for i2 in xrange(x2):
for j0 in xrange(y0):
for j1 in xrange(y1):
for j2 in xrange(y2):
Y[i0,i1,i2,j0,j1,j2] = np.sin(X[i0,i1,i2,j0,j1,j2])
#@_jit
def _apply_sin_numba_p(X, Y):
for i in numba.prange(len(X)):
Y[i] = np.sin(X[i])
#@_jit
def apply_sin_pr(X, Y):
"""Multi-threaded version of numba sin with reversed indexing"""
x0, x1, x2, y0, y1, y2 = X.shape
for j2 in numba.prange(y2):
for j1 in xrange(y1):
for j0 in xrange(y0):
for i2 in xrange(x2):
for i1 in xrange(x1):
for i0 in xrange(x0):
Y[i0,i1,i2,j0,j1,j2] = np.sin(X[i0,i1,i2,j0,j1,j2])
Y = 0*X
%time apply_sin_numba_p(X, Y) # This will include the compiliation time.
assert np.allclose(Y, sinX)
%%time
for n in range(10):
apply_sin_numba_p(X, Y)
print("numpy")
%timeit Y = np.sin(X)
Y = np.sin(X); assert np.allclose(Y, sinX)
print("numba")
%timeit apply_sin_numba(X,Y)
Y = 0*X; apply_sin_numba(X,Y); assert np.allclose(Y, sinX)
print("weave.inline")
%timeit apply_sin_inline(X, Y)
Y = 0*X; apply_sin_inline(X, Y); assert np.allclose(Y, sinX)
print("numba threaded")
%timeit apply_sin_numba_p(X, Y)
Y = 0*X; apply_sin_numba_p(X,Y); assert np.allclose(Y, sinX)
print("numexpr")
%timeit Y = apply_sin_numexpr(X)
Y = apply_sin_numexpr(X); assert np.allclose(Y, sinX)