Faster than NumPy

High-performance numerical computation in JavaScript

About me.

Quansight
Quansight Labs
Data APIs
stdlib

CPU-based library for performing numerical computation on multidimensional arrays.

ndarray

Demos

Multidimensional arrays


arr = [
	[ 1, 2, 3 ],
	[ 4, 5, 6 ],
	[ 7, 8, 9 ]
];

// Get individual item:
v = arr[1][1];
// returns 5

// Get row:
row = arr[1];
// returns [ 4, 5, 6 ]

// Get column (no direct access; must gather elements!):
col = [ arr[0][1], arr[1][1], arr[2][1] ];
// returns [ 2, 5, 8 ]

// Column-major (nest by column instead of by row):
arr = [
	[ 1, 4, 7 ],
	[ 2, 5, 8 ],
	[ 3, 6, 9 ]
];
						

Let \( n \) be the number of elements per dimension and \( d \) be the number of dimensions.

  • \( O(d) \) — data access
  • \( O(n^{d}d) \) — traversal
  • \( O(n^{d}) \) — slicing
  • \( O(n^{d-1}) \) — extra storage requirements
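
A minimal sketch (hypothetical helpers, not a library API) of where these costs come from: element access chases one nested reference per dimension, while extracting a column must gather elements into newly allocated storage.

// Generic element access: one lookup per dimension:
function get( arr, subscripts ) {
	let v = arr;
	for ( let i = 0; i < subscripts.length; i++ ) {
		v = v[ subscripts[ i ] ];
	}
	return v;
}

// Column extraction: gathers (and copies) one element per row:
function column( arr, j ) {
	const out = [];
	for ( let i = 0; i < arr.length; i++ ) {
		out.push( arr[ i ][ j ] );
	}
	return out;
}

v = get( [ [ 1, 2, 3 ], [ 4, 5, 6 ], [ 7, 8, 9 ] ], [ 1, 1 ] );
// returns 5

col = column( [ [ 1, 2, 3 ], [ 4, 5, 6 ], [ 7, 8, 9 ] ], 1 );
// returns [ 2, 5, 8 ]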

Implicit arrays


arr = [
	1, 2, 3,
	4, 5, 6,
	7, 8, 9
];

// Get individual item:
v = arr[ (1*3) + 1 ];
// returns 5

// Get row:
row = [arr[(1*3)+0], arr[(1*3)+1], arr[(1*3)+2]];
// returns [ 4, 5, 6 ]

// Get column:
col = [arr[(0*3)+1], arr[(1*3)+1], arr[(2*3)+1]];
// returns [ 2, 5, 8 ]
						

Let \( n \) be the number of elements per dimension, \( d \) be the number of dimensions, and \( B \) be the cache line size.

  • \( O(1) \) — data access
  • \( O(\frac{n^d}{B}) \) — traversal
  • \( O(\frac{n^d}{B}) \) — slicing
  • \( O(0) \) — extra storage requirements
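
As a sketch (hypothetical helper, not a library API), the `(1*3) + 1` pattern generalizes to any row-major shape: the linear index is pure arithmetic on the subscripts, with no intermediate array objects.

function flatIndex( shape, subscripts ) {
	let idx = 0;
	for ( let i = 0; i < shape.length; i++ ) {
		idx = ( idx * shape[ i ] ) + subscripts[ i ];
	}
	return idx;
}

// Get an individual item from the flat `arr` above:
v = arr[ flatIndex( [ 3, 3 ], [ 1, 1 ] ) ];
// returns 5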

Strided arrays


x = [
	1, 2, 3,
	4, 5, 6,
	7, 8, 9
];

// Define metadata:
shx = [ 3, 3 ]; // shape
ox = 0;         // offset
sx = [ 3, 1 ];  // strides (row-major/lexicographic)

// Get individual item:
v = x[ (1*sx[0]) + (1*sx[1]) + ox ];
// returns 5

// Define row:
shr = [ shx[1] ];     // [ 3 ]
sr = [ sx[1] ];       // [ 1 ]
or = (1*sx[0]) + ox;  // 3

// Define column:
shc = [ shx[0] ];     // [ 3 ]
sc = [ sx[0] ];       // [ 3 ]
oc = (1*sx[1]) + ox;  // 1
						

\[ \textrm{index}(x_{i_0 i_1 \dots i_{d-1}}) = o + \sum^{d-1}_{k = 0} s_k i_k \] where \( d \) is the number of dimensions, \( o \) is the offset, \( s_k \) are the strides, and \( i_k \) are the dimension subscripts.
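
As a sketch, the formula translates directly to code (hypothetical helper, not part of stdlib):

function index( offset, strides, subscripts ) {
	let idx = offset;
	for ( let k = 0; k < strides.length; k++ ) {
		idx += strides[ k ] * subscripts[ k ];
	}
	return idx;
}

// Get an individual item from `x` using the metadata defined above:
v = x[ index( ox, sx, [ 1, 1 ] ) ];
// returns 5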

Let \( n \) be the number of elements per dimension, \( d \) be the number of dimensions, and \( B \) be the cache line size.

  • \( O(1) \) — data access
  • \( O(\frac{n^d}{B}) \) — traversal
  • \( O(d) \) — slicing
  • \( O(d) \) — extra storage requirements

// Reverse an array...
shape = [8]; strides = [1]; offset = 0;

// Negate strides and adjust offset:
strides[0] *= -1;
offset = 7;


// Matrix transpose...
shape = [3, 3]; strides = [3, 1]; offset = 0;

// Swap strides (and dimensions):
strides = [1, 3];


// Reshape an array...
shape = [3, 3]; strides = [3, 1]; offset = 0;

// Update shape and strides:
shape = [3, 1, 3];
strides = [3, 3, 1];


// Access a subarray (slice)...
shape = [3, 3]; strides = [3, 1]; offset = 0;

// Update shape and offset:
shape = [2, 2];
offset = 4;
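
// Reading the 2x2 slice (using the hypothetical `index` helper sketched
// earlier) confirms that only metadata changed; the buffer `x` was not copied:
v = x[ index( offset, strides, [ 0, 0 ] ) ];
// returns 5

v = x[ index( offset, strides, [ 1, 1 ] ) ];
// returns 9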
						

Vectorization

Parallelization

Hardware Optimization

Broadcasting


import array from '@stdlib/ndarray-array';

// Create a 4-dimensional array:
const arr = array({
    'dtype': 'float32',
    'shape': [ 3, 3, 3, 3 ]
});

// Retrieve the array shape:
const shape = arr.shape;
// returns [ 3, 3, 3, 3 ]

// Retrieve the array strides:
const strides = arr.strides;
// returns [ 27, 9, 3, 1 ]

// Retrieve the array offset:
const offset = arr.offset;
// returns 0

// Retrieve the array dtype:
const dtype = arr.dtype;
// returns 'float32'

// Retrieve the underlying array data:
const data = arr.data;
// returns <Float32Array>[ 0, ..., 0 ]

// Retrieve the array byte length:
const byteLength = arr.byteLength;
// returns 324

// Retrieve an array value:
let v = arr.get( 1, 2, 1, 2 );
// returns 0.0

// Set an array value:
arr.set( 1, 2, 1, 2, 10.0 );

// Retrieve the array value:
v = arr.get( 1, 2, 1, 2 );
// returns 10.0

// Serialize as JSON:
const json = arr.toJSON();
// returns {"type":"ndarray","dtype":"float32","flags":{},"order":"row-major","shape":[3,3,3,3],"strides":[27,9,3,1],"data":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]}

// Create a 2x2 matrix:
const arr2 = array([
	[ 1.0, 2.0 ],
	[ 3.0, 4.0 ]
]);
// returns <ndarray>[ 1.0, 2.0; 3.0, 4.0 ]
						

ndarray


import array from '@stdlib/ndarray-array';
import abs from '@stdlib/math-special-abs';

// Create a one-dimensional vector:
let x = array([ -1.0, -2.0 ]);
// returns <ndarray>[ -1.0, -2.0 ]

// Retrieve the shape:
let shape = x.shape;
// returns [ 2 ]

// Retrieve the dtype:
let dtype = x.dtype;
// returns 'float64'

// Compute the absolute value for each element:
let y = abs( x );
// returns <ndarray>[ 1.0, 2.0 ]

// Retrieve the shape:
shape = y.shape;
// returns [ 2 ]

// Define an output array:
y = array({
	'shape': [ 4, 2 ]
});
// returns <ndarray>[ 0.0, 0.0; 0.0, 0.0; 0.0, 0.0; 0.0, 0.0 ]

// Broadcast results across rows:
abs.assign( x, y );

// Retrieve values:
let v = y.get(0, 0);
// returns 1.0

v = y.get(0, 1);
// returns 2.0

v = y.get(1, 0);
// returns 1.0

v = y.get(3, 1);
// returns 2.0
						

Vectorization and Broadcasting


import array from '@stdlib/ndarray-array';
import ddot from '@stdlib/blas-ddot';

// Create two one-dimensional vectors:
let x = array([ 1.0, 2.0, 3.0, 4.0 ]);
let y = array([ 5.0, 6.0, 7.0, 8.0 ]);

// Compute the dot product:
let v = ddot(x, y);
// returns 70

// Import a lower-level ndarray constructor:
import ndarray from '@stdlib/ndarray-ctor';

// Create a reverse view atop `y`:
let yr = ndarray(
	y.dtype,
	y.data,
	y.shape,
	[-1],
	y.length-1,
	y.order
);
// returns <ndarray>[ 8.0, 7.0, 6.0, 5.0 ]

// Retrieve the first element:
v = yr.get(0);
// returns 8.0

// Retrieve the underlying data:
let data = yr.data;
// returns <Float64Array>[ 5.0, 6.0, 7.0, 8.0 ]

// Confirm that `yr` is a view:
let bool = ( y.data === yr.data );
// returns true

// Compute the dot product with the reversed vector:
v = ddot(x, yr);
// returns 60

// Import a lower-level BLAS interface:
import daxpy from '@stdlib/blas-base-daxpy';

// Compute `y = a*x + y`:
daxpy.ndarray(
	x.length,
	5.0,
	x.data,
	x.strides[ 0 ],
	x.offset,
	y.data,
	y.strides[ 0 ],
	y.offset
);

// Retrieve the underlying data of `y`:
data = y.data;
// returns <Float64Array>[ 10.0, 16.0, 22.0, 28.0 ]

// Compute the dot product with the reversed vector:
v = ddot(x, yr);
// returns 160
						

Hardware Optimization

Performance results

Not (yet) faster than NumPy


#define BINARY_DEFS\
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];\
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];\
    npy_intp n = dimensions[0];\
    npy_intp i;\

#define BINARY_LOOP_SLIDING\
    for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1)

/** (ip1, ip2) -> (op1) */
#define BINARY_LOOP\
    BINARY_DEFS\
    BINARY_LOOP_SLIDING
						

/**begin repeat
 * Float types
 *  #type = npy_float, npy_double#
 *  #TYPE = FLOAT, DOUBLE#
 *  #c = f, #
 *  #C = F, #
 */
/**begin repeat1
 * Arithmetic
 * # kind = add, subtract, multiply, divide#
 * # OP = +, -, *, /#
 * # PW = 1, 0, 0, 0#
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
#if @PW@
        @type@ * iop1 = (@type@ *)args[0];
        npy_intp n = dimensions[0];

        *iop1 @OP@= @TYPE@_pairwise_sum(args[1], n, steps[1]);
#else
        BINARY_REDUCE_LOOP(@type@) {
            io1 @OP@= *(@type@ *)ip2;
        }
        *((@type@ *)iop1) = io1;
#endif
    }
    else if (!run_binary_simd_@kind@_@TYPE@(args, dimensions, steps)) {
        BINARY_LOOP {
            const @type@ in1 = *(@type@ *)ip1;
            const @type@ in2 = *(@type@ *)ip2;
            *((@type@ *)op1) = in1 @OP@ in2;
        }
    }
}
/**end repeat1**/
/**end repeat**/