How to perform computations with NumPy#
Awkward Array’s integration with NumPy allows you to use NumPy’s array functions on data with complex structures, including ragged and heterogeneous arrays.
import awkward as ak
import numpy as np
Universal functions (ufuncs)#
NumPy’s universal functions (ufuncs) are functions that operate elementwise on arrays. They are broadcasting-aware, so they can naturally handle data structures like ragged arrays that are common in Awkward Arrays.
Here’s an example of applying `np.sqrt`, a NumPy ufunc, to an Awkward Array:
data = ak.Array([[1, 4, 9], [], [16, 25]])
np.sqrt(data)
[[1, 2, 3], [], [4, 5]] ----------------------- type: 3 * var * float64
Notice that the ufunc applies to the numeric data, passing through all dimensions of nested lists, even if those lists have variable length. This also applies to heterogeneous data, in which the data are not all of the same type.
data = ak.Array([[1, 4, 9], [], 16, [[[25]]]])
np.sqrt(data)
[[1, 2, 3], [], 4, [[[5]]]] --------------------------- type: 4 * union[ var * union[ float64, var * var * float64 ], float64 ]
Unary and binary operations on Awkward Arrays, such as `+`, `-`, `>`, and `==`, actually call NumPy ufuncs. For instance, `+`:
array1 = ak.Array([[1, 2, 3], [], [4, 5]])
array2 = ak.Array([[10, 20, 30], [], [40, 50]])
array1 + array2
[[11, 22, 33], [], [44, 55]] --------------------- type: 3 * var * int64
is actually `np.add`:
np.add(array1, array2)
[[11, 22, 33], [], [44, 55]] --------------------- type: 3 * var * int64
Arrays with record fields#
Ufuncs can only be applied to numerical data in lists, not records.
records = ak.Array([{"x": 4, "y": 9}, {"x": 16, "y": 25}])
np.sqrt(records)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[7], line 1
----> 1 np.sqrt(records)
File ~/micromamba/envs/awkward-docs/lib/python3.11/site-packages/awkward/highlevel.py:1594, in Array.__array_ufunc__(self, ufunc, method, *inputs, **kwargs)
1592 name = f"{type(ufunc).__module__}.{ufunc.__name__}.{method!s}"
1593 with ak._errors.OperationErrorContext(name, inputs, kwargs):
-> 1594 return ak._connect.numpy.array_ufunc(ufunc, method, inputs, kwargs)
File ~/micromamba/envs/awkward-docs/lib/python3.11/site-packages/awkward/_connect/numpy.py:469, in array_ufunc(ufunc, method, inputs, kwargs)
461 raise TypeError(
462 "no {}.{} overloads for custom types: {}".format(
463 type(ufunc).__module__, ufunc.__name__, ", ".join(error_message)
464 )
465 )
467 return None
--> 469 out = ak._broadcasting.broadcast_and_apply(
470 inputs,
471 action,
472 depth_context=depth_context,
473 lateral_context=lateral_context,
474 allow_records=False,
475 function_name=ufunc.__name__,
476 )
478 out_named_axis = functools.reduce(
479 _unify_named_axis, lateral_context[NAMED_AXIS_KEY].named_axis
480 )
481 if len(out) == 1:
File ~/micromamba/envs/awkward-docs/lib/python3.11/site-packages/awkward/_broadcasting.py:1200, in broadcast_and_apply(inputs, action, depth_context, lateral_context, allow_records, left_broadcast, right_broadcast, numpy_to_regular, regular_to_jagged, function_name, broadcast_parameters_rule)
1198 backend = backend_of(*inputs, coerce_to_common=False)
1199 isscalar = []
-> 1200 out = apply_step(
1201 backend,
1202 broadcast_pack(inputs, isscalar),
1203 action,
1204 0,
1205 depth_context,
1206 lateral_context,
1207 {
1208 "allow_records": allow_records,
1209 "left_broadcast": left_broadcast,
1210 "right_broadcast": right_broadcast,
1211 "numpy_to_regular": numpy_to_regular,
1212 "regular_to_jagged": regular_to_jagged,
1213 "function_name": function_name,
1214 "broadcast_parameters_rule": broadcast_parameters_rule,
1215 },
1216 )
1217 assert isinstance(out, tuple)
1218 return tuple(broadcast_unpack(x, isscalar) for x in out)
File ~/micromamba/envs/awkward-docs/lib/python3.11/site-packages/awkward/_broadcasting.py:1178, in apply_step(backend, inputs, action, depth, depth_context, lateral_context, options)
1176 return result
1177 elif result is None:
-> 1178 return continuation()
1179 else:
1180 raise AssertionError(result)
File ~/micromamba/envs/awkward-docs/lib/python3.11/site-packages/awkward/_broadcasting.py:1147, in apply_step.<locals>.continuation()
1145 # Any non-string list-types?
1146 elif any(x.is_list and not is_string_like(x) for x in contents):
-> 1147 return broadcast_any_list()
1149 # Any RecordArrays?
1150 elif any(x.is_record for x in contents):
File ~/micromamba/envs/awkward-docs/lib/python3.11/site-packages/awkward/_broadcasting.py:671, in apply_step.<locals>.broadcast_any_list()
668 nextinputs.append(x)
669 nextparameters.append(NO_PARAMETERS)
--> 671 outcontent = apply_step(
672 backend,
673 nextinputs,
674 action,
675 depth + 1,
676 copy.copy(depth_context),
677 lateral_context,
678 options,
679 )
680 assert isinstance(outcontent, tuple)
681 parameters = parameters_factory(nextparameters, len(outcontent))
File ~/micromamba/envs/awkward-docs/lib/python3.11/site-packages/awkward/_broadcasting.py:1178, in apply_step(backend, inputs, action, depth, depth_context, lateral_context, options)
1176 return result
1177 elif result is None:
-> 1178 return continuation()
1179 else:
1180 raise AssertionError(result)
File ~/micromamba/envs/awkward-docs/lib/python3.11/site-packages/awkward/_broadcasting.py:1151, in apply_step.<locals>.continuation()
1149 # Any RecordArrays?
1150 elif any(x.is_record for x in contents):
-> 1151 return broadcast_any_record()
1153 else:
1154 raise ValueError(
1155 "cannot broadcast: {}{}".format(
1156 ", ".join(repr(type(x)) for x in inputs), in_function(options)
1157 )
1158 )
File ~/micromamba/envs/awkward-docs/lib/python3.11/site-packages/awkward/_broadcasting.py:503, in apply_step.<locals>.broadcast_any_record()
501 def broadcast_any_record():
502 if not options["allow_records"]:
--> 503 raise ValueError(f"cannot broadcast records{in_function(options)}")
505 frozen_record_fields: frozenset[str] | None = UNSET
506 first_record = next(c for c in contents if c.is_record)
ValueError: cannot broadcast records in sqrt
This error occurred while calling
numpy.sqrt.__call__(
<Array [{x: 4, y: 9}, {x: 16, ...}] type='2 * {x: int64, y: int64}'>
)
However, you can pull each field out of a record and apply the ufunc to it.
np.sqrt(records.x)
[2, 4] ----------------- type: 2 * float64
np.sqrt(records.y)
[3, 5] ----------------- type: 2 * float64
If you want the result wrapped up in a new array of records, you can use `ak.zip()` to do that.
ak.zip({"x": np.sqrt(records.x), "y": np.sqrt(records.y)})
[{x: 2, y: 3}, {x: 4, y: 5}] --------------- type: 2 * { x: float64, y: float64 }
Here’s an idiom that would apply a ufunc to every field individually, and then wrap up the result as a new record with the same fields (using `ak.fields()`, `ak.unzip()`, and `ak.zip()`):
ak.zip({key: np.sqrt(value) for key, value in zip(ak.fields(records), ak.unzip(records))})
[{x: 2, y: 3}, {x: 4, y: 5}] --------------- type: 2 * { x: float64, y: float64 }
The reason that Awkward Array does not do this automatically is to prevent mistakes: it’s common for records to represent coordinates of data points, and if the coordinates are not Cartesian, the one-to-one application is not correct.
Using non-NumPy ufuncs#
NumPy-compatible ufuncs exist in other libraries, like SciPy, and can be applied in the same way. Here’s how you can apply `scipy.special.gamma` and `scipy.special.erf`:
import scipy.special
data = ak.Array([[0.1, 0.2, 0.3], [], [0.4, 0.5]])
scipy.special.gamma(data)
[[9.51, 4.59, 2.99], [], [2.22, 1.77]] ----------------------- type: 3 * var * float64
scipy.special.erf(data)
[[0.112, 0.223, 0.329], [], [0.428, 0.52]] ----------------------- type: 3 * var * float64
You can even create your own ufuncs using Numba’s `@nb.vectorize`:
import numba as nb
@nb.vectorize
def gcd_euclid(x, y):
    """Return the greatest common divisor of ``x`` and ``y``.

    The body is written for a single pair of scalars; the ``@nb.vectorize``
    decorator turns it into a ufunc that is applied elementwise.
    """
    # computation that is more complex than a formula
    # Euclid's algorithm: replace (x, y) with (y, x mod y) until y reaches 0.
    while y != 0:
        x, y = y, x % y
    return x
x = ak.Array([[10, 20, 30], [], [40, 50]])
y = ak.Array([[5, 40, 15], [], [24, 255]])
gcd_euclid(x, y)
[[5, 20, 15], [], [8, 5]] --------------------- type: 3 * var * int64
Since Numba has JIT-compiled this function, it would run much faster on large arrays than custom Python code.
Non-ufunc NumPy functions#
Some NumPy functions don’t satisfy the ufunc protocol, but have been implemented for Awkward Arrays because they are useful. You can tell that a NumPy function has an Awkward Array implementation when a function with the same name and signature exists in both libraries.
For instance, `np.where` works on Awkward Arrays because `ak.where()` exists:
np.where(y % 2 == 0, x, y)
[[5, 20, 15], [], [40, 255]] --------------------- type: 3 * var * int64
(The above selects elements from `x` when `y` is even and elements from `y` when `y` is odd.)
Similarly, `np.concatenate` works on Awkward Arrays because `ak.concatenate()` exists:
np.concatenate([x, y])
[[10, 20, 30], [], [40, 50], [5, 40, 15], [], [24, 255]] --------------------- type: 6 * var * int64
np.concatenate([x, y], axis=1)
[[10, 20, 30, 5, 40, 15], [], [40, 50, 24, 255]] ------------------------- type: 3 * var * int64
Other NumPy functions, without an equivalent in the Awkward Array library, will work only if the Awkward Array can be converted into a NumPy array.
Ragged arrays can’t be converted to NumPy:
np.fft.fft(ak.Array([[1.1, 2.2, 3.3], [], [7.7, 8.8, 9.9]]))
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[21], line 1
----> 1 np.fft.fft(ak.Array([[1.1, 2.2, 3.3], [], [7.7, 8.8, 9.9]]))
File ~/micromamba/envs/awkward-docs/lib/python3.11/site-packages/awkward/highlevel.py:1610, in Array.__array_function__(self, func, types, args, kwargs)
1596 def __array_function__(self, func, types, args, kwargs):
1597 """
1598 Intercepts attempts to pass this Array to those NumPy functions other
1599 than universal functions that have an Awkward equivalent.
(...)
1608 See also #__array_ufunc__.
1609 """
-> 1610 return ak._connect.numpy.array_function(
1611 func, types, args, kwargs, behavior=self._behavior, attrs=self._attrs
1612 )
File ~/micromamba/envs/awkward-docs/lib/python3.11/site-packages/awkward/_connect/numpy.py:110, in array_function(func, types, args, kwargs, behavior, attrs)
107 unique_backends = frozenset(_find_backends(all_arguments))
108 backend = common_backend(unique_backends)
--> 110 rectilinear_args = tuple(_to_rectilinear(x, backend) for x in args)
111 rectilinear_kwargs = {k: _to_rectilinear(v, backend) for k, v in kwargs.items()}
112 result = func(*rectilinear_args, **rectilinear_kwargs)
File ~/micromamba/envs/awkward-docs/lib/python3.11/site-packages/awkward/_connect/numpy.py:110, in <genexpr>(.0)
107 unique_backends = frozenset(_find_backends(all_arguments))
108 backend = common_backend(unique_backends)
--> 110 rectilinear_args = tuple(_to_rectilinear(x, backend) for x in args)
111 rectilinear_kwargs = {k: _to_rectilinear(v, backend) for k, v in kwargs.items()}
112 result = func(*rectilinear_args, **rectilinear_kwargs)
File ~/micromamba/envs/awkward-docs/lib/python3.11/site-packages/awkward/_connect/numpy.py:79, in _to_rectilinear(arg, backend)
70 # Otherwise, cast to layout and convert
71 else:
72 layout = ak.to_layout(
73 arg,
74 allow_record=False,
(...)
77 string_policy="error",
78 )
---> 79 return layout.to_backend(backend).to_backend_array(allow_missing=True)
80 elif isinstance(arg, tuple):
81 return tuple(_to_rectilinear(x, backend) for x in arg)
File ~/micromamba/envs/awkward-docs/lib/python3.11/site-packages/awkward/contents/content.py:1112, in Content.to_backend_array(self, allow_missing, backend)
1110 else:
1111 backend = regularize_backend(backend)
-> 1112 return self._to_backend_array(allow_missing, backend)
File ~/micromamba/envs/awkward-docs/lib/python3.11/site-packages/awkward/contents/listoffsetarray.py:2106, in ListOffsetArray._to_backend_array(self, allow_missing, backend)
2104 return buffer.view(np.dtype(("S", max_count)))
2105 else:
-> 2106 return self.to_RegularArray()._to_backend_array(allow_missing, backend)
File ~/micromamba/envs/awkward-docs/lib/python3.11/site-packages/awkward/contents/listoffsetarray.py:284, in ListOffsetArray.to_RegularArray(self)
279 _size = Index64.empty(1, self._backend.index_nplike)
280 assert (
281 _size.nplike is self._backend.index_nplike
282 and self._offsets.nplike is self._backend.index_nplike
283 )
--> 284 self._backend.maybe_kernel_error(
285 self._backend[
286 "awkward_ListOffsetArray_toRegularArray",
287 _size.dtype.type,
288 self._offsets.dtype.type,
289 ](
290 _size.data,
291 self._offsets.data,
292 self._offsets.length,
293 )
294 )
295 size = self._backend.index_nplike.index_as_shape_item(_size[0])
296 length = self._offsets.length - 1
File ~/micromamba/envs/awkward-docs/lib/python3.11/site-packages/awkward/_backends/backend.py:67, in Backend.maybe_kernel_error(self, error)
65 return
66 else:
---> 67 raise ValueError(self.format_kernel_error(error))
ValueError: cannot convert to RegularArray because subarray lengths are not regular (in compiled code: https://github.com/scikit-hep/awkward/blob/awkward-cpp-39/awkward-cpp/src/cpu-kernels/awkward_ListOffsetArray_toRegularArray.cpp#L22)
But arrays with equal-sized lists can:
np.fft.fft(ak.Array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6], [7.7, 8.8, 9.9]]))
[[6.6+0j, -1.65+0.953j, -1.65+-0.953j], [16.5+0j, -1.65+0.953j, -1.65+-0.953j], [26.4+0j, -1.65+0.953j, -1.65+-0.953j]] ---------------------------------------- type: 3 * 3 * complex128