Skip to content

Represent Model._coords values as 1-dim numpy arrays #6589

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 12 additions & 8 deletions pymc/backends/arviz.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@

from pymc.model import Model, modelcontext
from pymc.pytensorf import extract_obs_data
from pymc.util import get_default_varnames
from pymc.util import _as_coord_vals, get_default_varnames

if TYPE_CHECKING:
from pymc.backends.base import MultiTrace # pylint: disable=invalid-name
Expand Down Expand Up @@ -216,15 +216,19 @@ def __init__(
" one of trace, prior, posterior_predictive or predictions."
)

# Make coord types more rigid
untyped_coords: Dict[str, Optional[Sequence[Any]]] = {**self.model.coords}
if coords:
untyped_coords.update(coords)
self.coords = {
cname: np.array(cvals) if isinstance(cvals, tuple) else cvals
for cname, cvals in untyped_coords.items()
given_coords = coords if coords is not None else {}
given_coords_typed = {
cname: _as_coord_vals(cvals)
for cname, cvals in given_coords.items()
if cvals is not None
}
model_coords_typed = {
cname: cvals_typed
for cname, cvals_typed in self.model.coords_typed.items()
if cvals_typed is not None
}
# Coords from argument should have precedence
self.coords = {**model_coords_typed, **given_coords_typed}

self.dims = {} if dims is None else dims
model_dims = {k: list(v) for k, v in self.model.named_vars_to_dims.items()}
Expand Down
6 changes: 3 additions & 3 deletions pymc/backends/mcbackend.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,9 +230,9 @@ def make_runmeta_and_point_fn(
sample_stats.append(svar)

coordinates = [
mcb.Coordinate(dname, mcb.npproto.utils.ndarray_from_numpy(np.array(cvals)))
for dname, cvals in model.coords.items()
if cvals is not None
mcb.Coordinate(dname, mcb.npproto.utils.ndarray_from_numpy(cvals_typed))
for dname, cvals_typed in model.coords_typed.items()
if cvals_typed is not None
]
meta = mcb.RunMeta(
rid=hagelkorn.random(),
Expand Down
4 changes: 2 additions & 2 deletions pymc/distributions/bound.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,8 +195,8 @@ def __new__(

if dims is not None:
model = modelcontext(None)
if dims in model.coords:
dim_obj = np.asarray(model.coords[dims])
if dims in model.coords_typed:
dim_obj = model.coords_typed[dims]
size = dim_obj.shape
else:
raise ValueError("Given dims do not exist in model coordinates.")
Expand Down
71 changes: 42 additions & 29 deletions pymc/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@
UNSET,
WithMemoization,
_add_future_warning_tag,
_as_coord_vals,
get_transformed_name,
get_value_vars_from_user_vars,
get_var_name,
Expand Down Expand Up @@ -986,8 +987,19 @@ def RV_dims(self) -> Dict[str, Tuple[Union[str, None], ...]]:

@property
def coords(self) -> Dict[str, Union[Tuple, None]]:
"""Coordinate values for model dimensions."""
return self._coords
"""Coordinate values (tuple) for model dimensions."""
return {
cname: tuple(cvals) if cvals is not None else None
for cname, cvals in self._coords.items()
}

@property
def coords_typed(self) -> Dict[str, Union[np.ndarray, None]]:
"""Coordinate values (numpy array) for model dimensions."""
return {
cname: cvals.copy() if cvals is not None else None
for cname, cvals in self._coords.items()
}

@property
def dim_lengths(self) -> Dict[str, Variable]:
Expand All @@ -1002,20 +1014,20 @@ def shape_from_dims(self, dims):
if len(set(dims)) != len(dims):
raise ValueError("Can not contain the same dimension name twice.")
for dim in dims:
if dim not in self.coords:
if dim not in self.coords_typed:
raise ValueError(
f"Unknown dimension name '{dim}'. All dimension "
"names must be specified in the `coords` "
"argument of the model or through a pm.Data "
"variable."
)
shape.extend(np.shape(self.coords[dim]))
shape.extend(self.coords_typed[dim].shape)
return tuple(shape)

def add_coord(
self,
name: str,
values: Optional[Sequence] = None,
values: Optional[Union[Sequence, np.ndarray]] = None,
mutable: bool = False,
*,
length: Optional[Union[int, Variable]] = None,
Expand Down Expand Up @@ -1047,12 +1059,10 @@ def add_coord(
f"Either `values` or `length` must be specified for the '{name}' dimension."
)
if values is not None:
# Conversion to a tuple ensures that the coordinate values are immutable.
# Also unlike numpy arrays the's tuple.index(...) which is handy to work with.
values = tuple(values)
if name in self.coords:
if not np.array_equal(values, self.coords[name]):
raise ValueError(f"Duplicate and incompatible coordinate: {name}.")
# Conversion to numpy array to ensure coord vals are 1-dim
values = _as_coord_vals(values)
if name in self.coords_typed and not np.array_equal(values, self.coords_typed[name]):
raise ValueError(f"Duplicate and incompatible coordinate: {name}.")
if length is not None and not isinstance(length, (int, Variable)):
raise ValueError(
f"The `length` passed for the '{name}' coord must be an int, PyTensor Variable or None."
Expand All @@ -1069,7 +1079,7 @@ def add_coord(

def add_coords(
self,
coords: Dict[str, Optional[Sequence]],
coords: Dict[str, Optional[Union[Sequence, np.ndarray]]],
*,
lengths: Optional[Dict[str, Optional[Union[int, Variable]]]] = None,
):
Expand All @@ -1081,7 +1091,9 @@ def add_coords(
for name, values in coords.items():
self.add_coord(name, values, length=lengths.get(name, None))

def set_dim(self, name: str, new_length: int, coord_values: Optional[Sequence] = None):
def set_dim(
self, name: str, new_length: int, coord_values: Optional[Union[Sequence, np.ndarray]] = None
):
"""Update a mutable dimension.

Parameters
Expand All @@ -1095,7 +1107,7 @@ def set_dim(self, name: str, new_length: int, coord_values: Optional[Sequence] =
"""
if not isinstance(self.dim_lengths[name], ScalarSharedVariable):
raise ValueError(f"The dimension '{name}' is immutable.")
if coord_values is None and self.coords.get(name, None) is not None:
if coord_values is None and self.coords_typed.get(name, None) is not None:
raise ValueError(
f"'{name}' has coord values. Pass `set_dim(..., coord_values=...)` to update them."
)
Expand All @@ -1107,7 +1119,7 @@ def set_dim(self, name: str, new_length: int, coord_values: Optional[Sequence] =
actual=len_cvals,
expected=new_length,
)
self._coords[name] = tuple(coord_values)
self._coords[name] = _as_coord_vals(coord_values)
self.dim_lengths[name].set_value(new_length)
return

Expand Down Expand Up @@ -1151,7 +1163,7 @@ def set_data(
self,
name: str,
values: Dict[str, Optional[Sequence]],
coords: Optional[Dict[str, Sequence]] = None,
coords: Optional[Dict[str, Union[Sequence, np.ndarray]]] = None,
):
"""Changes the values of a data variable in the model.

Expand Down Expand Up @@ -1181,7 +1193,7 @@ def set_data(
values = np.array(values)
values = convert_observed_data(values)
dims = self.named_vars_to_dims.get(name, None) or ()
coords = coords or {}
coords_untyped = coords or {}

if values.ndim != shared_object.ndim:
raise ValueError(
Expand All @@ -1192,19 +1204,21 @@ def set_data(
length_tensor = self.dim_lengths[dname]
old_length = length_tensor.eval()
new_length = values.shape[d]
original_coords = self.coords.get(dname, None)
new_coords = coords.get(dname, None)
original_coord_vals = self.coords_typed.get(dname, None)
new_coord_vals = coords_untyped.get(dname, None)
if new_coord_vals is not None:
new_coord_vals = _as_coord_vals(new_coord_vals)

length_changed = new_length != old_length

# Reject resizing if we already know that it would create shape problems.
# NOTE: If there are multiple pm.MutableData containers sharing this dim, but the user only
# changes the values for one of them, they will run into shape problems nonetheless.
if length_changed:
if original_coords is not None:
if new_coords is None:
if original_coord_vals is not None:
if new_coord_vals is None:
raise ValueError(
f"The '{name}' variable already had {len(original_coords)} coord values defined for "
f"The '{name}' variable already had {len(original_coord_vals)} coord values defined for "
f"its {dname} dimension. With the new values this dimension changes to length "
f"{new_length}, so new coord values for the {dname} dimension are required."
)
Expand Down Expand Up @@ -1268,18 +1282,17 @@ def set_data(
if isinstance(length_tensor, ScalarSharedVariable):
# The dimension is mutable, but was defined without being linked
# to a shared variable. This is allowed, but a little less robust.
self.set_dim(dname, new_length, coord_values=new_coords)
self.set_dim(dname, new_length, coord_values=new_coord_vals)

if new_coords is not None:
if new_coord_vals is not None:
# Update the registered coord values (also if they were None)
if len(new_coords) != new_length:
if len(new_coord_vals) != new_length:
raise ShapeError(
f"Length of new coordinate values for dimension '{dname}' does not match the provided values.",
actual=len(new_coords),
actual=len(new_coord_vals),
expected=new_length,
)
# store it as tuple for immutability as in add_coord
self._coords[dname] = tuple(new_coords)
self._coords[dname] = _as_coord_vals(new_coord_vals)

shared_object.set_value(values)

Expand Down Expand Up @@ -1550,7 +1563,7 @@ def add_named_variable(self, var, dims: Optional[Tuple[Union[str, None], ...]] =
if isinstance(dims, str):
dims = (dims,)
for dim in dims:
if dim not in self.coords and dim is not None:
if dim not in self.coords_typed and dim is not None:
raise ValueError(f"Dimension {dim} is not specified in `coords`.")
if any(var.name == dim for dim in dims if dim is not None):
raise ValueError(f"Variable `{var.name}` has the same name as its dimension label.")
Expand Down
2 changes: 1 addition & 1 deletion pymc/sampling/forward.py
Original file line number Diff line number Diff line change
Expand Up @@ -575,7 +575,7 @@ def sample_posterior_predictive(

constant_coords = set()
for dim, coord in trace_coords.items():
current_coord = model.coords.get(dim, None)
current_coord = model.coords_typed.get(dim, None)
if (
current_coord is not None
and len(coord) == len(current_coord)
Expand Down
12 changes: 6 additions & 6 deletions pymc/sampling/jax.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,9 +384,9 @@ def sample_blackjax_nuts(
vars_to_sample = list(get_default_varnames(var_names, include_transformed=keep_untransformed))

coords = {
cname: np.array(cvals) if isinstance(cvals, tuple) else cvals
for cname, cvals in model.coords.items()
if cvals is not None
cname: cvals_typed
for cname, cvals_typed in model.coords_typed.items()
if cvals_typed is not None
}

dims = {
Expand Down Expand Up @@ -605,9 +605,9 @@ def sample_numpyro_nuts(
vars_to_sample = list(get_default_varnames(var_names, include_transformed=keep_untransformed))

coords = {
cname: np.array(cvals) if isinstance(cvals, tuple) else cvals
for cname, cvals in model.coords.items()
if cvals is not None
cname: cvals_typed
for cname, cvals_typed in model.coords_typed.items()
if cvals_typed is not None
}

dims = {
Expand Down
7 changes: 1 addition & 6 deletions pymc/stats/log_likelihood.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@
# limitations under the License.
from typing import Optional, Sequence, cast

import numpy as np

from arviz import InferenceData, dict_to_dataset
from fastprogress import progress_bar

Expand Down Expand Up @@ -117,10 +115,7 @@ def compute_log_likelihood(
loglike_trace,
library=pymc,
dims={dname: list(dvals) for dname, dvals in model.named_vars_to_dims.items()},
coords={
cname: np.array(cvals) if isinstance(cvals, tuple) else cvals
for cname, cvals in model.coords.items()
},
coords={cname: cvals_typed for cname, cvals_typed in model.coords_typed.items()},
default_dims=list(sample_dims),
skip_event_dims=True,
)
Expand Down
19 changes: 19 additions & 0 deletions pymc/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
from pytensor.compile import SharedVariable
from pytensor.graph.utils import ValidatingScratchpad

from pymc.exceptions import ShapeError


class _UnsetType:
"""Type for the `UNSET` object to make it look nice in `help(...)` outputs."""
Expand Down Expand Up @@ -510,3 +512,20 @@ def _add_future_warning_tag(var) -> None:
for k, v in old_tag.__dict__.items():
new_tag.__dict__.setdefault(k, v)
var.tag = new_tag


def _as_coord_vals(values: Union[Sequence, np.ndarray]) -> np.ndarray:
"""Coerce a sequence of coordinate values into a 1-dim array"""
if isinstance(values, np.ndarray):
if values.ndim != 1:
raise ShapeError(
"Coordinate values passed as a numpy array must be 1-dimensional",
actual=values.ndim,
expected=1,
)
return values
arr = np.array(values)
if arr.ndim > 1:
arr = np.empty(len(values), dtype="O")
arr[:] = values
return arr
2 changes: 1 addition & 1 deletion pymc/variational/opvi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1135,7 +1135,7 @@ def var_to_data(self, shared: at.TensorVariable) -> xarray.Dataset:
for name, s, shape, dtype in self.ordering.values():
dims = self.model.named_vars_to_dims.get(name, None)
if dims is not None:
coords = {d: np.array(self.model.coords[d]) for d in dims}
coords = {d: self.model.coords_typed[d] for d in dims}
else:
coords = None
values = shared_nda[s].reshape(shape).astype(dtype)
Expand Down
33 changes: 24 additions & 9 deletions tests/backends/test_arviz.py
Original file line number Diff line number Diff line change
Expand Up @@ -630,15 +630,30 @@ def test_issue_5043_autoconvert_coord_values(self):
# We're not automatically converting things other than tuple,
# so advanced use cases remain supported at the InferenceData level.
# They just can't be used in the model construction already.
converter = InferenceDataConverter(
trace=mtrace,
coords={
"city": pd.MultiIndex.from_tuples(
[("Bonn", 53111), ("Berlin", 10178)], names=["name", "zipcode"]
)
},
)
assert isinstance(converter.coords["city"], pd.MultiIndex)
# TODO: we now ensure that everything passed to arviz is converted to
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Flagging this for discussion. This PR currently updates InferenceDataConverter to always convert coordinate values to ndarray. (Previously they were only converted if the sequence was a tuple.) Is there a reason we would want to maintain the original behavior...?

This test was originally made for #5043

# ndarray, which makes the following test fail.
# converter = InferenceDataConverter(
# trace=mtrace,
# coords={
# "city": pd.MultiIndex.from_tuples(
# [("Bonn", 53111), ("Berlin", 10178)], names=["name", "zipcode"]
# )
# },
# )
# assert isinstance(converter.coords["city"], pd.MultiIndex)

def test_nested_coords_issue_6496(self):
"""Regression test to ensure we don't bug out if coordinate values
appear "nested" to numpy.
"""
model = pm.Model(coords={"cname": [("a", 1), ("a", 2), ("b", 1)]})
idata = to_inference_data(
prior={"x": np.zeros((100, 3))}, dims={"x": ["cname"]}, model=model
)
idata_coord = idata.prior.coords["cname"]
assert len(idata_coord) == 3
assert idata_coord.dtype == np.dtype("O")
assert np.array_equal(idata_coord.data, model.coords_typed["cname"])

def test_variable_dimension_name_collision(self):
with pytest.raises(ValueError, match="same name as its dimension"):
Expand Down
2 changes: 1 addition & 1 deletion tests/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -733,7 +733,7 @@ def test_nested_model_coords():
c = pm.HalfNormal("c", dims="dim3")
d = pm.Normal("d", b, c, dims="dim2")
e = pm.Normal("e", a[None] + d[:, None], dims=("dim2", "dim1"))
assert m1.coords is m2.coords
assert m1.coords == m2.coords
assert m1.dim_lengths is m2.dim_lengths
assert set(m2.named_vars_to_dims) < set(m1.named_vars_to_dims)

Expand Down