diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a4d26a6adb..ba5308fcf5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -39,7 +39,7 @@ repos: - id: black - id: black-jupyter - repo: https://github.com/PyCQA/pylint - rev: v2.17.0 + rev: v2.17.1 hooks: - id: pylint args: [--rcfile=.pylintrc] diff --git a/pymc/data.py b/pymc/data.py index 71ca2439fa..1f2aa38227 100644 --- a/pymc/data.py +++ b/pymc/data.py @@ -36,7 +36,7 @@ import pymc as pm -from pymc.pytensorf import convert_observed_data +from pymc.pytensorf import convert_observed_data, unmask_masked_data __all__ = [ "get_data", @@ -419,10 +419,20 @@ def Data( ) name = model.name_for(name) + if isinstance(value, np.ma.MaskedArray): + warnings.warn( + "If possible, masked arrays will be converted to standard numpy arrays with np.nan values for compatibility with PyTensor." + ) + # `convert_observed_data` takes care of parameter `value` and # transforms it to something digestible for PyTensor. arr = convert_observed_data(value) + # because converted_observed_data() is also used outside pyTensor, we need an extra step to ensure that any masked arrays + # produced by it are converted back to np.ndarray() with np.nan value. + # This is not very efficient and will not be necessary once pyTensor implements MaskedArray support + arr = unmask_masked_data(arr) + if mutable is None: warnings.warn( "The `mutable` kwarg was not specified. Before v4.1.0 it defaulted to `pm.Data(mutable=True)`," diff --git a/pymc/model.py b/pymc/model.py index 6b7b5d142f..609b035b1d 100644 --- a/pymc/model.py +++ b/pymc/model.py @@ -75,6 +75,7 @@ hessian, inputvars, replace_rvs_by_values, + unmask_masked_data, ) from pymc.util import ( UNSET, @@ -1184,7 +1185,19 @@ def set_data( if isinstance(values, list): values = np.array(values) + + if isinstance(values, np.ma.MaskedArray): + warnings.warn( + "If possible, masked arrays will be converted to standard numpy arrays with np.nan values for compatibility with PyTensor." + ) + values = convert_observed_data(values) + + # because converted_observed_data() is also used outside pyTensor, we need an extra step to ensure that any masked arrays + # produced by it are converted back to np.ndarray() with np.nan value. + # This is not very efficient and will not be necessary once pyTensor implements MaskedArray support + values = unmask_masked_data(values) + dims = self.named_vars_to_dims.get(name, None) or () coords = coords or {} @@ -1397,7 +1410,18 @@ def make_obs_var( else: rv_var.tag.test_value = data - mask = getattr(data, "mask", None) + # In case observed is a pyTensor variable, we need to retrieve the underlying array + # and convert it to a masked one to enable automatic imputation + if isinstance(data, SharedVariable): + values = data.get_value() + mask = np.isnan(values) if np.any(np.isnan(values)) else None + elif isinstance(data, TensorConstant): + values = data.data + mask = np.isnan(values) if np.any(np.isnan(values)) else None + else: + mask = getattr(data, "mask", None) + values = data + if mask is not None: impute_message = ( f"Data in {rv_var} contains missing values and" @@ -1443,7 +1467,7 @@ def make_obs_var( # values, and another for the non-missing values. antimask_idx = (~mask).nonzero() - nonmissing_data = pt.as_tensor_variable(data[antimask_idx]) + nonmissing_data = pt.as_tensor_variable(values[antimask_idx]) unmasked_rv_var = rv_var[antimask_idx] unmasked_rv_var = unmasked_rv_var.owner.clone().default_output() @@ -1466,7 +1490,7 @@ def make_obs_var( # Create deterministic that combines observed and missing # Note: This can widely increase memory consumption during sampling for large datasets - rv_var = pt.empty(data.shape, dtype=observed_rv_var.type.dtype) + rv_var = pt.empty(values.shape, dtype=observed_rv_var.type.dtype) rv_var = pt.set_subtensor(rv_var[mask.nonzero()], missing_rv_var) rv_var = pt.set_subtensor(rv_var[antimask_idx], observed_rv_var) rv_var = Deterministic(name, rv_var, self, dims) diff --git a/pymc/pytensorf.py b/pymc/pytensorf.py index 3c9cb945be..8ab353eebe 100644 --- a/pymc/pytensorf.py +++ b/pymc/pytensorf.py @@ -84,11 +84,33 @@ "make_shared_replacements", "generator", "convert_observed_data", + "unmask_masked_data", "compile_pymc", "constant_fold", ] +def unmask_masked_data(data): + """Unmask masked numpy arrays for usage within PyTensor""" + + # PyTensor currently does not support masked arrays + # If a masked array is passed, we convert it to a standard numpy array with np.nans for float type arrays + # In case of integer type arrays, we throw an error as np.nan is a float concept. + + if isinstance(data, np.ma.MaskedArray): + if "int" in str(data.dtype): + raise TypeError( + "Masked integer arrays (integer type datasets with missing values) are not supported by pm.Data() / pm.Model.set_data() at this time. \n" + "Consider if using a float type fits your use case. \n" + "Alternatively, if you want to benefit from automatic imputation in pyMC, pass a masked array directly to `observed=` parameter when defining a distribution." + ) + else: + ret = data.filled(fill_value=np.nan) + else: + ret = data + return ret + + def convert_observed_data(data): """Convert user provided dataset to accepted formats.""" diff --git a/tests/test_data.py b/tests/test_data.py index 6db3b50875..ff44a3fcb4 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -454,6 +454,33 @@ def test_get_data(): assert type(data) == io.BytesIO +def test_masked_data_mutable(): + with pm.Model(): + data = np.ma.MaskedArray([1.0, 2.0, 3], [0, 0, 1]) + expected = np.array([1, 2, np.nan]) + with pytest.warns(UserWarning, match="masked arrays"): + result = pm.MutableData("test", data).get_value() + np.testing.assert_array_equal(result, expected) + + +def test_masked_data_constant(): + with pm.Model(): + data = np.ma.MaskedArray([1.0, 2.0, 3], [0, 0, 1]) + expected = np.array([1, 2, np.nan]) + with pytest.warns(UserWarning, match="masked arrays"): + result = pm.ConstantData("test", data).data + np.testing.assert_array_equal(result, expected) + + +def test_masked_integer_data(): + with pm.Model(): + data = np.ma.MaskedArray([1, 2, 3], [0, 0, 1]) + with pytest.raises(TypeError, match="Masked integer"): + pm.ConstantData("test", data) + with pytest.raises(TypeError, match="Masked integer"): + pm.MutableData("test", data) + + class _DataSampler: """ Not for users diff --git a/tests/test_model.py b/tests/test_model.py index c6b176af96..36d895426f 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -530,7 +530,9 @@ def test_make_obs_var(): input_name = "testing_inputs" sparse_input = sps.csr_matrix(np.eye(3)) dense_input = np.arange(9).reshape((3, 3)) - masked_array_input = ma.array(dense_input, mask=(np.mod(dense_input, 2) == 0)) + masked_array_input = ma.array(dense_input, mask=(np.mod(dense_input, 2) == 0), dtype=float) + shared_tensor = pytensor.shared(masked_array_input.filled(fill_value=np.nan)) + constant_tensor = pt.as_tensor_variable(masked_array_input.filled(fill_value=np.nan)) # Create a fake model and fake distribution to be used for the test fake_model = pm.Model() @@ -558,18 +560,18 @@ def test_make_obs_var(): del fake_model.named_vars[fake_distribution.name] # Here the RandomVariable is split into observed/imputed and a Deterministic is returned - with pytest.warns(ImputationWarning): - masked_output = fake_model.make_obs_var( - fake_distribution, masked_array_input, None, None, None - ) - assert masked_output != fake_distribution - assert not isinstance(masked_output, RandomVariable) - # Ensure it has missing values - assert {"testing_inputs_missing"} == {v.name for v in fake_model.value_vars} - assert {"testing_inputs", "testing_inputs_observed"} == { - v.name for v in fake_model.observed_RVs - } - del fake_model.named_vars[fake_distribution.name] + # behavior should be identical with all inputs + for data_input in [masked_array_input, shared_tensor, constant_tensor]: + with pytest.warns(ImputationWarning): + masked_output = fake_model.make_obs_var(fake_distribution, data_input, None, None, None) + assert masked_output != fake_distribution + assert not isinstance(masked_output, RandomVariable) + # Ensure it has missing values + assert {input_name + "_missing"} == {v.name for v in fake_model.value_vars} + assert {input_name, input_name + "_observed"} == {v.name for v in fake_model.observed_RVs} + del fake_model.named_vars[fake_distribution.name] + del fake_model.named_vars[fake_distribution.name + "_missing"] + del fake_model.named_vars[fake_distribution.name + "_observed"] # Test that setting total_size returns a MinibatchRandomVariable scaled_outputs = fake_model.make_obs_var( @@ -967,6 +969,27 @@ def test_set_data_constant_shape_error(): pmodel.set_data("y", np.arange(10)) +def test_set_data_masked_array(): + data = np.ma.MaskedArray([1.0, 2.0, 3], [0, 0, 1]) + + with pm.Model() as pmodel: + D = pm.MutableData("test", np.zeros(4)) + + with pytest.warns(UserWarning, match="masked arrays"): + pmodel.set_data("test", data) + result = D.get_value() + expected = np.array([1.0, 2.0, np.nan]) + np.testing.assert_array_equal(result, expected) + + +def test_set_data_masked_integer_array(): + with pm.Model() as pmodel: + D = pm.MutableData("test", np.zeros(4)) + with pytest.warns(UserWarning, match="masked arrays"): + with pytest.raises(TypeError, match="Masked integer"): + pmodel.set_data("test", np.ma.MaskedArray([1, 2, 3], [0, 0, 1])) + + def test_model_deprecation_warning(): with pm.Model() as m: x = pm.Normal("x", 0, 1, size=2) @@ -1204,7 +1227,7 @@ def test_profile_count(self): @pytest.fixture(params=["masked", "pandas"]) def missing_data(request): if request.param == "masked": - return np.ma.masked_values([1, 2, -1, 4, -1], value=-1) + return np.ma.masked_values([1, 2, -1, 4.0, -1], value=-1) else: # request.param == "pandas" pd = pytest.importorskip("pandas") @@ -1214,11 +1237,20 @@ def missing_data(request): class TestImputationMissingData: "Test for Missing Data imputation" - def test_missing_basic(self, missing_data): + def convert_type(self, X, format, name=None): + if format == "numpy": + return X + if format == "MutableData": + return pm.MutableData(format if name is None else name, X) + if format == "ConstantData": + return pm.ConstantData(format if name is None else name, X) + + @pytest.mark.parametrize("observed_format", ["numpy", "MutableData", "ConstantData"]) + def test_missing_basic(self, missing_data, observed_format): with pm.Model() as model: x = pm.Normal("x", 1, 1) with pytest.warns(ImputationWarning): - _ = pm.Normal("y", x, 1, observed=missing_data) + _ = pm.Normal("y", x, 1, observed=self.convert_type(missing_data, observed_format)) assert "y_missing" in model.named_vars @@ -1229,13 +1261,16 @@ def test_missing_basic(self, missing_data): ipr = pm.sample_prior_predictive() assert {"x", "y"} <= set(ipr.prior.keys()) - def test_missing_with_predictors(self): + @pytest.mark.parametrize("observed_format", ["numpy", "MutableData", "ConstantData"]) + def test_missing_with_predictors(self, observed_format): predictors = np.array([0.5, 1, 0.5, 2, 0.3]) - data = np.ma.masked_values([1, 2, -1, 4, -1], value=-1) + data = np.ma.masked_values([1, 2, -1, 4.0, -1], value=-1) with pm.Model() as model: x = pm.Normal("x", 1, 1) with pytest.warns(ImputationWarning): - y = pm.Normal("y", x * predictors, 1, observed=data) + y = pm.Normal( + "y", x * predictors, 1, observed=self.convert_type(data, observed_format) + ) assert "y_missing" in model.named_vars @@ -1246,17 +1281,26 @@ def test_missing_with_predictors(self): ipr = pm.sample_prior_predictive() assert {"x", "y"} <= set(ipr.prior.keys()) - def test_missing_dual_observations(self): + @pytest.mark.parametrize("observed_format", ["numpy", "MutableData", "ConstantData"]) + def test_missing_dual_observations(self, observed_format): with pm.Model() as model: - obs1 = np.ma.masked_values([1, 2, -1, 4, -1], value=-1) - obs2 = np.ma.masked_values([-1, -1, 6, -1, 8], value=-1) + obs1 = np.ma.masked_values([1, 2, -1, 4.0, -1], value=-1) + obs2 = np.ma.masked_values([-1, -1, 6, -1.0, 8], value=-1) beta1 = pm.Normal("beta1", 1, 1) beta2 = pm.Normal("beta2", 2, 1) latent = pm.Normal("theta", size=5) with pytest.warns(ImputationWarning): - ovar1 = pm.Normal("o1", mu=beta1 * latent, observed=obs1) + ovar1 = pm.Normal( + "o1", + mu=beta1 * latent, + observed=self.convert_type(obs1, observed_format, "obs1"), + ) with pytest.warns(ImputationWarning): - ovar2 = pm.Normal("o2", mu=beta2 * latent, observed=obs2) + ovar2 = pm.Normal( + "o2", + mu=beta2 * latent, + observed=self.convert_type(obs2, observed_format, "obs2"), + ) prior_trace = pm.sample_prior_predictive(return_inferencedata=False) assert {"beta1", "beta2", "theta", "o1", "o2"} <= set(prior_trace.keys()) @@ -1265,17 +1309,22 @@ def test_missing_dual_observations(self): warnings.filterwarnings("ignore", ".*number of samples.*", UserWarning) trace = pm.sample(chains=1, tune=5, draws=50) - def test_interval_missing_observations(self): + @pytest.mark.parametrize("observed_format", ["numpy", "MutableData", "ConstantData"]) + def test_interval_missing_observations(self, observed_format): rng = np.random.default_rng(1198) with pm.Model() as model: - obs1 = np.ma.masked_values([1, 2, -1, 4, -1], value=-1) - obs2 = np.ma.masked_values([-1, -1, 6, -1, 8], value=-1) + obs1 = np.ma.masked_values([1, 2, -1, 4.0, -1], value=-1) + obs2 = np.ma.masked_values([-1, -1, 6, -1, 8.0], value=-1) with pytest.warns(ImputationWarning): - theta1 = pm.Uniform("theta1", 0, 5, observed=obs1) + theta1 = pm.Uniform( + "theta1", 0, 5, observed=self.convert_type(obs1, observed_format, "obs1") + ) with pytest.warns(ImputationWarning): - theta2 = pm.Normal("theta2", mu=theta1, observed=obs2) + theta2 = pm.Normal( + "theta2", mu=theta1, observed=self.convert_type(obs2, observed_format, "obs2") + ) assert isinstance(model.rvs_to_transforms[model["theta1_missing"]], IntervalTransform) assert model.rvs_to_transforms[model["theta1_observed"]] is None @@ -1366,7 +1415,8 @@ def test_interval_missing_observations(self): np.mean(pp_trace["theta2"][:, ~obs2.mask] - pp_trace["theta2_observed"]), 0 ) - def test_missing_logp1(self): + @pytest.mark.parametrize("observed_format", ["numpy", "MutableData", "ConstantData"]) + def test_missing_logp1(self, observed_format): with pm.Model(check_bounds=False) as m1: x = pm.Gamma("x", 1, 1, size=4) @@ -1375,12 +1425,15 @@ def test_missing_logp1(self): with pm.Model(check_bounds=False) as m2: with pytest.warns(ImputationWarning): - x = pm.Gamma("x", 1, 1, observed=[1, 1, 1, np.nan]) + x = pm.Gamma( + "x", 1, 1, observed=self.convert_type([1.0, 1, 1, np.nan], observed_format) + ) logp_val = m2.compile_logp()({"x_missing_log__": np.array([0])}) assert logp_val == -4.0 - def test_missing_logp2(self): + @pytest.mark.parametrize("observed_format", ["numpy", "MutableData", "ConstantData"]) + def test_missing_logp2(self, observed_format): with pm.Model() as m: theta1 = pm.Normal("theta1", 0, 5, observed=[0, 1, 2, 3, 4]) theta2 = pm.Normal("theta2", mu=theta1, observed=[0, 1, 2, 3, 4]) @@ -1388,9 +1441,20 @@ def test_missing_logp2(self): with pm.Model() as m_missing: with pytest.warns(ImputationWarning): - theta1 = pm.Normal("theta1", 0, 5, observed=np.array([0, 1, np.nan, 3, np.nan])) + theta1 = pm.Normal( + "theta1", + 0, + 5, + observed=self.convert_type( + np.array([0, 1, np.nan, 3, np.nan]), observed_format, "obs1" + ), + ) theta2 = pm.Normal( - "theta2", mu=theta1, observed=np.array([np.nan, np.nan, 2, np.nan, 4]) + "theta2", + mu=theta1, + observed=self.convert_type( + np.array([np.nan, np.nan, 2, np.nan, 4]), observed_format, "obs2" + ), ) m_missing_logp = m_missing.compile_logp()( {"theta1_missing": [2, 4], "theta2_missing": [0, 1, 3]} @@ -1398,7 +1462,8 @@ def test_missing_logp2(self): assert m_logp == m_missing_logp - def test_missing_multivariate(self): + @pytest.mark.parametrize("observed_format", ["numpy", "MutableData", "ConstantData"]) + def test_missing_multivariate(self, observed_format): """Test model with missing variables whose transform changes base shape still works""" with pm.Model() as m_miss: @@ -1410,7 +1475,9 @@ def test_missing_multivariate(self): x = pm.Dirichlet( "x", a=[1, 2, 3], - observed=np.array([[0.3, 0.3, 0.4], [np.nan, np.nan, np.nan]]), + observed=self.convert_type( + np.array([[0.3, 0.3, 0.4], [np.nan, np.nan, np.nan]]), observed_format + ), ) # TODO: Test can be used when local_subtensor_rv_lift supports multivariate distributions @@ -1425,14 +1492,17 @@ def test_missing_multivariate(self): # m_unobs.compile_logp(jacobian=False)({"x_simplex__": inp_vals}) * 2, # ) - def test_missing_vector_parameter(self): + @pytest.mark.parametrize("observed_format", ["numpy", "MutableData", "ConstantData"]) + def test_missing_vector_parameter(self, observed_format): with pm.Model() as m: with pytest.warns(ImputationWarning): x = pm.Normal( "x", np.array([-10, 10]), 0.1, - observed=np.array([[np.nan, 10], [-10, np.nan], [np.nan, np.nan]]), + observed=self.convert_type( + np.array([[np.nan, 10], [-10, np.nan], [np.nan, np.nan]]), observed_format + ), ) x_draws = x.eval() assert x_draws.shape == (3, 2) @@ -1443,7 +1513,8 @@ def test_missing_vector_parameter(self): st.norm(scale=0.1).logpdf(0) * 6, ) - def test_missing_symmetric(self): + @pytest.mark.parametrize("observed_format", ["numpy", "MutableData", "ConstantData"]) + def test_missing_symmetric(self, observed_format): """Check that logp works when partially observed variable have equal observed and unobserved dimensions. @@ -1453,7 +1524,12 @@ def test_missing_symmetric(self): """ with pm.Model() as m: with pytest.warns(ImputationWarning): - x = pm.Gamma("x", alpha=3, beta=10, observed=np.array([1, np.nan])) + x = pm.Gamma( + "x", + alpha=3, + beta=10, + observed=self.convert_type(np.array([1, np.nan]), observed_format), + ) x_obs_rv = m["x_observed"] x_obs_vv = m.rvs_to_values[x_obs_rv] @@ -1470,21 +1546,23 @@ def test_missing_symmetric(self): assert x_obs_vv in logp_inputs assert x_unobs_vv in logp_inputs - def test_dims(self): + @pytest.mark.parametrize("observed_format", ["numpy", "MutableData", "ConstantData"]) + def test_dims(self, observed_format): """Test that we don't propagate dims to the subcomponents of a partially observed RV See https://github.com/pymc-devs/pymc/issues/6177 """ - data = np.array([np.nan] * 3 + [0] * 7) with pm.Model(coords={"observed": range(10)}) as model: + data = self.convert_type(np.array([np.nan] * 3 + [0] * 7), observed_format) with pytest.warns(ImputationWarning): x = pm.Normal("x", observed=data, dims=("observed",)) assert model.named_vars_to_dims == {"x": ("observed",)} - def test_error_non_random_variable(self): - data = np.array([np.nan] * 3 + [0] * 7) + @pytest.mark.parametrize("observed_format", ["numpy", "MutableData", "ConstantData"]) + def test_error_non_random_variable(self, observed_format): with pm.Model() as model: + data = self.convert_type(np.array([np.nan] * 3 + [0] * 7), observed_format) msg = "x of type is not supported" with pytest.raises(NotImplementedError, match=msg): x = pm.Censored( diff --git a/tests/test_pytensorf.py b/tests/test_pytensorf.py index 009eb88619..1a54b74057 100644 --- a/tests/test_pytensorf.py +++ b/tests/test_pytensorf.py @@ -49,6 +49,7 @@ replace_rvs_by_values, reseed_rngs, rvs_to_value_vars, + unmask_masked_data, walk_model, ) from pymc.testing import assert_no_rvs @@ -269,6 +270,25 @@ def test_convert_observed_data(input_dtype): assert isinstance(wrapped, TensorVariable) +def test_unmask_masked_data(): + # test with non-masked data + data = np.array([1, 2, 3]) + result = unmask_masked_data(data) + expected = np.array([1, 2, 3]) + np.testing.assert_array_equal(result, expected) + + # test with masked float data + data = np.ma.MaskedArray([1.0, 2.0, 3.0], [0, 0, 1]) + result = unmask_masked_data(data) + expected = np.array([1.0, 2.0, np.nan]) + np.testing.assert_array_equal(result, expected) + + # test with integer masked data + data = np.ma.MaskedArray([1, 2, 3], [0, 0, 1]) + with pytest.raises(TypeError, match="Masked integer"): + unmask_masked_data(data) + + def test_pandas_to_array_pandas_index(): data = pd.Index([1, 2, 3]) result = convert_observed_data(data)