@@ -522,6 +522,9 @@ def eval_logp(x):
     vec_eval_logp = pt.vectorize(eval_logp, "()->()")
     logp_init = vec_eval_logp(domain)
 
+    # Construct logp in two steps
+    # Step 1: Compute the probability of the data ("emissions") under every possible state (vec_logp_emission)
+
     # This will break the dependency between chain and the init_dist_ random variable
     # TODO: Make this comment more robust after I understand better.
     chain_dummy = chain_rv.clone()
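For intuition, here is a minimal NumPy sketch of what the `"()->()"` vectorization above does (the toy `eval_logp` is hypothetical, not the PR's, and `np.vectorize` stands in for `pt.vectorize`): it lifts a scalar-in, scalar-out logp evaluator so it broadcasts over the whole state `domain` at once.

```python
import numpy as np

# Hypothetical scalar logp: standard-normal log-density at a single point
def eval_logp(x):
    return -0.5 * x**2 - 0.5 * np.log(2 * np.pi)

# "()->()" declares a scalar-to-scalar core signature; the wrapper then
# broadcasts elementwise over any array input
vec_eval_logp = np.vectorize(eval_logp, signature="()->()")

domain = np.arange(3.0)        # toy stand-in for the chain's state domain
print(vec_eval_logp(domain))   # one logp per state, shape (3,)
```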
@@ -534,19 +537,25 @@ def eval_logp(x):
         chain_dummy: pt.moveaxis(pt.broadcast_to(domain, (*values[0].shape, domain.size)), -1, 0)
     }
 
-    # TODO: @Ricardo: If you don't concatenate here, you get -inf in the logp (why?)
-    # TODO: I'm stacking the results (adds a batch dim to the left) and summing away the batch dim == joint probability?
-    vec_logp_emission = pt.stack(vectorize_graph(tuple(logp_value_dict.values()), sub_dict)).sum(
-        axis=0
-    )
+    # This is a (k, T) matrix of logp terms, one for each state-observation pair
+    vec_logp_emission = vectorize_graph(tuple(logp_value_dict.values()), sub_dict)
 
+    # Step 2: Compute the transition probabilities
+    # This is the "forward algorithm", alpha_t = sum(p(s_t | s_{t-1}) * alpha_{t-1})
+    # We do it entirely in logs, though.
     log_alpha_seq, _ = scan(
         step_alpha, non_sequences=[pt.log(P_)], outputs_info=[logp_init], n_steps=n_steps_
     )
 
+    # Scan works over the T dimension, so the output is (T, k). We need to swap to (k, T)
     log_alpha_seq = pt.moveaxis(pt.concatenate([logp_init[None], log_alpha_seq], axis=0), -1, 0)
-    joint_log_obs_given_states = pt.logsumexp(pt.add(vec_logp_emission) + log_alpha_seq, axis=0)
 
-    # We have to add dummy logps for the remaining value variables, otherwise PyMC will raise
-    dummy_logps = (pt.constant(0.0),) * (len(values) - 1)
-    return joint_log_obs_given_states, dummy_logps
+    # The final logp is the logsumexp over states of the emission logps plus the transition logps.
+    # pt.add is used in case there are multiple emissions that depend on the same markov chain; in that case, we
+    # compute the joint probability of seeing everything together.
+    joint_log_obs_given_states = pt.logsumexp(pt.add(*vec_logp_emission) + log_alpha_seq, axis=0)
+
+    # If there are multiple emission streams, we have to add dummy logps for the remaining value variables. The first
+    # return is the joint probability of everything together, but PyMC still expects one logp for each one.
+    dummy_logps = (pt.constant(np.zeros((4,))),) * (len(values) - 1)
+    return joint_log_obs_given_states, *dummy_logps
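To make the two steps concrete, here is a self-contained NumPy sketch of the log-space forward recursion that the `scan` implements, followed by the final `logsumexp` marginalization (all numbers and names are toy stand-ins, not the PR's actual graph):

```python
import numpy as np
from scipy.special import logsumexp

# Toy HMM with k = 2 hidden states and T = 3 observations
logp_init = np.log(np.array([0.5, 0.5]))           # log prior over states
log_P = np.log(np.array([[0.9, 0.1],
                         [0.2, 0.8]]))             # log transition matrix P[s_{t-1}, s_t]
log_emission = np.log(np.array([[0.7, 0.6, 0.1],   # (k, T): log p(obs_t | state)
                                [0.3, 0.4, 0.9]]))

# Forward recursion, entirely in logs:
# log_alpha_t(j) = logsumexp_i(log_alpha_{t-1}(i) + log P[i, j])
T = log_emission.shape[1]
log_alphas = [logp_init]
for _ in range(T - 1):
    log_alphas.append(logsumexp(log_alphas[-1][:, None] + log_P, axis=0))
log_alpha_seq = np.stack(log_alphas, axis=1)       # (k, T), like the moveaxis above

# Marginalize the hidden state at each step: logsumexp over the k axis of
# emission logps plus state logps
joint_log_obs_given_states = logsumexp(log_emission + log_alpha_seq, axis=0)
print(joint_log_obs_given_states)                  # one logp term per observation
```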