Source code for covid19_inference.model.likelihood

# ------------------------------------------------------------------------------ #
# Implementation of the Likelihood that is used during the mcmc acceptance
# ------------------------------------------------------------------------------ #

import logging

import pymc as pm
from aesara import scan
import aesara.tensor as at
import numpy as np
from scipy import ndimage as ndi

from .model import *
from . import utility as ut

log = logging.getLogger(__name__)


[docs]def student_t_likelihood(
    cases,
    name_student_t="_new_cases_studentT",
    name_sigma_obs="sigma_obs",
    pr_beta_sigma_obs=30,
    nu=4,
    offset_sigma=1,
    model=None,
    data_obs=None,
    sigma_shape=None,
):
    r"""
        Set the likelihood to apply to the model observations (`model.new_cases_obs`)
        We assume a :class:`~pymc3.distributions.continuous.StudentT` distribution because it is robust against outliers [Lange1989]_.
        The likelihood follows:

        .. math::

            P(\text{data\_obs}) &\sim StudentT(\text{mu} = \text{new\_cases\_inferred}, sigma =\sigma,
            \text{nu} = \text{nu})\\
            \sigma &= \sigma_r \sqrt{\text{new\_cases\_inferred} + \text{offset\_sigma}}

        The parameter :math:`\sigma_r` follows
        a :class:`~pymc3.distributions.continuous.HalfCauchy` prior distribution with parameter beta set by
        ``pr_beta_sigma_obs``. If the input is 2 dimensional, the parameter :math:`\sigma_r` is different for every region,
        this can be changed be using the ``sigma_shape`` Parameter.

        Parameters
        ----------
        cases : :class:`~aesara.tensor.TensorVariable`
            The daily new cases estimated by the model.
            Will be compared to  the real world data ``data_obs``.
            One or two dimensonal array. If 2 dimensional, the first dimension is time
            and the second are the regions/countries

        name_student_t :
            The name under which the studentT distribution is saved in the trace.

        name_sigma_obs :
            The name under which the distribution of the observable error is saved in the trace

        pr_beta_sigma_obs : float
            The beta of the :class:`~pymc3.distributions.continuous.HalfCauchy` prior distribution of :math:`\sigma_r`.

        nu : float
            How flat the tail of the distribution is. Larger nu should  make the model
            more robust to outliers. Defaults to 4 [Lange1989]_.

        offset_sigma : float
            An offset added to the sigma, to make the inference procedure robust. Otherwise numbers of
            ``cases`` would lead to very small errors and diverging likelihoods. Defaults to 1.

        model:
            The model on which we want to add the distribution

        data_obs : array
            The data that is observed. By default it is ``model.new_cases_obs``
        
        sigma_shape : int, array
            Shape of the sigma distribution i.e. the data error term. 


        Returns
        -------
        None

        References
        ----------

        .. [Lange1989] Lange, K., Roderick J. A. Little, & Jeremy M. G. Taylor. (1989).
            Robust Statistical Modeling Using the t Distribution.
            Journal of the American Statistical Association,
            84(408), 881-896. doi:10.2307/2290063

    """

    model = modelcontext(model)

    if data_obs is None:
        data_obs = model.new_cases_obs

    if model.shifted_cases:

        no_cases = data_obs == 0
        if len(data_obs.shape) > 1:

            for c in range(data_obs.shape[-1]):
                cases_obs_c = data_obs[..., c]
                # find short intervals of 0 entries and set to NaN
                no_cases_blob, n_blob = ndi.label(no_cases[..., c])
                for i in range(n_blob):
                    if (no_cases_blob == (i + 1)).sum() < 10:
                        data_obs[no_cases_blob == i + 1, ..., c] = np.NaN

                # shift cases from weekends or such to the next day, where cases are reported
                if n_blob > 0:
                    new_cases = 0
                    update = False
                    for i, cases_obs in enumerate(cases_obs_c):
                        new_cases += cases[i + model.diff_data_sim][..., c]
                        if np.isnan(cases_obs):
                            update = True
                        elif update:
                            cases_i = at.set_subtensor(
                                cases[i + model.diff_data_sim][..., c], new_cases
                            )
                            cases = at.set_subtensor(
                                cases[i + model.diff_data_sim], cases_i
                            )
                            new_cases = 0
                            update = False
        else:
            # find short intervals of 0 entries and set to NaN
            no_cases_blob, n_blob = ndi.label(no_cases)
            for i in range(n_blob):
                if (no_cases_blob == (i + 1)).sum() < 10:
                    data_obs[no_cases_blob == i + 1] = np.NaN

            # shift cases from weekends or such to the next day, where cases are reported
            if n_blob > 0:
                new_cases = 0
                update = False
                for i, cases_obs in enumerate(data_obs):
                    new_cases += cases[i + model.diff_data_sim]
                    if np.isnan(cases_obs):
                        update = True
                    elif update:
                        cases = at.set_subtensor(
                            cases[i + model.diff_data_sim], new_cases
                        )
                        new_cases = 0
                        update = False

    cases = cases[model.diff_data_sim : model.data_len + model.diff_data_sim]
    sigma_obs = pm.HalfCauchy(
        name_sigma_obs,
        beta=pr_beta_sigma_obs,
        shape=model.shape_of_regions if sigma_shape is None else sigma_shape,
    )
    sigma = (
        at.abs_(cases + offset_sigma) ** 0.5 * sigma_obs
    )  # offset and at.abs to avoid nans

    pm.StudentT(
        name=name_student_t,
        nu=nu,
        mu=cases[~np.isnan(data_obs)],
        sigma=sigma[~np.isnan(data_obs)],
        observed=data_obs[~np.isnan(data_obs)],
    )