Source code for metrics_as_scores.distribution.fitting_problems

"""
This module contains :code:`pymoo` fitting problems that allow fitting
distributions to almost arbitrary discrete data. The discrete random
variables in :code:`scipy` do not have a :code:`fit()`-method, as their fitting
often requires a global search. Also, many distributions require
discrete parameters, or a mixture of real and integer parameters.
The problems in this module provide a generalized way for :code:`pymoo` to
find parameters for all of :code:`scipy`'s discrete random variables.

"""

import numpy as np
from nptyping import Float, NDArray, Shape
from scipy.stats._discrete_distns import rv_discrete, bernoulli_gen, betabinom_gen, binom_gen, boltzmann_gen, dlaplace_gen, geom_gen, hypergeom_gen, logser_gen, nbinom_gen, nchypergeom_fisher_gen, nchypergeom_wallenius_gen, nhypergeom_gen, planck_gen, poisson_gen, randint_gen, skellam_gen, yulesimon_gen, zipf_gen, zipfian_gen
from pymoo.core.variable import Variable
from pymoo.core.problem import ElementwiseProblem
from pymoo.core.variable import Real, Integer



class MixedVariableDistributionFittingProblem(ElementwiseProblem):
    """
    Base class for fitting any of ``scipy``'s discrete random variables with
    ``pymoo``. It accepts a dictionary of named variables (the distribution's
    parameters) for which optimal values shall be found, and always adds an
    integer ``loc`` variable as the last one.
    """
    def __init__(self, dist: rv_discrete, data: NDArray[Shape['*'], Float], vars: dict[str, Variable], n_ieq_constr: int=0, **kwargs):
        """
        Constructor for fitting any discrete random variable with one or more
        parameters that can be of any type as supported by
        ``pymoo.core.variable.Variable`` (e.g., ``Integer``, ``Real``, etc.).

        Parameters
        ----------
        dist: ``rv_discrete``
            An instance of the concrete discrete random variable that should be fit to the data.

        data: ``NDArray[Shape['*'], Float]``
            The data the distribution should be fit to. Must not be empty, as
            its minimum and maximum are used to derive the bounds of ``loc``.

        vars: ``dict[str, Variable]``
            An ordered dictionary of named variables to optimize. These must correspond
            one to one with the variable names of those defined for the random variable.
            The dictionary is copied, so the caller's object is left untouched.

        n_ieq_constr: ``int``
            Number of inequality constraints. If there are any, then the problem also
            overrides :meth:`Problem._evaluate()` and sets values for each constraint.

        **kwargs:
            Forwarded unchanged to the constructor of ``ElementwiseProblem``.
        """
        data_min, data_max = np.min(data), np.max(data)
        # Extent (range) of the data, truncated to an integer.
        self.ext = int(data_max - data_min)
        self.dist = dist

        # Work on a shallow copy so that adding 'loc' below does not leak
        # into a dictionary the caller may want to reuse.
        vars = dict(vars)

        # All of them have 'loc'. It may lie up to five times the data's
        # extent below the minimum or above the maximum of the data.
        margin = 5 * self.ext
        vars['loc'] = Integer(bounds=(
            int(np.floor(data_min)) - margin,
            int(np.ceil(data_max)) + margin))

        super().__init__(vars=vars, n_obj=1, n_ieq_constr=n_ieq_constr, **kwargs)
        self.data = data

    def _evaluate(self, X, out, *args, **kwargs) -> dict:
        r"""
        This is an internal method that evaluates the discrete random variable's
        negative log likelihood, given the currently set values for all of its
        variables (stored in ``X``). This method is called by ``pymoo``, so be
        sure to check out their references, too.
        Note that the ``X``-dictionary is used to build :math:`\theta`, the vector
        of parameters for the random variable. The order of the parameters in that
        vector depends on the order of ``self.vars``.
        This method usually does not need to be overridden, except for when, e.g.,
        it is required to evaluate (in-)equality constraints (that is, whenever
        something else than 'F' in the ``out``-dictionary must be accessed).

        X: ``dict[str, Any]``
            The (ordered) dictionary with the variables' names and values.
        out: ``dict[str, Any]``
            A dictionary used by ``pymoo`` to store results in; e.g., in 'F' it
            stores the result of the evaluation, and in 'G' it stores the inequality
            constraints' values.

        :return: Returns the ``out``-dictionary. However, the dictionary is accessed
            by reference, so this method does not have to return anything.

        :rtype: ``dict[Any,Any]``
        """
        # Collect the parameter values in the order in which the variables
        # were declared.
        theta = tuple(X[name] for name in self.vars)
        out['F'] = self.dist.nnlf(theta=theta, x=self.data)
        return out



class Fit_bernoulli_gen(MixedVariableDistributionFittingProblem):
    """
    A ``pymoo`` problem for fitting the Bernoulli distribution. The underlying
    distribution is an instance of ``scipy``'s ``bernoulli_gen``.

    Notes
    -----
    :meth:`MixedVariableDistributionFittingProblem._evaluate()` is inherited
    as is; no (in-)equality constraints are declared.
    The following variables are passed to the super constructor (in this order):

    - ``p``: ``Real``, bounds ``[0., 1.]``
    """
    def __init__(self, data: NDArray[Shape['*'], Float], **kwargs):
        variables = {'p': Real(bounds=[0., 1.])}
        super().__init__(dist=bernoulli_gen(), data=data, vars=variables, **kwargs)


class Fit_betabinom_gen(MixedVariableDistributionFittingProblem):
    """
    A ``pymoo`` problem for fitting the Beta-Binomial distribution. The
    underlying distribution is an instance of ``scipy``'s ``betabinom_gen``.

    Notes
    -----
    :meth:`MixedVariableDistributionFittingProblem._evaluate()` is inherited
    as is; no (in-)equality constraints are declared.
    The following variables are passed to the super constructor (in this order):

    - ``n``: ``Integer``, bounds ``(0, 10_000)``
    - ``a``: ``Real``, bounds ``(5e-308, 1e3)``
    - ``b``: ``Real``, bounds ``(5e-308, 1e3)``
    """
    def __init__(self, data: NDArray[Shape['*'], Float], **kwargs):
        variables = {
            'n': Integer(bounds=(0, 10_000)),
            'a': Real(bounds=(5e-308, 1e3)),
            'b': Real(bounds=(5e-308, 1e3)),
        }
        super().__init__(dist=betabinom_gen(), data=data, vars=variables, **kwargs)


class Fit_binom_gen(MixedVariableDistributionFittingProblem):
    """
    A ``pymoo`` problem for fitting the Binomial distribution. The underlying
    distribution is an instance of ``scipy``'s ``binom_gen``.

    Notes
    -----
    :meth:`MixedVariableDistributionFittingProblem._evaluate()` is inherited
    as is; no (in-)equality constraints are declared.
    The following variables are passed to the super constructor (in this order):

    - ``n``: ``Integer``, bounds ``(1, 25_000)``
    - ``p``: ``Real``, bounds ``(0., 1.)``
    """
    def __init__(self, data: NDArray[Shape['*'], Float], **kwargs):
        variables = {
            'n': Integer(bounds=(1, 25_000)),
            'p': Real(bounds=(0., 1.)),
        }
        super().__init__(dist=binom_gen(), data=data, vars=variables, **kwargs)


class Fit_boltzmann_gen(MixedVariableDistributionFittingProblem):
    """
    A ``pymoo`` problem for fitting the Boltzmann distribution. The underlying
    distribution is an instance of ``scipy``'s ``boltzmann_gen``.

    Notes
    -----
    :meth:`MixedVariableDistributionFittingProblem._evaluate()` is inherited
    as is; no (in-)equality constraints are declared.
    The following variables are passed to the super constructor (in this order):

    - ``lambda``: ``Real``, bounds ``(0, 1e5)``
    - ``N``: ``Integer``, bounds ``(1, 25_000)``
    """
    def __init__(self, data: NDArray[Shape['*'], Float], **kwargs):
        variables = {
            'lambda': Real(bounds=(0, 1e5)),
            'N': Integer(bounds=(1, 25_000)),
        }
        super().__init__(dist=boltzmann_gen(), data=data, vars=variables, **kwargs)


class Fit_dlaplace_gen(MixedVariableDistributionFittingProblem):
    """
    A ``pymoo`` problem for fitting the discrete Laplacian distribution. The
    underlying distribution is an instance of ``scipy``'s ``dlaplace_gen``.

    Notes
    -----
    :meth:`MixedVariableDistributionFittingProblem._evaluate()` is inherited
    as is; no (in-)equality constraints are declared.
    The following variables are passed to the super constructor (in this order):

    - ``a``: ``Real``, bounds ``(5e-308, 1e4)``
    """
    def __init__(self, data: NDArray[Shape['*'], Float], **kwargs):
        variables = {'a': Real(bounds=(5e-308, 1e4))}
        super().__init__(dist=dlaplace_gen(), data=data, vars=variables, **kwargs)


class Fit_geom_gen(MixedVariableDistributionFittingProblem):
    """
    A ``pymoo`` problem for fitting the Geometric distribution. The underlying
    distribution is an instance of ``scipy``'s ``geom_gen``.

    Notes
    -----
    :meth:`MixedVariableDistributionFittingProblem._evaluate()` is inherited
    as is; no (in-)equality constraints are declared.
    The following variables are passed to the super constructor (in this order):

    - ``p``: ``Real``, bounds ``(0., 1.)``
    """
    def __init__(self, data: NDArray[Shape['*'], Float], **kwargs):
        variables = {'p': Real(bounds=(0., 1.))}
        super().__init__(dist=geom_gen(), data=data, vars=variables, **kwargs)


class Fit_hypergeom_gen(MixedVariableDistributionFittingProblem):
    r"""
    A ``pymoo`` problem for fitting the Hypergeometric distribution. The
    underlying distribution is an instance of ``scipy``'s ``hypergeom_gen``.

    Notes
    -----
    This problem **does** override `_evaluate()` and declares four inequality
    constraints, namely:

    - :math:`n\geq0` (or :math:`-n\leq0`)
    - :math:`N\geq0` (or :math:`-N\leq0`)
    - :math:`n\leq M` (or :math:`n-M\leq0`)
    - :math:`N\leq M` (or :math:`N-M\leq0`)

    The following variables are passed to the super constructor (in this order):

    - ``M``: ``Integer``, bounds ``(1, 25_000)``
    - ``n``: ``Integer``, bounds ``(0, 25_000)``
    - ``N``: ``Integer``, bounds ``(0, 25_000)``
    """
    def __init__(self, data: NDArray[Shape['*'], Float], **kwargs):
        variables = {
            'M': Integer(bounds=(1, 25_000)),
            'n': Integer(bounds=(0, 25_000)),
            'N': Integer(bounds=(0, 25_000)),
        }
        super().__init__(
            dist=hypergeom_gen(), data=data, vars=variables, n_ieq_constr=4, **kwargs)

    def _evaluate(self, X, out, *args, **kwargs) -> dict:
        """
        Evaluates the objective through the parent implementation and then
        stores the four inequality constraints' values in ``out['G']``.
        For all other documentation, check out
        :meth:`MixedVariableDistributionFittingProblem._evaluate()`.
        """
        out = super()._evaluate(X, out, *args, **kwargs)
        total, successes, draws = X['M'], X['n'], X['N']
        # Each entry is expected to be <= 0 for a feasible solution.
        constraints = [
            -successes,         # n >= 0
            -draws,             # N >= 0
            successes - total,  # n <= M
            draws - total,      # N <= M
        ]
        out['G'] = np.asarray(constraints)
        return out


class Fit_logser_gen(MixedVariableDistributionFittingProblem):
    """
    A ``pymoo`` problem for fitting the Logarithmic Series distribution. The
    underlying distribution is an instance of ``scipy``'s ``logser_gen``.

    Notes
    -----
    :meth:`MixedVariableDistributionFittingProblem._evaluate()` is inherited
    as is; no (in-)equality constraints are declared.
    The following variables are passed to the super constructor (in this order):

    - ``p``: ``Real``, bounds ``(0., 1.)``
    """
    def __init__(self, data: NDArray[Shape['*'], Float], **kwargs):
        variables = {'p': Real(bounds=(0., 1.))}
        super().__init__(dist=logser_gen(), data=data, vars=variables, **kwargs)


class Fit_nbinom_gen(MixedVariableDistributionFittingProblem):
    """
    A ``pymoo`` problem for fitting the Negative Binomial distribution. The
    underlying distribution is an instance of ``scipy``'s ``nbinom_gen``.

    Notes
    -----
    :meth:`MixedVariableDistributionFittingProblem._evaluate()` is inherited
    as is; no (in-)equality constraints are declared.
    The following variables are passed to the super constructor (in this order):

    - ``n``: ``Integer``, bounds ``(0, 25_000)``
    - ``p``: ``Real``, bounds ``[0., 1.]``
    """
    def __init__(self, data: NDArray[Shape['*'], Float], **kwargs):
        variables = {
            'n': Integer(bounds=(0, 25_000)),
            'p': Real(bounds=[0., 1.]),
        }
        super().__init__(dist=nbinom_gen(), data=data, vars=variables, **kwargs)


class Fit_nchypergeom_fisher_gen(MixedVariableDistributionFittingProblem):
    r"""
    This class allows to fit Fisher's Non-central Hypergeometric distribution
    using a ``pymoo`` problem. It uses ``scipy``'s ``nchypergeom_fisher_gen``
    as base distribution.

    Notes
    -----
    This problem **does** override `_evaluate()` and has four inequality constraints.
    These are:

    - :math:`N\leq M` (or :math:`N-M\leq0`)
    - :math:`n\leq M` (or :math:`n-M\leq0`)
    - :math:`\max{(\text{data})}\leq N` (or :math:`\max{(\text{data})}-N\leq0`)
    - :math:`\max{(\text{data})}\leq n` (or :math:`\max{(\text{data})}-n\leq0`)

    Calls the super constructor with these variables (in this order; note
    that ``k = data.size``):

    - ``M``: ``Integer``, bounds ``(1, 5 * k)``
    - ``n``: ``Integer``, bounds ``(1, 5 * k)``
    - ``N``: ``Integer``, bounds ``(1, 5 * k)``
    - ``odds``: ``Real``, bounds ``(5e-308, 1e4)``
    """
    def __init__(self, data: NDArray[Shape['*'], Float], **kwargs):
        # The integer parameters' upper bound scales with the number of observations.
        upper = 5 * data.size
        variables = {
            'M': Integer(bounds=(1, upper)),
            'n': Integer(bounds=(1, upper)),
            'N': Integer(bounds=(1, upper)),
            'odds': Real(bounds=(5e-308, 1e4))
        }
        super().__init__(dist=nchypergeom_fisher_gen(), data=data, vars=variables, n_ieq_constr=4, **kwargs)

    def _evaluate(self, X, out, *args, **kwargs) -> dict:
        """
        Overridden to evaluate the inequality constraints, too.
        For all other documentation, check out :meth:`MixedVariableDistributionFittingProblem._evaluate()`.
        """
        # Forward ``X`` and ``out`` positionally: passing them by keyword in
        # front of ``*args`` would raise a ``TypeError`` (multiple values for
        # ``X``) as soon as any extra positional argument is supplied.
        out = super()._evaluate(X, out, *args, **kwargs)
        M, n, N, b = X['M'], X['n'], X['N'], np.max(self.data)

        # Let's add the following inequality constraints to the output:
        out['G'] = np.asarray([
            N - M, # N <= M  ->  N - M <= 0
            n - M, # n <= M  ->  n - M <= 0
            b - N, # max(data) <= N  ->  max(data) - N <= 0
            b - n] # max(data) <= n  ->  max(data) - n <= 0
        )
        return out


class Fit_nchypergeom_wallenius_gen(Fit_nchypergeom_fisher_gen):
    """
    A ``pymoo`` problem for fitting Wallenius' Non-central Hypergeometric
    distribution. The underlying distribution is an instance of ``scipy``'s
    ``nchypergeom_wallenius_gen``.

    Notes
    -----
    Variables and constraints are taken over unchanged from
    ``Fit_nchypergeom_fisher_gen``, which this problem inherits from directly.
    Only the distribution instance is exchanged after the parent constructor
    has run.
    """
    def __init__(self, data: NDArray[Shape['*'], Float], **kwargs):
        super().__init__(data=data, **kwargs)
        # Replace the Fisher distribution that the parent constructor has set.
        self.dist = nchypergeom_wallenius_gen()


class Fit_nhypergeom_gen(MixedVariableDistributionFittingProblem):
    """
    A ``pymoo`` problem for fitting the Negative Hypergeometric distribution.
    The underlying distribution is an instance of ``scipy``'s
    ``nhypergeom_gen``.

    Notes
    -----
    :meth:`MixedVariableDistributionFittingProblem._evaluate()` is inherited
    as is; no (in-)equality constraints are declared.
    The following variables are passed to the super constructor (in this order):

    - ``M``: ``Integer``, bounds ``(0, 25_000)``
    - ``n``: ``Integer``, bounds ``(0, 25_000)``
    - ``r``: ``Integer``, bounds ``(0, 25_000)``
    """
    def __init__(self, data: NDArray[Shape['*'], Float], **kwargs):
        variables = {
            'M': Integer(bounds=(0, 25_000)),
            'n': Integer(bounds=(0, 25_000)),
            'r': Integer(bounds=(0, 25_000)),
        }
        super().__init__(dist=nhypergeom_gen(), data=data, vars=variables, **kwargs)


class Fit_planck_gen(MixedVariableDistributionFittingProblem):
    """
    A ``pymoo`` problem for fitting the Planck distribution. The underlying
    distribution is an instance of ``scipy``'s ``planck_gen``.

    Notes
    -----
    :meth:`MixedVariableDistributionFittingProblem._evaluate()` is inherited
    as is; no (in-)equality constraints are declared.
    The following variables are passed to the super constructor (in this order):

    - ``p``: ``Real``, bounds ``(5e-308, 1e2)``
    """
    def __init__(self, data: NDArray[Shape['*'], Float], **kwargs):
        # planck takes lambda as shape parameter. The Planck distribution can
        # be written as a geometric distribution (geom) with
        # p = 1 - exp(-lambda) shifted by loc = -1.
        # exp(-lambda) gets small very quickly, so the upper bound is chosen
        # as 100 (exp(-100) is approx. 3.7e-44).
        # NOTE(review): the variable is named 'p' although it is the only
        # entry besides 'loc' and therefore is handed to planck as its shape
        # parameter -- confirm that the name is intended.
        variables = {'p': Real(bounds=(5e-308, 1e2))}
        super().__init__(dist=planck_gen(), data=data, vars=variables, **kwargs)


class Fit_poisson_gen(MixedVariableDistributionFittingProblem):
    """
    A ``pymoo`` problem for fitting the Poisson distribution. The underlying
    distribution is an instance of ``scipy``'s ``poisson_gen``.

    Notes
    -----
    :meth:`MixedVariableDistributionFittingProblem._evaluate()` is inherited
    as is; no (in-)equality constraints are declared.
    The following variables are passed to the super constructor (in this order):

    - ``mu``: ``Real``, bounds ``(0., 1e6)``
    """
    def __init__(self, data: NDArray[Shape['*'], Float], **kwargs):
        variables = {'mu': Real(bounds=(0., 1e6))}
        super().__init__(dist=poisson_gen(), data=data, vars=variables, **kwargs)


class Fit_randint_gen(MixedVariableDistributionFittingProblem):
    """
    A ``pymoo`` problem for fitting the discrete Uniform distribution. The
    underlying distribution is an instance of ``scipy``'s ``randint_gen``.

    Notes
    -----
    :meth:`MixedVariableDistributionFittingProblem._evaluate()` is inherited
    as is; no (in-)equality constraints are declared.
    The following variables are passed to the super constructor (in this order):

    - ``low``: ``Integer``, bounds ``(-25_000, 25_000)``
    - ``high``: ``Integer``, bounds ``(-25_000, 25_000)``
    """
    def __init__(self, data: NDArray[Shape['*'], Float], **kwargs):
        variables = {
            'low': Integer(bounds=(-25_000, 25_000)),
            'high': Integer(bounds=(-25_000, 25_000)),
        }
        super().__init__(dist=randint_gen(), data=data, vars=variables, **kwargs)


class Fit_skellam_gen(MixedVariableDistributionFittingProblem):
    """
    A ``pymoo`` problem for fitting the Skellam distribution. The underlying
    distribution is an instance of ``scipy``'s ``skellam_gen``.

    Notes
    -----
    :meth:`MixedVariableDistributionFittingProblem._evaluate()` is inherited
    as is; no (in-)equality constraints are declared.
    The following variables are passed to the super constructor (in this order):

    - ``mu1``: ``Real``, bounds ``(5e-308, 5e3)``
    - ``mu2``: ``Real``, bounds ``(5e-308, 5e3)``
    """
    def __init__(self, data: NDArray[Shape['*'], Float], **kwargs):
        variables = {
            'mu1': Real(bounds=(5e-308, 5e3)),
            'mu2': Real(bounds=(5e-308, 5e3)),
        }
        super().__init__(dist=skellam_gen(), data=data, vars=variables, **kwargs)


class Fit_yulesimon_gen(MixedVariableDistributionFittingProblem):
    """
    A ``pymoo`` problem for fitting the Yule--Simon distribution. The
    underlying distribution is an instance of ``scipy``'s ``yulesimon_gen``.

    Notes
    -----
    :meth:`MixedVariableDistributionFittingProblem._evaluate()` is inherited
    as is; no (in-)equality constraints are declared.
    The following variables are passed to the super constructor (in this order):

    - ``alpha``: ``Real``, bounds ``(5e-308, 2e4)``
    """
    def __init__(self, data: NDArray[Shape['*'], Float], **kwargs):
        # The upper bound is a guess.
        variables = {'alpha': Real(bounds=(5e-308, 2e4))}
        super().__init__(dist=yulesimon_gen(), data=data, vars=variables, **kwargs)


class Fit_zipf_gen(MixedVariableDistributionFittingProblem):
    """
    A ``pymoo`` problem for fitting the Zipf (Zeta) distribution. The
    underlying distribution is an instance of ``scipy``'s ``zipf_gen``.

    Notes
    -----
    :meth:`MixedVariableDistributionFittingProblem._evaluate()` is inherited
    as is; no (in-)equality constraints are declared.
    The following variables are passed to the super constructor (in this order):

    - ``a``: ``Real``, bounds ``(1. + 1e-12, 2e4)``
    """
    def __init__(self, data: NDArray[Shape['*'], Float], **kwargs):
        # The upper bound is a guess.
        variables = {'a': Real(bounds=(1. + 1e-12, 2e4))}
        super().__init__(dist=zipf_gen(), data=data, vars=variables, **kwargs)


class Fit_zipfian_gen(MixedVariableDistributionFittingProblem):
    """
    A ``pymoo`` problem for fitting the Zipfian distribution. The underlying
    distribution is an instance of ``scipy``'s ``zipfian_gen``.

    Notes
    -----
    :meth:`MixedVariableDistributionFittingProblem._evaluate()` is inherited
    as is; no (in-)equality constraints are declared.
    The following variables are passed to the super constructor (in this order):

    - ``a``: ``Real``, bounds ``(0., 2e4)``
    - ``n``: ``Integer``, bounds ``(0, 25_000)``
    """
    def __init__(self, data: NDArray[Shape['*'], Float], **kwargs):
        variables = {
            'a': Real(bounds=(0., 2e4)),
            'n': Integer(bounds=(0, 25_000)),
        }
        super().__init__(dist=zipfian_gen(), data=data, vars=variables, **kwargs)