Source code for metrics_as_scores.data.pregenerate

"""
This module contains top-level functions that are used in highly parallel
scenarios for pre-generating densities for one's own datasets, either from
previously computed fits for random variables or from empirical densities.
"""

import numpy as np
import pandas as pd
from pathlib import Path
from typing import Union
from warnings import warn
from os.path import exists
from pickle import dump, load
from joblib import Parallel, delayed
from tqdm import tqdm
from metrics_as_scores.tools.funcs import flatten_dict
from metrics_as_scores.data.pregenerate_fit import Continuous_RVs_dict, Discrete_RVs_dict, FitResult
from metrics_as_scores.distribution.distribution import Dataset, Density, DistTransform, Empirical, Empirical_discrete, KDE_approx, Parametric, Parametric_discrete
from metrics_as_scores.distribution.fitting import StatisticalTest
from sklearn.model_selection import ParameterGrid




def generate_densities(
    dataset: Dataset,
    clazz: type[Density]=Empirical,
    unique_vals: bool=None,
    resample_samples=250_000,
    dist_transform: DistTransform=DistTransform.NONE,
    num_jobs: int=None
) -> dict[str, Density]:
    """
    Generates a set of :py:class:`Density` objects for a certain
    :py:class:`DistTransform`. For each combination, we will later save one file
    that is then to be used in the web application, as generating these
    on-the-fly would take too long.

    dataset: ``Dataset``
        Required for obtaining quantity types, contexts, and filtered data.

    clazz: ``type[Density]``
        A type of empirical density to generate densities for.

    unique_vals: ``bool``
        Used to conditionally add some jitter to the data in order to make all
        data points unique. This is automatically set to `True` if the class is
        :py:class:`Empirical`, because this class is for continuous RVs. If the
        data is not continuous (real), then setting this to `True` will make it so.

    resample_samples: ``int``
        Unsigned integer, passed forward to the constructor of the chosen
        density type (``clazz``).

    dist_transform: ``DistTransform``
        The chosen transformation for the data.

    :rtype: ``dict[str, Density]``

    :return: A dictionary where the key is made of the context and quantity type,
        and the value is the generated :py:class:`Empirical` density.
    """
    contexts = list(dataset.contexts(include_all_contexts=True))
    param_grid = { 'context': contexts, 'qtype': dataset.quantity_types }
    expanded_grid = pd.DataFrame(ParameterGrid(param_grid=param_grid))

    def get_density(grid_idx: int) -> tuple[str, Density]:
        row = expanded_grid.iloc[grid_idx,]
        context = row.context
        qtype = row.qtype

        # Only use unique values if explicitly requested or CDF-type is ECDF!
        uvals = True if unique_vals else (clazz == Empirical)
        data = dataset.data(qtype=qtype, context=None if context == '__ALL__' else context, unique_vals=uvals)
        # Do transformation manually for other types of DensityFunc
        transform_value, data = Dataset.transform(data=data, dist_transform=dist_transform)

        return (f'{context}_{qtype}', clazz(
            data=data, resample_samples=resample_samples, compute_ranges=True,
            ideal_value=dataset.ideal_values[qtype], dist_transform=dist_transform,
            transform_value=transform_value, qtype=qtype, context=context))

    cdfs = Parallel(n_jobs=-1 if num_jobs is None else num_jobs)(
        delayed(get_density)(i) for i in tqdm(range(len(expanded_grid.index))))
    return dict(cdfs)
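# A minimal usage sketch (hedged; `ds` stands for a previously loaded ``Dataset``
# and is hypothetical, the imports at the top of this module are assumed):
#
#     densities = generate_densities(dataset=ds, clazz=KDE_approx,
#         dist_transform=DistTransform.NONE, num_jobs=4)
#     # Keys follow the pattern '<context>_<qtype>'; the data pooled over all
#     # contexts is available under the '__ALL__' pseudo-context.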
def fits_to_MAS_densities(
    dataset: Dataset,
    distns_dict: dict[int, FitResult],
    dist_transform: DistTransform,
    use_continuous: bool
) -> dict[str, Union[Parametric, Parametric_discrete]]:
    """
    Converts previously produced parametric fits to :py:class:`Density` objects
    that can be loaded and used in the web application. Similar to
    :py:meth:`generate_densities()`, this method also returns a dictionary with
    generated parametric densities.

    dataset: ``Dataset``
        Required for obtaining quantity types, contexts, and filtered data.

    distns_dict: ``dict[int, FitResult]``
        Dictionary with all fit results for a data transform. The `int`-key is
        just the previously used grid index and not relevant here.

    dist_transform: ``DistTransform``
        The chosen transformation for the data.

    use_continuous: ``bool``
        Used to select and generate densities based on either continuous
        (`True`) RVs or discrete RVs.

    :rtype: ``dict[str, Union[Parametric, Parametric_discrete]]``

    :return: A dictionary where the key is made of the context and quantity type,
        and the value is the generated :py:class:`Union[Parametric, Parametric_discrete]` density.
    """
    df = pd.DataFrame([flatten_dict(d) for d in distns_dict.values()])
    data_df = dataset.df
    contexts = list(dataset.contexts(include_all_contexts=True))
    the_type = 'continuous' if use_continuous else 'discrete'
    use_test = 'ks_1samp_ordinary' if use_continuous else 'epps_singleton_2samp_jittered'
    use_stat = f'stat_tests_tests_{use_test}_stat'
    use_vars = Continuous_RVs_dict if use_continuous else Discrete_RVs_dict
    Use_class = Parametric if use_continuous else Parametric_discrete
    df_cols = list(df.columns)

    the_dict: dict[str, Parametric] = {}
    for context in contexts:
        for qtype in dataset.quantity_types:
            key = f'{context}_{qtype}'
            candidates = df[(df.context == context) & (df.qtype == qtype) & (df.type == the_type) & (df.dist_transform == dist_transform.name)]
            if len(candidates.index) == 0:
                # No fit at all :(
                the_dict[key] = Use_class.unfitted(dist_transform=dist_transform) # pragma: no cover
            else:
                candidates = candidates.sort_values(by=[use_stat], ascending=True, inplace=False) # Lowest D-stat first
                best = candidates.head(1).iloc[0,]
                stat_tests_dict = StatisticalTest.from_dict(d=best, key_prefix='stat_tests_tests_')
                dist_type = use_vars[best.rv]
                dist = dist_type()
                params = ()
                for pi in dist._param_info():
                    use_key = f'params_{pi.name}'
                    if pi.name.endswith('_') and not use_key in df_cols:
                        use_key = use_key.rstrip('_') # pragma: no cover
                    try:
                        params += (best[use_key],)
                    except KeyError: # pragma: no cover
                        # Happens when the candidate was not actually fit, e.g., when a discrete RV
                        # was selected for continuous data.
                        the_dict[key] = Use_class.unfitted(dist_transform=dist_transform)
                        continue

                data = data_df[(data_df[dataset.ds['colname_type']] == qtype)]
                if context != '__ALL__':
                    data = data[(data[dataset.ds['colname_context']] == context)]
                data = data[dataset.ds['colname_data']].to_numpy()
                # Re-apply the transform to the data:
                if best.transform_value is not None:
                    data = np.abs(data - best.transform_value)

                the_dict[key] = Use_class(
                    dist=dist, stat_tests=stat_tests_dict, use_stat_test=use_test,
                    dist_params=params, range=(data.min(), data.max()), compute_ranges=True,
                    ideal_value=dataset.ideal_values[best.qtype], dist_transform=dist_transform,
                    transform_value=best.transform_value, qtype=qtype, context=context)

    return the_dict
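# A hedged sketch of converting previously pickled fits (essentially what
# ``generate_parametric()`` below automates); the file path and the loaded
# ``Dataset`` `ds` are hypothetical:
#
#     from pickle import load
#     with open('fits/pregen_distns_NONE.pickle', 'rb') as f:
#         distns_list = load(f)
#     distns_dict = { item['grid_idx']: item for item in distns_list }
#     densities = fits_to_MAS_densities(dataset=ds, distns_dict=distns_dict,
#         dist_transform=DistTransform.NONE, use_continuous=True)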
def generate_empirical(
    dataset: Dataset,
    densities_dir: Path,
    clazz: Union[Empirical, KDE_approx],
    transform: DistTransform
) -> None:
    """
    Generates a set of empirical (continuous) densities for a given density type
    (Empirical or KDE_approx) and data transform.

    dataset: ``Dataset``
        Required for obtaining quantity types, contexts, and filtered data.

    densities_dir: ``Path``
        The directory to store the generated densities in. The resulting file
        name is keyed by the used density type and data transform.

    clazz: ``Union[Empirical, KDE_approx]``
        The type of density you wish to create.

    transform: ``DistTransform``
        The chosen transformation for the data.

    :return: This method does not return anything but only writes the result to disk.
    """
    temp = generate_densities(dataset=dataset, clazz=clazz, dist_transform=transform, unique_vals=True, resample_samples=75_000)
    dens_file = str(densities_dir.joinpath(f'./densities_{clazz.__name__}_{transform.name}.pickle'))
    with open(file=dens_file, mode='wb') as fp:
        dump(obj=temp, file=fp)
    print(f'Finished generating Densities for {clazz.__name__} with transform {transform.name}.')
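# A minimal usage sketch, assuming a loaded ``Dataset`` named `ds` (hypothetical)
# and an existing output directory:
#
#     generate_empirical(dataset=ds, densities_dir=Path('./densities'),
#         clazz=KDE_approx, transform=DistTransform.NONE)
#     # Writes ./densities/densities_KDE_approx_NONE.pickle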
def generate_parametric(
    dataset: Dataset,
    densities_dir: Path,
    fits_dir: Path,
    clazz: Union[Parametric, Parametric_discrete],
    transform: DistTransform
) -> None:
    """
    Generates a set of parametric densities for a given density type
    (Parametric or Parametric_discrete) and data transform.

    dataset: ``Dataset``
        Required for obtaining quantity types, contexts, and filtered data.

    densities_dir: ``Path``
        The directory to store the generated densities in. The resulting file
        name is keyed by the used density type and data transform.

    fits_dir: ``Path``
        The directory that holds the previously pre-generated fits
        (``pregen_distns_*.pickle``), as created by ``pregenerate_distns.py``.

    clazz: ``Union[Parametric, Parametric_discrete]``
        The type of density you wish to create.

    transform: ``DistTransform``
        The chosen transformation for the data.

    :return: This method does not return anything but only writes the result to disk.
    """
    fits_file = str(fits_dir.joinpath(f'./pregen_distns_{transform.name}.pickle').resolve())
    if not exists(fits_file):
        warn(f'Cannot generate parametric distribution for {clazz.__name__} and transformation {transform.name}, because the file {fits_file} does not exist. Did you forget to create the fits using the script pregenerate_distns.py?')
        return # pragma: no cover

    with open(fits_file, 'rb') as f:
        distns_list = load(f)
    distns_dict = { item['grid_idx']: item for item in distns_list }

    use_continuous = clazz == Parametric
    temp = fits_to_MAS_densities(dataset=dataset, distns_dict=distns_dict, dist_transform=transform, use_continuous=use_continuous)

    dens_file = str(densities_dir.joinpath(f'./densities_{clazz.__name__}_{transform.name}.pickle'))
    with open(file=dens_file, mode='wb') as f:
        dump(temp, f)
    print(f'Finished generating parametric Densities for {clazz.__name__} with transform {transform.name}.')
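# A hedged usage sketch; the paths and the `ds` dataset are hypothetical, and the
# fits must have been produced beforehand (see the warning above):
#
#     generate_parametric(dataset=ds, densities_dir=Path('./densities'),
#         fits_dir=Path('./fits'), clazz=Parametric, transform=DistTransform.NONE)
#     # Expects ./fits/pregen_distns_NONE.pickle and writes
#     # ./densities/densities_Parametric_NONE.pickle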
def generate_empirical_discrete(
    dataset: Dataset,
    densities_dir: Path,
    transform: DistTransform
) -> None:
    """
    Generates discrete empirical densities for a given data transform. Only uses
    the type :py:class:`Empirical_discrete` for this.

    dataset: ``Dataset``
        Required for obtaining quantity types, contexts, and filtered data.

    densities_dir: ``Path``
        The directory to store the generated densities in. The resulting file
        name is keyed by the used density type and data transform.

    transform: ``DistTransform``
        The chosen transformation for the data.

    :return: This method does not return anything but only writes the result to disk.
    """
    the_dict: dict[str, Empirical_discrete] = {}
    for context in dataset.contexts(include_all_contexts=True):
        use_context = None if context == '__ALL__' else context
        for qtype in dataset.quantity_types:
            key = f'{context}_{qtype}'
            if not dataset.is_qtype_discrete(qtype=qtype):
                the_dict[key] = Empirical_discrete.unfitted(dist_transform=transform)
            else:
                data = dataset.data(qtype=qtype, context=use_context, unique_vals=False)
                transform_value, data = Dataset.transform(data=data, dist_transform=transform, continuous_value=False)
                the_dict[key] = Empirical_discrete(
                    data=data.astype(int), ideal_value=dataset.ideal_values[qtype],
                    dist_transform=transform, transform_value=transform_value,
                    qtype=qtype, context=context)

    dens_file = str(densities_dir.joinpath(f'./densities_{Empirical_discrete.__name__}_{transform.name}.pickle'))
    with open(file=dens_file, mode='wb') as fp:
        dump(obj=the_dict, file=fp)
    print(f'Finished generating empirical Densities for {Empirical_discrete.__name__} with transform {transform.name}.')
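# A minimal usage sketch (the `ds` dataset and output directory are hypothetical):
#
#     generate_empirical_discrete(dataset=ds, densities_dir=Path('./densities'),
#         transform=DistTransform.NONE)
#     # Writes ./densities/densities_Empirical_discrete_NONE.pickle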