Source code for tabensemb.data.utils

import numpy as np
from typing import List, Union, Dict, Type
import pandas as pd



[docs]
def get_corr_sets(where_corr: np.ndarray, names: List):
    where_corr = [[names[x] for x in y] for y in where_corr]
    corr_chain = {}

    def add_edge(x, y):
        if x not in corr_chain.keys():
            corr_chain[x] = [y]
        elif y not in corr_chain[x]:
            corr_chain[x].append(y)

    for x, y in zip(*where_corr):
        if x != y:
            add_edge(x, y)
            add_edge(y, x)
    corr_feature = list(corr_chain.keys())
    for x in np.setdiff1d(names, corr_feature):
        corr_chain[x] = []

    def dfs(visited, graph, node, ls):
        if node not in visited:
            ls.append(node)
            visited.add(node)
            for neighbour in graph[node]:
                ls = dfs(visited, graph, neighbour, ls)
        return ls

    corr_sets = []
    for x in corr_feature[::-1]:
        if len(corr_sets) != 0:
            for sets in corr_sets:
                if x in sets:
                    break
            else:
                corr_sets.append(dfs(set(), corr_chain, x, []))
        else:
            corr_sets.append(dfs(set(), corr_chain, x, []))

    corr_sets = [[x for x in y] for y in corr_sets]
    return corr_feature, corr_sets



object_unknown_value = "UNK"
number_unknown_value = -1



[docs]
def fill_cat_nan(df: pd.DataFrame, cat_dtypes: Dict[str, np.dtype]) -> pd.DataFrame:
    """
    Imputation of categorical features.

    Parameters
    ----------
    df
        The dataframe to be imputed.
    cat_dtypes
        The dtype of each categorical feature. If it is a numerical type, ``number_unknown_value`` (default to -1) is
        used for imputation, otherwise ``object_unknown_value`` (default to "UNK") is used. Change these two values if
        you want other values for missing or unknown values.

    Returns
    -------
    pd.DataFrame
    """
    df = df.copy()
    for feature, dtype in cat_dtypes.items():
        dtype = get_imputed_dtype(dtype)
        unknown_val = get_unknown_value(dtype)
        if feature in df.columns:
            values = df[feature].fillna(unknown_val).values
            if dtype == int and not np.all(np.mod(values, 1) == 0):
                raise Exception(
                    f"The numerical categorical feature {feature} is not integeral but {values.dtype}."
                )
            df[feature] = values.astype(dtype)
    return df




[docs]
def get_imputed_dtype(dtype: np.dtype) -> Union[Type[int], Type[str]]:
    """
    Numerical columns will be transformed to "int", and others will be transformed to "str".

    Parameters
    ----------
    dtype
        The dtype of a column.

    Returns
    -------
    Type[int] or Type[str]
    """
    if np.issubdtype(dtype, np.number):
        return int
    else:
        return str




[docs]
def get_unknown_value(dtype: Union[Type[int], Type[str]]) -> Union[int, str]:
    """
    Select the unknown value for the dtype judged by :func:`get_imputed_dtype`.

    Parameters
    ----------
    dtype
        int or str from :func:`get_imputed_dtype`.

    Returns
    -------
    int or str
    """
    if dtype == int:
        return number_unknown_value
    else:
        return object_unknown_value



class _OrdinalEncodingWrongDirException(Exception):
    """
    The exception might be raised by :class:`OrdinalEncoder` under the circumstance that
    :meth:`OrdinalEncoder.transform` is called for transformed data or :meth:`OrdinalEncoder.inverse_transform` is
    called for inverse-transformed data. If it is caught, the other method will be called to check whether it is now
    really the case.
    """

    pass



[docs]
class OrdinalEncoder:
    """
    An ordinal encoder for categorical features that better supports ``pd.DataFrame`` even with missing columns. It
    supports ``np.ndarray`` when calling :meth:`transform` or :meth:`inverse_transform`, but does not support fitting
    on a ``np.ndarray`` because it is designed for dataframes.
    It can also identify a miss-calling of :meth:`transform` and :meth:`inverse_transform` (calling transform on
    transformed dataframe, and vice versa), and return the input dataframe directly. But the functionality won't work
    if the dataframe to be transformed/inverse-transformed only contains categorical features whose categories before
    encoding are all integers.
    """


[docs]
    def __init__(self):
        self.mapping = {}
        self.num_unique = {}
        self.features = []
        self.dtypes = {}
        self.dtypes_samples = {}
        self.fitted = False



[docs]
    def fit(self, df: pd.DataFrame):
        """
        Fit the ordinal encoder.

        Parameters
        ----------
        df
            A dataframe that only contains categorical features.
        """
        df = df.copy()
        self.features = list(df.columns)
        for feature, col_type in zip(df.columns, df.dtypes):
            # The imputation procedure is the same as that in AbstractImputer
            dtype = get_imputed_dtype(col_type)
            unknown_value = get_unknown_value(dtype)
            values = fill_cat_nan(df[[feature]], {feature: dtype}).values.flatten()
            self.dtypes[feature] = dtype
            unique_values = list(sorted(set(values)))
            if unknown_value not in unique_values:
                unique_values += [unknown_value]
            self.mapping[feature] = unique_values
            self.num_unique[feature] = len(unique_values)
            self.dtypes_samples[feature] = unknown_value
        self.fitted = True
        return self



[docs]
    def _transform_or_inverse_transform(
        self, df: Union[pd.DataFrame, np.ndarray], transform: bool
    ) -> Union[pd.DataFrame, np.ndarray]:
        """
        Automatically distinguish transform/inverse-transform and ``pd.DataFrame``/``np.ndarray``.

        Parameters
        ----------
        df
            A pd.DataFrame or a np.ndarray
        transform
            True for transform and False for inverse-transform.

        Returns
        -------
        A pd.DataFrame or a np.ndarray
            depending on the type of the input.
        """
        if isinstance(df, pd.DataFrame):
            input_type = "dataframe"
            df = df.copy()
        else:
            input_type = str(type(df))
            df = pd.DataFrame(data=df, columns=self.features, index=np.arange(len(df)))
        try:
            if transform:
                trans_df = self._transform(df.copy())
            else:
                trans_df = self._inverse_transform(df.copy())
        except _OrdinalEncodingWrongDirException as e_forw:
            try:
                if transform:
                    _ = self._inverse_transform(df.copy())
                else:
                    _ = self._transform(df.copy())
                trans_df = df
            except Exception as e_inv:
                raise Exception(
                    f"The dataframe can be neither transformed nor inverse transformed by the ordinal encoder.\n"
                    f"Exception when calling {'transform' if transform else 'inverse_transform'}: {e_forw}\n"
                    f"Exception when calling {'inverse_transform' if transform else 'transform'}: {e_inv}"
                )
        return trans_df if input_type == "dataframe" else trans_df.values



[docs]
    def transform(self, df: Union[pd.DataFrame, np.ndarray]):
        """
        Ordinal-encoding categorical features. If the input is a ``np.ndarray``, the columns should match the recorded
        categorical features (:attr:`features`).
        """
        return self._transform_or_inverse_transform(df, transform=True)



[docs]
    def inverse_transform(self, df: Union[pd.DataFrame, np.ndarray]):
        """
        Inverse ordinal-encoding categorical features. If the input is a ``np.ndarray``, the columns should match the
        recorded categorical features (:attr:`features`).
        """
        return self._transform_or_inverse_transform(df, transform=False)



[docs]
    def _transform(self, df: pd.DataFrame):
        for idx, feature in enumerate(self.features):
            if feature not in df.columns:
                continue
            # The imputation procedure is the same as that in fit.
            unknown_val = get_unknown_value(self.dtypes[feature])
            values = fill_cat_nan(
                df[[feature]], {feature: self.dtypes[feature]}
            ).values.flatten()

            unique_values = list(set(values))
            encoded_values = list(range(self.num_unique[feature]))
            unknown_values = [
                val for val in unique_values if val not in self.mapping[feature]
            ]
            known_values = [
                val
                for val in unique_values
                if val not in unknown_values and val != unknown_val
            ]

            is_int = (
                lambda x: str(x).replace(".", "").isdigit() and float(x).is_integer()
            )
            # If the input is transformed, the unique values will be strings of integers because of fill_cat_nan.
            # Otherwise, they will be at least non-digits. One exception is that all categories are integers.
            str_int_in_encoded = lambda x: str(x).isdigit() and int(x) in encoded_values
            if (
                any([str_int_in_encoded(val) for val in unknown_values])
                and all([str_int_in_encoded(val) for val in known_values])
                and not (
                    all([is_int(val) for val in unknown_values + known_values])
                    and self.dtypes[feature] == int
                )
            ):
                # The input is already transformed.
                raise _OrdinalEncodingWrongDirException

            transformed_values = np.zeros_like(values, dtype=int)
            for val in unique_values:
                transformed_values[values == val] = self.mapping[feature].index(
                    unknown_val if val in unknown_values else val
                )
            df[feature] = transformed_values.astype(int)
        return df



[docs]
    def _inverse_transform(self, df: pd.DataFrame):
        for idx, feature in enumerate(self.features):
            if feature not in df.columns:
                continue

            unknown_val = get_unknown_value(self.dtypes[feature])
            encoded_unknown_val = self.mapping[feature].index(unknown_val)
            values = df[feature].fillna(encoded_unknown_val).values

            unique_values = list(set(values))
            encoded_values = list(range(self.num_unique[feature]))
            unknown_values = [val for val in unique_values if val not in encoded_values]
            known_values = [
                val
                for val in unique_values
                if val not in unknown_values and val != encoded_unknown_val
            ]

            dtype = self.dtypes[feature]
            is_int = (
                lambda x: str(x).replace(".", "").isdigit() and float(x).is_integer()
            )
            if dtype == int:
                # Do not let floats pass the following check and return None.
                dtype = lambda x: int(x) if is_int(x) else x
            # In fit or _transform, the values are all translated to a consistent dtype (str or int) because of
            # fill_cat_nan. If the input here is an integer, it can also be a category before transform when other
            # categories are strings.
            if (
                any([dtype(val) in self.mapping[feature] for val in unknown_values])
                and all([dtype(val) in self.mapping[feature] for val in known_values])
                and self.dtypes[feature] != int
            ):
                raise _OrdinalEncodingWrongDirException

            if not all([is_int(x) for x in unique_values]):
                raise _OrdinalEncodingWrongDirException(
                    f"The feature {feature} is not integeral ({unique_values}), therefore can not be "
                    f"inverse-transformed."
                )

            transformed_values = np.ones_like(values).astype(
                self.dtypes[feature] if self.dtypes[feature] != str else "U256"
            )
            for i in range(len(transformed_values)):
                transformed_values[i] = unknown_val
            for val in unique_values:
                transformed_values[values == val] = (
                    unknown_val
                    if val in unknown_values
                    else self.mapping[feature][int(val)]
                )
            df[feature] = transformed_values.astype(
                self.dtypes[feature] if self.dtypes[feature] != str else "U256"
            )
        return df