# Source code for humancompatible.detect.methods.l_inf.l_inf

import logging
import numpy as np
from typing import Any
from .lp_tools import lin_prog_feas

from humancompatible.detect.binarizer import Binarizer

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# NOTE(review): basicConfig configures the *root* logger as an import-time
# side effect (INFO level, "[LEVEL] message" format). Libraries usually leave
# logging configuration to the application -- confirm this is intended.
logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")



# [docs]
def check_l_inf_gap(
    X: np.ndarray,
    y: np.ndarray,
    binarizer: Binarizer,
    feature_involved: str,
    subgroup_to_check: Any,
    delta: float,
) -> float:
    """
    Test whether a protected subgroup's outcome distribution differs from the
    overall population by **at most** `delta` in the l_inf-norm.

    Only rows with a positive (encoded) target are considered. For those rows,
    a joint histogram over all features *except* `feature_involved` is built
    once for the whole population and once for the requested subgroup. Both
    histograms share the same bin edges (derived from the population), so they
    can be compared cell by cell. The flattened histograms are then handed to
    `lin_prog_feas`, whose status code decides the result.

    Args:
        X (np.ndarray): Protected-attribute slice of the dataset (same rows as `y`).
        y (np.ndarray): Boolean target vector.
        binarizer (Binarizer): The binarizer used to encode `X` and `y`.
        feature_involved (str): Name of the protected column whose subgroup is tested.
        subgroup_to_check (Any): Raw value of the subgroup to isolate.
        delta (float): Threshold for the L-infinity norm.

    Returns:
        float: 1.0 (which means True) if the subgroup histogram is within `delta`;
            0.0 (which means False) otherwise.

    Raises:
        ValueError: If `delta` is not positive.
        ValueError: If no row has a positive target outcome, if the subgroup
            has no row with a positive target outcome, or if there is no
            feature besides `feature_involved` to build a histogram from.
        KeyError: If `feature_involved` is not in the binarizer's feature names.
        KeyError: If `subgroup_to_check` is not a valid value for the feature.
    """
    if delta <= 0:
        raise ValueError("delta must be positive")

    if feature_involved not in binarizer.data_handler.feature_names:
        raise KeyError(f"Feature '{feature_involved}' not in protected set")

    X_bin = binarizer.data_handler.encode(X, one_hot=False)
    y_bin = binarizer.encode_y(y)

    feat_idx = binarizer.data_handler.feature_names.index(feature_involved)
    feature = binarizer.data_handler.features[feat_idx]

    try:
        subgroup_code = feature.value_mapping[subgroup_to_check]
    except KeyError as e:
        allowed = list(feature.value_mapping.keys())
        raise KeyError(f"{subgroup_to_check!r} not a valid value "
                       f"for '{feature_involved}'. Allowed: {allowed}") from e

    # Retain only the instances with a positive target outcome.
    X_bin_pos = X_bin[y_bin == 1]
    if X_bin_pos.shape[0] == 0:
        raise ValueError("No instances with a positive target outcome")

    # Filter instances of the (potentially) discriminated subgroup.
    discr = X_bin_pos[X_bin_pos[:, feat_idx] == subgroup_code]
    if discr.shape[0] == 0:
        # An empty sample would make the normalised histogram 0/0 = NaN.
        raise ValueError(
            f"Subgroup {subgroup_to_check!r} of '{feature_involved}' has no "
            "instances with a positive target outcome"
        )

    # Histogram over every feature except the protected one.
    other_cols = [i for i in range(X_bin_pos.shape[1]) if i != feat_idx]
    if not other_cols:
        raise ValueError(
            f"No features other than '{feature_involved}' to build a histogram from"
        )

    sample_all = np.asarray(X_bin_pos[:, other_cols], dtype=float)
    sample_discr = np.asarray(discr[:, other_cols], dtype=float)

    # One bin per encoded value (codes run up to the column maximum).
    col_min = sample_all.min(axis=0)
    col_max = sample_all.max(axis=0)
    bins = [int(m + 1) for m in col_max]

    # Pin the bin edges to the population's range for BOTH histograms. With
    # integer bin counts and no explicit range, histogramdd derives the edges
    # from each sample's own min/max, so a subgroup that does not span the
    # full range would be binned on a different grid than the population.
    # NOTE(review): density=True normalises by bin volume, so cells hold
    # densities rather than probability masses -- confirm `delta` is meant
    # on that scale.
    hist_range = [(float(lo), float(hi)) for lo, hi in zip(col_min, col_max)]

    # "Histogramisation"
    all_hist, _ = np.histogramdd(
        sample_all, bins=bins, range=hist_range, density=True
    )
    discr_hist, _ = np.histogramdd(
        sample_discr, bins=bins, range=hist_range, density=True
    )

    # Flatten each histogram into a column vector for the LP.
    all_rsh = all_hist.reshape(-1, 1)
    discr_rsh = discr_hist.reshape(-1, 1)

    status = lin_prog_feas(all_rsh, discr_rsh, delta=delta)
    is_within = float(status == 0)  # 0 = feasible
    if is_within:
        logger.info("The most impacted subgroup bias <= %s", delta)
    else:
        logger.info("The most impacted subgroup bias > %s", delta)

    return is_within