Source code for humancompatible.detect.methods.l_inf.l_inf

import logging
import numpy as np
from typing import Any
from .lp_tools import lin_prog_feas

from humancompatible.detect.binarizer import Binarizer

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")


[docs] def check_l_inf_gap( X: np.ndarray, y: np.ndarray, binarizer: Binarizer, feature_involved: str, subgroup_to_check: Any, delta: float, ) -> float: """ Test whether a protected subgroup's outcome distribution differs from the overall population by **at most** `delta` in the l_inf-norm. Args: X (np.ndarray): Protected-attribute slice of the dataset (same rows as `y`). y (np.ndarray): Boolean target vector. binarizer (Binarizer): The binarizer used to encode `X` and `y`. feature_involved (str): Name of the protected column whose subgroup is tested. subgroup_to_check (Any): Raw value of the subgroup to isolate. delta (float): Threshold for the L-infinity norm. Returns: float: 1.0 (which means True) if the subgroup histogram is within `delta`; 0.0 (which means False) otherwise. Raises: ValueError: If `delta` is not positive. KeyError: If `feature_involved` is not in the binarizer's feature names. KeyError: If `subgroup_to_check` is not a valid value for the feature. """ if delta <= 0: raise ValueError("delta must be positive") if feature_involved not in binarizer.data_handler.feature_names: raise KeyError(f"Feature '{feature_involved}' not in protected set") X_bin = binarizer.data_handler.encode(X, one_hot=False) y_bin = binarizer.encode_y(y) feat_idx = binarizer.data_handler.feature_names.index(feature_involved) feature = binarizer.data_handler.features[feat_idx] try: subgroup_code = feature.value_mapping[subgroup_to_check] except KeyError as e: allowed = list(feature.value_mapping.keys()) raise KeyError(f"{subgroup_to_check!r} not a valid value " f"for '{feature_involved}'. Allowed: {allowed}") from e # Retain only the instances with a positive target outcome -> X_bin_pos X_bin_pos = X_bin[y_bin == 1] # Filter instances of the (potentially) discriminated subgroup -> discr discr = X_bin_pos[X_bin_pos[:, feat_idx] == subgroup_code] # Create array with the dataset feature values (to create histograms) and # get number of encoded subgroups per feature (required for binning) bins = [] columns_all = np.empty(X_bin_pos.shape[0], ) columns_discr = np.empty(discr.shape[0], ) for i in range(X_bin_pos.shape[1]): if i != feat_idx: bins.append(int(X_bin_pos[:, i].max() + 1)) columns_all = np.vstack((columns_all, X_bin_pos[:, i])) columns_discr = np.vstack((columns_discr, discr[:, i])) columns_all = columns_all[1:, :] columns_discr = columns_discr[1:, :] # "Histogramisation" all_hist, _ = np.histogramdd(columns_all.T, bins=bins, density=True) discr_hist, _ = np.histogramdd(columns_discr.T, bins=bins, density=True) # Reshaping dim = 1 for e in all_hist.shape: dim *= e all_rsh = all_hist.reshape(dim, 1) discr_rsh = discr_hist.reshape(dim, 1) status = lin_prog_feas(all_rsh, discr_rsh, delta=delta) is_within = float(status == 0) # 0 = feasible if is_within: logger.info(f"The most impacted subgroup bias <= {delta}") else: logger.info(f"The most impacted subgroup bias > {delta}") return is_within