Skip to content

Correlation

CorrelationResult

CorrelationResult dataclass

CorrelationResult(matrix: DataFrame, blocks: list[tuple[str, int, int]], block_stats: DataFrame, method: CorrelationMethod)

Output of every correlation metric in this module.

matrix instance-attribute

matrix: DataFrame

Square DataFrame whose rows and columns are both labeled by feature name, in graph order.

blocks instance-attribute

blocks: list[tuple[str, int, int]]

[(concept_path, start_idx, end_idx_exclusive), ...] over rows/cols.

block_stats instance-attribute

block_stats: DataFrame

One row per block. Columns: concept_path, size, mean_abs, median_abs, min, max.

feature_correlation

feature_correlation

feature_correlation(graph: ConceptGraph, X: DataFrame, *, method: CorrelationMethod = 'spearman') -> CorrelationResult

Block-structured correlation matrix on feature values (P14).

Diagonal blocks reveal within-concept coherence; off-diagonal blocks reveal boundary leakage (features in different concepts that turn out to be highly correlated).

Source code in src/concept_graph_xai/metrics/correlation.py
def feature_correlation(
    graph: ConceptGraph,
    X: pd.DataFrame,
    *,
    method: CorrelationMethod = "spearman",
) -> CorrelationResult:
    """Correlate raw feature *values* and summarise them per concept block (P14).

    On-diagonal blocks show *within-concept coherence*. Off-diagonal blocks
    show *boundary leakage*, i.e. features assigned to different concepts that
    are nevertheless strongly correlated.

    Args:
        graph: Concept graph used to select/order the features and to derive
            the block boundaries.
        X: Feature values, one column per feature.
        method: Correlation method forwarded to ``DataFrame.corr``.

    Returns:
        A ``CorrelationResult`` holding the correlation matrix, the block
        boundaries and the per-block aggregate statistics.

    Raises:
        TypeError: If ``X`` is not a pandas DataFrame.
        ValueError: If no feature names are returned for the columns of ``X``.
    """
    if not isinstance(X, pd.DataFrame):
        raise TypeError("feature_correlation requires a pandas DataFrame X")

    ordered = _ordered_feature_names(graph, list(X.columns))
    if not ordered:
        raise ValueError("no overlap between graph features and X columns")

    # Correlate only the selected columns, in the order the helper returned,
    # so matrix positions line up with the block boundaries computed below.
    corr = X.loc[:, ordered].corr(method=method)
    spans = block_boundaries(graph, feature_names=ordered)
    stats = _block_aggregates(corr.to_numpy(), spans)
    return CorrelationResult(
        matrix=corr,
        blocks=spans,
        block_stats=stats,
        method=method,
    )

nullity_correlation

nullity_correlation

nullity_correlation(graph: ConceptGraph, X: DataFrame, *, method: CorrelationMethod = 'spearman') -> CorrelationResult

Block-structured correlation matrix on feature missingness (P15a).

Built on X.isna(). A high diagonal-block value means the features in that concept tend to go missing together — directly relevant to the "this branch is missing" AUC-drop scenario.

Source code in src/concept_graph_xai/metrics/correlation.py
def nullity_correlation(
    graph: ConceptGraph,
    X: pd.DataFrame,
    *,
    method: CorrelationMethod = "spearman",
) -> CorrelationResult:
    """Correlate feature *missingness* and summarise it per concept block (P15a).

    The input to the correlation is the 0/1 indicator ``X.isna()``. A high
    value in a diagonal block means the features of that concept tend to be
    missing together, which is directly relevant to the "this branch is
    missing" AUC-drop scenario.

    Args:
        graph: Concept graph used to select/order the features and to derive
            the block boundaries.
        X: Feature values, one column per feature; only its NaN pattern is used.
        method: Correlation method forwarded to ``DataFrame.corr``.

    Returns:
        A ``CorrelationResult`` whose matrix always spans every selected
        feature. Entries involving a feature with constant missingness
        (including that feature's own diagonal entry) are ``0.0``.

    Raises:
        TypeError: If ``X`` is not a pandas DataFrame.
        ValueError: If no feature names are returned for the columns of ``X``.
    """
    if not isinstance(X, pd.DataFrame):
        raise TypeError("nullity_correlation requires a pandas DataFrame X")

    feats = _ordered_feature_names(graph, list(X.columns))
    if not feats:
        raise ValueError("no overlap between graph features and X columns")

    missing = X.loc[:, feats].isna().astype(float)
    # A constant indicator has an undefined correlation, so only columns whose
    # missingness actually varies are passed to ``corr``.
    varying = missing.loc[:, missing.std() > 0]

    if len(varying.columns) == 0:
        # Nothing varies: report an all-zero matrix over the full feature set.
        n = len(feats)
        matrix = pd.DataFrame(np.zeros((n, n)), index=feats, columns=feats)
    else:
        partial = varying.corr(method=method)
        # Re-expand to the full feature set; dropped features become 0.0.
        matrix = partial.reindex(index=feats, columns=feats).fillna(0.0)

    blocks = block_boundaries(graph, feature_names=feats)
    stats = _block_aggregates(matrix.to_numpy(), blocks)
    return CorrelationResult(
        matrix=matrix,
        blocks=blocks,
        block_stats=stats,
        method=method,
    )

shap_correlation

shap_correlation

shap_correlation(graph: ConceptGraph, feature_names: Sequence[str], shap_values: ndarray, *, method: CorrelationMethod = 'spearman') -> CorrelationResult

Block-structured correlation of SHAP values across samples (P17).

Two raw-uncorrelated features can still be SHAP-redundant: diagonal blocks near 1 indicate features inside a concept push the model in the same way; off-diagonal blocks near 1 indicate the model treats different concepts as substitutes.

Source code in src/concept_graph_xai/metrics/correlation.py
def shap_correlation(
    graph: ConceptGraph,
    feature_names: Sequence[str],
    shap_values: np.ndarray,
    *,
    method: CorrelationMethod = "spearman",
) -> CorrelationResult:
    """Block-structured correlation of *SHAP values* across samples (P17).

    Two raw-uncorrelated features can still be SHAP-redundant: diagonal blocks
    near 1 indicate features inside a concept push the model in the same way;
    off-diagonal blocks near 1 indicate the model treats different concepts as
    substitutes.

    Args:
        graph: Concept graph used to select/order the features and to derive
            the block boundaries.
        feature_names: Name of each column of ``shap_values``, in column order.
        shap_values: 2D array of shape ``(N, F)``; it is converted to float.
        method: Correlation method forwarded to ``DataFrame.corr``.

    Returns:
        A ``CorrelationResult`` holding the correlation matrix, the block
        boundaries and the per-block aggregate statistics.

    Raises:
        ValueError: If ``shap_values`` is not 2D, if its column count differs
            from ``len(feature_names)``, or if no feature names are returned
            for ``feature_names``.
    """

    arr = np.asarray(shap_values, dtype=float)
    if arr.ndim != 2:
        raise ValueError(f"shap_values must be 2D (N, F); got shape {arr.shape}")
    if arr.shape[1] != len(feature_names):
        raise ValueError(
            f"shap_values has {arr.shape[1]} features, feature_names has {len(feature_names)}"
        )
    df = pd.DataFrame(arr, columns=list(feature_names))
    feats = _ordered_feature_names(graph, list(df.columns))
    # Same guard as feature_correlation / nullity_correlation: without it an
    # empty selection silently yields a 0x0 matrix instead of a clear error.
    if not feats:
        raise ValueError("no overlap between graph features and feature_names")
    sub = df.loc[:, feats]
    matrix = sub.corr(method=method)
    blocks = block_boundaries(graph, feature_names=feats)
    block_stats = _block_aggregates(matrix.to_numpy(), blocks)
    return CorrelationResult(matrix=matrix, blocks=blocks, block_stats=block_stats, method=method)