import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.base import BaseEstimator, TransformerMixin
from typing import Optional
[docs]
class TargetBasedEncoder(BaseEstimator, TransformerMixin):
"""
A transformer that provides various methods for creating target-based features,
including target mean encoding, smoothed target mean encoding, count encoding,
cross-validated target encoding, and Weight of Evidence (WoE).
"""
def __init__(
    self,
    method: str = 'target_mean',
    target_col: str = '',
    group_col: str = '',
    smoothing: Optional[int] = None,
    n_splits: int = 5
):
    """
    Store the encoder configuration on the instance.

    No validation or computation happens here; every argument is kept
    verbatim under an attribute of the same name.

    Args:
        method (str): Name of the encoding strategy. Documented options:
            'target_mean', 'smoothed_mean', 'count', 'cross_validated_mean', 'woe'.
        target_col (str): Column holding the target values.
        group_col (str): Categorical column to be encoded.
        smoothing (int, optional): Weight given to the global target mean by
            the smoothed target mean encoding (default: None).
        n_splits (int): Number of folds used by the cross-validated
            encoding (default: 5).
    """
    # Which encoding to apply, and on which columns.
    self.method, self.target_col, self.group_col = method, target_col, group_col
    # Strategy-specific tuning knobs.
    self.smoothing, self.n_splits = smoothing, n_splits
[docs]
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> 'TargetBasedEncoder':
    """
    No-op fit, present so the encoder follows the scikit-learn estimator API.

    Nothing is learned or stored: neither ``X`` nor ``y`` is inspected.

    Args:
        X (pd.DataFrame): Input DataFrame (unused).
        y (pd.Series, optional): Target variable (unused).

    Returns:
        TargetBasedEncoder: The same instance, to allow call chaining.
    """
    return self
def _target_mean_encoding(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Encode every row's category as that category's mean target value.

    The per-category means are computed from ``df`` itself and then looked
    up for each row of ``df``.

    Args:
        df (pd.DataFrame): Frame containing both ``self.group_col`` and
            ``self.target_col``.

    Returns:
        pd.DataFrame: Single-column frame named ``<group_col>_target_mean``,
        carrying the same index as ``df``.
    """
    # category -> mean of the target over the rows of that category
    per_category_mean = df.groupby(self.group_col)[self.target_col].mean()
    encoded = df[self.group_col].map(per_category_mean)
    return encoded.rename(f'{self.group_col}_target_mean').to_frame()
def _smoothed_target_mean_encoding(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply smoothed target mean encoding to the group column.

    Each category's mean target is shrunk towards the global target mean:

        (count * category_mean + smoothing * global_mean) / (count + smoothing)

    so categories with few rows stay close to the global mean.

    Args:
        df (pd.DataFrame): Frame containing both ``self.group_col`` and
            ``self.target_col``.

    Returns:
        pd.DataFrame: Single-column frame named
        ``<group_col>_smoothed_target_mean``, carrying the same index as ``df``.
    """
    # ``smoothing`` defaults to None in the constructor; using it directly in
    # the arithmetic below would raise TypeError. Treat None as a weight of 0,
    # i.e. no shrinkage, which reduces to the plain per-category mean.
    prior_weight = 0 if self.smoothing is None else self.smoothing

    global_mean = df[self.target_col].mean()
    agg = df.groupby(self.group_col)[self.target_col].agg(['mean', 'count'])
    smoothed_mean = (
        (agg['count'] * agg['mean'] + prior_weight * global_mean)
        / (agg['count'] + prior_weight)
    )
    return df[self.group_col].map(smoothed_mean).to_frame(f'{self.group_col}_smoothed_target_mean')
def _count_encoding(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Encode every row's category as the number of rows sharing that category.

    Args:
        df (pd.DataFrame): Frame containing ``self.group_col``.

    Returns:
        pd.DataFrame: Single-column frame named ``<group_col>_count``,
        carrying the same index as ``df``.
    """
    column = self.group_col
    # category -> number of occurrences in ``df``
    occurrences = df[column].value_counts()
    encoded = df[column].map(occurrences)
    return encoded.to_frame(name=f'{column}_count')
def _cross_validated_target_encoding(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply cross-validated (out-of-fold) target mean encoding to the group column.

    The rows are split into ``self.n_splits`` shuffled folds. Each row is
    encoded with the per-category target mean computed on the *other* folds,
    so a row's own target never contributes to its encoding. A category that
    does not occur in the training part of a fold is encoded as NaN.

    The folds are shuffled without a fixed seed, so the result can differ
    between calls.

    Args:
        df (pd.DataFrame): Frame containing both ``self.group_col`` and
            ``self.target_col``. Any index is supported.

    Returns:
        pd.DataFrame: Single float column named ``<group_col>_cv_target_mean``,
        carrying the same index as ``df``.
    """
    kf = KFold(n_splits=self.n_splits, shuffle=True)
    groups = df[self.group_col]

    # Collect results in a positional float buffer. KFold yields *positions*,
    # so writing through label-based ``.loc`` would hit the wrong rows (or
    # raise KeyError) whenever ``df`` does not have a default RangeIndex.
    # Starting from NaN rather than integer 0 also avoids assigning floats
    # into an integer column.
    encoded = np.full(len(df), np.nan, dtype=float)

    for train_idx, val_idx in kf.split(df):
        fold_means = df.iloc[train_idx].groupby(self.group_col)[self.target_col].mean()
        encoded[val_idx] = (
            groups.iloc[val_idx].map(fold_means).to_numpy(dtype=float, na_value=np.nan)
        )

    return pd.DataFrame({f'{self.group_col}_cv_target_mean': encoded}, index=df.index)
def _calculate_woe(self, df: pd.DataFrame, eps: float = 1e-6) -> pd.DataFrame:
    """
    Calculate a Weight of Evidence (WoE) style encoding for the group column.

    For every category the positive rate ``p`` (mean of the target within the
    category) is turned into the log-odds ``ln(p / (1 - p))``, which is then
    looked up for each row.

    NOTE(review): this is the per-category log-odds; the textbook WoE also
    subtracts the global log-odds, so the two appear to differ by a constant
    offset. Left unchanged to keep existing outputs stable; confirm intent.

    Args:
        df (pd.DataFrame): Frame containing both ``self.group_col`` and
            ``self.target_col``. The target is presumably binary (0/1).
        eps (float): Positive rates are clipped to ``[eps, 1 - eps]`` before
            taking the log, so categories whose target is all 0 or all 1 get
            a large finite value instead of -inf / +inf (default: 1e-6).

    Returns:
        pd.DataFrame: Single-column frame named ``<group_col>_woe``, carrying
        the same index as ``df``.
    """
    pos_prob = df.groupby(self.group_col)[self.target_col].mean()
    # Keep the rate strictly inside (0, 1): p == 0 gives log(0) and p == 1
    # divides by zero. Rates already inside the range are left untouched.
    pos_prob = pos_prob.clip(lower=eps, upper=1 - eps)
    neg_prob = 1 - pos_prob
    woe = np.log(pos_prob / neg_prob)
    return df[self.group_col].map(woe).to_frame(f'{self.group_col}_woe')