Source code for transform_features
import numpy as np
import pandas as pd
from sklearn.preprocessing import (
StandardScaler,
MinMaxScaler,
PowerTransformer,
QuantileTransformer,
FunctionTransformer
)
from sklearn.base import BaseEstimator, TransformerMixin
from typing import List, Optional, Union, Any, Dict
from scipy.stats import boxcox
[docs]
class FeatureTransformer(BaseEstimator, TransformerMixin):
"""
A transformer class that applies various feature transformations to numerical data,
including logarithmic, square root, power, scaling, and other transformations.
"""
def __init__(
self,
columns: Optional[List[str]] = None,
transformations: Optional[Union[str, List[str], Dict[str, str]]] = None,
method: str = 'yeo-johnson',
output_distribution: str = 'normal',
**kwargs: Any
):
"""
Initializes the FeatureTransformer.
Args:
columns (List[str], optional): List of column names to transform. If None, all numeric columns are used.
transformations (Union[str, List[str], Dict[str, str]], optional): Transformation(s) to apply.
Can be a single string, a list of strings, or a dictionary mapping columns to transformations.
Supported transformations include:
- 'log'
- 'sqrt'
- 'power'
- 'boxcox'
- 'zscore'
- 'minmax'
- 'quantile'
- 'rank'
- 'dft' (Discrete Fourier Transform)
method (str): Method to use for power transformations ('yeo-johnson' or 'box-cox'). Default is 'yeo-johnson'.
output_distribution (str): Desired output distribution for quantile transformation ('normal' or 'uniform').
**kwargs: Additional keyword arguments for specific transformers.
"""
self.columns = columns
self.transformations = transformations
self.method = method
self.output_distribution = output_distribution
self.kwargs = kwargs
self.transformers_: Dict[str, Any] = {}
[docs]
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> 'FeatureTransformer':
"""
Fits the transformer to the data.
Args:
X (pd.DataFrame): Input DataFrame.
y (pd.Series, optional): Target variable (not used).
Returns:
self
"""
X = X.copy()
if self.columns is None:
self.columns = X.select_dtypes(include=[np.number]).columns.tolist()
if isinstance(self.transformations, str):
transformations = {col: self.transformations for col in self.columns}
elif isinstance(self.transformations, list):
transformations = {col: self.transformations[i % len(self.transformations)] for i, col in enumerate(self.columns)}
elif isinstance(self.transformations, dict):
transformations = self.transformations
else:
raise ValueError("transformations must be a string, list, or dictionary.")
for col in self.columns:
trans = transformations.get(col)
if trans == 'log':
self.transformers_[col] = FunctionTransformer(
func=lambda x: np.log(x.replace(0, np.nan)),
inverse_func=np.exp,
check_inverse=False
)
elif trans == 'sqrt':
self.transformers_[col] = FunctionTransformer(
func=np.sqrt,
inverse_func=np.square,
check_inverse=False
)
elif trans == 'power':
self.transformers_[col] = PowerTransformer(method=self.method, **self.kwargs)
elif trans == 'boxcox':
self.transformers_[col] = FunctionTransformer(
func=lambda x: boxcox(x.clip(lower=1e-6))[0],
check_inverse=False
)
elif trans == 'zscore':
self.transformers_[col] = StandardScaler(**self.kwargs)
elif trans == 'minmax':
self.transformers_[col] = MinMaxScaler(**self.kwargs)
elif trans == 'quantile':
self.transformers_[col] = QuantileTransformer(
output_distribution=self.output_distribution, **self.kwargs
)
elif trans == 'rank':
self.transformers_[col] = FunctionTransformer(
func=lambda x: x.rank(),
check_inverse=False
)
elif trans == 'dft':
self.transformers_[col] = FunctionTransformer(
func=lambda x: np.fft.fft(x.to_numpy()).real,
check_inverse=False
)
else:
raise ValueError(f"Unsupported transformation: {trans}")
self.transformers_[col].fit(X[[col]])
return self
[docs]
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
"""
Transforms the input DataFrame.
Args:
X (pd.DataFrame): Input DataFrame.
Returns:
pd.DataFrame: Transformed DataFrame.
"""
X_transformed = X.copy()
for col, transformer in self.transformers_.items():
if col in X.columns:
X_transformed[col] = transformer.transform(X[[col]])
else:
raise ValueError(f"Column '{col}' not found in input DataFrame.")
return X_transformed
[docs]
def inverse_transform(self, X: pd.DataFrame) -> pd.DataFrame:
"""
Inverse transforms the input DataFrame.
Args:
X (pd.DataFrame): Transformed DataFrame.
Returns:
pd.DataFrame: Original DataFrame.
"""
X_inv = X.copy()
for col, transformer in self.transformers_.items():
if hasattr(transformer, 'inverse_transform'):
X_inv[col] = transformer.inverse_transform(X[[col]])
else:
raise ValueError(f"Transformer for column '{col}' does not support inverse_transform.")
return X_inv
[docs]
def get_params(self, deep: bool = True) -> Dict[str, Any]:
"""
Get parameters for this estimator.
Args:
deep (bool): If True, will return the parameters for this estimator and contained subobjects that are estimators.
Returns:
Dict[str, Any]: Parameter names mapped to their values.
"""
return {
'columns': self.columns,
'transformations': self.transformations,
'method': self.method,
'output_distribution': self.output_distribution,
**self.kwargs
}
[docs]
def set_params(self, **params: Any) -> 'FeatureTransformer':
"""
Set the parameters of this estimator.
Args:
**params: Estimator parameters.
Returns:
FeatureTransformer: Returns self.
"""
for key, value in params.items():
setattr(self, key, value)
return self