# Source code for reduce_dimension
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA, TruncatedSVD, FactorAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.manifold import TSNE, Isomap
from sklearn.exceptions import NotFittedError
import umap.umap_ as umap
from typing import Optional, Any
# Conditional import of keras depending on version
try:
from keras.layers import Input, Dense # type: ignore
from keras.models import Model # type: ignore
except ImportError:
from tensorflow.keras.layers import Input, Dense # type: ignore
from tensorflow.keras.models import Model # type: ignore
[docs]
class PCAReducer(BaseEstimator, TransformerMixin):
    """
    Dimensionality reduction using Principal Component Analysis (PCA).

    DataFrame-in / DataFrame-out wrapper around ``sklearn.decomposition.PCA``.
    Output columns are named ``PC1``, ``PC2``, ... and the input index is kept.

    Attributes:
        n_components: Value forwarded to PCA's ``n_components``.
        kwargs (dict): Extra keyword arguments forwarded to PCA.
        pca (PCA): The wrapped estimator.
        columns_: Columns of the DataFrame passed to ``fit`` (``None`` before fitting).
    """

    def __init__(self, n_components: int = 2, **kwargs: Any):
        """
        Initializes the PCAReducer.

        Args:
            n_components (int): Number of principal components to keep.
            **kwargs: Additional keyword arguments for sklearn.decomposition.PCA.
        """
        self.n_components = n_components
        self.kwargs = kwargs
        self.pca = PCA(n_components=self.n_components, **self.kwargs)
        self.columns_ = None

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> 'PCAReducer':
        """
        Fits the PCA model to the data.

        Args:
            X (pd.DataFrame): Input DataFrame.
            y (pd.Series, optional): Target variable (not used).

        Returns:
            self
        """
        # The wrapped estimator was created in __init__; push the current
        # hyper-parameters onto it so that changes made afterwards (for
        # example through set_params) are not silently ignored.
        self.pca.set_params(n_components=self.n_components, **self.kwargs)
        self.pca.fit(X)
        self.columns_ = X.columns
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Transforms the input DataFrame using the fitted PCA model.

        Args:
            X (pd.DataFrame): Input DataFrame.

        Returns:
            pd.DataFrame: Transformed DataFrame, indexed like ``X``.

        Raises:
            NotFittedError: If the wrapped PCA has not been fitted.
        """
        from sklearn.utils.validation import check_is_fitted

        # ``self.pca`` is never None (it is built in __init__), so fitted-ness
        # has to be checked on the estimator's state instead.
        check_is_fitted(self.pca, msg="This PCAReducer instance is not fitted yet.")
        X_pca = self.pca.transform(X)
        # Name columns from the actual output width: the wrapped estimator
        # decides how many components it produces, which need not equal
        # ``self.n_components`` when that value is not a plain int.
        component_names = [f'PC{i+1}' for i in range(X_pca.shape[1])]
        return pd.DataFrame(X_pca, columns=component_names, index=X.index)

    def fit_transform(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> pd.DataFrame:
        """
        Fits the PCA model and transforms the input DataFrame.

        Args:
            X (pd.DataFrame): Input DataFrame.
            y (pd.Series, optional): Target variable (not used).

        Returns:
            pd.DataFrame: Transformed DataFrame.
        """
        return self.fit(X).transform(X)
[docs]
class LDAReducer(BaseEstimator, TransformerMixin):
    """
    Dimensionality reduction using Linear Discriminant Analysis (LDA).

    DataFrame-in / DataFrame-out wrapper around
    ``sklearn.discriminant_analysis.LinearDiscriminantAnalysis``. Output
    columns are named ``LD1``, ``LD2``, ... and the input index is kept.

    Attributes:
        n_components: Value forwarded to LDA's ``n_components``.
        kwargs (dict): Extra keyword arguments forwarded to LDA.
        lda (LinearDiscriminantAnalysis): The wrapped estimator.
        columns_: Columns of the DataFrame passed to ``fit`` (``None`` before fitting).
    """

    def __init__(self, n_components: int = 2, **kwargs: Any):
        """
        Initializes the LDAReducer.

        Args:
            n_components (int): Number of linear discriminants to retain.
            **kwargs: Additional keyword arguments for
                sklearn.discriminant_analysis.LinearDiscriminantAnalysis.
        """
        self.n_components = n_components
        self.kwargs = kwargs
        self.lda = LDA(n_components=self.n_components, **self.kwargs)
        self.columns_ = None

    def fit(self, X: pd.DataFrame, y: pd.Series) -> 'LDAReducer':
        """
        Fits the LDA model to the data.

        Args:
            X (pd.DataFrame): Input DataFrame.
            y (pd.Series): Target variable.

        Returns:
            self
        """
        # Re-apply current hyper-parameters: the estimator was built in
        # __init__ and would otherwise miss later set_params/attribute changes.
        self.lda.set_params(n_components=self.n_components, **self.kwargs)
        self.lda.fit(X, y)
        self.columns_ = X.columns
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Transforms the input DataFrame using the fitted LDA model.

        Args:
            X (pd.DataFrame): Input DataFrame.

        Returns:
            pd.DataFrame: Transformed DataFrame, indexed like ``X``.

        Raises:
            NotFittedError: If the wrapped LDA has not been fitted.
        """
        from sklearn.utils.validation import check_is_fitted

        # ``self.lda`` is always set by __init__, so test the estimator's
        # fitted state rather than comparing it with None.
        check_is_fitted(self.lda, msg="This LDAReducer instance is not fitted yet.")
        X_lda = self.lda.transform(X)
        # Use the real output width; ``range(self.n_components)`` fails when
        # n_components is None and can disagree with what LDA produced.
        component_names = [f'LD{i+1}' for i in range(X_lda.shape[1])]
        return pd.DataFrame(X_lda, columns=component_names, index=X.index)

    def fit_transform(self, X: pd.DataFrame, y: pd.Series) -> pd.DataFrame:
        """
        Fits the LDA model and transforms the input DataFrame.

        Args:
            X (pd.DataFrame): Input DataFrame.
            y (pd.Series): Target variable.

        Returns:
            pd.DataFrame: Transformed DataFrame.
        """
        return self.fit(X, y).transform(X)
[docs]
class SVDReducer(BaseEstimator, TransformerMixin):
    """
    Dimensionality reduction using Truncated Singular Value Decomposition (SVD).

    DataFrame-in / DataFrame-out wrapper around
    ``sklearn.decomposition.TruncatedSVD``. Output columns are named
    ``SVD1``, ``SVD2``, ... and the input index is kept.

    Attributes:
        n_components: Value forwarded to TruncatedSVD's ``n_components``.
        kwargs (dict): Extra keyword arguments forwarded to TruncatedSVD.
        svd (TruncatedSVD): The wrapped estimator.
        columns_: Columns of the DataFrame passed to ``fit`` (``None`` before fitting).
    """

    def __init__(self, n_components: int = 2, **kwargs: Any):
        """
        Initializes the SVDReducer.

        Args:
            n_components (int): Number of singular values to keep.
            **kwargs: Additional keyword arguments for sklearn.decomposition.TruncatedSVD.
        """
        self.n_components = n_components
        self.kwargs = kwargs
        self.svd = TruncatedSVD(n_components=self.n_components, **self.kwargs)
        self.columns_ = None

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> 'SVDReducer':
        """
        Fits the SVD model to the data.

        Args:
            X (pd.DataFrame): Input DataFrame.
            y (pd.Series, optional): Target variable (not used).

        Returns:
            self
        """
        # Keep the wrapped estimator in step with the wrapper's current
        # hyper-parameters (they may have changed since __init__).
        self.svd.set_params(n_components=self.n_components, **self.kwargs)
        self.svd.fit(X)
        self.columns_ = X.columns
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Transforms the input DataFrame using the fitted SVD model.

        Args:
            X (pd.DataFrame): Input DataFrame.

        Returns:
            pd.DataFrame: Transformed DataFrame, indexed like ``X``.

        Raises:
            NotFittedError: If the wrapped TruncatedSVD has not been fitted.
        """
        from sklearn.utils.validation import check_is_fitted

        # ``self.svd`` is created in __init__ and therefore never None;
        # check the estimator's fitted state instead.
        check_is_fitted(self.svd, msg="This SVDReducer instance is not fitted yet.")
        X_svd = self.svd.transform(X)
        # Column names follow the width of the array actually returned.
        component_names = [f'SVD{i+1}' for i in range(X_svd.shape[1])]
        return pd.DataFrame(X_svd, columns=component_names, index=X.index)

    def fit_transform(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> pd.DataFrame:
        """
        Fits the SVD model and transforms the input DataFrame.

        Args:
            X (pd.DataFrame): Input DataFrame.
            y (pd.Series, optional): Target variable (not used).

        Returns:
            pd.DataFrame: Transformed DataFrame.
        """
        return self.fit(X).transform(X)
[docs]
class FactorAnalysisReducer(BaseEstimator, TransformerMixin):
    """
    Dimensionality reduction using Factor Analysis.

    DataFrame-in / DataFrame-out wrapper around
    ``sklearn.decomposition.FactorAnalysis``. Output columns are named
    ``FA1``, ``FA2``, ... and the input index is kept.

    Attributes:
        n_components: Value forwarded to FactorAnalysis' ``n_components``.
        kwargs (dict): Extra keyword arguments forwarded to FactorAnalysis.
        fa (FactorAnalysis): The wrapped estimator.
        columns_: Columns of the DataFrame passed to ``fit`` (``None`` before fitting).
    """

    def __init__(self, n_components: int = 2, **kwargs: Any):
        """
        Initializes the FactorAnalysisReducer.

        Args:
            n_components (int): Number of factors to retain.
            **kwargs: Additional keyword arguments for sklearn.decomposition.FactorAnalysis.
        """
        self.n_components = n_components
        self.kwargs = kwargs
        self.fa = FactorAnalysis(n_components=self.n_components, **self.kwargs)
        self.columns_ = None

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> 'FactorAnalysisReducer':
        """
        Fits the Factor Analysis model to the data.

        Args:
            X (pd.DataFrame): Input DataFrame.
            y (pd.Series, optional): Target variable (not used).

        Returns:
            self
        """
        # The estimator was instantiated in __init__; refresh its
        # hyper-parameters so later changes on the wrapper take effect.
        self.fa.set_params(n_components=self.n_components, **self.kwargs)
        self.fa.fit(X)
        self.columns_ = X.columns
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Transforms the input DataFrame using the fitted Factor Analysis model.

        Args:
            X (pd.DataFrame): Input DataFrame.

        Returns:
            pd.DataFrame: Transformed DataFrame, indexed like ``X``.

        Raises:
            NotFittedError: If the wrapped FactorAnalysis has not been fitted.
        """
        from sklearn.utils.validation import check_is_fitted

        # ``self.fa`` always exists after __init__, so a None test cannot
        # detect an unfitted model; inspect the estimator instead.
        check_is_fitted(self.fa, msg="This FactorAnalysisReducer instance is not fitted yet.")
        X_fa = self.fa.transform(X)
        # Derive names from the output width; ``range(self.n_components)``
        # raises when n_components is None.
        component_names = [f'FA{i+1}' for i in range(X_fa.shape[1])]
        return pd.DataFrame(X_fa, columns=component_names, index=X.index)

    def fit_transform(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> pd.DataFrame:
        """
        Fits the Factor Analysis model and transforms the input DataFrame.

        Args:
            X (pd.DataFrame): Input DataFrame.
            y (pd.Series, optional): Target variable (not used).

        Returns:
            pd.DataFrame: Transformed DataFrame.
        """
        return self.fit(X).transform(X)
[docs]
class TSNEReducer(BaseEstimator, TransformerMixin):
    """
    Dimensionality reduction using t-Distributed Stochastic Neighbor Embedding (t-SNE).

    DataFrame-in / DataFrame-out wrapper around ``sklearn.manifold.TSNE``.
    Output columns are named ``tSNE1``, ``tSNE2``, ... and the input index
    is kept.

    Note:
        ``transform`` does not project new data onto a previously learned
        embedding; it calls the wrapped estimator's ``fit_transform`` on
        whatever ``X`` it is given, i.e. it computes a fresh embedding.

    Attributes:
        n_components: Value forwarded to TSNE's ``n_components``.
        perplexity: Value forwarded to TSNE's ``perplexity``.
        kwargs (dict): Extra keyword arguments forwarded to TSNE.
        tsne (TSNE): The wrapped estimator.
        columns_: Columns of the DataFrame last fitted (``None`` before fitting).
        fitted (bool): Whether ``fit`` / ``fit_transform`` has completed.
    """

    def __init__(self, n_components: int = 2, perplexity: float = 30.0, **kwargs: Any):
        """
        Initializes the TSNEReducer.

        Args:
            n_components (int): Number of dimensions to reduce to.
            perplexity (float): Perplexity parameter for t-SNE.
            **kwargs: Additional keyword arguments for sklearn.manifold.TSNE.
        """
        self.n_components = n_components
        self.perplexity = perplexity
        self.kwargs = kwargs
        self.tsne = TSNE(n_components=self.n_components, perplexity=self.perplexity, **self.kwargs)
        self.columns_ = None
        self.fitted = False

    def _sync_params(self) -> None:
        """Push the wrapper's current hyper-parameters onto the wrapped TSNE."""
        self.tsne.set_params(
            n_components=self.n_components,
            perplexity=self.perplexity,
            **self.kwargs,
        )

    @staticmethod
    def _to_frame(embedding: Any, index: Any) -> pd.DataFrame:
        """Wrap an embedding array in a DataFrame with ``tSNE<k>`` column names."""
        component_names = [f'tSNE{i+1}' for i in range(embedding.shape[1])]
        return pd.DataFrame(embedding, columns=component_names, index=index)

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> 'TSNEReducer':
        """
        Fits the t-SNE model to the data.

        Args:
            X (pd.DataFrame): Input DataFrame.
            y (pd.Series, optional): Target variable (not used).

        Returns:
            self
        """
        # The estimator was built in __init__; re-apply hyper-parameters so
        # changes made on the wrapper afterwards are honoured.
        self._sync_params()
        self.tsne.fit(X)
        self.columns_ = X.columns
        self.fitted = True
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Embeds the input DataFrame with t-SNE.

        This re-runs the wrapped estimator's ``fit_transform`` on ``X``
        (see the class-level note); it does not reuse an earlier embedding.

        Args:
            X (pd.DataFrame): Input DataFrame.

        Returns:
            pd.DataFrame: Transformed DataFrame, indexed like ``X``.

        Raises:
            NotFittedError: If neither ``fit`` nor ``fit_transform`` was called before.
        """
        if not self.fitted:
            raise NotFittedError("This TSNEReducer instance is not fitted yet.")
        X_tsne = self.tsne.fit_transform(X)
        return self._to_frame(X_tsne, X.index)

    def fit_transform(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> pd.DataFrame:
        """
        Fits the t-SNE model and returns the embedding of the input DataFrame.

        The embedding is computed once. Chaining ``fit`` and ``transform``
        would run the (expensive) optimisation twice, because ``transform``
        itself calls the wrapped estimator's ``fit_transform``.

        Args:
            X (pd.DataFrame): Input DataFrame.
            y (pd.Series, optional): Target variable (not used).

        Returns:
            pd.DataFrame: Transformed DataFrame.
        """
        self._sync_params()
        X_tsne = self.tsne.fit_transform(X)
        self.columns_ = X.columns
        self.fitted = True
        return self._to_frame(X_tsne, X.index)
[docs]
class UMAPReducer(BaseEstimator, TransformerMixin):
    """
    Dimensionality reduction using Uniform Manifold Approximation and Projection (UMAP).

    DataFrame-in / DataFrame-out wrapper around ``umap.UMAP``. Output columns
    are named ``UMAP1``, ``UMAP2``, ... and the input index is kept.

    Attributes:
        n_components: Value forwarded to UMAP's ``n_components``.
        kwargs (dict): Extra keyword arguments forwarded to UMAP.
        umap (umap.UMAP): The wrapped estimator.
        columns_: Columns of the DataFrame passed to ``fit`` (``None`` before fitting).
        fitted (bool): Whether ``fit`` has completed.
    """

    def __init__(self, n_components: int = 2, **kwargs: Any):
        """
        Initializes the UMAPReducer.

        Args:
            n_components (int): Number of dimensions to reduce to.
            **kwargs: Additional keyword arguments for umap.UMAP.
        """
        self.n_components = n_components
        self.kwargs = kwargs
        self.umap = umap.UMAP(n_components=self.n_components, **self.kwargs)
        self.columns_ = None
        self.fitted = False

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> 'UMAPReducer':
        """
        Fits the UMAP model to the data.

        Args:
            X (pd.DataFrame): Input DataFrame.
            y (pd.Series, optional): Target variable; passed through to
                the wrapped estimator's ``fit``.

        Returns:
            self
        """
        # The estimator was built in __init__; re-apply hyper-parameters so
        # that later changes on the wrapper (e.g. set_params) are used.
        self.umap.set_params(n_components=self.n_components, **self.kwargs)
        self.umap.fit(X, y=y)
        self.columns_ = X.columns
        self.fitted = True
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Transforms the input DataFrame using the fitted UMAP model.

        Args:
            X (pd.DataFrame): Input DataFrame.

        Returns:
            pd.DataFrame: Transformed DataFrame, indexed like ``X``.

        Raises:
            NotFittedError: If ``fit`` has not been called.
        """
        if not self.fitted:
            raise NotFittedError("This UMAPReducer instance is not fitted yet.")
        X_umap = self.umap.transform(X)
        # Column names follow the width of the array actually returned.
        component_names = [f'UMAP{i+1}' for i in range(X_umap.shape[1])]
        return pd.DataFrame(X_umap, columns=component_names, index=X.index)

    def fit_transform(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> pd.DataFrame:
        """
        Fits the UMAP model and transforms the input DataFrame.

        Args:
            X (pd.DataFrame): Input DataFrame.
            y (pd.Series, optional): Target variable.

        Returns:
            pd.DataFrame: Transformed DataFrame.
        """
        return self.fit(X, y=y).transform(X)
[docs]
class IsomapReducer(BaseEstimator, TransformerMixin):
    """
    Dimensionality reduction using Isomap.

    DataFrame-in / DataFrame-out wrapper around ``sklearn.manifold.Isomap``.
    Output columns are named ``Isomap1``, ``Isomap2``, ... and the input
    index is kept.

    Attributes:
        n_components: Value forwarded to Isomap's ``n_components``.
        n_neighbors: Value forwarded to Isomap's ``n_neighbors``.
        kwargs (dict): Extra keyword arguments forwarded to Isomap.
        isomap (Isomap): The wrapped estimator.
        columns_: Columns of the DataFrame passed to ``fit`` (``None`` before fitting).
    """

    def __init__(self, n_components: int = 2, n_neighbors: int = 5, **kwargs: Any):
        """
        Initializes the IsomapReducer.

        Args:
            n_components (int): Number of dimensions to reduce to.
            n_neighbors (int): Number of neighbors to use when computing geodesic distances.
            **kwargs: Additional keyword arguments for sklearn.manifold.Isomap.
        """
        self.n_components = n_components
        self.n_neighbors = n_neighbors
        self.kwargs = kwargs
        self.isomap = Isomap(n_components=self.n_components, n_neighbors=self.n_neighbors, **self.kwargs)
        self.columns_ = None

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> 'IsomapReducer':
        """
        Fits the Isomap model to the data.

        Args:
            X (pd.DataFrame): Input DataFrame.
            y (pd.Series, optional): Target variable (not used).

        Returns:
            self
        """
        # The estimator was built in __init__; re-apply hyper-parameters so
        # that later changes on the wrapper are not silently ignored.
        self.isomap.set_params(
            n_components=self.n_components,
            n_neighbors=self.n_neighbors,
            **self.kwargs,
        )
        self.isomap.fit(X)
        self.columns_ = X.columns
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Transforms the input DataFrame using the fitted Isomap model.

        Args:
            X (pd.DataFrame): Input DataFrame.

        Returns:
            pd.DataFrame: Transformed DataFrame, indexed like ``X``.

        Raises:
            NotFittedError: If the wrapped Isomap has not been fitted.
        """
        from sklearn.utils.validation import check_is_fitted

        # Explicit guard so an unfitted call reports this wrapper's name,
        # matching the other reducers in this module.
        check_is_fitted(self.isomap, msg="This IsomapReducer instance is not fitted yet.")
        X_isomap = self.isomap.transform(X)
        # Column names follow the width of the array actually returned.
        component_names = [f'Isomap{i+1}' for i in range(X_isomap.shape[1])]
        return pd.DataFrame(X_isomap, columns=component_names, index=X.index)

    def fit_transform(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> pd.DataFrame:
        """
        Fits the Isomap model and transforms the input DataFrame.

        Args:
            X (pd.DataFrame): Input DataFrame.
            y (pd.Series, optional): Target variable (not used).

        Returns:
            pd.DataFrame: Transformed DataFrame.
        """
        return self.fit(X).transform(X)
[docs]
class AutoencoderReducer(BaseEstimator, TransformerMixin):
    """
    Dimensionality reduction using Autoencoders.

    ``fit`` builds and trains a network with one hidden ``Dense`` layer
    (ReLU, ``encoding_dim`` units) and a sigmoid output layer as wide as the
    input, training it to reproduce its own input. ``transform`` returns the
    hidden-layer activations as a DataFrame with columns ``AE1``, ``AE2``, ...

    Attributes:
        autoencoder: Full input -> reconstruction model (``None`` before ``fit``).
        encoder: Input -> hidden-layer model (``None`` before ``fit``).
        columns_: Columns of the DataFrame passed to ``fit`` (``None`` before ``fit``).
    """

    def __init__(self, encoding_dim: int = 10, epochs: int = 50, batch_size: int = 32, optimizer: str = 'adam', loss: str = 'mse', **kwargs: Any):
        """
        Stores the training configuration; no model is built here.

        Args:
            encoding_dim (int): Size of the encoding layer.
            epochs (int): Number of training epochs.
            batch_size (int): Batch size for training.
            optimizer (str): Optimizer to use for training.
            loss (str): Loss function to use for training.
            **kwargs: Additional keyword arguments, kept on ``self.kwargs``.
        """
        # Models and fitted state are created by fit().
        self.autoencoder = None
        self.encoder = None
        self.columns_ = None
        # Hyper-parameters.
        self.encoding_dim = encoding_dim
        self.epochs = epochs
        self.batch_size = batch_size
        self.optimizer = optimizer
        self.loss = loss
        # NOTE(review): stored but not read by any method of this class.
        self.kwargs = kwargs

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> 'AutoencoderReducer':
        """
        Builds and trains the autoencoder on ``X``.

        Args:
            X (pd.DataFrame): Input DataFrame.
            y (pd.Series, optional): Target variable (not used).

        Returns:
            self
        """
        self.columns_ = X.columns
        n_features = X.shape[1]

        # Network: input -> bottleneck (ReLU) -> reconstruction (sigmoid).
        inputs = Input(shape=(n_features,))
        bottleneck = Dense(self.encoding_dim, activation='relu')(inputs)
        reconstruction = Dense(n_features, activation='sigmoid')(bottleneck)

        self.autoencoder = Model(inputs, reconstruction)
        self.autoencoder.compile(optimizer=self.optimizer, loss=self.loss)  # type: ignore

        # The network is trained to reproduce its input, so the same array
        # serves as both features and targets.
        data = X.values
        self.autoencoder.fit(  # type: ignore
            data,
            data,
            epochs=self.epochs,
            batch_size=self.batch_size,
            shuffle=True,
            verbose=0,
        )

        # The encoder reuses the trained input and bottleneck layers.
        self.encoder = Model(inputs=inputs, outputs=bottleneck)
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Encodes ``X`` with the trained encoder.

        Args:
            X (pd.DataFrame): Input DataFrame.

        Returns:
            pd.DataFrame: Encoded data, indexed like ``X``.

        Raises:
            NotFittedError: If ``fit`` has not been called yet.
        """
        encoder = self.encoder
        if encoder is None:
            raise NotFittedError("This AutoencoderReducer instance is not fitted yet.")
        codes = encoder.predict(X.values)
        labels = [f'AE{k}' for k in range(1, self.encoding_dim + 1)]
        return pd.DataFrame(codes, columns=labels, index=X.index)

    def fit_transform(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> pd.DataFrame:
        """
        Trains the autoencoder on ``X`` and returns its encoding.

        Args:
            X (pd.DataFrame): Input DataFrame.
            y (pd.Series, optional): Target variable (not used).

        Returns:
            pd.DataFrame: Transformed DataFrame.
        """
        return self.fit(X).transform(X)