Source code for visualize_data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap.umap_ as umap
from typing import List, Optional
from itertools import combinations



[docs]
class DataVisualizer:
    """
    A class for visualizing different aspects of the dataset, including distributions, feature interactions,
    outlier detection, temporal data, dimensionality reduction, and more.

    Methods:
    - plot_distribution: Plot the distribution of specified columns.
    - plot_missing_data: Visualize missing data in the dataframe.
    - plot_correlation_heatmap: Plot a heatmap of correlations between numerical features.
    - plot_swarmplot: Create a swarmplot to visualize data distribution across categories.
    - plot_3d_scatter: Create a 3D scatter plot for three numerical features.
    - plot_pairwise_relationships: Plot pairwise relationships between features.
    - plot_scatter_with_outliers: Plot scatter plot with outliers highlighted.
    - plot_boxplot_with_outliers: Plot boxplots for columns to visualize potential outliers.
    - plot_isolation_forest_outliers: Highlight outliers detected by Isolation Forest.
    - plot_time_series: Plot time series data with optional rolling window.
    - plot_pca: Plot the results of Principal Component Analysis.
    - plot_tsne: Plot the results of t-SNE dimensionality reduction.
    - plot_umap: Plot the results of UMAP dimensionality reduction.
    - plot_clusters: Plot data points color-coded by cluster labels.
    - plot_interactive_histogram: Create an interactive histogram using Plotly.
    - plot_interactive_correlation: Create an interactive correlation heatmap using Plotly.
    - plot_interactive_scatter: Create an interactive scatter plot using Plotly.
    - plot_feature_importance: Plot feature importance from a machine learning model.
    - plot_barplot: Create a barplot for aggregated numerical values across categories.
    - plot_boxplot_categorical: Create a boxplot for numerical distribution across categories.
    - plot_categorical_distribution: Plot the distribution of a categorical feature.
    - plot_categorical_heatmap: Create a heatmap for co-occurrences between two categorical features.
    - plot_target_distribution: Plot the distribution of a target variable.
    - display_basic_data: Display basic data such as the number of unique elements in each column and the number of missing values.
    """

    def __init__(self) -> None:
        """
        Initializes the DataVisualizer class.
        """
        pass

    # 1. General Data Exploration

[docs]
    def plot_distribution(self, df: pd.DataFrame, columns: Optional[List[str]] = None, kind: str = 'histogram') -> None:
        """
        Plot the distribution of specified columns or all possible combinations of columns in the dataframe.

        Args:
            df (pd.DataFrame): Input dataframe.
            columns (List[str], optional): List of column names to plot. If None, all numeric columns are considered.
            kind (str): Type of plot ('histogram', 'kde', or 'box'). Default is 'histogram'.
        """
        if columns is None:
            columns = df.select_dtypes(include=[np.number]).columns.tolist()
        elif isinstance(columns, str):
            columns = [columns]
        for col in columns:
            plt.figure(figsize=(8, 4))
            if kind == 'histogram':
                sns.histplot(df[col].dropna(), kde=True)
            elif kind == 'kde':
                sns.kdeplot(df[col].dropna(), shade=True)
            elif kind == 'box':
                sns.boxplot(x=df[col].dropna())
            else:
                raise ValueError(f"Unsupported kind: {kind}. Use 'histogram', 'kde', or 'box'.")
            plt.title(f'Distribution of {col}')
            plt.xlabel(col)
            plt.ylabel('Frequency')
            plt.show()



[docs]
    def plot_missing_data(self, df: pd.DataFrame) -> None:
        """
        Visualize missing data in the dataframe using a heatmap.

        Args:
            df (pd.DataFrame): Input dataframe.
        """
        plt.figure(figsize=(10, 6))
        sns.heatmap(df.isnull(), cbar=False, cmap='viridis')
        plt.title("Missing Data Heatmap")
        plt.xlabel("Columns")
        plt.ylabel("Rows")
        plt.show()



[docs]
    def plot_correlation_heatmap(self, df: pd.DataFrame, method: str = 'pearson') -> None:
        """
        Plot a heatmap of correlations between numerical features in the dataframe.

        Args:
            df (pd.DataFrame): Input dataframe.
            method (str): Correlation method ('pearson', 'spearman', 'kendall'). Default is 'pearson'.
        """
        # Select only numeric columns
        numeric_df = df.select_dtypes(include=[np.number])
        corr_matrix = numeric_df.corr(method=method)
        plt.figure(figsize=(12, 10))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", square=True)
        plt.title(f"Correlation Heatmap ({method.capitalize()})")
        plt.show()



[docs]
    def plot_swarmplot(
        self,
        df: pd.DataFrame,
        x: Optional[List[str]] = None,
        y: Optional[List[str]] = None,
        hue: Optional[str] = None,
        marker_size: int = 5,
        max_unique: int = 10
    ) -> None:
        """
        Create a swarmplot to visualize the distribution of data points across different categories.

        Args:
            df (pd.DataFrame): Input dataframe.
            x (List[str], optional): The categorical feature(s) to plot on the x-axis.
            y (List[str], optional): The numerical feature(s) to plot on the y-axis.
            hue (str, optional): Column name for adding a hue to the plot.
            marker_size (int): Marker size of the plot.
            max_unique (int): Maximum number of unique values to consider a column categorical.
        """
        # Determine categorical and numerical columns
        categorical_cols = [col for col in df.columns if df[col].nunique() <= max_unique]
        numerical_cols = [col for col in df.select_dtypes(include=[np.number]).columns if df[col].nunique() > max_unique]
        if x is None:
            x = categorical_cols
        elif isinstance(x, str):
            x = [x]
        if y is None:
            y = numerical_cols
        elif isinstance(y, str):
            y = [y]
        # Generate all possible combinations of x and y
        combinations_xy = [(xi, yi) for xi in x for yi in y if xi != yi]
        for xi, yi in combinations_xy:
            if xi not in df.columns:
                raise ValueError(f"Column '{xi}' not found in dataframe.")
            if yi not in df.columns:
                raise ValueError(f"Column '{yi}' not found in dataframe.")
            plt.figure(figsize=(10, 6))
            sns.swarmplot(x=xi, y=yi, hue=hue, data=df, size=marker_size)
            plt.title(f'Swarmplot of {yi} by {xi}')
            plt.xlabel(xi)
            plt.ylabel(yi)
            plt.show()



[docs]
    def plot_3d_scatter(self, df: pd.DataFrame, x: Optional[List[str]] = None, y: Optional[List[str]] = None, z: Optional[List[str]] = None, color: Optional[str] = None) -> None:
        """
        Create a 3D scatter plot for visualizing relationships between three numerical features.

        Args:
            df (pd.DataFrame): Input dataframe.
            x (List[str], optional): X-axis column(s).
            y (List[str], optional): Y-axis column(s).
            z (List[str], optional): Z-axis column(s).
            color (str, optional): Column for coloring the points.
        """
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        if x is None:
            x = numeric_cols
        elif isinstance(x, str):
            x = [x]
        if y is None:
            y = numeric_cols
        elif isinstance(y, str):
            y = [y]
        if z is None:
            z = numeric_cols
        elif isinstance(z, str):
            z = [z]
        # Generate all possible combinations of x, y, z
        combinations_xyz = [(xi, yi, zi) for xi in x for yi in y for zi in z if xi != yi and yi != zi and xi != zi]
        for xi, yi, zi in combinations_xyz:
            for col in [xi, yi, zi]:
                if col not in df.columns:
                    raise ValueError(f"Column '{col}' not found in dataframe.")
            fig = px.scatter_3d(df, x=xi, y=yi, z=zi, color=color, title=f"3D Scatter Plot of {xi}, {yi}, {zi}")
            fig.show()


    # 2. Feature Interactions

[docs]
    def plot_pairwise_relationships(self, df: pd.DataFrame, columns: Optional[List[str]] = None) -> None:
        """
        Plot pairwise relationships between features in the dataframe.

        Args:
            df (pd.DataFrame): Input dataframe.
            columns (List[str], optional): List of column names to plot pairwise relationships.
        """
        if columns is None:
            columns = df.select_dtypes(include=[np.number]).columns.tolist()
        sns.pairplot(df[columns].dropna(), diag_kind="kde")
        plt.show()



[docs]
    def plot_scatter_with_outliers(self, df: pd.DataFrame, outliers: pd.Series, x: Optional[List[str]] = None, y: Optional[List[str]] = None) -> None:
        """
        Plot scatter plots with outliers highlighted.

        Args:
            df (pd.DataFrame): Input dataframe.
            x (List[str], optional): X-axis column(s).
            y (List[str], optional): Y-axis column(s).
            outliers (pd.Series): Boolean series indicating outliers.
        """
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        if x is None:
            x = numeric_cols
        elif isinstance(x, str):
            x = [x]
        if y is None:
            y = numeric_cols
        elif isinstance(y, str):
            y = [y]
        combinations_xy = [(xi, yi) for xi in x for yi in y if xi != yi]
        for xi, yi in combinations_xy:
            if xi not in df.columns:
                raise ValueError(f"Column '{xi}' not found in dataframe.")
            if yi not in df.columns:
                raise ValueError(f"Column '{yi}' not found in dataframe.")
            if not isinstance(outliers, pd.Series):
                raise ValueError("outliers must be a pandas Series.")
            if len(df) != len(outliers):
                raise ValueError("Length of outliers Series must match length of dataframe.")
            plt.figure(figsize=(8, 6))
            plt.scatter(df[xi], df[yi], c=outliers.map({True: 'red', False: 'blue'}), edgecolor='k', alpha=0.7)
            plt.xlabel(xi)
            plt.ylabel(yi)
            plt.title(f'Scatter plot of {xi} vs {yi} with Outliers Highlighted')
            plt.show()


    # 3. Outlier Detection Visualization

[docs]
    def plot_boxplot_with_outliers(self, df: pd.DataFrame, columns: Optional[List[str]] = None) -> None:
        """
        Plot boxplots for columns to visualize potential outliers.

        Args:
            df (pd.DataFrame): Input dataframe.
            columns (List[str], optional): List of column names to plot.
        """
        if columns is None:
            columns = df.select_dtypes(include=[np.number]).columns.tolist()
        plt.figure(figsize=(12, 6))
        df[columns].boxplot()
        plt.title("Box Plot for Outlier Detection")
        plt.ylabel("Value")
        plt.show()



[docs]
    def plot_isolation_forest_outliers(self, df: pd.DataFrame, outliers: pd.Series) -> None:
        """
        Highlight outliers detected by Isolation Forest in a scatter plot.

        Args:
            df (pd.DataFrame): Input dataframe (should have at least two columns).
            outliers (pd.Series): Boolean series indicating outliers.
        """
        if df.select_dtypes(include=[np.number]).shape[1] < 2:
            raise ValueError("Dataframe must have at least two numeric columns.")
        if not isinstance(outliers, pd.Series):
            raise ValueError("outliers must be a pandas Series.")
        if len(df) != len(outliers):
            raise ValueError("Length of outliers Series must match length of dataframe.")
        numeric_df = df.select_dtypes(include=[np.number])
        x_col, y_col = numeric_df.columns[:2]
        fig = px.scatter(df, x=x_col, y=y_col, color=outliers.map({True: 'Outlier', False: 'Inlier'}))
        fig.update_layout(title='Isolation Forest Outliers', xaxis_title=x_col, yaxis_title=y_col)
        fig.show()


    # 4. Temporal Data Visualization

[docs]
    def plot_time_series(self, df: pd.DataFrame, date_col: Optional[str] = None, value_cols: Optional[List[str]] = None, rolling_window: Optional[int] = None) -> None:
        """
        Plot time series data with an optional rolling window.

        Args:
            df (pd.DataFrame): Input dataframe.
            date_col (str, optional): Name of the datetime column. If None, uses the first datetime column.
            value_cols (List[str], optional): Names of the value columns to plot.
            rolling_window (int, optional): Optional rolling window size.
        """
        if date_col is None:
            date_cols = df.select_dtypes(include=['datetime', 'datetime64[ns]']).columns.tolist()
            if not date_cols:
                raise ValueError("No datetime column found in dataframe.")
            date_col = date_cols[0]
        else:
            if date_col not in df.columns:
                raise ValueError(f"Column '{date_col}' not found in dataframe.")
            if not pd.api.types.is_datetime64_any_dtype(df[date_col]):
                df[date_col] = pd.to_datetime(df[date_col])
        if value_cols is None:
            value_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        elif isinstance(value_cols, str):
            value_cols = [value_cols]
        for value_col in value_cols:
            if value_col not in df.columns:
                raise ValueError(f"Column '{value_col}' not found in dataframe.")
            plt.figure(figsize=(12, 6))
            plt.plot(df[date_col], df[value_col], label='Original Data')
            if rolling_window:
                plt.plot(df[date_col], df[value_col].rolling(window=rolling_window).mean(), label=f'Rolling Mean ({rolling_window})')
            plt.xlabel('Date')
            plt.ylabel(value_col)
            plt.title(f'Time Series of {value_col}')
            plt.legend()
            plt.show()


    # 5. Dimensionality Reduction Visualization

[docs]
    def plot_pca(self, df: pd.DataFrame, columns: Optional[List[str]] = None, n_components: int = 2, color: Optional[str] = None) -> None:
        """
        Plot the results of Principal Component Analysis (PCA).

        Args:
            df (pd.DataFrame): Input dataframe.
            columns (List[str], optional): List of columns to use for PCA. If None, all numeric columns are used.
            n_components (int): Number of components to reduce to. Default is 2.
            color (str, optional): Column name to use for coloring the points.
        """
        if columns is None:
            columns = df.select_dtypes(include=[np.number]).columns.tolist()
        numeric_df = df[columns].select_dtypes(include=[np.number])
        if n_components < 1 or n_components > numeric_df.shape[1]:
            raise ValueError(f"n_components must be between 1 and {numeric_df.shape[1]}")
        pca = PCA(n_components=n_components)
        pca_result = pca.fit_transform(numeric_df)
        pca_df = pd.DataFrame(pca_result, columns=[f'PC{i+1}' for i in range(n_components)])
        if color and color in df.columns:
            pca_df[color] = df[color].values
        plt.figure(figsize=(8, 6))
        if n_components == 2:
            sns.scatterplot(x='PC1', y='PC2', hue=color, data=pca_df)
            plt.title('PCA Result')
            plt.xlabel('PC1')
            plt.ylabel('PC2')
            plt.legend()
            plt.show()
        elif n_components == 3:
            fig = px.scatter_3d(pca_df, x='PC1', y='PC2', z='PC3', color=color, title='PCA Result')
            fig.show()
        else:
            raise ValueError("n_components must be 2 or 3 for plotting.")



[docs]
    def plot_tsne(self, df: pd.DataFrame, n_components: int = 2, perplexity: int = 30, color: Optional[str] = None) -> None:
        """
        Plot the results of t-SNE dimensionality reduction.

        Args:
            df (pd.DataFrame): Input dataframe.
            n_components (int): Number of components to reduce to. Default is 2.
            perplexity (int): Perplexity parameter for t-SNE. Default is 30.
            color (str, optional): Column name to use for coloring the points.
        """
        # Select only numeric columns
        numeric_df = df.select_dtypes(include=[np.number])
        if n_components != 2:
            raise ValueError("n_components must be 2 for t-SNE plotting.")
        tsne = TSNE(n_components=n_components, perplexity=perplexity)
        tsne_result = tsne.fit_transform(numeric_df)
        tsne_df = pd.DataFrame(tsne_result, columns=['Component 1', 'Component 2'])
        if color and color in df.columns:
            tsne_df[color] = df[color].values
        plt.figure(figsize=(8, 6))
        sns.scatterplot(x='Component 1', y='Component 2', hue=color, data=tsne_df)
        plt.title('t-SNE Result')
        plt.xlabel('Component 1')
        plt.ylabel('Component 2')
        plt.legend()
        plt.show()



[docs]
    def plot_umap(self, df: pd.DataFrame, n_components: int = 2, n_neighbors: int = 15, min_dist: float = 0.1, color: Optional[str] = None) -> None:
        """
        Plot the results of UMAP dimensionality reduction.

        Args:
            df (pd.DataFrame): Input dataframe.
            n_components (int): Number of components to reduce to. Default is 2.
            n_neighbors (int): The size of the local neighborhood.
            min_dist (float): Minimum distance between points in the low-dimensional space.
            color (str, optional): Column name to use for coloring the points.
        """
        # Select only numeric columns
        numeric_df = df.select_dtypes(include=[np.number])
        if n_components != 2:
            raise ValueError("n_components must be 2 for UMAP plotting.")
        # UMAP model
        umap_model = umap.UMAP(n_components=n_components, n_neighbors=n_neighbors, min_dist=min_dist)
        umap_result = umap_model.fit_transform(numeric_df)
        umap_df = pd.DataFrame(umap_result, columns=['UMAP1', 'UMAP2'])
        if color and color in df.columns:
            umap_df[color] = df[color].values
        plt.figure(figsize=(8, 6))
        sns.scatterplot(x='UMAP1', y='UMAP2', hue=color, data=umap_df)
        plt.title('UMAP Result')
        plt.xlabel('UMAP1')
        plt.ylabel('UMAP2')
        plt.legend()
        plt.show()



[docs]
    def plot_clusters(self, df: pd.DataFrame, cluster_labels: pd.Series, method: str = 'pca', n_components: int = 2) -> None:
        """
        Plot data points color-coded by cluster labels using dimensionality reduction.

        Args:
            df (pd.DataFrame): The input dataframe containing the features.
            cluster_labels (pd.Series): The cluster labels for each data point.
            method (str): The dimensionality reduction method ('pca', 'umap', 'tsne', or 'identity'). Default is 'pca'.
            n_components (int): Number of dimensions to reduce to. Default is 2.
        """
        if method == 'pca':
            reducer = PCA(n_components=n_components)
        elif method == 'umap':
            reducer = umap.UMAP(n_components=n_components)
        elif method == 'tsne':
            reducer = TSNE(n_components=n_components)
        elif method == 'identity':
            reducer = None
        else:
            raise ValueError(f"Unsupported dimensionality reduction method: {method}")

        numeric_df = df.select_dtypes(include=[np.number])
        if reducer:
            reduced_data = reducer.fit_transform(numeric_df)
        else:
            reduced_data = numeric_df.values[:, :n_components]

        plot_df = pd.DataFrame(reduced_data, columns=[f'Dim{i+1}' for i in range(n_components)])
        plot_df['Cluster'] = cluster_labels.values

        plt.figure(figsize=(10, 6))
        sns.scatterplot(x='Dim1', y='Dim2', hue='Cluster', palette='tab20', data=plot_df, s=50, edgecolor='k')
        plt.title(f'Clusters Visualized using {method.upper()}')
        plt.xlabel(f'{method.upper()} 1')
        plt.ylabel(f'{method.upper()} 2')
        plt.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.show()


    # 6. Interactive Visualizations using Plotly

[docs]
    def plot_interactive_histogram(self, df: pd.DataFrame, columns: Optional[List[str]] = None) -> None:
        """
        Create interactive histograms for specified columns.

        Args:
            df (pd.DataFrame): Input dataframe.
            columns (List[str], optional): List of columns to visualize.
        """
        if columns is None:
            columns = df.columns.tolist()
        elif isinstance(columns, str):
            columns = [columns]
        for col in columns:
            if col not in df.columns:
                raise ValueError(f"Column '{col}' not found in dataframe.")
            fig = px.histogram(df, x=col, nbins=50, title=f'Interactive Histogram of {col}')
            fig.show()



[docs]
    def plot_interactive_correlation(self, df: pd.DataFrame) -> None:
        """
        Create an interactive correlation heatmap using Plotly.

        Args:
            df (pd.DataFrame): Input dataframe.
        """
        # Select only numeric columns
        numeric_df = df.select_dtypes(include=[np.number])
        corr_matrix = numeric_df.corr()
        fig = go.Figure(data=go.Heatmap(
            z=corr_matrix.values,
            x=corr_matrix.columns,
            y=corr_matrix.index,
            colorscale='Viridis',
            zmin=-1,
            zmax=1))
        fig.update_layout(title="Interactive Correlation Heatmap", xaxis_nticks=36)
        fig.show()


    # 7. Interactive Scatter Plots

[docs]
    def plot_interactive_scatter(
        self,
        df: pd.DataFrame,
        x: Optional[List[str]] = None,
        y: Optional[List[str]] = None,
        color: Optional[str] = None,
        size: Optional[str] = None,
        max_unique: int = 10
    ) -> None:
        """
        Create interactive scatter plots for all possible combinations of x and y columns.

        Args:
            df (pd.DataFrame): Input dataframe.
            x (List[str], optional): X-axis column(s).
            y (List[str], optional): Y-axis column(s).
            color (str, optional): Column for color encoding.
            size (str, optional): Column for size encoding.
            max_unique (int): Maximum number of unique values to consider a column categorical.
        """
        numeric_cols = [col for col in df.select_dtypes(include=[np.number]).columns if df[col].nunique() > max_unique]
        if x is None:
            x = numeric_cols
        elif isinstance(x, str):
            x = [x]
        if y is None:
            y = numeric_cols
        elif isinstance(y, str):
            y = [y]
        combinations_xy = [(xi, yi) for xi in x for yi in y if xi != yi]
        for xi, yi in combinations_xy:
            for col in [xi, yi]:
                if col not in df.columns:
                    raise ValueError(f"Column '{col}' not found in dataframe.")
            fig = px.scatter(df, x=xi, y=yi, color=color, size=size, title=f'Interactive Scatter Plot of {xi} vs {yi}')
            fig.show()


    # 8. Feature Importance Visualization

[docs]
    def plot_feature_importance(self, feature_importances: np.ndarray, feature_names: List[str]) -> None:
        """
        Plot feature importance from a machine learning model.

        Args:
            feature_importances (np.ndarray): Array of feature importance values.
            feature_names (List[str]): List of feature names.
        """
        if len(feature_importances) != len(feature_names):
            raise ValueError("Length of feature_importances and feature_names must match.")
        indices = np.argsort(feature_importances)[::-1]
        plt.figure(figsize=(10, 6))
        plt.title("Feature Importance")
        plt.bar(range(len(feature_importances)), feature_importances[indices], align='center')
        plt.xticks(range(len(feature_importances)), [feature_names[i] for i in indices], rotation=90)
        plt.tight_layout()
        plt.show()


    # 9. Categorical Data Visualization

[docs]
    def plot_barplot(self, df: pd.DataFrame, x: Optional[List[str]] = None, y: Optional[List[str]] = None, hue: Optional[str] = None) -> None:
        """
        Create barplots for visualizing the aggregated values of numerical features across categories.

        Args:
            df (pd.DataFrame): Input dataframe.
            x (List[str], optional): The categorical feature(s) to plot on the x-axis.
            y (List[str], optional): The numerical feature(s) to aggregate and plot on the y-axis.
            hue (str, optional): Column name for adding a hue to the plot.
        """
        if x is None:
            x = [col for col in df.columns if df[col].nunique() <= 10]
        elif isinstance(x, str):
            x = [x]
        if y is None:
            y = df.select_dtypes(include=[np.number]).columns.tolist()
        elif isinstance(y, str):
            y = [y]
        combinations_xy = [(xi, yi) for xi in x for yi in y if xi != yi]
        for xi, yi in combinations_xy:
            plt.figure(figsize=(10, 6))
            sns.barplot(x=xi, y=yi, hue=hue, data=df, errorbar=None)
            plt.title(f'Barplot of {yi} by {xi}')
            plt.xlabel(xi)
            plt.ylabel(yi)
            plt.show()



[docs]
    def plot_boxplot_categorical(self, df: pd.DataFrame, x: Optional[List[str]] = None, y: Optional[List[str]] = None, hue: Optional[str] = None, max_unique: int = 10) -> None:
        """
        Create boxplots to visualize the distribution of numerical features across different categories.

        Args:
            df (pd.DataFrame): Input dataframe.
            x (List[str], optional): The categorical feature(s) to plot on the x-axis.
            y (List[str], optional): The numerical feature(s) to plot on the y-axis. If None, only columns with more than 'max_unique' unique elements are considered.
            hue (str, optional): Column name for adding a hue to the plot.
            max_unique (int): Maximum number of unique values to consider a column categorical.
        """
        # Select x columns (categorical)
        if x is None:
            x = [col for col in df.columns if df[col].nunique() <= max_unique]
        elif isinstance(x, str):
            x = [x]

        # Select y columns (numerical with more than max_unique unique values)
        if y is None:
            y = [col for col in df.select_dtypes(include=[np.number]).columns if df[col].nunique() > max_unique]
        elif isinstance(y, str):
            y = [y]

        # Create all combinations of x and y
        combinations_xy = [(xi, yi) for xi in x for yi in y if xi != yi]

        # Plot boxplots
        for xi, yi in combinations_xy:
            plt.figure(figsize=(10, 6))
            sns.boxplot(x=xi, y=yi, hue=hue, data=df)
            plt.title(f'Boxplot of {yi} by {xi}')
            plt.xlabel(xi)
            plt.ylabel(yi)
            plt.show()



[docs]
    def plot_categorical_distribution(self, df: pd.DataFrame, columns: Optional[List[str]] = None, hue: Optional[str] = None, max_unique: int = 10) -> None:
        """
        Plot the distribution of categorical features.

        Args:
            df (pd.DataFrame): Input dataframe.
            columns (List[str], optional): Names of the categorical columns.
            hue (str, optional): Column name for adding a hue to the plot.
            max_unique (int): Maximum number of unique values to consider a column categorical.
        """
        if columns is None:
            columns = [col for col in df.columns if df[col].nunique() <= max_unique]
        elif isinstance(columns, str):
            columns = [columns]
        for col in columns:
            plt.figure(figsize=(8, 6))
            sns.countplot(x=col, hue=hue, data=df)
            plt.title(f'Distribution of {col}')
            plt.xlabel(col)
            plt.ylabel('Count')
            plt.xticks(rotation=45)
            plt.show()



[docs]
    def plot_categorical_heatmap(self, df: pd.DataFrame, cols: Optional[List[str]] = None, max_unique: int = 10) -> None:
        """
        Create heatmaps for visualizing the frequency of co-occurrences between categorical features.

        Args:
            df (pd.DataFrame): Input dataframe.
            cols (List[str], optional): List of categorical columns.
            max_unique (int): Maximum number of unique values to consider a column categorical.
        """
        if cols is None:
            cols = [col for col in df.columns if df[col].nunique() <= max_unique]
        elif isinstance(cols, str):
            cols = [cols]
        # Generate all possible combinations of two columns
        combinations_cols = combinations(cols, 2)
        for c1, c2 in combinations_cols:
            crosstab = pd.crosstab(df[c1], df[c2])
            plt.figure(figsize=(10, 6))
            sns.heatmap(crosstab, annot=True, fmt='d', cmap='Blues')
            plt.title(f'Heatmap of {c1} vs {c2}')
            plt.xlabel(c2)
            plt.ylabel(c1)
            plt.show()


    # 10. Plot Target Distribution

[docs]
    def plot_target_distribution(self, df: pd.DataFrame, target_columns: Optional[List[str]] = None) -> None:
        """
        Plot the distribution of target variable(s).

        Args:
            df (pd.DataFrame): Input dataframe.
            target_columns (List[str], optional): Names of the target columns.
        """
        if target_columns is None:
            target_columns = [df.columns[-1]]  # Assume the last column is target
        elif isinstance(target_columns, str):
            target_columns = [target_columns]
        for target_column in target_columns:
            if target_column not in df.columns:
                raise ValueError(f"Column '{target_column}' not found in dataframe.")
            plt.figure(figsize=(8, 6))
            if df[target_column].dtype == 'object' or df[target_column].dtype.name == 'category':
                sns.countplot(x=target_column, data=df)
                plt.ylabel('Count')
            else:
                sns.histplot(df[target_column], kde=True)
                plt.ylabel('Frequency')
            plt.title(f'Target Distribution: {target_column}')
            plt.xlabel(target_column)
            plt.show()



[docs]
    def display_basic_data(self, df: pd.DataFrame) -> None:
        """
        Display basic data such as the number of unique elements in each column and the number of missing values.

        Args:
            df (pd.DataFrame): Input dataframe.
        """
        summary_df = pd.DataFrame({
            'Unique Values': df.nunique(),
            'Missing Values': df.isnull().sum(),
            'Data Type': df.dtypes
        })
        summary_df = summary_df.reset_index().rename(columns={'index': 'Column'})
        print(summary_df)