# Source code for operate_dataframe

import pandas as pd
from typing import List, Tuple, Union, Callable, Dict, Optional, Any


class DataFrameOperator:
    """
    A collection of stateless helpers for common DataFrame manipulation:
    merging, concatenation, splitting, filtering, filling, sorting,
    sampling and pivoting.

    Every operation is a ``@staticmethod``; the class only serves as a
    namespace and holds no state.
    """

    @staticmethod
    def merge_dataframes(
        df1: pd.DataFrame,
        df2: pd.DataFrame,
        on: Optional[Union[str, List[str]]] = None,
        how: str = 'inner',
        left_on: Optional[Union[str, List[str]]] = None,
        right_on: Optional[Union[str, List[str]]] = None,
        left_index: bool = False,
        right_index: bool = False,
        suffixes: Tuple[str, str] = ('_x', '_y'),
        indicator: bool = False,
        validate: Optional[str] = None
    ) -> pd.DataFrame:
        """
        Merge two DataFrames using database-style joins.

        All arguments are forwarded unchanged to ``pd.merge``.

        Args:
            df1 (pd.DataFrame): The first (left) DataFrame.
            df2 (pd.DataFrame): The second (right) DataFrame.
            on (Union[str, List[str], None], optional): Column or index level
                names to join on.
            how (str, optional): Type of merge ('left', 'right', 'outer',
                'inner', 'cross'). Defaults to 'inner'.
            left_on (Union[str, List[str], None], optional): Column(s) from
                the left DataFrame to use as keys.
            right_on (Union[str, List[str], None], optional): Column(s) from
                the right DataFrame to use as keys.
            left_index (bool, optional): Use the left index as join key.
                Defaults to False.
            right_index (bool, optional): Use the right index as join key.
                Defaults to False.
            suffixes (Tuple[str, str], optional): Suffixes applied to
                overlapping column names. Defaults to ('_x', '_y').
            indicator (bool, optional): Add a '_merge' column with merge
                information. Defaults to False.
            validate (str, optional): Check that the merge is of the
                specified type. Defaults to None.

        Returns:
            pd.DataFrame: The merged DataFrame.
        """
        return pd.merge(
            df1,
            df2,
            how=how,
            on=on,
            left_on=left_on,
            right_on=right_on,
            left_index=left_index,
            right_index=right_index,
            suffixes=suffixes,
            indicator=indicator,
            validate=validate,
        )

    @staticmethod
    def concat_dataframes(
        dfs: List[pd.DataFrame],
        axis: int = 0,
        join: str = 'outer',
        ignore_index: bool = False,
        keys: Optional[List] = None,
        levels: Optional[List] = None,
        names: Optional[List[str]] = None,
        verify_integrity: bool = False,
        sort: bool = False,
        copy: bool = True
    ) -> pd.DataFrame:
        """
        Concatenate pandas objects along a particular axis.

        Args:
            dfs (List[pd.DataFrame]): DataFrames to concatenate.
            axis (int, optional): Axis to concatenate along (0 for index,
                1 for columns). Defaults to 0.
            join (str, optional): How to handle indexes on the other axis
                ('inner', 'outer'). Defaults to 'outer'.
            ignore_index (bool, optional): If True, do not use the index
                values along the concatenation axis. Defaults to False.
            keys (List, optional): Keys used to build a hierarchical index.
            levels (List, optional): Specific levels for the hierarchical
                index.
            names (List[str], optional): Names for the levels of the
                resulting hierarchical index.
            verify_integrity (bool, optional): Check whether the new
                concatenated axis contains duplicates. Defaults to False.
            sort (bool, optional): Sort the non-concatenation axis if it is
                not aligned. Defaults to False.
            copy (bool, optional): If False, ask pandas not to copy data
                unnecessarily. Defaults to True.

        Returns:
            pd.DataFrame: The concatenated DataFrame.
        """
        # Only forward ``copy`` when the caller opted out of copying: the
        # default (True) matches pandas' own default, and recent pandas
        # releases deprecate the keyword, so passing it unconditionally
        # would warn on every call.
        extra_kwargs: Dict[str, Any] = {}
        if not copy:
            extra_kwargs['copy'] = False

        return pd.concat(
            dfs,
            axis=axis,
            join=join,
            ignore_index=ignore_index,
            keys=keys,
            levels=levels,
            names=names,
            verify_integrity=verify_integrity,
            sort=sort,
            **extra_kwargs,
        )

    @staticmethod
    def split_dataframe(
        df: pd.DataFrame,
        columns: List[str]
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Split a DataFrame into two DataFrames based on specified columns.

        Args:
            df (pd.DataFrame): The input DataFrame.
            columns (List[str]): Column names to separate out.

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]: A tuple containing:
                - a copy of the DataFrame restricted to ``columns``;
                - the DataFrame without ``columns``.

        Raises:
            ValueError: If any of ``columns`` is not present in ``df``.
        """
        missing_cols = set(columns) - set(df.columns)
        if missing_cols:
            raise ValueError(f"Columns {missing_cols} not found in DataFrame.")

        df_selected = df[columns].copy()
        df_remaining = df.drop(columns=columns)
        return df_selected, df_remaining

    @staticmethod
    def drop_columns(
        df: pd.DataFrame,
        columns: List[Union[str, int]]
    ) -> pd.DataFrame:
        """
        Drop specified columns from the DataFrame by name or index position.

        Integers are always interpreted as positions into ``df.columns``
        (negative values count from the end); strings are interpreted as
        column labels.

        Args:
            df (pd.DataFrame): The input DataFrame.
            columns (List[Union[str, int]]): Column names or index positions
                to drop.

        Returns:
            pd.DataFrame: A DataFrame with the specified columns dropped.

        Raises:
            IndexError: If a positional index is out of bounds.
            ValueError: If a column name is not present in ``df``.
            TypeError: If an entry is neither ``int`` nor ``str``.
        """
        columns_to_drop = []
        for col in columns:
            if isinstance(col, int):
                try:
                    columns_to_drop.append(df.columns[col])
                except IndexError as err:
                    # Keep the original cause attached for easier debugging.
                    raise IndexError(
                        f"Column index {col} is out of bounds."
                    ) from err
            elif isinstance(col, str):
                if col not in df.columns:
                    raise ValueError(f"Column '{col}' not found in DataFrame.")
                columns_to_drop.append(col)
            else:
                raise TypeError(
                    "Columns must be a list of column names or index positions."
                )
        return df.drop(columns=columns_to_drop)

    @staticmethod
    def groupby(
        df: pd.DataFrame,
        by: Union[str, List[str]],
        agg_funcs: Union[str, List[str], Dict[str, Union[str, List[str]]]]
    ) -> pd.DataFrame:
        """
        Perform a group-by operation and apply aggregation functions.

        Args:
            df (pd.DataFrame): The input DataFrame.
            by (Union[str, List[str]]): Column(s) to group by.
            agg_funcs (Union[str, List[str], Dict[str, Union[str, List[str]]]]):
                Aggregation function(s) passed to ``.agg``.

        Returns:
            pd.DataFrame: The aggregated data, with the group keys moved
            back into regular columns via ``reset_index``.
        """
        return df.groupby(by).agg(agg_funcs).reset_index()

    @staticmethod
    def apply_function(
        df: pd.DataFrame,
        columns: List[str],
        func: Callable,
        element_wise: bool = True
    ) -> pd.DataFrame:
        """
        Apply a custom function to specified columns.

        Args:
            df (pd.DataFrame): The input DataFrame (left unmodified).
            columns (List[str]): Column names to apply the function to.
            func (Callable): The function to apply.
            element_wise (bool, optional): If True, apply ``func`` to every
                element. If False, apply it column-wise. Defaults to True.

        Returns:
            pd.DataFrame: A copy of ``df`` with the function applied to the
            specified columns.

        Raises:
            ValueError: If any of ``columns`` is not present in ``df``.
        """
        missing_cols = set(columns) - set(df.columns)
        if missing_cols:
            raise ValueError(f"Columns {missing_cols} not found in DataFrame.")

        df_copy = df.copy()
        subset = df_copy[columns]
        if element_wise:
            # ``DataFrame.applymap`` was renamed to ``DataFrame.map``; prefer
            # the new name and fall back on older pandas. The lookup is done
            # on the class so a column literally named "map" cannot shadow it.
            element_mapper = getattr(pd.DataFrame, 'map', None)
            if element_mapper is None:
                element_mapper = pd.DataFrame.applymap
            df_copy[columns] = element_mapper(subset, func)
        else:
            df_copy[columns] = subset.apply(func)
        return df_copy

    @staticmethod
    def filter_rows(
        df: pd.DataFrame,
        condition: str
    ) -> pd.DataFrame:
        """
        Filter rows in the DataFrame based on a given condition.

        Note:
            ``condition`` is handed to ``DataFrame.query``, which evaluates
            it as an expression. Do not pass strings built from untrusted
            input.

        Args:
            df (pd.DataFrame): The input DataFrame.
            condition (str): The condition to filter rows by
                (e.g., "age > 30").

        Returns:
            pd.DataFrame: A new DataFrame with the matching rows.
        """
        return df.query(condition)

    @staticmethod
    def fill_missing(
        df: pd.DataFrame,
        value: Optional[Union[float, Dict[str, Union[float, str]]]] = 0,
        columns: Optional[List[str]] = None,
        method: Optional[str] = None,
        axis: Optional[int] = None,
        limit: Optional[int] = None
    ) -> pd.DataFrame:
        """
        Fill missing values in the DataFrame.

        When ``method`` is given, a forward/backward fill is performed and
        ``value`` must be left at its default (or ``None``). Otherwise the
        holes are filled with ``value``.

        Args:
            df (pd.DataFrame): The input DataFrame (left unmodified).
            value (Union[float, Dict[str, Union[float, str]]], optional):
                Value to use for filling holes. Defaults to 0.
            columns (List[str], optional): Restrict filling to these columns.
            method (str, optional): Fill method ('backfill', 'bfill', 'pad',
                'ffill', None).
            axis (int, optional): Axis along which to fill missing values.
            limit (int, optional): Maximum number of consecutive NaNs to fill.

        Returns:
            pd.DataFrame: A copy of ``df`` with missing values filled.

        Raises:
            ValueError: If any of ``columns`` is missing from ``df``, if
                ``method`` is not a recognised fill method, or if both
                ``method`` and a non-default ``value`` are supplied.
        """
        if columns:
            missing_cols = set(columns) - set(df.columns)
            if missing_cols:
                raise ValueError(
                    f"Columns {missing_cols} not found in DataFrame."
                )

        forward_fill = None
        if method is not None:
            if method in ('ffill', 'pad'):
                forward_fill = True
            elif method in ('bfill', 'backfill'):
                forward_fill = False
            else:
                raise ValueError(
                    f"Invalid fill method '{method}'. Expected one of "
                    "'backfill', 'bfill', 'pad', 'ffill'."
                )
            # ``value`` defaults to 0, so a plain ``method='ffill'`` call must
            # not be rejected; only an explicitly different value conflicts.
            value_is_default = isinstance(value, (int, float)) and value == 0
            if value is not None and not value_is_default:
                raise ValueError("Cannot specify both 'value' and 'method'.")

        df_copy = df.copy()
        target = df_copy[columns] if columns else df_copy

        # Use the dedicated ffill/bfill methods instead of the deprecated
        # ``fillna(method=...)`` keyword.
        if forward_fill is None:
            filled = target.fillna(value=value, axis=axis, limit=limit)
        elif forward_fill:
            filled = target.ffill(axis=axis, limit=limit)
        else:
            filled = target.bfill(axis=axis, limit=limit)

        if columns:
            df_copy[columns] = filled
        else:
            df_copy = filled
        return df_copy

    @staticmethod
    def rename_columns(
        df: pd.DataFrame,
        columns_dict: Dict[str, str]
    ) -> pd.DataFrame:
        """
        Rename columns in the DataFrame based on a given dictionary.

        Args:
            df (pd.DataFrame): The input DataFrame.
            columns_dict (Dict[str, str]): Mapping of old column names to
                new ones.

        Returns:
            pd.DataFrame: A DataFrame with renamed columns.

        Raises:
            ValueError: If any key of ``columns_dict`` is not a column of
                ``df``.
        """
        missing_cols = set(columns_dict.keys()) - set(df.columns)
        if missing_cols:
            raise ValueError(f"Columns {missing_cols} not found in DataFrame.")
        return df.rename(columns=columns_dict)

    @staticmethod
    def change_column_types(
        df: pd.DataFrame,
        columns_types: Dict[str, Union[str, type]]
    ) -> pd.DataFrame:
        """
        Change the data types of specified columns.

        Args:
            df (pd.DataFrame): The input DataFrame.
            columns_types (Dict[str, Union[str, type]]): Mapping of column
                names to target data types.

        Returns:
            pd.DataFrame: A DataFrame with the specified column types changed.

        Raises:
            ValueError: If any key of ``columns_types`` is not a column of
                ``df``.
        """
        missing_cols = set(columns_types.keys()) - set(df.columns)
        if missing_cols:
            raise ValueError(f"Columns {missing_cols} not found in DataFrame.")
        return df.astype(columns_types)

    @staticmethod
    def sort_values(
        df: pd.DataFrame,
        by: Union[str, List[str]],
        ascending: Union[bool, List[bool]] = True,
        inplace: bool = False,
        na_position: str = 'last'
    ) -> pd.DataFrame:
        """
        Sort the DataFrame by specified column(s).

        Args:
            df (pd.DataFrame): The input DataFrame.
            by (Union[str, List[str]]): Column name(s) to sort by.
            ascending (Union[bool, List[bool]], optional): Sort ascending vs.
                descending. Defaults to True.
            inplace (bool, optional): If True, sort ``df`` itself and return
                that same object. Defaults to False.
            na_position (str, optional): 'first' puts NaNs at the beginning,
                'last' puts NaNs at the end. Defaults to 'last'.

        Returns:
            pd.DataFrame: The sorted DataFrame (``df`` itself when
            ``inplace`` is True, otherwise a new object).
        """
        if inplace:
            # pandas returns None for in-place operations; hand back the
            # mutated frame so the documented return type always holds.
            df.sort_values(
                by=by,
                ascending=ascending,
                inplace=True,
                na_position=na_position,
            )
            return df
        return df.sort_values(
            by=by,
            ascending=ascending,
            na_position=na_position,
        )

    @staticmethod
    def split_by_missing_values(
        df: pd.DataFrame
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Split the DataFrame into two DataFrames based on missing values.

        Args:
            df (pd.DataFrame): The input DataFrame.

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]:
                - copy of the columns that contain at least one missing value;
                - copy of the columns that have no missing values.
        """
        # Compute the per-column "has any null" mask once and reuse it.
        has_missing = df.isnull().any()

        df_with_missing = df[df.columns[has_missing]].copy()
        df_without_missing = df[df.columns[~has_missing]].copy()
        return df_with_missing, df_without_missing

    @staticmethod
    def drop_duplicates(
        df: pd.DataFrame,
        subset: Optional[List[str]] = None,
        keep: str = 'first',
        inplace: bool = False,
        ignore_index: bool = False
    ) -> pd.DataFrame:
        """
        Remove duplicate rows from the DataFrame.

        Args:
            df (pd.DataFrame): The input DataFrame.
            subset (List[str], optional): Columns to consider when
                identifying duplicates.
            keep (str, optional): Which duplicate to keep ('first', 'last',
                False). Defaults to 'first'.
            inplace (bool, optional): If True, modify ``df`` itself and
                return that same object. Defaults to False.
            ignore_index (bool, optional): If True, reset the index after
                dropping duplicates. Defaults to False.

        Returns:
            pd.DataFrame: The DataFrame with duplicates removed (``df``
            itself when ``inplace`` is True, otherwise a new object).
        """
        if inplace:
            # pandas returns None for in-place operations; hand back the
            # mutated frame so the documented return type always holds.
            df.drop_duplicates(
                subset=subset,
                keep=keep,
                inplace=True,
                ignore_index=ignore_index,
            )
            return df
        return df.drop_duplicates(
            subset=subset,
            keep=keep,
            ignore_index=ignore_index,
        )

    @staticmethod
    def sample_dataframe(
        df: pd.DataFrame,
        n: Optional[int] = None,
        frac: Optional[float] = None,
        replace: bool = False,
        weights: Optional[Union[str, pd.Series]] = None,
        random_state: Optional[int] = None,
        axis: int = 0
    ) -> pd.DataFrame:
        """
        Return a random sample of items from an axis of the DataFrame.

        All arguments are forwarded unchanged to ``DataFrame.sample``.

        Args:
            df (pd.DataFrame): The input DataFrame.
            n (int, optional): Number of items from the axis to return.
            frac (float, optional): Fraction of axis items to return.
            replace (bool, optional): Sample with or without replacement.
                Defaults to False.
            weights (Union[str, pd.Series], optional): Weights for sampling.
            random_state (int, optional): Seed for the random number
                generator.
            axis (int, optional): Axis to sample. Defaults to 0.

        Returns:
            pd.DataFrame: A random sample of the DataFrame.
        """
        return df.sample(
            n=n,
            frac=frac,
            replace=replace,
            weights=weights,
            random_state=random_state,
            axis=axis,
        )

    @staticmethod
    def pivot_table(
        df: pd.DataFrame,
        values: Optional[Union[str, List[str]]] = None,
        index: Optional[Union[str, List[str]]] = None,
        columns: Optional[Union[str, List[str]]] = None,
        aggfunc: Union[str, List[str], Dict[str, Union[str, List[str]]]] = 'mean',
        fill_value: Optional[Any] = None,
        margins: bool = False,
        dropna: bool = True,
        margins_name: str = 'All',
        observed: bool = False,
        sort: bool = True
    ) -> pd.DataFrame:
        """
        Create a spreadsheet-style pivot table as a DataFrame.

        All arguments are forwarded unchanged to ``pd.pivot_table``.

        Args:
            df (pd.DataFrame): The input DataFrame.
            values (Union[str, List[str]], optional): Column(s) to aggregate.
            index (Union[str, List[str]], optional): Keys to group by on the
                pivot table index.
            columns (Union[str, List[str]], optional): Keys to group by on
                the pivot table columns.
            aggfunc (Union[str, List[str], Dict[str, Union[str, List[str]]]], optional):
                Aggregation function(s). Defaults to 'mean'.
            fill_value (Any, optional): Value to replace missing values with.
            margins (bool, optional): Add all rows/columns (subtotals).
                Defaults to False.
            dropna (bool, optional): Do not include columns whose entries
                are all NaN. Defaults to True.
            margins_name (str, optional): Name of the row/column that will
                contain the totals. Defaults to 'All'.
            observed (bool, optional): Only applies if any of the groupers
                are categoricals. Defaults to False.
            sort (bool, optional): Sort group keys. Defaults to True.

        Returns:
            pd.DataFrame: The pivot table.
        """
        return pd.pivot_table(
            df,
            values=values,
            index=index,
            columns=columns,
            aggfunc=aggfunc,
            fill_value=fill_value,
            margins=margins,
            dropna=dropna,
            margins_name=margins_name,
            observed=observed,
            sort=sort,
        )