Source code for instaeda.instaeda

import numpy as np
import pandas as pd
import altair as alt
from sklearn.impute import SimpleImputer
import warnings


[docs]def plot_intro(df, plot_title='', theme_config='Dimension'):
    """Takes a dataframe with configurations and returns an altair object with summary metrics.

    Parameters
    -----------
    df: pd.DataFrame
        Dataframe from which to take columns not limited to numerical columns only
    plot_title : string, optional
        User can specify the plot title, by default to show the memory usage
    theme_config : list, optional
        A list of color configurations to be passed to theme, by default to use Demension as config

    Returns
    -------
    plot : altair.Chart object
        An altair plot object displaying summary metrics including the memory usage and 
        the basic description of the input data.

    Examples
    -------
    >>> example_df = pd.DataFrame({'animal': ['falcon', 'dog', 'spider', 'fish'],
                                    'num_legs': [2, 4, 8, 0],
                                    'num_wings': [2, 0, 0, 0],
                                    'num_specimen_seen': [10, 2, 1, 8]})
    >>> instaeda_py.plot_intro(example_df)
    """

    # Check basic information for input data
    sum_missing_columns = df.isnull().sum(axis = 0) 
    num_of_all_missing_columns = sum(sum_missing_columns)

    sum_missing_rows = df.isnull().sum(axis = 1)
    num_complete_rows = df.shape[0] - sum(sum_missing_rows)

    # Create info dataframe
    info_df = pd.DataFrame({'rows': df.shape[0], 
                            'columns': df.shape[1],
                            'numeric_columns': len(list(df.select_dtypes(include=[np.number]).columns.values)),
                            'all_missing_columns': num_of_all_missing_columns, 
                            'total_missing_values': df.isnull().sum().sum(),
                            'complete_rows': num_complete_rows,
                            'total_observations': df.shape[0] * df.shape[1],
                            'memory_usage': df.memory_usage(deep=True).sum(),
                       }, index = [0])
    
    # Create the plotting dataframe   
    plot_df = pd.DataFrame({'Metrics': ['Numeric Columns', 'All Missing Columns', 'Missing Observations', 'Complete Rows'], 
                            'Value': [float(info_df['numeric_columns']/info_df['columns']), 
                                      float(info_df['all_missing_columns']/info_df['columns']),
                                      float(info_df['total_missing_values']/info_df['total_observations']),
                                      float(info_df['complete_rows']/info_df['rows'])],
                            'Dimension': ['column', 'column', 'observation', 'row']
                })

    # Create the plot

    ## Check whether the user specifies a plotting title
    if len(plot_title) == 0:
        plot_title = 'Memory Usage: ' + str(float(info_df['memory_usage'])) + 'kb'
        intro_plot = alt.Chart(plot_df, title=plot_title).mark_bar().encode(
            alt.X('Value', axis=alt.Axis(format='%')),
            alt.Y('Metrics'),
            color=alt.Color(theme_config)) 
        
    else:
        intro_plot = alt.Chart(plot_df, title=plot_title).mark_bar().encode(
            alt.X('Value', axis=alt.Axis(format='%')),
            alt.Y('Metrics'),
            color=alt.Color(theme_config)) 

    return intro_plot

[docs]def plot_corr(df, cols=None, method="pearson", colour_palette="purpleorange"):
    """Takes a dataframe, subsets numeric columns and returns a correlation plot object.

    Parameters
    -----------
    df: pd.DataFrame
        Dataframe from which to take columns and calculate, plot correlation between columns.
    cols: list, optional
        List of columns to perform correlation on. By default, None (perform on all numeric).
    method : string, optional
        correlation calculation method, one of: {'pearson', 'kendall', 'spearman'}. By default 'pearson'
    colour_palette : string, optional
        one of Altair accepted colour schemes

    Returns
    -------
    plot : altair.Chart object
        Correlation plot object displaying column names and corresponding correlation values.

    Examples
    -------
    >>> example_df = pd.DataFrame({'animal': ['falcon', 'dog', 'spider', 'fish'],
                                    'num_legs': [2, 4, 8, 0],
                                    'num_wings': [2, 0, 0, 0],
                                    'num_specimen_seen': [10, 2, 1, 8]})
    >>> instaeda_py.plot_corr(example_df)
    """
    
    # check user input
    correlation_methods = {'pearson', 'kendall', 'spearman'}
    colour_palette_list = {'blueorange', 'brownbluegreen', 'purplegreen', 'pinkyellowgreen', 'purpleorange', 'redblue', 'redgrey', 'redyellowblue', 'redyellowgreen', 'spectral'}
    numeric_cols = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    assert isinstance(df, pd.DataFrame), "must pass in pandas DataFrame"
    assert method in correlation_methods, "correlation method not acceptable"
    if colour_palette not in colour_palette_list:
        warnings.warn("Recommended Altair continuous diverging colour palette")
    
    # calculate 
    if cols == None:
        assert(df.select_dtypes(np.number).shape[1] >= 2), "Dataframe does not have enough numeric columns for comparison"
        df = df.select_dtypes(include = numeric_cols)
    else:
        assert(df[cols].select_dtypes(np.number).shape[1] >= 2), "Dataframe does not have enough numeric columns for comparison"
        df = df[cols].select_dtypes(include = numeric_cols)
    corr_df = round(df.corr(method=method),4).stack().reset_index(name='corr').rename(columns={'level_0':'variable_1','level_1':'variable_2'})
    
    # plot base plot
    corr_plot = alt.Chart(corr_df, title='Correlations between variables').mark_rect().encode(
    x = alt.X('variable_1', title=''),
    y = alt.Y('variable_2', title=''),
        color = alt.Color('corr', scale = alt.Scale(scheme=colour_palette, domain=(-1, 1)))
    ).properties(height=400, width=400)
    
    # plot corr values
    text = corr_plot.mark_text().encode(
    text='corr:Q',
    color=alt.value('black')
    )
    
    return corr_plot + text

[docs]def divide_and_fill(
    dataframe,
    cols=None,
    missing_values=np.nan,
    strategy="mean",
    fill_value=None,
    random=False,
    parts=1,
    verbose=0,
):
    """Takes a dataframe, subsets selected columns and divides into parts for imputation of missing values and returns a data frame.

    Parameters
    -----------
    dataframe: pd.DataFrame
        Dataframe from which to take columns and check for missing values.
    cols: list, optional
        List of columns to perform imputation on. By default, None (perform on all numeric columns).
    missing_values: int, float, str, np.nan or None
        The placeholder for the missing values. All occurences of missing values will be imputed.
    strategy : string, optional
        imputation strategy, one of: {'mean', 'median', 'constant', 'most_frequent'}. By default, 'mean'.
    fill_value : string or numerical value, optional
        When strategy == 'constant', fill_value is used to replace all occurences of missing_values.
        If left to default, fill_value will be 0 when filling numerical data and 'missing' for strings or object data types.
    random : boolean, optional
        When random == True, shuffles data frame before filling. By default, False.
    parts : integer, optional
        The number of parts to divide rows of data frame into. By default, 1.
    verbose : integer, optional
        Controls the verbosity of the divide and fill. By default, 0.


    Returns
    -------
    dataframe : pandas.DataFrame object
        Data frame obtained after divide and fill on the corresponding columns.

    Examples
    -------
    >>> import numpy as np
    >>> from instaeda import divide_and_fill
    >>> example_df = pd.DataFrame({'animal': ['falcon', 'dog', 'spider', 'fish'],
                                    'num_legs': [2, 4, 8, np.nan],
                                    'num_wings': [2, np.nan, 0, 0],
                                    'num_specimen_seen': [10, 2, np.nan, np.nan]})
    >>> divide_and_fill(example_df)
    """
    filled_df = None
    allowed_strategies = ["mean", "median", "constant", "most_frequent"]

    # Checking inputs
    if verbose:
        print("Checking inputs")

    if not isinstance(dataframe, pd.DataFrame):
        raise Exception("The input data must be of type pandas.DataFrame!")

    if cols == None:
        cols = list(dataframe.select_dtypes(include="number").columns)

    if (
        not isinstance(cols, list)
        or not all(isinstance(x, str) for x in cols)
        or not set(cols).issubset(set(dataframe.columns))
    ):
        raise Exception(
            "The input cols must be a list of strings belong to the column names for input dataframe!"
        )

    if (
        not isinstance(missing_values, int)
        and not isinstance(missing_values, float)
        and not isinstance(missing_values, str)
        and (missing_values is not None)
    ):
        raise Exception(
            "The input missing values must be one of the following: (int, float, str, np.nan, None)"
        )

    if strategy not in allowed_strategies:
        raise ValueError(
            "Can only use these strategies: {0} got strategy = {1}".format(
                allowed_strategies, strategy
            )
        )

    if (
        (fill_value is not None)
        and not isinstance(fill_value, int)
        and not isinstance(fill_value, float)
        and not isinstance(fill_value, str)
    ):
        raise Exception(
            "The input fill values must be one of the following: (int, float, str, None)"
        )

    if not isinstance(random, bool):
        raise Exception("The input random must be True or False")

    if not isinstance(parts, int) or (parts < 1):
        raise ValueError("Can only use positive integer parts.")

    if not isinstance(verbose, int):
        raise ValueError("Can only use integer for verbose.")

    # Constructing filled dataframe skeleton.
    if verbose:
        print("Constructing filled dataframe skeleton.")

    if random:
        filled_df = dataframe.copy().sample(frac=1).reset_index(drop=True)
    else:
        filled_df = dataframe.copy()

    if (set(cols) <= set(dataframe.select_dtypes(include="number").columns)):
        if isinstance(fill_value, str) :
            raise ValueError(
                "For numeric columns, can only use fill values: (int, float, None)"
            )
    elif (set(cols) <= set(dataframe.select_dtypes(exclude="number").columns)):
        if isinstance(fill_value, int) or isinstance(fill_value, float):
            raise ValueError(
                "For non-numeric columns, can only use fill values: (None, str)"
            )
    else:
        raise Exception("All items in list cols must be numeric, or non-numeric.")

    # Filling data frame
    spacing = filled_df.shape[0]/(parts + 1)
    indexing = np.arange(0, filled_df.shape[0] + spacing, spacing, dtype=int)
    for i in range(len(indexing) - 1):
        imputer = SimpleImputer(
            missing_values=missing_values, strategy=strategy, fill_value=fill_value
        )
        filled_df.loc[indexing[i] : indexing[i + 1], cols] = imputer.fit_transform(
            filled_df.loc[indexing[i] : indexing[i + 1], cols]
        )

    if verbose:
        print("Returning data frame.")
    return filled_df



[docs]def plot_basic_distributions(df, cols=None, include=None, vega_theme="ggplot2"):
    """Takes a dataframe and generates plots based on types

    Parameters
    -----------
    df: pd.DataFrame
        Dataframe from which to generate plots for each column from
    cols: list, optional
        List of columns to generate plots for. By default, None (builds charts for all columns).
    include: string, optional
        Select the data types to include. Supported values include None, "string" and "number". By default, None - it will return both string and number columns.
    vega_theme : string, optional
        Select the vega.themes for the altair plots. The options include: excel, ggplot2, quartz, vox, fivethirtyeight, dark, latimes, urbaninstitute, and googlecharts. By default, it uses ggplot2.

    Returns
    -------
    dict_plots: dict of altair.Chart objects using the column name as the key
        dictionary of generated altair.Chart objects with the column name as the key

    Examples
    -------
    >>> example_df = pd.DataFrame({'animal': ['falcon', 'dog', 'spider', 'fish'],
                                    'num_legs': [2, 4, 8, 0],
                                    'num_wings': [2, 0, 0, 0],
                                    'num_specimen_seen': [10, 2, 1, 8]})
    >>> instaeda_py.plot_distribution(example_df)
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("The df parameter must be a pandas dataframe")


    if vega_theme not in ('excel','ggplot2','quartz','vox','fivethirtyeight', 'dark', 'latimes', 'urbaninstitute', 'googlecharts'):
        warnings.warn("You have selected a theme that is not one of the default Vega color themes.")
    # Set vega theme
    alt.renderers.enable(embed_options={'theme': vega_theme})

    dict_plots = {}
    df_data = None

    # First filter:  select columns
    if cols is None:    
        df_data = df
    else:
        df_data = df[cols]    
    
    if include not in (None, 'number', 'string'):
        raise KeyError("The include parameter must be None, 'number' or 'string'")

    # Second filter: select types to include
    if include == 'number' or include is None:
        
        df_data_number = df_data.select_dtypes(include="number")        
        for col in df_data_number.columns.tolist():
            dict_plots[col] = alt.Chart(df_data_number).mark_bar().encode(
                                alt.X(col, bin=alt.Bin(maxbins=50)), y='count()')
    
    if include == 'string' or include is None:

        df_data_string = df_data.select_dtypes(include="object")
        for col in df_data_string.columns.tolist():            
            dict_plots[col] = alt.Chart(df_data_string).mark_bar().encode(
                                    x=alt.X('count()'),
                                    y=alt.Y(col, sort='-x')
                                )

    if len(dict_plots) == 0:
        warnings.warn("Zero plots were generated. Please ensure you specifiy the correct parameters for cols and include")

    return dict_plots
Source code for instaeda.instaeda

InstaEDA

Navigation