import numpy as np
import pandas as pd
import altair as alt
from sklearn.impute import SimpleImputer
import warnings
[docs]def plot_intro(df, plot_title='', theme_config='Dimension'):
"""Takes a dataframe with configurations and returns an altair object with summary metrics.
Parameters
-----------
df: pd.DataFrame
Dataframe from which to take columns not limited to numerical columns only
plot_title : string, optional
User can specify the plot title, by default to show the memory usage
theme_config : list, optional
A list of color configurations to be passed to theme, by default to use Demension as config
Returns
-------
plot : altair.Chart object
An altair plot object displaying summary metrics including the memory usage and
the basic description of the input data.
Examples
-------
>>> example_df = pd.DataFrame({'animal': ['falcon', 'dog', 'spider', 'fish'],
'num_legs': [2, 4, 8, 0],
'num_wings': [2, 0, 0, 0],
'num_specimen_seen': [10, 2, 1, 8]})
>>> instaeda_py.plot_intro(example_df)
"""
# Check basic information for input data
sum_missing_columns = df.isnull().sum(axis = 0)
num_of_all_missing_columns = sum(sum_missing_columns)
sum_missing_rows = df.isnull().sum(axis = 1)
num_complete_rows = df.shape[0] - sum(sum_missing_rows)
# Create info dataframe
info_df = pd.DataFrame({'rows': df.shape[0],
'columns': df.shape[1],
'numeric_columns': len(list(df.select_dtypes(include=[np.number]).columns.values)),
'all_missing_columns': num_of_all_missing_columns,
'total_missing_values': df.isnull().sum().sum(),
'complete_rows': num_complete_rows,
'total_observations': df.shape[0] * df.shape[1],
'memory_usage': df.memory_usage(deep=True).sum(),
}, index = [0])
# Create the plotting dataframe
plot_df = pd.DataFrame({'Metrics': ['Numeric Columns', 'All Missing Columns', 'Missing Observations', 'Complete Rows'],
'Value': [float(info_df['numeric_columns']/info_df['columns']),
float(info_df['all_missing_columns']/info_df['columns']),
float(info_df['total_missing_values']/info_df['total_observations']),
float(info_df['complete_rows']/info_df['rows'])],
'Dimension': ['column', 'column', 'observation', 'row']
})
# Create the plot
## Check whether the user specifies a plotting title
if len(plot_title) == 0:
plot_title = 'Memory Usage: ' + str(float(info_df['memory_usage'])) + 'kb'
intro_plot = alt.Chart(plot_df, title=plot_title).mark_bar().encode(
alt.X('Value', axis=alt.Axis(format='%')),
alt.Y('Metrics'),
color=alt.Color(theme_config))
else:
intro_plot = alt.Chart(plot_df, title=plot_title).mark_bar().encode(
alt.X('Value', axis=alt.Axis(format='%')),
alt.Y('Metrics'),
color=alt.Color(theme_config))
return intro_plot
[docs]def plot_corr(df, cols=None, method="pearson", colour_palette="purpleorange"):
"""Takes a dataframe, subsets numeric columns and returns a correlation plot object.
Parameters
-----------
df: pd.DataFrame
Dataframe from which to take columns and calculate, plot correlation between columns.
cols: list, optional
List of columns to perform correlation on. By default, None (perform on all numeric).
method : string, optional
correlation calculation method, one of: {'pearson', 'kendall', 'spearman'}. By default 'pearson'
colour_palette : string, optional
one of Altair accepted colour schemes
Returns
-------
plot : altair.Chart object
Correlation plot object displaying column names and corresponding correlation values.
Examples
-------
>>> example_df = pd.DataFrame({'animal': ['falcon', 'dog', 'spider', 'fish'],
'num_legs': [2, 4, 8, 0],
'num_wings': [2, 0, 0, 0],
'num_specimen_seen': [10, 2, 1, 8]})
>>> instaeda_py.plot_corr(example_df)
"""
# check user input
correlation_methods = {'pearson', 'kendall', 'spearman'}
colour_palette_list = {'blueorange', 'brownbluegreen', 'purplegreen', 'pinkyellowgreen', 'purpleorange', 'redblue', 'redgrey', 'redyellowblue', 'redyellowgreen', 'spectral'}
numeric_cols = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
assert isinstance(df, pd.DataFrame), "must pass in pandas DataFrame"
assert method in correlation_methods, "correlation method not acceptable"
if colour_palette not in colour_palette_list:
warnings.warn("Recommended Altair continuous diverging colour palette")
# calculate
if cols == None:
assert(df.select_dtypes(np.number).shape[1] >= 2), "Dataframe does not have enough numeric columns for comparison"
df = df.select_dtypes(include = numeric_cols)
else:
assert(df[cols].select_dtypes(np.number).shape[1] >= 2), "Dataframe does not have enough numeric columns for comparison"
df = df[cols].select_dtypes(include = numeric_cols)
corr_df = round(df.corr(method=method),4).stack().reset_index(name='corr').rename(columns={'level_0':'variable_1','level_1':'variable_2'})
# plot base plot
corr_plot = alt.Chart(corr_df, title='Correlations between variables').mark_rect().encode(
x = alt.X('variable_1', title=''),
y = alt.Y('variable_2', title=''),
color = alt.Color('corr', scale = alt.Scale(scheme=colour_palette, domain=(-1, 1)))
).properties(height=400, width=400)
# plot corr values
text = corr_plot.mark_text().encode(
text='corr:Q',
color=alt.value('black')
)
return corr_plot + text
[docs]def divide_and_fill(
dataframe,
cols=None,
missing_values=np.nan,
strategy="mean",
fill_value=None,
random=False,
parts=1,
verbose=0,
):
"""Takes a dataframe, subsets selected columns and divides into parts for imputation of missing values and returns a data frame.
Parameters
-----------
dataframe: pd.DataFrame
Dataframe from which to take columns and check for missing values.
cols: list, optional
List of columns to perform imputation on. By default, None (perform on all numeric columns).
missing_values: int, float, str, np.nan or None
The placeholder for the missing values. All occurences of missing values will be imputed.
strategy : string, optional
imputation strategy, one of: {'mean', 'median', 'constant', 'most_frequent'}. By default, 'mean'.
fill_value : string or numerical value, optional
When strategy == 'constant', fill_value is used to replace all occurences of missing_values.
If left to default, fill_value will be 0 when filling numerical data and 'missing' for strings or object data types.
random : boolean, optional
When random == True, shuffles data frame before filling. By default, False.
parts : integer, optional
The number of parts to divide rows of data frame into. By default, 1.
verbose : integer, optional
Controls the verbosity of the divide and fill. By default, 0.
Returns
-------
dataframe : pandas.DataFrame object
Data frame obtained after divide and fill on the corresponding columns.
Examples
-------
>>> import numpy as np
>>> from instaeda import divide_and_fill
>>> example_df = pd.DataFrame({'animal': ['falcon', 'dog', 'spider', 'fish'],
'num_legs': [2, 4, 8, np.nan],
'num_wings': [2, np.nan, 0, 0],
'num_specimen_seen': [10, 2, np.nan, np.nan]})
>>> divide_and_fill(example_df)
"""
filled_df = None
allowed_strategies = ["mean", "median", "constant", "most_frequent"]
# Checking inputs
if verbose:
print("Checking inputs")
if not isinstance(dataframe, pd.DataFrame):
raise Exception("The input data must be of type pandas.DataFrame!")
if cols == None:
cols = list(dataframe.select_dtypes(include="number").columns)
if (
not isinstance(cols, list)
or not all(isinstance(x, str) for x in cols)
or not set(cols).issubset(set(dataframe.columns))
):
raise Exception(
"The input cols must be a list of strings belong to the column names for input dataframe!"
)
if (
not isinstance(missing_values, int)
and not isinstance(missing_values, float)
and not isinstance(missing_values, str)
and (missing_values is not None)
):
raise Exception(
"The input missing values must be one of the following: (int, float, str, np.nan, None)"
)
if strategy not in allowed_strategies:
raise ValueError(
"Can only use these strategies: {0} got strategy = {1}".format(
allowed_strategies, strategy
)
)
if (
(fill_value is not None)
and not isinstance(fill_value, int)
and not isinstance(fill_value, float)
and not isinstance(fill_value, str)
):
raise Exception(
"The input fill values must be one of the following: (int, float, str, None)"
)
if not isinstance(random, bool):
raise Exception("The input random must be True or False")
if not isinstance(parts, int) or (parts < 1):
raise ValueError("Can only use positive integer parts.")
if not isinstance(verbose, int):
raise ValueError("Can only use integer for verbose.")
# Constructing filled dataframe skeleton.
if verbose:
print("Constructing filled dataframe skeleton.")
if random:
filled_df = dataframe.copy().sample(frac=1).reset_index(drop=True)
else:
filled_df = dataframe.copy()
if (set(cols) <= set(dataframe.select_dtypes(include="number").columns)):
if isinstance(fill_value, str) :
raise ValueError(
"For numeric columns, can only use fill values: (int, float, None)"
)
elif (set(cols) <= set(dataframe.select_dtypes(exclude="number").columns)):
if isinstance(fill_value, int) or isinstance(fill_value, float):
raise ValueError(
"For non-numeric columns, can only use fill values: (None, str)"
)
else:
raise Exception("All items in list cols must be numeric, or non-numeric.")
# Filling data frame
spacing = filled_df.shape[0]/(parts + 1)
indexing = np.arange(0, filled_df.shape[0] + spacing, spacing, dtype=int)
for i in range(len(indexing) - 1):
imputer = SimpleImputer(
missing_values=missing_values, strategy=strategy, fill_value=fill_value
)
filled_df.loc[indexing[i] : indexing[i + 1], cols] = imputer.fit_transform(
filled_df.loc[indexing[i] : indexing[i + 1], cols]
)
if verbose:
print("Returning data frame.")
return filled_df
[docs]def plot_basic_distributions(df, cols=None, include=None, vega_theme="ggplot2"):
"""Takes a dataframe and generates plots based on types
Parameters
-----------
df: pd.DataFrame
Dataframe from which to generate plots for each column from
cols: list, optional
List of columns to generate plots for. By default, None (builds charts for all columns).
include: string, optional
Select the data types to include. Supported values include None, "string" and "number". By default, None - it will return both string and number columns.
vega_theme : string, optional
Select the vega.themes for the altair plots. The options include: excel, ggplot2, quartz, vox, fivethirtyeight, dark, latimes, urbaninstitute, and googlecharts. By default, it uses ggplot2.
Returns
-------
dict_plots: dict of altair.Chart objects using the column name as the key
dictionary of generated altair.Chart objects with the column name as the key
Examples
-------
>>> example_df = pd.DataFrame({'animal': ['falcon', 'dog', 'spider', 'fish'],
'num_legs': [2, 4, 8, 0],
'num_wings': [2, 0, 0, 0],
'num_specimen_seen': [10, 2, 1, 8]})
>>> instaeda_py.plot_distribution(example_df)
"""
if not isinstance(df, pd.DataFrame):
raise TypeError("The df parameter must be a pandas dataframe")
if vega_theme not in ('excel','ggplot2','quartz','vox','fivethirtyeight', 'dark', 'latimes', 'urbaninstitute', 'googlecharts'):
warnings.warn("You have selected a theme that is not one of the default Vega color themes.")
# Set vega theme
alt.renderers.enable(embed_options={'theme': vega_theme})
dict_plots = {}
df_data = None
# First filter: select columns
if cols is None:
df_data = df
else:
df_data = df[cols]
if include not in (None, 'number', 'string'):
raise KeyError("The include parameter must be None, 'number' or 'string'")
# Second filter: select types to include
if include == 'number' or include is None:
df_data_number = df_data.select_dtypes(include="number")
for col in df_data_number.columns.tolist():
dict_plots[col] = alt.Chart(df_data_number).mark_bar().encode(
alt.X(col, bin=alt.Bin(maxbins=50)), y='count()')
if include == 'string' or include is None:
df_data_string = df_data.select_dtypes(include="object")
for col in df_data_string.columns.tolist():
dict_plots[col] = alt.Chart(df_data_string).mark_bar().encode(
x=alt.X('count()'),
y=alt.Y(col, sort='-x')
)
if len(dict_plots) == 0:
warnings.warn("Zero plots were generated. Please ensure you specifiy the correct parameters for cols and include")
return dict_plots