Hello users of pandas,
I often find myself printing the shapes of the dataframes after every step of processing. I do this to monitor how the shape of the data changes and to ensure that it is done correctly.
e.g.
print(df.shape)  # shape before any processing
df=df.dropna()
print(df.shape)  # shape after dropna()
df=df.melt()
print(df.shape)  # shape after melt()
...
I wonder if there is a better, more elegant way to do this, preferably a shorthand or an automatic one.
I built on Matthew Cox's answer and registered a custom accessor (an extra attribute) on the pandas DataFrame itself. This simplifies things a lot.
import numpy as np
import pandas as pd
# set logger: getLogger() without a name returns the root logger
import logging
logger = logging.getLogger()
# lower the threshold to INFO so the info() calls made in log_ are emitted
logger.setLevel(logging.INFO)
# log changes in dataframe
def log_(df, fun, *args, **kwargs):
    """Call the method named ``fun`` on ``df``, logging the shape before and after.

    Args:
        df: Object exposing ``shape`` and a method called ``fun``
            (typically a pandas DataFrame).
        fun: Name of the method, looked up on ``df`` with ``getattr``.
        *args: Positional arguments forwarded to the method.
        **kwargs: Keyword arguments forwarded to the method.

    Returns:
        Whatever the method returns, unchanged (``None`` for in-place calls).

    Raises:
        AttributeError: If ``df`` has no attribute named ``fun``.
    """
    logging.info(f'shape changed from {df.shape}')
    df1 = getattr(df, fun)(*args, **kwargs)
    # In-place calls such as ``dropna(inplace=True)`` return None and mutate
    # ``df`` itself; report the shape of ``df`` then instead of failing on
    # ``None.shape`` after the data has already been modified.
    reshaped = df if df1 is None else df1
    logging.info(f'shape changed to {reshaped.shape}')
    return df1
# custom pandas dataframe
@pd.api.extensions.register_dataframe_accessor("log")
class log:
    """DataFrame accessor, reachable as ``df.log``, whose methods log shape changes.

    Each method forwards to the DataFrame method of the same name through
    ``log_``, which logs the shape before and after the call.
    """

    def __init__(self, pandas_obj):
        # pandas passes in the DataFrame the accessor is being read from.
        self._obj = pandas_obj

    def dropna(self, **kws):
        """Run ``dropna(**kws)`` on the wrapped DataFrame and return its result."""
        return log_(self._obj, fun='dropna', **kws)
# demo data: three rows, of which only "Batman" has a value in every column
df = pd.DataFrame(
    {
        "name": ["Alfred", "Batman", "Catwoman"],
        "toy": [np.nan, "Batmobile", "Bullwhip"],
        "born": [pd.NaT, pd.Timestamp("1940-04-25"), pd.NaT],
    }
)
# trial: call dropna through the "log" accessor so the shapes before and
# after the call are logged
df.log.dropna()
# stderr
INFO:root:shape changed from (3, 3)
INFO:root:shape changed to (1, 3)
# returns dropna'd dataframe