dataframe operations

PHOTO EMBED

Tue Jan 04 2022 13:03:09 GMT+0000 (Coordinated Universal Time)

Saved by @ahoeweler

# Quick inspection of an existing pandas DataFrame `df` (assumed to be defined beforehand).
df.describe()  # summary statistics (count, mean, std, min, quartiles, max); numeric columns by default
df.info()  # prints index range, column dtypes, non-null counts and memory usage
df.dtypes  # dtype of every column, returned as a Series
df.values  # values of df as a NumPy array
df.columns  # column labels of df

# NOTE: set_index / reset_index / sort_index return a new DataFrame by default;
# assign the result (e.g. df = df.set_index(...)) to keep it.
df.index  # row labels (index) of df
df.set_index('col1')  # use col1 as the index; pass a list (['col1', 'col2']) to build a MultiIndex
df.reset_index()  # move the index back into regular columns; drop=True discards it instead
df.sort_index(level=['col1', 'col2'], ascending=[False, True])  # MultiIndex: col1 descending, col2 ascending
# Once the MultiIndex is sorted we can slice by index labels
# (label slices with .loc include both endpoints; pandas requires a sorted index for this):
df.loc[('Pakistan', 'Lahore'):('Russia', 'Moscow')]  # rows from ('Pakistan', 'Lahore') through ('Russia', 'Moscow'); each tuple is (outer level, inner level)
df.loc[("a", "b"):("c", "d"), "e":"f"]  # slice rows and columns at the same time
# get the last 10 rows of the data frame (positional slicing)
df.iloc[-10:]


# Size, ordering and summary statistics of an existing DataFrame `df`.
df.info()  # per-column non-null counts reveal missing values
df.shape  # (n_rows, n_columns) tuple -- an attribute, so no parentheses
df.sort_values(['col1', 'col2'], ascending=[True, False])  # sort by col1 (ascending), ties broken by col2 (descending)
df['col'].mean()  # also .median(), .min(), .max(), .std(), .var(), .quantile()
# .min() / .max() also work on date columns
df['col1'].cumsum()  # running total up to and including each row; also .cummax(), .cummin(), .cumprod()
# Apply several aggregation functions to several columns at once:
# df[["col1", "col2", "col3"]].agg([function1, function2])

# duplicates, unique values and their frequencies
df.drop_duplicates(subset=['col1', 'col2'])  # keep only the first row of each distinct (col1, col2) combination
df['col1'].value_counts(sort=True, normalize=True)  # share of each distinct value (normalize=True), most frequent first (sort=True)
df['col1'].unique()  # array of the distinct values in col1

# get the 10 rows with the largest values in column "col" (sorted descending by that column)
df.nlargest(10, "col")
content_copyCOPY