def detect_outlier(feature):
    """Print candidate upper-bound caps for one column of ``titanic``.

    For the column named ``feature`` this reports three candidate caps and,
    for each, how many values fall outside it:

    * the 95th percentile,
    * mean + 3 standard deviations,
    * the 99th percentile.

    Args:
        feature: Name of a column in the module-level ``titanic`` frame
            (presumably a pandas DataFrame with numeric columns, since
            ``.quantile`` is called on the selection).

    Returns:
        None. The report is written to stdout.
    """
    data = titanic[feature]
    mean = np.mean(data)
    std = np.std(data)

    # Count values whose |z-score| exceeds 3. This is a two-sided count
    # (values far below the mean are included too), even though the printed
    # threshold is only the upper bound mean + 3*std.
    z_scores = (data - mean) / std
    n_beyond_3sd = int((np.abs(z_scores) > 3).sum())

    # Compute each percentile once rather than once per element.
    p95 = data.quantile(.95)
    p99 = data.quantile(.99)
    n_above_p95 = int((data > p95).sum())
    n_above_p99 = int((data > p99).sum())

    print('\nOutlier caps for {}:'.format(feature))
    print(' --95p: {:.1f} / {} values exceed that'.format(p95, n_above_p95))
    print(' --3sd: {:.1f} / {} values exceed that'.format(mean + 3*(std), n_beyond_3sd))
    print(' --99p: {:.1f} / {} values exceed that'.format(p99, n_above_p99))


# Determine what the upper bound should be for continuous features
for feat in ['Age_clean', 'SibSp', 'Parch', 'Fare']:
    detect_outlier(feat)

# Cap Age_clean at its 99th percentile. The result is assigned back to the
# column instead of using clip(inplace=True) on the selection: an in-place
# call on titanic['Age_clean'] operates on a temporary under pandas
# Copy-on-Write and would leave the DataFrame unchanged.
titanic['Age_clean'] = titanic['Age_clean'].clip(
    upper=titanic['Age_clean'].quantile(0.99)
)