# Explore the distribution of the continuous Titanic features and derive a
# power-transformed fare column (`Fare_clean_tr`).
#
# Relies on names provided elsewhere in the notebook/file: `titanic`
# (a DataFrame with `Age_clean` and `Fare_clean` columns), `sns`, `plt`,
# `np`, `scipy.stats` and `qqplot`.

# Plot a histogram for each continuous feature to see if a transformation
# is necessary.
# NOTE(review): `sns.distplot` is deprecated in newer seaborn releases
# (`sns.histplot` is the replacement) -- confirm against the installed
# version before upgrading.
for feature in ['Age_clean', 'Fare_clean']:
    sns.distplot(titanic[feature], kde=False)
    plt.title('Histogram for {}'.format(feature))
    plt.show()

# Generate QQ plots for a wide sweep of root transformations x**(1/i);
# the straighter the points, the closer the transformed data is to normal.
for i in [0.5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]:
    data_t = titanic['Fare_clean']**(1/i)
    qqplot(data_t, line='s')
    plt.title("Transformation: 1/{}".format(str(i)))

# Narrow the sweep to the better-looking range from the QQ plots.
# This is a plain power transformation x**(1/i) (in the spirit of Box-Cox,
# but not scipy's `boxcox`): overlay each transformed histogram with the
# normal PDF that has the same mean and standard deviation.
for i in [3, 4, 5, 6, 7]:
    data_t = titanic['Fare_clean']**(1/i)
    n, bins, patches = plt.hist(data_t, 50, density=True)
    mu = np.mean(data_t)
    sigma = np.std(data_t)
    plt.plot(bins, scipy.stats.norm.pdf(bins, mu, sigma))
    plt.title("Transformation: 1/{}".format(str(i)))
    plt.show()

# Create the new transformed feature.
# The original called `.apply()` with no function, which raises TypeError
# because `Series.apply` requires one.
# NOTE(review): the 5th root is assumed here as the midpoint of the
# shortlisted range [3..7]; confirm the exponent against the plots above.
titanic['Fare_clean_tr'] = titanic['Fare_clean'].apply(lambda x: x**(1/5))
titanic.head()
Preview:
downloadDownload PNG
downloadDownload JPEG
downloadDownload SVG
Tip: You can change the style, width & colours of the snippet with the inspect tool before clicking Download!
Click to optimize width for Twitter