# Plot a histogram for each continuous feature to see whether a
# transformation is necessary.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 in favour of
# sns.histplot / sns.displot; the call is left unchanged here.
for feature in ['Age_clean', 'Fare_clean']:
    sns.distplot(titanic[feature], kde=False)
    plt.title('Histogram for {}'.format(feature))
    # Show inside the loop so each feature gets its own figure.
    plt.show()
# Generate QQ plots of Fare_clean raised to the power 1/i for several
# candidate values of i, one plot per candidate, so the candidates can be
# compared visually.
# NOTE(review): qqplot is presumably statsmodels' qqplot -- confirm against
# the file's imports.
for i in [0.5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]:
    data_t = titanic['Fare_clean'] ** (1 / i)
    qqplot(data_t, line='s')
    plt.title("Transformation: 1/{}".format(i))
# Narrow the candidates to a smaller range chosen from the QQ plots.
# For each candidate power transformation x ** (1/i), draw a density
# histogram of the transformed values and overlay the normal pdf that has the
# same mean and standard deviation, to judge how close to normal it is.
for i in [3, 4, 5, 6, 7]:
    data_t = titanic['Fare_clean'] ** (1 / i)
    # Only the bin edges are needed afterwards (as x-values for the pdf).
    _, bins, _ = plt.hist(data_t, 50, density=True)
    mu = np.mean(data_t)
    sigma = np.std(data_t)
    plt.plot(bins, scipy.stats.norm.pdf(bins, mu, sigma))
    plt.title("Transformation: 1/{}".format(i))
    plt.show()
# Create the new transformed feature.
# Series.apply() with no function raises TypeError, so the transformation is
# written as the same vectorized power expression used in the plots above.
# NOTE(review): the 5th root is assumed here (midpoint of the 3..7 range that
# was plotted) -- confirm against the plots and adjust fare_root if needed.
fare_root = 5
titanic['Fare_clean_tr'] = titanic['Fare_clean'] ** (1 / fare_root)
titanic.head()
# Comments