Transforming Skewed Data

PHOTO EMBED

Wed Mar 16 2022 07:10:35 GMT+0000 (Coordinated Universal Time)

Saved by @abhin__dev

# Draw a count histogram (no KDE curve) for every continuous feature so that
# skew is visible and we can decide whether a transformation is warranted.
continuous_features = ('Age_clean', 'Fare_clean')
for column in continuous_features:
    values = titanic[column]
    sns.distplot(values, kde=False)
    plt.title('Histogram for {}'.format(column))
    plt.show()
    
# Generate QQ plots of Fare_clean raised to the power 1/i for a range of i,
# to judge which power brings the distribution closest to normal.
# Note that i = 0.5 yields the exponent 2 (squaring) and i = 1 leaves the
# data unchanged, so both serve as reference points.
for i in [0.5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]:
    data_t = titanic['Fare_clean'] ** (1 / i)
    # NOTE(review): qqplot is presumably statsmodels' qqplot, where line='s'
    # adds a standardized reference line -- confirm against the import.
    qqplot(data_t, line='s')
    plt.title("Transformation: 1/{}".format(i))
    # Show each figure as it is produced, like the other plotting loops in
    # this script; otherwise the figures pile up unshown outside a notebook.
    plt.show()

# Narrow the search to the most promising roots seen in the QQ plots.
# For each candidate, draw a density histogram of the transformed fares and
# overlay the normal curve that has the same mean and standard deviation.
# NOTE(review): x**(1/i) is a plain power (root) transform; it matches a
# Box-Cox transform with lambda = 1/i only up to a shift and scale.
for root in (3, 4, 5, 6, 7):
    transformed = titanic['Fare_clean'] ** (1 / root)
    # Only the bin edges are needed: they are the x-values for the curve.
    _, bin_edges, _ = plt.hist(transformed, bins=50, density=True)
    center = np.mean(transformed)
    spread = np.std(transformed)
    normal_curve = scipy.stats.norm.pdf(bin_edges, center, spread)
    plt.plot(bin_edges, normal_curve)
    plt.title("Transformation: 1/{}".format(str(root)))
    plt.show()
    
# Create the new transformed feature by taking a root of the fare, using the
# same x ** (1 / i) form that was explored in the plots above.
# Series.apply() requires a function argument, so the transform is written as
# a vectorized power instead of an element-wise apply.
# NOTE(review): 5 is the midpoint of the shortlisted roots (3-7); replace it
# with whichever root looked most normal in the plots.
FARE_ROOT = 5
titanic['Fare_clean_tr'] = titanic['Fare_clean'] ** (1 / FARE_ROOT)
titanic.head()
content_copyCOPY

http://localhost:8888/notebooks/Desktop/Python/EDA/archive/Untitled.ipynb