SQL - Data Cleaning - Fraud Aus

PHOTO EMBED

Thu Jun 29 2023 12:28:06 GMT+0000 (Coordinated Universal Time)

Saved by @roym17 #sql

-- CREATING TABLES BEFORE IMPORTING RAW DATA


CREATE TABLE t_aus (
	category VARCAR(100),
	reported_loss VARCHAR(100),
	total_reports VARCHAR(100),
	reports_with_loss VARCHAR(100),
	loss_from_total_reports VARCHAR(100),
	percentage_change VARCHAR(100)

);

CREATE TABLE t_state(
	state VARCHAR(100),
	category VARCHAR(100),
	reported_loss VARCHAR(100),
	total_reports VARCHAR(100),
	reports_with_loss VARCHAR(100),
	loss_from_total_reports VARCHAR(100),
	percentage_change VARCHAR(100)
);

CREATE TABLE t_age (
	age VARCHAR(100),
	amount_loss VARCHAR(100),

	reported_loss VARCHAR (100)
);

CREATE TABLE t_gender(
	gender VARCHAR(100),
	amount_loss VARCHAR(100),
	percentage VARCHAR(100)

);

CREATE TABLE t_contact (
	contact_method VARCHAR(100),
	amount_lost VARCHAR(100),
	reported_loss VARCHAR(100)
);


-- Creating duplicate tables for Cleaning/Altering

CREATE TABLE t_aus_copy AS
SELECT * FROM t_aus;

CREATE TABLE t_state_copy AS
SELECT * FROM t_state;

CREATE TABLE t_age_copy AS
SELECT * FROM t_age;

CREATE TABLE t_gender_copy AS
SELECT * FROM t_gender;

CREATE TABLE t_contact_copy AS
SELECT * FROM t_contact;

-- DATA CLEANING: t_Aus_copy Table

	-- Dropping unecessary columns

	ALTER TABLE t_aus
	DROP COLUMN percentage_change;

	-- Removing Symbols

	UPDATE t_aus
	SET reported_loss = REPLACE(reported_loss, '$','');

	UPDATE t_aus
	SET reported_loss = REPLACE(reported_loss,',','');

	UPDATE t_aus
	SET total_reports = REPLACE(total_reports,',','');

	UPDATE t_aus
	SET reports_with_loss = REPLACE(reports_with_loss,',','');

	UPDATE t_aus
	SET loss_from_total_reports = REPLACE(loss_from_total_reports,',','');


-- Dropping unnecessary columns

ALTER TABLE t_aus
DROP COLUMN loss_from_total_reports;

ALTER TABLE t_aus
DROP COLUMN percentage_change;

	-- Finding & Replacing missing values

	SELECT *
	FROM t_aus
	WHERE category IS NULL AND reported_loss IS NULL AND total_reports IS NULL AND reports_with_loss IS NULL AND loss_from_total_reports IS NULL;
	-- No columns have missing values

-- Converting Data Types From String To Integers for t_aus

ALTER TABLE t_aus
ALTER COLUMN reported_loss TYPE integer
USING (CASE WHEN reported_loss ~ '^\d+$' THEN reported_loss::integer ELSE NULL END);

ALTER TABLE t_aus
ALTER COLUMN total_reports TYPE integer
USING (CASE WHEN total_reports ~ '^\d+$' THEN total_reports::integer ELSE NULL END);

ALTER TABLE t_aus
ALTER COLUMN reports_with_loss TYPE integer
USING (CASE WHEN reports_with_loss ~ '^\d+$' THEN reports_with_loss::integer ELSE NULL END);

-- Converting Data Types From String To Integers for t_state

ALTER TABLE t_state
ALTER COLUMN reported_loss TYPE integer
USING (CASE WHEN reported_loss ~ '^\d+$' THEN reported_loss::integer ELSE NULL END);

ALTER TABLE t_state
ALTER COLUMN total_reports TYPE integer
USING (CASE WHEN total_reports ~ '^\d+$' THEN total_reports::integer ELSE NULL END);

ALTER TABLE t_state
ALTER COLUMN reports_with_loss TYPE integer
USING (CASE WHEN reports_with_loss ~ '^\d+$' THEN reports_with_loss::integer ELSE NULL END);
content_copyCOPY