from pyspark.sql import functions as F
import pyspark.sql.types as Dtype

# Columns whose raw values are rewritten to canonical labels.
columns_to_normalize = ["MakeText", "BodyColorText"]

# Per-column lookup table: raw value as it appears in the data -> normalized label.
col_to_settings = {
    "MakeText": {
        "MERCEDES-BENZ": "Mercedes",
        "FERRARI": "Ferrari",
    },
    "BodyColorText": {
        "schwarz": "Black",
        "schwarz mét.": "Black",
        "braun": "Brown",
        "braun mét.": "Brown",
        "orange": "Orange",
    },
}


def normalizer(value, col):
    """Return the normalized label for ``value`` in column ``col``.

    Args:
        value: Raw cell value; may be ``None`` for a null cell.
        col: Name of the column the value belongs to, used to select the
            lookup table in ``col_to_settings``.

    Returns:
        The mapped label when ``value`` has an entry for ``col``; otherwise
        ``value`` itself (as a string). ``None`` is returned unchanged so
        nulls stay null instead of becoming the literal string ``"None"``.
    """
    if value is None:
        return None
    # Fall back to the input when the column or the value has no mapping, so
    # values that are not listed in the table are preserved rather than lost.
    mapping = col_to_settings.get(col, {})
    return str(mapping.get(value, value))


# NOTE(review): `spark` is not defined in this file; it is presumably the
# session object injected by the notebook/runtime -- confirm before running
# this as a standalone script.
pre = (
    spark.read.option("header", "true")
    .format("csv")
    .load("/mnt/mducdevdl/mdu/valid/viaturas.csv")
)

for column_name in columns_to_normalize:
    print(column_name)
    # Bind the column name as a default argument so each UDF keeps its own
    # column even if it is serialized or evaluated after the loop has moved on.
    normalize_column = F.udf(
        lambda x, column_name=column_name: normalizer(x, column_name),
        Dtype.StringType(),
    )
    pre = pre.withColumn(column_name, normalize_column(pre[column_name]))
    pre.select(column_name).distinct().show()
Preview:
Download PNG
Download JPEG
Download SVG
Tip: You can change the style, width & colours of the snippet with the inspect tool before clicking Download!
Click to optimize width for Twitter