normalizer

PHOTO EMBED

Sat Aug 13 2022 11:30:52 GMT+0000 (Coordinated Universal Time)

Saved by @Safrugans #python

from pyspark.sql import functions as F
import pyspark.sql.types as Dtype
 
# Columns whose raw values are rewritten to canonical names.
columns_to_normalize = ["MakeText", "BodyColorText"]

# Per-column lookup tables: raw source value -> canonical value.
# Keys are matched exactly (case- and accent-sensitive).
col_to_settings = {
    "MakeText": {
        "MERCEDES-BENZ": "Mercedes",
        "FERRARI": "Ferrari",
    },
    "BodyColorText": {
        "schwarz": "Black",
        "schwarz mét.": "Black",
        "braun": "Brown",
        "braun mét.": "Brown",
        "orange": "Orange",
    },
}


def normalizer(value, col):
    """Return the canonical form of *value* for column *col*.

    Args:
        value: Raw cell value to normalize; may be ``None``.
        col: Name of the column the value belongs to, used to select the
            lookup table in ``col_to_settings``.

    Returns:
        The mapped canonical string when ``col_to_settings[col]`` has an
        entry for *value*; otherwise *value* itself, unchanged. This means
        unmapped values and ``None`` pass through as-is, and a column with
        no lookup table is left untouched.
    """
    # A missing table is treated as "nothing to normalize" rather than
    # failing with AttributeError on None.
    mapping = col_to_settings.get(col) or {}
    # Fall back to the input: wrapping a failed lookup in str() would turn
    # every unmapped value (and every null) into the literal text "None".
    return mapping.get(value, value)
    
##normalize_column = udf(lambda x: normalizer(x, col), Dtype.StringType())
 
# Load the CSV, treating its first row as the header.
# NOTE(review): `spark` is not defined in this file; presumably the notebook
# runtime provides the session -- confirm before running elsewhere.
pre = (
    spark.read
    .option("header", "true")
    .format("csv")
    .load("/mnt/mducdevdl/mdu/valid/viaturas.csv")
)

# Rewrite each configured column in place and print its distinct values
# so the result of the normalization can be inspected.
for col in columns_to_normalize:
    print(col)

    # Use F.udf: only `functions as F` is imported, a bare `udf` is not.
    # `col=col` freezes the current column name inside the lambda instead of
    # relying on the loop variable's value at the time the UDF is serialized.
    normalize_column = F.udf(
        lambda x, col=col: normalizer(x, col),
        Dtype.StringType(),
    )

    pre = pre.withColumn(col, normalize_column(pre[col]))
    pre.select(col).distinct().show()
content_copyCOPY