normalizer
Sat Aug 13 2022 11:30:52 GMT+0000 (Coordinated Universal Time)
Saved by
@Safrugans
#python
from pyspark.sql import functions as F
import pyspark.sql.types as Dtype
# Columns that the normalization loop further down runs through normalizer().
columns_to_normalize = ["MakeText", "BodyColorText"]

# Replacement tables keyed by column name; each inner table maps a raw cell
# value to the spelling that should replace it.
col_to_settings = {
    "MakeText": {
        "MERCEDES-BENZ": "Mercedes",
        "FERRARI": "Ferrari",
    },
    "BodyColorText": {
        "schwarz": "Black",
        "schwarz mét.": "Black",
        "braun": "Brown",
        "braun mét.": "Brown",
        "orange": "Orange",
    },
}
def normalizer(value, col):
    """Return the normalized spelling of ``value`` for column ``col``.

    The replacement is looked up in the table registered for ``col`` in the
    module-level ``col_to_settings`` mapping.

    Args:
        value: Raw cell value; ``None`` stands for a missing cell.
        col: Name of the column the value belongs to.

    Returns:
        The configured replacement as a string when ``value`` has one,
        otherwise ``value`` itself converted to a string. ``None`` is
        returned unchanged.
    """
    if value is None:
        # Keep missing cells missing instead of emitting the text "None".
        return None
    # A column without a configured table simply has nothing to replace;
    # previously this case raised AttributeError on ``None.get``.
    replacements = col_to_settings.get(col) or {}
    # Fall back to the input itself so values without a mapping pass through
    # untouched rather than being overwritten with the text "None".
    return str(replacements.get(value, value))
# Load the source CSV, taking column names from its header row.
# NOTE(review): `spark` is not defined in the visible code; presumably it is
# the session object provided by the notebook runtime - confirm.
pre = (
    spark.read.option("header", "true")
    .format("csv")
    .load("/mnt/mducdevdl/mdu/valid/viaturas.csv")
)

for col in columns_to_normalize:
    print(col)
    # `udf` is taken from the already-imported functions module (`F`) so the
    # script does not rely on a bare `udf` name being predefined. The column
    # name is bound as a default argument so each UDF keeps its own column
    # even if it is evaluated after the loop variable has moved on.
    normalize_column = F.udf(
        lambda value, col=col: normalizer(value, col),
        Dtype.StringType(),
    )
    pre = pre.withColumn(col, normalize_column(pre[col]))
    # Print the distinct values the column holds after normalization.
    pre.select(col).distinct().show()
content_copyCOPY
Comments