Recent | Popular
#udf #pyspark
import pyspark.sql.types as T

# UDF returning a nested array of doubles (ArrayType of ArrayType of DoubleType).
@udf(returnType=T.ArrayType(T.ArrayType(T.DoubleType())))
def aad():
    """Return a constant 2-D list of doubles for every input row."""
    return [[1.22]]

# BUG FIX: `aad` takes no parameters, so it must be invoked with no column
# arguments — the original `aad(F.col())` fails because F.col() requires a
# column name and the UDF accepts no inputs anyway. Also assign the result:
# DataFrames are immutable, so an unassigned withColumn() is a no-op.
df = df.withColumn("new_col", aad())
#pyspark #python
# Materialize the DataFrame on the driver ONCE. Each .collect() call launches
# a full Spark job, so the original's four separate collect() calls did the
# same distributed work four times over.
rows = dataframe.collect()
print(rows[0])    # first row
print(rows[1])    # second row
print(rows[-1])   # last row
print(rows[2])    # third row
# Group by the (Add, Name) pair and show the per-group sum of the id column.
grouped = df.groupBy("Add", "Name")
grouped.agg({'id': 'sum'}).show()
#pyspark #python #panda
# REPL transcript: month-end date range via the pandas-on-Spark API
# (presumably `ps` is `pyspark.pandas` — see the linked docs URL below).
# NOTE(review): unlike native pandas, the result here carries freq=None.
>>> ps.date_range(start='1/1/2018', periods=5, freq='M') DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30', '2018-05-31'], dtype='datetime64[ns]', freq=None)
#pyspark #unique #count #dataframe #column
# Count distinct values of the stg_nexus_member_cd column; .show() prints a
# single-row result. (Assumes `f` is the pyspark.sql.functions alias and
# `trx_1` is a DataFrame defined elsewhere — TODO confirm.)
trx_1.select(f.countDistinct("stg_nexus_member_cd")).show()
#pyspark #spark #python #etl
# Split my_str_col on '-' and expose the two halves as new columns
# NAME1 and NAME2.
parts = pyspark.sql.functions.split(df['my_str_col'], '-')
df = (
    df.withColumn('NAME1', parts.getItem(0))
      .withColumn('NAME2', parts.getItem(1))
)
#python #pyspark #spark #spark-session
# Obtain (or lazily create) the shared SparkSession, named 'abc'.
from pyspark.sql import SparkSession

builder = SparkSession.builder.appName('abc')
spark = builder.getOrCreate()
Mon Dec 12 2022 15:41:23 GMT+0000 (Coordinated Universal Time) https://sparkbyexamples.com/pyspark/pyspark-udf-user-defined-function/
Tue Oct 04 2022 14:19:27 GMT+0000 (Coordinated Universal Time) https://www.geeksforgeeks.org/get-specific-row-from-pyspark-dataframe/
Tue Oct 04 2022 13:29:22 GMT+0000 (Coordinated Universal Time) https://www.educba.com/pyspark-groupby-multiple-columns/
Tue Oct 04 2022 13:28:17 GMT+0000 (Coordinated Universal Time) https://spark.apache.org/docs/latest/api/python/reference/pyspark.pandas/api/pyspark.pandas.date_range.html
Wed May 26 2021 08:31:02 GMT+0000 (Coordinated Universal Time)
Wed Feb 24 2021 17:36:03 GMT+0000 (Coordinated Universal Time) https://stackoverflow.com/questions/39235704/split-spark-dataframe-string-column-into-multiple-columns
Fri Sep 04 2020 05:19:26 GMT+0000 (Coordinated Universal Time) https://stackoverflow.com/questions/39780792/how-to-build-a-sparksession-in-spark-2-0-using-pyspark