pyspark

Sun Oct 26 2025 14:28:49 GMT+0000 (Coordinated Universal Time)

Saved by @rcb

# Install PySpark (the "!" prefix assumes a notebook environment such as Colab)
!pip install pyspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, round, when, current_date, datediff

# -----------------------------------------------------
# Create Spark Session
# -----------------------------------------------------
spark = SparkSession.builder.appName("EmployeeDataAnalysis").getOrCreate()

# -----------------------------------------------------
# Read CSV file into DataFrame
# -----------------------------------------------------
df = spark.read.csv("emp1.csv", header=True, inferSchema=True)

print("\n=== Original Employee Data ===")
df.show()
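
# Sketch: confirm what inferSchema produced. JoinDate should come back as a
# date (Spark 3+) or a string; datediff below accepts either.
df.printSchema()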

# -----------------------------------------------------
# (a) Calculate the average salary for each department
# -----------------------------------------------------
avg_salary = df.groupBy("Department").agg(round(avg("Salary"), 2).alias("Avg_Salary"))

print("\n=== (a) Average Salary per Department ===")
avg_salary.show()
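
# Alternative sketch (not used below): a window function yields the same
# per-department average without a separate join; "Avg_Salary_W" is just an
# illustrative name.
from pyspark.sql.window import Window
dept_window = Window.partitionBy("Department")
df.withColumn("Avg_Salary_W", round(avg("Salary").over(dept_window), 2)).show()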

# -----------------------------------------------------
# Join the average salary data back to the main DataFrame
# -----------------------------------------------------
df = df.join(avg_salary, on="Department", how="left")

print("\n=== After Joining Average Salary to Main Data ===")
df.show()

# -----------------------------------------------------
# (b) Create new column 'Salary_Increase(%)'
# -----------------------------------------------------
df = df.withColumn(
    "Salary_Increase(%)",
    round(((col("Salary") - col("Avg_Salary")) / col("Avg_Salary")) * 100, 2)
)
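
# Note: "(%)" in a column name is legal, but SQL-style expressions (e.g.
# selectExpr) would need backticks around `Salary_Increase(%)`; plain string
# references like the select below work as-is.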

print("\n=== (b) Salary Increase (%) for Each Employee ===")
df.select("EmpID", "Name", "Department", "Salary", "Avg_Salary", "Salary_Increase(%)").show()

# -----------------------------------------------------
# (c) Add new column 'YearsWithCompany'
# -----------------------------------------------------
df = df.withColumn(
    "YearsWithCompany",
    round(datediff(current_date(), col("JoinDate")) / 365, 1)
)

print("\n=== (c) Years Each Employee Has Been with the Company ===")
df.select("EmpID", "Name", "JoinDate", "YearsWithCompany").show()
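
# Sketch of a month-aware alternative (assumes JoinDate is a date or
# date-castable string): months_between respects actual month lengths,
# unlike a flat 365-day year.
from pyspark.sql.functions import months_between
df.select(
    "EmpID", "Name", "JoinDate",
    round(months_between(current_date(), col("JoinDate")) / 12, 1).alias("Years_MB")
).show()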

# -----------------------------------------------------
# (d) Categorize salary into ranges (Low, Medium, High)
# -----------------------------------------------------
df = df.withColumn(
    "Salary_Category",
    when(col("Salary") < 55000, "Low")
    .when((col("Salary") >= 55000) & (col("Salary") < 65000), "Medium")
    .otherwise("High")
)

print("\n=== (d) Salary Category Based on Salary Range ===")
df.select("EmpID", "Name", "Salary", "Salary_Category").show()
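
# Quick sanity check: distribution of employees across the salary bands.
df.groupBy("Salary_Category").count().show()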

# -----------------------------------------------------
# Final DataFrame
# -----------------------------------------------------
print("\n=== Final Employee Data with All Calculations ===")
df.show()
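
# Optional sketch: persist the enriched DataFrame. The output path
# "emp1_enriched" is an assumption, not part of the original task.
# df.coalesce(1).write.mode("overwrite").option("header", True).csv("emp1_enriched")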

# -----------------------------------------------------
# Stop the Spark session
# -----------------------------------------------------
spark.stop()


emp1.csv (sample input)
EmpID,Name,Department,Salary,JoinDate
101,John,IT,60000,2018-03-15
102,Alice,HR,50000,2016-07-21
103,Bob,Finance,55000,2015-09-10
104,David,IT,70000,2019-05-01
105,Emma,HR,65000,2020-02-12
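
Expected (a) averages for this sample, hand-checked: Finance 55000.0, HR 57500.0, IT 65000.0.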



