Solution: Predicting Diabetes Using PySpark MLlib
This lesson presents the solution for predicting diabetes in patients using PySpark MLlib.
Press + to interact
main.py
diabetes.csv
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Walkthrough script: load diabetes.csv, explore it, then train and evaluate
# a logistic-regression pipeline that predicts the `diabetes` label column.
spark = SparkSession.builder.getOrCreate()

# Task 1: Load the Diabetes prediction data into a PySpark DataFrame
print("Reading 'diabetes.csv' into diabetes_df dataframe:")
diabetes_df = spark.read.csv("diabetes.csv", header=True, inferSchema=True)

print("First 5 rows of the diabetes_df:")
diabetes_df.show(5)

print("Check the column types of diabetes_df:")
print(diabetes_df.dtypes)

# Task 2: Data Preprocessing and EDA
print("Converting `age` column to Integer Type:")
diabetes_df = diabetes_df.withColumn("age", col("age").cast(IntegerType()))

print("Value types in the smoking_history column")
# DataFrames are lazy: without .show() the grouped counts are never computed
# or printed, so the message above would be followed by no output.
diabetes_df.groupBy("smoking_history").count().show()

print("Removing smoking_history column")
diabetes_df2 = diabetes_df.drop("smoking_history")

print("Calculating the average blood glucose level for diabetic patients")
print(
    diabetes_df2.filter(col("diabetes") == 1)
    .select(avg("blood_glucose_level"))
    .first()[0]
)

print("Calculating the average blood glucose level for normal patients")
print(
    diabetes_df2.filter(col("diabetes") == 0)
    .select(avg("blood_glucose_level"))
    .first()[0]
)

print("Counting the number of patients for each gender")
diabetes_df2.groupBy("gender").count().orderBy("count").show()

# Task 3: Model Training and Evaluation
print("Performing string indexing on the gender column")
# handleInvalid="keep" assigns unseen/invalid categories to an extra index
# instead of raising when the fitted model transforms the data.
indexers = [
    StringIndexer(
        inputCol=column,
        outputCol=column + "_index",
        handleInvalid="keep",
    )
    for column in ["gender"]
]

print("Assembling the features into a vector column")
assembler = VectorAssembler(
    inputCols=[
        "age",
        "hypertension",
        "heart_disease",
        "gender_index",
        "bmi",
        "HbA1c_level",
        "blood_glucose_level",
    ],
    outputCol="features",
)

print("Instantiate a Logistic regression model")
lr = LogisticRegression(featuresCol="features", labelCol="diabetes")

print("Create a ML Pipeline combining indexer, assembler and lr")
pipeline = Pipeline(stages=indexers + [assembler, lr])

print("Split the data into training and test sets (80:20)")
(trainingData, testData) = diabetes_df2.randomSplit([0.8, 0.2])

print("Fit the model to the training data")
pipelineModel = pipeline.fit(trainingData)

print("Use the model to make predictions on the test data")
predictions = pipelineModel.transform(testData)

print("Select the diabetes and prediction columns to see how well we have done")
# .show() is required to actually display the selected columns.
predictions.select("diabetes", "prediction").show()

print("Evaluate the model")
# BinaryClassificationEvaluator reports area under the ROC curve by default,
# not accuracy; name the metric explicitly and label the output accordingly.
evaluator = BinaryClassificationEvaluator(
    labelCol="diabetes",
    metricName="areaUnderROC",
)
area_under_roc = evaluator.evaluate(predictions)
print("Area under ROC:", round(area_under_roc, 2))
Here’s a breakdown of what’s happening:
- Lines 1–8: Import the necessary libraries