Solution: Predicting Diabetes Using PySpark MLlib
This is the solution to the project on predicting diabetes in patients using PySpark MLlib.
main.py
diabetes.csv
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# End-to-end script: load diabetes.csv, do light preprocessing/EDA, then train
# and evaluate a logistic-regression pipeline that predicts the `diabetes`
# column.

spark = SparkSession.builder.getOrCreate()

# Task 1: Load the Diabetes prediction data into a PySpark DataFrame
print("Reading 'diabetes.csv' into diabetes_df dataframe:")
diabetes_df = spark.read.csv("diabetes.csv", header=True, inferSchema=True)

print("First 5 rows of the diabetes_df:")
diabetes_df.show(5)

print("Check the column types of diabetes_df:")
print(diabetes_df.dtypes)

# Task 2: Data Preprocessing and EDA
print("Converting `age` column to Integer Type:")
diabetes_df = diabetes_df.withColumn("age", col("age").cast(IntegerType()))

print("Value types in the smoking_history column")
# .show() is required: groupBy().count() only builds a lazy DataFrame and
# prints nothing on its own.
diabetes_df.groupBy("smoking_history").count().show()

print("Removing smoking_history column")
diabetes_df2 = diabetes_df.drop("smoking_history")

print("Calculating the average blood glucose level for diabetic patients")
print(
    diabetes_df2.filter(col("diabetes") == 1)
    .select(avg("blood_glucose_level"))
    .first()[0]
)

print("Calculating the average blood glucose level for normal patients")
print(
    diabetes_df2.filter(col("diabetes") == 0)
    .select(avg("blood_glucose_level"))
    .first()[0]
)

print("Counting the number of patients per gender")
diabetes_df2.groupBy("gender").count().orderBy("count").show()

# Task 3: Model Training and Evaluation
print("Performing string indexing on the gender column")
# `gender` is the only column indexed here; each indexed column gets an
# "<name>_index" output column that the assembler below consumes.
indexers = [
    StringIndexer(
        inputCol=column,
        outputCol=column + "_index",
        handleInvalid="keep",
    )
    for column in ["gender"]
]

print("Assembling the features into a vector column")
assembler = VectorAssembler(
    inputCols=[
        "age",
        "hypertension",
        "heart_disease",
        "gender_index",
        "bmi",
        "HbA1c_level",
        "blood_glucose_level",
    ],
    outputCol="features",
)

print("Instantiate a Logistic regression model")
lr = LogisticRegression(featuresCol="features", labelCol="diabetes")

print("Create a ML Pipeline combining indexer, assembler and lr")
pipeline = Pipeline(stages=indexers + [assembler, lr])

print("Split the data into training and test sets (80:20)")
# A fixed seed makes the split (and therefore the reported metrics)
# repeatable between runs on the same input.
(trainingData, testData) = diabetes_df2.randomSplit([0.8, 0.2], seed=42)

print("Fit the model to the training data")
pipelineModel = pipeline.fit(trainingData)

print("Use the model to make predictions on the test data")
predictions = pipelineModel.transform(testData)

print("Select the diabetes and prediction columns to see how well we have done")
predictions.select("diabetes", "prediction").show()

print("Evaluate the model")
# BinaryClassificationEvaluator reports area under the ROC curve, not
# accuracy, so the metric is requested explicitly and labelled as such.
evaluator = BinaryClassificationEvaluator(
    labelCol="diabetes",
    metricName="areaUnderROC",
)
area_under_roc = evaluator.evaluate(predictions)
print("Area under ROC:", round(area_under_roc, 2))

# True accuracy: the fraction of test rows whose predicted label equals the
# actual label. Guard against an empty test split to avoid dividing by zero.
total_predictions = predictions.count()
correct_predictions = predictions.filter(
    col("diabetes") == col("prediction")
).count()
accuracy = (
    correct_predictions / total_predictions
    if total_predictions
    else float("nan")
)
print("Accuracy:", round(accuracy, 2))
Get hands-on with 1400+ tech skills courses.