...

/

Solution: Data Transformation

Solution: Data Transformation

Let's look at the solution to the data transformation challenge.

We'll cover the following...

Task

Compute summary statistics on the `review_text` and `vote` columns.

Solution

def impute_NAN_values(df, columnName, value):
    """
    Fill the missing entries of a single column with a replacement value.

    :param df: DataFrame-like object exposing a ``fillna`` method
    :param columnName: Name of the column whose missing entries are filled
    :param value: Replacement used for the missing entries of that column
    :return: The result of ``df.fillna`` applied to the column/value mapping
    """
    # Restrict the fill to the requested column via a one-entry mapping
    replacements = {columnName: value}
    return df.fillna(replacements)

def show_vote_stat(df: SparkDf) -> None:
    """
    Pretty-print summary statistics of the mean vote computed per asin.

    The votes are averaged within each ``asin`` group, and the count, min,
    25%, 75% and max of those averages are printed.

    :param df: A DataFrame having asin and vote columns
    :return: Nothing; the summary rows are printed with pprint
    """
    mean_vote = fn.mean(col("vote")).alias("mean_vote")
    per_asin = df.groupby("asin").agg(mean_vote)

    # Keep only the averaged column so the summary describes just that column
    stats = per_asin.select("mean_vote").summary(
        "count", "min", "25%", "75%", "max"
    )

    # Turn every summary Row into a plain dict before printing
    as_dicts = stats.rdd.map(lambda record: record.asDict(recursive=True))
    pprint(as_dicts.collect())

def show_review_text_stat(df: SparkDf) -> None:
    """
    Print summary statistics of the review text length.

    The summary (count, min, 25%, 75%, max) covers only rows whose
    ``review_text_len`` is greater than zero. A separate count of rows with
    a length of one or less is printed afterwards.

    :param df: DataFrame with a ``review_text_len`` column
    :return: Nothing; results are written to standard output
    """
    length = col("review_text_len")

    # Summary over non-empty reviews only
    non_empty = df.filter(length > 0).select("review_text_len")
    stats = non_empty.summary("count", "min", "25%", "75%", "max")
    as_dicts = stats.rdd.map(lambda record: record.asDict(recursive=True))

    print("Review Length Stat")
    pprint(as_dicts.collect())

    # Counted on the unfiltered frame, so rows with length <= 0 are included
    short_review_count = df.filter(length <= 1).count()
    print(f"Reviews with length one or less: {short_review_count}")

Solution to the data transformation challenge

Explanation

  • Line 3: We use the .fillna method to impute the
...