cartoons <-read.csv("../../data/cartoonsvs.csv")
print(head(cartoons))

  Participant.ID Gender     Cartoon Rating
1             P1   Male Chota Bheem    8.5
2             P2   Male Chota Bheem    6.0
3             P3   Male Chota Bheem    8.0
4             P4   Male Chota Bheem    7.0
5             P5   Male Chota Bheem    8.0
6             P6   Male Chota Bheem   10.0

## Research Question: Which is better among Doraemon, Dragon Tales, and Chhota Bheem?

library(dplyr)


cartoons_modified <- cartoons %>%
  mutate(Gender = as.factor(Gender)) %>%
  mutate(Cartoon = as.factor(Cartoon))

colnames(cartoons_modified)

[1] "Participant.ID" "Gender"         "Cartoon"        "Rating"

Plotting Graphs for Eda

gf_histogram(~Rating,
  fill = ~Cartoon, 
  data = cartoons_modified, 
  alpha = 0.5,  
  bins = 25  
) %>%
  gf_vline(xintercept = ~ mean(Rating, na.rm = TRUE),  
            linetype = "dashed", color = "red") %>%
  gf_labs(
    title = "Histogram of Cartoon Ratings",
    x = "Rating", 
    y = "Count"
  ) %>%
  gf_text(
    label = "Overall Mean", 
    x = mean(cartoons_modified$Rating, na.rm = TRUE),  
    y = 2, 
    color = "red"
  ) %>%
  gf_refine(guides(fill = guide_legend(title = "Cartoon")))

gf_boxplot(
  data = cartoons_modified,
  Rating ~ Cartoon,
  fill = ~Cartoon,
  alpha = 0.5
) %>%
  gf_vline(xintercept = ~ mean(Rating, na.rm = TRUE)) %>%
  gf_labs(
    title = "Boxplots of Cartoon Ratings",
    x = "Cartoon", 
    y = "Rating",
   
  ) %>%
  gf_refine(
    scale_x_discrete(guide = "prism_bracket"),
    guides(fill = guide_legend(title = "Cartoon"))
  )

Warning: The S3 guide system was deprecated in ggplot2 3.5.0.
ℹ It has been replaced by a ggproto system that can be extended.

Observations

Doraemon with the highest mean seems to be the most popular, also shows wide variability

Dragon tales - less popular than the other two

Chota Bheem shows a smaller range of ratings

Outliers for Doraemon and Dragon tales

Anova

cartoon_anova <- aov(Rating ~ Cartoon, data = cartoons_modified)

supernova::pairwise(cartoon_anova,
  correction = "Bonferroni", # Try "Tukey"
  alpha = 0.05, # 95% CI calculation
  var_equal = TRUE, # We'll see
  plot = TRUE
)

── Pairwise t-tests with Bonferroni correction ─────────────────────────────────

Model: Rating ~ Cartoon

Cartoon

Levels: 3

Family-wise error-rate: 0.049


  group_1      group_2      diff pooled_se     t    df  lower upper  p_adj
  <chr>        <chr>       <dbl>     <dbl> <dbl> <int>  <dbl> <dbl>  <dbl>
1 Doraemon     Chota Bheem 0.580     0.354 1.636    87 -0.186 1.346  .3161
2 Dragon Tales Chota Bheem 0.597     0.354 1.683    87 -0.170 1.363  .2877
3 Dragon Tales Doraemon    0.017     0.354 0.047    87 -0.750 0.783 1.0000

Doraemon and Chota bheem have significantly different mean ratings.
Dragon Tales and Chota bheem have significantly different mean ratings.
There is no significant difference in mean ratings between Dragon Tales and Doraemon.

Doraemon is the highest rated followed by cHOTA bheem and then Dragon tales

supernova::equation(cartoon_anova)

Fitted equation:
Rating = 6.67 + 0.58*CartoonDoraemon + 0.5966667*CartoonDragon Tales + e

Checking Assumptions

Check for normality

shapiro.test(x = cartoons_modified$Rating)


    Shapiro-Wilk normality test

data:  cartoons_modified$Rating
W = 0.93517, p-value = 0.0002269

checking normality for each cartoon

normality_results <- cartoons_modified %>%
  group_by(Cartoon) %>%
  summarise(shapiro_p_value = shapiro.test(Rating)$p.value)

print(normality_results)

# A tibble: 3 × 2
  Cartoon      shapiro_p_value
  <fct>                  <dbl>
1 Chota Bheem           0.185 
2 Doraemon              0.0139
3 Dragon Tales          0.0240

based on p values, only the data for chota bheem is normally distributed

Residual post-model:

cartoon_anova$residuals %>%
  as_tibble() %>%
  gf_dhistogram(~value, data = .) %>%
  gf_fitdistr()

##
cartoon_anova$residuals %>%
  as_tibble() %>%
  gf_qq(~value, data = .) %>%
  gf_qqstep() %>%
  gf_qqline()

##
shapiro.test(cartoon_anova$residuals)


    Shapiro-Wilk normality test

data:  cartoon_anova$residuals
W = 0.93926, p-value = 0.0003856

residuals are also not normally distributed

Check for Variance

cartoons_modified %>%
  group_by(Cartoon) %>%
  summarise(variance = var(Rating))

# A tibble: 3 × 2
  Cartoon      variance
  <fct>           <dbl>
1 Chota Bheem      2.21
2 Doraemon         5.25
3 Dragon Tales     3.84

# Perform Levene's Test for homogeneity of variances
DescTools::LeveneTest(Rating ~ Cartoon, data = cartoons_modified)

Levene's Test for Homogeneity of Variance (center = median)
      Df F value Pr(>F)
group  2  1.2923 0.2798
      87

# Perform Fligner-Killeen Test for homogeneity of variances
fligner.test(Rating ~ Cartoon, data = cartoons_modified)


    Fligner-Killeen test of homogeneity of variances

data:  Rating by Cartoon
Fligner-Killeen:med chi-squared = 1.8135, df = 2, p-value = 0.4038

both tests indicate that variances are approx equal.

Anova using permutation

observed_infer <-
  cartoons_modified %>%  
  specify(Rating ~ Cartoon) %>%
  hypothesise(null = "independence") %>%
  calculate(stat = "F")

observed_infer

Response: Rating (numeric)
Explanatory: Cartoon (factor)
Null Hypothesis: independence
# A tibble: 1 × 1
   stat
  <dbl>
1 0.919

null_dist_infer <- cartoons_modified %>%
  specify(Rating ~ Cartoon) %>%
  hypothesise(null = "independence") %>%
  generate(reps = 4999, type = "permute") %>%
  calculate(stat = "F")
##
head(null_dist_infer, n = 15)

Response: Rating (numeric)
Explanatory: Cartoon (factor)
Null Hypothesis: independence
# A tibble: 15 × 2
   replicate   stat
       <int>  <dbl>
 1         1 0.530 
 2         2 0.0222
 3         3 0.111 
 4         4 0.108 
 5         5 0.190 
 6         6 0.310 
 7         7 1.39  
 8         8 1.27  
 9         9 0.215 
10        10 0.852 
11        11 0.696 
12        12 0.0170
13        13 0.566 
14        14 0.792 
15        15 2.90

##
null_dist_infer %>%
  visualise(method = "simulation") +
  shade_p_value(obs_stat = observed_infer$stat, direction = "right") +
  scale_x_continuous(trans = "log10", expand = c(0, 0)) +
  coord_cartesian(xlim = c(0.2, 500), clip = "off") +
  annotation_logticks(outside = FALSE)

Warning in transformation$transform(x): NaNs produced

Warning in scale_x_continuous(trans = "log10", expand = c(0, 0)): log-10
transformation introduced infinite values.

based on the infer based permutation test, the observed test statistic is not unusual and we fail to reject the null hypothesis- ???

Based on pairwise comparisons though…Doraemon > Chota Bheem > Dragon Tales