Code Replication for mpg dataset

Reading mpg

mpg

# A tibble: 234 × 11
   manufacturer model      displ  year   cyl trans drv     cty   hwy fl    class
   <chr>        <chr>      <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
 1 audi         a4           1.8  1999     4 auto… f        18    29 p     comp…
 2 audi         a4           1.8  1999     4 manu… f        21    29 p     comp…
 3 audi         a4           2    2008     4 manu… f        20    31 p     comp…
 4 audi         a4           2    2008     4 auto… f        21    30 p     comp…
 5 audi         a4           2.8  1999     6 auto… f        16    26 p     comp…
 6 audi         a4           2.8  1999     6 manu… f        18    26 p     comp…
 7 audi         a4           3.1  2008     6 auto… f        18    27 p     comp…
 8 audi         a4 quattro   1.8  1999     4 manu… 4        18    26 p     comp…
 9 audi         a4 quattro   1.8  1999     4 auto… 4        16    25 p     comp…
10 audi         a4 quattro   2    2008     4 manu… 4        20    28 p     comp…
# ℹ 224 more rows

Examining the data set using glimpse, inspect and skimr

dplyr::glimpse(mpg)

Rows: 234
Columns: 11
$ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "audi", "…
$ model        <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro", "…
$ displ        <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8, 3.1, 1.8, 1.8, 2.0, 2.0, 2.…
$ year         <int> 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 200…
$ cyl          <int> 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, …
$ trans        <chr> "auto(l5)", "manual(m5)", "manual(m6)", "auto(av)", "auto…
$ drv          <chr> "f", "f", "f", "f", "f", "f", "f", "4", "4", "4", "4", "4…
$ cty          <int> 18, 21, 20, 21, 16, 18, 18, 18, 16, 20, 19, 15, 17, 17, 1…
$ hwy          <int> 29, 29, 31, 30, 26, 26, 27, 26, 25, 28, 27, 25, 25, 25, 2…
$ fl           <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p…
$ class        <chr> "compact", "compact", "compact", "compact", "compact", "c…

mosaic::inspect(mpg)


categorical variables:  
          name     class levels   n missing
1 manufacturer character     15 234       0
2        model character     38 234       0
3        trans character     10 234       0
4          drv character      3 234       0
5           fl character      5 234       0
6        class character      7 234       0
                                   distribution
1 dodge (15.8%), toyota (14.5%) ...            
2 caravan 2wd (4.7%) ...                       
3 auto(l4) (35.5%), manual(m5) (24.8%) ...     
4 f (45.3%), 4 (44%), r (10.7%)                
5 r (71.8%), p (22.2%), e (3.4%) ...           
6 suv (26.5%), compact (20.1%) ...             

quantitative variables:  
   name   class    min     Q1 median     Q3  max        mean       sd   n
1 displ numeric    1.6    2.4    3.3    4.6    7    3.471795 1.291959 234
2  year integer 1999.0 1999.0 2003.5 2008.0 2008 2003.500000 4.509646 234
3   cyl integer    4.0    4.0    6.0    8.0    8    5.888889 1.611534 234
4   cty integer    9.0   14.0   17.0   19.0   35   16.858974 4.255946 234
5   hwy integer   12.0   18.0   24.0   27.0   44   23.440171 5.954643 234
  missing
1       0
2       0
3       0
4       0
5       0

skimr::skim(mpg)

Data summary
Name	mpg
Number of rows	234
Number of columns	11
_______________________
Column type frequency:
character	6
numeric	5
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
manufacturer	1	4	10	15
model	1	2	22	38
trans	1	8	10	10
drv	1	1	1	3
fl	1	1	1	5
class	1	3	10	7

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
displ	1	3.47	1.29	1.6	2.4	3.3	4.6	7	▇▆▆▃▁
year	1	2003.50	4.51	1999.0	1999.0	2003.5	2008.0	2008	▇▁▁▁▇
cyl	1	5.89	1.61	4.0	4.0	6.0	8.0	8	▇▁▇▁▇
cty	1	16.86	4.26	9.0	14.0	17.0	19.0	35	▆▇▃▁▁
hwy	1	23.44	5.95	12.0	18.0	24.0	27.0	44	▅▅▇▁▁

Data Munging

# Recode the categorical columns of mpg (cyl, fl, drv, class, trans) as
# factors in one pass, then print a compact overview of the result.
mpg_modified <- mpg %>%
  dplyr::mutate(
    dplyr::across(c(cyl, fl, drv, class, trans), as_factor)
  )
glimpse(mpg_modified)

Rows: 234
Columns: 11
$ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "audi", "…
$ model        <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro", "…
$ displ        <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8, 3.1, 1.8, 1.8, 2.0, 2.0, 2.…
$ year         <int> 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 200…
$ cyl          <fct> 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, …
$ trans        <fct> auto(l5), manual(m5), manual(m6), auto(av), auto(l5), man…
$ drv          <fct> f, f, f, f, f, f, f, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, r, …
$ cty          <int> 18, 21, 20, 21, 16, 18, 18, 18, 16, 20, 19, 15, 17, 17, 1…
$ hwy          <int> 29, 29, 31, 30, 26, 26, 27, 26, 25, 28, 27, 25, 25, 25, 2…
$ fl           <fct> p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, r, …
$ class        <fct> compact, compact, compact, compact, compact, compact, com…

Math Anxiety

# Load the semicolon-delimited Math Anxiety data from the shared data folder.
# NOTE(review): readr reports Age as a "number" column below, which may mean
# the file uses decimal commas -- confirm before interpreting raw Age values.
math_anxiety <- readr::read_delim(
  file = "../../data/MathAnxiety.csv",
  delim = ";"
)

Rows: 599 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ";"
chr (2): Gender, Grade
dbl (3): AMAS, RCMAS, Arith
num (1): Age

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

print(head(math_anxiety))

# A tibble: 6 × 6
    Age Gender Grade      AMAS RCMAS Arith
  <dbl> <chr>  <chr>     <dbl> <dbl> <dbl>
1  1378 Boy    Secondary     9    20     6
2  1407 Boy    Secondary    18     8     6
3  1379 Girl   Secondary    23    26     5
4  1428 Girl   Secondary    19    18     7
5  1356 Boy    Secondary    23    20     1
6  1350 Girl   Secondary    27    33     1

math_anxiety

# A tibble: 599 × 6
     Age Gender Grade      AMAS RCMAS Arith
   <dbl> <chr>  <chr>     <dbl> <dbl> <dbl>
 1  1378 Boy    Secondary     9    20     6
 2  1407 Boy    Secondary    18     8     6
 3  1379 Girl   Secondary    23    26     5
 4  1428 Girl   Secondary    19    18     7
 5  1356 Boy    Secondary    23    20     1
 6  1350 Girl   Secondary    27    33     1
 7  1336 Boy    Secondary    22    23     4
 8  1393 Boy    Secondary    17    11     7
 9  1317 Girl   Secondary    28    32     2
10  1348 Boy    Secondary    20    30     6
# ℹ 589 more rows

Examining the data

dplyr::glimpse(math_anxiety)

Rows: 599
Columns: 6
$ Age    <dbl> 1378, 1407, 1379, 1428, 1356, 1350, 1336, 1393, 1317, 1348, 141…
$ Gender <chr> "Boy", "Boy", "Girl", "Girl", "Boy", "Girl", "Boy", "Boy", "Gir…
$ Grade  <chr> "Secondary", "Secondary", "Secondary", "Secondary", "Secondary"…
$ AMAS   <dbl> 9, 18, 23, 19, 23, 27, 22, 17, 28, 20, 16, 20, 21, 36, 16, 27, …
$ RCMAS  <dbl> 20, 8, 26, 18, 20, 33, 23, 11, 32, 30, 10, 4, 23, 26, 24, 21, 3…
$ Arith  <dbl> 6, 6, 5, 7, 1, 1, 4, 7, 2, 6, 2, 5, 2, 6, 2, 7, 2, 4, 7, 3, 8, …

# Summary statistics for the Math Anxiety data
skimr::skim(math_anxiety)

Data summary
Name	math_anxiety
Number of rows	599
Number of columns	6
_______________________
Column type frequency:
character	2
numeric	4
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
Gender	1	3	4	2
Grade	1	7	9	2

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Age	1	1246.49	223.11	37	1061.5	1208	1418.5	1875	▁▁▇▇▃
AMAS	1	21.98	6.60	4	18.0	22	26.5	45	▂▆▇▃▁
RCMAS	1	19.24	7.57	1	14.0	19	25.0	41	▂▇▇▅▁
Arith	1	5.30	2.11	0	4.0	6	7.0	8	▂▃▃▇▇

mosaic::inspect(math_anxiety)


categorical variables:  
    name     class levels   n missing
1 Gender character      2 599       0
2  Grade character      2 599       0
                                   distribution
1 Boy (53.9%), Girl (46.1%)                    
2 Primary (66.9%), Secondary (33.1%)           

quantitative variables:  
   name   class min     Q1 median     Q3  max       mean         sd   n missing
1   Age numeric  37 1061.5   1208 1418.5 1875 1246.49249 223.112183 599       0
2  AMAS numeric   4   18.0     22   26.5   45   21.98164   6.597962 599       0
3 RCMAS numeric   1   14.0     19   25.0   41   19.24040   7.566802 599       0
4 Arith numeric   0    4.0      6    7.0    8    5.30217   2.105220 599       0

# Render the skim() summary of math_anxiety as a captioned table with
# centre-aligned columns that does not stretch to the full page width.
skim(math_anxiety) %>%
  kbl(align = "c", caption = "Skim Output for Math anxiety Dataset") %>%
  kable_paper(full_width = FALSE)

Skim Output for Math anxiety Dataset
skim_type	skim_variable	complete_rate	character.min	character.max	character.empty	character.n_unique	character.whitespace	numeric.mean	numeric.sd	numeric.p0	numeric.p25	numeric.p50	numeric.p75	numeric.p100	numeric.hist
character	Gender	1	3	4	0	2	0	NA	NA	NA	NA	NA	NA	NA	NA
character	Grade	1	7	9	0	2	0	NA	NA	NA	NA	NA	NA	NA	NA
numeric	Age	1	NA	NA	NA	NA	NA	1246.49249	223.112183	37	1061.5	1208	1418.5	1875	▁▁▇▇▃
numeric	AMAS	1	NA	NA	NA	NA	NA	21.98164	6.597962	4	18.0	22	26.5	45	▂▆▇▃▁
numeric	RCMAS	1	NA	NA	NA	NA	NA	19.24040	7.566802	1	14.0	19	25.0	41	▂▇▇▅▁
numeric	Arith	1	NA	NA	NA	NA	NA	5.30217	2.105220	0	4.0	6	7.0	8	▂▃▃▇▇

Data Dictionary

  Variable_Name Data_Type                       Description
1        Gender    Factor                            Gender
2         Grade Character Grade the student is currently in
3           Age   Integer                         Age, /120
4          AMAS   Numeric                      Type of Exam
5         RCMAS   Numeric                      Another Exam
6         Arith   Numeric                  Yet another exam

Data Munging

# Recode the raw columns, then print a compact overview of the result:
#   * Gender becomes a factor.
#   * Age is divided by 120 and truncated to a whole number by as.integer();
#     presumably this turns the raw values into completed years (the data
#     dictionary lists the column as "Age, /120") -- confirm the raw unit.
mathanxiety_modified <- dplyr::mutate(
  math_anxiety,
  Gender = as.factor(Gender),
  Age = as.integer(Age / 120)
)
glimpse(mathanxiety_modified)

Rows: 599
Columns: 6
$ Age    <int> 11, 11, 11, 11, 11, 11, 11, 11, 10, 11, 11, 13, 12, 11, 11, 11,…
$ Gender <fct> Boy, Boy, Girl, Girl, Boy, Girl, Boy, Boy, Girl, Boy, Boy, Boy,…
$ Grade  <chr> "Secondary", "Secondary", "Secondary", "Secondary", "Secondary"…
$ AMAS   <dbl> 9, 18, 23, 19, 23, 27, 22, 17, 28, 20, 16, 20, 21, 36, 16, 27, …
$ RCMAS  <dbl> 20, 8, 26, 18, 20, 33, 23, 11, 32, 30, 10, 4, 23, 26, 24, 21, 3…
$ Arith  <dbl> 6, 6, 5, 7, 1, 1, 4, 7, 2, 6, 2, 5, 2, 6, 2, 7, 2, 4, 7, 3, 8, …

Reading the data again

mathanxiety_modified

# A tibble: 599 × 6
     Age Gender Grade      AMAS RCMAS Arith
   <int> <fct>  <chr>     <dbl> <dbl> <dbl>
 1    11 Boy    Secondary     9    20     6
 2    11 Boy    Secondary    18     8     6
 3    11 Girl   Secondary    23    26     5
 4    11 Girl   Secondary    19    18     7
 5    11 Boy    Secondary    23    20     1
 6    11 Girl   Secondary    27    33     1
 7    11 Boy    Secondary    22    23     4
 8    11 Boy    Secondary    17    11     7
 9    10 Girl   Secondary    28    32     2
10    11 Boy    Secondary    20    30     6
# ℹ 589 more rows

Questions

Is there a difference in anxiety levels between genders across all three exams?

# Per-gender mean of each of the three scores, plus the group size.
mathanxiety_modified %>%
  group_by(Gender) %>%
  summarise(
    across(c(AMAS, RCMAS, Arith), mean, .names = "average_{.col}"),
    count = n()
  )

# A tibble: 2 × 5
  Gender average_AMAS average_RCMAS average_Arith count
  <fct>         <dbl>         <dbl>         <dbl> <int>
1 Boy            21.2          18.1          5.27   323
2 Girl           22.9          20.6          5.34   276

Inferences:

Girls show higher average scores than boys on both AMAS (22.9 vs 21.2) and RCMAS (20.6 vs 18.1).

The difference on the Arith exam is negligible (5.34 vs 5.27).

Similarly for grade…

Do the anxiety levels vary based on the students’ grade level?

# Per-grade mean of each of the three scores, plus the group size.
mathanxiety_modified %>%
  group_by(Grade) %>%
  summarise(
    across(c(AMAS, RCMAS, Arith), mean, .names = "average_{.col}"),
    count = n()
  )

# A tibble: 2 × 5
  Grade     average_AMAS average_RCMAS average_Arith count
  <chr>            <dbl>         <dbl>         <dbl> <int>
1 Primary           21.8          19.6          5.81   401
2 Secondary         22.3          18.5          4.28   198

Inferences:

Anxiety levels seem pretty consistent across both grades.

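Primary-grade students seem to experience slightly higher anxiety on RCMAS (19.6 vs 18.5) but not on AMAS (21.8 vs 22.3), and they score higher on Arith (5.81 vs 4.28). We also need to take into consideration that their sample size is larger (401 vs 198).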

Using favstats… overview of anxiety scores across all exams by gender

AMAS

# Min, quartiles, max, mean, sd, n and missing count of AMAS for each gender
mosaic::favstats(AMAS ~ Gender, data = mathanxiety_modified)

  Gender min Q1 median Q3 max     mean       sd   n missing
1    Boy   4 17     21 26  45 21.16718 6.506321 323       0
2   Girl   9 19     23 28  40 22.93478 6.588372 276       0

Arith

# Min, quartiles, max, mean, sd, n and missing count of Arith for each gender
mosaic::favstats(Arith ~ Gender, data = mathanxiety_modified)

  Gender min Q1 median Q3 max     mean       sd   n missing
1    Boy   0  4      6  7   8 5.272446 2.122070 323       0
2   Girl   0  4      6  7   8 5.336957 2.088639 276       0

RCMAS

# Min, quartiles, max, mean, sd, n and missing count of RCMAS for each gender
mosaic::favstats(RCMAS ~ Gender, data = mathanxiety_modified)

  Gender min Q1 median Q3 max     mean       sd   n missing
1    Boy   1 13     18 23  41 18.11765 7.534090 323       0
2   Girl   3 15     20 26  38 20.55435 7.404712 276       0

# List the column names of the raw math_anxiety data
names(math_anxiety)

[1] "Age"    "Gender" "Grade"  "AMAS"   "RCMAS"  "Arith"