Groups

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(mosaic)

Registered S3 method overwritten by 'mosaic':
  method                           from   
  fortify.SpatialPolygonsDataFrame ggplot2

The 'mosaic' package masks several functions from core packages in order to add 
additional features.  The original behavior of these functions should not be affected by this.

Attaching package: 'mosaic'

The following object is masked from 'package:Matrix':

    mean

The following objects are masked from 'package:dplyr':

    count, do, tally

The following object is masked from 'package:purrr':

    cross

The following object is masked from 'package:ggplot2':

    stat

The following objects are masked from 'package:stats':

    binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
    quantile, sd, t.test, var

The following objects are masked from 'package:base':

    max, mean, min, prod, range, sample, sum

library(ggformula)
library(palmerpenguins) # Our new favourite dataset
library(skimr)


Attaching package: 'skimr'

The following object is masked from 'package:mosaic':

    n_missing

A box plot summarises a distribution with five numbers: min, 1st quartile, median, 3rd quartile, and max.

Box plots are especially useful when we want to compare multiple distributions.

# Load the gss_wages dataset from the Rdatasets mirror; column types are
# guessed by readr (5 character and 7 double columns, per the message below).
wages <- readr::read_csv(
  file = "https://vincentarelbundock.github.io/Rdatasets/csv/stevedata/gss_wages.csv"
)

Rows: 61697 Columns: 12
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): occrecode, wrkstat, gender, educcat, maritalcat
dbl (7): rownames, year, realrinc, age, occ10, prestg10, childs

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

examining

# Compact preview: one line per column showing its type and first few values.
glimpse(wages)

Rows: 61,697
Columns: 12
$ rownames   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
$ year       <dbl> 1974, 1974, 1974, 1974, 1974, 1974, 1974, 1974, 1974, 1974,…
$ realrinc   <dbl> 4935, 43178, NA, NA, 18505, 22206, 55515, NA, NA, 4935, NA,…
$ age        <dbl> 21, 41, 83, 69, 58, 30, 48, 67, 51, 54, 89, 71, 27, 30, 22,…
$ occ10      <dbl> 5620, 2040, NA, NA, 5820, 910, 230, 6355, 4720, 3940, 4810,…
$ occrecode  <chr> "Office and Administrative Support", "Professional", NA, NA…
$ prestg10   <dbl> 25, 66, NA, NA, 37, 45, 59, 49, 28, 38, 47, 45, 50, 29, 33,…
$ childs     <dbl> 0, 3, 2, 2, 0, 0, 2, 1, 2, 2, 3, 1, 4, 3, 0, 1, 2, 3, 4, 8,…
$ wrkstat    <chr> "School", "Full-Time", "Housekeeper", "Housekeeper", "Full-…
$ gender     <chr> "Male", "Male", "Female", "Female", "Female", "Male", "Male…
$ educcat    <chr> "High School", "Bachelor", "Less Than High School", "Less T…
$ maritalcat <chr> "Married", "Married", "Widowed", "Widowed", "Never Married"…

# Summaries split by variable kind: level counts for categorical columns,
# quantiles / mean / sd for quantitative ones, plus missing-value counts.
inspect(wages)


categorical variables:  
        name     class levels     n missing
1  occrecode character     11 58136    3561
2    wrkstat character      8 61676      21
3     gender character      2 61697       0
4    educcat character      5 61562     135
5 maritalcat character      5 61670      27
                                   distribution
1 Professional (19%), Service (16.9%) ...      
2 Full-Time (49.4%), Housekeeper (15.1%) ...   
3 Female (56.1%), Male (43.9%)                 
4 High School (51.5%) ...                      
5 Married (51.7%), Never Married (21.8%) ...   

quantitative variables:  
      name   class  min    Q1 median    Q3      max         mean           sd
1 rownames numeric    1 15425  30849 46273  61697.0 30849.000000 17810.534116
2     year numeric 1974  1985   1996  2006   2018.0  1996.073715    12.794470
3 realrinc numeric  227  8156  16563 27171 480144.5 22326.359234 28581.794499
4      age numeric   18    32     44    59     89.0    46.176177    17.561065
5    occ10 numeric   10  2710   4720  6230   9997.0  4695.774081  2627.724076
6 prestg10 numeric   16    33     42    50     80.0    43.060701    12.987526
7   childs numeric    0     0      2     3      8.0     1.923457     1.763569
      n missing
1 61697       0
2 61697       0
3 37887   23810
4 61478     219
5 58136    3561
6 57511    4186
7 61508     189

# Per-column summary including missing counts, completeness rate and, for
# numeric columns, an inline histogram.
skim(wages)

Data summary
Name	wages
Number of rows	61697
Number of columns	12
_______________________
Column type frequency:
character	5
numeric	7
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
occrecode	3561	0.94	5	37	11
wrkstat	21	1.00	5	23	8
gender	0	1.00	4	6	2
educcat	135	1.00	8	21	5
maritalcat	27	1.00	7	13	5

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
rownames	0	1.00	30849.00	17810.53	1	15425	30849	46273	61697.0	▇▇▇▇▇
year	0	1.00	1996.07	12.79	1974	1985	1996	2006	2018.0	▆▇▇▇▇
realrinc	23810	0.61	22326.36	28581.79	227	8156	16563	27171	480144.5	▇▁▁▁▁
age	219	1.00	46.18	17.56	18	32	44	59	89.0	▇▇▆▅▂
occ10	3561	0.94	4695.77	2627.72	10	2710	4720	6230	9997.0	▃▅▇▂▃
prestg10	4186	0.93	43.06	12.99	16	33	42	50	80.0	▃▇▇▃▁
childs	189	1.00	1.92	1.76	0	0	2	3	8.0	▇▇▂▁▁

dictionary & munging

# Keep only respondents with a recorded income: realrinc is missing for
# 23810 of the 61697 rows, and those rows cannot be plotted.
wages_modified <- tidyr::drop_na(wages, realrinc)

graphs

# Plot 1A: a single vertical boxplot of income. The constant string "Income"
# acts as a dummy x-axis "variable" so that every row falls into one box.
gf_boxplot(realrinc ~ "Income", data = wages_modified) %>%
  gf_labs(
    subtitle = "Many outliers on the high side",
    title = "Plot 1A: Income has a skewed distribution"
  )

# Plot 1B: the same income distribution as a horizontal boxplot, obtained by
# swapping the two sides of the formula. Labelled 1B so that it is not
# confused with the vertical version (the title previously repeated "Plot 1A").
wages_modified %>%
  gf_boxplot("Income" ~ realrinc) %>%
  gf_labs(
    title = "Plot 1B: Income has a skewed distribution",
    subtitle = "Many outliers on the high side"
  )

Is income affected by gender?

# Plot 2A: one horizontal box per gender, income on its raw scale.
gf_boxplot(gender ~ realrinc, data = wages_modified) %>%
  gf_labs(title = "Plot 2A: Income by Gender")

# Plot 2B: income is log10-transformed inside the formula, which reduces the
# effect of extreme values on the picture.
gf_boxplot(gender ~ log10(realrinc), data = wages_modified) %>%
  gf_labs(title = "Plot 2B: Log(Income) by Gender")

# Plot 2C: income stays in its original units; the log10 transform is applied
# to the x-axis scale instead, and each box is filled by gender.
gf_boxplot(gender ~ realrinc, fill = ~gender, data = wages_modified) %>%
  gf_refine(scale_x_log10()) %>%
  gf_labs(title = "Plot 2C: Income filled by Gender, log scale")

Income (`realrinc`) against education category (`educcat`, i.e. degrees)

# Rebuild the plotting data for the income-by-education plots.
# Both columns must be complete: starting again from the raw `wages` and
# dropping only missing `educcat` would bring back the rows with missing
# `realrinc`, which the boxplots then discard with a
# "Removed ... rows containing non-finite" warning.
wages_modified <-
  wages %>%
  tidyr::drop_na(realrinc, educcat)

# Plot 3A: one horizontal box per education category, income on its raw scale.
gf_boxplot(educcat ~ realrinc, data = wages_modified) %>%
  gf_labs(title = "Plot 3A: Income by Education Category")

Warning: Removed 23716 rows containing non-finite outside the scale range
(`stat_boxplot()`).

There are many outliers: some individuals in each category earn significantly more than the others.

# Plot 3B: vertical boxes of log10(income) for each education category.
# The log transform compresses the long upper tail so the boxes are easier
# to compare. The title previously repeated "Plot 3A" and did not mention
# the transform.
wages_modified %>%
  gf_boxplot(log10(realrinc) ~ educcat) %>%
  gf_labs(title = "Plot 3B: Log(Income) by Education Category")

Warning: Removed 23716 rows containing non-finite outside the scale range
(`stat_boxplot()`).