groups

Groups

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(mosaic)
Registered S3 method overwritten by 'mosaic':
  method                           from   
  fortify.SpatialPolygonsDataFrame ggplot2

The 'mosaic' package masks several functions from core packages in order to add 
additional features.  The original behavior of these functions should not be affected by this.

Attaching package: 'mosaic'

The following object is masked from 'package:Matrix':

    mean

The following objects are masked from 'package:dplyr':

    count, do, tally

The following object is masked from 'package:purrr':

    cross

The following object is masked from 'package:ggplot2':

    stat

The following objects are masked from 'package:stats':

    binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
    quantile, sd, t.test, var

The following objects are masked from 'package:base':

    max, mean, min, prod, range, sample, sum
library(ggformula)
library(palmerpenguins)# Our new favourite dataset
library(skimr)

Attaching package: 'skimr'

The following object is masked from 'package:mosaic':

    n_missing

A box plot summarises five numbers: min, max, median, 1st quartile, and 3rd quartile.

Box plots are most useful when we want to compare multiple distributions.

# Read the GSS wages data (gss_wages from stevedata) from the Rdatasets mirror.
wages <- read_csv(
  file = "https://vincentarelbundock.github.io/Rdatasets/csv/stevedata/gss_wages.csv"
)
Rows: 61697 Columns: 12
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): occrecode, wrkstat, gender, educcat, maritalcat
dbl (7): rownames, year, realrinc, age, occ10, prestg10, childs

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

examining

# Compact column-by-column preview: name, type, and the first few values.
dplyr::glimpse(wages)
Rows: 61,697
Columns: 12
$ rownames   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
$ year       <dbl> 1974, 1974, 1974, 1974, 1974, 1974, 1974, 1974, 1974, 1974,…
$ realrinc   <dbl> 4935, 43178, NA, NA, 18505, 22206, 55515, NA, NA, 4935, NA,…
$ age        <dbl> 21, 41, 83, 69, 58, 30, 48, 67, 51, 54, 89, 71, 27, 30, 22,…
$ occ10      <dbl> 5620, 2040, NA, NA, 5820, 910, 230, 6355, 4720, 3940, 4810,…
$ occrecode  <chr> "Office and Administrative Support", "Professional", NA, NA…
$ prestg10   <dbl> 25, 66, NA, NA, 37, 45, 59, 49, 28, 38, 47, 45, 50, 29, 33,…
$ childs     <dbl> 0, 3, 2, 2, 0, 0, 2, 1, 2, 2, 3, 1, 4, 3, 0, 1, 2, 3, 4, 8,…
$ wrkstat    <chr> "School", "Full-Time", "Housekeeper", "Housekeeper", "Full-…
$ gender     <chr> "Male", "Male", "Female", "Female", "Female", "Male", "Male…
$ educcat    <chr> "High School", "Bachelor", "Less Than High School", "Less T…
$ maritalcat <chr> "Married", "Married", "Widowed", "Widowed", "Never Married"…
# mosaic's summary, reported separately for categorical and quantitative
# variables.
mosaic::inspect(wages)

categorical variables:  
        name     class levels     n missing
1  occrecode character     11 58136    3561
2    wrkstat character      8 61676      21
3     gender character      2 61697       0
4    educcat character      5 61562     135
5 maritalcat character      5 61670      27
                                   distribution
1 Professional (19%), Service (16.9%) ...      
2 Full-Time (49.4%), Housekeeper (15.1%) ...   
3 Female (56.1%), Male (43.9%)                 
4 High School (51.5%) ...                      
5 Married (51.7%), Never Married (21.8%) ...   

quantitative variables:  
      name   class  min    Q1 median    Q3      max         mean           sd
1 rownames numeric    1 15425  30849 46273  61697.0 30849.000000 17810.534116
2     year numeric 1974  1985   1996  2006   2018.0  1996.073715    12.794470
3 realrinc numeric  227  8156  16563 27171 480144.5 22326.359234 28581.794499
4      age numeric   18    32     44    59     89.0    46.176177    17.561065
5    occ10 numeric   10  2710   4720  6230   9997.0  4695.774081  2627.724076
6 prestg10 numeric   16    33     42    50     80.0    43.060701    12.987526
7   childs numeric    0     0      2     3      8.0     1.923457     1.763569
      n missing
1 61697       0
2 61697       0
3 37887   23810
4 61478     219
5 58136    3561
6 57511    4186
7 61508     189
# skimr's overview: missingness, quantiles, and an inline histogram per
# numeric column. Called directly (not piped) so the data name is reported
# as `wages`.
skimr::skim(wages)
Data summary
Name wages
Number of rows 61697
Number of columns 12
_______________________
Column type frequency:
character 5
numeric 7
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
occrecode 3561 0.94 5 37 0 11 0
wrkstat 21 1.00 5 23 0 8 0
gender 0 1.00 4 6 0 2 0
educcat 135 1.00 8 21 0 5 0
maritalcat 27 1.00 7 13 0 5 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
rownames 0 1.00 30849.00 17810.53 1 15425 30849 46273 61697.0 ▇▇▇▇▇
year 0 1.00 1996.07 12.79 1974 1985 1996 2006 2018.0 ▆▇▇▇▇
realrinc 23810 0.61 22326.36 28581.79 227 8156 16563 27171 480144.5 ▇▁▁▁▁
age 219 1.00 46.18 17.56 18 32 44 59 89.0 ▇▇▆▅▂
occ10 3561 0.94 4695.77 2627.72 10 2710 4720 6230 9997.0 ▃▅▇▂▃
prestg10 4186 0.93 43.06 12.99 16 33 42 50 80.0 ▃▇▇▃▁
childs 189 1.00 1.92 1.76 0 0 2 3 8.0 ▇▇▂▁▁

dictionary & munging

# Keep only the rows with a recorded income: realrinc is the variable
# plotted below and it is missing for a large share of the rows.
wages_modified <- tidyr::drop_na(wages, realrinc)

graphs

# Single vertical box plot of income. The constant "Income" on the
# right-hand side of the formula acts as a dummy x-axis category.
gf_boxplot(realrinc ~ "Income", data = wages_modified) %>%
  gf_labs(
    title = "Plot 1A: Income has a skewed distribution",
    subtitle = "Many outliers on the high side"
  )

# Horizontal version of the income box plot: putting the dummy category on
# the left-hand side of the formula swaps the axes.
wages_modified %>%
  gf_boxplot("Income" ~ realrinc) %>%
  gf_labs(
    # Labelled 1B so it can be told apart from the vertical Plot 1A.
    title = "Plot 1B: Income has a skewed distribution (horizontal)",
    subtitle = "Many outliers on the high side"
  )

Is income affected by gender?

# Income split by gender, one horizontal box per group.
gf_boxplot(gender ~ realrinc, data = wages_modified) %>%
  gf_labs(title = "Plot 2A: Income by Gender")

# Same comparison on log10(income): the transformation reduces the effect
# of extreme values.
gf_boxplot(gender ~ log10(realrinc), data = wages_modified) %>%
  gf_labs(title = "Plot 2B: Log(Income) by Gender")

# Income by gender with boxes filled by gender. Here the raw values are
# kept and the x-axis scale is log-transformed instead of the variable.
gf_boxplot(gender ~ realrinc, data = wages_modified, fill = ~gender) %>%
  gf_refine(scale_x_log10()) %>%
  gf_labs(title = "Plot 2C: Income filled by Gender, log scale")

realrinc against educcat (degrees)

# Drop rows missing either income or education category. Rebuilding from
# the raw `wages` with only educcat filtered would bring the NA incomes
# back, and the box plots would then discard them with a
# "Removed ... rows containing non-finite" warning.
wages_modified <-
  wages %>%
  tidyr::drop_na(realrinc, educcat)
# Income distribution within each education category (horizontal boxes).
gf_boxplot(educcat ~ realrinc, data = wages_modified) %>%
  gf_labs(title = "Plot 3A: Income by Education Category")
Warning: Removed 23716 rows containing non-finite outside the scale range
(`stat_boxplot()`).

  • Many outliers: some individuals in each category earn significantly more than others
# log10(income) by education category, drawn as vertical boxes. The title
# names the log transform and uses its own label (3B) so it is not confused
# with the raw-income Plot 3A.
wages_modified %>%
  gf_boxplot(log10(realrinc) ~ educcat) %>%
  gf_labs(title = "Plot 3B: Log(Income) by Education Category")
Warning: Removed 23716 rows containing non-finite outside the scale range
(`stat_boxplot()`).