Visualization Exercise

Original Source:

https://fivethirtyeight.com/features/marriage-isnt-dead-yet/

Graph I am trying to replicate:

Importing Data

Upload raw data to R and install/load packages required to clean data.

#install.packages('dslabs')
#install.packages('tidyverse')
#install.packages('here')
#install.packages('rjson')
#install.packages('plotly')
#install.packages('ggthemes')
library(tidyverse)
Warning: package 'tidyverse' was built under R version 4.2.2
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
✔ tibble  3.1.8      ✔ dplyr   1.0.10
✔ tidyr   1.2.1      ✔ stringr 1.5.0 
✔ readr   2.1.3      ✔ forcats 0.5.2 
Warning: package 'ggplot2' was built under R version 4.2.2
Warning: package 'stringr' was built under R version 4.2.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
library(dslabs)
Warning: package 'dslabs' was built under R version 4.2.2
library(here)
Warning: package 'here' was built under R version 4.2.2
here() starts at C:/Users/Raquel/GitHub/MADA/RaquelFrancisco-MADA-portfolio
library(plotly)
Warning: package 'plotly' was built under R version 4.2.2

Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout
library(ggthemes)
Warning: package 'ggthemes' was built under R version 4.2.2
# Load the divorce-rate table; here() builds the path from the project root,
# so the import works regardless of the current working directory.
# (Import of the both-sexes table is kept disabled.)
#raw_bothsexes <- read_csv(here('Visualization_Exercise/raw_data/both_sexes.csv'))
raw_divorce <- "Visualization_Exercise/raw_data/divorce.csv" %>%
  here() %>%
  read_csv()
New names:
Rows: 17 Columns: 21
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," dbl
(20): ...1, year, all_3544, HS_3544, SC_3544, BAp_3544, BAo_3544, GD_35... date
(1): date
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...1`
#raw_men <- read_csv(here('Visualization_Exercise/raw_data/men.csv'))
#raw_women <- read_csv(here('Visualization_Exercise/raw_data/women.csv'))

# Preview the imported table (column names, types, and the first rows).
as_tibble(raw_divorce)
# A tibble: 17 × 21
    ...1  year date       all_3544 HS_3544 SC_3544 BAp_3544 BAo_3544 GD_3544
   <dbl> <dbl> <date>        <dbl>   <dbl>   <dbl>    <dbl>    <dbl>   <dbl>
 1     1  1960 1960-01-01   0.0344  0.0349  0.0337   0.0275   0.0275 NA     
 2     2  1970 1970-01-01   0.0493  0.0500  0.0487   0.0413   0.0413 NA     
 3     3  1980 1980-01-01   0.106   0.104   0.113    0.0978   0.0978 NA     
 4     4  1990 1990-01-01   0.151   0.159   0.170    0.115    0.119   0.109 
 5     5  2000 2000-01-01   0.157   0.175   0.174    0.106    0.111   0.0959
 6     6  2001 2001-01-01   0.157   0.174   0.178    0.107    0.112   0.0972
 7     7  2002 2002-01-01   0.157   0.175   0.179    0.103    0.110   0.0908
 8     8  2003 2003-01-01   0.154   0.173   0.177    0.103    0.111   0.0864
 9     9  2004 2004-01-01   0.155   0.178   0.177    0.100    0.106   0.0891
10    10  2005 2005-01-01   0.153   0.175   0.177    0.0995   0.107   0.0850
11    11  2006 2006-01-01   0.162   0.189   0.184    0.104    0.111   0.0905
12    12  2007 2007-01-01   0.160   0.187   0.185    0.104    0.112   0.0891
13    13  2008 2008-01-01   0.161   0.188   0.189    0.102    0.111   0.0852
14    14  2009 2009-01-01   0.160   0.187   0.190    0.102    0.112   0.0844
15    15  2010 2010-01-01   0.164   0.190   0.197    0.103    0.112   0.0882
16    16  2011 2011-01-01   0.166   0.192   0.200    0.108    0.117   0.0919
17    17  2012 2012-01-01   0.165   0.190   0.203    0.107    0.115   0.0943
# … with 12 more variables: poor_3544 <dbl>, mid_3544 <dbl>, rich_3544 <dbl>,
#   all_4554 <dbl>, HS_4554 <dbl>, SC_4554 <dbl>, BAp_4554 <dbl>,
#   BAo_4554 <dbl>, GD_4554 <dbl>, poor_4554 <dbl>, mid_4554 <dbl>,
#   rich_4554 <dbl>

Cleaning the data to build the “Divorce Rates by Education” graph

Variables will be:

High school or less

HS | High school graduate or less (EDUCD < 65)

Some college

SC | Some college (EDUCD >= 65 & <= 100)

College graduate

BAp | Bachelor’s degree or more (EDUCD > 100)

BAo | Bachelor’s degree, no graduate degree (EDUCD > 100 & <= 113)

GD | Graduate degree (EDUCD > 113)

The goal is a plot with “Year” on the x-axis and the divorce rate (%) for each education level on the y-axis, restricted to ages 35 to 44.

# Keep the survey year and the ages-35-44 divorce rates for each education level.
clean_Div <- raw_divorce %>%
  select(year, HS_3544, SC_3544, BAp_3544, BAo_3544, GD_3544)

# Collapse the three college-graduate measures (bachelor's or more, bachelor's
# only, graduate degree) into a single "Graduate" series by averaging them
# within each year.
# The columns are chosen by name rather than by position: the positional index
# c(3:5) used previously picked up SC_3544 (some college) and left out GD_3544.
# na.rm = TRUE keeps the early years usable, where GD_3544 is NA.
# I made the executive decision to average these columns because the result is
# the most reminiscent of the original graph.
clean_Div$Graduate <- rowMeans(
  clean_Div[, c("BAp_3544", "BAo_3544", "GD_3544")],
  na.rm = TRUE
)

# Give the plotted columns readable names.
clean_Div <- clean_Div %>%
  rename(Year = year, Highschool = HS_3544, SomeCollege = SC_3544)

# Final plotting table: one row per year, one column per education level.
# (The object name is kept as-is because the plotting code refers to it.)
Div_3345 <- clean_Div %>%
  select(Year, Highschool, SomeCollege, Graduate)

Start Plotting

# Build the "Divorce Rates By Education" line chart in the FiveThirtyEight style.
# Each education level is drawn as its own line with a fixed colour, and a text
# label in the matching colour is placed near the line instead of a legend.
main <- ggplot(data = Div_3345) +
  # Line thickness is set with `linewidth`; using `size` for lines is
  # deprecated as of ggplot2 3.4.0.
  geom_line(aes(x = Year, y = Highschool), color = "lightblue3", linewidth = 1.5) +
  geom_line(aes(x = Year, y = Graduate), color = "paleturquoise", linewidth = 1.5) +
  geom_line(aes(x = Year, y = SomeCollege), color = "#336699", linewidth = 1.5) +
  # Blank axis titles, as in the original figure.
  xlab("") +
  ylab("") +
  ggtitle("Divorce Rates By Education", subtitle = "Ages 35 to 44") +
  scale_color_fivethirtyeight() +
  theme_fivethirtyeight(base_size = 18, base_family = "sans") +
  # Show the rates (stored as proportions) as whole-number percentages.
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  # Floating labels, positioned by hand in data coordinates.
  annotate(geom = "text", x = 1995, y = .2, label = "Some College", colour = "#336699",
           size = 6, family = "sans", fontface = "bold", angle = 0) +
  annotate(geom = "text", x = 2000, y = .15, label = "High school or less", colour = "lightblue3",
           size = 6, family = "sans", fontface = "bold", angle = 0) +
  annotate(geom = "text", x = 2005, y = .12, label = "College graduates", colour = "paleturquoise",
           size = 6, family = "sans", fontface = "bold", angle = 0)
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
#My biggest issue with this graph is that I cannot figure out how they added the black labels for each education level, with lines pointing to each trendline. Is it possible that they added them afterwards in an image editor? To compensate, I made the labels "float" near the lines and matched their colors to the lines.

# Render the finished static plot.
print(main)

##ggthemes had a theme called 'fivethirtyeight' that was very close to what is seen in the original plot. It was found here: https://yutannihilation.github.io/allYourFigureAreBelongToUs/ggthemes/theme_fivethirtyeight/

Explore Plotly

# Convert the static ggplot into an interactive plotly widget.
# The plot object and the tooltip setting are separate arguments; the previous
# call fused them into a single unknown argument (`maintooltip`), so it only
# worked because ggplotly() falls back to the last plot that was drawn.
# No `text` aesthetic is mapped in `main`, so the hover shows the mapped x and
# y values instead.
ggplotly(main, tooltip = c("x", "y"))
Warning: plotly.js does not (yet) support horizontal legend items 
You can track progress here: 
https://github.com/plotly/plotly.js/issues/53 
# Build the full plotly object so its traces and layout can be inspected.
two <- main %>% plotly_build()
Warning: plotly.js does not (yet) support horizontal legend items 
You can track progress here: 
https://github.com/plotly/plotly.js/issues/53 
# List the fields available on the first trace of the built plotly object.
names(two[["x"]][["data"]][[1]])
 [1] "x"          "y"          "text"       "type"       "mode"      
 [6] "line"       "hoveron"    "showlegend" "xaxis"      "yaxis"     
[11] "hoverinfo"  "frame"     
# List the layout fields of the built plotly object.
names(two[["x"]][["layout"]])
 [1] "margin"        "plot_bgcolor"  "paper_bgcolor" "font"         
 [5] "title"         "xaxis"         "yaxis"         "shapes"       
 [9] "showlegend"    "legend"        "hovermode"     "barmode"      
#I am having trouble editing the layout of plotly. The github and help pages have not been very helpful.