Tidy Tuesday Exercise

Tidy Tuesday Exercise

Hollywood Age Gaps

2/14/2023

Upload public access data for exercise via tidytuesdayR package.

#install.packages("tidytuesdayR")
#install.packages('janitor')

library(tidytuesdayR)
Warning: package 'tidytuesdayR' was built under R version 4.2.2
library(tidyverse)
Warning: package 'tidyverse' was built under R version 4.2.2
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
✔ tibble  3.1.8      ✔ dplyr   1.0.10
✔ tidyr   1.2.1      ✔ stringr 1.5.0 
✔ readr   2.1.3      ✔ forcats 0.5.2 
Warning: package 'ggplot2' was built under R version 4.2.2
Warning: package 'stringr' was built under R version 4.2.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
library(here)
Warning: package 'here' was built under R version 4.2.2
here() starts at C:/Users/Raquel/GitHub/MADA/RaquelFrancisco-MADA-portfolio
library(janitor)
Warning: package 'janitor' was built under R version 4.2.2

Attaching package: 'janitor'

The following objects are masked from 'package:stats':

    chisq.test, fisher.test
tuesdata <- tidytuesdayR::tt_load(2023, week = 7)
--- Compiling #TidyTuesday Information for 2023-02-14 ----
--- There is 1 file available ---
--- Starting Download ---

    Downloading file 1 of 1: `age_gaps.csv`
--- Download complete ---
age_gaps <- tuesdata$age_gaps

Data Dictionary

variable; class; description

movie_name; character; Name of the film;

release_year; integer; Release year

director; character; Director of the film

age_difference; integer; Age difference between the characters in whole years

couple_number; integer; An identifier for the couple in case multiple couples are listed for this film

actor_1_name; character; The name of the older actor in this couple

actor_2_name; character; The name of the younger actor in this couple

character_1_gender; character; The gender of the older character, as identified by the person who submitted the data for this couple

character_2_gender; character; The gender of the younger character, as identified by the person who submitted the data for this couple

actor_1_birthdate; date; The birthdate of the older member of the couple

actor_2_birthdate; date; The birthdate of the younger member of the couple

actor_1_age; integer; The age of the older actor when the film was released

actor_2_age; integer; The age of the younger actor when the film was released

tibble(age_gaps)
# A tibble: 1,155 × 13
   movie_name    relea…¹ direc…² age_d…³ coupl…⁴ actor…⁵ actor…⁶ chara…⁷ chara…⁸
   <chr>           <dbl> <chr>     <dbl>   <dbl> <chr>   <chr>   <chr>   <chr>  
 1 Harold and M…    1971 Hal As…      52       1 Ruth G… Bud Co… woman   man    
 2 Venus            2006 Roger …      50       1 Peter … Jodie … man     woman  
 3 The Quiet Am…    2002 Philli…      49       1 Michae… Do Thi… man     woman  
 4 The Big Lebo…    1998 Joel C…      45       1 David … Tara R… man     woman  
 5 Beginners        2010 Mike M…      43       1 Christ… Goran … man     man    
 6 Poison Ivy       1992 Katt S…      42       1 Tom Sk… Drew B… man     woman  
 7 Whatever Wor…    2009 Woody …      40       1 Larry … Evan R… man     woman  
 8 Entrapment       1999 Jon Am…      39       1 Sean C… Cather… man     woman  
 9 Husbands and…    1992 Woody …      38       1 Woody … Juliet… man     woman  
10 Magnolia         1999 Paul T…      38       1 Jason … Julian… man     woman  
# … with 1,145 more rows, 4 more variables: actor_1_birthdate <date>,
#   actor_2_birthdate <date>, actor_1_age <dbl>, actor_2_age <dbl>, and
#   abbreviated variable names ¹​release_year, ²​director, ³​age_difference,
#   ⁴​couple_number, ⁵​actor_1_name, ⁶​actor_2_name, ⁷​character_1_gender,
#   ⁸​character_2_gender
glimpse(age_gaps)
Rows: 1,155
Columns: 13
$ movie_name         <chr> "Harold and Maude", "Venus", "The Quiet American", …
$ release_year       <dbl> 1971, 2006, 2002, 1998, 2010, 1992, 2009, 1999, 199…
$ director           <chr> "Hal Ashby", "Roger Michell", "Phillip Noyce", "Joe…
$ age_difference     <dbl> 52, 50, 49, 45, 43, 42, 40, 39, 38, 38, 36, 36, 35,…
$ couple_number      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ actor_1_name       <chr> "Ruth Gordon", "Peter O'Toole", "Michael Caine", "D…
$ actor_2_name       <chr> "Bud Cort", "Jodie Whittaker", "Do Thi Hai Yen", "T…
$ character_1_gender <chr> "woman", "man", "man", "man", "man", "man", "man", …
$ character_2_gender <chr> "man", "woman", "woman", "woman", "man", "woman", "…
$ actor_1_birthdate  <date> 1896-10-30, 1932-08-02, 1933-03-14, 1930-09-17, 19…
$ actor_2_birthdate  <date> 1948-03-29, 1982-06-03, 1982-10-01, 1975-11-08, 19…
$ actor_1_age        <dbl> 75, 74, 69, 68, 81, 59, 62, 69, 57, 77, 59, 56, 65,…
$ actor_2_age        <dbl> 23, 24, 20, 23, 38, 17, 22, 30, 19, 39, 23, 20, 30,…

Cleaning data

adata <- age_gaps %>%
  ##Pick out data relevant to me
  ## filter out main couples (i.e. couple 1)
  filter(couple_number == 1) %>%
  ## select variables of interest
  select( release_year  | age_difference | movie_name | character_1_gender |  actor_1_age | actor_1_name | character_2_gender | actor_2_age | actor_2_name )

##quick visualization of data to look for trends
plot(adata)

#quick view of data with scatterplots show that actor 1 age and age difference seem related
#but i think this data would be easier to read if we changed actor gender to just male vs female
#we'll flip this to female lead roles and male lead roles

leadFdata <- adata %>%
  filter(character_1_gender == 'woman') %>%
  rename("female_leads" = "character_1_gender") %>%
  rename("lead_female_age" = "actor_1_age") %>%
  rename("lead_name" = "actor_1_name") %>%
  rename("supporting_name" = "actor_2_name") %>%
  rename("supporting_actor_age" = "actor_2_age") %>%
  rename("supporting_gender" = "character_2_gender")

plot(leadFdata)

#looks promising to pursue: age difference and lead female age, lead age and supporting actor age, also personally interested in supporting actress age and release year

leadMdata <- adata %>%
  filter(character_1_gender == 'man') %>%
  rename("male_leads" = "character_1_gender") %>%
  rename("lead_male_age" = "actor_1_age")  %>%
  rename("lead_name" = "actor_1_name") %>%
  rename("supporting_name" = "actor_2_name") %>%
  rename("supporting_actor_age" = "actor_2_age") %>%
  rename("supporting_gender" = "character_2_gender")

plot(leadMdata)

#firstly wow! way more data. Second same trends seem to apply. Would also like to compare the gender of the supporting actor and a same-sex supporting role.
#onto visualization!

Data Vizualization

Actor and Release Data

(because inquiring minds want to know…)

library(ggthemes)
Warning: package 'ggthemes' was built under R version 4.2.2
ARplot <- ggplot() +
  geom_point(data = leadMdata, aes(x = release_year, y = lead_male_age), color = 'dodgerblue4', size=1.5, shape = 15) +
  geom_point(data = leadFdata, aes(x = release_year, y = lead_female_age), color = 'deeppink4', size=1.5, shape = 15) +
    geom_point(data = leadMdata, aes(x = release_year, y = supporting_actor_age), color = 'deepskyblue2', size=1.5, shape = 18) +
  geom_point(data = leadFdata, aes(x = release_year, y = supporting_actor_age), color = 'deeppink1', size=1.5, shape = 18) +
  ggtitle("Actor Age in Relation to the Movie Release Data", subtitle = "Evaluated by Genders") +
labs(x = "Release Year", y = "Actor Ages") +
  annotate(geom="text", x=1950, y=75, label="Lead Male Actors", colour="dodgerblue4", size=4, family="sans", fontface="bold", angle=0) +
  annotate(geom="text", x=1950, y=80, label="Lead Female Actors", colour="deeppink4", size=4, family="sans", fontface="bold", angle=0) +
  annotate(geom="text", x=1950, y=65, label="Actors that Support Male Leads", colour="deepskyblue2", size=3, family="sans", fontface="bold", angle=0) +
  annotate(geom="text", x=1950, y=70, label="Actors that Support Female Leads", colour="deeppink1", size=3, family="sans", fontface="bold", angle=0)

ARplot

Lets look a little closer at lead actors and age gaps

ARAplot <- ggplot() +
  geom_point(data = leadMdata, aes(x = release_year, y = lead_male_age, size = age_difference), fill = 'dodgerblue4', shape = 21, colour = "black") +
  geom_point(data = leadFdata, aes(x = release_year, y = lead_female_age, size = age_difference), fill = 'deeppink4', shape = 21, colour = "black") +

  ggtitle("Lead Actor Age in Relation to the Movie Release Data", subtitle = "Factoring in Age Gaps between Lead and Supporting Actors") +
  labs(x = "Release Year", y = "Actor Ages", color= "Age Gap") +
  annotate(geom="text", x=1950, y=75, label="Lead Male Actors", colour="dodgerblue4", size=4, family="sans", fontface="bold", angle=0) +
  annotate(geom="text", x=1950, y=80, label="Lead Female Actors", colour="deeppink4", size=4, family="sans", fontface="bold", angle=0) 
  


ARAplot

Relationship between Movie release data, Age difference, and supporting Actor Age

###Supporting actor age

ASAplot <- ggplot() +
    geom_point(data = leadMdata, aes(x = release_year, y = supporting_actor_age, size = age_difference), fill = 'deepskyblue2', shape = 21, colour = "black") +
  geom_point(data = leadFdata, aes(x = release_year, y = supporting_actor_age, size = age_difference), fill = 'deeppink1', shape = 21, colour = "black") +
  ggtitle("Supporting Actor Age in Relation to the Movie Release Data", subtitle = "Factoring in Age Gaps between Lead and Supporting Actors") +
labs(x = "Release Year", y = "Actor Ages") +
   annotate(geom="text", x=1965, y=60, label="Actors that Support Male Leads", colour="deepskyblue2", size=4, family="sans", fontface="bold", angle=0) +
  annotate(geom="text", x=1965, y=65, label="Actors that Support Female Leads", colour="deeppink1", size=4, family="sans", fontface="bold", angle=0)

ASAplot

Lead Age vs Supporting actor age

Opposite vs Same-sex