options(scipen = 999)
library(tidyverse)
library(rvest)
library(tidyr)
library(janitor)
#library(data.table)
James Bond Movies
I enjoy James Bond movies. Got started by watching some of the earlier movies with my parents. I still watch each as it comes out. So let’s see how popular the movies are.
The Economist newspaper did some data analysis when Skyfall came out, stacking up the different Bond actors on killing, drinking martinis, and love conquests (“Booze, bonks and bodies”). They updated the data for the UK release of Spectre, with Daniel Craig jumping in ranking a lot, mainly from his many kills. The Economist also recently did a comparison of box office opening weekends of the Bond films.
Box Office success is one objective measure of film quality; if the movie is bad, people don’t go to the cinemas to watch it, and after all, the market is always right, right?
Others might argue that a critical review scale can assess the quality of a film. This is always debatable: who decides what and how things are grouped into a quality scale?
Data
I pulled data on Bond movies from wikipedia that lists all the Bond films with their respective budget, box office returns, and several critic scales.
Get Data
<- "https://en.wikipedia.org/wiki/List_of_James_Bond_films"
bond.url
## read the page into R
<- read_html(bond.url)
bond.wiki
## film data
<- bond.wiki %>%
bond.films html_nodes("table") %>%
# first table in the page
1]] %>%
.[[# fill, because wiki uses multi-cell formatting :(
html_table(fill = TRUE)
glimpse(bond.films)
Rows: 27
Columns: 9
$ Title <chr> "Title", "Dr. No", "From Russia with Love"…
$ Year <chr> "Year", "1962", "1963", "1964", "1965", "1…
$ `Bond actor` <chr> "Bond actor", "Sean Connery", "Sean Conner…
$ Director <chr> "Director", "Terence Young", "Terence Youn…
$ `Box office (millions)[13]` <chr> "Actual $", "59.5", "78.9", "124.9", "141.…
$ `Box office (millions)[13]` <chr> "Adjusted $2005", "448.8", "543.8", "820.4…
$ `Budget (millions)[13]` <chr> "Actual $", "1.1", "2.0", "3.0", "6.8", "1…
$ `Budget (millions)[13]` <chr> "Adjusted $2005", "7.0", "12.6", "18.6", "…
$ `Ref(s)` <chr> "Ref(s)", "[13][14]", "[13][14][15]", "[13…
Clean the data.
# lots of cleaning to work to be done now...
<- bond.films %>%
bondfilms_clean clean_names(case = "upper_camel") %>%
# remove first and last line (inner-table headers)
head(-1) %>% tail(-1) %>%
rename(Actual_Box_Office = BoxOfficeMillions13, Adjusted_Box_Office = BoxOfficeMillions13_2,
Actual_Budget = BudgetMillions13, Adjusted_Budget = BudgetMillions13_2) %>%
select(Title:Adjusted_Budget)
<- bondfilms_clean %>% mutate(Actual_Budget = str_replace_all(Actual_Budget, c("[^Z0-9]" = "")),
bondfilms_clean Adjusted_Budget = str_remove_all(Adjusted_Budget, c("[^Z0-9]" = "")))
<- bondfilms_clean %>% mutate(Actual_Budget = replace(Actual_Budget, Actual_Budget == "150200", "175"),
bondfilms_clean Actual_Budget = replace(Actual_Budget, Actual_Budget == "245250", "248"),
Actual_Budget = replace(Actual_Budget, Actual_Budget == "250301", "276"),
Adjusted_Budget = replace(Adjusted_Budget, Adjusted_Budget == "128170", "149"),
Adjusted_Budget = replace(Adjusted_Budget, Adjusted_Budget == "202206", "204"),
Adjusted_Budget = replace(Adjusted_Budget, Adjusted_Budget == "189226", "208"),)
options(digits = 3)
<- bondfilms_clean %>% mutate(Actual_Box_Office = as.numeric(Actual_Box_Office),
bondfilms_clean Actual_Budget = as.numeric(Actual_Budget),
Adjusted_Box_Office = as.numeric(Adjusted_Box_Office),
Adjusted_Budget = as.numeric(Adjusted_Budget),
Year = as.numeric(Year),
ROI = (Adjusted_Box_Office/Adjusted_Budget))
glimpse(bondfilms_clean)
Rows: 25
Columns: 9
$ Title <chr> "Dr. No", "From Russia with Love", "Goldfinger", "…
$ Year <dbl> 1962, 1963, 1964, 1965, 1967, 1969, 1971, 1973, 19…
$ BondActor <chr> "Sean Connery", "Sean Connery", "Sean Connery", "S…
$ Director <chr> "Terence Young", "Terence Young", "Guy Hamilton", …
$ Actual_Box_Office <dbl> 59.5, 78.9, 124.9, 141.2, 111.6, 64.6, 116.0, 126.…
$ Adjusted_Box_Office <dbl> 449, 544, 820, 848, 514, 292, 442, 460, 334, 533, …
$ Actual_Budget <dbl> 11, 20, 30, 68, 103, 70, 72, 70, 70, 140, 340, 280…
$ Adjusted_Budget <dbl> 70, 126, 186, 419, 599, 373, 347, 308, 277, 451, 9…
$ ROI <dbl> 6.411, 4.316, 4.411, 2.024, 0.858, 0.782, 1.275, 1…
Then pull and clean the second table which has information on several awards and critic ratings.
## ratings on films
<- bond.wiki %>%
bond_other html_nodes("table") %>%
# second table in the page
2]] %>%
.[[# fill, because of multi-cell formatting :(
html_table(fill = TRUE)
<- bond_other %>% clean_names(case = "upper_camel") %>%
bond_other head(-1) %>% tail(-1) %>%
# mutate(
# # rename for later merging
# Title = f.clean.titles(Film),
# Actor = f.clean.double.names(Actor)
# ) %>%
# reorder to original order
# select(Title, Year:Awards) %>%
rename(Actual_Box_Office = BoxOfficeMillions13, Adjusted_Box_Office = BoxOfficeMillions13_2,
Actual_Budget = BudgetMillions13, Adjusted_Budget = BudgetMillions13_2,
Director = DirectorS) %>% select(Title:Adjusted_Budget)
<- bond_other %>% mutate(Actual_Box_Office = as.numeric(Actual_Box_Office),
bond_other Actual_Budget = as.numeric(Actual_Budget),
Adjusted_Box_Office = as.numeric(Adjusted_Box_Office),
Adjusted_Budget = as.numeric(Adjusted_Budget),
Year = as.numeric(Year),
ROI = (Adjusted_Box_Office/Adjusted_Budget))
glimpse(bond_other)
Rows: 2
Columns: 9
$ Title <chr> "Casino Royale", "Never Say Never Again"
$ Year <dbl> 1967, 1983
$ BondActor <chr> "David Niven", "Sean Connery"
$ Director <chr> "Ken HughesJohn HustonJoseph McGrathRobert Parrish…
$ Actual_Box_Office <dbl> 44.4, 160.0
$ Adjusted_Box_Office <dbl> 260, 314
$ Actual_Budget <dbl> 12, 36
$ Adjusted_Budget <dbl> 70, 71
$ ROI <dbl> 3.71, 4.42
Collect crititc ratings using Rotten Tomatoes as the source.
<- bond.wiki %>%
rottenTomatoes html_nodes("table") %>%
# second table in the page
3]] %>%
.[[# fill, because of multi-cell formatting :(
html_table(fill = TRUE)
<- rottenTomatoes %>% select(1, 2) %>%
rottenTomatoes filter(!row_number() %in% (c(1,2, 3)) & Film != "Non-Eon films")
# split the RT into rating and number of reviewers
<- rottenTomatoes %>% separate(Critical, into = c("rating", "reviews"), sep="%")
rottenTomatoes
<- rottenTomatoes %>% mutate(reviews = sub("^ \\((\\d*) reviews\\).*", "\\1", reviews))
rottenTomatoes
<- rottenTomatoes %>% rename(Title = Film)
rottenTomatoes
<- rottenTomatoes %>% mutate(rating = as.numeric(rating),
rottenTomatoes reviews = as.numeric(reviews))
glimpse(rottenTomatoes)
Rows: 27
Columns: 3
$ Title <chr> "Dr. No", "From Russia with Love", "Goldfinger", "Thunderball"…
$ rating <dbl> 95, 95, 99, 87, 73, 81, 64, 65, 39, 80, 60, 72, 43, 38, 74, 78…
$ reviews <dbl> 60, 62, 69, 52, 52, 54, 50, 52, 51, 56, 53, 53, 49, 61, 57, 59…
Merge all the data together.
## final data set
# data <- bondfilms_clean %>% merge(bond.ratings) %>% select(-BondActor) %>% arrange(Year)
<- bind_rows(bondfilms_clean, bond_other)
data
<- inner_join(data, rottenTomatoes, by = "Title")
data
glimpse(data)
Rows: 25
Columns: 11
$ Title <chr> "Dr. No", "From Russia with Love", "Goldfinger", "…
$ Year <dbl> 1962, 1963, 1964, 1965, 1967, 1969, 1971, 1973, 19…
$ BondActor <chr> "Sean Connery", "Sean Connery", "Sean Connery", "S…
$ Director <chr> "Terence Young", "Terence Young", "Guy Hamilton", …
$ Actual_Box_Office <dbl> 59.5, 78.9, 124.9, 141.2, 111.6, 64.6, 116.0, 126.…
$ Adjusted_Box_Office <dbl> 449, 544, 820, 848, 514, 292, 442, 460, 334, 533, …
$ Actual_Budget <dbl> 11, 20, 30, 68, 103, 70, 72, 70, 70, 140, 340, 280…
$ Adjusted_Budget <dbl> 70, 126, 186, 419, 599, 373, 347, 308, 277, 451, 9…
$ ROI <dbl> 6.411, 4.316, 4.411, 2.024, 0.858, 0.782, 1.275, 1…
$ rating <dbl> 95, 95, 99, 87, 73, 81, 64, 65, 39, 80, 60, 72, 43…
$ reviews <dbl> 60, 62, 69, 52, 52, 54, 50, 52, 51, 56, 53, 53, 49…
Visualization
<- data %>% group_by(BondActor) %>% summarise(yearmin = min(Year), yearmax = max(Year)) %>%
bondactor_grp ungroup() %>% arrange(yearmin) %>% mutate(yraemax = ifelse(yearmin==yearmax, yearmax +1, yearmax))
## plot with actor years of service
ggplot() +
# place geom_rect first, then other geoms will "write over" the rectangles
geom_rect(bondactor_grp, mapping = aes(xmin = yearmin, xmax = yearmax, ymin = -Inf, ymax = Inf, fill = BondActor), alpha = 0.3) +
# write actor names on rectangles
# # place text rather at the top of the y-axis
geom_text(bondactor_grp, mapping = aes(x = yearmin, y = max(data$ROI, na.rm = TRUE), label = BondActor,
angle = 90, hjust = 1, vjust = 1), alpha = 0.6, size = 4) +
# film names
geom_text(data, mapping = aes(x = Year, y = 0, label = Title, angle = 90, hjust = 0, vjust = 0.5), size=3) +
# geom_point(data, mapping = aes(x = Year, y = Adjusted_Box_Office, size = Adjusted_Budget, color = rating)) +
geom_point(data, mapping = aes(x = Year, y = ROI, size = ROI, color = ROI), alpha=.3) +
# Rotten Tomatoes rating gradient
scale_color_continuous(low="tomato", high="Turquoise", name = "Rating") +
# increase minimum point size for readability
scale_size_continuous(name = "ROI", range = c(3, 10)) +
theme_bw() +
theme(plot.title = element_text(lineheight=.8, face="bold"),
legend.key.height = unit(.3, "in"),
legend.title = element_text(size = 8), legend.text = element_text(size = 6)) +
# remove actor names from legend
guides(fill=FALSE, size = "none") +
labs(title = "ROI of James Bond Movies\n",
x="", y="ROI")
Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
of ggplot2 3.3.4.
Results
The honor of the worst-rated Bond film goes to A View to a Kill (Roger Moore), and the film with the worst box office results is License to Kill (Timothy Dalton).
Roger Moore presided over a continuous decline in popularity in the 1980s, and Timothy Dalton could not stop that trend. This lead to a long pause in Bond films until the franchise was resurrected in 1995 with Pierce Brosnan. Also interesting is the fact that with Brosnan, Bond film budgets noticeably increased in size. Only Moonraker comes close to being as expensive as the later Bonds, and that was set in space.
The early Connery Bond films were the most profitable – easily topping the current Craig films, by up to a factor of 22 (Dr. No yielded 64 times its costs, versus Quantum of Solace which made 2.8).