R project 2

IMDB ratings: Differences between directors

Recall the IMDB ratings data. We will explore whether the mean IMDB ratings for Steven Spielberg and Tim Burton are the same or not. First we create a graph of the two 95% confidence intervals, and then we run a hypothesis test in two ways: with `t.test()` and with the infer package. The null hypothesis is that there is no difference in mean rating between the two directors, while the alternative hypothesis is that there is a difference.

We can load the data and examine its structure:

# Build the path to data/movies.csv with here::here() and read it into `movies`
movies <- here::here("data", "movies.csv") %>%
  read_csv()

# Print a compact, one-line-per-column overview of the data
movies %>%
  glimpse()

## Rows: 2,961
## Columns: 11
## $ title               <chr> "Avatar", "Titanic", "Jurassic World", "The Avenge…
## $ genre               <chr> "Action", "Drama", "Action", "Action", "Action", "…
## $ director            <chr> "James Cameron", "James Cameron", "Colin Trevorrow…
## $ year                <dbl> 2009, 1997, 2015, 2012, 2008, 1999, 1977, 2015, 20…
## $ duration            <dbl> 178, 194, 124, 173, 152, 136, 125, 141, 164, 93, 1…
## $ gross               <dbl> 7.61e+08, 6.59e+08, 6.52e+08, 6.23e+08, 5.33e+08, …
## $ budget              <dbl> 2.37e+08, 2.00e+08, 1.50e+08, 2.20e+08, 1.85e+08, …
## $ cast_facebook_likes <dbl> 4834, 45223, 8458, 87697, 57802, 37723, 13485, 920…
## $ votes               <dbl> 886204, 793059, 418214, 995415, 1676169, 534658, 9…
## $ reviews             <dbl> 3777, 2843, 1934, 2425, 5312, 3917, 1752, 1752, 35…
## $ rating              <dbl> 7.9, 7.7, 7.0, 8.1, 9.0, 6.5, 8.7, 7.5, 8.5, 7.2, …

# Summary statistics and a formula-based 95% confidence interval for the mean
# rating of each of the two directors, sorted from highest to lowest mean.
ratings_per_director_formula_ci <- movies %>%
  filter(director %in% c("Steven Spielberg", "Tim Burton")) %>%
  group_by(director) %>%
  summarise(
    mean_rating = mean(rating),
    median_rating = median(rating),
    sd_rating = sd(rating),
    count = n(),
    # 97.5th percentile of the t distribution with (n - 1) degrees of freedom,
    # i.e. the critical value for a two-sided 95% interval
    t_critical = qt(p = 0.975, df = count - 1),
    # standard error of the mean
    se_rating = sd_rating / sqrt(count),
    margin_of_error = t_critical * se_rating,
    # lower and upper bounds of the confidence interval
    rating_low = mean_rating - margin_of_error,
    rating_high = mean_rating + margin_of_error
  ) %>%
  arrange(desc(mean_rating))

# Print the summary table
ratings_per_director_formula_ci

## # A tibble: 2 × 10
##   director         mean_rating median_rating sd_rating count t_critical se_rating
##   <chr>                  <dbl>         <dbl>     <dbl> <int>      <dbl>     <dbl>
## 1 Steven Spielberg        7.57           7.6     0.695    23       2.07     0.145
## 2 Tim Burton              6.93           7       0.749    16       2.13     0.187
## # … with 3 more variables: margin_of_error <dbl>, rating_low <dbl>,
## #   rating_high <dbl>

# Plot each director's mean rating together with its 95% confidence interval.
# The shaded band and every numeric label are derived from
# `ratings_per_director_formula_ci`, so they stay in sync with the data.

# Region common to all confidence intervals: from the largest lower bound to
# the smallest upper bound.
ci_overlap_low <- max(ratings_per_director_formula_ci$rating_low)
ci_overlap_high <- min(ratings_per_director_formula_ci$rating_high)

# Shade the band only when the intervals really overlap. annotate() draws the
# rectangle once, so `alpha` is the actual opacity; adding NULL to a ggplot
# leaves the plot unchanged.
ci_overlap_band <- NULL
if (ci_overlap_low < ci_overlap_high) {
  ci_overlap_band <- annotate(
    geom = "rect",
    xmin = ci_overlap_low, xmax = ci_overlap_high,
    ymin = -Inf, ymax = Inf,
    fill = "gainsboro", alpha = 0.4
  )
}

ggplot(
  data = ratings_per_director_formula_ci,
  aes(x = mean_rating, y = reorder(director, mean_rating))
) +

  # overlap band goes first so it sits behind the points and error bars
  ci_overlap_band +

  # point estimate of the mean rating
  geom_point(aes(colour = director), size = 5, show.legend = FALSE) +

  # horizontal error bar spanning the 95% confidence interval
  geom_errorbar(
    aes(xmin = rating_low, xmax = rating_high, colour = director),
    width = 0.1, size = 2, show.legend = FALSE
  ) +

  scale_color_manual(values = c("coral", "cyan3")) +

  # interval end points, printed slightly above each error bar
  geom_text(
    aes(x = rating_low, label = sprintf("%.2f", rating_low)),
    nudge_y = 0.1, colour = "black", size = 6
  ) +
  geom_text(
    aes(x = rating_high, label = sprintf("%.2f", rating_high)),
    nudge_y = 0.1, colour = "black", size = 6
  ) +

  # mean rating, printed larger than the end points
  geom_text(
    aes(label = sprintf("%.2f", mean_rating)),
    nudge_y = 0.1, colour = "black", size = 8
  ) +

  theme_minimal() +

  theme(panel.border = element_rect(color = "black",
                                    fill = NA,
                                    size = 1)) +

  labs(
    title = "Do Spielberg and Burton have the same mean IMDB ratings",
    subtitle = "95% confidence interval overlap",
    x = "Mean IMDB rating",
    y = "Director"
  )

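Now we conduct the hypothesis tests mentioned above: first a two-sample t-test with `t.test()`, and then a simulation-based permutation test with the infer package.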

# Keep only the films made by the two directors being compared
movies_subset <- filter(
  movies,
  director %in% c("Steven Spielberg", "Tim Burton")
)

# Two-sample t-test of rating by director, using t.test()'s default settings
t.test(rating ~ director, data = movies_subset)

## 
##  Welch Two Sample t-test
## 
## data:  rating by director
## t = 3, df = 31, p-value = 0.01
## alternative hypothesis: true difference in means between group Steven Spielberg and group Tim Burton is not equal to 0
## 95 percent confidence interval:
##  0.16 1.13
## sample estimates:
## mean in group Steven Spielberg       mean in group Tim Burton 
##                           7.57                           6.93

# Fix the random seed so the permutation results below are reproducible
set.seed(1)

# Permutation test with the infer package ----

# Observed difference in mean rating (Steven Spielberg minus Tim Burton)
obs_diff_ratings <- movies_subset %>%
  specify(rating ~ director) %>%
  calculate(stat = "diff in means", order = c("Steven Spielberg", "Tim Burton"))

# Null distribution of the difference in means
null_dist <- movies_subset %>%
  # response and explanatory variables
  specify(rating ~ director) %>%

  # null hypothesis: rating is independent of director, i.e. no difference
  hypothesize(null = "independence") %>%

  # 1000 replicates generated by permutation
  generate(reps = 1000, type = "permute") %>%

  # same statistic and group order as for the observed difference
  calculate(stat = "diff in means", order = c("Steven Spielberg", "Tim Burton"))

# Plot the null distribution and shade the two-sided p-value region
null_dist %>% visualize() +
  shade_p_value(obs_stat = obs_diff_ratings, direction = "two-sided")

# Two-sided p-value; `direction` is spelled the same way as in shade_p_value()
null_dist %>%
  get_p_value(obs_stat = obs_diff_ratings, direction = "two-sided")

## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1   0.012