Linear regression with R

This is an R Markdown analysis log, compiled at 2014-02-11 13:41:05.

First we need to load the packages I'll be using.

require(RCurl)  # this is due to an issue with https, R and github
## Loading required package: RCurl
## Loading required package: bitops
require(MASS)  # for robust regression
## Loading required package: MASS
require(car)  # AVP plots
## Loading required package: car
require(reshape)  # During the last plot I reshape some data
## Loading required package: reshape
## Loading required package: plyr
##
## Attaching package: 'reshape'
##
## The following objects are masked from 'package:plyr':
##
##     rename, round_any
require(ggplot2)  # last plot is a ggplot
## Loading required package: ggplot2
options(scipen = 99)  # no scientific notation
location_data <- "location of data"
location_output <- "output folder"

Read in data

I pulled this data from a variety of sources. As this isn't a serious analysis, I won't bother citing.

So that others won't have to crawl through different websites, I've saved the data into a gist. So the first thing I need to do is load it up.

data <- getURL("https://gist.github.com/epijim/8819934/raw/6c76df80eb095065a9ce0fa4b8f94410ad528fed/college_data.csv")
data <- read.csv(text = data, stringsAsFactors = F)

Check data

Look at the variable types

str(data)
## 'data.frame':    31 obs. of  37 variables:
##  $ College               : chr  "Christ's" "Churchill" "Clare" "Clare Hall" ...
##  $ Grad_coll             : logi  FALSE FALSE FALSE TRUE FALSE TRUE ...
##  $ Wine_budget           : int  71055 87685 79989 17400 79254 17510 77798 131127 23028 30051 ...
##  $ Fellows               : int  84 165 120 67 54 68 65 87 56 118 ...
##  $ Students              : int  617 770 816 181 483 633 696 650 780 722 ...
##  $ Grads                 : int  208 290 312 181 223 633 271 220 288 220 ...
##  $ PhD                   : int  127 190 190 96 145 421 151 121 151 102 ...
##  $ Other                 : int  81 100 122 85 78 212 120 99 137 118 ...
##  $ Rating                : chr  "B" "B" "A" "B" ...
##  $ Part_time             : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ Undergrad_applications: int  451 350 573 NA 200 NA 548 503 260 261 ...
##  $ Undergrad_acceptances : int  99 89 134 NA 68 NA 97 126 95 119 ...
##  $ Undergrad_acceptRate  : int  22 25 23 NA 34 NA 18 25 37 46 ...
##  $ Founded               : int  1505 1960 1326 1965 1352 1964 1800 1584 1966 1869 ...
##  $ Percent_male          : int  58 71 52 47 60 54 66 51 63 53 ...
##  $ Percent_female        : int  42 29 48 53 40 46 34 49 37 47 ...
##  $ Fixed_assets          : int  66602000 105978346 70707000 10579203 191233087 33160032 86798000 152640692 43509000 64000000 ...
##  $ Tompkin_1997          : int  3 15 11 NA 23 NA 12 7 13 22 ...
##  $ Tompkin_1998          : int  2 13 6 NA 18 NA 11 5 12 21 ...
##  $ Tompkin_1999          : int  1 20 15 NA 8 NA 16 5 19 21 ...
##  $ Tompkin_2000          : int  1 15 9 NA 10 NA 8 3 21 18 ...
##  $ Tompkin_2001          : int  1 9 6 NA 20 NA 10 2 13 17 ...
##  $ Tompkin_2002          : int  4 10 3 NA 18 NA 8 2 20 16 ...
##  $ Tompkin_2003          : int  2 9 6 NA 7 NA 12 1 20 17 ...
##  $ Tompkin_2004          : int  2 19 4 NA 10 NA 17 1 15 25 ...
##  $ Tompkin_2005          : int  4 18 9 NA 16 NA 15 5 13 24 ...
##  $ Tompkin_2006          : int  6 13 12 NA 8 NA 11 1 19 22 ...
##  $ Tompkin_2007          : int  2 15 17 NA 8 NA 3 1 14 21 ...
##  $ Tompkin_2008          : int  8 6 13 NA 9 NA 12 2 21 22 ...
##  $ Tompkin_2009          : int  13 7 18 NA 10 NA 15 2 21 20 ...
##  $ Tompkin_2010          : int  12 3 8 NA 13 NA 15 1 22 21 ...
##  $ Tompkin_2011          : int  6 10 4 NA 12 NA 17 2 21 23 ...
##  $ Tompkin_2012          : int  9 5 11 NA 3 NA 20 2 19 22 ...
##  $ Tompkin_2013          : int  8 5 11 NA 16 NA 12 4 20 21 ...
##  $ Tompkin_Mean09to13    : num  4.9 11.3 9.6 NA 12.3 NA 12.6 2.7 17.8 20.8 ...
##  $ TompkinsScore_2013    : num  66.8 68.2 66.1 NA 64.9 ...
##  $ Firsts_2013           : num  24.7 28.3 25.3 NA 23.9 NA 23 30.5 21.4 18.5 ...

Set up the data the way I want it

Just in case, I'll duplicate college as a string variable, and make grad college a factor.

data$College_char <- as.character(data$College)
data$Grad_coll_fact <- as.factor(data$Grad_coll)

Since I'm dropping some colleges, I'll copy data to sample, to signify that it's the analysis sample.

Drop Lucy Cavendish, as it has no wine budget, and the graduate colleges, as they aren't ranked.

sample <- subset(data, College_char != "Lucy Cavendish")
sample <- subset(sample, Grad_coll == FALSE)

Now some general tidying: make the college name the row name, add the age of each college and the number of members, and take the log of the wine budget.

sample <- data.frame(sample[, -1], row.names = sample[, 1])
sample$Age_College <- 2014 - sample$Founded
sample$Members <- sample$Fellows + sample$Students
sample$log_Wine_budget <- log(sample$Wine_budget)

Model the data!

First by hand

R has functions that make running regressions easy, but it's good to know how they work. So first I'll fit the simple model \( y = \beta_0 + \beta_1 x \) by hand.

x <- sample$log_Wine_budget
y <- sample$Firsts_2013
x_bar <- mean(x)
y_bar <- mean(y)
beta_hat_1 <- sum((x - x_bar) * (y - y_bar))/sum((x - x_bar)^2)
beta_hat_0 <- y_bar - beta_hat_1 * x_bar
cat(beta_hat_0, beta_hat_1)
## -46.16 6.221

So here we have the intercept, and the coefficient for log of the wine budget, in that order.
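
As a check on the hand calculation, the same estimates can be obtained from the matrix form \( \hat{\beta} = (X'X)^{-1}X'y \). A minimal sketch, reusing the x and y vectors defined above:

# Sanity check: OLS via the matrix form (X'X)^-1 X'y
X <- cbind(1, x)  # design matrix with an intercept column
beta_hat <- solve(t(X) %*% X, t(X) %*% y)
beta_hat  # should match beta_hat_0 and beta_hat_1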

Now using a command

simple <- lm(Firsts_2013 ~ log_Wine_budget, data = sample)
summary(simple)
##
## Call:
## lm(formula = Firsts_2013 ~ log_Wine_budget, data = sample)
##
## Residuals:
##    Min     1Q Median     3Q    Max
## -7.152 -2.136 -0.084  1.858 11.237
##
## Coefficients:
##                 Estimate Std. Error t value  Pr(>|t|)
## (Intercept)      -46.157     11.024   -4.19   0.00029 ***
## log_Wine_budget    6.221      0.986    6.31 0.0000011 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.15 on 26 degrees of freedom
## Multiple R-squared:  0.605,  Adjusted R-squared:  0.59
## F-statistic: 39.8 on 1 and 26 DF,  p-value: 0.00000111

So why did I use the log? Below is the model with raw wine budget.

raw_fit <- lm(Firsts_2013 ~ Wine_budget, data = sample)
coef(raw_fit)
## (Intercept) Wine_budget
## 18.32024204  0.00005192
summary(raw_fit)$r.squared
## [1] 0.3973

The R-squared suggests this model doesn't fit the data as well. Even without checking whether the assumptions behind the regression hold, the log transform looks like a much better choice. Usually you would judge the fit from the residuals, but here it's easy to see in the raw data (below) that a log curve is going to fit better.

(Two plots from chunk LogVsNoLog.)
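
The plotting chunk itself isn't echoed above, but something along these lines would reproduce the comparison (the exact plotting choices here are my assumption):

# Sketch: observed data with the fitted raw-budget line and the log-budget curve
plot(sample$Wine_budget, sample$Firsts_2013, las = 1,
    xlab = "Wine budget (£)", ylab = "Percent with firsts")
abline(raw_fit, col = "red")  # linear fit on the raw wine budget
curve(coef(simple)[1] + coef(simple)[2] * log(x), add = TRUE, col = "blue")  # log model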

Now, a model also adjusting for the number of members in a college, the age of the college, and the percentage of female students.

ols <- lm(Firsts_2013 ~ Members + log_Wine_budget + Age_College + Percent_female,
    data = sample)
summary(ols)
##
## Call:
## lm(formula = Firsts_2013 ~ Members + log_Wine_budget + Age_College +
##     Percent_female, data = sample)
##
## Residuals:
##    Min     1Q Median     3Q    Max
## -6.141 -2.498 -0.524  1.757 10.180
##
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)
## (Intercept)     -29.98560   15.34763   -1.95   0.0630 .
## Members           0.00479    0.00415    1.15   0.2608
## log_Wine_budget   4.44586    1.51712    2.93   0.0075 **
## Age_College       0.00575    0.00469    1.23   0.2319
## Percent_female   -0.05144    0.04886   -1.05   0.3034
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.14 on 23 degrees of freedom
## Multiple R-squared:  0.652,  Adjusted R-squared:  0.592
## F-statistic: 10.8 on 4 and 23 DF,  p-value: 0.0000449

R knows that when you call plot() on a fitted model object, you want the standard diagnostic plots of the fit (below).

plot(ols, las = 1)

(Four diagnostic plots from chunk PlotResiduals: residuals vs fitted, normal Q-Q, scale-location, and residuals vs leverage.)

Cook's D

We can display the observations that have relatively large values of Cook's distance. A conventional cut-off is 4/n, where n is the number of observations in the data set. We'll use this criterion to select the observations to display.

d1 <- cooks.distance(ols)
r <- stdres(ols)
a <- cbind(sample, d1, r)
a[d1 > 4/nrow(sample), c("Firsts_2013", "Members", "Age_College", "log_Wine_budget",
    "Percent_female")]
##          Firsts_2013 Members Age_College log_Wine_budget Percent_female
## Homerton        16.0    1414          38           10.24             63
## Trinity         41.7    1245         468           12.32             37
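
Another quick way to see which colleges cross the cut-off is to plot the distances directly, with the 4/n threshold marked; a minimal sketch:

# Cook's distances with the conventional 4/n cut-off as a dashed line
plot(d1, type = "h", las = 1, ylab = "Cook's distance")
abline(h = 4/nrow(sample), lty = 2)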

Influence plot

influencePlot(ols, id.method = "College_char", main = "Influence Plot", sub = "Circle size is proportional to Cook's Distance")

(Influence plot from chunk InfluencePlot.)

##                StudRes    Hat   CookD
## Murray Edwards -0.1543 0.5013 0.07071
## Trinity         3.3988 0.2366 0.70059

Compare two regressions in a plot

Set the wine budget we want to plot over

attach(sample)  # save typing it again and again
wine_min <- log(10000)
wine_max <- log(400000)
wine_step <- 40
plot_data <- data.frame(log_Wine_budget = seq(from = wine_min, to = wine_max,
    length.out = wine_step))

Simple OLS model

pred_simple <- data.frame(predict(simple, plot_data, interval = "predict"))

Add in the other variables needed, held at their means

plot_data$Age_College <- mean(Age_College)
plot_data$Percent_female <- mean(Percent_female)
plot_data$Members <- mean(Members)

Final OLS model

ols <- lm(Firsts_2013 ~ log_Wine_budget + Age_College + Percent_female + Members,
    data = sample)
pred_ols <- data.frame(predict(ols, plot_data, interval = "predict"))
detach(sample)

Merge the predictions and put the wine budget back on a readable scale.

plot_data$pred_simple <- pred_simple$fit
plot_data$pred_ols <- pred_ols$fit
plot_data$Wine_budget <- exp(plot_data$log_Wine_budget)

Plot it

plot_data_gg <- plot_data[, c("Wine_budget", "pred_simple", "pred_ols")]
plot_data_gg <- melt(plot_data_gg, id.vars = "Wine_budget")

# colour of the legend
cols <- c(`#000099` = "black", pred_simple = "red", pred_ols = "blue")

ggplot(plot_data_gg, aes(x = Wine_budget, y = value, colour = variable)) +
    geom_line() + ylim(0, 50) +
    geom_point(data = sample, aes(x = Wine_budget, y = Firsts_2013, colour = "#000099"),
        show_guide = FALSE) +
    coord_fixed(ratio = 10000/1) + theme_bw() +
    xlab("Wine budget (£)") + ylab("Percent with firsts") +
    theme(legend.title = element_blank()) +
    scale_colour_manual(values = cols, breaks = c("#000099", "pred_ols", "pred_simple"),
        labels = c("A college", "Adjusted", "Unadjusted"))

(Plot from chunk PlotIt: predicted percent with firsts against wine budget for the adjusted and unadjusted models, with the observed colleges as points.)
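
The prediction intervals returned by predict(..., interval = "predict") are computed above but never drawn. A sketch of how the interval for the adjusted model could be added as a ribbon, assuming the objects created above:

# Add the OLS prediction interval around the adjusted fit
plot_data$lwr <- pred_ols$lwr
plot_data$upr <- pred_ols$upr
ggplot(plot_data, aes(x = Wine_budget)) +
    geom_ribbon(aes(ymin = lwr, ymax = upr), fill = "grey80") +
    geom_line(aes(y = pred_ols), colour = "blue") +
    geom_point(data = sample, aes(y = Firsts_2013)) +
    theme_bw() + xlab("Wine budget (£)") + ylab("Percent with firsts")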
