# Polynomial and multiple linear regression model in R

1887 Views

##############################################################
### Polynomial and multiple linear regression model
##############################################################

rm(list = ls())
cat(rep(" ",128))
setwd("c:/!!VSE/4ST611/4 week/")
getwd()

# install.packages("car")

# addendum to the topic READING EXTERNAL DATA SET discussed in a previous seminar:
# you can read online data directly to R using e.g. function read.table()
str(dat)

#######################
# Exercises
#######################
# Read the data iris from base package (base R installation)

# 1.] Explore data set "ship_speed_fuel.csv", i.e.
# 1.a. plot histograms for each variable to check the normality and identify outliers
# 1.b. plot pairwise scatterplots to see:
#      the relationship and covariance (and potential heteroskedasticity)
#      and calculate the Pearson's correlation coefficient.

# 2.] Explore the relationship between Speed, Fuel and type of ship (ship_leg).
#     Plot pairwise scatterplot for Fuel and Speed and distinguish by colour the type of ship.
#     Could a straight line regression model be a suitable description of the relationship?

# 3.] Using the least-squares method, estimate the quadratic polynomial regression model
#     for fuel as the dependent variable (Y) on speed (explanatory vairable, X)
#     separately for 2 subsets:
#     - 1st subset are ships with length = 1, 2, or 3.
#     - 2nd subset are ships with length = 4, or 5.

# 4.] Using the least-squares method, estimate the regression line for this 2 subsets.

# 5.] Decide, which model better fits the dependency of Fuel consumption on speed.
#     5a.] based on information criteria: R2adj, AIC, BIC
#     5b.] based on graphical analysis (plot both fitted models and compare them with empirical values).
#     5c.] based on residual analysis.

### SHIP data -----
## 1.)
str(dat)
table(dat\$ship_leg)

panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r <- abs(cor(x, y))
txt <- format(c(r, 0.123456789), digits = digits)[1]
txt <- paste0(prefix, txt)
if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
text(0.5, 0.5, txt, cex = cex.cor * r)
}

pairs(dat, gap=0, lower.panel = panel.smooth, upper.panel = panel.cor)

pairs(dat,
gap=0,
lower.panel = panel.smooth,
upper.panel = panel.cor,
diag.panel = function (x, ...) {
par(new = TRUE)
hist(x, col = "light blue", probability = TRUE, axes=FALSE, main = "")
lines(density(x),col="red",lwd=3)
rug(x)
})

# fuel is positively skewed distributed.
# It may help to log transform it to get more symmetrical response variable.
# proper for ANCOVA
dat\$ln.fuel <- log(dat\$fuel)
hist(dat\$ln.fuel)

## 2.)
source("marginal_plot.R")
marginal_plot(x = speed, y = fuel, group = ship_leg, data = dat, bw = "nrd", xlab = "Speed", ylab = "Fuel", pch = 15, cex = 0.5)

## 3.)
# sub1 = 1st subset are ships with length = 1, 2, or 3.
sub1 <- dat[dat\$ship_leg<4> table(sub1\$ship_leg)

# sub2 = 2nd subset are ships with length = 4, or 5.
sub2 <- dat[dat\$ship_leg>3, ]
table(sub2\$ship_leg)

# estimate the quadratic polynomial regression model in sub1 for
fit = lm(fuel ~ poly(speed, degree = 2, raw = TRUE), data = sub1)
fitI = lm(fuel ~ speed + I(speed^2), data = sub1)

summary(fit)
summary(fitI)

fit2 = lm(fuel ~ poly(speed, degree = 2, raw = TRUE), data = sub2)
fit2I = lm(fuel ~ speed + I(speed^2), data = sub2)

summary(fit2)
summary(fit2I)

## 4.)
regline = lm(fuel ~ speed, data = sub1)
regline2 = lm(fuel ~ speed, data = sub2)

summary(regline)
summary(regline2)

## 5.)
# 5a. information criteria: R2adj, AIC, BIC

AIC(fit, fitI, regline)
BIC(fit, fitI, regline)

# 5b. graphical exploration of fitted models
# make models for sub1
fitI = lm(fuel ~ speed + I(speed^2), data = sub1)
regline = lm(fuel ~ speed, data = sub1)

# make new data
new.data <- data.frame(speed = seq(from = min(sub1\$speed),
to = max(sub1\$speed), length.out = 200))

# predict
pred_lm <- predict(fitI, newdata = new.data)
pred_lm2 <- predict(regline, newdata = new.data)

# plot
plot(fuel ~ speed, data = sub1)
lines(pred_lm ~ new.data\$speed, col = "red")
lines(pred_lm2 ~ new.data\$speed, col = "blue")
legend("topleft", c("quadratic", "linear"), col = c("red", "blue"), lty = 1)

# 5c. residual analysis

#Model validation
#1. Homogeneity
#2. Independence
#3. Influential observations
#4. Normality
#5. Does it all make sense?

#Standard graphical output lm:
par(mfrow = c(2, 2))
plot(fitI)
plot(regline)

#More specific output:
#Homogeneity
#E1 <- resid(fitI)   #or better:
E1 <- rstandard(fitI)
F1 <- fitted(fitI)

plot(x = F1,
y = E1,
xlab = "Fitted values",
ylab = "Residuals",
main = "Homogeneity?")
abline(h = 0, v = 0, lty = 2)

#Normality
hist(E1, main = "Normality", breaks=10)
hist(rnorm(100))  # histogram nedoporucuje. I pro nahodne vygenerovana data z normalniho rozdeleni.

#Or qq-plot
#qqnorm(E1)
#qqline(E1)

#Dependence due to model misfit
#Plot residuals versus covariates
plot(x = sub1\$speed,
y = E1)
abline(h = 0, lty = 2)

###################################
# Multiple regression with 2 explanatory variables:
# - one is numerical and one is categorical.
# Tool: ANCOVA
##################################
# In the previous exercise, we had found out,
# that there is non-linear (quadratic) dependency of the consumption on the speed.
# But there was an issue with the pattern in residuals for the covariate ship length (rows 191-192).
# The issue leads us to incorporate this covariate into regression model.
# That means, we should model multiple regression
# with 2 explanatory variables:
#   - speed (X1, numerical) and ship length (X2, factor)
# Such a model is called ANCOVA.

###########
# 1.] Construct the multiple model for consumption with speed and length as explanatory variables,
#     where the quadratic relationship seems to be suitable for the speed and linear for length.

# 2.] Compare the quality of the fitted ANCOVA model with the multivar linear model and univariate quadratic model with speed (fitI).

# 3.] Add the interaction term (between the speed and length) to the ANCOVA model and decide, whether it does improve the fit or not.

# 1. and 2.
source("marginal_plot.R")
marginal_plot(x = speed, y = fuel, group = ship_leg, data = dat, bw = "nrd", xlab = "Speed", ylab = "Fuel", pch = 15, cex = 0.5)
# we can see 2 different levels of ship leg regarding to the relationship between consumtion (fuel) and speed

# lets categorize the observations (ships) into 2 categories (small, large) as follows:
dat\$categ <- as.factor(ifelse(dat\$ship_leg<4> table(dat\$categ)
dat\$categ = relevel(dat\$categ, ref = "smaller") # we change the reference level (of ship_leg) to smaller
dat\$categ = factor(dat\$categ, levels = c("smaller", "larger")) # we switch back to the original order

contrasts(dat\$categ) # in regression, ship_leg will be represented using dummy variables; this type of parameterization is called "contr.treatment" in R

# contrasts(dat\$categ) = "contr.sum" # a different parameterization (see the lecture); this type of parametization is called "contr.sum" in R
# contrasts(dat\$categ) # the new parameterization

# contrasts(dat\$categ) = "contr.treatment" # we switch back to the parameterization employing dummy variables

fit = lm(fuel ~ speed + I(speed^2) + categ, data = dat) # model with two qualitative explanatory variables (educ, manag) and one quantitative explanatory variable (exper)
coef(fit)
summary(fit)

fit.univar = lm(fuel ~ speed + I(speed^2), data = dat) # model with two qualitative explanatory variables (educ, manag) and one quantitative explanatory variable (exper)
coef(fit.univar)
summary(fit.univar)

fit.lines = lm(fuel ~ speed + categ, data = dat) # model with two qualitative explanatory variables (educ, manag) and one quantitative explanatory variable (exper)
coef(fit.lines)
summary(fit.lines)

# fit model is the best.

I feel that this R - code can help someone fit their research models.