• 0
Votes
name

A PHP Error was encountered

Severity: Notice

Message: Undefined index: userid

Filename: views/question.php

Line Number: 212

Backtrace:

File: /home/mycocrkc/statisticsanswered.com/application/views/question.php
Line: 212
Function: _error_handler

File: /home/mycocrkc/statisticsanswered.com/application/controllers/Questions.php
Line: 416
Function: view

File: /home/mycocrkc/statisticsanswered.com/index.php
Line: 315
Function: require_once

name Punditsdkoslkdosdkoskdo

Polynomial and multiple linear regression model in R


##############################################################
### Polynomial and multiple linear regression model
##############################################################


rm(list = ls()) 
cat(rep(" ",128)) 
setwd("c:/!!VSE/4ST611/4 week/")
getwd()

# install.packages("car")

# addendum to the topic READING EXTERNAL DATA SET discussed in a previous seminar: 
# you can read online data directly to R using e.g. function read.table()
dat = read.csv("ship_speed_fuel.csv")
str(dat)


#######################
# Exercises
#######################
# Read the data iris from base package (base R installation)

# 1.] Explore data set "ship_speed_fuel.csv", i.e.
# 1.a. plot histograms for each variable to check the normality and identify outliers
# 1.b. plot pairwise scatterplots to see: 
#      the relationship and covariance (and potential heteroskedasticity) 
#      and calculate the Pearson's correlation coefficient.

# 2.] Explore the relationship between Speed, Fuel and type of ship (ship_leg).
#     Plot pairwise scatterplot for Fuel and Speed and distinguish by colour the type of ship.
#     Could a straight line regression model be a suitable description of the relationship?

# 3.] Using the least-squares method, estimate the quadratic polynomial regression model 
#     for fuel as the dependent variable (Y) on speed (explanatory vairable, X)
#     separately for 2 subsets: 
#     - 1st subset are ships with length = 1, 2, or 3.
#     - 2nd subset are ships with length = 4, or 5.

# 4.] Using the least-squares method, estimate the regression line for this 2 subsets.

# 5.] Decide, which model better fits the dependency of Fuel consumption on speed.
#     5a.] based on information criteria: R2adj, AIC, BIC
#     5b.] based on graphical analysis (plot both fitted models and compare them with empirical values).
#     5c.] based on residual analysis. 

### SHIP data -----
## 1.)
str(dat)
table(dat$ship_leg)

panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...)
{
  usr <- par("usr"); on.exit(par(usr))
  par(usr = c(0, 1, 0, 1))
  r <- abs(cor(x, y))
  txt <- format(c(r, 0.123456789), digits = digits)[1]
  txt <- paste0(prefix, txt)
  if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
  text(0.5, 0.5, txt, cex = cex.cor * r)
}

pairs(dat, gap=0, lower.panel = panel.smooth, upper.panel = panel.cor)

pairs(dat, 
      gap=0, 
      lower.panel = panel.smooth,
      upper.panel = panel.cor,
      diag.panel = function (x, ...) {
        par(new = TRUE) 
        hist(x, col = "light blue", probability = TRUE, axes=FALSE, main = "") 
        lines(density(x),col="red",lwd=3) 
        rug(x)
      })

# fuel is positively skewed distributed. 
# It may help to log transform it to get more symmetrical response variable.
# proper for ANCOVA
dat$ln.fuel <- log(dat$fuel)
hist(dat$ln.fuel)

## 2.)
source("marginal_plot.R")
marginal_plot(x = speed, y = fuel, group = ship_leg, data = dat, bw = "nrd", xlab = "Speed", ylab = "Fuel", pch = 15, cex = 0.5)

## 3.)
# sub1 = 1st subset are ships with length = 1, 2, or 3.
sub1 <- dat[dat$ship_leg<4> table(sub1$ship_leg)

# sub2 = 2nd subset are ships with length = 4, or 5.
sub2 <- dat[dat$ship_leg>3, ]
table(sub2$ship_leg)

# estimate the quadratic polynomial regression model in sub1 for 
fit = lm(fuel ~ poly(speed, degree = 2, raw = TRUE), data = sub1)
fitI = lm(fuel ~ speed + I(speed^2), data = sub1)

summary(fit)
summary(fitI)

fit2 = lm(fuel ~ poly(speed, degree = 2, raw = TRUE), data = sub2)
fit2I = lm(fuel ~ speed + I(speed^2), data = sub2)

summary(fit2)
summary(fit2I)


## 4.)
regline = lm(fuel ~ speed, data = sub1)
regline2 = lm(fuel ~ speed, data = sub2)

summary(regline)
summary(regline2)

## 5.)
# 5a. information criteria: R2adj, AIC, BIC
summary(fit)$adj.r.squared
summary(fitI)$adj.r.squared
summary(regline)$adj.r.squared

AIC(fit, fitI, regline)
BIC(fit, fitI, regline)


# 5b. graphical exploration of fitted models
# make models for sub1
fitI = lm(fuel ~ speed + I(speed^2), data = sub1)
regline = lm(fuel ~ speed, data = sub1)

# make new data
new.data <- data.frame(speed = seq(from = min(sub1$speed),
    to = max(sub1$speed), length.out = 200))

# predict
pred_lm <- predict(fitI, newdata = new.data)
pred_lm2 <- predict(regline, newdata = new.data)

# plot
plot(fuel ~ speed, data = sub1)
lines(pred_lm ~ new.data$speed, col = "red")
lines(pred_lm2 ~ new.data$speed, col = "blue")
legend("topleft", c("quadratic", "linear"), col = c("red", "blue"), lty = 1)

                      
# 5c. residual analysis

#Model validation
#1. Homogeneity
#2. Independence
#3. Influential observations
#4. Normality
#5. Does it all make sense?


#Standard graphical output lm:
par(mfrow = c(2, 2))
plot(fitI)
plot(regline)

#More specific output:
#Homogeneity
#E1 <- resid(fitI)   #or better: 
E1 <- rstandard(fitI)
F1 <- fitted(fitI)

plot(x = F1, 
     y = E1, 
     xlab = "Fitted values",
     ylab = "Residuals", 
     main = "Homogeneity?")
abline(h = 0, v = 0, lty = 2)

#Normality
hist(E1, main = "Normality", breaks=10) 
hist(rnorm(100))  # histogram nedoporucuje. I pro nahodne vygenerovana data z normalniho rozdeleni.

#Or qq-plot
#qqnorm(E1)
#qqline(E1)

#Dependence due to model misfit
#Plot residuals versus covariates
plot(x = sub1$speed,
     y = E1)
abline(h = 0, lty = 2)


###################################
# Multiple regression with 2 explanatory variables:
# - one is numerical and one is categorical.
# Tool: ANCOVA
##################################
# In the previous exercise, we had found out, 
# that there is non-linear (quadratic) dependency of the consumption on the speed.
# But there was an issue with the pattern in residuals for the covariate ship length (rows 191-192).
# The issue leads us to incorporate this covariate into regression model.
# That means, we should model multiple regression 
# with 2 explanatory variables:
#   - speed (X1, numerical) and ship length (X2, factor)
# Such a model is called ANCOVA.

###########
# Tasks:
# 1.] Construct the multiple model for consumption with speed and length as explanatory variables,
#     where the quadratic relationship seems to be suitable for the speed and linear for length.

# 2.] Compare the quality of the fitted ANCOVA model with the multivar linear model and univariate quadratic model with speed (fitI).

# 3.] Add the interaction term (between the speed and length) to the ANCOVA model and decide, whether it does improve the fit or not.


# 1. and 2. 
source("marginal_plot.R")
marginal_plot(x = speed, y = fuel, group = ship_leg, data = dat, bw = "nrd", xlab = "Speed", ylab = "Fuel", pch = 15, cex = 0.5)
# we can see 2 different levels of ship leg regarding to the relationship between consumtion (fuel) and speed

# lets categorize the observations (ships) into 2 categories (small, large) as follows:
dat$categ <- as.factor(ifelse(dat$ship_leg<4> table(dat$categ)
dat$categ = relevel(dat$categ, ref = "smaller") # we change the reference level (of ship_leg) to smaller
dat$categ = factor(dat$categ, levels = c("smaller", "larger")) # we switch back to the original order

contrasts(dat$categ) # in regression, ship_leg will be represented using dummy variables; this type of parameterization is called "contr.treatment" in R

# contrasts(dat$categ) = "contr.sum" # a different parameterization (see the lecture); this type of parametization is called "contr.sum" in R
# contrasts(dat$categ) # the new parameterization

# contrasts(dat$categ) = "contr.treatment" # we switch back to the parameterization employing dummy variables

fit = lm(fuel ~ speed + I(speed^2) + categ, data = dat) # model with two qualitative explanatory variables (educ, manag) and one quantitative explanatory variable (exper)
coef(fit)
summary(fit)

fit.univar = lm(fuel ~ speed + I(speed^2), data = dat) # model with two qualitative explanatory variables (educ, manag) and one quantitative explanatory variable (exper)
coef(fit.univar)
summary(fit.univar)

fit.lines = lm(fuel ~ speed + categ, data = dat) # model with two qualitative explanatory variables (educ, manag) and one quantitative explanatory variable (exper)
coef(fit.lines)
summary(fit.lines)

# fit model is the best.