#### A PHP Error was encountered

Severity: Warning

Message: Undefined array key "userid"

Filename: views/question.php

Line Number: 212

Backtrace:

File: /home/statpmkj/public_html/application/views/question.php

Line: 212

Function: _error_handler

File: /home/statpmkj/public_html/application/controllers/Questions.php

Line: 416

Function: view

File: /home/statpmkj/public_html/index.php

Line: 315

Function: require_once

## Polynomial and multiple linear regression model in R

##############################################################

### Polynomial and multiple linear regression model

##############################################################

rm(list = ls())

cat(rep("
",128))

setwd("c:/!!VSE/4ST611/4 week/")

getwd()

# install.packages("car")

# addendum to the topic READING EXTERNAL DATA SET discussed in a previous seminar:

# you can read online data directly to R using e.g. function read.table()

dat = read.csv("ship_speed_fuel.csv")

str(dat)

#######################

# Exercises

#######################

# Read the data iris from base package (base R installation)

#

# 1.] Explore data set "ship_speed_fuel.csv", i.e.

# 1.a. plot histograms for each variable to check the normality and identify outliers

# 1.b. plot pairwise scatterplots to see:

# the relationship and covariance (and potential heteroskedasticity)

# and calculate the Pearson's correlation coefficient.

#

# 2.] Explore the relationship between Speed, Fuel and type of ship (ship_leg).

# Plot pairwise scatterplot for Fuel and Speed and distinguish by colour the type of ship.

# Could a straight line regression model be a suitable description of the relationship?

# 3.] Using the least-squares method, estimate the quadratic polynomial regression model

# for fuel as the dependent variable (Y) on speed (explanatory vairable, X)

# separately for 2 subsets:

# - 1st subset are ships with length = 1, 2, or 3.

# - 2nd subset are ships with length = 4, or 5.

# 4.] Using the least-squares method, estimate the regression line for this 2 subsets.

# 5.] Decide, which model better fits the dependency of Fuel consumption on speed.

# 5a.] based on information criteria: R2adj, AIC, BIC

# 5b.] based on graphical analysis (plot both fitted models and compare them with empirical values).

# 5c.] based on residual analysis.

### SHIP data -----

## 1.)

str(dat)

table(dat$ship_leg)

panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...)

{

usr <- par("usr"); on.exit(par(usr))

par(usr = c(0, 1, 0, 1))

r <- abs(cor(x, y))

txt <- format(c(r, 0.123456789), digits = digits)[1]

txt <- paste0(prefix, txt)

if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)

text(0.5, 0.5, txt, cex = cex.cor * r)

}

pairs(dat, gap=0, lower.panel = panel.smooth, upper.panel = panel.cor)

pairs(dat,

gap=0,

lower.panel = panel.smooth,

upper.panel = panel.cor,

diag.panel = function (x, ...) {

par(new = TRUE)

hist(x, col = "light blue", probability = TRUE, axes=FALSE, main = "")

lines(density(x),col="red",lwd=3)

rug(x)

})

# fuel is positively skewed distributed.

# It may help to log transform it to get more symmetrical response variable.

# proper for ANCOVA

dat$ln.fuel <- log(dat$fuel)

hist(dat$ln.fuel)

## 2.)

source("marginal_plot.R")

marginal_plot(x = speed, y = fuel, group = ship_leg, data = dat, bw = "nrd", xlab = "Speed", ylab = "Fuel", pch = 15, cex = 0.5)

## 3.)

# sub1 = 1st subset are ships with length = 1, 2, or 3.

sub1 <- dat[dat$ship_leg<4>
table(sub1$ship_leg)

# sub2 = 2nd subset are ships with length = 4, or 5.

sub2 <- dat[dat$ship_leg>3, ]

table(sub2$ship_leg)

# estimate the quadratic polynomial regression model in sub1 for

fit = lm(fuel ~ poly(speed, degree = 2, raw = TRUE), data = sub1)

fitI = lm(fuel ~ speed + I(speed^2), data = sub1)

summary(fit)

summary(fitI)

fit2 = lm(fuel ~ poly(speed, degree = 2, raw = TRUE), data = sub2)

fit2I = lm(fuel ~ speed + I(speed^2), data = sub2)

summary(fit2)

summary(fit2I)

## 4.)

regline = lm(fuel ~ speed, data = sub1)

regline2 = lm(fuel ~ speed, data = sub2)

summary(regline)

summary(regline2)

## 5.)

# 5a. information criteria: R2adj, AIC, BIC

summary(fit)$adj.r.squared

summary(fitI)$adj.r.squared

summary(regline)$adj.r.squared

AIC(fit, fitI, regline)

BIC(fit, fitI, regline)

# 5b. graphical exploration of fitted models

# make models for sub1

fitI = lm(fuel ~ speed + I(speed^2), data = sub1)

regline = lm(fuel ~ speed, data = sub1)

# make new data

new.data <- data.frame(speed = seq(from = min(sub1$speed),

to = max(sub1$speed), length.out = 200))

# predict

pred_lm <- predict(fitI, newdata = new.data)

pred_lm2 <- predict(regline, newdata = new.data)

# plot

plot(fuel ~ speed, data = sub1)

lines(pred_lm ~ new.data$speed, col = "red")

lines(pred_lm2 ~ new.data$speed, col = "blue")

legend("topleft", c("quadratic", "linear"), col = c("red", "blue"), lty = 1)

# 5c. residual analysis

#Model validation

#1. Homogeneity

#2. Independence

#3. Influential observations

#4. Normality

#5. Does it all make sense?

#Standard graphical output lm:

par(mfrow = c(2, 2))

plot(fitI)

plot(regline)

#More specific output:

#Homogeneity

#E1 <- resid(fitI) #or better:

E1 <- rstandard(fitI)

F1 <- fitted(fitI)

plot(x = F1,

y = E1,

xlab = "Fitted values",

ylab = "Residuals",

main = "Homogeneity?")

abline(h = 0, v = 0, lty = 2)

#Normality

hist(E1, main = "Normality", breaks=10)

hist(rnorm(100)) # histogram nedoporucuje. I pro nahodne vygenerovana data z normalniho rozdeleni.

#Or qq-plot

#qqnorm(E1)

#qqline(E1)

#Dependence due to model misfit

#Plot residuals versus covariates

plot(x = sub1$speed,

y = E1)

abline(h = 0, lty = 2)

###################################

# Multiple regression with 2 explanatory variables:

# - one is numerical and one is categorical.

# Tool: ANCOVA

##################################

# In the previous exercise, we had found out,

# that there is non-linear (quadratic) dependency of the consumption on the speed.

# But there was an issue with the pattern in residuals for the covariate ship length (rows 191-192).

# The issue leads us to incorporate this covariate into regression model.

# That means, we should model multiple regression

# with 2 explanatory variables:

# - speed (X1, numerical) and ship length (X2, factor)

# Such a model is called ANCOVA.

#

###########

# Tasks:

# 1.] Construct the multiple model for consumption with speed and length as explanatory variables,

# where the quadratic relationship seems to be suitable for the speed and linear for length.

#

# 2.] Compare the quality of the fitted ANCOVA model with the multivar linear model and univariate quadratic model with speed (fitI).

#

# 3.] Add the interaction term (between the speed and length) to the ANCOVA model and decide, whether it does improve the fit or not.

# 1. and 2.

source("marginal_plot.R")

marginal_plot(x = speed, y = fuel, group = ship_leg, data = dat, bw = "nrd", xlab = "Speed", ylab = "Fuel", pch = 15, cex = 0.5)

# we can see 2 different levels of ship leg regarding to the relationship between consumtion (fuel) and speed

# lets categorize the observations (ships) into 2 categories (small, large) as follows:

dat$categ <- as.factor(ifelse(dat$ship_leg<4>
table(dat$categ)

dat$categ = relevel(dat$categ, ref = "smaller") # we change the reference level (of ship_leg) to smaller

dat$categ = factor(dat$categ, levels = c("smaller", "larger")) # we switch back to the original order

contrasts(dat$categ) # in regression, ship_leg will be represented using dummy variables; this type of parameterization is called "contr.treatment" in R

# contrasts(dat$categ) = "contr.sum" # a different parameterization (see the lecture); this type of parametization is called "contr.sum" in R

# contrasts(dat$categ) # the new parameterization

#

# contrasts(dat$categ) = "contr.treatment" # we switch back to the parameterization employing dummy variables

fit = lm(fuel ~ speed + I(speed^2) + categ, data = dat) # model with two qualitative explanatory variables (educ, manag) and one quantitative explanatory variable (exper)

coef(fit)

summary(fit)

fit.univar = lm(fuel ~ speed + I(speed^2), data = dat) # model with two qualitative explanatory variables (educ, manag) and one quantitative explanatory variable (exper)

coef(fit.univar)

summary(fit.univar)

fit.lines = lm(fuel ~ speed + categ, data = dat) # model with two qualitative explanatory variables (educ, manag) and one quantitative explanatory variable (exper)

coef(fit.lines)

summary(fit.lines)

# fit model is the best.