#message=FALSE and warning=FALSE to not include message/warnings
library(dplyr)
library(ggplot2)
library(knitr)
library(rmarkdown)
library(kableExtra)
library(caTools)
library(MASS)
library(forecast)
library(ggcorrplot)
library(psych)
library(MASS)
library(AICcmodavg)
library(caTools)
options(scipen = 999)
#here was included more packages to graphics (ggplot2)
#data preparation (dplyr) and
#format tables (Rmarkdown and kableExtra)
# When use read.table and the parameter 'as.is=FALSE', the characters are converted to factors
routeAirfares_df <- read.table("Route-Airfares.csv", header=TRUE, sep=",", as.is = F)
head(routeAirfares_df, 10)
## S_CITY E_CITY COUPON NEW VACATION SW
## 1 Dallas/Fort Worth TX Amarillo TX 1.00 3 No Yes
## 2 Atlanta GA Baltimore/Wash Intl MD 1.06 3 No No
## 3 Boston MA Baltimore/Wash Intl MD 1.06 3 No No
## 4 Chicago IL Baltimore/Wash Intl MD 1.06 3 No Yes
## 5 Chicago IL Baltimore/Wash Intl MD 1.06 3 No Yes
## 6 Cleveland OH Baltimore/Wash Intl MD 1.01 3 No Yes
## 7 Dallas/Fort Worth TX Baltimore/Wash Intl MD 1.28 3 No No
## 8 Fort Lauderdale FL Baltimore/Wash Intl MD 1.15 3 Yes Yes
## 9 Houston TX Baltimore/Wash Intl MD 1.33 3 No Yes
## 10 Kansas City MO Baltimore/Wash Intl MD 1.60 2 No Yes
## HI S_INCOME E_INCOME S_POP E_POP SLOT GATE DISTANCE PAX
## 1 5291.99 28637 21112 3036732 205711 Free Free 312 7864
## 2 5419.16 26993 29838 3532657 7145897 Free Free 576 8820
## 3 9185.28 30124 29838 5787293 7145897 Free Free 364 6452
## 4 2657.35 29260 29838 7830332 7145897 Controlled Free 612 25144
## 5 2657.35 29260 29838 7830332 7145897 Free Free 612 25144
## 6 3408.11 26046 29838 2230955 7145897 Free Free 309 13386
## 7 6754.48 28637 29838 3036732 7145897 Free Free 1220 4625
## 8 5584.00 26752 29838 1440377 7145897 Free Free 921 5512
## 9 4662.44 27211 29838 3770125 7145897 Free Free 1249 7811
## 10 2617.00 25450 29838 1694803 7145897 Free Free 964 4657
## FARE
## 1 64.11
## 2 174.47
## 3 207.76
## 4 85.47
## 5 85.47
## 6 56.76
## 7 228.00
## 8 116.54
## 9 172.63
## 10 114.76
routeAirfares_df1 <- routeAirfares_df[,c(3:16)]
kable(head(routeAirfares_df1, 10), caption = "First 10 Lines of routeAirfares_df1",
digits = 2, formatr="html") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
COUPON | NEW | VACATION | SW | HI | S_INCOME | E_INCOME | S_POP | E_POP | SLOT | GATE | DISTANCE | PAX | FARE |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1.00 | 3 | No | Yes | 5291.99 | 28637 | 21112 | 3036732 | 205711 | Free | Free | 312 | 7864 | 64.11 |
1.06 | 3 | No | No | 5419.16 | 26993 | 29838 | 3532657 | 7145897 | Free | Free | 576 | 8820 | 174.47 |
1.06 | 3 | No | No | 9185.28 | 30124 | 29838 | 5787293 | 7145897 | Free | Free | 364 | 6452 | 207.76 |
1.06 | 3 | No | Yes | 2657.35 | 29260 | 29838 | 7830332 | 7145897 | Controlled | Free | 612 | 25144 | 85.47 |
1.06 | 3 | No | Yes | 2657.35 | 29260 | 29838 | 7830332 | 7145897 | Free | Free | 612 | 25144 | 85.47 |
1.01 | 3 | No | Yes | 3408.11 | 26046 | 29838 | 2230955 | 7145897 | Free | Free | 309 | 13386 | 56.76 |
1.28 | 3 | No | No | 6754.48 | 28637 | 29838 | 3036732 | 7145897 | Free | Free | 1220 | 4625 | 228.00 |
1.15 | 3 | Yes | Yes | 5584.00 | 26752 | 29838 | 1440377 | 7145897 | Free | Free | 921 | 5512 | 116.54 |
1.33 | 3 | No | Yes | 4662.44 | 27211 | 29838 | 3770125 | 7145897 | Free | Free | 1249 | 7811 | 172.63 |
1.60 | 2 | No | Yes | 2617.00 | 25450 | 29838 | 1694803 | 7145897 | Free | Free | 964 | 4657 | 114.76 |
Using the variables as a factors in the import step, the step to create dummies is not needed.
Withe the function glimpse
is possible to see that the
variables are converted to Factor (fct) in R:
glimpse(routeAirfares_df1)
## Rows: 638
## Columns: 14
## $ COUPON <dbl> 1.00, 1.06, 1.06, 1.06, 1.06, 1.01, 1.28, 1.15, 1.33, 1.60, 1…
## $ NEW <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3…
## $ VACATION <fct> No, No, No, No, No, No, No, Yes, No, No, Yes, No, No, No, No,…
## $ SW <fct> Yes, No, No, Yes, Yes, Yes, No, Yes, Yes, Yes, Yes, Yes, Yes,…
## $ HI <dbl> 5291.99, 5419.16, 9185.28, 2657.35, 2657.35, 3408.11, 6754.48…
## $ S_INCOME <int> 28637, 26993, 30124, 29260, 29260, 26046, 28637, 26752, 27211…
## $ E_INCOME <int> 21112, 29838, 29838, 29838, 29838, 29838, 29838, 29838, 29838…
## $ S_POP <int> 3036732, 3532657, 5787293, 7830332, 7830332, 2230955, 3036732…
## $ E_POP <int> 205711, 7145897, 7145897, 7145897, 7145897, 7145897, 7145897,…
## $ SLOT <fct> Free, Free, Free, Controlled, Free, Free, Free, Free, Free, F…
## $ GATE <fct> Free, Free, Free, Free, Free, Free, Free, Free, Free, Free, F…
## $ DISTANCE <int> 312, 576, 364, 612, 612, 309, 1220, 921, 1249, 964, 2104, 232…
## $ PAX <int> 7864, 8820, 6452, 25144, 25144, 13386, 4625, 5512, 7811, 4657…
## $ FARE <dbl> 64.11, 174.47, 207.76, 85.47, 85.47, 56.76, 228.00, 116.54, 1…
# Summarizing CUSTOT by HOSPITAL:
resume_fare <-
routeAirfares_df1 %>%
summarise(min. = min(FARE, na.rm = T),
"%25 Q" = quantile(FARE, .25, na.rm = T),
"median." = median(FARE, na.rm = T),
"mean." = mean(FARE, na.rm = T),
"%75 Q" = quantile(FARE, .75, na.rm = T),
"max." = max(FARE, na.rm = T),
"sd." = sd(FARE, na.rm = T))
kable(t(resume_fare), caption = "Descriptive Statistics from routeAirfares_df1",
digits = 2, formatr="html", col.names = c("Statistic / Value")) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Statistic / Value | |
---|---|
min. | 42.47 |
%25 Q | 106.29 |
median. | 144.60 |
mean. | 160.88 |
%75 Q | 209.35 |
max. | 402.02 |
sd. | 76.02 |
The Fare Average Variable shows a range between 42.47 and
402.02
The mean of 160.88 is higher than the median 144.60, and it is probably
caused by the higher values of fares.
par(mfrow=c(1,1))
ggplot(routeAirfares_df1, aes(FARE)) +
geom_histogram(fill = "orange", colour="lightgray", binwidth = 30) +
ggtitle("Hist. Average Route Fares") +
theme_light()
The Histogram shows the Fare Variable with a right-Skewed distribution.
To analyse the distribution, two Statistical Tests are ran:
Shapiro-wilk test(shapiro.test
) and Kolmogorov-Smirnov
Test(ks.test
):
#Shapiro-wilk test
shapiro.test(routeAirfares_df1$FARE)
##
## Shapiro-Wilk normality test
##
## data: routeAirfares_df1$FARE
## W = 0.95358, p-value = 0.0000000000002732
shapiro.test(log(routeAirfares_df1$FARE))
##
## Shapiro-Wilk normality test
##
## data: log(routeAirfares_df1$FARE)
## W = 0.97821, p-value = 0.00000003811
#Kolmogorov-Smirnov Tests
ks.test(routeAirfares_df1$FARE, 'pnorm')
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: routeAirfares_df1$FARE
## D = 1, p-value < 0.00000000000000022
## alternative hypothesis: two-sided
ks.test(log(routeAirfares_df1$FARE), 'pnorm')
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: log(routeAirfares_df1$FARE)
## D = 0.99991, p-value < 0.00000000000000022
## alternative hypothesis: two-sided
Usually a Log transformation turn the variable in a Normal Distribution. On this analysis, the Tests did not show a normal distribution to the Fare variable and also to the Log Fare Variable.
ggplot(routeAirfares_df1, aes(FARE, colour=SW)) +
geom_freqpoly(binwidth = 30) +
ggtitle("Hist. Average Route Fares Without(No) and With(Yes) Southwest (SW) Airline Operation") +
theme_light()
On the graphic above is possible to see the difference between the Fare when a Route has a SW Operation.
ggplot(routeAirfares_df1, aes(x=SW, y=FARE)) +
stat_boxplot(geom = 'errorbar', width = 0.15, colour='steelblue')+
geom_boxplot(outlier.shape = NA,fill = c("aquamarine3","orange"), colour = "steelblue") +
geom_jitter(width = 0.2, colour='lightgray') +
ggtitle("Routes Average Route Fares Without(No) and With(Yes) Southwest (SW) Airline Operation") +
xlab("SouthWest Arilines Operation Routes") +
ylab("Average fare") +
theme_light()
Therefore, the Boxplot information in the graph corroborates the above, where it is possible to see the difference between the rates where there is SW operation and where there is none.
# Summarizing CUSTOT by HOSPITAL:
resume_by_SW <-
routeAirfares_df1 %>% group_by(SW) %>%
summarise(min. = min(FARE, na.rm = T),
"%25 Q" = quantile(FARE, .25, na.rm = T),
median. = median(FARE, na.rm = T),
mean. = mean(FARE, na.rm = T),
"%75 Q" = quantile(FARE, .75, na.rm = T),
max. = max(FARE, na.rm = T),
sd. = sd(FARE, na.rm = T))
kable(resume_by_SW, caption = "Descreptive Statistics from routeAirfares_df1",
digits = 2, formatr="html") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
SW | min. | %25 Q | median. | mean. | %75 Q | max. | sd. |
---|---|---|---|---|---|---|---|
No | 57.62 | 127.38 | 174.67 | 188.18 | 238.96 | 402.02 | 70.56 |
Yes | 42.47 | 63.97 | 80.82 | 98.38 | 127.06 | 241.04 | 44.80 |
As expected, both the mean and the median where there is SW operation are smaller than on routes where there is no operation.
ggplot(routeAirfares_df1, aes(x=DISTANCE, y=FARE, color=SW)) +
geom_point() +
ggtitle("Distance vs. Average Fares") +
xlab("Distance") +
ylab("Average fare") +
theme_light() +
geom_smooth(method=lm, colour='orange')
## `geom_smooth()` using formula 'y ~ x'
The orange line shows a regression standard to Average Fares by Distance. The correlation is positive, i.e., as the distance increase, the Fare increase too.
ggplot(routeAirfares_df1, aes(x=PAX, y=FARE, color=SW)) +
geom_point() +
ggtitle("Number of passengers vs. Average Fares") +
xlab("Number of passengers") +
ylab("Average Fare") +
theme_light() +
geom_smooth(method=lm, colour='orange')
## `geom_smooth()` using formula 'y ~ x'
The orange line shows a regression standard to Average Fares by Number of passengers. The correlation here is negative, i.e., as the number of passengers increase, the Fare decrease. But the points in the graphic did not show clearly this correlation.
ggplot(routeAirfares_df1, aes(x=HI, y=FARE, color=SW)) +
geom_point(colour = 'steelblue') +
ggtitle("Herfindel Index vs Average Fares") +
xlab("Herfindel Index") +
ylab("Average Fare") +
theme_light() +
geom_smooth(method=lm, colour='orange')
## `geom_smooth()` using formula 'y ~ x'
Considering the Herfindel Index, was not possible to see a correlation with Fare in the graphic above.
corr_matrix <- round(cor(routeAirfares_df1[,c(1,2,5:9,12,13,14)]), 2)
ggcorrplot(corr_matrix, insig = "blank",
outline.col = "white",lab = TRUE, lab_size = 3, type = "upper",
colors = c("#6D9EC1", "white", "#E46726"), tl.cex = 8,
title = "Correlation Plot of Numeric Variables")
## Analysis of Correlation
pairs.panels(routeAirfares_df1[,c(1,5:9,12,13,14)])
The linear correlation with variable Fare is significant only with the variable Distance. The Distance shows 67% of positive correlation and the Coupom variable shows 50% of correlation with Fare.
## Using factor, the original df could be used
full_model_1 <- lm(FARE~.,data=routeAirfares_df1)
summary(full_model_1)
##
## Call:
## lm(formula = FARE ~ ., data = routeAirfares_df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -106.329 -22.707 -2.329 21.135 128.694
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.6994118001 27.3794275438 0.464 0.642931
## COUPON 3.7548860460 12.1940745200 0.308 0.758241
## NEW -2.3955300091 1.8754235621 -1.277 0.201962
## VACATIONYes -35.6444414923 3.6170502952 -9.855 < 0.0000000000000002 ***
## SWYes -40.9696003501 3.7437293472 -10.944 < 0.0000000000000002 ***
## HI 0.0084257891 0.0009900663 8.510 < 0.0000000000000002 ***
## S_INCOME 0.0012066778 0.0005171071 2.334 0.019938 *
## E_INCOME 0.0013742730 0.0003749187 3.666 0.000268 ***
## S_POP 0.0000034009 0.0000006523 5.213 0.0000002525 ***
## E_POP 0.0000043631 0.0000007547 5.781 0.0000000117 ***
## SLOTFree -16.2447682308 3.8468797056 -4.223 0.0000277170 ***
## GATEFree -20.5792299506 4.0015843898 -5.143 0.0000003629 ***
## DISTANCE 0.0749842554 0.0035795488 20.948 < 0.0000000000000002 ***
## PAX -0.0008709429 0.0001459072 -5.969 0.0000000040 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35.47 on 624 degrees of freedom
## Multiple R-squared: 0.7868, Adjusted R-squared: 0.7823
## F-statistic: 177.1 on 13 and 624 DF, p-value: < 0.00000000000000022
par(mfrow=c(2,2))
plot(full_model_1)
For this First Model, the summary shows the significance of each
variable.
The variables COUPOM and NEW did not has a p-value significant.
Therefore the variables Vacation(Yes), SW(yes), Slot(Free) and
Gate(Free) has negative coefficients, i.e., when they have the values
Yes/Yes/Free/Free respectively, each of them reduce the Fare. To sum up,
the Adjusted R-Squared is 78,2% for this model.
The graphics shows some points out of the normality line in the Q-Q Plot (476, 110 and 332).
Using the Cook’s D statistic: cooks.distance
function
use Cook’s D, observations > 4*mean are considered outliers
cooksD <- cooks.distance(full_model_1)
abline_add <- 4*mean(cooksD, na.rm=T)
ggplot(full_model_1, aes(seq_along(.cooksd), cooksD)) +
geom_bar(stat = "identity", position = "identity") +
xlab("Obs. Number") + ylab("Cookss Distance") +
geom_hline(yintercept = abline_add, col="red") +
ggtitle("Cooks distance") + theme_light()
The graphic shows the points above the Red Line are considered outliers.
outlier_rows <-cooksD >4*mean(cooksD)
#head(outlier_rows)
routeAirfares_df2 <- routeAirfares_df1[outlier_rows=="FALSE",]
#Use the vector to "filter" the original DF
head(routeAirfares_df2)
## COUPON NEW VACATION SW HI S_INCOME E_INCOME S_POP E_POP SLOT
## 1 1.00 3 No Yes 5291.99 28637 21112 3036732 205711 Free
## 2 1.06 3 No No 5419.16 26993 29838 3532657 7145897 Free
## 3 1.06 3 No No 9185.28 30124 29838 5787293 7145897 Free
## 4 1.06 3 No Yes 2657.35 29260 29838 7830332 7145897 Controlled
## 5 1.06 3 No Yes 2657.35 29260 29838 7830332 7145897 Free
## 6 1.01 3 No Yes 3408.11 26046 29838 2230955 7145897 Free
## GATE DISTANCE PAX FARE
## 1 Free 312 7864 64.11
## 2 Free 576 8820 174.47
## 3 Free 364 6452 207.76
## 4 Free 612 25144 85.47
## 5 Free 612 25144 85.47
## 6 Free 309 13386 56.76
str(routeAirfares_df2)
## 'data.frame': 601 obs. of 14 variables:
## $ COUPON : num 1 1.06 1.06 1.06 1.06 1.01 1.28 1.15 1.33 1.6 ...
## $ NEW : int 3 3 3 3 3 3 3 3 3 2 ...
## $ VACATION: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 2 1 1 ...
## $ SW : Factor w/ 2 levels "No","Yes": 2 1 1 2 2 2 1 2 2 2 ...
## $ HI : num 5292 5419 9185 2657 2657 ...
## $ S_INCOME: int 28637 26993 30124 29260 29260 26046 28637 26752 27211 25450 ...
## $ E_INCOME: int 21112 29838 29838 29838 29838 29838 29838 29838 29838 29838 ...
## $ S_POP : int 3036732 3532657 5787293 7830332 7830332 2230955 3036732 1440377 3770125 1694803 ...
## $ E_POP : int 205711 7145897 7145897 7145897 7145897 7145897 7145897 7145897 7145897 7145897 ...
## $ SLOT : Factor w/ 2 levels "Controlled","Free": 2 2 2 1 2 2 2 2 2 2 ...
## $ GATE : Factor w/ 2 levels "Constrained",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ DISTANCE: int 312 576 364 612 612 309 1220 921 1249 964 ...
## $ PAX : int 7864 8820 6452 25144 25144 13386 4625 5512 7811 4657 ...
## $ FARE : num 64.1 174.5 207.8 85.5 85.5 ...
The outliers are removed from the database (37), and now the df has 601 observations.
full_model_2 <- lm(FARE~.,data=routeAirfares_df2)
summary(full_model_2)
##
## Call:
## lm(formula = FARE ~ ., data = routeAirfares_df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -76.44 -21.21 -2.18 18.98 94.87
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.6638405279 25.2067956355 -0.225 0.822295
## COUPON 6.3613498966 11.3562933053 0.560 0.575583
## NEW 0.7424641746 1.7628147513 0.421 0.673777
## VACATIONYes -34.2020644798 3.2549215980 -10.508 < 0.0000000000000002 ***
## SWYes -41.6442782155 3.3576682143 -12.403 < 0.0000000000000002 ***
## HI 0.0088384407 0.0009533133 9.271 < 0.0000000000000002 ***
## S_INCOME 0.0012623843 0.0004897804 2.577 0.010196 *
## E_INCOME 0.0013458018 0.0003505830 3.839 0.000137 ***
## S_POP 0.0000039863 0.0000006131 6.502 0.00000000016971 ***
## E_POP 0.0000047381 0.0000007009 6.760 0.00000000003336 ***
## SLOTFree -12.5048811005 3.4705858274 -3.603 0.000341 ***
## GATEFree -19.8111963827 3.6245247044 -5.466 0.00000006818400 ***
## DISTANCE 0.0729480484 0.0033667686 21.667 < 0.0000000000000002 ***
## PAX -0.0009401951 0.0001331062 -7.063 0.00000000000461 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31.13 on 587 degrees of freedom
## Multiple R-squared: 0.82, Adjusted R-squared: 0.816
## F-statistic: 205.6 on 13 and 587 DF, p-value: < 0.00000000000000022
par(mfrow=c(2,2))
plot(full_model_2)
For this Second Model (Database without outliers), the summary shows
the significance of each variable.
The variables COUPOM and NOVO still do not have a significant p-value.
Moreover, the variables Vacation(Yes), SW(yes), Slot(Free) and
Gate(Free) still have a negative coefficient. The Adjusted R-Squared is
81,6% for this model.
full_model_3 <- stepAIC(full_model_2, direction="backward", trace=FALSE)
summary(full_model_3)
##
## Call:
## lm(formula = FARE ~ VACATION + SW + HI + S_INCOME + E_INCOME +
## S_POP + E_POP + SLOT + GATE + DISTANCE + PAX, data = routeAirfares_df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -75.919 -21.707 -1.721 18.988 94.994
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.9035133301 19.2403174837 0.255 0.798923
## VACATIONYes -34.3259074035 3.2450321635 -10.578 < 0.0000000000000002 ***
## SWYes -41.8577542040 3.3339214262 -12.555 < 0.0000000000000002 ***
## HI 0.0087092114 0.0009093637 9.577 < 0.0000000000000002 ***
## S_INCOME 0.0012369141 0.0004869600 2.540 0.011339 *
## E_INCOME 0.0013403991 0.0003493437 3.837 0.000138 ***
## S_POP 0.0000039520 0.0000006102 6.477 0.000000000197883 ***
## E_POP 0.0000047509 0.0000006992 6.794 0.000000000026679 ***
## SLOTFree -12.7759665264 3.4433243832 -3.710 0.000227 ***
## GATEFree -19.9102960703 3.6164414930 -5.505 0.000000055019535 ***
## DISTANCE 0.0743740294 0.0023314574 31.900 < 0.0000000000000002 ***
## PAX -0.0009666576 0.0001226921 -7.879 0.000000000000016 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31.09 on 589 degrees of freedom
## Multiple R-squared: 0.8198, Adjusted R-squared: 0.8164
## F-statistic: 243.6 on 11 and 589 DF, p-value: < 0.00000000000000022
par(mfrow=c(2,2))
plot(full_model_3)
For this Third Model (Database without outliers), the summary also
shows the significance of each variable.
The variable (Intercept) did not have a significant p-value. Moreover,
the variables Vacation(Yes), SW(yes), Slot(Free) and Gate(Free) still
have a negative coefficient. The Adjusted R-Squared is 81,6% for this
model.
All numeric variables are transformed using log
function.
routeAirfares_df2_log <- routeAirfares_df2
routeAirfares_df2_log[,c(1,5:9,12,13,14)] <- log(routeAirfares_df2_log[,c(1,5:9,12,13,14)])
kable(head(routeAirfares_df2_log, 10), caption = "First 10 Lines of routeAirfares_df2_log",
digits = 2, formatr="html") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
COUPON | NEW | VACATION | SW | HI | S_INCOME | E_INCOME | S_POP | E_POP | SLOT | GATE | DISTANCE | PAX | FARE |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0.00 | 3 | No | Yes | 8.57 | 10.26 | 9.96 | 14.93 | 12.23 | Free | Free | 5.74 | 8.97 | 4.16 |
0.06 | 3 | No | No | 8.60 | 10.20 | 10.30 | 15.08 | 15.78 | Free | Free | 6.36 | 9.08 | 5.16 |
0.06 | 3 | No | No | 9.13 | 10.31 | 10.30 | 15.57 | 15.78 | Free | Free | 5.90 | 8.77 | 5.34 |
0.06 | 3 | No | Yes | 7.89 | 10.28 | 10.30 | 15.87 | 15.78 | Controlled | Free | 6.42 | 10.13 | 4.45 |
0.06 | 3 | No | Yes | 7.89 | 10.28 | 10.30 | 15.87 | 15.78 | Free | Free | 6.42 | 10.13 | 4.45 |
0.01 | 3 | No | Yes | 8.13 | 10.17 | 10.30 | 14.62 | 15.78 | Free | Free | 5.73 | 9.50 | 4.04 |
0.25 | 3 | No | No | 8.82 | 10.26 | 10.30 | 14.93 | 15.78 | Free | Free | 7.11 | 8.44 | 5.43 |
0.14 | 3 | Yes | Yes | 8.63 | 10.19 | 10.30 | 14.18 | 15.78 | Free | Free | 6.83 | 8.61 | 4.76 |
0.29 | 3 | No | Yes | 8.45 | 10.21 | 10.30 | 15.14 | 15.78 | Free | Free | 7.13 | 8.96 | 5.15 |
0.47 | 2 | No | Yes | 7.87 | 10.14 | 10.30 | 14.34 | 15.78 | Free | Free | 6.87 | 8.45 | 4.74 |
full_model_4 <- lm(FARE~.,data=routeAirfares_df2_log)
summary(full_model_4)
##
## Call:
## lm(formula = FARE ~ ., data = routeAirfares_df2_log)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.54200 -0.13380 0.00438 0.12776 0.58055
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.679652 1.104340 -5.143 0.00000036897000607 ***
## COUPON 0.003814 0.104332 0.037 0.970854
## NEW -0.010475 0.011026 -0.950 0.342510
## VACATIONYes -0.208164 0.020161 -10.325 < 0.0000000000000002 ***
## SWYes -0.351735 0.021029 -16.727 < 0.0000000000000002 ***
## HI 0.097026 0.026102 3.717 0.000221 ***
## S_INCOME 0.347635 0.082357 4.221 0.00002816958245335 ***
## E_INCOME 0.271137 0.059840 4.531 0.00000711343967710 ***
## S_POP 0.090004 0.011143 8.077 0.00000000000000378 ***
## E_POP 0.108469 0.011079 9.790 < 0.0000000000000002 ***
## SLOTFree -0.110045 0.020597 -5.343 0.00000013118957467 ***
## GATEFree -0.133537 0.022152 -6.028 0.00000000293238855 ***
## DISTANCE 0.372717 0.019018 19.598 < 0.0000000000000002 ***
## PAX -0.166088 0.016457 -10.092 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.194 on 587 degrees of freedom
## Multiple R-squared: 0.8452, Adjusted R-squared: 0.8418
## F-statistic: 246.5 on 13 and 587 DF, p-value: < 0.00000000000000022
par(mfrow=c(2,2))
plot(full_model_4)
For this Fourth Model (Database without outliers - Log transformed),
the summary shows the significance of each variable.
The variables COUPOM and NOVO still do not have a significant p-value.
Moreover, the variables Vacation(Yes), SW(yes), Slot(Free), Gate(Free)
and PAX had a negative coefficient. The Adjusted R-Squared is 84,2% for
this model.
full_model_5 <- stepAIC(full_model_4, direction="backward", trace=FALSE)
summary(full_model_5)
##
## Call:
## lm(formula = FARE ~ VACATION + SW + HI + S_INCOME + E_INCOME +
## S_POP + E_POP + SLOT + GATE + DISTANCE + PAX, data = routeAirfares_df2_log)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.54562 -0.12529 0.00372 0.12560 0.57859
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.64645 1.09709 -5.147 0.00000036167328782 ***
## VACATIONYes -0.20821 0.02009 -10.366 < 0.0000000000000002 ***
## SWYes -0.35191 0.02089 -16.847 < 0.0000000000000002 ***
## HI 0.09446 0.02381 3.967 0.00008175901020489 ***
## S_INCOME 0.34753 0.08223 4.226 0.00002753777426602 ***
## E_INCOME 0.26836 0.05971 4.494 0.00000841007056412 ***
## S_POP 0.08976 0.01112 8.071 0.00000000000000395 ***
## E_POP 0.10844 0.01102 9.843 < 0.0000000000000002 ***
## SLOTFree -0.10917 0.02050 -5.326 0.00000014334230587 ***
## GATEFree -0.13351 0.02213 -6.033 0.00000000284908566 ***
## DISTANCE 0.37212 0.01287 28.907 < 0.0000000000000002 ***
## PAX -0.16645 0.01384 -12.026 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1939 on 589 degrees of freedom
## Multiple R-squared: 0.845, Adjusted R-squared: 0.8421
## F-statistic: 291.8 on 11 and 589 DF, p-value: < 0.00000000000000022
For this Fifth Model (Database without outliers - Log transformed -
Stepwise), the summary shows the significance of each variable.
All variables has significant p-values. The attention is for variables
Vacation(Yes), SW(yes), Slot(Free), Gate(Free) and PAX, because they had
negative coefficients. The Adjusted R-Squared is 84,2% for this
model.
List_AIC <- list(full_model_2,full_model_3,full_model_4,full_model_5)
List_AIC_names <- c('full_model_2','full_model_3','full_model_4','full_model_5')
aictab(cand.set = List_AIC, modnames = List_AIC_names)
##
## Model selection based on AICc:
##
## K AICc Delta_AICc AICcWt Cum.Wt LL
## full_model_5 13 -252.01 0.00 0.84 0.84 139.31
## full_model_4 15 -248.73 3.27 0.16 1.00 139.78
## full_model_3 13 5851.34 6103.34 0.00 1.00 -2912.36
## full_model_2 15 5855.04 6107.05 0.00 1.00 -2912.11
The best model, following the AIC criteria is the Model 5 - Stepwase with Log transformed Variables.
The Fifth model shows the better AIC, and the analysis of the coefficients are bellow:
The HI(Herfindel Index), S_INCOME (Starting city’s average personal income), E_INCOME(End city’s average personal income), S_POP(Starting city’s population), E_POP(End city’s population) and Distance had positive coefficients. When these variables increase, the Fare will increase too.
Vacation(Yes), SW(Yes), Slot(Free), Gate(Free) and Pax(Number of
passengers) had negative coefficients. To sum up, in all these cases the
Fare becomes lower:
- when the route is a vacation route - when the route has SW
Operation
- when the route has either endpoint airport slot free of controls
- when the route has either endpoint airport has a gate free of
constraints
The company could estimate the fare using the variables and optimize the fare when analyzing the positive and negative coefficients to decide on each route it will operate. For example, if two routes have similar variables, but one of them has a Slot(Free) and Gate(Free), the airlines could choose the other option to maximize the fare because Slot(Free) and Gate(Free) affect a Fare in a negative way (decrease).
The database was split using sample.split
. The database
with transformed variables was split using the same vector.
set.seed(1982)
train_prt <- sample.split(routeAirfares_df2$FARE, SplitRatio = 0.65)
routeAirfares_df2_train <- subset(routeAirfares_df2, train_prt==TRUE)
routeAirfares_df2_test <- subset(routeAirfares_df2, train_prt==FALSE)
routeAirfares_df2_log_train <- subset(routeAirfares_df2_log, train_prt==TRUE)
routeAirfares_df2_log_test <- subset(routeAirfares_df2_log, train_prt==FALSE)
nrow(routeAirfares_df2)
## [1] 601
nrow(routeAirfares_df2_train)
## [1] 390
nrow(routeAirfares_df2_test)
## [1] 211
nrow(routeAirfares_df2_log)
## [1] 601
nrow(routeAirfares_df2_log_train)
## [1] 390
nrow(routeAirfares_df2_log_test)
## [1] 211
Model 5th is the best model, but here all models was ran to compare their predictions:
full_model_2_train <- lm(FARE~.,data=routeAirfares_df2_train)
full_model_3_train <- stepAIC(full_model_2_train, direction="backward", trace=FALSE)
full_model_4_train <- lm(FARE~.,data=routeAirfares_df2_log_train)
full_model_5_train <- stepAIC(full_model_4_train, direction="backward", trace=FALSE)
predict
function to perform predictions for all modelsfull_model_2_pred <- predict(full_model_2_train, routeAirfares_df2_test)
full_model_3_pred <- predict(full_model_3_train, routeAirfares_df2_test)
full_model_4_pred <- predict(full_model_4_train, routeAirfares_df2_log_test)
full_model_5_pred <- predict(full_model_5_train, routeAirfares_df2_log_test)
Comparison of the predicted values vs. actual values
round(accuracy(full_model_2_pred, routeAirfares_df2_test$FARE),4)
## ME RMSE MAE MPE MAPE
## Test set 0.8599 31.4738 24.8809 -1.5112 18.7165
round(accuracy(full_model_3_pred, routeAirfares_df2_test$FARE),4)
## ME RMSE MAE MPE MAPE
## Test set 0.8838 31.3079 24.7557 -1.5716 18.6155
round(accuracy(full_model_4_pred, routeAirfares_df2_log_test$FARE),4)
## ME RMSE MAE MPE MAPE
## Test set 0.0004 0.1987 0.1601 -0.0778 3.3008
round(accuracy(full_model_5_pred, routeAirfares_df2_log_test$FARE),4)
## ME RMSE MAE MPE MAPE
## Test set 0.0005 0.1979 0.1588 -0.0755 3.2752
Model 5th has better predictive accuracy than the others. Thus, model 5th will be used for predictions in the next steps.
df_pred <- data.frame("Predicted" = full_model_5_pred,
"Actual" = routeAirfares_df2_log_test$FARE,
"Residual" = full_model_5_pred - routeAirfares_df2_log_test$FARE)
df_pred_real <- exp(df_pred)
df_pred_real$Residual <- (df_pred_real$Predicted-df_pred_real$Actual)
kable(head(df_pred_real, 20), caption = "First 20 Lines of Precited, Actual ans Residual Values",
digits = 2, formatr="html") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Predicted | Actual | Residual | |
---|---|---|---|
1 | 58.88 | 64.11 | -5.23 |
7 | 268.45 | 228.00 | 40.45 |
8 | 116.03 | 116.54 | -0.51 |
10 | 141.04 | 114.76 | 26.28 |
12 | 226.41 | 228.99 | -2.58 |
25 | 117.14 | 113.50 | 3.64 |
27 | 75.12 | 69.12 | 6.00 |
30 | 119.65 | 134.30 | -14.65 |
38 | 188.22 | 234.15 | -45.93 |
39 | 209.66 | 203.17 | 6.49 |
43 | 148.13 | 106.60 | 41.53 |
44 | 134.90 | 106.60 | 28.30 |
45 | 148.88 | 136.27 | 12.61 |
53 | 79.15 | 69.10 | 10.05 |
54 | 75.62 | 69.10 | 6.52 |
55 | 127.65 | 91.83 | 35.82 |
57 | 81.96 | 57.62 | 24.34 |
63 | 188.80 | 157.20 | 31.60 |
66 | 166.08 | 143.59 | 22.49 |
67 | 74.90 | 75.07 | -0.17 |
ggplot(df_pred_real, aes(Residual)) +
geom_histogram(fill = "orange", colour="lightgray", bins = 30) +
ggtitle("Hist. Residual of Predictions") +
theme_light()
Prediction of the Fare considering the values of variables are:
- COUPON = 1.202
- NEW = 3
- VACATION = No
- SW = No
- HI = 4442.141
- S_INCOME = $28,760
- E_INCOME = $27,664
- S_POP = 4,557,004
- E_POP = 3,195,503
- SLOT = Free
- GATE = Free
- PAX = 12,782 and
- DISTANCE = 1976 miles.
pred_data1 <- data.frame(COUPON = 1.202, NEW = 3, VACATION ="No", SW = "No", HI = 4442.141, S_INCOME = 28760,
E_INCOME = 27664, S_POP = 4557004, E_POP = 3195503, SLOT = "Free", GATE = "Free",
PAX = 12782, DISTANCE = 1976)
#Log transformation
pred_data1_log <- pred_data1
pred_data1_log[,c(1,5:9,12,13)] <- log(pred_data1_log[,c(1,5:9,12,13)])
predict_fare1 <- predict(full_model_5_train, pred_data1_log, interval="prediction", level=0.05)
print(exp(predict_fare1))
## fit lwr upr
## 1 242.9216 239.9687 245.9108
For this group of variables the prediction of Fare was Usd 243, with the range between Usd 240 and Usd 246.
Predict the reduction in average fare on the route described in step 11 above if SW Airlines decides to cover this route (i.e., SW = Yes)
pred_data2 <- data.frame(COUPON = 1.202, NEW = 3, VACATION ="No", SW = "Yes", HI = 4442.141, S_INCOME = 28760,
E_INCOME = 27664, S_POP = 4557004, E_POP = 3195503, SLOT = "Free", GATE = "Free",
PAX = 12782, DISTANCE = 1976)
#Log transform
pred_data2_log <- pred_data2
pred_data2_log[,c(1,5:9,12,13)] <- log(pred_data2_log[,c(1,5:9,12,13)])
predict_fare2 <- predict(full_model_5_train, pred_data2_log, interval="prediction", level=0.05)
print(exp(predict_fare2))
## fit lwr upr
## 1 166.097 164.0715 168.1475
Considering the same value of the group variables above cited, but
changing the SW Operation to “Yes”, the prediction of Fare was Usd 166,
with the range between Usd 164 and Usd 168.
Thus, the impact of SW start operate this route is reduce the Average
Fare in Usd 76.82 compare to not operate the route.
The SW Airline is a low cost company. When it starts a operation in
one route, the average of Fare reduce.
Thus, to maximize its fare, will be necessary analyze the other
variables of the route.
As mentioned before, the modelling shows a list of variables that impact
in reduction of fare.
Because is knew that its Fare is lower than other companies, the SW
needs to evaluate the other variables. One example could be avoid the
routes where Vacation is Yes, Slot is Free and Gate is Free. And
prioritize the routes which has a higher HI(Herfindel Index), S_INCOME
(Starting city’s average personal income), E_INCOME(End city’s average
personal income), S_POP(Starting city’s population), E_POP(End city’s
population) and Distance.
In competitive industries, A new entrant with a novel business model can have a disruptive effect on existing firms. If a new entrant’s business model is sustainable, other players are forced to respond by changing their business practices.
If the goal of the analysis was to evaluate the effect of Southwest Airlines’ presence on the airline industry rather than predicting fares on new routes, how would the analysis be different? Describe the conceptual and technical aspects.
In an analysis to assess the impact of SW Airline’s entry into
operation, the SW variable can be used as a response variable in a
Logistic Regression.
In this way, each of the other variables can be used as predictors of
the variable SW.
With this, it would be possible to evaluate which other routes there is
a greater probability of entering the SW, and still, using the multiple
regression model (5th) it would be possible to evaluate the impact on
the Average Fare of these routes.