ST 558 Project 2
Kaylee Frazier and Rebecca Voelker 10/31/2021
#Read in Libraries
library(tidyverse)
library(readr)
library(ggplot2)
library(randomForest)
library(shiny)
library(dplyr)
library(caret)
## Check Working Directory Path
getwd()
## [1] "C:/Users/Rebecca/OneDrive/Desktop/ST 558/Repos/Project2"
## Read in Raw Data Using Relative Path
rawData <- read_csv(file = "./project2_rawdata.csv")
rawData
## # A tibble: 39,644 x 61
## url timedelta n_tokens_title n_tokens_content n_unique_tokens
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 http://ma~ 731 12 219 0.664
## 2 http://ma~ 731 9 255 0.605
## 3 http://ma~ 731 9 211 0.575
## 4 http://ma~ 731 9 531 0.504
## 5 http://ma~ 731 13 1072 0.416
## 6 http://ma~ 731 10 370 0.560
## 7 http://ma~ 731 8 960 0.418
## 8 http://ma~ 731 12 989 0.434
## 9 http://ma~ 731 11 97 0.670
## 10 http://ma~ 731 10 231 0.636
## # ... with 39,634 more rows, and 56 more variables:
## # n_non_stop_words <dbl>, n_non_stop_unique_tokens <dbl>,
## # num_hrefs <dbl>, num_self_hrefs <dbl>, num_imgs <dbl>,
## # num_videos <dbl>, average_token_length <dbl>, num_keywords <dbl>,
## # data_channel_is_lifestyle <dbl>,
## # data_channel_is_entertainment <dbl>, data_channel_is_bus <dbl>,
## # data_channel_is_socmed <dbl>, data_channel_is_tech <dbl>, ...
## Create a New Variable to Data Channel
rawDataNew <- rawData %>% mutate(data_channel = if_else(data_channel_is_bus == 1, "Business",
if_else(data_channel_is_entertainment == 1, "Entertainment",
if_else(data_channel_is_lifestyle == 1, "Lifestyle",
if_else(data_channel_is_socmed == 1, "Social Media",
if_else(data_channel_is_tech == 1, "Tech", "World"))))))
## Subset Data for Respective Data Channel
subsetData <- rawDataNew %>% filter(data_channel == params$data_channel)
## Create Training and Test Data Sets
set.seed(500)
trainIndex <- createDataPartition(subsetData$shares, p = 0.7, list = FALSE)
trainData <- subsetData[trainIndex,]
testData <- subsetData[-trainIndex,]
trainData
## # A tibble: 10,195 x 62
## url timedelta n_tokens_title n_tokens_content n_unique_tokens
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 http://ma~ 731 10 231 0.636
## 2 http://ma~ 731 9 1248 0.490
## 3 http://ma~ 731 9 274 0.609
## 4 http://ma~ 731 9 285 0.744
## 5 http://ma~ 731 8 259 0.563
## 6 http://ma~ 731 12 682 0.460
## 7 http://ma~ 731 11 125 0.675
## 8 http://ma~ 731 10 597 0.532
## 9 http://ma~ 731 11 799 0.504
## 10 http://ma~ 731 9 474 0.609
## # ... with 10,185 more rows, and 57 more variables:
## # n_non_stop_words <dbl>, n_non_stop_unique_tokens <dbl>,
## # num_hrefs <dbl>, num_self_hrefs <dbl>, num_imgs <dbl>,
## # num_videos <dbl>, average_token_length <dbl>, num_keywords <dbl>,
## # data_channel_is_lifestyle <dbl>,
## # data_channel_is_entertainment <dbl>, data_channel_is_bus <dbl>,
## # data_channel_is_socmed <dbl>, data_channel_is_tech <dbl>, ...
testData
## # A tibble: 4,366 x 62
## url timedelta n_tokens_title n_tokens_content n_unique_tokens
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 http://ma~ 731 8 1118 0.512
## 2 http://ma~ 731 9 391 0.510
## 3 http://ma~ 731 9 443 0.467
## 4 http://ma~ 729 7 1174 0.458
## 5 http://ma~ 729 9 167 0.681
## 6 http://ma~ 729 11 825 0.477
## 7 http://ma~ 729 7 105 0.819
## 8 http://ma~ 729 12 537 0.538
## 9 http://ma~ 729 10 138 0.754
## 10 http://ma~ 729 11 459 0.563
## # ... with 4,356 more rows, and 57 more variables:
## # n_non_stop_words <dbl>, n_non_stop_unique_tokens <dbl>,
## # num_hrefs <dbl>, num_self_hrefs <dbl>, num_imgs <dbl>,
## # num_videos <dbl>, average_token_length <dbl>, num_keywords <dbl>,
## # data_channel_is_lifestyle <dbl>,
## # data_channel_is_entertainment <dbl>, data_channel_is_bus <dbl>,
## # data_channel_is_socmed <dbl>, data_channel_is_tech <dbl>, ...
=======
#Create New Variable that Measures Title Length
trainData <- trainData %>% mutate(TitleLength = if_else(n_tokens_title <= 10, "Short Title",
if_else(n_tokens_title <= 15, "Medium Title", "Long Title")))
#Create New variable using weekday_is_() variables
trainDataNew <- trainData %>% mutate(day_of_the_week = if_else(weekday_is_monday == 1, "Monday",
if_else(weekday_is_tuesday == 1, "Tuesday",
if_else(weekday_is_wednesday == 1, "Wednesday",
if_else(weekday_is_thursday == 1, "Thursday",
if_else(weekday_is_friday == 1, "Friday",
if_else(weekday_is_saturday == 1, "Saturday", "Sunday"
)))))))
trainDataNew
## # A tibble: 10,195 x 64
## url timedelta n_tokens_title n_tokens_content n_unique_tokens
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 http://ma~ 731 10 231 0.636
## 2 http://ma~ 731 9 1248 0.490
## 3 http://ma~ 731 9 274 0.609
## 4 http://ma~ 731 9 285 0.744
## 5 http://ma~ 731 8 259 0.563
## 6 http://ma~ 731 12 682 0.460
## 7 http://ma~ 731 11 125 0.675
## 8 http://ma~ 731 10 597 0.532
## 9 http://ma~ 731 11 799 0.504
## 10 http://ma~ 731 9 474 0.609
## # ... with 10,185 more rows, and 59 more variables:
## # n_non_stop_words <dbl>, n_non_stop_unique_tokens <dbl>,
## # num_hrefs <dbl>, num_self_hrefs <dbl>, num_imgs <dbl>,
## # num_videos <dbl>, average_token_length <dbl>, num_keywords <dbl>,
## # data_channel_is_lifestyle <dbl>,
## # data_channel_is_entertainment <dbl>, data_channel_is_bus <dbl>,
## # data_channel_is_socmed <dbl>, data_channel_is_tech <dbl>, ...
#Convert is_weekend to a Factor
trainData$is_weekend <- factor(trainData$is_weekend)
#Table of Publications on Weekdays vs. Weekend
table(trainData$is_weekend)
##
## 0 1
## 8749 1446
#Create a Scatter Plot of # of Words by Shares
g1 <- ggplot(trainData, aes(x = n_tokens_content, y = shares)) +
geom_point(aes(col = is_weekend)) +
geom_smooth(method = "lm", aes(col = is_weekend))
g1
#Create a Box Plot of Weekend vs. Weekday by Shares
g2 <- ggplot(trainData, aes(x = is_weekend, y = shares)) +
geom_boxplot()
g2
#Generate a Numerical Summary of Data Summarized in our Box Plot
trainData %>% group_by(is_weekend) %>% summarise(min = min(shares), med = median(shares), max = max(shares), mean = mean(shares))
## # A tibble: 2 x 5
## is_weekend min med max mean
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 0 4 1300 843300 3761.
## 2 1 43 1700 617900 4186.
table(trainData$n_tokens_title)
##
## 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 3 28 150 456 1026 1651 2004 1820 1411 918 413 202 79 25
## 18 19 20
## 7 1 1
# Create a Scatter Plot of Polarity of Title by Shares
g3 <- ggplot(trainData, aes(x = abs_title_sentiment_polarity, y = shares)) +
geom_point(aes(col = TitleLength)) +
geom_smooth(method = "lm", aes(col = TitleLength))
g3
## `geom_smooth()` using formula 'y ~ x'
# Numerical Correlation of Polarity of Title by Shares
cor(trainData$abs_title_sentiment_polarity, trainData$shares)
## [1] 0.03942311
#find mean, median, and standard deviation of shares by day_of_the_week
daySummary <-trainDataNew %>% group_by(day_of_the_week) %>% summarise(avg_shares = mean(shares), med_shares = median(shares), sd_shares = sd(shares))
#print out numerical summary
daySummary
## # A tibble: 7 x 4
## day_of_the_week avg_shares med_shares sd_shares
## <chr> <dbl> <dbl> <dbl>
## 1 Friday 3768. 1400 9636.
## 2 Monday 4225. 1300 12150.
## 3 Saturday 4897. 1800 25110.
## 4 Sunday 3587. 1600 6705.
## 5 Thursday 3686. 1300 9842.
## 6 Tuesday 3480. 1200 9848.
## 7 Wednesday 3705. 1200 21081.
#create a basic plot with x and y variables
g5 <- ggplot(data = trainDataNew, aes(day_of_the_week, shares))
#make the plot a bar chart
g5 + geom_col(aes(fill = day_of_the_week), position = "dodge") +
#change the angle of the words to fit
guides(x = guide_axis(angle = 45)) +
labs(title = "Day of the Week vs. Shares") +
#get rid of legend
theme(legend.position = "none")
#create summary table with different ranges of images
imagesSummary <- trainData %>% group_by(cut(num_imgs, c(min(num_imgs), 10, 20, 40, 60, 80, max(num_imgs)))) %>% summarise(avg_shares = mean(shares), med_shares = median(shares), sd_shares = sd(shares))
#rename column name
names(imagesSummary)[names(imagesSummary) == "cut(num_imgs, c(min(num_imgs), 10, 20, 40, 60, 80, max(num_imgs)))"] <- "image_number"
#print out numerical summary
imagesSummary
## # A tibble: 7 x 4
## image_number avg_shares med_shares sd_shares
## <fct> <dbl> <dbl> <dbl>
## 1 (0,10] 2721. 1200 6863.
## 2 (10,20] 6342. 1900 29974.
## 3 (20,40] 5968. 2000 13209.
## 4 (40,60] 6195. 1800 11248.
## 5 (60,80] 4459. 4900 3267.
## 6 (80,100] 5017. 3550 4218.
## 7 <NA> 4853. 1500 13266.
#create basic plot with x and y
g6 <- ggplot(trainData, aes(x = num_imgs, y = shares))
#make it a scatter plot with regression line
g6 + geom_point() +
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
#create basic plot with x and y
g7 <- ggplot(trainData, aes(x = num_videos, y = shares))
#make it a scatter plot with regression line
g7 + geom_point() +
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
## Convert is_weekend to Numeric
trainData$is_weekend <- as.numeric(trainData$is_weekend)
## Fit Multiple Linear Regression Model
fitLM1 <- train(shares ~ n_tokens_content + is_weekend, data = trainData, method = "lm", trControl = trainControl(method = "cv", number = 10))
fitLM1
## Linear Regression
##
## 10195 samples
## 2 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 9176, 9175, 9175, 9175, 9177, 9175, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 12370.79 0.005442263 3981.896
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
## Fit Multiple Linear Regression Model
fitLM2 <- train(shares ~ abs_title_sentiment_polarity^2 + TitleLength, data = trainData, method = "lm", trControl = trainControl(method = "cv", number = 10))
fitLM2
## Linear Regression
##
## 10195 samples
## 2 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 9175, 9176, 9174, 9176, 9176, 9175, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 12485.77 0.004528442 3978.979
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
#create fit using shares as response, day_of_the_week, num_imgs, and num_videos as the predictors
fitLM3 <- train(shares ~ day_of_the_week + num_imgs + num_videos, data = trainDataNew,
#method is linear model
method = "lm",
#center and scale the data
preProcess = c("center", "scale"),
#10 fold cross validation
trControl = trainControl(method = "cv", number = 10))
fitLM3
## Linear Regression
##
## 10195 samples
## 3 predictor
##
## Pre-processing: centered (8), scaled (8)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 9176, 9175, 9176, 9176, 9176, 9176, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 12449.77 0.008486067 3925.83
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
#use train function
rfFit <- train(shares ~ day_of_the_week + num_imgs + num_videos + n_tokens_content, data = trainDataNew,
#use rf method
method = "rf",
#5 fold validation
trControl = trainControl(method = "cv", number = 5),
#consider values of mtry
tuneGrid = data.frame(mtry = 1:4))
rfFit
## Random Forest
##
## 10195 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 8156, 8156, 8155, 8156, 8157
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 1 12504.27 0.015209356 3906.988
## 2 12548.17 0.011646230 3873.410
## 3 12706.47 0.008322364 3918.735
## 4 12910.71 0.006639208 4008.710
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 1.
n.trees <- c(5, 10, 25, 100)
interaction.depth <- 1:4
shrinkage <- 0.1
n.minobsinnode <- 10
df <- expand.grid(n.trees = n.trees, interaction.depth = interaction.depth, shrinkage = shrinkage, n.minobsinnode = n.minobsinnode)
boostFit <- train(shares ~ abs_title_sentiment_polarity^2 + TitleLength, data = trainData, method = "gbm", trControl = trainControl(method = "cv", number = 5), tuneGrid = df)
boostFit$results
boostFit$bestTune
#Add New Variables (Title Length and Day of the Week) to Test Data
testData <- testData %>% mutate(TitleLength = if_else(n_tokens_title <= 10, "Short Title",
if_else(n_tokens_title <= 15, "Medium Title", "Long Title")))
testData <- testData %>% mutate(day_of_the_week = if_else(weekday_is_monday == 1, "Monday",
if_else(weekday_is_tuesday == 1, "Tuesday",
if_else(weekday_is_wednesday == 1, "Wednesday",
if_else(weekday_is_thursday == 1, "Thursday",
if_else(weekday_is_friday == 1, "Friday",
if_else(weekday_is_saturday == 1, "Saturday", "Sunday"
)))))))
## Obtain RMSE on Test Data for Linear Models
predLM1 <- predict(fitLM1, newdata = testData)
postResample(predLM1, obs = testData$shares)
## RMSE Rsquared MAE
## 1.228486e+04 3.486440e-03 3.744354e+03
predLM2 <- predict(fitLM2, newdata = testData)
postResample(predLM2, obs = testData$shares)
## RMSE Rsquared MAE
## 1.230156e+04 6.109966e-04 4.002990e+03
predLM3 <- predict(fitLM3, newdata = testData)
postResample(predLM3, obs = testData$shares)
## RMSE Rsquared MAE
## 1.225466e+04 7.036456e-03 3.935479e+03
## Obtain RMSE on Test Data for Random Forest Model
predRF <- predict(rfFit, newdata = testData)
postResample(predRF, testData$shares)
## RMSE Rsquared MAE
## 1.222704e+04 1.714757e-02 3.921356e+03
## Obtain RMSE on Test Data for Boosted Fit Model
predBoost <- predict(boostFit, newdata = testData)
postResample(predBoost, testData$shares)
## RMSE Rsquared MAE
## 1.229560e+04 7.425707e-04 3.992220e+03