##############################################################


# Project done by Katelin Bauer and Abigail Tata

##############################################################

library(tidyverse)
library(dplyr)
library(corrplot)
library(tidyverse)
library(dplyr)
library(Matrix)
library(ggplot2)

# Load the raw country-level data set.
# The project folder is kept in one variable and joined to the file name with
# file.path(), so the script no longer changes the session's working directory.
project_dir <- "/Users/abigailtata/Desktop/Regression/project"
WD <- read.csv(file.path(project_dir, "world-data-2023!.csv"))
WD

# Open the data viewer for a visual check of the import.
view(WD)

# Drop every row that contains at least one missing value, then show the result.
WD <- na.omit(WD)
WD
as_tibble(WD) # as.tibble() is deprecated; as_tibble() is its replacement

# Coerce the quantitative columns to double and add the log-transformed
# response, GDPlog = log(GDP).
#
# Armed.Forces.size and Unemployment.rate are deliberately NOT coerced here:
# the rest of this script treats them as two-level categorical predictors
# (BIG/SMALL and BAD/GOOD). as.double() on such labels returns NA, and the
# later na.omit() would then remove every row.
# NOTE(review): as.double() also returns NA for text such as "1,234" or
# "$5.00" -- confirm the CSV stores the columns below as plain numbers.
WD <- WD %>%
  mutate(
    Land.Area.Km2. = as.double(Land.Area.Km2.),
    Co2.Emissions = as.double(Co2.Emissions),
    Gasoline.Price = as.double(Gasoline.Price),
    GDP = as.double(GDP),
    Maternal.mortality.ratio = as.double(Maternal.mortality.ratio),
    Minimum.wage = as.double(Minimum.wage),
    Population..Labor.force.participation.... =
      as.double(Population..Labor.force.participation....),
    Population = as.double(Population),
    # GDP was converted to double just above, so log() already returns double
    # and no second coercion of GDPlog is needed.
    GDPlog = log(GDP)
  )

WD

# Give the columns used in the analysis short names. Columns are addressed by
# position; the two vectors below are aligned element by element
# (position 5 -> "Area", position 6 -> "ForceSize", ...).
colnames(WD)[c(
  5, 6, 8, 10,
  11, 21, 23, 25,
  26, 28, 29, 33
)] <- c(
  "Area", "ForceSize", "birthrate", "Capital",
  "CO2emissions", "infantmortality", "lifeexpectancy", "minimumwage",
  "officiallanguage", "physiciansperthousand", "population", "unemploymentrate"
)
WD

##############################################################################
#1) Offer a preliminary description of the data set. For example, indicate the size of the
#data source, describe the variables, and include any other data profile information
#that would be of interest.
##############################################################################


#This dataset provides information about all countries worldwide. This data covers demographic 
#statistics, economic indicators, environmental factors, healthcare metrics, education statistics, and more. 

# Analysis data set: keep only the columns used in this project.
# GDPlog (the response of every model below) is included here; without it
# lm(GDPlog ~ ..., data = worlddata) and worlddata$GDPlog cannot find the
# column.
worlddata <- WD %>%
  select(
    Country, Area, ForceSize, birthrate, Capital, Gasoline.Price, GDP, GDPlog,
    infantmortality, lifeexpectancy, minimumwage, officiallanguage,
    physiciansperthousand, population, unemploymentrate
  )

# Remove rows with a missing value in any of the selected columns.
worlddata <- na.omit(worlddata)
worlddata

# Number of countries (rows) left for the analysis.
count(worlddata)


#Country: Name of the country.
#Area: Total land area of the country in square kilometers.
#ForceSize: Size of the armed forces in the country. levels big or small
#Birth Rate: Number of births per 1,000 population per year.
#Capital: Name of the capital or major city.
#Gasoline_Price: Price of gasoline per liter in local currency.
#GDP: Gross Domestic Product, the total value of goods and services produced in the country.
#Infant Mortality: Number of deaths per 1,000 live births before reaching one year of age.
#Life Expectancy: Average number of years a newborn is expected to live.
#Minimum Wage: Minimum wage level in local currency.
#Official Language: Official language(s) spoken in the country.
#Physicians per Thousand: Number of physicians per thousand people.
#Population: Total population of the country.
#Population: Labor Force Participation (%): Percentage of the population that is part of the labor force.
#Unemployment Rate: Percentage of the labor force that is unemployed.


#########################################################################
#2) Generate relevant data visual plots that explore multicollinearity for the
#quantitative variables and normality for the quantitative variables as well. Also, use
#R code to confirm the levels of the categorical variables.
#########################################################################


#1. Levels of the categorical variables
# levels() lists only the distinct categories of each variable.
levels(factor(worlddata$ForceSize))
levels(factor(worlddata$unemploymentrate))


# Quantitative variables only. This must be created BEFORE the correlation
# code below, which reads worlddata2.
worlddata2 <- worlddata %>%
  select(
    Area, birthrate, Gasoline.Price, GDP, infantmortality, lifeexpectancy,
    minimumwage, physiciansperthousand, population
  )

worlddata2

#multicollinearity
# Compute the correlation matrix once and reuse it for the table and the plot.
cor_matrix <- cor(worlddata2)
cor_matrix
corrplot(cor_matrix, method = "number")


#2 PLOTS

# Scatter plots of log(GDP) against each quantitative predictor.
# worlddata2 has no GDPlog column, so the plotting data derives it (and the
# log of population) from the GDP and population columns it does contain.
plotdata <- worlddata2 %>%
  mutate(GDPlog = log(GDP), logpop = log(population))

#minimum wage
# (no trailing "+" here: it would try to add the next ggplot() to this plot)
ggplot(data = plotdata) +
  geom_point(mapping = aes(x = minimumwage, y = GDPlog))

#physiciansperthousand
ggplot(data = plotdata) +
  geom_point(mapping = aes(x = physiciansperthousand, y = GDPlog))

#birthrate
ggplot(data = plotdata) +
  geom_point(mapping = aes(x = birthrate, y = GDPlog))

#Gasoline.Price
ggplot(data = plotdata) +
  geom_point(mapping = aes(x = Gasoline.Price, y = GDPlog))

#infantmortality
ggplot(data = plotdata) +
  geom_point(mapping = aes(x = infantmortality, y = GDPlog))

#lifeexpectancy
ggplot(data = plotdata) +
  geom_point(mapping = aes(x = lifeexpectancy, y = GDPlog))

#population
ggplot(data = plotdata) +
  geom_point(mapping = aes(x = population, y = GDPlog))

# log population
logpop <- log(worlddata2$population)
logpop

# log population
# aes() finds logpop as a column of plotdata (same values as the vector above).
ggplot(data = plotdata) +
  geom_point(mapping = aes(x = logpop, y = GDPlog))


##############################################################################
#3) Using R code, produce a full Regression Model that consists of quantitative and
#categorical variables. Make use of the R generated dummy variable matrices
##############################################################################


# Full model: log(GDP) regressed on three quantitative predictors
# (physiciansperthousand, lifeexpectancy, minimumwage) and two categorical
# predictors (unemploymentrate, ForceSize).
model1 <- lm(
  GDPlog ~ physiciansperthousand + unemploymentrate + lifeexpectancy +
    minimumwage + ForceSize,
  data = worlddata
)
model1
#GDPlog= 22.72123 + 0.17990(physiciansperthousand) +0.65635(unemploymentrateGOOD) + 0.03721(lifeexpectancy) + 0.19185(minimumwage) -2.54126(ForceSizeSMALL)

summary(model1)


# Dummy-variable coding for unemploymentrate (recorded output follows).
contrasts(as.factor(worlddata$unemploymentrate))

#GOOD
#BAD     0
#GOOD    1

# Dummy-variable coding for ForceSize (recorded output follows).
contrasts(as.factor(worlddata$ForceSize))

#SMALL
#BIG       0
#SMALL     1

##############################################################################
#4) Using only the quantitative variables as predictors, produce a model using matrix
#methods. Also use matrix methods to find the fitted values and the residuals
##############################################################################

# One single-column matrix per quantitative predictor.
minimumwage1 <- as.matrix(worlddata$minimumwage)
physiciansperthousand1 <- as.matrix(worlddata$physiciansperthousand)
birthrate1 <- as.matrix(worlddata$birthrate)
Gasoline.Price1 <- as.matrix(worlddata$Gasoline.Price)
infantmortality1 <- as.matrix(worlddata$infantmortality)
lifeexpectancy1 <- as.matrix(worlddata$lifeexpectancy)

# Design matrix: the leading 1 is recycled into a full column of ones for the
# intercept, followed by the six predictor columns.
Xm <- cbind(
  c(1), minimumwage1, physiciansperthousand1, birthrate1,
  Gasoline.Price1, infantmortality1, lifeexpectancy1
)
Xm

# Response vector as a single-column matrix.
Ym <- as.matrix(worlddata$GDPlog)
Ym

# Least-squares coefficients: b = (X'X)^(-1) X'Y
int_slope <- solve(t(Xm) %*% Xm) %*% t(Xm) %*% Ym
int_slope

# Fitted values by matrix methods: Y-hat = X b
fitted_m <- Xm %*% int_slope
fitted_m

# Residuals by matrix methods: e = Y - Y-hat
resid_m <- Ym - fitted_m
resid_m

# Dr. Dickens's Code

worlddata

as_tibble(worlddata)

# Response as a single-column matrix.
Y <- matrix(worlddata$GDPlog)
Y

# Number of observations is taken from the data instead of being typed in, so
# the matrices stay the right size if the number of complete rows changes.
n_obs <- nrow(worlddata)

# Column of ones for the intercept.
vls <- rep(1, n_obs)
vls


# Design matrix filled column by column: ones, then the six predictors.
X <- matrix(
  c(
    vls, worlddata$minimumwage, worlddata$physiciansperthousand,
    worlddata$birthrate, worlddata$Gasoline.Price,
    worlddata$infantmortality, worlddata$lifeexpectancy
  ),
  nrow = n_obs,
  ncol = 7, byrow = FALSE
)
X

View(X)

# Least-squares coefficients: (X'X)^(-1) X'Y
solve(t(X) %*% X) %*% t(X) %*% Y

# Use conventional lm coding to confirm your matrix approach answers.

lm(
  GDPlog ~ minimumwage + physiciansperthousand + birthrate + Gasoline.Price +
    infantmortality + lifeexpectancy,
  data = worlddata
)

##############################################################################
#5) Produce an output summary table to be used to analyze and evaluate the full model
#(Adjusted R squared, Standard Error, Significance of Variables, ect...)
##############################################################################


# Summary table for the full model (model1): coefficient estimates with their
# standard errors and p-values, residual standard error, and R-squared values.
summary(model1)


##############################################################################
#6) Use procedures and techniques explored in class to produce confidence intervals for
#the independent quantitative variables of your model. Choose at least two of the
#quantitative variables to find confidence intervals for.
##############################################################################

# Confidence intervals for every coefficient of model1 (intercept, the
# quantitative predictors and the dummy variables). No level is passed, so
# confint() uses its default confidence level.
confint(model1)


##############################################################################
#7) Now produce a reduced model (removing variables of your choice with justification).
#Use R summary coding for both models and offer justification for choosing one
#model over the other.
##############################################################################


# Reduced model: same specification as model1 with lifeexpectancy dropped,
# because it was not significant in the full model.
model2 <- lm(
  GDPlog ~ physiciansperthousand + unemploymentrate + minimumwage + ForceSize,
  data = worlddata
)
model2

# Summaries of the full and the reduced model, printed back to back.
summary(model1)
summary(model2)


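# The original model (model1), which keeps the insignificant life expectancy variable, has a higher multiple R2 (0.6435)
# than the reduced model (model2) without life expectancy (0.6365). Note that multiple R2 can never decrease when a
# predictor is added, so the adjusted R2 values in the two summaries are the fairer comparison.
# The p-values of both models are the same, so we keep model 1 because it produces the better fit.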

#############################################################################
#8) Research and apply a model analysis technique not discussed in class to your full
#model or reduced model. Fully explain the technique or procedure and how it is
#being applied to your specific model.
##############################################################################

#Akaike Information Criterion: An estimator of prediction error, therefore the relative quality of statistical models for a given set of data. Lower AIC values tend to indicate a better model. 

#Bayesian Information Criterion: A criterion for model selection among a finite set of models. Models with a lower BIC are generally preferred. 

# Information criteria for the full model (model1).
AIC(model1)
BIC(model1)

# Information criteria for the reduced model (model2), for comparison with
# the values above.
AIC(model2)
BIC(model2)

# Model 1 has a slightly lower AIC value, but the difference is less than 1.0.
# Model 2 has the lower BIC value, by just over 2.0.


##############################################################################
#9) Offer final summary perspectives about the data and the models that you produce,
#suggesting how your models or model analysis enhanced your understanding of the
#data. (4 or 5 sentences)
##############################################################################


#We thought life expectancy would have an effect on the GDP, but there was no significance according to our model. We didn't
#believe there would be a high correlation between log(GDP) and population, but there was a strong correlation between the two.
#We were surprised to see that the model that had an insignificant variable happened to have a better adjusted r-squared compared
#to the one without the insignificant variable. The analysis of the variables in the data set highlighted the importance of not
#assuming one variable will be significant and of using proper linear regression techniques to analyze and understand the data.