# Attach the packages used throughout this analysis.
# If dslabs is not installed yet, run this once first:
# install.packages("dslabs")
library(dslabs)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.0 ✔ readr 2.1.4
✔ forcats 1.0.0 ✔ stringr 1.5.0
✔ ggplot2 3.4.1 ✔ tibble 3.1.8
✔ lubridate 1.9.2 ✔ tidyr 1.3.0
✔ purrr 1.0.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Open the documentation page for the gapminder dataset
help(topic = "gapminder")
# Inspect column names, types and the first few values of gapminder
gapminder %>%
  str()
'data.frame': 10545 obs. of 9 variables:
$ country : Factor w/ 185 levels "Albania","Algeria",..: 1 2 3 4 5 6 7 8 9 10 ...
$ year : int 1960 1960 1960 1960 1960 1960 1960 1960 1960 1960 ...
$ infant_mortality: num 115.4 148.2 208 NA 59.9 ...
$ life_expectancy : num 62.9 47.5 36 63 65.4 ...
$ fertility : num 6.19 7.65 7.32 4.43 3.11 4.55 4.82 3.45 2.7 5.57 ...
$ population : num 1636054 11124892 5270844 54681 20619075 ...
$ gdp : num NA 1.38e+10 NA NA 1.08e+11 ...
$ continent : Factor w/ 5 levels "Africa","Americas",..: 4 1 1 2 2 3 2 5 4 3 ...
$ region : Factor w/ 22 levels "Australia and New Zealand",..: 19 11 10 2 15 21 2 1 22 21 ...
# Per-column summary statistics (and NA counts) for gapminder
gapminder %>%
  summary()
country year infant_mortality life_expectancy
Albania : 57 Min. :1960 Min. : 1.50 Min. :13.20
Algeria : 57 1st Qu.:1974 1st Qu.: 16.00 1st Qu.:57.50
Angola : 57 Median :1988 Median : 41.50 Median :67.54
Antigua and Barbuda: 57 Mean :1988 Mean : 55.31 Mean :64.81
Argentina : 57 3rd Qu.:2002 3rd Qu.: 85.10 3rd Qu.:73.00
Armenia : 57 Max. :2016 Max. :276.90 Max. :83.90
(Other) :10203 NA's :1453
fertility population gdp continent
Min. :0.840 Min. :3.124e+04 Min. :4.040e+07 Africa :2907
1st Qu.:2.200 1st Qu.:1.333e+06 1st Qu.:1.846e+09 Americas:2052
Median :3.750 Median :5.009e+06 Median :7.794e+09 Asia :2679
Mean :4.084 Mean :2.701e+07 Mean :1.480e+11 Europe :2223
3rd Qu.:6.000 3rd Qu.:1.523e+07 3rd Qu.:5.540e+10 Oceania : 684
Max. :9.220 Max. :1.376e+09 Max. :1.174e+13
NA's :187 NA's :185 NA's :2972
region
Western Asia :1026
Eastern Africa : 912
Western Africa : 912
Caribbean : 741
South America : 684
Southern Europe: 684
(Other) :5586
# Confirm what kind of object gapminder is
gapminder %>%
  class()
[1] "data.frame"
Subset the data, only keeping data from Africa
# Keep only the rows whose continent is Africa and store them in a new
# data frame
africadata <- filter(gapminder, continent == "Africa")
# Inspect column names, types and the first few values of africadata
africadata %>%
  str()
'data.frame': 2907 obs. of 9 variables:
$ country : Factor w/ 185 levels "Albania","Algeria",..: 2 3 18 22 26 27 29 31 32 33 ...
$ year : int 1960 1960 1960 1960 1960 1960 1960 1960 1960 1960 ...
$ infant_mortality: num 148 208 187 116 161 ...
$ life_expectancy : num 47.5 36 38.3 50.3 35.2 ...
$ fertility : num 7.65 7.32 6.28 6.62 6.29 6.95 5.65 6.89 5.84 6.25 ...
$ population : num 11124892 5270844 2431620 524029 4829291 ...
$ gdp : num 1.38e+10 NA 6.22e+08 1.24e+08 5.97e+08 ...
$ continent : Factor w/ 5 levels "Africa","Americas",..: 1 1 1 1 1 1 1 1 1 1 ...
$ region : Factor w/ 22 levels "Australia and New Zealand",..: 11 10 20 17 20 5 10 20 10 10 ...
# Per-column summary statistics (and NA counts) for africadata
africadata %>%
  summary()
country year infant_mortality life_expectancy
Algeria : 57 Min. :1960 Min. : 11.40 Min. :13.20
Angola : 57 1st Qu.:1974 1st Qu.: 62.20 1st Qu.:48.23
Benin : 57 Median :1988 Median : 93.40 Median :53.98
Botswana : 57 Mean :1988 Mean : 95.12 Mean :54.38
Burkina Faso: 57 3rd Qu.:2002 3rd Qu.:124.70 3rd Qu.:60.10
Burundi : 57 Max. :2016 Max. :237.40 Max. :77.60
(Other) :2565 NA's :226
fertility population gdp continent
Min. :1.500 Min. : 41538 Min. :4.659e+07 Africa :2907
1st Qu.:5.160 1st Qu.: 1605232 1st Qu.:8.373e+08 Americas: 0
Median :6.160 Median : 5570982 Median :2.448e+09 Asia : 0
Mean :5.851 Mean : 12235961 Mean :9.346e+09 Europe : 0
3rd Qu.:6.860 3rd Qu.: 13888152 3rd Qu.:6.552e+09 Oceania : 0
Max. :8.450 Max. :182201962 Max. :1.935e+11
NA's :51 NA's :51 NA's :637
region
Eastern Africa :912
Western Africa :912
Middle Africa :456
Northern Africa :342
Southern Africa :285
Australia and New Zealand: 0
(Other) : 0
# Reduce the African data to two columns: infant_mortality and
# life_expectancy
africadata2 <- select(africadata, infant_mortality, life_expectancy)
# Inspect column names, types and the first few values of africadata2
africadata2 %>%
  str()
'data.frame': 2907 obs. of 2 variables:
$ infant_mortality: num 148 208 187 116 161 ...
$ life_expectancy : num 47.5 36 38.3 50.3 35.2 ...
# Per-column summary statistics (and NA counts) for africadata2
africadata2 %>%
  summary()
infant_mortality life_expectancy
Min. : 11.40 Min. :13.20
1st Qu.: 62.20 1st Qu.:48.23
Median : 93.40 Median :53.98
Mean : 95.12 Mean :54.38
3rd Qu.:124.70 3rd Qu.:60.10
Max. :237.40 Max. :77.60
NA's :226
# Reduce the African data to two columns: population and life_expectancy
africadata3 <- select(africadata, population, life_expectancy)
# Inspect column names, types and the first few values of africadata3
africadata3 %>%
  str()
'data.frame': 2907 obs. of 2 variables:
$ population : num 11124892 5270844 2431620 524029 4829291 ...
$ life_expectancy: num 47.5 36 38.3 50.3 35.2 ...
# Per-column summary statistics (and NA counts) for africadata3
africadata3 %>%
  summary()
population life_expectancy
Min. : 41538 Min. :13.20
1st Qu.: 1605232 1st Qu.:48.23
Median : 5570982 Median :53.98
Mean : 12235961 Mean :54.38
3rd Qu.: 13888152 3rd Qu.:60.10
Max. :182201962 Max. :77.60
NA's :51
Plot the data here
# Scatter plot: life expectancy (y) against infant mortality (x),
# all African country-years
ggplot(africadata2, aes(x = infant_mortality, y = life_expectancy)) +
  geom_point()
# Collect the country-year rows where infant_mortality is missing, keeping
# only the identifying columns, then summarise them to see which countries
# and years are affected
africa_NAs <- africadata %>%
  filter(is.na(infant_mortality)) %>%
  select(country, year, infant_mortality)
summary(africa_NAs)
country year infant_mortality
Equatorial Guinea: 23 Min. :1960 Min. : NA
Angola : 19 1st Qu.:1963 1st Qu.: NA
Gabon : 19 Median :1968 Median : NA
Djibouti : 17 Mean :1978 Mean :NaN
Guinea-Bissau : 17 3rd Qu.:1978 3rd Qu.: NA
South Africa : 15 Max. :2016 Max. : NA
(Other) :116 NA's :226
# Keep only the African observations recorded for the year 2000
africadata2000 <- filter(africadata, year == 2000)
# Inspect column names, types and the first few values of africadata2000
africadata2000 %>%
  str()
'data.frame': 51 obs. of 9 variables:
$ country : Factor w/ 185 levels "Albania","Algeria",..: 2 3 18 22 26 27 29 31 32 33 ...
$ year : int 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
$ infant_mortality: num 33.9 128.3 89.3 52.4 96.2 ...
$ life_expectancy : num 73.3 52.3 57.2 47.6 52.6 46.7 54.3 68.4 45.3 51.5 ...
$ fertility : num 2.51 6.84 5.98 3.41 6.59 7.06 5.62 3.7 5.45 7.35 ...
$ population : num 31183658 15058638 6949366 1736579 11607944 ...
$ gdp : num 5.48e+10 9.13e+09 2.25e+09 5.63e+09 2.61e+09 ...
$ continent : Factor w/ 5 levels "Africa","Americas",..: 1 1 1 1 1 1 1 1 1 1 ...
$ region : Factor w/ 22 levels "Australia and New Zealand",..: 11 10 20 17 20 5 10 20 10 10 ...
# Per-column summary statistics for africadata2000
africadata2000 %>%
  summary()
country year infant_mortality life_expectancy
Algeria : 1 Min. :2000 Min. : 12.30 Min. :37.60
Angola : 1 1st Qu.:2000 1st Qu.: 60.80 1st Qu.:51.75
Benin : 1 Median :2000 Median : 80.30 Median :54.30
Botswana : 1 Mean :2000 Mean : 78.93 Mean :56.36
Burkina Faso: 1 3rd Qu.:2000 3rd Qu.:103.30 3rd Qu.:60.00
Burundi : 1 Max. :2000 Max. :143.30 Max. :75.00
(Other) :45
fertility population gdp continent
Min. :1.990 Min. : 81154 Min. :2.019e+08 Africa :51
1st Qu.:4.150 1st Qu.: 2304687 1st Qu.:1.274e+09 Americas: 0
Median :5.550 Median : 8799165 Median :3.238e+09 Asia : 0
Mean :5.156 Mean : 15659800 Mean :1.155e+10 Europe : 0
3rd Qu.:5.960 3rd Qu.: 17391242 3rd Qu.:8.654e+09 Oceania : 0
Max. :7.730 Max. :122876723 Max. :1.329e+11
region
Eastern Africa :16
Western Africa :16
Middle Africa : 8
Northern Africa : 6
Southern Africa : 5
Australia and New Zealand: 0
(Other) : 0
# Scatter plot for the year 2000 only: life expectancy (y) against
# infant mortality (x)
ggplot(africadata2000, aes(x = infant_mortality, y = life_expectancy)) +
  geom_point()
# Scatter plot for the year 2000 only: life expectancy (y) against
# population on a log10 scale (x)
ggplot(africadata2000, aes(x = log10(population), y = life_expectancy)) +
  geom_point()
# Fit a linear model with life expectancy as the outcome and infant mortality
# as the predictor. In an R formula the outcome goes on the LEFT of `~`; the
# previous version had the two variables swapped.
# With a single predictor the slope's t-test is the same in either direction,
# so the p-value is still very small: infant mortality is strongly associated
# with life expectancy in the year-2000 African data.
# NOTE(review): any previously rendered coefficient table came from the
# swapped formula; re-render the document to refresh it.
fit1 <- lm(life_expectancy ~ infant_mortality, data = africadata2000)
summary(fit1)
Call:
lm(formula = infant_mortality ~ life_expectancy, data = africadata2000)
Residuals:
Min 1Q Median 3Q Max
-67.262 -9.806 -1.891 12.460 52.285
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 219.0135 21.4781 10.197 1.05e-13 ***
life_expectancy -2.4854 0.3769 -6.594 2.83e-08 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 22.55 on 49 degrees of freedom
Multiple R-squared: 0.4701, Adjusted R-squared: 0.4593
F-statistic: 43.48 on 1 and 49 DF, p-value: 2.826e-08
# Fit a linear model with life expectancy as the outcome and population as
# the predictor. In an R formula the outcome goes on the LEFT of `~`; the
# previous version had the two variables swapped.
# With a single predictor the slope's t-test is the same in either direction,
# so the p-value is still large: population size is unlikely to predict life
# expectancy in the year-2000 African data.
# NOTE(review): any previously rendered coefficient table came from the
# swapped formula; re-render the document to refresh it.
fit2 <- lm(life_expectancy ~ population, data = africadata2000)
summary(fit2)
Call:
lm(formula = population ~ life_expectancy, data = africadata2000)
Residuals:
Min 1Q Median 3Q Max
-18308728 -12957963 -6425955 2079794 107435285
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5074933 21193712 0.239 0.812
life_expectancy 187799 371938 0.505 0.616
Residual standard error: 22250000 on 49 degrees of freedom
Multiple R-squared: 0.005176, Adjusted R-squared: -0.01513
F-statistic: 0.2549 on 1 and 49 DF, p-value: 0.6159
I want to continue exploring the data on African countries, specifically how GDP and infant mortality relate to fertility.
# Keep only the African rows of gapminder
africadata <- gapminder %>%
  filter(continent == "Africa")

# `gdpfert`: just the `gdp` and `fertility` columns
gdpfert <- select(africadata, gdp, fertility)

# `fertmort`: just the `fertility` and `infant_mortality` columns
fertmort <- select(africadata, fertility, infant_mortality)
Plotting the data
Next, I wanted to look at the plots for the two new datasets. For both gdpfert and fertmort, I plotted the data points using geom_point() and geom_smooth() to produce a scatter plot with a smoothed line over the data. The smoothed line helps visualize patterns in the plot.
# Scatter plot of fertility (y) against GDP (x) with a smoothed trend line.
# The x-axis is log-transformed (trans = "log", i.e. natural log) because
# GDP spans several orders of magnitude.
ggplot(data = gdpfert, mapping = aes(x = gdp, y = fertility)) +
  geom_point() +
  geom_smooth() +
  scale_x_continuous(trans = "log")
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
We can see a distinct negative correlation between GDP and fertility once a country's GDP rises above approximately $9.7 billion. Note that most African countries have a GDP between $485 million and $9.7 billion, where the smoothed line fluctuates around 6 children per woman.
# Scatter plot of infant mortality (y) against fertility (x) with a
# smoothed trend line
ggplot(data = fertmort, mapping = aes(x = fertility, y = infant_mortality)) +
  geom_point() +
  geom_smooth()
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Here, fertility and infant mortality are positively correlated up to approximately 7 children per woman, or 125 infant deaths per 1,000.
Linear modeling
I would like to consider a linear model predicting fertility from GDP and infant mortality for African countries. The glm() function will be used to create a linear model for this scenario. The tidy() function from the broom package produces the summary of the model.
# Model fertility as a function of GDP and infant mortality for African
# countries. glm() is called without a `family` argument, so its default
# (gaussian) applies and this is an ordinary linear model.
fit3 <- glm(fertility ~ gdp + infant_mortality, data = africadata)

# tidy() comes from broom, which is not among the packages attached by
# library(tidyverse); call it through its namespace so the lookup succeeds.
broom::tidy(fit3)
Both GDP and infant mortality have a statistically significant effect on fertility (p-values < 0.001). GDP has an extremely small negative effect on fertility (slope = -1.411e-11), and infant mortality has a small positive effect (slope = 2.176e-2).