# set working directory if has child directory
# Purpose: make relative paths behave the same whether this document is run
# interactively in the console or knitted. knitr evaluates chunks relative to
# the .Rmd location, so the knitting branch sets root.dir instead of calling
# setwd(). NOTE(review): setwd() in scripts is generally discouraged; it is
# tolerable here as a one-off notebook setup step.
dir_child = 'students'
if (dir_child %in% list.files()){
if (interactive()) {
# R Console
setwd(dir_child)
} else {
# knitting
knitr::opts_knit$set(root.dir=dir_child)
}
}
## 2. Programming Concepts

### Content
For this class, the group will be working to create a program to take in survey data in real time, reorganize it, and output summary statistics. For my group project (on Santa Barbara water supply) and future work, I’m most interested in learning data management strategies and ways to visualize data.
A group member has data on survey responses on solid waste. Data is received via text message and associated timestamps. The responses need to be tied to a specific survey question and organized.
Fig. 1. Histogram of age distribution of survey respondents.
# read csv with base R: character columns become factors (pre-R-4.0 default),
# which is why summary() below reports level counts for Nominated/Zone/gender
d = read.csv('data/jkidson_ugandasms.csv')
# output summary
summary(d)
## Nominated Zone years_zone gender age
## no :504 ZONE I : 24 Min. : 1.000 female:442 Min. :18.0
## yes:536 ZONE V : 22 1st Qu.: 2.000 male :598 1st Qu.:23.0
## CHURCH : 18 Median : 5.000 Median :28.0
## LUGOBA : 15 Mean : 9.192 Mean :30.5
## LUBIRI : 14 3rd Qu.:13.000 3rd Qu.:35.0
## B 9 : 13 Max. :65.000 Max. :76.0
## (Other):934
## satis
## dissatisfied :320
## neither_satisf:124
## satisfied :270
## very_dissatisf:283
## very_satisfied: 43
##
##
# readr::read_csv keeps strings as character (no factor conversion) and
# returns a tibble -- compare its summary() output below with read.csv above
library(readr)
d = read_csv('data/jkidson_ugandasms.csv')
# preview the first six rows
head(d)
## Nominated Zone years_zone gender age satis
## 1 yes LUGOBA 6 female 30 very_dissatisf
## 2 no KISOWERA 30 female 40 satisfied
## 3 yes KISOWERA 15 male 29 satisfied
## 4 yes GOOD HOPE 1 female 42 very_satisfied
## 5 yes LUGOBA 2 male 23 satisfied
## 6 no CORNER 3 female 30 very_dissatisf
# character columns now summarise as Length/Class/Mode instead of level counts
summary(d)
## Nominated Zone years_zone gender
## Length:1040 Length:1040 Min. : 1.000 Length:1040
## Class :character Class :character 1st Qu.: 2.000 Class :character
## Mode :character Mode :character Median : 5.000 Mode :character
## Mean : 9.192
## 3rd Qu.:13.000
## Max. :65.000
## age satis
## Min. :18.0 Length:1040
## 1st Qu.:23.0 Class :character
## Median :28.0 Mode :character
## Mean :30.5
## 3rd Qu.:35.0
## Max. :76.0
# read in csv
# view data
# limit columns to Zone and satis
# limit rows to just zone "CORNER"
# get count for each satisfaction level
# write out csv
# read in csv (base R; characters become factors)
surveys = read.csv('data/jkidson_ugandasms.csv')
# view data
head(surveys)
## Nominated Zone years_zone gender age satis
## 1 yes LUGOBA 6 female 30 very_dissatisf
## 2 no KISOWERA 30 female 40 satisfied
## 3 yes KISOWERA 15 male 29 satisfied
## 4 yes GOOD HOPE 1 female 42 very_satisfied
## 5 yes LUGOBA 2 male 23 satisfied
## 6 no CORNER 3 female 30 very_dissatisf
# factor-based summary (read.csv); compare with the read_csv output earlier
summary(surveys)
## Nominated Zone years_zone gender age
## no :504 ZONE I : 24 Min. : 1.000 female:442 Min. :18.0
## yes:536 ZONE V : 22 1st Qu.: 2.000 male :598 1st Qu.:23.0
## CHURCH : 18 Median : 5.000 Median :28.0
## LUGOBA : 15 Mean : 9.192 Mean :30.5
## LUBIRI : 14 3rd Qu.:13.000 3rd Qu.:35.0
## B 9 : 13 Max. :65.000 Max. :76.0
## (Other):934
## satis
## dissatisfied :320
## neither_satisf:124
## satisfied :270
## very_dissatisf:283
## very_satisfied: 43
##
##
# limit columns to Zone and satis
# (comment previously said "species and year" -- stale copy from the course example)
surveys_2 = surveys[,c('Zone', 'satis')]
# limit rows to just Zone "CORNER"
surveys_3 = surveys_2[surveys_2$Zone == 'CORNER',]
# get count per satisfaction level (length of Zone within each satis group)
surveys_4 = aggregate(Zone ~ satis, data=surveys_3, FUN='length')
# write to csv
write.csv(surveys_4, 'data/surveys_jkidson.csv', row.names = FALSE)
(For some reason this nested function is giving me an error, so I commented it out for now. I’ll come back to it if I have time after the group assignment.)
# read in data (fresh copy for the nested-function attempt below)
surveys = read.csv('data/jkidson_ugandasms.csv')
# view data
head(surveys)
## Nominated Zone years_zone gender age satis
## 1 yes LUGOBA 6 female 30 very_dissatisf
## 2 no KISOWERA 30 female 40 satisfied
## 3 yes KISOWERA 15 male 29 satisfied
## 4 yes GOOD HOPE 1 female 42 very_satisfied
## 5 yes LUGOBA 2 male 23 satisfied
## 6 no CORNER 3 female 30 very_dissatisf
# summary of the re-read data (factors again, via read.csv)
summary(surveys)
## Nominated Zone years_zone gender age
## no :504 ZONE I : 24 Min. : 1.000 female:442 Min. :18.0
## yes:536 ZONE V : 22 1st Qu.: 2.000 male :598 1st Qu.:23.0
## CHURCH : 18 Median : 5.000 Median :28.0
## LUGOBA : 15 Mean : 9.192 Mean :30.5
## LUBIRI : 14 3rd Qu.:13.000 3rd Qu.:35.0
## B 9 : 13 Max. :65.000 Max. :76.0
## (Other):934
## satis
## dissatisfied :320
## neither_satisf:124
## satisfied :270
## very_dissatisf:283
## very_satisfied: 43
##
##
# limit data with [], aggregate to count, write to csv
# NOTE(review): the error in this nested version came from filtering on the
# wrong column -- surveys$satis == 'CORNER' matches zero rows ('CORNER' is a
# Zone, not a satisfaction level), and aggregate() then fails with
# "no rows to aggregate". Corrected to surveys$Zone below:
# write.csv(
#   aggregate(
#     Zone ~ satis,
#     data = surveys[surveys$Zone == 'CORNER', c('Zone', 'satis')],
#     FUN = 'length'),
#   'data/surveys_jkidson.csv',
#   row.names = FALSE)
# load libraries
library(readr) # read_csv / write_csv
library(dplyr) # pipes and verbs; masks stats::filter/lag and some base set ops
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# read in csv
surveys = read_csv('data/jkidson_ugandasms.csv')
# dplyr elegance
# NOTE(review): the original used the tee operator %T>% for the glimpse step,
# but %T>% lives in magrittr, which is not attached here (dplyr re-exports
# only %>%). glimpse() already returns its input invisibly, so a plain %>%
# produces the same pipeline without the extra dependency.
surveys %>%
glimpse() %>% # view data (returns the data invisibly, so the pipe continues)
select(Zone, satis) %>% # limit columns
filter(Zone == 'CORNER') %>% # limit rows
group_by(satis) %>% # get count by first grouping
summarize(n = n()) %>% # then summarize
write_csv('data/surveys_jkidson.csv') # write out csv
## Observations: 1,040
## Variables: 6
## $ Nominated (chr) "yes", "no", "yes", "yes", "yes", "no", "yes", "yes...
## $ Zone (chr) "LUGOBA", "KISOWERA", "KISOWERA", "GOOD HOPE", "LUG...
## $ years_zone (int) 6, 30, 15, 1, 2, 3, 3, 13, 1, 2, 9, 35, 10, 4, 35, ...
## $ gender (chr) "female", "female", "male", "female", "male", "fema...
## $ age (int) 30, 40, 29, 42, 23, 30, 26, 32, 25, 22, 30, 35, 29,...
## $ satis (chr) "very_dissatisf", "satisfied", "satisfied", "very_s...
# one-time setup: install devtools, then the course data package EDAWR from GitHub
# NOTE(review): install.packages()/install_github() in a script re-installs on
# every run; usually better commented out or guarded with requireNamespace()
install.packages("devtools")
devtools::install_github("rstudio/EDAWR")
library(EDAWR)
# browse the package help index and the example datasets
help(package='EDAWR')
?storms # wind speed data for 6 hurricanes
?cases # subset of WHO tuberculosis
?pollution # pollution data from WHO Ambient Air Pollution, 2014
?tb # tuberculosis data
# open spreadsheet-style viewers (RStudio)
View(storms)
View(cases)
View(pollution)
# storms: extract single columns with $
storms$storm
storms$wind
storms$pressure
storms$date
# cases: column names and cell ranges
cases$country
names(cases)[-1] # the year column names (drop 'country')
unlist(cases[1:3, 2:4]) # flatten a 3x3 cell range into a named vector
# pollution: pick elements by position with c()
pollution$city[c(1,3,5)]
pollution$amount[c(1,3,5)]
pollution$amount[c(2,4,6)]
# ratio: vectorized elementwise division across whole columns
storms$pressure / storms$wind
Two main functions: gather() and spread()
# tidyr: reshape between wide and long layouts
# install.packages("tidyr")
library(tidyr)
?gather # gather to long (superseded by pivot_longer in current tidyr)
?spread # spread to wide (superseded by pivot_wider)
gather
cases %>%
# gather(cases, "year", "n", 2:4) # gathers by key of year
# if you wanted to remove a column you could enter -2 instead of 2:4 (but this makes country weird)
gather("year", "n", -country) %>% # gather every column except country into year/n pairs
# France and US for 2011 and 2013
filter(
year %in% c(2011,2013) &
country %in% c('FR','US'))
spread
# spread: long back to wide -- one column per value of 'size'
pollution
spread(pollution, size, amount)
Other functions to extract and combine columns…
separate
# separate: split the date column into year/month/day columns on "-"
storms
storms2 = separate(storms, date, c("year", "month", "day"), sep = "-")
library(stringr)
# Splitting date strings!!!
# alternative kept for reference: convert date to character first, then separate
#storms%>%
# mutate(date_str = as.character(date)) %>%
# separate(date_str, c("year","month","day"), sep = "-")
unite
# unite: inverse of separate -- recombine year/month/day into one date string
storms2
unite(storms2, "date", year, month, day, sep = "-")
Recap: tidyr:
A package that reshapes the layout of data sets.
Make observations from variables with gather()
Make variables from observations with spread()
Split and merge columns with unite()
and separate()
From the data-wrangling-cheatsheet.pdf:
Task. Convert the following table CO2 emissions per country since 1970 from wide to long format and output the first few rows into your Rmarkdown. I recommend consulting ?gather
and you should have 3 columns in your output.
# read the EDGAR CO2 spreadsheet, downloading it once and caching locally
library(readxl) # install.packages('readxl')
url = 'http://edgar.jrc.ec.europa.eu/news_docs/CO2_1970-2014_dataset_of_CO2_report_2015.xls'
xls = '../data/co2_europa.xls'
print(getwd())
## [1] "/Users/jenniferkidson/Documents/Bren Docs/296 Informatics/env-info/students"
if (!file.exists(xls)){
# mode='wb' writes the file as binary; the default text mode corrupts
# .xls downloads on Windows (line-ending translation)
download.file(url, xls, mode = 'wb')
}
# the first 12 rows of the sheet are title/header text, not data
co2 = read_excel(xls, skip=12)
## DEFINEDNAME: 21 00 00 01 0b 00 00 00 01 00 00 00 00 00 00 0d 3b 00 00 0c 00 e0 00 00 00 2c 00
## DEFINEDNAME: 21 00 00 01 0b 00 00 00 01 00 00 00 00 00 00 0d 3b 00 00 0c 00 e0 00 00 00 2c 00
## DEFINEDNAME: 21 00 00 01 0b 00 00 00 01 00 00 00 00 00 00 0d 3b 00 00 0c 00 e0 00 00 00 2c 00
## DEFINEDNAME: 21 00 00 01 0b 00 00 00 01 00 00 00 00 00 00 0d 3b 00 00 0c 00 e0 00 00 00 2c 00
co2
## Source: local data frame [220 x 46]
##
## Country 1970.000000 1971.000000 1972.000000 1973.000000
## (chr) (dbl) (dbl) (dbl) (dbl)
## 1 Afghanistan 1813.981234 1840.794051 1794.831482 1817.997422
## 2 Albania 4435.433037 4425.968296 5035.309230 4638.057137
## 3 Algeria 18850.745443 22226.320843 34278.432478 46375.728534
## 4 American Samoa 6.183023 6.327022 6.462429 6.587474
## 5 Angola 8946.495011 8532.101439 10382.218539 11365.512295
## 6 Anguilla 2.168006 2.168036 2.263426 2.108604
## 7 Antigua and Barbuda 196.161773 203.964395 244.721752 263.402867
## 8 Argentina 87478.401090 87827.313960 88888.066154 93076.186184
## 9 Armenia 10951.012839 10972.887313 11783.384976 12844.589903
## 10 Aruba 40.380782 40.635025 41.138578 41.949184
## .. ... ... ... ... ...
## Variables not shown: 1974.000000 (dbl), 1975.000000 (dbl), 1976.000000
## (dbl), 1977.000000 (dbl), 1978.000000 (dbl), 1979.000000 (dbl),
## 1980.000000 (dbl), 1981.000000 (dbl), 1982.000000 (dbl), 1983.000000
## (dbl), 1984.000000 (dbl), 1985.000000 (dbl), 1986.000000 (dbl),
## 1987.000000 (dbl), 1988.000000 (dbl), 1989.000000 (dbl), 1990.000000
## (dbl), 1991.000000 (dbl), 1992.000000 (dbl), 1993.000000 (dbl),
## 1994.000000 (dbl), 1995.000000 (dbl), 1996.000000 (dbl), 1997.000000
## (dbl), 1998.000000 (dbl), 1999.000000 (dbl), 2000.000000 (dbl),
## 2001.000000 (dbl), 2002.000000 (dbl), 2003.000000 (dbl), 2004.000000
## (dbl), 2005.000000 (dbl), 2006.000000 (dbl), 2007.000000 (dbl),
## 2008.000000 (dbl), 2009.000000 (dbl), 2010.000000 (dbl), 2011.000000
## (dbl), 2012.000000 (dbl), 2013.000000 (dbl), 2014.000000 (dbl)
Question. Why use the skip=12 argument in read_excel()? Answer: everything before line 13 of the spreadsheet is title and header text separate from the actual data table, so skip=12 jumps over those first 12 rows.
A package that helps transform tabular data
# dplyr: the main single-table verbs
# install.packages("dplyr")
library(dplyr)
?select # pick columns
?filter # pick rows
?arrange # sort rows
?mutate # add new columns
?group_by # group rows for summaries
?summarise # collapse each group to one row
See sections in the data-wrangling-cheatsheet.pdf:
select()
filter()
arrange()
mutate()
group_by()
and summarise()
select
# select: same result two ways -- direct call vs pipe
storms
select(storms, storm, pressure)
storms %>% select(storm, pressure)
filter
# filter: keep rows with wind >= 50, then chain filter + select in one pipe
storms
filter(storms, wind >= 50)
storms %>% filter(wind >= 50)
storms %>%
filter(wind >= 50) %>%
select(storm, pressure)
mutate
# mutate: derive a new column from existing ones
storms %>%
mutate(ratio = pressure / wind) %>%
select(storm, ratio)
group_by
# group_by alone only tags the grouping; nothing is computed until summarise
pollution
pollution %>% group_by(city)
summarise
# summarise collapses each group to one row of statistics
# by city
pollution %>%
group_by(city) %>%
summarise(
mean = mean(amount),
sum = sum(amount),
n = n())
# by size
pollution %>%
group_by(size) %>%
summarise(
mean = mean(amount),
sum = sum(amount),
n = n())
Note that summarize() (the American spelling) works synonymously with summarise().
ungroup
# ungroup: drop the grouping metadata added by group_by
pollution %>%
group_by(size)
pollution %>%
group_by(size) %>%
ungroup()
# total TB cases per country: the first summarise() sums within each
# country-year; that drops the 'year' grouping level, so the second
# summarise() collapses the remaining country grouping to one row per country.
# NOTE(review): the original was missing the pipe before the second
# summarise(), leaving a dangling call that errors when run on its own.
tb %>%
group_by(country, year) %>%
summarise(cases = sum(cases)) %>%
summarise(cases = sum(cases))
Recap: dplyr:
Extract columns with select()
and rows with filter()
Sort rows by column with arrange()
Make new columns with mutate()
Group rows by column with group_by()
and summarise()
See sections in the data-wrangling-cheatsheet.pdf:
Subset Variables (Columns), eg select()
Subset Observations (Rows), eg filter()
Reshaping Data - Change the layout of a data set, eg arrange()
Make New Variables, eg mutate()
Group Data, eg group_by()
and summarise()
Task 1 Task. Report the top 5 emitting countries (not World or EU28) for 2014 using your long format table. (You may need to convert your year column from factor to numeric, eg mutate(year = as.numeric(as.character(year)))
. As with most analyses, there are multiple ways to do this. I used the following functions: filter
, arrange
, desc
, head
).
names(co2) = c('country', 1970:2014) # get rid of extra decimal places in column names
library(dplyr)
library(tidyr)
# print to confirm the cleaned column names
co2
## Source: local data frame [220 x 46]
##
## country 1970 1971 1972 1973
## (chr) (dbl) (dbl) (dbl) (dbl)
## 1 Afghanistan 1813.981234 1840.794051 1794.831482 1817.997422
## 2 Albania 4435.433037 4425.968296 5035.309230 4638.057137
## 3 Algeria 18850.745443 22226.320843 34278.432478 46375.728534
## 4 American Samoa 6.183023 6.327022 6.462429 6.587474
## 5 Angola 8946.495011 8532.101439 10382.218539 11365.512295
## 6 Anguilla 2.168006 2.168036 2.263426 2.108604
## 7 Antigua and Barbuda 196.161773 203.964395 244.721752 263.402867
## 8 Argentina 87478.401090 87827.313960 88888.066154 93076.186184
## 9 Armenia 10951.012839 10972.887313 11783.384976 12844.589903
## 10 Aruba 40.380782 40.635025 41.138578 41.949184
## .. ... ... ... ... ...
## Variables not shown: 1974 (dbl), 1975 (dbl), 1976 (dbl), 1977 (dbl), 1978
## (dbl), 1979 (dbl), 1980 (dbl), 1981 (dbl), 1982 (dbl), 1983 (dbl), 1984
## (dbl), 1985 (dbl), 1986 (dbl), 1987 (dbl), 1988 (dbl), 1989 (dbl), 1990
## (dbl), 1991 (dbl), 1992 (dbl), 1993 (dbl), 1994 (dbl), 1995 (dbl), 1996
## (dbl), 1997 (dbl), 1998 (dbl), 1999 (dbl), 2000 (dbl), 2001 (dbl), 2002
## (dbl), 2003 (dbl), 2004 (dbl), 2005 (dbl), 2006 (dbl), 2007 (dbl), 2008
## (dbl), 2009 (dbl), 2010 (dbl), 2011 (dbl), 2012 (dbl), 2013 (dbl), 2014
## (dbl)
# top 5 emitting countries in 2014
co2 %>%
gather("year","n", 2:46) %>% # wide -> long: one row per country-year
filter(year %in% 2014) %>% # keep 2014 only (year is character here)
# NOTE(review): the original used subset(country != c("World","EU28")), which
# recycles the length-2 vector down the column -- odd rows are compared to
# "World" and even rows to "EU28", silently keeping/dropping the wrong rows.
# %in% tests every row against the full set, and filter() is the in-pipe verb.
filter(!country %in% c("World","EU28")) %>%
arrange(desc(n)) %>% # largest emitters first
head(n=5)
## Source: local data frame [5 x 3]
##
## country year n
## (chr) (chr) (dbl)
## 1 China 2014 10540750
## 2 United States of America 2014 5334530
## 3 India 2014 2341897
## 4 Russian Federation 2014 1766427
## 5 Japan 2014 1278922
Task 2 Task. Summarize the total emissions by country (not World or EU28) across years from your long format table and return the top 5 emitting countries. (As with most analyses, there are multiple ways to do this. I used the following functions: filter
, arrange
, desc
, head
).
# total emissions per country summed across all years, top emitters first
co2 %>%
gather("year","n", 2:46)%>% # wide -> long
# NOTE(review): the original used subset(country != c("World","EU28")) after
# group_by(); the != comparison recycles the length-2 vector against the
# column, matching alternate rows to alternate values. %in% is the correct
# membership test, and filtering before grouping is the idiomatic order.
filter(!country %in% c("World","EU28")) %>%
group_by(country) %>%
summarise(
sum=sum(n)) %>%
arrange(desc(sum)) %>%
head
## Source: local data frame [6 x 2]
##
## country sum
## (chr) (dbl)
## 1 United States of America 231948899
## 2 China 174045927
## 3 Russian Federation 81242427
## 4 Japan 51276329
## 5 Germany 43382205
## 6 India 39004218
# same city/size summaries as earlier, written as one-liners
pollution %>% group_by(city) %>% summarise( mean = mean(amount), sum = sum(amount), n = n())
pollution %>% group_by(size) %>% summarise( mean = mean(amount), sum = sum(amount), n = n())
bind_cols
# two small demo frames sharing column names; rows B/C overlap between them
y = data.frame(
x1 = c('A','B','C'),
x2 = c( 1 , 2 , 3),
stringsAsFactors=F)
z = data.frame(
x1 = c('B','C','D'),
x2 = c( 2 , 3 , 4),
stringsAsFactors=F)
y
z
# bind_cols: paste side by side by position (no matching on values)
bind_cols(y, z)
bind_rows
# bind_rows: stack y on top of z (duplicate rows kept)
y
z
bind_rows(y, z)
union
# union: rows appearing in either table, duplicates removed
y
z
union(y, z)
intersect
# intersect: rows present in both tables
y
z
intersect(y, z)
setdiff
# setdiff: rows in y that are not in z
y
z
setdiff(y, z)
left_join
# demo tables for joins: songs lists a writer 'name'; artists maps name -> instrument
songs = data.frame(
song = c('Across the Universe','Come Together', 'Hello, Goodbye', 'Peggy Sue'),
name = c('John','John','Paul','Buddy'),
stringsAsFactors=F)
artists = data.frame(
name = c('George','John','Paul','Ringo'),
plays = c('sitar','guitar','bass','drums'),
stringsAsFactors=F)
# left_join: keep all songs; unmatched names (Buddy) get NA for plays
left_join(songs, artists, by='name')
inner_join
# inner_join: only rows with a match in both tables
inner_join(songs, artists, by = "name")
semi_join
# semi_join: songs that have a match in artists (no artist columns added)
semi_join(songs, artists, by = "name")
anti_join
# anti_join: songs with NO match in artists
anti_join(songs, artists, by = "name")
You’ll join the gapminder datasets to get world population per country.
Task. Report the top 5 emitting countries (not World or EU28) per capita using your long format table. (You may need to convert your year column from factor to numeric, eg mutate(year = as.numeric(as.character(year)))
. As with most analyses, there are multiple ways to do this. I used the following functions: filter
, arrange
, desc
, head
).
Task. Summarize the total emissions by country (not World or EU28) per capita across years from your long format table and return the top 5 emitting countries. (As with most analyses, there are multiple ways to do this. I used the following functions: filter
, arrange
, desc
, head
).
# gapminder: country-level population data used for the per-capita tasks above
library(gapminder) # install.packages('gapminder')