Managing the working directory in R Markdown

# set working directory to the child directory, if it exists
dir_child = 'students' 
if (dir_child %in% list.files()){
  if (interactive()) {  
    # R Console
    setwd(dir_child)
  } else {              
    # knitting
    knitr::opts_knit$set(root.dir=dir_child)  
  }
}

1. Reproducible Science Tools

2. Programming Concepts

Content

For this class, the group will be working to create a program to take in survey data in real time, reorganize it, and output summary statistics. For my group project (on Santa Barbara water supply) and future work, I’m most interested in learning data management strategies and ways to visualize data.

Techniques

  1. Wrangling data
  2. Cleaning up data
  3. Visualizing data
    • interactive maps
    • interactive time series

Data

A group member has survey-response data on solid waste. Responses arrive via text message with associated timestamps, and each response needs to be tied to a specific survey question and organized, as sketched below.
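A minimal sketch of that matching step, assuming hypothetical files and column names (responses.csv with timestamp, phone, text; questions.csv with question_id, sent_time); the real schema and matching rule will depend on the survey platform.

library(readr)
library(dplyr)

# hypothetical inputs; file paths and column names are assumptions
responses = read_csv('data/responses.csv')  # timestamp, phone, text
questions = read_csv('data/questions.csv')  # question_id, sent_time

# assuming questions is sorted by sent_time and every response arrives after
# the first question was sent, match each response to the most recent question
matched = responses %>%
  mutate(question_id = questions$question_id[
    findInterval(timestamp, questions$sent_time)])

# count responses per question
matched %>%
  group_by(question_id) %>%
  summarise(n = n())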

Fig. 1. Histogram of age distribution of survey respondents.

Summary of Survey Response Data

# read csv
d = read.csv('data/jkidson_ugandasms.csv')
      
# output summary
summary(d)
##  Nominated      Zone       years_zone        gender         age      
##  no :504   ZONE I : 24   Min.   : 1.000   female:442   Min.   :18.0  
##  yes:536   ZONE V : 22   1st Qu.: 2.000   male  :598   1st Qu.:23.0  
##            CHURCH : 18   Median : 5.000                Median :28.0  
##            LUGOBA : 15   Mean   : 9.192                Mean   :30.5  
##            LUBIRI : 14   3rd Qu.:13.000                3rd Qu.:35.0  
##            B 9    : 13   Max.   :65.000                Max.   :76.0  
##            (Other):934                                               
##             satis    
##  dissatisfied  :320  
##  neither_satisf:124  
##  satisfied     :270  
##  very_dissatisf:283  
##  very_satisfied: 43  
##                      
## 

3. Data Wrangling

Reading CSV with readr

library(readr)

d = read_csv('data/jkidson_ugandasms.csv')
head(d)
##   Nominated      Zone years_zone gender age          satis
## 1       yes    LUGOBA          6 female  30 very_dissatisf
## 2        no  KISOWERA         30 female  40      satisfied
## 3       yes  KISOWERA         15   male  29      satisfied
## 4       yes GOOD HOPE          1 female  42 very_satisfied
## 5       yes    LUGOBA          2   male  23      satisfied
## 6        no    CORNER          3 female  30 very_dissatisf
summary(d)
##   Nominated             Zone             years_zone        gender         
##  Length:1040        Length:1040        Min.   : 1.000   Length:1040       
##  Class :character   Class :character   1st Qu.: 2.000   Class :character  
##  Mode  :character   Mode  :character   Median : 5.000   Mode  :character  
##                                        Mean   : 9.192                     
##                                        3rd Qu.:13.000                     
##                                        Max.   :65.000                     
##       age          satis          
##  Min.   :18.0   Length:1040       
##  1st Qu.:23.0   Class :character  
##  Median :28.0   Mode  :character  
##  Mean   :30.5                     
##  3rd Qu.:35.0                     
##  Max.   :76.0

dplyr Demo

Pseudocode

# read in csv
# view data
# limit columns to Zone and satis
# limit rows to just zone "CORNER"
# get count for each satisfaction level
# write out csv

Multiple Variables

# read in csv
surveys = read.csv('data/jkidson_ugandasms.csv') 

# view data
head(surveys)
##   Nominated      Zone years_zone gender age          satis
## 1       yes    LUGOBA          6 female  30 very_dissatisf
## 2        no  KISOWERA         30 female  40      satisfied
## 3       yes  KISOWERA         15   male  29      satisfied
## 4       yes GOOD HOPE          1 female  42 very_satisfied
## 5       yes    LUGOBA          2   male  23      satisfied
## 6        no    CORNER          3 female  30 very_dissatisf
summary(surveys)
##  Nominated      Zone       years_zone        gender         age      
##  no :504   ZONE I : 24   Min.   : 1.000   female:442   Min.   :18.0  
##  yes:536   ZONE V : 22   1st Qu.: 2.000   male  :598   1st Qu.:23.0  
##            CHURCH : 18   Median : 5.000                Median :28.0  
##            LUGOBA : 15   Mean   : 9.192                Mean   :30.5  
##            LUBIRI : 14   3rd Qu.:13.000                3rd Qu.:35.0  
##            B 9    : 13   Max.   :65.000                Max.   :76.0  
##            (Other):934                                               
##             satis    
##  dissatisfied  :320  
##  neither_satisf:124  
##  satisfied     :270  
##  very_dissatisf:283  
##  very_satisfied: 43  
##                      
## 
# limit columns to Zone and satis
surveys_2 = surveys[,c('Zone', 'satis')]

# limit rows to just zone 'CORNER'
surveys_3 = surveys_2[surveys_2$Zone  == 'CORNER',]

# get count for each satisfaction level
surveys_4 = aggregate(Zone ~ satis, data=surveys_3, FUN='length')

# write to csv
write.csv(surveys_4, 'data/surveys_jkidson.csv', row.names = FALSE)

Nested Functions

(This nested version was originally throwing an error because the row filter tested satis == 'CORNER' instead of Zone == 'CORNER', leaving aggregate() with zero rows to work on; the corrected filter is used below.)

# read in data
surveys = read.csv('data/jkidson_ugandasms.csv') 

# view data
head(surveys)
##   Nominated      Zone years_zone gender age          satis
## 1       yes    LUGOBA          6 female  30 very_dissatisf
## 2        no  KISOWERA         30 female  40      satisfied
## 3       yes  KISOWERA         15   male  29      satisfied
## 4       yes GOOD HOPE          1 female  42 very_satisfied
## 5       yes    LUGOBA          2   male  23      satisfied
## 6        no    CORNER          3 female  30 very_dissatisf
summary(surveys)
##  Nominated      Zone       years_zone        gender         age      
##  no :504   ZONE I : 24   Min.   : 1.000   female:442   Min.   :18.0  
##  yes:536   ZONE V : 22   1st Qu.: 2.000   male  :598   1st Qu.:23.0  
##            CHURCH : 18   Median : 5.000                Median :28.0  
##            LUGOBA : 15   Mean   : 9.192                Mean   :30.5  
##            LUBIRI : 14   3rd Qu.:13.000                3rd Qu.:35.0  
##            B 9    : 13   Max.   :65.000                Max.   :76.0  
##            (Other):934                                               
##             satis    
##  dissatisfied  :320  
##  neither_satisf:124  
##  satisfied     :270  
##  very_dissatisf:283  
##  very_satisfied: 43  
##                      
## 
# limit data with [], aggregate to count, write to csv -- all in one nested call
write.csv(
  aggregate(
    Zone ~ satis, 
    data = surveys[surveys$Zone == 'CORNER', c('Zone', 'satis')], 
    FUN = 'length'), 
  'data/surveys_jkidson.csv',
  row.names = FALSE)

Using dplyr

# load libraries
library(readr)
library(magrittr) # provides the tee operator %T>% used below
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# read in csv
surveys = read_csv('data/jkidson_ugandasms.csv') 

# dplyr elegance
surveys %T>%                            # note tee operator %T>% for glimpse
  glimpse() %>%                         # view data
  select(Zone, satis) %>%               # limit columns
  filter(Zone  == 'CORNER') %>%         # limit rows
  group_by(satis) %>%                   # get count by first grouping
  summarize(n = n()) %>%                # then summarize
  write_csv('data/surveys_jkidson.csv') # write out csv
## Observations: 1,040
## Variables: 6
## $ Nominated  (chr) "yes", "no", "yes", "yes", "yes", "no", "yes", "yes...
## $ Zone       (chr) "LUGOBA", "KISOWERA", "KISOWERA", "GOOD HOPE", "LUG...
## $ years_zone (int) 6, 30, 15, 1, 2, 3, 3, 13, 1, 2, 9, 35, 10, 4, 35, ...
## $ gender     (chr) "female", "female", "male", "female", "male", "fema...
## $ age        (int) 30, 40, 29, 42, 23, 30, 26, 32, 25, 22, 30, 35, 29,...
## $ satis      (chr) "very_dissatisf", "satisfied", "satisfied", "very_s...

4. Tidying Up Data

EDAWR

install.packages("devtools")
devtools::install_github("rstudio/EDAWR")
library(EDAWR)
help(package='EDAWR')
?storms    # wind speed data for 6 hurricanes
?cases     # subset of WHO tuberculosis
?pollution # pollution data from WHO Ambient Air Pollution, 2014
?tb        # tuberculosis data
View(storms)
View(cases)
View(pollution)

slicing

# storms
storms$storm
storms$wind
storms$pressure
storms$date

# cases
cases$country
names(cases)[-1]
unlist(cases[1:3, 2:4])

# pollution
pollution$city[c(1,3,5)]
pollution$amount[c(1,3,5)]
pollution$amount[c(2,4,6)]

# ratio
storms$pressure / storms$wind

tidyr

Two main functions: gather() and spread()

# install.packages("tidyr")
library(tidyr)
?gather # gather to long
?spread # spread to wide
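As a quick sketch of the round trip, with tidyr loaded as above and a toy data frame (not from the EDAWR package; all names here are made up for illustration):

# toy wide table: one column per year
scores = data.frame(
  name  = c('ann', 'bob'),
  y2015 = c(1, 2),
  y2016 = c(3, 4))

# wide to long: one row per name-year observation
long = gather(scores, "year", "score", -name)
long

# long back to wide: one column per year again
spread(long, year, score)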

gather

cases %>%
  # gather(cases, "year", "n", 2:4) gathers the same columns by position
  # if you wanted to remove a column you could enter -2 instead of 2:4 (but this makes country weird)
  gather("year", "n", -country) %>%
  # France and US for 2011 and 2013
  filter(
    year %in% c(2011, 2013) &
    country %in% c('FR', 'US'))

spread

pollution
spread(pollution, size, amount)

Other functions to extract and combine columns…

separate

storms
storms2 = separate(storms, date, c("year", "month", "day"), sep = "-")
library(stringr)
# Splitting date strings!!!
#storms%>%
#  mutate(date_str = as.character(date)) %>%
#  separate(date_str, c("year","month","day"), sep = "-")

unite

storms2
unite(storms2, "date", year, month, day, sep = "-")

Recap: tidyr:

  • A package that reshapes the layout of data sets.

  • Make observations from variables with gather()

  • Make variables from observations with spread()

  • Split and merge columns with unite() and separate()

From the data-wrangling-cheatsheet.pdf:

tidy CO2 emissions

Task. Convert the following table of CO2 emissions per country since 1970 from wide to long format and output the first few rows into your R Markdown. I recommend consulting ?gather; you should have 3 columns in your output.

library(readxl) # install.packages('readxl')

url = 'http://edgar.jrc.ec.europa.eu/news_docs/CO2_1970-2014_dataset_of_CO2_report_2015.xls'
xls = '../data/co2_europa.xls'

print(getwd())
## [1] "/Users/jenniferkidson/Documents/Bren Docs/296 Informatics/env-info/students"
if (!file.exists(xls)){
  download.file(url, xls)
}
co2 = read_excel(xls, skip=12)
## DEFINEDNAME: 21 00 00 01 0b 00 00 00 01 00 00 00 00 00 00 0d 3b 00 00 0c 00 e0 00 00 00 2c 00 
## DEFINEDNAME: 21 00 00 01 0b 00 00 00 01 00 00 00 00 00 00 0d 3b 00 00 0c 00 e0 00 00 00 2c 00 
## DEFINEDNAME: 21 00 00 01 0b 00 00 00 01 00 00 00 00 00 00 0d 3b 00 00 0c 00 e0 00 00 00 2c 00 
## DEFINEDNAME: 21 00 00 01 0b 00 00 00 01 00 00 00 00 00 00 0d 3b 00 00 0c 00 e0 00 00 00 2c 00
co2
## Source: local data frame [220 x 46]
## 
##                Country  1970.000000  1971.000000  1972.000000  1973.000000
##                  (chr)        (dbl)        (dbl)        (dbl)        (dbl)
## 1          Afghanistan  1813.981234  1840.794051  1794.831482  1817.997422
## 2              Albania  4435.433037  4425.968296  5035.309230  4638.057137
## 3              Algeria 18850.745443 22226.320843 34278.432478 46375.728534
## 4       American Samoa     6.183023     6.327022     6.462429     6.587474
## 5               Angola  8946.495011  8532.101439 10382.218539 11365.512295
## 6             Anguilla     2.168006     2.168036     2.263426     2.108604
## 7  Antigua and Barbuda   196.161773   203.964395   244.721752   263.402867
## 8            Argentina 87478.401090 87827.313960 88888.066154 93076.186184
## 9              Armenia 10951.012839 10972.887313 11783.384976 12844.589903
## 10               Aruba    40.380782    40.635025    41.138578    41.949184
## ..                 ...          ...          ...          ...          ...
## Variables not shown: 1974.000000 (dbl), 1975.000000 (dbl), 1976.000000
##   (dbl), 1977.000000 (dbl), 1978.000000 (dbl), 1979.000000 (dbl),
##   1980.000000 (dbl), 1981.000000 (dbl), 1982.000000 (dbl), 1983.000000
##   (dbl), 1984.000000 (dbl), 1985.000000 (dbl), 1986.000000 (dbl),
##   1987.000000 (dbl), 1988.000000 (dbl), 1989.000000 (dbl), 1990.000000
##   (dbl), 1991.000000 (dbl), 1992.000000 (dbl), 1993.000000 (dbl),
##   1994.000000 (dbl), 1995.000000 (dbl), 1996.000000 (dbl), 1997.000000
##   (dbl), 1998.000000 (dbl), 1999.000000 (dbl), 2000.000000 (dbl),
##   2001.000000 (dbl), 2002.000000 (dbl), 2003.000000 (dbl), 2004.000000
##   (dbl), 2005.000000 (dbl), 2006.000000 (dbl), 2007.000000 (dbl),
##   2008.000000 (dbl), 2009.000000 (dbl), 2010.000000 (dbl), 2011.000000
##   (dbl), 2012.000000 (dbl), 2013.000000 (dbl), 2014.000000 (dbl)

Question. Why use the skip=12 argument in read_excel()? The first 12 rows of the spreadsheet hold the title and metadata rather than the data table, so skip=12 starts reading at the header row on line 13.
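A minimal sketch of the wide-to-long step the task asks for, assuming the co2 table read in above (a Country column plus one column per year); the output column names 'year' and 'emissions' are my own choice:

library(dplyr) # for %>%
library(tidyr)

# gather every year column into a year/emissions pair, keeping Country
# as the identifier -- 3 columns in the long table
co2 %>%
  gather("year", "emissions", -Country) %>%
  head()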

dplyr

A package that helps transform tabular data

# install.packages("dplyr")
library(dplyr)
?select
?filter
?arrange
?mutate
?group_by
?summarise

See sections in the data-wrangling-cheatsheet.pdf:

select

storms
select(storms, storm, pressure)
storms %>% select(storm, pressure)

filter

storms
filter(storms, wind >= 50)
storms %>% filter(wind >= 50)

storms %>%
  filter(wind >= 50) %>%
  select(storm, pressure)

mutate

storms %>%
  mutate(ratio = pressure / wind) %>%
  select(storm, ratio)

group_by

pollution
pollution %>% group_by(city)

summarise

# by city
pollution %>% 
  group_by(city) %>%
  summarise(
    mean = mean(amount), 
    sum = sum(amount), 
    n = n())

# by size
pollution %>% 
  group_by(size) %>%
  summarise(
    mean = mean(amount), 
    sum = sum(amount), 
    n = n())

Note that summarize() works as a synonym for summarise().

ungroup

pollution %>% 
  group_by(size)     # returns a grouped data frame (note "Groups: size" when printed)

pollution %>% 
  group_by(size) %>%
  ungroup()          # drops the grouping, returning a plain data frame

multiple groups

tb %>%
  group_by(country, year) %>%
  summarise(cases = sum(cases)) %>% # summarise within each country-year group
  summarise(cases = sum(cases))     # each summarise() peels off one grouping level, so this sums within country

Recap: dplyr:

  • Extract columns with select() and rows with filter()

  • Sort rows by column with arrange() (see the quick sketch after this list)

  • Make new columns with mutate()

  • Group rows by column with group_by() and summarise()
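arrange() is the only verb in this recap not demonstrated above; a quick sketch on the storms data, sorting from strongest to weakest wind:

library(dplyr)

# sort storms by descending wind speed
storms %>%
  arrange(desc(wind))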

See sections in the data-wrangling-cheatsheet.pdf:

  • Subset Variables (Columns), eg select()

  • Subset Observations (Rows), eg filter()

  • Reshaping Data - Change the layout of a data set, eg arrange()

  • Make New Variables, eg mutate()

  • Group Data, eg group_by() and summarise()

summarize CO2 emissions

Assignment 4

Task 1. Report the top 5 emitting countries (not World or EU28) for 2014 using your long format table. (You may need to convert your year column from factor to numeric, e.g. mutate(year = as.numeric(as.character(year))). As with most analyses, there are multiple ways to do this. I used the following functions: filter, arrange, desc, head.)

# get rid of extra decimal places in column names
names(co2) = c('country', 1970:2014)
library(dplyr)
library(tidyr)
co2
## Source: local data frame [220 x 46]
## 
##                country         1970         1971         1972         1973
##                  (chr)        (dbl)        (dbl)        (dbl)        (dbl)
## 1          Afghanistan  1813.981234  1840.794051  1794.831482  1817.997422
## 2              Albania  4435.433037  4425.968296  5035.309230  4638.057137
## 3              Algeria 18850.745443 22226.320843 34278.432478 46375.728534
## 4       American Samoa     6.183023     6.327022     6.462429     6.587474
## 5               Angola  8946.495011  8532.101439 10382.218539 11365.512295
## 6             Anguilla     2.168006     2.168036     2.263426     2.108604
## 7  Antigua and Barbuda   196.161773   203.964395   244.721752   263.402867
## 8            Argentina 87478.401090 87827.313960 88888.066154 93076.186184
## 9              Armenia 10951.012839 10972.887313 11783.384976 12844.589903
## 10               Aruba    40.380782    40.635025    41.138578    41.949184
## ..                 ...          ...          ...          ...          ...
## Variables not shown: 1974 (dbl), 1975 (dbl), 1976 (dbl), 1977 (dbl), 1978
##   (dbl), 1979 (dbl), 1980 (dbl), 1981 (dbl), 1982 (dbl), 1983 (dbl), 1984
##   (dbl), 1985 (dbl), 1986 (dbl), 1987 (dbl), 1988 (dbl), 1989 (dbl), 1990
##   (dbl), 1991 (dbl), 1992 (dbl), 1993 (dbl), 1994 (dbl), 1995 (dbl), 1996
##   (dbl), 1997 (dbl), 1998 (dbl), 1999 (dbl), 2000 (dbl), 2001 (dbl), 2002
##   (dbl), 2003 (dbl), 2004 (dbl), 2005 (dbl), 2006 (dbl), 2007 (dbl), 2008
##   (dbl), 2009 (dbl), 2010 (dbl), 2011 (dbl), 2012 (dbl), 2013 (dbl), 2014
##   (dbl)
co2 %>%
  gather("year", "n", 2:46) %>%
  filter(
    year %in% 2014,
    !country %in% c("World", "EU28")) %>% # %in% avoids the recycling trap of != against a length-2 vector
  arrange(desc(n)) %>% 
  head(n=5)
## Source: local data frame [5 x 3]
## 
##                    country  year        n
##                      (chr) (chr)    (dbl)
## 1                    China  2014 10540750
## 2 United States of America  2014  5334530
## 3                    India  2014  2341897
## 4       Russian Federation  2014  1766427
## 5                    Japan  2014  1278922

Task 2. Summarize the total emissions by country (not World or EU28) across years from your long format table and return the top 5 emitting countries. (As with most analyses, there are multiple ways to do this. I used the following functions: filter, arrange, desc, head.)

co2 %>%
  gather("year", "n", 2:46) %>%
  filter(!country %in% c("World", "EU28")) %>% # exclude aggregate rows before grouping
  group_by(country) %>%
  summarise(
    sum = sum(n)) %>%
  arrange(desc(sum)) %>%
  head()
## Source: local data frame [6 x 2]
## 
##                    country       sum
##                      (chr)     (dbl)
## 1 United States of America 231948899
## 2                    China 174045927
## 3       Russian Federation  81242427
## 4                    Japan  51276329
## 5                  Germany  43382205
## 6                    India  39004218


joining data

bind_cols

y = data.frame(
  x1 = c('A','B','C'), 
  x2 = c( 1 , 2 , 3), 
  stringsAsFactors=F)
z = data.frame(
  x1 = c('B','C','D'), 
  x2 = c( 2 , 3 , 4), 
  stringsAsFactors=F)
y
z
bind_cols(y, z)

bind_rows

y
z
bind_rows(y, z)

union

y
z
union(y, z)

intersect

y
z
intersect(y, z)

setdiff

y
z
setdiff(y, z)

left_join

songs = data.frame(
  song = c('Across the Universe','Come Together', 'Hello, Goodbye', 'Peggy Sue'),
  name = c('John','John','Paul','Buddy'), 
  stringsAsFactors=F)
artists = data.frame(
  name = c('George','John','Paul','Ringo'),
  plays = c('sitar','guitar','bass','drums'), 
  stringsAsFactors=F)
left_join(songs, artists, by='name')

inner_join

inner_join(songs, artists, by = "name")

semi_join

semi_join(songs, artists, by = "name")

anti_join

anti_join(songs, artists, by = "name")

summarize per capita CO2 emissions

You’ll join the gapminder datasets to get world population per country.

Task. Report the top 5 emitting countries (not World or EU28) per capita using your long format table. (You may need to convert your year column from factor to numeric, e.g. mutate(year = as.numeric(as.character(year))). As with most analyses, there are multiple ways to do this. I used the following functions: filter, arrange, desc, head.)

Task. Summarize the total per-capita emissions by country (not World or EU28) across years from your long format table and return the top 5 emitting countries. (As with most analyses, there are multiple ways to do this. I used the following functions: filter, arrange, desc, head.)

library(gapminder) # install.packages('gapminder')
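Neither per-capita task is worked out here yet. Below is a minimal sketch of one possible approach, assuming the long-format table is rebuilt as co2_long (a name introduced here) and using each country's most recent gapminder population (2007 for most countries) as a rough denominator. Country names do not always match between the two sources (e.g. 'United States of America' vs. 'United States'), so a real analysis would need to reconcile them first.

library(dplyr)
library(tidyr)
library(gapminder)

# long-format table as in Assignment 4; co2_long is a name introduced here
co2_long = gather(co2, "year", "n", -country)

# most recent population per country in gapminder (1952-2007, 5-year steps)
pop_latest = gapminder %>%
  group_by(country) %>%
  filter(year == max(year)) %>%
  select(country, pop) %>%
  ungroup()

# top 5 per-capita emitters for 2014, excluding World and EU28
co2_long %>%
  filter(year %in% 2014, !country %in% c('World', 'EU28')) %>%
  inner_join(pop_latest, by = 'country') %>%
  mutate(per_capita = n / pop) %>%
  arrange(desc(per_capita)) %>%
  head(5)

# total per-capita emissions across all years, top 5
co2_long %>%
  filter(!country %in% c('World', 'EU28')) %>%
  group_by(country) %>%
  summarise(total = sum(n)) %>%
  inner_join(pop_latest, by = 'country') %>%
  mutate(per_capita = total / pop) %>%
  arrange(desc(per_capita)) %>%
  head(5)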

Assignment 4