## function (dir) 
## .Internal(setwd(dir))
## <bytecode: 0x0000000013190bb0>
## <environment: namespace:base>

Content

I am interested in climate impacts at a large landscape scale: how ecosystems and species will adapt, and how we as managers can help improve the resilience of these environments.

Techniques

There are several techniques I am looking forward to learning:

  1. Dealing with large datasets
  2. Using Git to collaborate with other researchers
  3. Visualizing and sharing data analytics

Data

I did make up some data today during class:

I do think the United Nations Environment Programme (UNEP) will have some interesting data, as will NOAA.

Penguins

Here is a photo of a molting, juvenile Gentoo penguin (Pygoscelis papua) on King George Island, Antarctica. I took this photo in February, 2011 while working for NOAA.

Sea Level Rise Data from UNEP

I downloaded data from the United Nations Environment Programme’s Environmental Data Explorer that contains the percent of total land area that is below 5m elevation for each country. Below is a histogram of that data:

# Load the UNEP table of percent land area below 5 m elevation, by country
sea_level_unep_kb <- read.csv(file = "data/knboysen_sea_level_unep.csv")

# Plot how that percentage is distributed across countries
hist(
  sea_level_unep_kb[["Percent_land_under_5m"]],
  main = "Histogram of UNEP Elevation Data",
  xlab = "Percent Land Below 5m",
  col = "blue"
)

This data set includes 237 countries. Below is a table of the countries that are completely (100% of land area) below 5m of elevation:

# Keep only the countries whose entire land area lies below 5 m elevation.
# NOTE(review): the object is still called `subset` so that any later reference
# keeps working; R still finds base::subset() when it is called as a function,
# but a more descriptive name would read better.
subset <- subset(sea_level_unep_kb, Percent_land_under_5m == 100)

# Pick the display columns by name instead of by position (1, 3, 15) so the
# table does not silently change if the column order of the CSV changes.
print(subset[, c("Country.Name", "GEO.Region", "Percent_land_under_5m")])
##     Country.Name     GEO.Region Percent_land_under_5m
## 199     Maldives Asia + Pacific                   100
## 200       Monaco         Europe                   100
## 201  South Sudan         Africa                   100
## 202       Tuvalu Asia + Pacific                   100

The mean percent of land under 5m is 9.7063412%.

Data Wrangling

Install Packages

# Run this chunk only once in your Console
# Do not evaluate when knitting Rmarkdown

# Packages used in the data-wrangling sections
pkgs <- c(
  "readr",        # read csv
  "readxl",       # read xls
  "dplyr",        # data frame manipulation
  "tidyr",        # data tidying
  "nycflights13", # test dataset of NYC flights for 2013
  "gapminder"     # test dataset of life expectancy and population
)

# Install only the packages that are not available yet.
# requireNamespace() checks availability without attaching the package (the
# earlier require() call attached every package as a side effect), and the
# reassignable shorthand `T` is no longer used for TRUE.
is_installed <- vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)
if (!all(is_installed)) {
  install.packages(pkgs[!is_installed])
}

utils::read.csv

Traditionally, you would read a CSV like so:

# Read the species lookup table with base R.
# stringsAsFactors = TRUE is stated explicitly: it was read.csv()'s default
# before R 4.0.0 but the default is FALSE from 4.0.0 on, and the factor-based
# summary shown for this table depends on text columns being read as factors.
d <- read.csv("../data/r-ecology/species.csv", stringsAsFactors = TRUE)
d
##    species_id            genus         species    taxa
## 1          AB       Amphispiza       bilineata    Bird
## 2          AH Ammospermophilus         harrisi  Rodent
## 3          AS       Ammodramus      savannarum    Bird
## 4          BA          Baiomys         taylori  Rodent
## 5          CB  Campylorhynchus brunneicapillus    Bird
## 6          CM      Calamospiza     melanocorys    Bird
## 7          CQ       Callipepla        squamata    Bird
## 8          CS         Crotalus      scutalatus Reptile
## 9          CT    Cnemidophorus          tigris Reptile
## 10         CU    Cnemidophorus       uniparens Reptile
## 11         CV         Crotalus         viridis Reptile
## 12         DM        Dipodomys        merriami  Rodent
## 13         DO        Dipodomys           ordii  Rodent
## 14         DS        Dipodomys     spectabilis  Rodent
## 15         DX        Dipodomys             sp.  Rodent
## 16         EO          Eumeces       obsoletus Reptile
## 17         GS         Gambelia           silus Reptile
## 18         NL          Neotoma        albigula  Rodent
## 19         NX          Neotoma             sp.  Rodent
## 20         OL        Onychomys     leucogaster  Rodent
## 21         OT        Onychomys        torridus  Rodent
## 22         OX        Onychomys             sp.  Rodent
## 23         PB      Chaetodipus         baileyi  Rodent
## 24         PC           Pipilo       chlorurus    Bird
## 25         PE       Peromyscus        eremicus  Rodent
## 26         PF      Perognathus          flavus  Rodent
## 27         PG        Pooecetes       gramineus    Bird
## 28         PH      Perognathus        hispidus  Rodent
## 29         PI      Chaetodipus     intermedius  Rodent
## 30         PL       Peromyscus        leucopus  Rodent
## 31         PM       Peromyscus     maniculatus  Rodent
## 32         PP      Chaetodipus    penicillatus  Rodent
## 33         PU           Pipilo          fuscus    Bird
## 34         PX      Chaetodipus             sp.  Rodent
## 35         RF  Reithrodontomys      fulvescens  Rodent
## 36         RM  Reithrodontomys       megalotis  Rodent
## 37         RO  Reithrodontomys        montanus  Rodent
## 38         RX  Reithrodontomys             sp.  Rodent
## 39         SA       Sylvilagus       audubonii  Rabbit
## 40         SB         Spizella         breweri    Bird
## 41         SC       Sceloporus          clarki Reptile
## 42         SF         Sigmodon     fulviventer  Rodent
## 43         SH         Sigmodon        hispidus  Rodent
## 44         SO         Sigmodon    ochrognathus  Rodent
## 45         SS     Spermophilus       spilosoma  Rodent
## 46         ST     Spermophilus    tereticaudus  Rodent
## 47         SU       Sceloporus       undulatus Reptile
## 48         SX         Sigmodon             sp.  Rodent
## 49         UL           Lizard             sp. Reptile
## 50         UP           Pipilo             sp.    Bird
## 51         UR           Rodent             sp.  Rodent
## 52         US          Sparrow             sp.    Bird
## 53         ZL      Zonotrichia      leucophrys    Bird
## 54         ZM          Zenaida        macroura    Bird
head(d) ## species_id is a factor here (see the level counts printed by summary(d))
##   species_id            genus         species   taxa
## 1         AB       Amphispiza       bilineata   Bird
## 2         AH Ammospermophilus         harrisi Rodent
## 3         AS       Ammodramus      savannarum   Bird
## 4         BA          Baiomys         taylori Rodent
## 5         CB  Campylorhynchus brunneicapillus   Bird
## 6         CM      Calamospiza     melanocorys   Bird
summary(d)
##    species_id             genus         species        taxa   
##  AB     : 1   Chaetodipus    : 4   sp.      :10   Bird   :13  
##  AH     : 1   Dipodomys      : 4   hispidus : 2   Rabbit : 1  
##  AS     : 1   Reithrodontomys: 4   albigula : 1   Reptile: 9  
##  BA     : 1   Sigmodon       : 4   audubonii: 1   Rodent :31  
##  CB     : 1   Onychomys      : 3   baileyi  : 1               
##  CM     : 1   Peromyscus     : 3   bilineata: 1               
##  (Other):48   (Other)        :32   (Other)  :38

readr::read_csv Better yet, try read_csv:

library(readr)

# Same species file, this time parsed with readr's read_csv()
d <- read_csv(file = "../data/r-ecology/species.csv")
d
##    species_id            genus         species    taxa
## 1          AB       Amphispiza       bilineata    Bird
## 2          AH Ammospermophilus         harrisi  Rodent
## 3          AS       Ammodramus      savannarum    Bird
## 4          BA          Baiomys         taylori  Rodent
## 5          CB  Campylorhynchus brunneicapillus    Bird
## 6          CM      Calamospiza     melanocorys    Bird
## 7          CQ       Callipepla        squamata    Bird
## 8          CS         Crotalus      scutalatus Reptile
## 9          CT    Cnemidophorus          tigris Reptile
## 10         CU    Cnemidophorus       uniparens Reptile
## 11         CV         Crotalus         viridis Reptile
## 12         DM        Dipodomys        merriami  Rodent
## 13         DO        Dipodomys           ordii  Rodent
## 14         DS        Dipodomys     spectabilis  Rodent
## 15         DX        Dipodomys             sp.  Rodent
## 16         EO          Eumeces       obsoletus Reptile
## 17         GS         Gambelia           silus Reptile
## 18         NL          Neotoma        albigula  Rodent
## 19         NX          Neotoma             sp.  Rodent
## 20         OL        Onychomys     leucogaster  Rodent
## 21         OT        Onychomys        torridus  Rodent
## 22         OX        Onychomys             sp.  Rodent
## 23         PB      Chaetodipus         baileyi  Rodent
## 24         PC           Pipilo       chlorurus    Bird
## 25         PE       Peromyscus        eremicus  Rodent
## 26         PF      Perognathus          flavus  Rodent
## 27         PG        Pooecetes       gramineus    Bird
## 28         PH      Perognathus        hispidus  Rodent
## 29         PI      Chaetodipus     intermedius  Rodent
## 30         PL       Peromyscus        leucopus  Rodent
## 31         PM       Peromyscus     maniculatus  Rodent
## 32         PP      Chaetodipus    penicillatus  Rodent
## 33         PU           Pipilo          fuscus    Bird
## 34         PX      Chaetodipus             sp.  Rodent
## 35         RF  Reithrodontomys      fulvescens  Rodent
## 36         RM  Reithrodontomys       megalotis  Rodent
## 37         RO  Reithrodontomys        montanus  Rodent
## 38         RX  Reithrodontomys             sp.  Rodent
## 39         SA       Sylvilagus       audubonii  Rabbit
## 40         SB         Spizella         breweri    Bird
## 41         SC       Sceloporus          clarki Reptile
## 42         SF         Sigmodon     fulviventer  Rodent
## 43         SH         Sigmodon        hispidus  Rodent
## 44         SO         Sigmodon    ochrognathus  Rodent
## 45         SS     Spermophilus       spilosoma  Rodent
## 46         ST     Spermophilus    tereticaudus  Rodent
## 47         SU       Sceloporus       undulatus Reptile
## 48         SX         Sigmodon             sp.  Rodent
## 49         UL           Lizard             sp. Reptile
## 50         UP           Pipilo             sp.    Bird
## 51         UR           Rodent             sp.  Rodent
## 52         US          Sparrow             sp.    Bird
## 53         ZL      Zonotrichia      leucophrys    Bird
## 54         ZM          Zenaida        macroura    Bird
head(d) ## now species_id is a "character" column. Why does this matter, BB?!
##   species_id            genus         species   taxa
## 1         AB       Amphispiza       bilineata   Bird
## 2         AH Ammospermophilus         harrisi Rodent
## 3         AS       Ammodramus      savannarum   Bird
## 4         BA          Baiomys         taylori Rodent
## 5         CB  Campylorhynchus brunneicapillus   Bird
## 6         CM      Calamospiza     melanocorys   Bird
summary(d)
##   species_id           genus             species         
##  Length:54          Length:54          Length:54         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##      taxa          
##  Length:54         
##  Class :character  
##  Mode  :character

dplyr::tbl_df Now convert to a dplyr table:

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Read the species table and make sure it is a tibble.
# as_tibble() is used in place of tbl_df(), which dplyr has deprecated.
d <- read_csv("../data/r-ecology/species.csv") %>%
  as_tibble()
d
## Source: local data frame [54 x 4]
## 
##    species_id            genus         species    taxa
##         (chr)            (chr)           (chr)   (chr)
## 1          AB       Amphispiza       bilineata    Bird
## 2          AH Ammospermophilus         harrisi  Rodent
## 3          AS       Ammodramus      savannarum    Bird
## 4          BA          Baiomys         taylori  Rodent
## 5          CB  Campylorhynchus brunneicapillus    Bird
## 6          CM      Calamospiza     melanocorys    Bird
## 7          CQ       Callipepla        squamata    Bird
## 8          CS         Crotalus      scutalatus Reptile
## 9          CT    Cnemidophorus          tigris Reptile
## 10         CU    Cnemidophorus       uniparens Reptile
## ..        ...              ...             ...     ...
head(d)
## Source: local data frame [6 x 4]
## 
##   species_id            genus         species   taxa
##        (chr)            (chr)           (chr)  (chr)
## 1         AB       Amphispiza       bilineata   Bird
## 2         AH Ammospermophilus         harrisi Rodent
## 3         AS       Ammodramus      savannarum   Bird
## 4         BA          Baiomys         taylori Rodent
## 5         CB  Campylorhynchus brunneicapillus   Bird
## 6         CM      Calamospiza     melanocorys   Bird
summary(d)
##   species_id           genus             species         
##  Length:54          Length:54          Length:54         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##      taxa          
##  Length:54         
##  Class :character  
##  Mode  :character
glimpse(d)
## Observations: 54
## Variables: 4
## $ species_id (chr) "AB", "AH", "AS", "BA", "CB", "CM", "CQ", "CS", "CT...
## $ genus      (chr) "Amphispiza", "Ammospermophilus", "Ammodramus", "Ba...
## $ species    (chr) "bilineata", "harrisi", "savannarum", "taylori", "b...
## $ taxa       (chr) "Bird", "Rodent", "Bird", "Rodent", "Bird", "Bird",...
# Yearly record counts for species "AB" in the surveys table.
# The file is read through the same relative path the other chunks in this
# document use, instead of an absolute path under one user's home directory.
# NOTE(review): assumes this chunk is knitted from the same working directory
# as the other chunks that read '../data/r-ecology/...' - confirm.
b <- read_csv("../data/r-ecology/surveys.csv") %>%
  select(species_id, year) %>%
  filter(species_id == "AB") %>%
  group_by(species_id, year) %>%
  summarize(count = n())

elegance with dplyr

week 3 individual assignment

library(readr)
library(dplyr)
library(magrittr) # provides the tee operator %T>%; dplyr only re-exports %>%

# read in csv
surveys <- read_csv("../data/r-ecology/surveys.csv")

surveys %T>% ## tee operator: runs the next step for its side effect (printing/plotting) and passes the original data on
  glimpse() %>%
  select(species_id, year) %>% # keep only the columns needed
  filter(species_id == "NL") %>% ## keep only the rows for species NL
  group_by(year) %>%
  summarize(n = n()) %T>% ## number of NL records per year
  glimpse() %>% ## view the table before writing it; 31 "NL" records in 1977
  write_csv("data/surveys_kboysen.csv") ## write the yearly counts to csv
## Observations: 35,549
## Variables: 9
## $ record_id       (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...
## $ month           (int) 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7...
## $ day             (int) 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16...
## $ year            (int) 1977, 1977, 1977, 1977, 1977, 1977, 1977, 1977...
## $ plot_id         (int) 2, 3, 2, 7, 3, 1, 2, 1, 1, 6, 5, 7, 3, 8, 6, 4...
## $ species_id      (chr) "NL", "NL", "DM", "DM", "DM", "PF", "PE", "DM"...
## $ sex             (chr) "M", "M", "F", "M", "M", "M", "F", "M", "F", "...
## $ hindfoot_length (int) 32, 33, 37, 36, 35, 14, NA, 37, 34, 20, 53, 38...
## $ weight          (int) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Observations: 26
## Variables: 2
## $ year (int) 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 198...
## $ n    (int) 31, 48, 30, 57, 63, 111, 98, 64, 45, 60, 128, 102, 67, 29...

Wrangling Webinar

Piping

## Skipping install for github remote, the SHA1 (2652ea64) has not changed since last install.
##   Use `force = TRUE` to force installation
## 
## Attaching package: 'EDAWR'
## The following object is masked _by_ '.GlobalEnv':
## 
##     b
library(tidyr)
library(dplyr)

select(tb, child:elderly) ## versus the piped form of the same call
## Source: local data frame [3,800 x 3]
## 
##    child adult elderly
##    (int) (int)   (int)
## 1     NA    NA      NA
## 2     NA    NA      NA
## 3     NA    NA      NA
## 4     NA    NA      NA
## 5      5    96       1
## 6      0    26       0
## 7     45  1142      20
## 8     30   500      41
## 9     25   484       8
## 10     8   212       8
## ..   ...   ...     ...
tb %>% select(child:elderly)
## Source: local data frame [3,800 x 3]
## 
##    child adult elderly
##    (int) (int)   (int)
## 1     NA    NA      NA
## 2     NA    NA      NA
## 3     NA    NA      NA
## 4     NA    NA      NA
## 5      5    96       1
## 6      0    26       0
## 7     45  1142      20
## 8     30   500      41
## 9     25   484       8
## 10     8   212       8
## ..   ...   ...     ...
#yay piping

tidyr

library(tidyr)
## cases: a dataset about TB

## Reshape `cases` from wide to long: columns 2-4 are collapsed into a
## "year" key column and an "n" value column.
gather(cases, key = "year", value = "n", 2:4)
##   country year     n
## 1      FR 2011  7000
## 2      DE 2011  5800
## 3      US 2011 15000
## 4      FR 2012  6900
## 5      DE 2012  6000
## 6      US 2012 14000
## 7      FR 2013  7000
## 8      DE 2013  6200
## 9      US 2013 13000
##the opposite function is "spread"
#use "pollution" dataset

spread(pollution, size, amount)
##       city large small
## 1  Beijing   121    56
## 2   London    22    16
## 3 New York    23    14
#creates a new columns for each size (large & small), and populates it with the "amount"


## separate: split each storm's date into year / month / day at the dashes
storms2 <- separate(
  storms,
  col = date,
  into = c("year", "month", "day"),
  sep = "-"
)

## unite: glue the three pieces back together into one "date" column
unite(storms2, col = "date", year, month, day, sep = "-")
## Source: local data frame [6 x 4]
## 
##     storm  wind pressure       date
##     (chr) (int)    (int)      (chr)
## 1 Alberto   110     1007 2000-08-03
## 2    Alex    45     1009 1998-07-27
## 3 Allison    65     1005 1995-06-03
## 4     Ana    40     1013 1997-06-30
## 5  Arlene    50     1010 1999-06-11
## 6  Arthur    45     1010 1996-06-17

Dplyr

library(nycflights13)

### dplyr verbs for accessing data
## select()    extracts variables
## filter()    extracts existing observations
## mutate()    derives new variables
## summarise() changes the unit of analysis

## helpers for choosing columns:
##   -  selects everything but
##   :  selects a range
##   contains(), ends_with(), everything(), matches(), num_range(), one_of(), starts_with()

## storms with wind of at least 50 that are also in the named set
storms %>%
  filter(wind >= 50 & storm %in% c("Alberto", "Alex", "Allison"))
## Source: local data frame [2 x 4]
## 
##     storm  wind pressure       date
##     (chr) (int)    (int)     (date)
## 1 Alberto   110     1007 2000-08-03
## 2 Allison    65     1005 1995-06-03
#mutate

mutate(storms, ratio =pressure/wind)
## Source: local data frame [6 x 5]
## 
##     storm  wind pressure       date     ratio
##     (chr) (int)    (int)     (date)     (dbl)
## 1 Alberto   110     1007 2000-08-03  9.154545
## 2    Alex    45     1009 1998-07-27 22.422222
## 3 Allison    65     1005 1995-06-03 15.461538
## 4     Ana    40     1013 1997-06-30 25.325000
## 5  Arlene    50     1010 1999-06-11 20.200000
## 6  Arthur    45     1010 1996-06-17 22.444444
mutate(storms, ratio =pressure/wind, inverse= ratio^-1)
## Source: local data frame [6 x 6]
## 
##     storm  wind pressure       date     ratio    inverse
##     (chr) (int)    (int)     (date)     (dbl)      (dbl)
## 1 Alberto   110     1007 2000-08-03  9.154545 0.10923535
## 2    Alex    45     1009 1998-07-27 22.422222 0.04459861
## 3 Allison    65     1005 1995-06-03 15.461538 0.06467662
## 4     Ana    40     1013 1997-06-30 25.325000 0.03948667
## 5  Arlene    50     1010 1999-06-11 20.200000 0.04950495
## 6  Arthur    45     1010 1996-06-17 22.444444 0.04455446
#useful mutate functions

mutate(storms, cummean(pressure)) ##creates new column with cummulative mean
## Source: local data frame [6 x 5]
## 
##     storm  wind pressure       date cummean(pressure)
##     (chr) (int)    (int)     (date)             (dbl)
## 1 Alberto   110     1007 2000-08-03            1007.0
## 2    Alex    45     1009 1998-07-27            1008.0
## 3 Allison    65     1005 1995-06-03            1007.0
## 4     Ana    40     1013 1997-06-30            1008.5
## 5  Arlene    50     1010 1999-06-11            1008.8
## 6  Arthur    45     1010 1996-06-17            1009.0
mutate(storms, percent_rank(wind)) ##ranks them 0-1
## Source: local data frame [6 x 5]
## 
##     storm  wind pressure       date percent_rank(wind)
##     (chr) (int)    (int)     (date)              (dbl)
## 1 Alberto   110     1007 2000-08-03                1.0
## 2    Alex    45     1009 1998-07-27                0.2
## 3 Allison    65     1005 1995-06-03                0.8
## 4     Ana    40     1013 1997-06-30                0.0
## 5  Arlene    50     1010 1999-06-11                0.6
## 6  Arthur    45     1010 1996-06-17                0.2
mutate(storms, dense_rank(wind)) ## ranks from 1 (lowest wind) with no gaps; tied values share a rank
## Source: local data frame [6 x 5]
## 
##     storm  wind pressure       date dense_rank(wind)
##     (chr) (int)    (int)     (date)            (int)
## 1 Alberto   110     1007 2000-08-03                5
## 2    Alex    45     1009 1998-07-27                2
## 3 Allison    65     1005 1995-06-03                4
## 4     Ana    40     1013 1997-06-30                1
## 5  Arlene    50     1010 1999-06-11                3
## 6  Arthur    45     1010 1996-06-17                2
mutate(storms, min_rank(wind))
## Source: local data frame [6 x 5]
## 
##     storm  wind pressure       date min_rank(wind)
##     (chr) (int)    (int)     (date)          (int)
## 1 Alberto   110     1007 2000-08-03              6
## 2    Alex    45     1009 1998-07-27              2
## 3 Allison    65     1005 1995-06-03              5
## 4     Ana    40     1013 1997-06-30              1
## 5  Arlene    50     1010 1999-06-11              4
## 6  Arthur    45     1010 1996-06-17              2
##window functions
## i don't get these


##summarise

pollution %>% summarise(median=median(amount), variance=var(amount))
##   median variance
## 1   22.5   1731.6
pollution %>% summarise(mean= mean(amount), sum= sum(amount), n=n())
##   mean sum n
## 1   42 252 6
##arrange() sorts data frame

##desc makes it descending. default is ascending

##piping again

storms %>% filter(wind >=50) 
## Source: local data frame [3 x 4]
## 
##     storm  wind pressure       date
##     (chr) (int)    (int)     (date)
## 1 Alberto   110     1007 2000-08-03
## 2 Allison    65     1005 1995-06-03
## 3  Arlene    50     1010 1999-06-11
#vs
filter(storms, wind>=50)
## Source: local data frame [3 x 4]
## 
##     storm  wind pressure       date
##     (chr) (int)    (int)     (date)
## 1 Alberto   110     1007 2000-08-03
## 2 Allison    65     1005 1995-06-03
## 3  Arlene    50     1010 1999-06-11
storms %>%
  mutate(ratio = pressure/wind) %>%
  select(storm, ratio)
## Source: local data frame [6 x 2]
## 
##     storm     ratio
##     (chr)     (dbl)
## 1 Alberto  9.154545
## 2    Alex 22.422222
## 3 Allison 15.461538
## 4     Ana 25.325000
## 5  Arlene 20.200000
## 6  Arthur 22.444444
#ctrl shift m!

# %>% 

Unit of analysis

#group by 
pollution %>%  group_by(city)
## Source: local data frame [6 x 3]
## Groups: city [3]
## 
##       city  size amount
##      (chr) (chr)  (dbl)
## 1 New York large     23
## 2 New York small     14
## 3   London large     22
## 4   London small     16
## 5  Beijing large    121
## 6  Beijing small     56
pollution %>%  group_by(city) %>% summarise(mean= mean(amount), sum= sum(amount), n=n())
## Source: local data frame [3 x 4]
## 
##       city  mean   sum     n
##      (chr) (dbl) (dbl) (int)
## 1  Beijing  88.5   177     2
## 2   London  19.0    38     2
## 3 New York  18.5    37     2
pollution %>%  group_by(size) %>% summarise(mean=mean(amount))
## Source: local data frame [2 x 2]
## 
##    size     mean
##   (chr)    (dbl)
## 1 large 55.33333
## 2 small 28.66667
pollution %>%  ungroup()
##       city  size amount
## 1 New York large     23
## 2 New York small     14
## 3   London large     22
## 4   London small     16
## 5  Beijing large    121
## 6  Beijing small     56
##tb example

joining data

# Two small example tables for the binding and set-operation demos.
# Both columns end up as character: stringsAsFactors = FALSE keeps x1 as text
# without a factor round-trip, and x2 is converted with as.character().
y <- data.frame(
  x1 = c("a", "b", "c"),
  x2 = as.character(c(1, 2, 3)),
  stringsAsFactors = FALSE
)
z <- data.frame(
  x1 = c("b", "c", "d"),
  x2 = as.character(c(2, 3, 4)),
  stringsAsFactors = FALSE
)

## is this the best way to change factors to characters?
## are we losing info by changing the numbers (x2) into characters?
##bind_cols
bind_cols(y,z)
## Source: local data frame [3 x 4]
## 
##      x1    x2    x1    x2
##   (chr) (chr) (chr) (chr)
## 1     a     1     b     2
## 2     b     2     c     3
## 3     c     3     d     4
##bind_rows
bind_rows(y,z)
## Source: local data frame [6 x 2]
## 
##      x1    x2
##   (chr) (chr)
## 1     a     1
## 2     b     2
## 3     c     3
## 4     b     2
## 5     c     3
## 6     d     4
##Union
union(y,z)
##   x1 x2
## 1  a  1
## 2  b  2
## 3  c  3
## 4  d  4
##intersect
intersect(y,z)
##   x1 x2
## 1  b  2
## 2  c  3
##setdiff
setdiff(y,z)
##   x1 x2
## 1  a  1
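##d doesn't show up here because setdiff(y, z) keeps only the rows of y that are missing from z; d is only in z, so setdiff(z, y) would show it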

left_join(songs, artists, by="name")
##                  song  name  plays
## 1 Across the Universe  John guitar
## 2       Come Together  John guitar
## 3      Hello, Goodbye  Paul   bass
## 4           Peggy Sue Buddy   <NA>
inner_join(songs, artists, by="name")
##                  song name  plays
## 1 Across the Universe John guitar
## 2       Come Together John guitar
## 3      Hello, Goodbye Paul   bass

Week 04– Tidyr

EDAWR

# One-time setup: EDAWR is installed from GitHub via devtools
# install.packages("devtools")
# devtools::install_github("rstudio/EDAWR")
library(EDAWR)
help(package='EDAWR')
?storms    # wind speed data for 6 hurricanes
?cases     # subset of WHO tuberculosis
?pollution # pollution data from WHO Ambient Air Pollution, 2014
?tb        # tuberculosis data
# Interactive viewers, left commented out
#View(storms)
#View(cases)
#View(pollution)

slicing

# storms: pull out each column as a vector
storms[["storm"]]
storms[["wind"]]
storms[["pressure"]]
storms[["date"]]

# cases
cases[["country"]]       # the country column
names(cases)[-1]         # every column name except the first
unlist(cases[1:3, 2:4])  # rows 1-3 of columns 2-4, flattened into one vector

# pollution: rows 1, 3, 5 and then rows 2, 4, 6
pollution[["city"]][c(1, 3, 5)]
pollution[["amount"]][c(1, 3, 5)]
pollution[["amount"]][c(2, 4, 6)]
## a single equals sign (=) sets a value; a double equals sign (==) tests for
## equality, and != (used in the filter here) tests for inequality
pollution %>%
  filter(city != "New York")

# ratio of pressure to wind for every storm
with(storms, pressure / wind)

tidyr

Two main functions: gather() and spread()

# install.packages("tidyr")
library(tidyr)
?gather # gather to long
?spread # spread to wide

gather

cases
## Both calls collapse the year columns into "year"/"n" pairs: the first picks
## them by position (2:4), the second by excluding `country`.
gather(cases, key = "year", value = "n", 2:4)
gather(cases, key = "year", value = "n", -country)

## keep 2011 and 2013 for FR and US only
cases %>%
  gather(key = "year", value = "n", -country) %>%
  filter(year %in% c(2011, 2013) & country %in% c("FR", "US"))

## `!` negates the condition: same years, every country that is NOT FR or US :-)
cases %>%
  gather(key = "year", value = "n", -country) %>%
  filter(year %in% c(2011, 2013) & !country %in% c("FR", "US"))

spread

pollution
spread(pollution, size, amount)
## arguments: the data frame, the column to use as keys (its values become the new column names), and `amount`, which fills the cells

Other functions to extract and combine columns…

separate

storms
# Split the date into its parts at the "-" separators.
storms2 <- separate(storms, date, c("year", "month", "day"), sep = "-")

# The date shown as a plain string
storms %>%
  mutate(date_str = as.character(date))

# Split by character position instead of by separator. Positions 4 and 6 only
# line up with year/month/day when the string has no dashes ("20000803"), so
# the date is formatted as %Y%m%d first; splitting "2000-08-03" at 4 and 6
# would give "2000", "-0" and "8-03".
storms3 <- storms %>%
  mutate(date = format(date, "%Y%m%d")) %>%
  separate(date, c("year", "month", "day"), sep = c(4, 6))

unite

storms2
# Rejoin the year, month and day columns into a single "date" column
storms2 %>%
  unite(col = "date", year, month, day, sep = "-")

Recap: tidyr:

  • A package that reshapes the layout of data sets.

  • Make observations from variables with gather(). Make variables from observations with spread().

  • Split and merge columns with unite() and separate()

From the data-wrangling-cheatsheet.pdf:

tidy CO2 emissions

Assignment 4

Task. Convert the following table CO2 emissions per country since 1970 from wide to long format and output the first few rows into your Rmarkdown. I recommend consulting ?gather and you should have 3 columns in your output.

library(readxl) # install.packages('readxl')

url <- "http://edgar.jrc.ec.europa.eu/news_docs/CO2_1970-2014_dataset_of_CO2_report_2015.xls"
xls <- "../data/co2_europa.xls"

print(getwd())
# Download the workbook only when there is no local copy yet.
# mode = "wb" writes the file in binary mode; without it a binary .xls can be
# corrupted on Windows. The download method is left at its default so that R
# picks one that works on the current platform.
if (!file.exists(xls)) {
  download.file(url, xls, mode = "wb")
}
# skip = 12 skips the first 12 rows of the sheet before reading the table
co2 <- read_excel(xls, skip = 12)
co2

Question. Why use skip=12 argument in read_excel()? The skip=12 argument is used in read_excel() because the first 12 rows of the Excel file are metadata – titles, information about where the data came from, etc.

co2 #%>% 
##         Jan    Feb    Mar    Apr    May    Jun    Jul    Aug    Sep    Oct
## 1959 315.42 316.31 316.50 317.56 318.13 318.00 316.39 314.65 313.68 313.18
## 1960 316.27 316.81 317.42 318.87 319.87 319.43 318.01 315.74 314.00 313.68
## 1961 316.73 317.54 318.38 319.31 320.42 319.61 318.42 316.63 314.83 315.16
## 1962 317.78 318.40 319.53 320.42 320.85 320.45 319.45 317.25 316.11 315.27
## 1963 318.58 318.92 319.70 321.22 322.08 321.31 319.58 317.61 316.05 315.83
## 1964 319.41 320.07 320.74 321.40 322.06 321.73 320.27 318.54 316.54 316.71
## 1965 319.27 320.28 320.73 321.97 322.00 321.71 321.05 318.71 317.66 317.14
## 1966 320.46 321.43 322.23 323.54 323.91 323.59 322.24 320.20 318.48 317.94
## 1967 322.17 322.34 322.88 324.25 324.83 323.93 322.38 320.76 319.10 319.24
## 1968 322.40 322.99 323.73 324.86 325.40 325.20 323.98 321.95 320.18 320.09
## 1969 323.83 324.26 325.47 326.50 327.21 326.54 325.72 323.50 322.22 321.62
## 1970 324.89 325.82 326.77 327.97 327.91 327.50 326.18 324.53 322.93 322.90
## 1971 326.01 326.51 327.01 327.62 328.76 328.40 327.20 325.27 323.20 323.40
## 1972 326.60 327.47 327.58 329.56 329.90 328.92 327.88 326.16 324.68 325.04
## 1973 328.37 329.40 330.14 331.33 332.31 331.90 330.70 329.15 327.35 327.02
## 1974 329.18 330.55 331.32 332.48 332.92 332.08 331.01 329.23 327.27 327.21
## 1975 330.23 331.25 331.87 333.14 333.80 333.43 331.73 329.90 328.40 328.17
## 1976 331.58 332.39 333.33 334.41 334.71 334.17 332.89 330.77 329.14 328.78
## 1977 332.75 333.24 334.53 335.90 336.57 336.10 334.76 332.59 331.42 330.98
## 1978 334.80 335.22 336.47 337.59 337.84 337.72 336.37 334.51 332.60 332.38
## 1979 336.05 336.59 337.79 338.71 339.30 339.12 337.56 335.92 333.75 333.70
## 1980 337.84 338.19 339.91 340.60 341.29 341.00 339.39 337.43 335.72 335.84
## 1981 339.06 340.30 341.21 342.33 342.74 342.08 340.32 338.26 336.52 336.68
## 1982 340.57 341.44 342.53 343.39 343.96 343.18 341.88 339.65 337.81 337.69
## 1983 341.20 342.35 342.93 344.77 345.58 345.14 343.81 342.21 339.69 339.82
## 1984 343.52 344.33 345.11 346.88 347.25 346.62 345.22 343.11 340.90 341.18
## 1985 344.79 345.82 347.25 348.17 348.74 348.07 346.38 344.51 342.92 342.62
## 1986 346.11 346.78 347.68 349.37 350.03 349.37 347.76 345.73 344.68 343.99
## 1987 347.84 348.29 349.23 350.80 351.66 351.07 349.33 347.92 346.27 346.18
## 1988 350.25 351.54 352.05 353.41 354.04 353.62 352.22 350.27 348.55 348.72
## 1989 352.60 352.92 353.53 355.26 355.52 354.97 353.75 351.52 349.64 349.83
## 1990 353.50 354.55 355.23 356.04 357.00 356.07 354.67 352.76 350.82 351.04
## 1991 354.59 355.63 357.03 358.48 359.22 358.12 356.06 353.92 352.05 352.11
## 1992 355.88 356.63 357.72 359.07 359.58 359.17 356.94 354.92 352.94 353.23
## 1993 356.63 357.10 358.32 359.41 360.23 359.55 357.53 355.48 353.67 353.95
## 1994 358.34 358.89 359.95 361.25 361.67 360.94 359.55 357.49 355.84 356.00
## 1995 359.98 361.03 361.66 363.48 363.82 363.30 361.94 359.50 358.11 357.80
## 1996 362.09 363.29 364.06 364.76 365.45 365.01 363.70 361.54 359.51 359.65
## 1997 363.23 364.06 364.61 366.40 366.84 365.68 364.52 362.57 360.24 360.83
##         Nov    Dec
## 1959 314.66 315.43
## 1960 314.84 316.03
## 1961 315.94 316.85
## 1962 316.53 317.53
## 1963 316.91 318.20
## 1964 317.53 318.55
## 1965 318.70 319.25
## 1966 319.63 320.87
## 1967 320.56 321.80
## 1968 321.16 322.74
## 1969 322.69 323.95
## 1970 323.85 324.96
## 1971 324.63 325.85
## 1972 326.34 327.39
## 1973 327.99 328.48
## 1974 328.29 329.41
## 1975 329.32 330.59
## 1976 330.14 331.52
## 1977 332.24 333.68
## 1978 333.75 334.78
## 1979 335.12 336.56
## 1980 336.93 338.04
## 1981 338.19 339.44
## 1982 339.09 340.32
## 1983 340.98 342.82
## 1984 342.80 344.04
## 1985 344.06 345.38
## 1986 345.48 346.72
## 1987 347.64 348.78
## 1988 349.91 351.18
## 1989 351.14 352.37
## 1990 352.69 354.07
## 1991 353.64 354.89
## 1992 354.09 355.33
## 1993 355.30 356.78
## 1994 357.59 359.05
## 1995 359.61 360.74
## 1996 360.80 362.38
## 1997 362.49 364.34
  #gather("Year", "CO2", 2:46) #%>% 
  #head()

dplyr

A package that helps transform tabular data

# install.packages("dplyr")
library(dplyr)
# Help pages for the main dplyr verbs
?select
?filter
?arrange
?mutate
?group_by
?summarise
# NOTE(review): the draft below is commented out and would not give per-country
# totals as written: sum(2:46) adds the integers 2 through 46, not the values
# stored in columns 2-46.
#co2 %>% 
 # mutate(total_emissions= sum(2:46)) %>% 
  #select(Country, total_emissions)

summarize CO2 emissions

Task. Report the top 5 emitting countries (not World or EU28) for 2014 using your long format table. (You may need to convert your year column from factor to numeric, eg mutate(year = as.numeric(as.character(year))). As with most analyses, there are multiple ways to do this. I used the following functions: filter, arrange, desc, head).

#co2 %>% 
 # mutate(year = as.numeric(as.character(year)))

Task. Summarize the total emissions by country (not World or EU28) across years from your long format table and return the top 5 emitting countries. (As with most analyses, there are multiple ways to do this. I used the following functions: filter, arrange, desc, head).