# Libraries
library(tidyverse)
# file names
<- "data/raw/"
datadir_raw
<- "data/processed/"
datadir_processed
<- "ASDN_Daily_species.csv"
species_file
<- "ASDN_Snow_survey.csv" snowsurvey_file
Data Cleaning
Snow survey data
Data Exploration
Import the snow survey
# Import the species
<- read_csv(file.path(datadir_raw, snowsurvey_file)) snowsurvey_csv
Rows: 42830 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (10): Site, Date, Plot, Location, Snow_cover, Water_cover, Land_cover, T...
dbl (1): Year
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(snowsurvey_csv)
Rows: 42,830
Columns: 11
$ Site <chr> "barr", "barr", "barr", "barr", "barr", "barr", "barr", "b…
$ Year <dbl> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011…
$ Date <chr> "29-May-11", "29-May-11", "29-May-11", "29-May-11", "29-Ma…
$ Plot <chr> "brw1", "brw1", "brw1", "brw1", "brw1", "brw1", "brw1", "b…
$ Location <chr> "b10", "b12", "b2", "b4", "b6", "b8", "d10", "d12", "d2", …
$ Snow_cover <chr> "90", "100", "90", "100", "95", "95", "95", "90", "95", "9…
$ Water_cover <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "10", "0…
$ Land_cover <chr> "10", "0", "10", "0", "5", "5", "5", "10", "5", "5", "0", …
$ Total_cover <chr> "100", "100", "100", "100", "100", "100", "100", "100", "1…
$ Observer <chr> "adoll", "adoll", "adoll", "adoll", "adoll", "adoll", "ado…
$ Notes <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
Ok, the types are not what we were expecting for the percentages of cover. Let’s find out why:
%>%
snowsurvey_csv count(Snow_cover)
# A tibble: 62 × 2
Snow_cover n
<chr> <int>
1 - 10
2 . 568
3 0 24823
4 1 15
5 1.25 1
6 1.65 1
7 10 2106
8 10.45 1
9 100 3086
10 15 538
# ℹ 52 more rows
Let’s focus on the non-numeric values as a starting point:
%>%
snowsurvey_csv count(Snow_cover) %>%
filter(is.na(as.numeric(Snow_cover)))
Warning: There was 1 warning in `filter()`.
ℹ In argument: `is.na(as.numeric(Snow_cover))`.
Caused by warning:
! NAs introduced by coercion
# A tibble: 6 × 2
Snow_cover n
<chr> <int>
1 - 10
2 . 568
3 <1 1
4 n/a 3
5 unk 1
6 <NA> 214
Data cleaning
Ok, we found our problematic values that are not numeric. There are a non-negligible number of cells with a dot as value. There is no mention of using this symbol in the metadata. We should probably have a look at those rows:
%>%
snowsurvey_csv filter(Snow_cover == ".") %>%
View()
Interestingly, when there is a “dot” for snow cover, it is also the case for all the other covers. Let’s replace them all with NA since there is no supplemental information in the provided metadata
<- snowsurvey_csv %>%
snowsurvey_fixed # filter(Snow_cover == ".") %>%
mutate(across(ends_with("_cover"), ~ifelse(.x == ".", NA, .x)))
We will now tackle the other problematic values:
The problem is similar with “-”, let’s set it to NA
<- snowsurvey_fixed %>%
snowsurvey_fixed # filter(Snow_cover == "-") %>%
mutate(across(ends_with("_cover"), ~ifelse(.x == "-", NA, .x)))
“n/a” is pretty clear regarding how to fix it:
<- snowsurvey_fixed %>%
snowsurvey_fixed mutate(Snow_cover = ifelse(Snow_cover == "n/a", NA, Snow_cover))
“unk” is probably an abbreviation for unknown:
<- snowsurvey_fixed %>%
snowsurvey_fixed mutate(Snow_cover = ifelse(Snow_cover == "unk", NA, Snow_cover))
Finally we will set “<1” as zero (quite arbitrary indeed):
<- snowsurvey_fixed %>%
snowsurvey_fixed mutate(Snow_cover = ifelse(Snow_cover == "<1", "0", Snow_cover))
Now we can test if we now only have NAs as non numeric values in the column:
%>%
snowsurvey_fixed count(Snow_cover) %>%
filter(is.na(as.numeric(Snow_cover)))
# A tibble: 1 × 2
Snow_cover n
<chr> <int>
1 <NA> 796
Ok, we can do the transformation:
<- snowsurvey_fixed %>%
snowsurvey_fixed mutate(Snow_cover = as.numeric(Snow_cover))
Yeah we have finally a numeric column 🎉. Now we can verify that all the values are between 0 and 100:
%>%
snowsurvey_fixed filter(Snow_cover > 100)
# A tibble: 2 × 11
Site Year Date Plot Location Snow_cover Water_cover Land_cover Total_cover
<chr> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
1 barr 2011 31-M… brw3 d10 200 0 -100 100
2 barr 2007 5-Ju… brw1 j6 470 0 0 470
# ℹ 2 more variables: Observer <chr>, Notes <chr>
We have two values above 100, with an interesting 470%! ☃️ We should probavly set those values to NAs:
<- snowsurvey_fixed %>%
snowsurvey_fixed mutate(Snow_cover = ifelse(Snow_cover > 100, NA, Snow_cover))
Let’s check for negative values:
%>%
snowsurvey_fixed filter(Snow_cover < 0)
# A tibble: 0 × 11
# ℹ 11 variables: Site <chr>, Year <dbl>, Date <chr>, Plot <chr>,
# Location <chr>, Snow_cover <dbl>, Water_cover <chr>, Land_cover <chr>,
# Total_cover <chr>, Observer <chr>, Notes <chr>
No negative value detected ✅
Let’s write the presence table to a csv file:
write_csv(snowsurvey_fixed, file.path(datadir_processed, "snow_cover.csv"))
Species data
Data exploration
Import the species csv files with the bird species information:
# Import the species
<- read_csv(paste0(datadir_raw, species_file)) species_csv
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
dat <- vroom(...)
problems(dat)
Rows: 3367 Columns: 357
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (125): Site, Date, All_obs_reported, Aleutian_Tern, American_Bittern, Am...
dbl (232): Year, Jdate, Num_observers, Observer_hours, Alder_Flycatcher, Ame...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
%>%
species_csv select(1:20) %>%
glimpse()
Rows: 3,367
Columns: 20
$ Year <dbl> 2003, 2003, 2003, 2003, 2003, 2…
$ Site <chr> "barr", "barr", "barr", "barr",…
$ Date <chr> "1-Jun-03", "2-Jun-03", "3-Jun-…
$ Jdate <dbl> 152, 153, 154, 155, 156, 157, 1…
$ Num_observers <dbl> NA, NA, NA, NA, NA, NA, NA, NA,…
$ All_obs_reported <chr> "y", "y", "y", "y", "y", "y", "…
$ Observer_hours <dbl> NA, NA, NA, NA, NA, NA, NA, NA,…
$ Alder_Flycatcher <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ Aleutian_Tern <chr> "0", "0", "0", "0", "0", "0", "…
$ American_Bittern <chr> "0", "0", "0", "0", "0", "0", "…
$ American_Black_Duck <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ American_Black_Duck_x_Mallard_.hybrid. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ American_Coot <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ American_Crow <chr> "0", "0", "0", "0", "0", "0", "…
$ American_Golden_Plover <chr> "0", "0", "0", "0", "X", "0", "…
$ American_Kestrel <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ American_Pipit <chr> "0", "0", "0", "0", "0", "0", "…
$ American_Redstart <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ American_Robin <chr> "0", "0", "0", "0", "0", "0", "…
$ American_Tree_Sparrow <chr> "0", "0", "0", "0", "0", "0", "…
This data set is stored in a wide format where each specie has its own column. This means that every time we discover a new species we will have to add a column. In addition, a bunch of 0
are stored in this table but do not really provide any information. According to the metadata:
The number of individuals seen is recorded for each species, except when individuals were not counted but the species was present (typically for very abundant species), an "X" is shown. The remaining columns list the full-name of species (birds and mammals) recorded for at least study site in at least one year.
This data model is not convenient for a database, we will have to switch to a long format.
Data cleaning
<- species_csv %>%
species_long pivot_longer(
cols = !c(Year, Site, Date, Jdate, Num_observers, All_obs_reported, Observer_hours),
names_to = "species",
values_to = "species_count",
values_transform = list(species_count = as.character)
)
%>%
species_long count(species_count) %>%
arrange(desc(n))
# A tibble: 460 × 2
species_count n
<chr> <int>
1 0 1091921
2 1 13798
3 2 12031
4 3 5827
5 4 5712
6 5 4022
7 6 3870
8 10 3003
9 8 2890
10 X 2255
# ℹ 450 more rows
We want to focus on the presence and absence of species and not the count. Let’s create a new column for presence where anything else than 0 is considered present
<- species_long %>%
species_presence mutate(species_presence = ifelse(species_count == "0", 0, 1))
We can remove some columns: “Num_observers”, “All_obs_reported”, “Observer_hours” are here to help to compute the effort of observation but since we just want presence and absence, we do not need it. We can also remove all the zeros values to reduce the size of our data set:
<- species_presence %>%
species_presence filter(species_presence == 1) %>%
select(-c(Num_observers, All_obs_reported, Observer_hours, species_count))
Last but not least, let’s have a look at our species list
%>%
species_presence distinct(species) %>%
arrange(species)
# A tibble: 319 × 1
species
<chr>
1 Alder_Flycatcher
2 Aleutian_Tern
3 American_Bittern
4 American_Black_Duck
5 American_Black_Duck_x_Mallard_.hybrid.
6 American_Crow
7 American_Golden_Plover
8 American_Kestrel
9 American_Pipit
10 American_Robin
# ℹ 309 more rows
We have 319 species observed in this table. The “convention” seems to be that _
are used to separate the different parts of a name. Note that it is not clear what type of nomenclature reference is used to pull those names from.
Let’s write the presence table to a csv file:
write_csv(species_presence, file.path(datadir_processed, "species_presence.csv"))