This vignette showcases a use case for the two main functions of metajam, `download_d1_data` and `read_d1_files`, based on a data processing workflow developed by the NCO synthesis working group Stream Elemental Cycling.
The datasets used are from the LTER site Luquillo and can be found in the PASTA data repository: http://dx.doi.org/doi:10.6073/pasta/f9df56348f510da0113b1e6012fa2967. This data package is a collection of 8 datasets of stream water samples from 8 different locations in the Luquillo Mountains.
Our goal is to read the data for the 8 different sampling sites and aggregate them into one harmonized dataset. We will use the metadata to check that the data structures and units are the same across the 8 different sampling sites before performing the aggregation.
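As a preview of the pattern used throughout this vignette: `download_d1_data` downloads a data file and its metadata into a local folder, and `read_d1_files` reads that folder back in as a named list of data frames. A minimal sketch (the URL below is a placeholder, not a real identifier):
# Minimal usage pattern (sketch only; replace the placeholder URL with a real data file URL)
# data_path <- download_d1_data(data_url = "https://...", path = "Data_SEC")
# my_dataset <- read_d1_files(data_path)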
# Uncomment the line below to install metajam from GitHub
# devtools::install_github("NCEAS/metajam")
library(metajam)
# For unit conversions
library(udunits2)
# For wrangling the data
library(readr)
library(tidyr)
library(dplyr)
library(purrr)
library(stringr)
# Download the data from DataONE on your local machine
data_folder <- "Data_SEC"

# Ammonium to ammoniacal-nitrogen conversion. We will use this conversion later.
coeff_conv_NH4_to_NH4N <- 0.7764676534

# Create the local directory to store datasets
dir.create(data_folder, showWarnings = FALSE)
# Get the datasets' unique identifiers
test_datasets_listing <- read_csv(system.file("extdata", "LTER-SEC_DatasetsListing_SearchedData.csv", package = "metajam"))

# Keep only the LUQ related datasets
luq_test_datasets <- test_datasets_listing %>%
  filter(grepl("LUQ", .$`LTER site abbreviation`)) %>%
  select(`LTER site abbreviation`,
         `Data Repository (PASTA) URL to Archive/Metadata`,
         `Data Repository (PASTA) URL to File`,
         `Data Repository (PASTA) Filename`) %>%
  na.omit() %>%
  arrange(`Data Repository (PASTA) Filename`) # sort the datasets alphabetically
## Batch download the datasets

# the tidiest way
local_datasets <- map(luq_test_datasets$`Data Repository (PASTA) URL to File`, ~download_d1_data(.x, data_folder))

# the apply way
# local_datasets <- lapply(luq_test_datasets$`Data Repository (PASTA) URL to File`, download_d1_data, data_folder)

# the map way
# local_datasets <- map(luq_test_datasets$`Data Repository (PASTA) URL to File`, function(x) {download_d1_data(x, data_folder)})
At this point, you should have all the data and metadata downloaded inside your main directory (`Data_SEC` in this example). metajam organizes the files as follows, with one folder per dataset containing:

- `my_data.csv`: the data file
- `my_data__full_metadata.xml`: the full metadata in EML format
- `my_data__summary_metadata.csv`: a summary of the most relevant metadata
- `my_data__attribute_metadata.csv`: the attribute-level metadata
- `my_data__attribute_factor_metadata.csv`: the attribute-level metadata for factors (categorical attributes)
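You can check this layout yourself; for example (a sketch, assuming the batch download above succeeded), list the files metajam created for the first dataset:
# List the files metajam created for the first downloaded dataset
dir(local_datasets[[1]])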
# You could list the datasets downloaded in the `Data_SEC` folder
# local_datasets <- dir(data_folder, full.names = TRUE)

# or you can directly use the paths returned by download_d1_data
# Read all the datasets and their associated metadata in as a named list
luq_datasets <- map(local_datasets, read_d1_files) %>%
  set_names(map(., ~.x$summary_metadata$value[.x$summary_metadata$name == "File_Name"]))
Is the data structure the same across sampling sites (datasets)? For example, do the datasets all have the same column names?
# List all the attributes
attributes_luq <- luq_datasets %>% map("data") %>% map(colnames)
# Check if they are identical by comparing all against the first site
for(ds in names(attributes_luq)) {
print(identical(attributes_luq[[1]], attributes_luq[[ds]]))
}
#> => We are good, same data structure across the sampling sites
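An equivalent one-liner (a sketch relying on purrr, which is already loaded) collapses the same comparison into a single logical:
# TRUE if every site's column names are identical to the first site's
all(map_lgl(attributes_luq, identical, attributes_luq[[1]]))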
Is the data reported in identical units? For example, is Cl reported in microgramsPerLiter in every dataset?
# List all the units used
luq_units <- luq_datasets %>% map("attribute_metadata") %>% map(~.[["unit"]])
# Check if they are identical by comparing all against the first site
for(us in names(luq_units)) {
print(identical(luq_units[[1]], luq_units[[us]]))
}
#> !!! => The last 2 datasets use different units !!!
# Let's check the differences
luq_units_merged <- luq_datasets %>%
  map("attribute_metadata") %>%
  map(. %>% select(attributeName, unit)) %>%
  reduce(full_join, by = "attributeName")
## Rename the unit columns after the sites

# Create the new names
luq_new_colnames <- names(luq_units) %>%
  str_split("[.]") %>%
  map(~.[1]) %>%
  paste("unit", ., sep = "_")

# Apply the new names
colnames(luq_units_merged) <- c("attributeName", luq_new_colnames)
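With the units side by side, the mismatches can also be surfaced programmatically rather than by eye. A sketch, assuming dplyr >= 1.0 (for c_across):
# Keep only the attributes whose units differ across sites
luq_units_merged %>%
  rowwise() %>%
  filter(n_distinct(c_across(starts_with("unit_")), na.rm = TRUE) > 1) %>%
  ungroup()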
Checking the differences, we find that:

- For RioIcacos and RioMameyesPuenteRoto, the units used for the gage height ("Gage_Ht") are feet and not meters like the other sites.
- For RioIcacos and RioMameyesPuenteRoto, NH4 and not NH4-N is measured.

# Fix attribute naming discrepancies -- to be improved
# Copy the units for gage height
luq_units_merged <- luq_units_merged %>%
  mutate(unit_RioIcacos = ifelse(attributeName == "Gage_Ht", "foot", unit_RioIcacos),
         unit_RioMameyesPuenteRoto = ifelse(attributeName == "Gage_Ht", "foot", unit_RioMameyesPuenteRoto))

# Copy the units for NH4
luq_units_merged <- luq_units_merged %>%
  mutate(unit_RioIcacos = ifelse(attributeName == "NH4-N", "microgramsPerLiter", unit_RioIcacos),
         unit_RioMameyesPuenteRoto = ifelse(attributeName == "NH4-N", "microgramsPerLiter", unit_RioMameyesPuenteRoto))

# Drop the last 2 rows
luq_units_merged <- head(luq_units_merged, -2)
### Implement the unit conversion for RioIcacos and RioMameyesPuenteRoto ----

## RioIcacos

# Simplify naming
RioIcacos_data <- luq_datasets$RioIcacos$data
RioIcacos_attrmeta <- luq_datasets$RioIcacos$attribute_metadata

# Fix NAs. In this dataset "-9999" is the missing value code, so we need to replace those values with NAs
RioIcacos_data <- na_if(RioIcacos_data, "-9999")
# Do the unit conversion
# Check whether the udunits2 package can convert feet to meters
ud.are.convertible("foot", "meter")
# TRUE, so we can use udunits2 to do the conversion

# Do the unit conversion in both the data file and the attributes file - gage height - the udunits2 way
RioIcacos_data$Gage_Ht <- ud.convert(RioIcacos_data$Gage_Ht, "foot", "meter")
RioIcacos_attrmeta$unit <- str_replace_all(RioIcacos_attrmeta$unit, pattern = "foot", replacement = "meter")
# If we could NOT use udunits to do the conversion we could do it manually this way
#RioIcacos_data <- RioIcacos_data %>% mutate( `Gage_Ht` = `Gage_Ht`* 0.3048)
# Do the unit conversion for RioIcacos and RioMameyesPuenteRoto - NH4 to NH4-N
# Check whether udunits2 can convert the measurements we want to convert
ud.are.convertible("NH4", "NH4-N")
# FALSE => udunits2 cannot convert these units for us, so we will convert manually using the coefficient below

# Ammonium to ammoniacal-nitrogen conversion: NH4-N is the mass of nitrogen in the
# ammonium ion, i.e. the ratio of molar masses N / NH4 ~= 14.007 / 18.039
coeff_conv_NH4_to_NH4N <- 0.7764676534

# Unit conversion for RioIcacos and RioMameyesPuenteRoto - NH4 to NH4-N
RioIcacos_data <- RioIcacos_data %>% mutate(`NH4-N` = `NH4-N` * coeff_conv_NH4_to_NH4N)
# Update the main object
luq_datasets$RioIcacos$data <- RioIcacos_data
## RioMameyesPuenteRoto

# Simplify naming
RioMameyesPuenteRoto_data <- luq_datasets$RioMameyesPuenteRoto$data
RioMameyesPuenteRoto_attrmeta <- luq_datasets$RioMameyesPuenteRoto$attribute_metadata

# Replace all cells containing the missing value code ("-9999") with NA
RioMameyesPuenteRoto_data <- na_if(RioMameyesPuenteRoto_data, "-9999")

# Do the unit conversion - gage height - the udunits2 way
RioMameyesPuenteRoto_data$Gage_Ht <- ud.convert(RioMameyesPuenteRoto_data$Gage_Ht, "foot", "meter")
RioMameyesPuenteRoto_attrmeta$unit <- str_replace_all(RioMameyesPuenteRoto_attrmeta$unit, pattern = "foot", replacement = "meter")
# Do the unit conversion for RioMameyesPuenteRoto - NH4 to NH4-N (recall that we cannot use udunits2 for this, so we do it manually)
# In this dataset the NH4-N column is actually empty, so this is not necessary; but here is how you would do it if you had to
RioMameyesPuenteRoto_data <- RioMameyesPuenteRoto_data %>% mutate(`NH4-N` = `NH4-N` * coeff_conv_NH4_to_NH4N)
# Update the main object
luq_datasets$RioMameyesPuenteRoto$data <- RioMameyesPuenteRoto_data
# Bind the sampling sites' data into one master dataset for LUQ
all_sites_luq <- luq_datasets %>%
  map("data") %>%
  bind_rows(.id = "prov")

# Replace -9999 with NAs
all_sites_luq <- na_if(all_sites_luq, "-9999")
# Write as csv
write_csv(all_sites_luq, "stream_chem_all_LUQ.csv")
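As a final sanity check (a sketch; `prov` is the provenance column added by `bind_rows(.id = "prov")` above), count the number of samples contributed by each site:
# Number of samples per sampling site in the harmonized dataset
all_sites_luq %>% count(prov)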