Search and Access Data

library(EDIutils)

Access

The EDI data repository provides open access to archived data as packages. A package can be downloaded as a .zip archive, or its individual data entities can be downloaded as raw bytes or as files. Entities in a common format can be parsed directly by most readers, while more complex formats need metadata to guide parsing.

# The ID of the data package used in the examples that follow
packageId <- "edi.1047.1"

Downloading a data package archive (.zip) requires a data package ID.

# Request a zip archive; the returned transaction string is passed to the
# status check and download calls below
transaction <- create_data_package_archive(packageId)
transaction
#> [1] "archive_edi.1047.1_14896683904724129"

# Check status of the request (no response indicates success)
read_data_package_error(transaction)

# Download the archive to a directory (here, the session's temporary directory)
# and list that directory's contents
read_data_package_archive(packageId, transaction, path = tempdir())
#> |=============================================================| 100%
dir(tempdir())
#> [1] "edi.1047.1.zip"

Downloading an individual data entity requires the data package ID and the entity ID.

# List the IDs and names of the data entities in the data package
res <- read_data_entity_names(packageId)
res
#>                           entityId                entityName
#> 1 3abac5f99ecc1585879178a355176f6d        Environmentals.csv
#> 2 f6bfa89b48ced8292840e53567cbf0c8               ByCatch.csv
#> 3 c75642ddccb4301327b4b1a86bdee906               Chinook.csv
#> 4 2c9ee86cc3f3ffc729c5f18bfe0a2a1d             Steelhead.csv
#> 5 785690848dd20f4910637250cdc96819 TrapEfficiencyRelease.csv
#> 6 58b9000439a5671ea7fe13212e889ba5 TrapEfficiencySummary.csv
#> 7 86e61c1a501b7dcf0040d10e009bfd87        TrapOperations.csv

# Download Steelhead.csv as raw bytes. Look up its entityId in `res` by
# matching on entityName, then request the entity by package ID and entity ID.
entityName <- "Steelhead.csv"
entityId <- res$entityId[res$entityName == entityName]
raw <- read_data_entity(packageId, entityId)
head(raw)
#> [1] ef bb bf 44 61 74

Common formats are easily parsed.

# These data have a common format (CSV) and are simply parsed from the raw bytes
data <- readr::read_csv(file = raw)
data
#> # A tibble: 2,926 x 14
#>    Date   trapVisitID subSiteName catchRawID releaseID commonName 
#>    <chr>        <dbl> <chr>            <dbl>     <dbl> <chr>      
#>  1 1/12/~         326 North Chan~      32123         0 Steelhead ~
#>  2 1/14/~         336 North Chan~      33980         0 Steelhead ~
#>  3 1/15/~         337 North Chan~      32683         0 Steelhead ~
#>  4 1/16/~         339 North Chan~      32971         0 Steelhead ~
#>  5 1/17/~         341 North Chan~      33104         0 Steelhead ~
#>  6 1/18/~         342 North Chan~      33304         0 Steelhead ~
#>  7 1/19/~         343 North Chan~      33432         0 Steelhead ~
#>  8 1/21/~         349 North Chan~      34083         0 Steelhead ~
#>  9 1/21/~         349 North Chan~      34084         0 Steelhead ~
#> 10 1/23/~         351 North Chan~      34384         0 Steelhead ~
#> # ... with 2,916 more rows, and 8 more variables:
#> #   lifeStage <chr>, forkLength <dbl>, weight <dbl>, n <dbl>,
#> #   mort <chr>, fishOrigin <chr>, markType <chr>,
#> #   CatchRaw.comments <chr>

Less common formats require metadata for parsing. This metadata is listed under the “physical” node of a data entity's EML.

See the emld library for more on working with EML as a list or JSON-LD. See the xml2 library for working with EML as XML.

# Parse the same data entity again, this time driven by its physical metadata
library(xml2)
# NOTE(review): `eml` is not used in the lines shown here; confirm whether a
# later part of the document relies on it
eml <- read_metadata(packageId)
meta <- read_metadata_entity(packageId, entityId)

# Pull the delimiter and the header line count from the "physical" node
fieldDelimiter <- xml_text(
  xml_find_first(meta, ".//physical//fieldDelimiter")
)
numHeaderLines <- xml_double(
  xml_find_first(meta, ".//physical//numHeaderLines")
)

# Skip every header line except the last, which read_delim() then reads as the
# column names
data <- readr::read_delim(
  file = raw,
  delim = fieldDelimiter,
  skip = numHeaderLines - 1
)
data
#> # A tibble: 2,926 x 14
#>    Date   trapVisitID subSiteName catchRawID releaseID commonName 
#>    <chr>        <dbl> <chr>            <dbl>     <dbl> <chr>      
#>  1 1/12/~         326 North Chan~      32123         0 Steelhead ~
#>  2 1/14/~         336 North Chan~      33980         0 Steelhead ~
#>  3 1/15/~         337 North Chan~      32683         0 Steelhead ~
#>  4 1/16/~         339 North Chan~      32971         0 Steelhead ~
#>  5 1/17/~         341 North Chan~      33104         0 Steelhead ~
#>  6 1/18/~         342 North Chan~      33304         0 Steelhead ~
#>  7 1/19/~         343 North Chan~      33432         0 Steelhead ~
#>  8 1/21/~         349 North Chan~      34083         0 Steelhead ~
#>  9 1/21/~         349 North Chan~      34084         0 Steelhead ~
#> 10 1/23/~         351 North Chan~      34384         0 Steelhead ~
#> # ... with 2,916 more rows, and 8 more variables:
#> #   lifeStage <chr>, forkLength <dbl>, weight <dbl>, n <dbl>,
#> #   mort <chr>, fishOrigin <chr>, markType <chr>,
#> #   CatchRaw.comments <chr>