library(EDIutils)
The repository search service is a standard deployment of Apache Solr and indexes select metadata fields of data packages. Some of the possible motivations for using the search API include:
- Building a custom search interface that offers a feature not found in the EDI data portal (e.g. a query interface whose results are always restricted to a particular research project).
- Building a local data catalog, where the query is constructed in a program and a table of matching documents is displayed for the local site, including the title, authors, keywords, and perhaps the abstract.
- Mining data, where EML metadata is the data to be mined, processed, or analyzed in some way.
- Increased efficiency, since the search API is often faster than the data portal, where results are rendered as HTML and paged 10 documents at a time.
For a list of searchable fields see search_data_packages(). For more on constructing Solr queries see the Apache Solr Wiki. For a browser-based search experience use the EDI data portal.
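Unlike the portal's 10-per-page display, the search API lets you control paging directly. A minimal sketch, assuming the endpoint accepts Solr's standard rows and start paging parameters (the keyword here is just an example):
# Retrieve results 21-40 of a keyword search, 20 documents per page
res <- search_data_packages(query = 'q=keyword:disturbance&fl=id&rows=20&start=20')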
Results can be filtered to include only fields of interest.
# Match all documents with the keyword "disturbance" and return only their IDs
res <- search_data_packages(query = 'q=keyword:disturbance&fl=id')
When constructing a query, note that the 15403 data packages of the ecotrends project and the 10492 data packages of the LTER Landsat project can be excluded from the returned results by including &fq=-scope:(ecotrends+lter-landsat) in the query string.
# Match all documents with the keyword "disturbance", excluding the ecotrends
# and lter-landsat scopes from the returned results
query <- 'q=keyword:disturbance&fl=packageid&fq=-scope:(ecotrends+lter-landsat)'
res <- search_data_packages(query)
Use wildcard operators to match anything and control the number of returned results.
# Match anything, display all fields, limit to only one document
res <- search_data_packages(query = 'q=*&fl=*&rows=1')
Use scope, keyword, and author fields to get all data packages belonging to a research site, organization, or author.
# Find all FCE LTER data packages, displaying the package ID, title, and DOI
query <- 'q=scope:knb-lter-fce&fl=packageid,title,doi&rows=100'
res <- search_data_packages(query)
res
# Query on author
query <- 'q=author:duane+costa&fq=author:costa&fl=id,title,author,score'
res <- search_data_packages(query)
Queries can be complex.
# Query on subject "Primary Production" OR subject "plant". Note that 'subject'
# aggregates the searchable text of the 'author', 'organization', 'title',
# 'keyword', and 'abstract' fields into a single searchable field.
query <- paste0('q=subject:("Primary+Production")+OR+subject:plant&fq=',
                '-scope:ecotrends&fq=-scope:lter-landsat*&fl=id,packageid,',
                'title,author,organization,pubdate,coordinates')
res <- search_data_packages(query)
An alternative method for searching and retrieving metadata is the "list and read" method: list the data package identifiers of interest, then read the corresponding metadata one package at a time, parsing and extracting whatever content you want to search or process further. Although this sounds like more work than using the search API, and generally it is, there are use cases where it is the preferred approach. For example, the list and read method provides access to previous versions of a data package, whereas the search API only returns the most recent version, and it provides access to all EML content, not just indexed fields. A minimal sketch follows.
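This sketch assumes the edi.1047 data package used later in this article; extracting the dataset title is just an illustration, and any other EML content could be parsed out instead.
# List all revisions of data package "edi.1047", then read each revision's
# EML and extract the dataset title
library(xml2)
revisions <- list_data_package_revisions(scope = "edi", identifier = 1047)
for (revision in revisions) {
  packageId <- paste("edi", 1047, revision, sep = ".")
  eml <- read_metadata(packageId)
  print(xml_text(xml_find_first(eml, ".//dataset/title")))
}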
The EDI data repository provides open access to archived data in the form of data packages. A package can be downloaded as a .zip archive, or its individual data entities can be downloaded as raw bytes. Entities in a common format can be parsed directly by most readers, while less common formats require metadata to guide parsing.
# A data package
packageId <- "edi.1047.1"
Downloading a data package archive (.zip) requires a data package ID.
# Request a zip archive
transaction <- create_data_package_archive(packageId)
transaction
#> [1] "archive_edi.1047.1_14896683904724129"
# Check status of the request (no response indicates success)
read_data_package_error(transaction)
# Download to path
read_data_package_archive(packageId, transaction, path = tempdir())
#> |=============================================================| 100%
dir(tempdir())
#> [1] "edi.1047.1.zip"
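If you need the archive's contents rather than the .zip itself, it can be extracted with base R's unzip(); this sketch simply mirrors the tempdir() paths used above.
# Extract the downloaded archive into the same temporary directory
unzip(file.path(tempdir(), paste0(packageId, ".zip")), exdir = tempdir())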
Downloading an individual data entity requires the entity ID.
# List data entities of the data package
res <- read_data_entity_names(packageId)
res
#> entityId entityName
#> 1 3abac5f99ecc1585879178a355176f6d Environmentals.csv
#> 2 f6bfa89b48ced8292840e53567cbf0c8 ByCatch.csv
#> 3 c75642ddccb4301327b4b1a86bdee906 Chinook.csv
#> 4 2c9ee86cc3f3ffc729c5f18bfe0a2a1d Steelhead.csv
#> 5 785690848dd20f4910637250cdc96819 TrapEfficiencyRelease.csv
#> 6 58b9000439a5671ea7fe13212e889ba5 TrapEfficiencySummary.csv
#> 7 86e61c1a501b7dcf0040d10e009bfd87 TrapOperations.csv
# Download Steelhead.csv as raw bytes, using entityName and entityId as keys
entityName <- "Steelhead.csv"
entityId <- res$entityId[res$entityName == entityName]
raw <- read_data_entity(packageId, entityId)
head(raw)
#> [1] ef bb bf 44 61 74
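The first three bytes (ef bb bf) are a UTF-8 byte order mark; the bytes that follow (44 61 74) are the start of the header row. readr detects and skips the byte order mark automatically.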
Common formats are easily parsed.
# These data have a common format and are simply parsed
data <- readr::read_csv(file = raw)
data
#> # A tibble: 2,926 x 14
#> Date trapVisitID subSiteName catchRawID releaseID commonName
#> <chr> <dbl> <chr> <dbl> <dbl> <chr>
#> 1 1/12/~ 326 North Chan~ 32123 0 Steelhead ~
#> 2 1/14/~ 336 North Chan~ 33980 0 Steelhead ~
#> 3 1/15/~ 337 North Chan~ 32683 0 Steelhead ~
#> 4 1/16/~ 339 North Chan~ 32971 0 Steelhead ~
#> 5 1/17/~ 341 North Chan~ 33104 0 Steelhead ~
#> 6 1/18/~ 342 North Chan~ 33304 0 Steelhead ~
#> 7 1/19/~ 343 North Chan~ 33432 0 Steelhead ~
#> 8 1/21/~ 349 North Chan~ 34083 0 Steelhead ~
#> 9 1/21/~ 349 North Chan~ 34084 0 Steelhead ~
#> 10 1/23/~ 351 North Chan~ 34384 0 Steelhead ~
#> # ... with 2,916 more rows, and 8 more variables:
#> # lifeStage <chr>, forkLength <dbl>, weight <dbl>, n <dbl>,
#> # mort <chr>, fishOrigin <chr>, markType <chr>,
#> # CatchRaw.comments <chr>
Less common formats require metadata for parsing. This metadata is listed under the "physical" node of a data entity's EML. See the emld library for more on working with EML as a list or JSON-LD, and the xml2 library for working with EML as XML.
# Read the same data entity, this time using the physical metadata to parse it
library(xml2)
eml <- read_metadata(packageId)
meta <- read_metadata_entity(packageId, entityId)
fieldDelimiter <- xml_text(xml_find_first(meta, ".//physical//fieldDelimiter"))
numHeaderLines <- xml_double(xml_find_first(meta, ".//physical//numHeaderLines"))
data <- readr::read_delim(
  file = raw,
  delim = fieldDelimiter,
  skip = numHeaderLines - 1)
data
#> # A tibble: 2,926 x 14
#> Date trapVisitID subSiteName catchRawID releaseID commonName
#> <chr> <dbl> <chr> <dbl> <dbl> <chr>
#> 1 1/12/~ 326 North Chan~ 32123 0 Steelhead ~
#> 2 1/14/~ 336 North Chan~ 33980 0 Steelhead ~
#> 3 1/15/~ 337 North Chan~ 32683 0 Steelhead ~
#> 4 1/16/~ 339 North Chan~ 32971 0 Steelhead ~
#> 5 1/17/~ 341 North Chan~ 33104 0 Steelhead ~
#> 6 1/18/~ 342 North Chan~ 33304 0 Steelhead ~
#> 7 1/19/~ 343 North Chan~ 33432 0 Steelhead ~
#> 8 1/21/~ 349 North Chan~ 34083 0 Steelhead ~
#> 9 1/21/~ 349 North Chan~ 34084 0 Steelhead ~
#> 10 1/23/~ 351 North Chan~ 34384 0 Steelhead ~
#> # ... with 2,916 more rows, and 8 more variables:
#> # lifeStage <chr>, forkLength <dbl>, weight <dbl>, n <dbl>,
#> # mort <chr>, fishOrigin <chr>, markType <chr>,
#> # CatchRaw.comments <chr>