RAthena 2.6.0

Bug Fix:

INFO: (Data scanned: -43839744 Bytes)

Feature:

RAthena 2.5.1

Bug Fix:

RAthena 2.5.0

Feature:

RAthena 2.4.0

Feature:

Bug Fix:

RAthena 2.3.0

Feature:

import awswrangler as wr
import getpass

bucket = getpass.getpass()
path = f"s3://{bucket}/data/"

if "awswrangler_test" not in wr.catalog.databases().values:
    wr.catalog.create_database("awswrangler_test")
    
cols = ["id", "dt", "element", "value", "m_flag", "q_flag", "s_flag", "obs_time"]

df = wr.s3.read_csv(
    path="s3://noaa-ghcn-pds/csv/189",
    names=cols,
    parse_dates=["dt", "obs_time"])  # Read 10 files from the 1890 decade (~1GB)
    
wr.s3.to_parquet(
    df=df,
    path=path,
    dataset=True,
    mode="overwrite",
    database="awswrangler_test",
    table="noaa"
);

wr.catalog.table(database="awswrangler_test", table="noaa")
library(DBI)
con <- dbConnect(RAthena::athena())

# Query ran using CSV output
system.time({
  df = dbGetQuery(con, "SELECT * FROM awswrangler_test.noaa")
})
# Info: (Data scanned: 80.88 MB)
#    user  system elapsed
#  57.004   8.430 160.567 

RAthena::RAthena_options(cache_size = 1)

# Query ran using UNLOAD Parquet output
system.time({
  df = dbGetQuery(con, "SELECT * FROM awswrangler_test.noaa", unload = T)
})

# Info: (Data scanned: 80.88 MB)
#    user  system elapsed 
#  21.622   2.350  39.232 
# Query ran using cache

system.time({
  df = dbGetQuery(con, "SELECT * FROM awswrangler_test.noaa", unload = T)
})
# Info: (Data scanned: 80.88 MB)
#    user  system elapsed 
#  13.738   1.886  11.029 

RAthena 2.2.0

Bug Fix:

Feature:

library(data.table)
library(DBI)

x = 5

dt = data.table(
  var1 = sample(LETTERS, size = x, T),
  var2 = rep(list(list("var3"= 1:3, "var4" = list("var5"= letters[1:5]))), x)
)

con <- dbConnect(RAthena::athena())

#> Version: 2.2.0

sqlData(con, dt)

# Registered S3 method overwritten by 'jsonify':
#   method     from    
#   print.json jsonlite
# Info: Special characters "\t" has been converted to " " to help with Athena reading file format tsv
#    var1                                                   var2
# 1:    1 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}
# 2:    2 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}
# 3:    3 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}
# 4:    4 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}
# 5:    5 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}

#> Version: 2.1.0

sqlData(con, dt)

# Info: Special characters "\t" has been converted to " " to help with Athena reading file format tsv
#    var1                                        var2
# 1:    1 1:3|list(var5 = c("a", "b", "c", "d", "e"))
# 2:    2 1:3|list(var5 = c("a", "b", "c", "d", "e"))
# 3:    3 1:3|list(var5 = c("a", "b", "c", "d", "e"))
# 4:    4 1:3|list(var5 = c("a", "b", "c", "d", "e"))
# 5:    5 1:3|list(var5 = c("a", "b", "c", "d", "e"))

v-2.2.0 now converts lists into json lines format so that AWS Athena can parse with sql array/mapping/json functions. Small down side a s3 method conflict occurs when jsonify is called to convert lists into json lines. jsonify was choose in favor to jsonlite due to the performance improvements (noctua # 156).

RAthena 2.1.0

Bug Fix:

Feature:

RAthena 2.0.1

Bug Fix:

RAthena 2.0.0

API Change

By utilising environments for AthenaConnection and AthenaResult, all AthenaResult classes created from AthenaConnection will point to the same ptr and info environments for it’s connection. Previously ptr and info would make a copy. This means if it was modified it would not affect the child or parent class for example:

# Old Method
library(DBI)
con <- dbConnect(RAthena::athena(),
                 rstudio_conn_tab = F)

res <- dbExecute(con, "select 'helloworld'")

# modifying parent class to influence child
con@info$made_up <- "helloworld"

# nothing happened
res@connection@info$made_up
# > NULL

# modifying child class to influence parent
res@connection@info$made_up <- "oh no!"

# nothing happened
con@info$made_up
# > "helloworld"

# New Method
library(DBI)
con <- dbConnect(RAthena::athena(),
                 rstudio_conn_tab = F)

res <- dbExecute(con, "select 'helloworld'")

# modifying parent class to influence child
con@info$made_up <- "helloworld"

# picked up change
res@connection@info$made_up
# > "helloworld"

# modifying child class to influence parent
res@connection@info$made_up <- "oh no!"

# picked up change
con@info$made_up
# > "oh no!"

New Feature

library(DBI)
library(RAthena)

# default conversion methods
con <- dbConnect(RAthena::athena())

# change json conversion method
RAthena_options(json = "character")
RAthena:::athena_option_env$json
# [1] "character"

# change json conversion to custom method
RAthena_options(json = jsonify::from_json)

RAthena:::athena_option_env$json
# function (json, simplify = TRUE, fill_na = FALSE, buffer_size = 1024) 
# {
#   json_to_r(json, simplify, fill_na, buffer_size)
# }
# <bytecode: 0x7f823b9f6830>
#   <environment: namespace:jsonify>

# change bigint conversion without affecting custom json conversion methods
RAthena_options(bigint = "numeric")

RAthena:::athena_option_env$json
# function (json, simplify = TRUE, fill_na = FALSE, buffer_size = 1024) 
# {
#   json_to_r(json, simplify, fill_na, buffer_size)
# }
# <bytecode: 0x7f823b9f6830>
#   <environment: namespace:jsonify>
RAthena:::athena_option_env$bigint
# [1] "numeric"

# change binary conversion without affect, bigint or json methods
RAthena_options(binary = "character")

RAthena:::athena_option_env$json
# function (json, simplify = TRUE, fill_na = FALSE, buffer_size = 1024) 
# {
#   json_to_r(json, simplify, fill_na, buffer_size)
# }
# <bytecode: 0x7f823b9f6830>
#   <environment: namespace:jsonify>
RAthena:::athena_option_env$bigint
# [1] "numeric"
RAthena:::athena_option_env$binary
# [1] "character"

# no conversion for json objects
con2 <- dbConnect(RAthena::athena(), json = "character")
# use custom json parser
con <- dbConnect(RAthena::athena(), json = jsonify::from_json)

Bug Fix:

RAthena 1.12.0

New Feature

library(DBI)
library(RAthena)

con <- dbConnect(athena())

dbGetPartition(con, "test_df2", .format = T)

# Info: (Data scanned: 0 Bytes)
#    year month day
# 1: 2020    11  17

dbGetPartition(con, "test_df2")

# Info: (Data scanned: 0 Bytes)
#                    partition
# 1: year=2020/month=11/day=17
library(DBI)

con <- dbConnect(RAthena::athena(), bigint = "numeric")

When switching between the different file parsers the bigint to be represented according to the file parser i.e. data.table: “integer64” -> vroom: “I”.

Bug Fix:

Documentation:

RAthena 1.11.1

Note:

Error: write_parquet requires the arrow package, please install it first and try again

RAthena 1.11.0

New Feature

Bug Fix

RAthena 1.10.1

Bug Fix

RAthena 1.10.0

New Feature

# Stop AWS Athena when R has been interrupted:

con <- dbConnect(RAthena::athena())

# Let AWS Athena keep running when R has been interrupted:

con <- dbConnect(RAthena::athena(),
                 keyboard_interrupt = F)

RAthena 1.9.1

Minor Change

Documentation:

RAthena 1.9.0

New Feature

library(DBI)
con <- dbConnect(RAthena::athena())
res <- dbExecute(con, "select * from some_big_table limit 10000")
dbFetch(res, 5000)
stop("Boto3 is not detected please install boto3 using either: `pip install boto3 numpy` in terminal or `install_boto()`.",
     "\nIf this doesn't work please set the python you are using with `reticulate::use_python()` or `reticulate::use_condaenv()`",
     call. = FALSE)

Bug Fix

Documentation

RAthena 1.8.0

New Feature

library(DBI)

con = dbConnect(RAthena::athena())

# upload iris dataframe for removal test
dbWriteTable(con, "iris2", iris)

# Athena method
system.time(dbRemoveTable(con, "iris2", confirm = T))
# user  system elapsed 
# 0.131   0.037   2.404 

# upload iris dataframe for removal test
dbWriteTable(con, "iris2", iris)

# Glue method
system.time(dbRemoveTable(con, "iris2", confirm = T))
# user  system elapsed 
# 0.065   0.009   1.303 
library(DBI)

con = dbConnect(RAthena::athena())

dbWriteTable(con, "iris2", iris, file.type = "json")

dbGetQuery(con, "select * from iris2")

Bug Fix

Documentation

Unit tests:

RAthena 1.7.1

Bug Fix

# Before
dbplyr::translate_sql("2019-01-01", con = con)
# '2019-01-01'

# Now
dbplyr::translate_sql("2019-01-01", con = con)
# DATE '2019-01-01'
# R code:
paste("hi", "bye", sep = "-")

# SQL translation:
('hi'||'-'||'bye')
library(DBI)
library(dplyr)

con <- dbConnect(RAthena::athena())

tbl(con, "iris") %>%
  compute(name = "temp.iris")

New Feature

library(DBI)
library(dplyr)

con <- dbConnect(RAthena::athena())

# ident method:
t1 <- system.time(tbl(con, "iris"))

# sub query method:
t2 <- system.time(tbl(con, sql("select * from iris")))

# ident method
# user  system elapsed 
# 0.082   0.012   0.288 

# sub query method
# user  system elapsed 
# 0.993   0.138   3.660 

Unit test

RAthena 1.7.0

New Feature

library(RAthena)

RAthena_options("vroom")

Unit tests

Documentation

Updated R documentation to roxygen2 7.0.2

RAthena 1.6.0

Major Change

warning('Appended `file.type` is not compatible with the existing Athena DDL file type and has been converted to "', File.Type,'".', call. = FALSE)

Minor Change

New Feature

Bug fix

Unit Tests

RAthena 1.5.0

Major Change

Performance results

library(DBI)

X <- 1e8

df <- data.frame(w =runif(X),
                 x = 1:X,
                 y = sample(letters, X, replace = T), 
                 z = sample(c(TRUE, FALSE), X, replace = T))

con <- dbConnect(RAthena::athena())

# upload dataframe with different splits
dbWriteTable(con, "test_split1", df, compress = T, max.batch = nrow(df), overwrite = T) # no splits
dbWriteTable(con, "test_split2", df, compress = T, max.batch = 0.05 * nrow(df), overwrite = T) # 20 splits
dbWriteTable(con, "test_split3", df, compress = T, max.batch = 0.1 * nrow(df), overwrite = T) # 10 splits

AWS Athena performance results from AWS console (query executed: select count(*) from .... ):

library(DBI)

X <- 1e8

df <- data.frame(w =runif(X),
                 x = 1:X,
                 y = sample(letters, X, replace = T), 
                 z = sample(c(TRUE, FALSE), X, replace = T))

con <- dbConnect(RAthena::athena())

dbWriteTable(con, "test_split1", df, compress = T, overwrite = T) # default will now split compressed file into 20 equal size files.

Added information message to inform user about what files have been added to S3 location if user is overwriting an Athena table.

Minor Change

Bug Fix

Unit tests

RAthena 1.4.1

New Features:

Bug Fix

RAthena 1.4.0

Minor Change

Backend Change

library(DBI)

con <- dbConnect(RAthena::athena())

dbWrite(con, "iris", iris)

New Feature

Bug Fix

Unit Tests

RAthena 1.3.0

Major Changes

Minor Change

New Features:

Unit tests

RAthena 1.2.0

Minor Changes

New Features

Bug Fix

Unit tests

RAthena 1.1.0

New Features

Unit tests

RAthena 1.0.3

New Feature

Bug Fix

Unit tests

RAthena 1.0.2

CRAN Requirement Changes

Bug Fixes

Unit tests

RAthena 1.0.1

CRAN Requirement Changes

Minor Change

RAthena 1.0.0