| 1 |
#' View or Establish Custom Norming Contexts |
|
| 2 |
#' |
|
| 3 |
#' Custom norming contexts can be used to process later texts by specifying the |
|
| 4 |
#' \code{custom_context} API argument in the \code{receptiviti} function (e.g.,
|
|
| 5 |
#' \code{receptiviti("text to score", version = "v2",
|
|
| 6 |
#' custom_context = "norm_name")},
|
| 7 |
#' where \code{norm_name} is the name you set here).
|
|
| 8 |
#' |
|
| 9 |
#' @param name Name of a new norming context, to be established from the provided \code{text}.
|
|
| 10 |
#' Not providing a name will list the previously created contexts. |
|
| 11 |
#' @param text Text to be processed and used as the custom norming context. |
|
| 12 |
#' Not providing text will return the status of the named norming context. |
|
| 13 |
#' @param options Options to set for the norming context (e.g., |
|
| 14 |
#' \code{list(min_word_count = 350,} \code{max_punctuation = .25)}).
|
|
| 15 |
#' @param delete Logical; if \code{TRUE}, will request to remove the \code{name} context.
|
|
| 16 |
#' @param name_only Logical; if \code{TRUE}, will return a character vector of names
|
|
| 17 |
#' only, including those of built-in contexts.
|
| 18 |
#' @param id,text_column,id_column,files,dir,file_type,collapse_lines,encoding Additional |
|
| 19 |
#' arguments used to handle \code{text}; same as those in \code{\link{receptiviti}}.
|
|
| 20 |
#' @param bundle_size,bundle_byte_limit,retry_limit,clear_scratch_cache,use_future,in_memory |
|
| 21 |
#' Additional arguments used to manage the requests; same as those in |
|
| 22 |
#' \code{\link{receptiviti}}.
|
|
| 23 |
#' @param key,secret,url Request arguments; same as those in \code{\link{receptiviti}}.
|
|
| 24 |
#' @param verbose Logical; if \code{TRUE}, will show status messages.
|
|
| 25 |
#' @returns Nothing if \code{delete} is \code{TRUE}.
|
|
| 26 |
#' Otherwise, if \code{name_only} is \code{TRUE}, a character vector containing names of each
|
|
| 27 |
#' available norming context (built-in and custom); if \code{name} is not otherwise specified, the statuses of any existing custom contexts.
|
| 28 |
#' If \code{text} is not specified, the status of the
|
|
| 29 |
#' named context in a \code{list}. If \code{text}s are provided, a \code{list}:
|
|
| 30 |
#' \itemize{
|
|
| 31 |
#' \item \code{initial_status}: Initial status of the context.
|
|
| 32 |
#' \item \code{first_pass}: Response after texts are sent the first time, or
|
|
| 33 |
#' \code{NULL} if the initial status is \code{pass_two}.
|
|
| 34 |
#' \item \code{second_pass}: Response after texts are sent the second time.
|
|
| 35 |
#' } |
|
| 36 |
#' @examples |
|
| 37 |
#' \dontrun{
|
|
| 38 |
#' |
|
| 39 |
#' # list the names of all available norming contexts
|
| 40 |
#' contexts <- receptiviti_norming(name_only = TRUE) |
|
| 41 |
#' |
|
| 42 |
#' # create or get the status of a single custom norming context |
|
| 43 |
#' status <- receptiviti_norming("new_context")
|
|
| 44 |
#' |
|
| 45 |
#' # send texts to establish the context |
|
| 46 |
#' |
|
| 47 |
#' ## these texts can be specified just like |
|
| 48 |
#' ## texts in the main receptiviti function |
|
| 49 |
#' |
|
| 50 |
#' ## such as directly |
|
| 51 |
#' full_status <- receptiviti_norming("new_context", c(
|
|
| 52 |
#' "a text to set the norm", |
|
| 53 |
#' "another text part of the new context" |
|
| 54 |
#' )) |
|
| 55 |
#' |
|
| 56 |
#' ## or from a file |
|
| 57 |
#' full_status <- receptiviti_norming( |
|
| 58 |
#' "new_context", "./path/to/text.csv", |
|
| 59 |
#' text_column = "texts" |
|
| 60 |
#' ) |
|
| 61 |
#' |
|
| 62 |
#' ## or from multiple files in a directory |
|
| 63 |
#' full_status <- receptiviti_norming( |
|
| 64 |
#' "new_context", |
|
| 65 |
#' dir = "./path/to/txt_files" |
|
| 66 |
#' ) |
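#'
#' # once the context has passed both norming passes, texts can be
#' # scored against it (using the "new_context" name from above)
#' scores <- receptiviti(
#'   "a new text to score",
#'   version = "v2", custom_context = "new_context"
#' )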
|
| 67 |
#' } |
|
| 68 |
#' @export |
|
| 69 | ||
| 70 |
receptiviti_norming <- function( |
|
| 71 |
name = NULL, |
|
| 72 |
text = NULL, |
|
| 73 |
options = list(), |
|
| 74 |
delete = FALSE, |
|
| 75 |
name_only = FALSE, |
|
| 76 |
id = NULL, |
|
| 77 |
text_column = NULL, |
|
| 78 |
id_column = NULL, |
|
| 79 |
files = NULL, |
|
| 80 |
dir = NULL, |
|
| 81 |
file_type = "txt", |
|
| 82 |
collapse_lines = FALSE, |
|
| 83 |
encoding = NULL, |
|
| 84 |
bundle_size = 1000, |
|
| 85 |
bundle_byte_limit = 75e5, |
|
| 86 |
retry_limit = 50, |
|
| 87 |
clear_scratch_cache = TRUE, |
|
| 88 |
use_future = FALSE, |
|
| 89 |
in_memory = TRUE, |
|
| 90 |
url = Sys.getenv("RECEPTIVITI_URL"),
|
|
| 91 |
key = Sys.getenv("RECEPTIVITI_KEY"),
|
|
| 92 |
secret = Sys.getenv("RECEPTIVITI_SECRET"),
|
|
| 93 |
verbose = TRUE |
|
| 94 |
) {
|
|
| 95 | 12x |
params <- handle_request_params(url, key, secret) |
| 96 | 12x |
if (name_only) {
|
| 97 | 5x |
req <- curl::curl_fetch_memory( |
| 98 | 5x |
paste0(params$url, "/v2/norming"), |
| 99 | 5x |
params$handler |
| 100 |
) |
|
| 101 | 5x |
if (req$status_code != 200) {
|
| 102 | ! |
stop( |
| 103 | ! |
"failed to make norming list request: ", |
| 104 | ! |
req$status_code, |
| 105 | ! |
call. = FALSE |
| 106 |
) |
|
| 107 |
} |
|
| 108 | 5x |
norms <- jsonlite::fromJSON(rawToChar(req$content)) |
| 109 | 5x |
if (verbose) {
|
| 110 | 1x |
if (length(norms)) {
|
| 111 | 1x |
message( |
| 112 | 1x |
"available norming context(s): ", |
| 113 | 1x |
paste(sub("custom/", "", norms, fixed = TRUE), collapse = ", ")
|
| 114 |
) |
|
| 115 |
} else {
|
|
| 116 | ! |
message("no custom norming contexts found")
|
| 117 |
} |
|
| 118 |
} |
|
| 119 | 5x |
return(norms) |
| 120 |
} |
|
| 121 | ||
| 122 | 7x |
baseurl <- paste0(params$url, "/v2/norming/custom/") |
| 123 | 7x |
if (!is.null(name) && grepl("[^a-z0-9_.-]", name)) {
|
| 124 | 1x |
stop( |
| 125 | 1x |
"`name` can only include lowercase letters, numbers, hyphens, underscores, or periods", |
| 126 | 1x |
call. = FALSE |
| 127 |
) |
|
| 128 |
} |
|
| 129 | ||
| 130 |
# list current contexts |
|
| 131 | 6x |
req <- curl::curl_fetch_memory(baseurl, params$handler) |
| 132 | 6x |
if (req$status_code != 200) {
|
| 133 | ! |
stop( |
| 134 | ! |
"failed to make norming list request: ", |
| 135 | ! |
req$status_code, |
| 136 | ! |
call. = FALSE |
| 137 |
) |
|
| 138 |
} |
|
| 139 | 6x |
norms <- jsonlite::fromJSON(rawToChar(req$content)) |
| 140 | 6x |
if (length(norms)) {
|
| 141 | 6x |
if (verbose && is.null(name)) {
|
| 142 | 1x |
message( |
| 143 | 1x |
"custom norming context(s) found: ", |
| 144 | 1x |
paste(sub("custom/", "", norms$name, fixed = TRUE), collapse = ", ")
|
| 145 |
) |
|
| 146 |
} |
|
| 147 |
} else {
|
|
| 148 | ! |
if (verbose && is.null(name)) message("no custom norming contexts found")
|
| 149 | ! |
norms <- NULL |
| 150 |
} |
|
| 151 | 6x |
if (is.null(name)) {
|
| 152 | 1x |
return(norms) |
| 153 |
} |
|
| 154 | ||
| 155 | 5x |
context_id <- paste0("custom/", name)
|
| 156 | 5x |
if (context_id %in% norms$name) {
|
| 157 | 4x |
if (delete) {
|
| 158 | 1x |
curl::handle_setopt(params$handler, customrequest = "DELETE") |
| 159 | 1x |
req <- curl::curl_fetch_memory(paste0(baseurl, name), params$handler) |
| 160 | 1x |
if (req$status_code != 200) {
|
| 161 | ! |
message <- list(error = rawToChar(req$content)) |
| 162 | ! |
if (substr(message$error, 1, 1) == "{")
|
| 163 | ! |
message$error <- jsonlite::fromJSON(message$error) |
| 164 | ! |
stop( |
| 165 | ! |
"failed to delete custom norming context: ", |
| 166 | ! |
message$error, |
| 167 | ! |
call. = FALSE |
| 168 |
) |
|
| 169 |
} |
|
| 170 | 1x |
return(invisible(NULL)) |
| 171 |
} |
|
| 172 | 3x |
status <- as.list(norms[norms$name == context_id, ]) |
| 173 | 3x |
if (length(options)) {
|
| 174 | ! |
warning( |
| 175 | ! |
"context ", |
| 176 | ! |
name, |
| 177 | ! |
" already exists, so options do not apply", |
| 178 | ! |
call. = FALSE |
| 179 |
) |
|
| 180 |
} |
|
| 181 | 1x |
} else if (!delete) {
|
| 182 |
# establish a new context if needed |
|
| 183 | 1x |
if (verbose) message("requesting creation of custom context ", name)
|
| 184 | 1x |
curl::handle_setopt( |
| 185 | 1x |
params$handler, |
| 186 | 1x |
copypostfields = jsonlite::toJSON( |
| 187 | 1x |
c(name = name, options), |
| 188 | 1x |
auto_unbox = TRUE |
| 189 |
) |
|
| 190 |
) |
|
| 191 | 1x |
req <- curl::curl_fetch_memory(baseurl, params$handler) |
| 192 | 1x |
if (req$status_code != 200) {
|
| 193 | ! |
message <- list(error = rawToChar(req$content)) |
| 194 | ! |
if (substr(message$error, 1, 1) == "{")
|
| 195 | ! |
message$error <- jsonlite::fromJSON(message$error) |
| 196 | ! |
stop( |
| 197 | ! |
"failed to make norming creation request: ", |
| 198 | ! |
message$error, |
| 199 | ! |
call. = FALSE |
| 200 |
) |
|
| 201 |
} |
|
| 202 | 1x |
status <- jsonlite::fromJSON(rawToChar(req$content)) |
| 203 | 1x |
for (option in names(options)) {
|
| 204 | 1x |
if (!is.null(status[[option]]) && status[[option]] != options[[option]]) {
|
| 205 | ! |
warning( |
| 206 | ! |
"set option ", |
| 207 | ! |
option, |
| 208 | ! |
" does not match the requested value", |
| 209 | ! |
call. = FALSE |
| 210 |
) |
|
| 211 |
} |
|
| 212 |
} |
|
| 213 |
} |
|
| 214 | 4x |
if (delete) {
|
| 215 | ! |
message("context ", name, " does not exist")
|
| 216 | ! |
return(invisible(NULL)) |
| 217 |
} |
|
| 218 | 4x |
if (verbose) {
|
| 219 | 4x |
message( |
| 220 | 4x |
"status of ", |
| 221 | 4x |
name, |
| 222 |
": ", |
|
| 223 | 4x |
jsonlite::toJSON(status, pretty = TRUE, auto_unbox = TRUE) |
| 224 |
) |
|
| 225 |
} |
|
| 226 | 4x |
if (is.null(text)) {
|
| 227 | 3x |
return(status) |
| 228 |
} |
|
| 229 | 1x |
if (status$status != "created") {
|
| 230 | ! |
warning("status is not `created`, so cannot send text", call. = FALSE)
|
| 231 | ! |
return(invisible(list( |
| 232 | ! |
initial_status = status, |
| 233 | ! |
first_pass = NULL, |
| 234 | ! |
second_pass = NULL |
| 235 |
))) |
|
| 236 |
} |
|
| 237 | 1x |
if (verbose) message("sending first-pass samples for ", name)
|
| 238 | 1x |
first_pass <- manage_request( |
| 239 | 1x |
text, |
| 240 | 1x |
id = id, |
| 241 | 1x |
text_column = text_column, |
| 242 | 1x |
id_column = id_column, |
| 243 | 1x |
files = files, |
| 244 | 1x |
dir = dir, |
| 245 | 1x |
file_type = file_type, |
| 246 | 1x |
collapse_lines = collapse_lines, |
| 247 | 1x |
encoding = encoding, |
| 248 | 1x |
bundle_size = bundle_size, |
| 249 | 1x |
bundle_byte_limit = bundle_byte_limit, |
| 250 | 1x |
retry_limit = retry_limit, |
| 251 | 1x |
clear_scratch_cache = clear_scratch_cache, |
| 252 | 1x |
cores = 1, |
| 253 | 1x |
use_future = use_future, |
| 254 | 1x |
in_memory = in_memory, |
| 255 | 1x |
url = paste0(baseurl, name, "/one"), |
| 256 | 1x |
key = key, |
| 257 | 1x |
secret = secret, |
| 258 | 1x |
verbose = verbose, |
| 259 | 1x |
to_norming = TRUE |
| 260 | 1x |
)$final_res |
| 261 | 1x |
second_pass <- NULL |
| 262 |
if ( |
|
| 263 | 1x |
!is.null(first_pass$analyzed_samples) && |
| 264 | 1x |
all(first_pass$analyzed_samples == 0) |
| 265 |
) {
|
|
| 266 | ! |
warning( |
| 267 | ! |
"no texts were successfully analyzed in the first pass, so second pass was skipped", |
| 268 | ! |
call. = FALSE |
| 269 |
) |
|
| 270 |
} else {
|
|
| 271 | 1x |
if (verbose) message("sending second-pass samples for ", name)
|
| 272 | 1x |
second_pass <- manage_request( |
| 273 | 1x |
text, |
| 274 | 1x |
id = id, |
| 275 | 1x |
text_column = text_column, |
| 276 | 1x |
id_column = id_column, |
| 277 | 1x |
files = files, |
| 278 | 1x |
dir = dir, |
| 279 | 1x |
file_type = file_type, |
| 280 | 1x |
collapse_lines = collapse_lines, |
| 281 | 1x |
encoding = encoding, |
| 282 | 1x |
bundle_size = bundle_size, |
| 283 | 1x |
bundle_byte_limit = bundle_byte_limit, |
| 284 | 1x |
retry_limit = retry_limit, |
| 285 | 1x |
clear_scratch_cache = clear_scratch_cache, |
| 286 | 1x |
cores = 1, |
| 287 | 1x |
use_future = use_future, |
| 288 | 1x |
in_memory = in_memory, |
| 289 | 1x |
url = paste0(baseurl, name, "/two"), |
| 290 | 1x |
key = key, |
| 291 | 1x |
secret = secret, |
| 292 | 1x |
verbose = verbose, |
| 293 | 1x |
to_norming = TRUE |
| 294 | 1x |
)$final_res |
| 295 |
} |
|
| 296 |
if ( |
|
| 297 | 1x |
!is.null(second_pass$analyzed_samples) && |
| 298 | 1x |
all(second_pass$analyzed_samples == 0) |
| 299 |
) {
|
|
| 300 | ! |
warning( |
| 301 | ! |
"no texts were successfully analyzed in the second pass", |
| 302 | ! |
call. = FALSE |
| 303 |
) |
|
| 304 |
} |
|
| 305 | 1x |
invisible(list( |
| 306 | 1x |
initial_status = status, |
| 307 | 1x |
first_pass = first_pass, |
| 308 | 1x |
second_pass = second_pass |
| 309 |
)) |
|
| 310 |
} |
| 1 |
manage_request <- function( |
|
| 2 |
text = NULL, |
|
| 3 |
id = NULL, |
|
| 4 |
text_column = NULL, |
|
| 5 |
id_column = NULL, |
|
| 6 |
files = NULL, |
|
| 7 |
dir = NULL, |
|
| 8 |
file_type = "txt", |
|
| 9 |
encoding = NULL, |
|
| 10 |
context = "written", |
|
| 11 |
api_args = getOption("receptiviti.api_args", list()),
|
|
| 12 |
bundle_size = 1000, |
|
| 13 |
bundle_byte_limit = 75e5, |
|
| 14 |
collapse_lines = FALSE, |
|
| 15 |
retry_limit = 50, |
|
| 16 |
clear_scratch_cache = TRUE, |
|
| 17 |
request_cache = TRUE, |
|
| 18 |
cores = detectCores() - 1, |
|
| 19 |
collect_results = TRUE, |
|
| 20 |
use_future = FALSE, |
|
| 21 |
in_memory = TRUE, |
|
| 22 |
verbose = FALSE, |
|
| 23 |
make_request = TRUE, |
|
| 24 |
text_as_paths = FALSE, |
|
| 25 |
cache = Sys.getenv("RECEPTIVITI_CACHE"),
|
|
| 26 |
cache_overwrite = FALSE, |
|
| 27 |
cache_format = Sys.getenv("RECEPTIVITI_CACHE_FORMAT", "parquet"),
|
|
| 28 |
key = Sys.getenv("RECEPTIVITI_KEY"),
|
|
| 29 |
secret = Sys.getenv("RECEPTIVITI_SECRET"),
|
|
| 30 |
url = Sys.getenv("RECEPTIVITI_URL"),
|
|
| 31 |
version = Sys.getenv("RECEPTIVITI_VERSION"),
|
|
| 32 |
endpoint = Sys.getenv("RECEPTIVITI_ENDPOINT"),
|
|
| 33 |
to_norming = FALSE |
|
| 34 |
) {
|
|
| 35 |
# check input |
|
| 36 | 70x |
if (use_future && !requireNamespace("future.apply", quietly = TRUE)) {
|
| 37 | ! |
stop("install the `future.apply` package to use future", call. = FALSE)
|
| 38 |
} |
|
| 39 | 70x |
st <- proc.time()[[3]] |
| 40 | 70x |
text_as_dir <- FALSE |
| 41 | 70x |
if (is.null(text)) {
|
| 42 | 3x |
if (!is.null(dir)) {
|
| 43 | ! |
if (!dir.exists(dir)) stop("entered dir does not exist", call. = FALSE)
|
| 44 | 1x |
text <- dir |
| 45 | 1x |
text_as_dir <- TRUE |
| 46 | 2x |
} else if (!is.null(files)) {
|
| 47 | 1x |
text <- files |
| 48 | 1x |
text_as_paths <- TRUE |
| 49 |
} else {
|
|
| 50 | 1x |
stop( |
| 51 | 1x |
"enter text as the first argument, or use files or dir", |
| 52 | 1x |
call. = FALSE |
| 53 |
) |
|
| 54 |
} |
|
| 55 |
} |
|
| 56 | 69x |
if (text_as_paths) {
|
| 57 | 3x |
if (anyNA(text)) |
| 58 | 1x |
stop( |
| 59 | 1x |
"NAs are not allowed in text when being treated as file paths", |
| 60 | 1x |
call. = FALSE |
| 61 |
) |
|
| 62 | 2x |
if (!all(file.exists(text))) |
| 63 | 1x |
stop("not all of the files in text exist", call. = FALSE)
|
| 64 |
} |
|
| 65 | 67x |
read_in <- FALSE |
| 66 | 67x |
handle_encoding <- function(file) {
|
| 67 | 162x |
if (is.null(encoding)) {
|
| 68 | 160x |
con <- gzfile(file, "rb") |
| 69 | 160x |
on.exit(close(con)) |
| 70 | 160x |
unlist(stringi::stri_enc_detect(readBin(con, "raw", file.size(file)))[[ |
| 71 | 160x |
1 |
| 72 | 160x |
]])[[1]] |
| 73 |
} else {
|
|
| 74 | 2x |
encoding |
| 75 |
} |
|
| 76 |
} |
|
| 77 |
if ( |
|
| 78 | 67x |
text_as_dir || |
| 79 | 67x |
text_as_paths || |
| 80 | 67x |
(is.character(text) && !anyNA(text) && all(nchar(text) < 500)) |
| 81 |
) {
|
|
| 82 | 50x |
if (text_as_dir || length(text) == 1 && dir.exists(text)) {
|
| 83 | 4x |
if (verbose) |
| 84 | ! |
message( |
| 85 | ! |
"reading in texts from directory: ", |
| 86 | ! |
text, |
| 87 |
" (",
|
|
| 88 | ! |
round(proc.time()[[3]] - st, 4), |
| 89 |
")" |
|
| 90 |
) |
|
| 91 | 4x |
text_as_paths <- TRUE |
| 92 | 4x |
text <- normalizePath( |
| 93 | 4x |
list.files(text, file_type, full.names = TRUE), |
| 94 |
"/", |
|
| 95 | 4x |
FALSE |
| 96 |
) |
|
| 97 |
} |
|
| 98 | 50x |
if (text_as_paths || all(file.exists(text))) {
|
| 99 | 14x |
text_as_paths <- collapse_lines |
| 100 | 14x |
if (!collapse_lines) {
|
| 101 | 12x |
if (verbose) |
| 102 | ! |
message( |
| 103 | ! |
"reading in texts from file list (",
|
| 104 | ! |
round(proc.time()[[3]] - st, 4), |
| 105 |
")" |
|
| 106 |
) |
|
| 107 | 12x |
if (is.null(id_column)) |
| 108 | 11x |
names(text) <- if (length(id) != length(text)) text else id |
| 109 | 12x |
if (all(grepl("\\.csv", text, TRUE))) {
|
| 110 | 6x |
if (is.null(text_column)) |
| 111 | 2x |
stop( |
| 112 | 2x |
"text appears to point to csv files, but text_column was not specified", |
| 113 | 2x |
call. = FALSE |
| 114 |
) |
|
| 115 | 4x |
read_in <- TRUE |
| 116 | 4x |
text <- unlist(lapply(text, function(f) {
|
| 117 | 4x |
d <- tryCatch( |
| 118 |
{
|
|
| 119 | 4x |
enc <- handle_encoding(f) |
| 120 | 4x |
con <- gzfile(f, encoding = enc) |
| 121 | 4x |
arrow::read_csv_arrow( |
| 122 | 4x |
con, |
| 123 | 4x |
read_options = arrow::CsvReadOptions$create( |
| 124 | 4x |
encoding = enc |
| 125 |
), |
|
| 126 | 4x |
col_select = c(text_column, id_column) |
| 127 |
) |
|
| 128 |
}, |
|
| 129 | 4x |
error = function(e) NULL |
| 130 |
) |
|
| 131 | 1x |
if (is.null(d)) stop("failed to read in file ", f, call. = FALSE)
|
| 132 | 3x |
if (!is.null(id_column) && id_column %in% colnames(d)) {
|
| 133 | 1x |
structure( |
| 134 | 1x |
d[, text_column, drop = TRUE], |
| 135 | 1x |
names = d[, id_column, drop = TRUE] |
| 136 |
) |
|
| 137 |
} else {
|
|
| 138 | 2x |
d[, text_column, drop = TRUE] |
| 139 |
} |
|
| 140 |
})) |
|
| 141 |
} else {
|
|
| 142 | 6x |
text <- unlist(lapply(text, function(f) {
|
| 143 | 106x |
tryCatch( |
| 144 |
{
|
|
| 145 | 106x |
con <- gzfile(f, encoding = handle_encoding(f)) |
| 146 | 106x |
on.exit(close(con)) |
| 147 | 106x |
d <- readLines(con, warn = FALSE, skipNul = TRUE) |
| 148 | 106x |
d[d != ""] |
| 149 |
}, |
|
| 150 | 106x |
error = function(e) |
| 151 | 106x |
stop("failed to read in file ", f, call. = FALSE)
|
| 152 |
) |
|
| 153 |
})) |
|
| 154 |
} |
|
| 155 | 9x |
id <- names(text) |
| 156 |
} |
|
| 157 |
} else if ( |
|
| 158 | 36x |
length(text) == 1 && |
| 159 | 36x |
dirname(text) != "." && |
| 160 | 36x |
dir.exists(dirname(dirname(text))) |
| 161 |
) {
|
|
| 162 | ! |
stop("text appears to be a directory, but it does not exist")
|
| 163 |
} |
|
| 164 | 47x |
if (text_as_paths && is.null(id)) {
|
| 165 | 2x |
id <- text |
| 166 | 2x |
if (anyDuplicated(id)) |
| 167 | ! |
id <- names(unlist(lapply( |
| 168 | ! |
split(id, factor(id, unique(id))), |
| 169 | ! |
seq_along |
| 170 |
))) |
|
| 171 |
} |
|
| 172 |
} |
|
| 173 | 64x |
if (is.null(dim(text))) {
|
| 174 | 58x |
if (!read_in) {
|
| 175 | 55x |
if (!text_as_paths && !is.null(text_column)) |
| 176 | 1x |
stop("text_column is specified, but text has no columns", call. = FALSE)
|
| 177 | 54x |
if (!is.null(id_column)) |
| 178 | 1x |
stop("id_column is specified, but text has no columns", call. = FALSE)
|
| 179 |
} |
|
| 180 |
} else {
|
|
| 181 | 1x |
if (length(id) == 1 && id %in% colnames(text)) id_column <- id |
| 182 | 6x |
if (!is.null(id_column)) {
|
| 183 | 2x |
if (id_column %in% colnames(text)) {
|
| 184 | 1x |
id <- text[, id_column, drop = TRUE] |
| 185 |
} else {
|
|
| 186 | 1x |
stop("id_column not found in text", call. = FALSE)
|
| 187 |
} |
|
| 188 |
} |
|
| 189 | 5x |
if (!is.null(text_column)) {
|
| 190 | 3x |
if (text_column %in% colnames(text)) {
|
| 191 | 2x |
text <- text[, text_column, drop = TRUE] |
| 192 |
} else {
|
|
| 193 | 1x |
if (!text_as_paths) stop("text_column not found in text", call. = FALSE)
|
| 194 |
} |
|
| 195 |
} |
|
| 196 | 4x |
if (!is.null(dim(text))) {
|
| 197 | 2x |
if (ncol(text) == 1) {
|
| 198 | 1x |
text <- text[, 1, drop = TRUE] |
| 199 |
} else {
|
|
| 200 | 1x |
stop("text has dimensions, but no text_column column", call. = FALSE)
|
| 201 |
} |
|
| 202 |
} |
|
| 203 |
} |
|
| 204 | 2x |
if (!is.character(text)) text <- as.character(text) |
| 205 | 59x |
if (!length(text)) |
| 206 | 1x |
stop("no texts were found after resolving the text argument")
|
| 207 | 4x |
if (length(id) && !is.character(id)) id <- as.character(id) |
| 208 | 58x |
provided_id <- FALSE |
| 209 | 58x |
if (length(id)) {
|
| 210 | 15x |
if (length(id) != length(text)) |
| 211 | 1x |
stop("id is not the same length as text", call. = FALSE)
|
| 212 | 1x |
if (anyDuplicated(id)) stop("id contains duplicate values", call. = FALSE)
|
| 213 | 13x |
provided_id <- TRUE |
| 214 |
} else {
|
|
| 215 | 43x |
id <- paste0("t", seq_along(text))
|
| 216 |
} |
|
| 217 | 1x |
if (!is.numeric(retry_limit)) retry_limit <- 0 |
| 218 | 56x |
if (to_norming) {
|
| 219 | 2x |
version <- "v2" |
| 220 | 2x |
endpoint <- "norming" |
| 221 | 2x |
full_url <- url |
| 222 | 2x |
request_cache <- FALSE |
| 223 |
} else {
|
|
| 224 | 54x |
url_parts <- unlist(strsplit( |
| 225 | 54x |
regmatches( |
| 226 | 54x |
url, |
| 227 | 54x |
gregexpr("/[Vv]\\d+(?:/[^/]+)?", url)
|
| 228 | 54x |
)[[1]], |
| 229 |
"/", |
|
| 230 | 54x |
fixed = TRUE |
| 231 |
)) |
|
| 232 | 54x |
if (version == "") |
| 233 | 49x |
version <- if (length(url_parts) > 1) url_parts[[2]] else "v1" |
| 234 | 54x |
version <- tolower(version) |
| 235 | 54x |
if (version == "" || !grepl("^v\\d+$", version)) {
|
| 236 | 1x |
stop("invalid version: ", version, call. = FALSE)
|
| 237 |
} |
|
| 238 | 53x |
if (endpoint == "") {
|
| 239 | 52x |
endpoint <- if (length(url_parts) > 2) {
|
| 240 | ! |
url_parts[[3]] |
| 241 |
} else {
|
|
| 242 | 4x |
if (tolower(version) == "v1") "framework" else "analyze" |
| 243 |
} |
|
| 244 |
} |
|
| 245 | 53x |
endpoint <- sub("^.*/", "", tolower(endpoint))
|
| 246 | 53x |
if (endpoint == "" || grepl("[^a-z]", endpoint)) {
|
| 247 | 1x |
stop("invalid endpoint: ", endpoint, call. = FALSE)
|
| 248 |
} |
|
| 249 | 52x |
url <- paste0(sub("/+[Vv]\\d+(/.*)?$|/+$", "", url), "/", version, "/")
|
| 250 | 52x |
full_url <- paste0( |
| 251 | 52x |
url, |
| 252 | 52x |
endpoint, |
| 253 | 52x |
if (version == "v1") "/bulk" else paste0("/", context)
|
| 254 |
) |
|
| 255 | ! |
if (!is.list(api_args)) api_args <- as.list(api_args) |
| 256 |
if ( |
|
| 257 | 52x |
version != "v1" && |
| 258 | 52x |
"context" %in% api_args && |
| 259 | 52x |
"custom_context" %in% api_args |
| 260 |
) {
|
|
| 261 | ! |
stop( |
| 262 | ! |
"only one of `context` or `custom_context may be specified", |
| 263 | ! |
call. = FALSE |
| 264 |
) |
|
| 265 |
} |
|
| 266 | 52x |
if (version != "v1" && length(api_args)) {
|
| 267 | ! |
full_url <- paste0( |
| 268 | ! |
full_url, |
| 269 |
"?", |
|
| 270 | ! |
paste0(names(api_args), "=", unlist(api_args), collapse = "&") |
| 271 |
) |
|
| 272 |
} |
|
| 273 |
} |
|
| 274 | 54x |
args_hash <- digest::digest( |
| 275 | 54x |
jsonlite::toJSON( |
| 276 | 54x |
c( |
| 277 | 54x |
api_args, |
| 278 | 54x |
url = full_url, |
| 279 | 54x |
key = key, |
| 280 | 54x |
secret = secret |
| 281 |
), |
|
| 282 | 54x |
auto_unbox = TRUE |
| 283 |
), |
|
| 284 | 54x |
serialize = FALSE |
| 285 |
) |
|
| 286 | ||
| 287 |
# ping API |
|
| 288 | 54x |
if (make_request) {
|
| 289 | 4x |
if (verbose) message("pinging API (", round(proc.time()[[3]] - st, 4), ")")
|
| 290 | 51x |
ping <- receptiviti_status(url, key, secret, verbose = FALSE) |
| 291 | 1x |
if (is.null(ping)) stop("URL is unreachable", call. = FALSE)
|
| 292 | 1x |
if (ping$status_code != 200) stop(ping$status_message, call. = FALSE) |
| 293 |
} |
|
| 294 | ||
| 295 |
# prepare text |
|
| 296 | 4x |
if (verbose) message("preparing text (", round(proc.time()[[3]] - st, 4), ")")
|
| 297 | 50x |
data <- data.frame(text = text, id = id, stringsAsFactors = FALSE) |
| 298 | 50x |
text <- data[!is.na(data$text) & data$text != "" & !duplicated(data$text), ] |
| 299 | 1x |
if (!nrow(text)) stop("no valid texts to process", call. = FALSE)
|
| 300 | 1x |
if (!is.numeric(bundle_size)) bundle_size <- 1000 |
| 301 | 49x |
n_texts <- nrow(text) |
| 302 | 49x |
n <- ceiling(n_texts / min(1000, max(1, bundle_size))) |
| 303 | 49x |
bundles <- split(text, sort(rep_len(seq_len(n), nrow(text)))) |
| 304 | 49x |
size_fun <- if (text_as_paths) function(b) sum(file.size(b$text)) else |
| 305 | 49x |
object.size |
| 306 | 49x |
for (i in rev(seq_along(bundles))) {
|
| 307 | 160x |
size <- size_fun(bundles[[i]]) |
| 308 | 160x |
if (size > bundle_byte_limit) {
|
| 309 | 1x |
sizes <- vapply( |
| 310 | 1x |
seq_len(nrow(bundles[[i]])), |
| 311 | 1x |
function(r) as.numeric(size_fun(bundles[[i]][r, ])), |
| 312 | 1x |
0 |
| 313 |
) |
|
| 314 | 1x |
if (any(sizes > bundle_byte_limit)) {
|
| 315 | 1x |
stop( |
| 316 | 1x |
"one of your texts is over the individual size limit (",
|
| 317 | 1x |
bundle_byte_limit / 1e6,
| 318 | 1x |
" MB)", |
| 319 | 1x |
call. = FALSE |
| 320 |
) |
|
| 321 |
} |
|
| 322 | ! |
bins <- rep(1, length(sizes)) |
| 323 | ! |
bin_size <- 0 |
| 324 | ! |
bi <- 1 |
| 325 | ! |
for (ti in seq_along(bins)) {
|
| 326 | ! |
bin_size <- bin_size + sizes[ti] |
| 327 | ! |
if (bin_size > bundle_byte_limit) {
|
| 328 | ! |
bin_size <- sizes[ti] |
| 329 | ! |
bi <- bi + 1 |
| 330 |
} |
|
| 331 | ! |
bins[ti] <- bi |
| 332 |
} |
|
| 333 | ! |
bundles <- c( |
| 334 | ! |
bundles[-i], |
| 335 | ! |
unname(split(bundles[[i]], paste0(i, ".", bins))) |
| 336 |
) |
|
| 337 |
} |
|
| 338 |
} |
|
| 339 | 48x |
n_bundles <- length(bundles) |
| 340 | 48x |
bundle_ref <- if (n_bundles == 1) "bundle" else "bundles" |
| 341 | 48x |
if (verbose) |
| 342 | 4x |
message( |
| 343 | 4x |
"prepared text in ", |
| 344 | 4x |
n_bundles, |
| 345 |
" ", |
|
| 346 | 4x |
bundle_ref, |
| 347 |
" (",
|
|
| 348 | 4x |
round(proc.time()[[3]] - st, 4), |
| 349 |
")" |
|
| 350 |
) |
|
| 351 | ||
| 352 | 48x |
auth <- paste0(key, ":", secret) |
| 353 | 48x |
if (is.null(in_memory) && (use_future || cores > 1) && n_bundles > cores) |
| 354 | ! |
in_memory <- FALSE |
| 355 | 48x |
request_scratch <- NULL |
| 356 | 48x |
if (!in_memory) {
|
| 357 | 2x |
if (verbose) |
| 358 | ! |
message( |
| 359 | ! |
"writing ", |
| 360 | ! |
bundle_ref, |
| 361 | ! |
" to disc (",
|
| 362 | ! |
round(proc.time()[[3]] - st, 4), |
| 363 |
")" |
|
| 364 |
) |
|
| 365 | 2x |
request_scratch <- paste0(tempdir(), "/receptiviti_request_scratch/") |
| 366 | 2x |
dir.create(request_scratch, FALSE) |
| 367 | 2x |
if (clear_scratch_cache) on.exit(unlink(request_scratch, recursive = TRUE)) |
| 368 | 2x |
bundles <- vapply( |
| 369 | 2x |
bundles, |
| 370 | 2x |
function(b) {
|
| 371 | 6x |
scratch_bundle <- paste0(request_scratch, digest::digest(b), ".rds") |
| 372 | 6x |
if (!file.exists(scratch_bundle)) |
| 373 | 6x |
saveRDS(b, scratch_bundle, compress = FALSE) |
| 374 | 6x |
scratch_bundle |
| 375 |
}, |
|
| 376 |
"", |
|
| 377 | 2x |
USE.NAMES = FALSE |
| 378 |
) |
|
| 379 |
} |
|
| 380 | ||
| 381 | 48x |
doprocess <- function(bundles, cores, future) {
|
| 382 | 7x |
env <- parent.frame() |
| 383 | 7x |
if (future) {
|
| 384 | 1x |
eval( |
| 385 | 1x |
expression(future.apply::future_lapply(bundles, process)), |
| 386 | 1x |
envir = env |
| 387 |
) |
|
| 388 |
} else {
|
|
| 389 | 6x |
cl <- parallel::makeCluster(cores) |
| 390 | 6x |
parallel::clusterExport(cl, ls(envir = env), env) |
| 391 | 6x |
on.exit(parallel::stopCluster(cl)) |
| 392 | 6x |
(if (length(bundles) > cores * 2) parallel::parLapplyLB else |
| 393 | 6x |
parallel::parLapply)(cl, bundles, process) |
| 394 |
} |
|
| 395 |
} |
|
| 396 | ||
| 397 | 48x |
request <- function(body, body_hash, bin, ids, attempt = retry_limit) {
|
| 398 | 636x |
temp_file <- paste0(tempdir(), "/", body_hash, ".json") |
| 399 | 618x |
if (!request_cache) unlink(temp_file) |
| 400 | 636x |
res <- NULL |
| 401 | 636x |
if (!file.exists(temp_file)) {
|
| 402 | 630x |
if (make_request) {
|
| 403 | 629x |
handler <- tryCatch( |
| 404 | 629x |
curl::new_handle(httpauth = 1, userpwd = auth, copypostfields = body), |
| 405 | 629x |
error = function(e) e$message |
| 406 |
) |
|
| 407 | 629x |
if (is.character(handler)) {
|
| 408 | ! |
stop( |
| 409 | ! |
if (grepl("libcurl", handler, fixed = TRUE)) {
|
| 410 | ! |
"libcurl encountered an error; try setting the bundle_byte_limit argument to a smaller value" |
| 411 |
} else {
|
|
| 412 | ! |
paste("failed to create handler:", handler)
|
| 413 |
}, |
|
| 414 | ! |
call. = FALSE |
| 415 |
) |
|
| 416 |
} |
|
| 417 | 2x |
if (to_norming) curl::handle_setopt(handler, customrequest = "PATCH") |
| 418 | 629x |
res <- curl::curl_fetch_disk(full_url, temp_file, handler) |
| 419 |
} else {
|
|
| 420 | 1x |
stop( |
| 421 | 1x |
"make_request is FALSE, but there are texts with no cached results", |
| 422 | 1x |
call. = FALSE |
| 423 |
) |
|
| 424 |
} |
|
| 425 |
} |
|
| 426 | 635x |
result <- if (file.exists(temp_file)) {
|
| 427 | 635x |
if ( |
| 428 | 635x |
is.null(res$type) || grepl("application/json", res$type, fixed = TRUE)
|
| 429 |
) {
|
|
| 430 | 635x |
tryCatch( |
| 431 | 635x |
jsonlite::read_json(temp_file, simplifyVector = TRUE), |
| 432 | 635x |
error = function(e) list(message = "invalid response format") |
| 433 |
) |
|
| 434 |
} else {
|
|
| 435 | ! |
list(message = "invalid response format") |
| 436 |
} |
|
| 437 |
} else {
|
|
| 438 | ! |
list(message = rawToChar(res$content)) |
| 439 |
} |
|
| 440 | 635x |
valid_result <- if (to_norming) {
|
| 441 | 2x |
!is.null(result$submitted) |
| 442 |
} else {
|
|
| 443 | 633x |
!is.null(result$results) || is.null(result$message) |
| 444 |
} |
|
| 445 | 635x |
if (valid_result) {
|
| 446 | 81x |
if (!is.null(result$results)) result <- result$results |
| 447 | 83x |
if ("error" %in% names(result)) {
|
| 448 | ! |
if (!is.list(result$error)) {
|
| 449 | ! |
warning("bundle ", body_hash, " failed: ", result$error)
|
| 450 | ! |
} else if (!is.data.frame(result$error)) {
|
| 451 | ! |
warning( |
| 452 | ! |
"bundle ", |
| 453 | ! |
body_hash, |
| 454 | ! |
" failed: ", |
| 455 | ! |
if (!is.null(result$error$code)) |
| 456 | ! |
paste0("(", result$error$code, ") ") else NULL,
|
| 457 | ! |
result$error$message |
| 458 |
) |
|
| 459 |
} else {
|
|
| 460 | ! |
su <- !is.na(result$error$code) |
| 461 | ! |
errors <- if (is.data.frame(result)) {
|
| 462 | ! |
result[su & !duplicated(result$error$code), "error"] |
| 463 |
} else {
|
|
| 464 | ! |
result$error |
| 465 |
} |
|
| 466 | ! |
warning( |
| 467 | ! |
if (sum(su) > 1) "some texts were invalid: " else |
| 468 | ! |
"a text was invalid: ", |
| 469 | ! |
paste( |
| 470 | ! |
do.call( |
| 471 | ! |
paste0, |
| 472 | ! |
data.frame( |
| 473 |
"(",
|
|
| 474 | ! |
errors$code, |
| 475 |
") ", |
|
| 476 | ! |
errors$message, |
| 477 | ! |
stringsAsFactors = FALSE |
| 478 |
) |
|
| 479 |
), |
|
| 480 | ! |
collapse = "; " |
| 481 |
), |
|
| 482 | ! |
call. = FALSE |
| 483 |
) |
|
| 484 |
} |
|
| 485 |
} |
|
| 486 | 83x |
if (to_norming) {
|
| 487 | 2x |
cbind(body_hash = body_hash, as.data.frame(result)) |
| 488 |
} else {
|
|
| 489 | 81x |
unpack <- function(d) {
|
| 490 | 814x |
if (is.list(d)) as.data.frame(lapply(d, unpack), optional = TRUE) else |
| 491 | 16204x |
d |
| 492 |
} |
|
| 493 | 81x |
result <- unpack(result[ |
| 494 | 81x |
!names(result) %in% c("response_id", "language", "version", "error")
|
| 495 |
]) |
|
| 496 | 81x |
if (!is.null(result) && nrow(result)) {
|
| 497 | 81x |
if (colnames(result)[[1]] == "request_id") {
|
| 498 | 81x |
colnames(result)[[1]] <- "text_hash" |
| 499 |
} |
|
| 500 | 81x |
cbind(id = ids, bin = bin, result) |
| 501 |
} |
|
| 502 |
} |
|
| 503 |
} else {
|
|
| 504 | 552x |
unlink(temp_file) |
| 505 | 552x |
if (length(result$message) == 1 && substr(result$message, 1, 1) == "{") {
|
| 506 | ! |
result <- jsonlite::fromJSON(result$message) |
| 507 |
} |
|
| 508 | 48x |
if ( |
| 509 | 552x |
attempt > 0 &&
  ((length(result$code) == 1 && result$code == 1420) ||
    (length(result$message) == 1 &&
      result$message == "invalid response format"))
| 513 |
) {
|
|
| 514 | 526x |
wait_time <- as.numeric(regmatches( |
| 515 | 526x |
result$message, |
| 516 | 526x |
regexec("[0-9]+(?:\\.[0-9]+)?", result$message)
|
| 517 |
)) |
|
| 518 | 526x |
Sys.sleep(if (is.na(wait_time)) 1 else wait_time / 1e3) |
| 519 | 526x |
request(body, body_hash, bin, ids, attempt - 1) |
| 520 |
} else {
|
|
| 521 | 26x |
message <- if (is.null(res$status_code)) 200 else res$status_code |
| 522 | 26x |
if (length(result$code)) |
| 523 | 26x |
message <- paste0(message, " (", result$code, "): ", result$message)
|
| 524 | ! |
if (length(result$error)) message <- paste0(message, ": ", result$error) |
| 525 | 26x |
stop(message, call. = FALSE) |
| 526 |
} |
|
| 527 |
} |
|
| 528 |
} |
|
| 529 | ||
| 530 | 48x |
process <- function(bundle) {
|
| 531 | 148x |
opts <- getOption("stringsAsFactors")
|
| 532 | 148x |
options("stringsAsFactors" = FALSE)
|
| 533 | 148x |
on.exit(options("stringsAsFactors" = opts))
|
| 534 | 6x |
if (is.character(bundle)) bundle <- readRDS(bundle) |
| 535 | 148x |
text <- bundle$text |
| 536 | 148x |
bin <- NULL |
| 537 | 148x |
if (text_as_paths) {
|
| 538 | 4x |
if (all(grepl("\\.csv", text, TRUE))) {
|
| 539 | 3x |
if (is.null(text_column)) |
| 540 | ! |
stop( |
| 541 | ! |
"files appear to be csv, but no text_column was specified", |
| 542 | ! |
call. = FALSE |
| 543 |
) |
|
| 544 | 3x |
text <- vapply( |
| 545 | 3x |
text, |
| 546 | 3x |
function(f) {
|
| 547 | 51x |
tryCatch( |
| 548 | 51x |
paste( |
| 549 | 51x |
arrow::read_csv_arrow( |
| 550 | 51x |
f, |
| 551 | 51x |
read_options = arrow::CsvReadOptions$create( |
| 552 | 51x |
encoding = handle_encoding(f) |
| 553 |
), |
|
| 554 | 51x |
col_select = dplyr::all_of(text_column) |
| 555 | 51x |
)[[1]], |
| 556 | 51x |
collapse = " " |
| 557 |
), |
|
| 558 | 51x |
error = function(e) |
| 559 | 51x |
stop("failed to read in file ", f, call. = FALSE)
|
| 560 |
) |
|
| 561 |
}, |
|
| 562 |
"" |
|
| 563 |
) |
|
| 564 |
} else {
|
|
| 565 | 1x |
text <- vapply( |
| 566 | 1x |
text, |
| 567 | 1x |
function(f) {
|
| 568 | 1x |
tryCatch( |
| 569 |
{
|
|
| 570 | 1x |
con <- file(f, encoding = handle_encoding(f)) |
| 571 | 1x |
on.exit(close(con)) |
| 572 | 1x |
paste( |
| 573 | 1x |
readLines(con, warn = FALSE, skipNul = TRUE), |
| 574 | 1x |
collapse = " " |
| 575 |
) |
|
| 576 |
}, |
|
| 577 | 1x |
error = function(e) |
| 578 | 1x |
stop("failed to read in file ", f, call. = FALSE)
|
| 579 |
) |
|
| 580 |
}, |
|
| 581 |
"" |
|
| 582 |
) |
|
| 583 |
} |
|
| 584 |
} |
|
| 585 | 148x |
bundle$hashes <- paste0(vapply( |
| 586 | 148x |
paste0(args_hash, text), |
| 587 | 148x |
digest::digest, |
| 588 |
"", |
|
| 589 | 148x |
serialize = FALSE |
| 590 |
)) |
|
| 591 | 148x |
if (to_norming) {
|
| 592 | 2x |
body <- jsonlite::toJSON( |
| 593 | 2x |
lapply( |
| 594 | 2x |
seq_along(text), |
| 595 | 2x |
function(i) list(text = text[[i]], request_id = bundle$hashes[[i]]) |
| 596 |
), |
|
| 597 | 2x |
auto_unbox = TRUE |
| 598 |
) |
|
| 599 | 2x |
res <- request( |
| 600 | 2x |
body, |
| 601 | 2x |
digest::digest(body, serialize = FALSE), |
| 602 | 2x |
NULL, # the bin argument is not used for norming requests
| 603 | 2x |
bundle$id |
| 604 |
) |
|
| 605 | 2x |
prog(amount = nrow(bundle)) |
| 606 |
} else {
|
|
| 607 | 146x |
initial <- paste0("h", substr(bundle$hashes, 1, 1))
|
| 608 | 146x |
set <- !is.na(text) & |
| 609 | 146x |
text != "" & |
| 610 | 146x |
text != "logical(0)" & |
| 611 | 146x |
!duplicated(bundle$hashes) |
| 612 | 146x |
res_cached <- cached_cols <- res_fresh <- NULL |
| 613 | 146x |
nres <- ncached <- 0 |
| 614 | 146x |
check_cache <- !cache_overwrite && |
| 615 | 146x |
(cache != "" && length(list.dirs(cache))) |
| 616 | 146x |
if (check_cache) {
|
| 617 | 42x |
db <- arrow::open_dataset( |
| 618 | 42x |
cache, |
| 619 | 42x |
partitioning = arrow::schema(bin = arrow::string()), |
| 620 | 42x |
format = cache_format |
| 621 |
) |
|
| 622 | 42x |
cached_cols <- colnames(db) |
| 623 | 42x |
cached <- if (!is.null(db$schema$GetFieldByName("text_hash"))) {
|
| 624 | 40x |
text_hash <- NULL |
| 625 | 40x |
su <- dplyr::filter( |
| 626 | 40x |
db, |
| 627 | 40x |
bin %in% unique(initial), |
| 628 | 40x |
text_hash %in% bundle$hashes |
| 629 |
) |
|
| 630 | 40x |
tryCatch( |
| 631 | 40x |
dplyr::compute( |
| 632 | 40x |
if (collect_results) su else dplyr::select(su, text_hash) |
| 633 |
), |
|
| 634 | 40x |
error = function(e) matrix(integer(), 0) |
| 635 |
) |
|
| 636 |
} else {
|
|
| 637 | 2x |
matrix(integer(), 0) |
| 638 |
} |
|
| 639 | 42x |
ncached <- nrow(cached) |
| 640 | 42x |
if (ncached) {
|
| 641 | 38x |
cached <- as.data.frame(cached$to_data_frame()) |
| 642 | 38x |
if (anyDuplicated(cached$text_hash)) |
| 643 | ! |
cached <- cached[!duplicated(cached$text_hash), ] |
| 644 | 38x |
rownames(cached) <- cached$text_hash |
| 645 | 38x |
cached_set <- which(bundle$hashes %in% cached$text_hash) |
| 646 | 38x |
set[cached_set] <- FALSE |
| 647 | 38x |
if (collect_results) {
|
| 648 | 38x |
res_cached <- cbind( |
| 649 | 38x |
id = bundle$id[cached_set], |
| 650 | 38x |
cached[bundle$hashes[cached_set], ] |
| 651 |
) |
|
| 652 |
} |
|
| 653 |
} |
|
| 654 |
} |
|
| 655 | 146x |
valid_options <- names(api_args) |
| 656 | 146x |
if (any(set)) {
|
| 657 | 108x |
set <- which(set) |
| 658 | 108x |
make_bundle <- if (version == "v1") {
|
| 659 | 104x |
function(i) {
|
| 660 | 202x |
c( |
| 661 | 202x |
api_args, |
| 662 | 202x |
list(content = text[[i]], request_id = bundle$hashes[[i]]) |
| 663 |
) |
|
| 664 |
} |
|
| 665 |
} else {
|
|
| 666 | 4x |
function(i) {
|
| 667 | 4x |
list(text = text[[i]], request_id = bundle$hashes[[i]]) |
| 668 |
} |
|
| 669 |
} |
|
| 670 | 108x |
body <- jsonlite::toJSON( |
| 671 | 108x |
unname(lapply(set, make_bundle)), |
| 672 | 108x |
auto_unbox = TRUE |
| 673 |
) |
|
| 674 | 108x |
body_hash <- digest::digest(body, serialize = FALSE) |
| 675 | 108x |
res_fresh <- request(body, body_hash, initial[set], bundle$id[set]) |
| 676 | 81x |
valid_options <- valid_options[valid_options %in% colnames(res_fresh)] |
| 677 | 81x |
if (length(valid_options)) {
|
| 678 | 1x |
res_fresh <- res_fresh[, |
| 679 | 1x |
!colnames(res_fresh) %in% valid_options, |
| 680 | 1x |
drop = FALSE |
| 681 |
] |
|
| 682 |
} |
|
| 683 | 81x |
if (ncached && !all(cached_cols %in% colnames(res_fresh))) {
|
| 684 | ! |
res_cached <- NULL |
| 685 | ! |
ncached <- 0 |
| 686 | ! |
body <- jsonlite::toJSON( |
| 687 | ! |
lapply(cached_set, make_bundle), |
| 688 | ! |
auto_unbox = TRUE |
| 689 |
) |
|
| 690 | ! |
res_fresh <- rbind( |
| 691 | ! |
res_fresh, |
| 692 | ! |
request( |
| 693 | ! |
body, |
| 694 | ! |
digest::digest(body, serialize = FALSE), |
| 695 | ! |
initial[cached_set], |
| 696 | ! |
bundle$id[cached_set] |
| 697 |
) |
|
| 698 |
) |
|
| 699 |
} |
|
| 700 | 81x |
nres <- nrow(res_fresh) |
| 701 | 81x |
if (cache != "" && nres) {
|
| 702 | 4x |
writer <- if (cache_format == "parquet") arrow::write_parquet else |
| 703 | 4x |
arrow::write_feather |
| 704 | 4x |
cols <- vapply( |
| 705 | 4x |
res_fresh[, |
| 706 | 4x |
!(colnames(res_fresh) %in% c("id", "bin", names(api_args)))
|
| 707 |
], |
|
| 708 | 4x |
is.character, |
| 709 | 4x |
TRUE |
| 710 |
) |
|
| 711 | 4x |
schema <- list() |
| 712 | 4x |
for (v in names(cols)) {
|
| 713 | 796x |
schema[[v]] <- if (cols[[v]]) {
|
| 714 | 4x |
arrow::string() |
| 715 | 796x |
} else if ( |
| 716 | 796x |
v %in% c("summary.word_count", "summary.sentence_count")
|
| 717 |
) {
|
|
| 718 | 8x |
if (anyNA(res_fresh[[v]])) |
| 719 | ! |
res_fresh[[v]][is.na(res_fresh[[v]])] <- NA_integer_ |
| 720 | 8x |
arrow::int32() |
| 721 |
} else {
|
|
| 722 | 784x |
if (anyNA(res_fresh[[v]])) |
| 723 | 4x |
res_fresh[[v]][is.na(res_fresh[[v]])] <- NA_real_ |
| 724 | 784x |
arrow::float64() |
| 725 |
} |
|
| 726 |
} |
|
| 727 | 4x |
schema <- arrow::schema(schema) |
| 728 | 4x |
for (part_bin in unique(res_fresh$bin)) {
|
| 729 | 18x |
part <- res_fresh[res_fresh$bin == part_bin, ] |
| 730 | 18x |
part$id <- NULL |
| 731 | 18x |
part$bin <- NULL |
| 732 | 18x |
bin_dir <- paste0(cache, "/bin=", part_bin, "/") |
| 733 | 18x |
dir.create(bin_dir, FALSE, TRUE) |
| 734 | 18x |
writer( |
| 735 | 18x |
arrow::as_arrow_table(part, schema = schema), |
| 736 | 18x |
paste0(bin_dir, "fragment-", body_hash, "-0.", cache_format) |
| 737 |
) |
|
| 738 |
} |
|
| 739 |
} |
|
| 740 |
} |
|
| 741 | 119x |
if (collect_results) {
|
| 742 | 119x |
res <- rbind(res_cached, res_fresh) |
| 743 | 119x |
if (length(valid_options)) |
| 744 | 1x |
for (n in valid_options) res[[n]] <- api_args[[n]] |
| 745 | 119x |
missing_ids <- !bundle$id %in% res$id |
| 746 | 119x |
if (any(missing_ids)) {
|
| 747 | 1x |
varnames <- colnames(res)[colnames(res) != "id"] |
| 748 | 1x |
res <- rbind( |
| 749 | 1x |
res, |
| 750 | 1x |
cbind( |
| 751 | 1x |
id = bundle$id[missing_ids], |
| 752 | 1x |
as.data.frame(matrix( |
| 753 | 1x |
NA, |
| 754 | 1x |
sum(missing_ids), |
| 755 | 1x |
length(varnames), |
| 756 | 1x |
dimnames = list(NULL, varnames) |
| 757 |
)) |
|
| 758 |
) |
|
| 759 |
) |
|
| 760 | 1x |
res$text_hash <- structure(bundle$hashes, names = bundle$id)[res$id] |
| 761 |
} |
|
| 762 |
} |
|
| 763 | 119x |
prog(amount = nres + ncached) |
| 764 |
} |
|
| 765 | ! |
if (collect_results) res else NULL |
| 766 |
} |
|
| 767 | ||
| 768 |
# make request(s) |
|
| 769 | 48x |
cores <- if (is.numeric(cores)) max(1, min(n_bundles, cores)) else 1 |
| 770 | 48x |
prog <- progressor(n_texts) |
| 771 | 48x |
results <- if (use_future || cores > 1) {
|
| 772 | 7x |
call_env <- new.env(parent = globalenv()) |
| 773 | 7x |
environment(doprocess) <- call_env |
| 774 | 7x |
environment(request) <- call_env |
| 775 | 7x |
environment(process) <- call_env |
| 776 | 7x |
for (name in c( |
| 777 | 7x |
"doprocess", |
| 778 | 7x |
"request", |
| 779 | 7x |
"process", |
| 780 | 7x |
"text_column", |
| 781 | 7x |
"prog", |
| 782 | 7x |
"make_request", |
| 783 | 7x |
"full_url", |
| 784 | 7x |
"cache", |
| 785 | 7x |
"cache_overwrite", |
| 786 | 7x |
"use_future", |
| 787 | 7x |
"cores", |
| 788 | 7x |
"bundles", |
| 789 | 7x |
"cache_format", |
| 790 | 7x |
"request_cache", |
| 791 | 7x |
"auth", |
| 792 | 7x |
"version", |
| 793 | 7x |
"to_norming", |
| 794 | 7x |
"text_as_paths", |
| 795 | 7x |
"retry_limit", |
| 796 | 7x |
"api_args", |
| 797 | 7x |
"args_hash", |
| 798 | 7x |
"encoding", |
| 799 | 7x |
"handle_encoding", |
| 800 | 7x |
"collect_results" |
| 801 |
)) {
|
|
| 802 | 168x |
call_env[[name]] <- get(name) |
| 803 |
} |
|
| 804 | 7x |
if (verbose) {
|
| 805 | ! |
message( |
| 806 | ! |
"processing ", |
| 807 | ! |
bundle_ref, |
| 808 | ! |
" using ", |
| 809 | ! |
if (use_future) "future backend" else paste(cores, "cores"), |
| 810 |
" (",
|
|
| 811 | ! |
round(proc.time()[[3]] - st, 4), |
| 812 |
")" |
|
| 813 |
) |
|
| 814 |
} |
|
| 815 | 7x |
eval(expression(doprocess(bundles, cores, use_future)), envir = call_env) |
| 816 |
} else {
|
|
| 817 | 41x |
if (verbose) |
| 818 | 4x |
message( |
| 819 | 4x |
"processing ", |
| 820 | 4x |
bundle_ref, |
| 821 | 4x |
" sequentially (",
|
| 822 | 4x |
round(proc.time()[[3]] - st, 4), |
| 823 |
")" |
|
| 824 |
) |
|
| 825 | 41x |
lapply(bundles, process) |
| 826 |
} |
|
| 827 | 46x |
if (verbose) |
| 828 | 4x |
message("done retrieving (", round(proc.time()[[3]] - st, 4), ")")
|
| 829 | 46x |
if (collect_results) {
|
| 830 | 46x |
final_res <- do.call(rbind, results) |
| 831 | 46x |
list(data = data, final_res = final_res, provided_id = provided_id) |
| 832 |
} else {
|
|
| 833 | ! |
NULL |
| 834 |
} |
|
| 835 |
} |
| 1 |
.onLoad <- function(lib, pkg) {
|
|
| 2 | ! |
if (Sys.getenv("RECEPTIVITI_URL") == "")
|
| 3 | ! |
Sys.setenv(RECEPTIVITI_URL = "https://api.receptiviti.com/") |
| 4 |
} |
|
| 5 | ||
| 6 |
#' Receptiviti API |
|
| 7 |
#' |
|
| 8 |
#' The main function to access the \href{https://www.receptiviti.com}{Receptiviti} API.
|
|
| 9 |
#' |
|
| 10 |
#' @param text A character vector with text to be processed, path to a directory containing files, or a vector of file paths. |
|
| 11 |
#' If a single path to a directory, each file is collapsed to a single text. If a path to a file or files, |
|
| 12 |
#' each line or row is treated as a separate text, unless \code{collapse_lines} is \code{TRUE} (in which case,
|
|
| 13 |
#' files will be read in as part of bundles at processing time, as is always the case when a directory is provided).
|
| 14 |
#' Use \code{files} to more reliably enter files, or \code{dir} to more reliably specify a directory.
|
|
| 15 |
#' @param output Path to a \code{.csv} file to write results to. If this already exists, set \code{overwrite} to \code{TRUE}
|
|
| 16 |
#' to overwrite it. |
|
| 17 |
#' @param id Vector of unique IDs the same length as \code{text}, to be included in the results.
|
|
| 18 |
#' @param text_column,id_column Column name of text/id, if \code{text} is a matrix-like object, or a path to a csv file.
|
|
| 19 |
#' @param files A list of file paths, as alternate entry to \code{text}.
|
|
| 20 |
#' @param dir A directory to search for files in, as alternate entry to \code{text}.
|
|
| 21 |
#' @param file_type File extension to search for, if \code{text} is the path to a directory containing files to be read in.
|
|
| 22 |
#' @param encoding Encoding of file(s) to be read in. If not specified, this will be detected, which can fail, |
|
| 23 |
#' resulting in mis-encoded characters; for best (and fastest) results, specify encoding.
|
| 24 |
#' @param return_text Logical; if \code{TRUE}, \code{text} is included as the first column of the result.
|
|
| 25 |
#' @param context Name of the analysis context. |
|
| 26 |
#' @param custom_context Name of a custom context (as listed by \code{\link{receptiviti_norming}}),
|
|
| 27 |
#' or \code{TRUE} if \code{context} is the name of a custom context.
|
|
| 28 |
#' @param api_args A list of additional arguments to pass to the API (e.g., \code{list(sallee_mode = "sparse")}). Defaults to the
|
|
| 29 |
#' \code{receptiviti.api_args} option. Custom norming contexts can be established with the \code{\link{receptiviti_norming}}
|
|
| 30 |
#' function, then referred to here with the \code{custom_context} argument (only available in API V2).
|
|
| 31 |
#' @param frameworks A vector of frameworks to include results from. Texts are always scored with all available frameworks --
|
| 32 |
#' this just specifies what to return. Defaults to \code{all}, to return all scored frameworks. Can be set by the
|
|
| 33 |
#' \code{receptiviti.frameworks} option (e.g., \code{options(receptiviti.frameworks = c("liwc", "sallee"))}).
|
|
| 34 |
#' @param framework_prefix Logical; if \code{FALSE}, will remove the framework prefix from column names, which may result in duplicates.
|
|
| 35 |
#' If this is not specified, and 1 framework is selected, or \code{as_list} is \code{TRUE}, will default to remove prefixes.
|
|
| 36 |
#' @param as_list Logical; if \code{TRUE}, returns a list with frameworks in separate entries.
|
|
| 37 |
#' @param bundle_size Number of texts to include in each request; between 1 and 1,000. |
|
| 38 |
#' @param bundle_byte_limit Memory limit (in bytes) of each bundle, under \code{1e7} (10 MB, which is the API's limit).
|
|
| 39 |
#' May need to be lower than the API's limit, depending on the system's requesting library. |
|
| 40 |
#' @param collapse_lines Logical; if \code{TRUE}, and \code{text} contains paths to files, each file is treated as a single text.
|
|
| 41 |
#' @param retry_limit Maximum number of times each request can be retried after hitting a rate limit. |
|
| 42 |
#' @param overwrite Logical; if \code{TRUE}, will overwrite an existing \code{output} file.
|
|
| 43 |
#' @param compress Logical; if \code{TRUE}, will save as an \code{xz}-compressed file.
|
|
| 44 |
#' @param make_request Logical; if \code{FALSE}, a request is not made. This could be useful if you want to be sure to
|
|
| 45 |
#' load from one of the caches, but aren't sure that all results exist there; it will error out if it encounters |
|
| 46 |
#' texts it has no other source for. |
|
| 47 |
#' @param text_as_paths Logical; if \code{TRUE}, ensures \code{text} is treated as a vector of file paths. Otherwise, this will be
|
|
| 48 |
#' determined if there are no \code{NA}s in \code{text} and every entry is under 500 characters long.
|
|
| 49 |
#' @param cache Path to a directory in which to save unique results for reuse; defaults to |
|
| 50 |
#' \code{Sys.getenv(}\code{"RECEPTIVITI_CACHE")}. See the Cache section for details.
|
|
| 51 |
#' @param cache_overwrite Logical; if \code{TRUE}, will write results to the cache without reading from it. This could be used
|
|
| 52 |
#' if you want fresh results to be cached without clearing the cache. |
|
| 53 |
#' @param cache_format Format of the cache database; see \code{\link[arrow]{FileFormat}}.
|
|
| 54 |
#' Defaults to \code{Sys.getenv(}\code{"RECEPTIVITI_CACHE_FORMAT")}.
|
|
| 55 |
#' @param clear_cache Logical; if \code{TRUE}, will clear any existing files in the cache. Use \code{cache_overwrite} if
|
|
| 56 |
#' you want fresh results without clearing or disabling the cache. Use \code{cache = FALSE} to disable the cache.
|
|
| 57 |
#' @param request_cache Logical; if \code{FALSE}, will always make a fresh request, rather than using the response
|
|
| 58 |
#' from a previous identical request. |
|
| 59 |
#' @param cores Number of CPU cores to split bundles across, if there are multiple bundles. See the Parallelization section. |
|
| 60 |
#' @param collect_results Logical; if \code{FALSE}, will not retain bundle results in memory for return.
|
|
| 61 |
#' @param use_future Logical; if \code{TRUE}, uses a \code{future} back-end to process bundles, in which case,
|
|
| 62 |
#' parallelization can be controlled with the \code{\link[future]{plan}} function (e.g., \code{plan("multisession")}
|
|
| 63 |
#' to use multiple cores); this is required to see progress bars when using multiple cores. See the Parallelization section. |
|
| 64 |
#' @param in_memory Logical; if \code{FALSE}, will write bundles to temporary files, and only load them as they are being requested.
|
|
| 65 |
#' @param clear_scratch_cache Logical; if \code{FALSE}, will preserve the bundles written when \code{in_memory} is \code{FALSE}, after
|
|
| 66 |
#' the request has been made. |
|
| 67 |
#' @param verbose Logical; if \code{TRUE}, will show status messages.
|
|
| 68 |
#' @param key API Key; defaults to \code{Sys.getenv("RECEPTIVITI_KEY")}.
|
|
| 69 |
#' @param secret API Secret; defaults to \code{Sys.getenv("RECEPTIVITI_SECRET")}.
|
|
| 70 |
#' @param url API URL; defaults to \code{Sys.getenv("RECEPTIVITI_URL")}, which defaults to
|
|
| 71 |
#' \code{"https://api.receptiviti.com/"}.
|
|
| 72 |
#' @param version API version; defaults to \code{Sys.getenv("RECEPTIVITI_VERSION")}, which defaults to
|
|
| 73 |
#' \code{"v1"}.
|
|
| 74 |
#' @param endpoint API endpoint (path name after the version); defaults to \code{Sys.getenv("RECEPTIVITI_ENDPOINT")},
|
|
| 75 |
#' which defaults to \code{"framework"}.
|
|
| 76 |
#' @param include_headers Logical; if \code{TRUE}, \code{receptiviti_status}'s verbose message will include
|
|
| 77 |
#' the HTTP headers. |
|
| 78 |
#' |
|
| 79 |
#' @returns Nothing if \code{collect_results} is \code{FALSE}.
|
|
| 80 |
#' Otherwise, a \code{data.frame} with columns for \code{text} (if \code{return_text} is \code{TRUE}; the originally entered text),
|
|
| 81 |
#' \code{id} (if one was provided), \code{text_hash} (the MD5 hash of the text), a column each for relevant entries in \code{api_args},
|
|
| 82 |
#' and scores from each included framework (e.g., \code{summary.word_count} and \code{liwc.i}). If \code{as_list} is \code{TRUE},
|
|
| 83 |
#' returns a list with a named entry containing such a \code{data.frame} for each framework.
|
|
| 84 |
#' |
|
| 85 |
#' @section Request Process: |
|
| 86 |
#' This function (along with the internal \code{manage_request} function) handles texts and results in several steps:
|
|
| 87 |
#' \enumerate{
|
|
| 88 |
#' \item Prepare bundles (split \code{text} into <= \code{bundle_size} and <= \code{bundle_byte_limit} bundles).
|
|
| 89 |
#' \enumerate{
|
|
| 90 |
#' \item If \code{text} points to a directory or list of files, these will be read in later.
|
|
| 91 |
#' \item If \code{in_memory} is \code{FALSE}, bundles are written to a temporary location,
|
|
| 92 |
#' and read back in when the request is made. |
|
| 93 |
#' } |
|
| 94 |
#' \item Get scores for texts within each bundle. |
|
| 95 |
#' \enumerate{
|
|
| 96 |
#' \item If texts are paths, or \code{in_memory} is \code{FALSE}, will load texts.
|
|
| 97 |
#' \item If \code{cache} is set, will skip any texts with cached scores.
|
|
| 98 |
#' \item If \code{request_cache} is \code{TRUE}, will check for a cached request.
|
|
| 99 |
#' \item If any texts need scoring and \code{make_request} is \code{TRUE}, will send unscored texts to the API.
|
|
| 100 |
#' } |
|
| 101 |
#' \item If a request was made and \code{request_cache} is set, will cache the response.
|
|
| 102 |
#' \item If \code{cache} is set, will write bundle scores to the cache.
|
|
| 103 |
#' \item After requests are made, if \code{cache} is set, will defragment the cache
|
|
| 104 |
#' (combine bundle results within partitions). |
|
| 105 |
#' \item If \code{collect_results} is \code{TRUE}, will prepare results:
|
|
| 106 |
#' \enumerate{
|
|
| 107 |
#' \item Will realign results with \code{text} (and \code{id} if provided).
|
|
| 108 |
#' \item If \code{output} is specified, will write realigned results to it.
|
|
| 109 |
#' \item Will drop additional columns (such as \code{custom} and \code{id} if not provided).
|
|
| 110 |
#' \item If \code{frameworks} is specified, will use it to select columns of the results.
|
|
| 111 |
#' \item Returns results. |
|
| 112 |
#' } |
|
| 113 |
#' } |
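#'
#' For example, a single call that exercises most of these steps might look
#' like this (the paths here are hypothetical):
#' \preformatted{
#' results <- receptiviti(
#'   dir = "./path/to/txt_folder",
#'   output = "results.csv",
#'   cache = "~/receptiviti_cache"
#' )
#' }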
|
| 114 |
#' |
|
| 115 |
#' @section Cache: |
|
| 116 |
#' If the \code{cache} argument is specified, results for unique texts are saved in an
|
|
| 117 |
#' \href{https://arrow.apache.org}{Arrow} database in the cache location
|
|
| 118 |
#' (\code{Sys.getenv(}\code{"RECEPTIVITI_CACHE")}), and are retrieved with subsequent requests.
|
|
| 119 |
#' This ensures that the exact same texts are not re-sent to the API. |
|
| 120 |
#' This does, however, add some processing time and disc space usage. |
|
| 121 |
#' |
|
| 122 |
#' If \code{cache} is \code{TRUE}, a default directory (\code{receptiviti_cache}) will be looked for
|
|
| 123 |
#' in the system's temporary directory (which is usually the parent of \code{tempdir()}).
|
|
| 124 |
#' If this does not exist, you will be asked if it should be created. |
|
| 125 |
#' |
|
| 126 |
#' The primary cache is checked when each bundle is processed, and existing results are loaded at |
|
| 127 |
#' that time. When processing many bundles in parallel, and many results have been cached, |
|
| 128 |
#' this can cause the system to freeze and potentially crash. |
|
| 129 |
#' To avoid this, limit the number of cores, or disable parallel processing. |
|
| 130 |
#' |
|
| 131 |
#' The \code{cache_format} argument (or the \code{RECEPTIVITI_CACHE_FORMAT} environment variable) can be used to adjust the format of the cache.
|
|
| 132 |
#' |
|
| 133 |
#' You can use the cache independently with \code{arrow::open_dataset(Sys.getenv("RECEPTIVITI_CACHE"))}.
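#'
#' For instance, a minimal sketch of reading cached scores back out, mirroring
#' how the cache is opened internally (assuming the default \code{parquet} format):
#' \preformatted{
#' db <- arrow::open_dataset(
#'   Sys.getenv("RECEPTIVITI_CACHE"),
#'   partitioning = arrow::schema(bin = arrow::string()),
#'   format = "parquet"
#' )
#' scores <- as.data.frame(dplyr::collect(db))
#' }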
|
|
| 134 |
#' |
|
| 135 |
#' You can also set the \code{clear_cache} argument to \code{TRUE} to clear the cache before it is used again, which may be useful
|
|
| 136 |
#' if the cache has gotten big, or you know new results will be returned. Even if a cached result exists, it will be |
|
| 137 |
#' reprocessed if it does not have all of the variables of new results, but this depends on there being at least 1 uncached |
|
| 138 |
#' result. If, for instance, you add a framework to your account and want to reprocess a previously processed set of texts, |
|
| 139 |
#' you would need to first clear the cache. |
|
| 140 |
#' |
|
| 141 |
#' Either way, duplicated texts within the same call will only be sent once. |
|
| 142 |
#' |
|
| 143 |
#' The \code{request_cache} argument controls a more temporary cache of each bundle request. This is cleared when the
|
|
| 144 |
#' R session ends. You might want to set this to \code{FALSE} if a new framework becomes available on your account
|
|
| 145 |
#' and you want to process a set of texts you already processed in the current R session without restarting.
|
| 146 |
#' |
|
| 147 |
#' Another temporary cache is made when \code{in_memory} is \code{FALSE}, which is the default when processing
|
|
| 148 |
#' in parallel (when \code{cores} is over \code{1} or \code{use_future} is \code{TRUE}). This contains
|
|
| 149 |
#' a file for each unique bundle, which is read in as needed by the parallel workers. |
|
| 150 |
#' |
|
#' @section Parallelization:
#' \code{text}s are split into bundles based on the \code{bundle_size} argument. Each bundle
#' represents a single request to the API, which is why they are limited to 1,000 texts and a
#' total size of 10 MB. When there is more than one bundle and either \code{cores} is greater
#' than 1 or \code{use_future} is \code{TRUE} (and you've externally specified a
#' \code{\link[future]{plan}}), bundles are processed by multiple cores.
#'
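#' A sketch of explicit bundling and core settings (\code{many_texts} again stands
#' in for a character vector of texts):
#' \preformatted{
#' # send texts in bundles of 500, processed across 2 cores
#' results <- receptiviti(many_texts, bundle_size = 500, cores = 2)
#' }
#'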
#' If you have texts spread across multiple files, they can be most efficiently processed in parallel
#' if each file contains a single text (potentially collapsed from multiple lines). If files contain
#' multiple texts (i.e., \code{collapse_lines = FALSE}), then texts need to be read in before bundling
#' in order to ensure bundles are under the length limit.
#'
#' Whether processing in serial or parallel, progress bars can be specified externally with
#' \code{\link[progressr]{handlers}}; see examples.
#' @examples
#' \dontrun{
#'
#' # check that the API is available, and that your credentials work
#' receptiviti_status()
#'
#' # score a single text
#' single <- receptiviti("a text to score")
#'
#' # score multiple texts, and write results to a file
#' multi <- receptiviti(c("first text to score", "second text"), "filename.csv")
#'
#' # score many texts in separate files
#' ## defaults to look for .txt files
#' file_results <- receptiviti(dir = "./path/to/txt_folder")
#'
#' ## could be .csv
#' file_results <- receptiviti(
#'   dir = "./path/to/csv_folder",
#'   text_column = "text", file_type = "csv"
#' )
#'
#' # score many texts from a file, with a progress bar
#' ## set up cores and progress bar
#' ## (only necessary if you want the progress bar)
#' future::plan("multisession")
#' progressr::handlers(global = TRUE)
#' progressr::handlers("progress")
#'
#' ## make request
#' results <- receptiviti(
#'   "./path/to/largefile.csv",
#'   text_column = "text", use_future = TRUE
#' )
#' }
#' @importFrom curl new_handle curl_fetch_memory curl_fetch_disk handle_setopt
#' @importFrom jsonlite toJSON fromJSON read_json
#' @importFrom utils object.size
#' @importFrom digest digest
#' @importFrom parallel detectCores makeCluster clusterExport parLapplyLB parLapply stopCluster
#' @importFrom progressr progressor
#' @importFrom stringi stri_enc_detect
#' @export
receptiviti <- function(
  text = NULL,
  output = NULL,
  id = NULL,
  text_column = NULL,
  id_column = NULL,
  files = NULL,
  dir = NULL,
  file_type = "txt",
  encoding = NULL,
  return_text = FALSE,
  context = "written",
  custom_context = FALSE,
  api_args = getOption("receptiviti.api_args", list()),
  frameworks = getOption("receptiviti.frameworks", "all"),
  framework_prefix = TRUE,
  as_list = FALSE,
  bundle_size = 1000,
  bundle_byte_limit = 75e5,
  collapse_lines = FALSE,
  retry_limit = 50,
  clear_cache = FALSE,
  clear_scratch_cache = TRUE,
  request_cache = TRUE,
  cores = detectCores() - 1,
  collect_results = TRUE,
  use_future = FALSE,
  in_memory = TRUE,
  verbose = FALSE,
  overwrite = FALSE,
  compress = FALSE,
  make_request = TRUE,
  text_as_paths = FALSE,
  cache = Sys.getenv("RECEPTIVITI_CACHE"),
  cache_overwrite = FALSE,
  cache_format = Sys.getenv("RECEPTIVITI_CACHE_FORMAT", "parquet"),
  key = Sys.getenv("RECEPTIVITI_KEY"),
  secret = Sys.getenv("RECEPTIVITI_SECRET"),
  url = Sys.getenv("RECEPTIVITI_URL"),
  version = Sys.getenv("RECEPTIVITI_VERSION"),
  endpoint = Sys.getenv("RECEPTIVITI_ENDPOINT")
) {
  # check input
  if (!is.null(output)) {
    if (!file.exists(output) && file.exists(paste0(output, ".xz")))
      output <- paste0(output, ".xz")
    if (!overwrite && file.exists(output))
      stop(
        "output file already exists; use overwrite = TRUE to overwrite it",
        call. = FALSE
      )
  }
  if (isTRUE(cache)) {
    temp <- dirname(tempdir())
    if (basename(temp) == "working_dir") temp <- dirname(dirname(temp))
    cache <- paste0(temp, "/receptiviti_cache")
    if (!dir.exists(cache)) {
      # only use the default cache location if the user confirms it interactively
      establish <- interactive() &&
        !isFALSE(getOption("receptiviti.cache_prompt")) &&
        grepl(
          "^(?:[Yy1]|$)",
          readline("Do you want to establish a default cache? [Y/n] ")
        )
      if (!establish) {
        options(receptiviti.cache_prompt = FALSE)
        cache <- ""
      }
    }
  }
  if (!is.character(cache)) cache <- ""
  if (cache != "") {
    if (!requireNamespace("arrow", quietly = TRUE)) {
      stop("install the `arrow` package to enable the cache", call. = FALSE)
    }
    if (!(cache_format %in% c("parquet", "feather"))) {
      stop("cache format can only be `parquet` or `feather`", call. = FALSE)
    }
    if (clear_cache) unlink(cache, TRUE)
    dir.create(cache, FALSE, TRUE)
    cached_parts <- list.files(
      cache,
      cache_format,
      recursive = TRUE,
      full.names = TRUE
    )
  }
  st <- proc.time()[[3]]
  if (is.character(custom_context)) {
    context <- custom_context
    custom_context <- TRUE
  }
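  # a non-default context was requested, so confirm that it is on record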
  if (context != "written") {
    norming_status <- receptiviti_norming(
      name_only = TRUE,
      url = url,
      key = key,
      secret = secret,
      verbose = FALSE
    )
    if (verbose) {
      message(
        "retrieved custom norming context list (",
        round(proc.time()[[3]] - st, 4),
        ")"
      )
    }
    context_id <- if (custom_context) paste0("custom/", context) else context
    if (!length(norming_status) || !(context_id %in% norming_status)) {
      stop(
        "custom norming context ",
        context,
        " is not on record or is not complete",
        call. = FALSE
      )
    }
  }
  if (
    length(frameworks) &&
      !("all" %in% frameworks) &&
      grepl("2", version, fixed = TRUE)
  ) {
    api_args$frameworks <- paste0(
      frameworks[frameworks != "summary"],
      collapse = ","
    )
  }
  if (!is.null(api_args$frameworks)) {
    available_frameworks <- c(
      "summary",
      receptiviti_frameworks(url, key, secret)
    )
    if (verbose)
      message(
        "retrieved frameworks list (",
        round(proc.time()[[3]] - st, 4),
        ")"
      )
    arg_frameworks <- unlist(strsplit(api_args$frameworks, ",", fixed = TRUE))
    su <- !(arg_frameworks %in% available_frameworks)
    if (any(su)) {
      stop(
        "requested framework(s) are not available to your account: ",
        paste(arg_frameworks[su], collapse = ", "),
        call. = FALSE
      )
    }
  }
  res <- manage_request(
    text,
    id = id,
    text_column = text_column,
    id_column = id_column,
    files = files,
    dir = dir,
    file_type = file_type,
    encoding = encoding,
    context = if (custom_context) paste0("custom/", context) else context,
    api_args = api_args,
    bundle_size = bundle_size,
    bundle_byte_limit = bundle_byte_limit,
    collapse_lines = collapse_lines,
    retry_limit = retry_limit,
    clear_scratch_cache = clear_scratch_cache,
    request_cache = request_cache,
    cores = cores,
    use_future = use_future,
    in_memory = in_memory,
    verbose = verbose,
    make_request = make_request,
    text_as_paths = text_as_paths,
    cache = cache,
    cache_overwrite = cache_overwrite,
    cache_format = cache_format,
    key = key,
    secret = secret,
    url = url,
    version = version,
    endpoint = endpoint
  )
  data <- res$data
  final_res <- res$final_res

  # defragment cache
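  # (merge each bin's part files into one, dropping duplicated text hashes)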
  if (cache != "") {
    cache <- normalizePath(cache, "/", FALSE)
    exclude <- c("id", "bin", names(api_args))
    bin_dirs <- list.dirs(cache)
    if (length(bin_dirs) > 1) {
      if (verbose)
        message("defragmenting cache (", round(proc.time()[[3]] - st, 4), ")")
      write_time <- as.numeric(Sys.time())
      for (bin_dir in bin_dirs[-1]) {
        files <- list.files(bin_dir, cache_format, full.names = TRUE)
        if (length(files) > 1) {
          previous <- files[!(files %in% cached_parts)]
          if (collect_results && length(previous)) {
            existing_cols <- unique(c(
              "id",
              "bin",
              names(arrow::schema(arrow::open_dataset(
                previous[[1]],
                format = cache_format
              )))
            ))
            if (
              length(existing_cols) != ncol(final_res) ||
                !all(existing_cols %in% colnames(final_res))
            ) {
              if (verbose)
                message("  clearing existing cache since columns did not align")
              unlink(previous)
            }
          }
          bin_content <- dplyr::compute(arrow::open_dataset(
            bin_dir,
            format = cache_format
          ))
          su <- !duplicated(as.character(bin_content$text_hash))
          if (!all(su)) bin_content <- bin_content[su, ]
          writer <- if (cache_format == "parquet") arrow::write_parquet else
            arrow::write_feather
          all_rows <- nrow(bin_content)
          for (i in seq_len(ceiling(all_rows / 1e9))) {
            writer(
              bin_content[seq((i - 1) * 1e9 + 1, min(all_rows, i * 1e9)), ],
              paste0(bin_dir, "/part-", write_time, "-", i, ".", cache_format)
            )
          }
          unlink(files)
        }
      }
    }
  }

  if (!collect_results) {
    if (verbose) message("done (", round(proc.time()[[3]] - st, 4), ")")
    return(invisible(NULL))
  }

  # prepare final results
  if (verbose)
    message("preparing output (", round(proc.time()[[3]] - st, 4), ")")
  rownames(final_res) <- final_res$id
  rownames(data) <- data$id
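  # map text hashes onto the input texts, then align results with the input order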
  data$text_hash <- structure(
    final_res$text_hash,
    names = data[final_res$id, "text"]
  )[data$text]
  final_res <- cbind(
    data[,
      c(if (return_text) "text", if (res$provided_id) "id", "text_hash"),
      drop = FALSE
    ],
    final_res[
      structure(final_res$id, names = final_res$text_hash)[data$text_hash],
      !colnames(final_res) %in% c("id", "bin", "text_hash", "custom"),
      drop = FALSE
    ]
  )
  row.names(final_res) <- NULL
  if (!is.null(output)) {
    if (!grepl("\\.csv", output, TRUE)) output <- paste0(output, ".csv")
    if (compress && !grepl(".xz", output, fixed = TRUE))
      output <- paste0(output, ".xz")
    if (grepl(".xz", output, fixed = TRUE)) compress <- TRUE
    if (verbose)
      message(
        "writing results to file: ",
        output,
        " (",
        round(proc.time()[[3]] - st, 4),
        ")"
      )
    dir.create(dirname(output), FALSE, TRUE)
    if (overwrite) unlink(output)
    if (compress) output <- xzfile(output)
    arrow::write_csv_arrow(final_res, file = output)
  }

  if (is.character(frameworks) && frameworks[1] != "all") {
    if (verbose)
      message("selecting frameworks (", round(proc.time()[[3]] - st, 4), ")")
    vars <- colnames(final_res)
    sel <- grepl(
      paste0("^(?:", paste(tolower(frameworks), collapse = "|"), ")"),
      vars
    )
    if (any(sel)) {
      if (
        missing(framework_prefix) &&
          (length(frameworks) == 1 && frameworks != "all")
      )
        framework_prefix <- FALSE
      sel <- unique(c("text", "id", "text_hash", names(api_args), vars[sel]))
      sel <- sel[sel %in% vars]
      final_res <- final_res[, sel]
    } else {
      warning(
        "frameworks did not match any columns -- returning all",
        call. = FALSE
      )
    }
  }
  if (as_list) {
    if (missing(framework_prefix)) framework_prefix <- FALSE
    inall <- c("text", "id", "text_hash", names(api_args))
    cols <- colnames(final_res)
    inall <- inall[inall %in% cols]
    pre <- sub("\\..*$", "", cols)
    pre <- unique(pre[!pre %in% inall])
    final_res <- lapply(structure(pre, names = pre), function(f) {
      res <- final_res[, c(inall, grep(paste0("^", f), cols, value = TRUE))]
      if (!framework_prefix) colnames(res) <- sub("^.+\\.", "", colnames(res))
      res
    })
  } else if (!framework_prefix)
    colnames(final_res) <- sub("^.+\\.", "", colnames(final_res))
  if (verbose) message("done (", round(proc.time()[[3]] - st, 4), ")")
  invisible(final_res)
}
#' @rdname receptiviti
#' @export
receptiviti_status <- function(
  url = Sys.getenv("RECEPTIVITI_URL"),
  key = Sys.getenv("RECEPTIVITI_KEY"),
  secret = Sys.getenv("RECEPTIVITI_SECRET"),
  verbose = TRUE,
  include_headers = FALSE
) {
  params <- handle_request_params(url, key, secret)
  ping <- tryCatch(
    curl_fetch_memory(paste0(params$url, "/v1/ping"), params$handler),
    error = function(e) NULL
  )
  if (is.null(ping)) {
    if (verbose) message("Status: ERROR\nMessage: URL is unreachable")
    return(invisible(NULL))
  }
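  # the response body is JSON on success; otherwise keep the raw message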
  ping$content <- list(message = rawToChar(ping$content))
  if (substr(ping$content$message, 1, 1) == "{")
    ping$content <- fromJSON(ping$content$message)
  ok <- ping$status_code == 200 && !length(ping$content$code)
  ping$status_message <- if (ok) {
    ping$content$pong
  } else {
    paste0(
      if (length(ping$content$code))
        paste0(ping$status_code, " (", ping$content$code, "): "),
      if (
        nchar(ping$content$message) > 500 ||
          grepl("<", ping$content$message, fixed = TRUE)
      ) {
        ping$status_code
      } else {
        ping$content$message
      }
    )
  }
  if (verbose) {
    message(
      "Status: ",
      if (ok) "OK" else "ERROR",
      "\nMessage: ",
      ping$status_message
    )
    if (include_headers) {
      ping$headers <- strsplit(
        rawToChar(ping$headers),
        "[\r\n]+",
        perl = TRUE
      )[[1]]
      # pretty-print any JSON found within header lines
      json <- regexec("\\{.+\\}", ping$headers)
      for (i in seq_along(json)) {
        if (json[[i]] != -1) {
          regmatches(ping$headers[[i]], json[[i]]) <- paste(
            " ",
            strsplit(
              toJSON(
                fromJSON(regmatches(ping$headers[[i]], json[[i]])),
                auto_unbox = TRUE,
                pretty = TRUE
              ),
              "\n"
            )[[1]],
            collapse = "\n"
          )
        }
      }
      message(paste0("\n", paste(" ", ping$headers, collapse = "\n")))
    }
  }
  invisible(ping)
}

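# internal helper: validate credentials and normalize the API URL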
handle_request_params <- function(url, key, secret) {
  if (key == "") {
    stop(
      "specify your key, or set it to the RECEPTIVITI_KEY environment variable",
      call. = FALSE
    )
  }
  if (secret == "") {
    stop(
      "specify your secret, or set it to the RECEPTIVITI_SECRET environment variable",
      call. = FALSE
    )
  }
  # add a protocol if missing, and strip any version suffix and trailing slashes
  url <- paste0(
    if (!grepl("http", tolower(url), fixed = TRUE)) "https://",
    sub("/+[Vv]\\d+(?:/.*)?$|/+$", "", url)
  )
  if (!grepl("^https?://[^.]+[.:][^.]", url, TRUE)) {
    stop(
      "url does not appear to be valid: ",
      url,
      call. = FALSE
    )
  }
  list(
    url = url,
    handler = new_handle(httpauth = 1, userpwd = paste0(key, ":", secret))
  )
}
#' List Available Frameworks
#'
#' Retrieve the list of frameworks available to your account.
#' @param url,key,secret Request arguments; same as those in \code{\link{receptiviti}}.
#' @returns A character vector containing the names of frameworks available to your account.
#' @examples
#' \dontrun{
#'
#' # see which frameworks are available to your account
#' frameworks <- receptiviti_frameworks()
#' }
#' @export
receptiviti_frameworks <- function(
  url = Sys.getenv("RECEPTIVITI_URL"),
  key = Sys.getenv("RECEPTIVITI_KEY"),
  secret = Sys.getenv("RECEPTIVITI_SECRET")
) {
  params <- handle_request_params(url, key, secret)
  req <- curl::curl_fetch_memory(
    paste0(params$url, "/v2/frameworks"),
    params$handler
  )
  if (req$status_code == 200) {
    return(jsonlite::fromJSON(rawToChar(req$content)))
  }
content <- list(message = rawToChar(req$content)) |
| 28 | ! |
if (substr(content$message, 1, 1) == "{")
|
| 29 | ! |
content <- jsonlite::fromJSON(content$message) |
| 30 | ! |
stop("failed to retrieve frameworks list: ", content$message, call. = FALSE)
|
| 31 |
} |