#' @rdname receptiviti
#' @export

receptiviti_status <- function(url = Sys.getenv("RECEPTIVITI_URL"), key = Sys.getenv("RECEPTIVITI_KEY"),
                               secret = Sys.getenv("RECEPTIVITI_SECRET"), verbose = TRUE, include_headers = FALSE) {
  if (key == "") stop("specify your key, or set it to the RECEPTIVITI_KEY environment variable", call. = FALSE)
  if (secret == "") stop("specify your secret, or set it to the RECEPTIVITI_SECRET environment variable", call. = FALSE)
  handler <- new_handle(httpauth = 1, userpwd = paste0(key, ":", secret))
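  # build the ping URL, adding a scheme if missing and stripping any version or trailing slashes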
  url <- paste0(
    if (!grepl("http", tolower(url), fixed = TRUE)) "https://",
    sub("/[Vv]\\d(?:/.*)?$|/+$", "", url), "/v1/ping"
  )
  if (!grepl("^https?://[^.]+[.:][^.]", url, TRUE)) stop("url does not appear to be valid: ", url)
  ping <- curl_fetch_memory(url, handler)
  ping$content <- list(message = rawToChar(ping$content))
  if (substr(ping$content$message, 1, 1) == "{") ping$content <- fromJSON(ping$content$message)
  ok <- ping$status_code == 200 && !length(ping$content$code)
  ping$status_message <- if (ok) {
    ping$content$pong
  } else {
    paste0(
      if (length(ping$content$code)) paste0(ping$status_code, " (", ping$content$code, "): "),
      if (nchar(ping$content$message) > 500 || grepl("<", ping$content$message, fixed = TRUE)) {
        ping$status_code
      } else {
        ping$content$message
      }
    )
  }
  if (verbose) {
    message("Status: ", if (ok) "OK" else "ERROR", "\nMessage: ", ping$status_message)
    if (include_headers) {
      ping$headers <- strsplit(rawToChar(ping$headers), "[\r\n]+", perl = TRUE)[[1]]
      json <- regexec("\\{.+\\}", ping$headers)
      for (i in seq_along(json)) {
        if (json[[i]] != -1) {
          regmatches(ping$headers[[i]], json[[i]]) <- paste(" ", strsplit(toJSON(
            fromJSON(regmatches(ping$headers[[i]], json[[i]])),
            auto_unbox = TRUE, pretty = TRUE
          ), "\n")[[1]], collapse = "\n")
        }
      }
      message(paste0("\n", paste(" ", ping$headers, collapse = "\n")))
    }
  }
  invisible(ping)
}

.onLoad <- function(lib, pkg) {
  if (Sys.getenv("RECEPTIVITI_URL") == "") Sys.setenv(RECEPTIVITI_URL = "https://api.receptiviti.com/")
}

#' Receptiviti API
#'
#' The main function to access the \href{https://www.receptiviti.com}{Receptiviti} API.
#'
#' @param text A character vector with text to be processed, path to a directory containing files, or a vector of file paths.
#' If a single path to a directory, each file is collapsed to a single text. If a path to a file or files,
#' each line or row is treated as a separate text, unless \code{collapse_lines} is \code{TRUE}.
#' @param output Path to a \code{.csv} file to write results to. If this already exists, set \code{overwrite} to \code{TRUE}
#' to overwrite it.
#' @param id Vector of unique IDs the same length as \code{text}, to be included in the results.
#' @param text_column,id_column Column name of text/id, if \code{text} is a matrix-like object, or a path to a csv file.
#' @param file_type File extension to search for, if \code{text} is the path to a directory containing files to be read in.
#' @param return_text Logical; if \code{TRUE}, \code{text} is included as the first column of the result.
#' @param api_args A list of additional arguments to pass to the API (e.g., \code{list(sallee_mode = "sparse")}). Defaults to the
#' \code{receptiviti.api_args} option.
#' @param frameworks A vector of frameworks to include results from. Texts are always scored with all available frameworks --
#' this just specifies what to return. Defaults to \code{all}, to return all scored frameworks. Can be set by the
#' \code{receptiviti.frameworks} option (e.g., \code{options(receptiviti.frameworks = c("liwc", "sallee"))}).
#' @param framework_prefix Logical; if \code{FALSE}, will remove the framework prefix from column names, which may result in duplicates.
#' If this is not specified, and either 1 framework is selected or \code{as_list} is \code{TRUE}, prefixes are removed by default.
#' @param as_list Logical; if \code{TRUE}, returns a list with frameworks in separate entries.
#' @param bundle_size Number of texts to include in each request; between 1 and 1,000.
#' @param bundle_byte_limit Memory limit (in bytes) of each bundle, under \code{1e7} (10 MB, which is the API's limit).
#' May need to be lower than the API's limit, depending on the system's requesting library.
#' @param collapse_lines Logical; if \code{TRUE}, and \code{text} contains paths to files, each file is treated as a single text.
#' @param retry_limit Maximum number of times each request can be retried after hitting a rate limit.
#' @param overwrite Logical; if \code{TRUE}, will overwrite an existing \code{output} file.
#' @param compress Logical; if \code{TRUE}, will save as an \code{xz}-compressed file.
#' @param make_request Logical; if \code{FALSE}, a request is not made. This could be useful if you want to be sure to
#' load from one of the caches, but aren't sure that all results exist there; it will error out if it encounters
#' texts it has no other source for.
#' @param text_as_paths Logical; if \code{TRUE}, ensures \code{text} is treated as a vector of file paths. Otherwise, entries
#' are treated as paths only if there are no \code{NA}s in \code{text} and every entry is under 500 characters long.
#' @param cache Path to a directory in which to save unique results for reuse; defaults to
#' \code{Sys.getenv("RECEPTIVITI_CACHE")}. See the Cache section for details.
#' @param cache_overwrite Logical; if \code{TRUE}, will write results to the cache without reading from it. This could be used
#' if you want fresh results to be cached without clearing the cache.
#' @param cache_format Format of the cache database; see \code{\link[arrow]{FileFormat}}.
#' Defaults to \code{Sys.getenv("RECEPTIVITI_CACHE_FORMAT")}.
#' @param clear_cache Logical; if \code{TRUE}, will clear any existing files in the cache. Use \code{cache_overwrite} if
#' you want fresh results without clearing or disabling the cache. Use \code{cache = FALSE} to disable the cache.
#' @param request_cache Logical; if \code{FALSE}, will always make a fresh request, rather than using the response
#' from a previous identical request.
#' @param cores Number of CPU cores to split bundles across, if there are multiple bundles. See the Parallelization section.
#' @param use_future Logical; if \code{TRUE}, uses a \code{future} back-end to process bundles, in which case
#' parallelization can be controlled with the \code{\link[future]{plan}} function (e.g., \code{plan("multisession")}
#' to use multiple cores); this is required to see progress bars when using multiple cores. See the Parallelization section.
#' @param in_memory Logical; if \code{FALSE}, will write bundles to temporary files, and only load them as they are being requested.
#' @param clear_scratch_cache Logical; if \code{FALSE}, will preserve the bundles written when \code{in_memory} is \code{FALSE}, after
#' the request has been made.
#' @param verbose Logical; if \code{TRUE}, will show status messages.
#' @param key API Key; defaults to \code{Sys.getenv("RECEPTIVITI_KEY")}.
#' @param secret API Secret; defaults to \code{Sys.getenv("RECEPTIVITI_SECRET")}.
#' @param url API endpoint; defaults to \code{Sys.getenv("RECEPTIVITI_URL")}, which defaults to
#' \code{"https://api.receptiviti.com/"}.
#' @param include_headers Logical; if \code{TRUE}, \code{receptiviti_status}'s verbose message will include
#' the HTTP headers.
#'
#' @returns A \code{data.frame} with columns for \code{text} (if \code{return_text} is \code{TRUE}; the originally entered text),
#' \code{id} (if one was provided), \code{text_hash} (the MD5 hash of the text), a column each for relevant entries in \code{api_args},
#' and scores from each included framework (e.g., \code{summary.word_count} and \code{liwc.i}). If \code{as_list} is \code{TRUE},
#' returns a list with a named entry containing such a \code{data.frame} for each framework.
#'
#' @section Cache:
#' By default, results for unique texts are saved in an \href{https://arrow.apache.org}{Arrow} database in the
#' cache location (\code{Sys.getenv("RECEPTIVITI_CACHE")}), and are retrieved with subsequent requests.
#' This ensures that the exact same texts are not re-sent to the API.
#' This does, however, add some processing time and disc space usage.
#'
#' If a cache location is not specified, a default directory (\code{receptiviti_cache}) will be looked for
#' in the system's temporary directory (which is usually the parent of \code{tempdir()}).
#' If this does not exist, you will be asked if it should be created. You can disable this prompt with the
#' \code{receptiviti.cache_prompt} option (\code{options(receptiviti.cache_prompt = FALSE)}).
#'
#' The \code{cache_format} argument (or the \code{RECEPTIVITI_CACHE_FORMAT} environment variable) can be used to adjust the format of the cache.
#'
#' You can use the cache independently with \code{arrow::open_dataset(Sys.getenv("RECEPTIVITI_CACHE"))}.
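#' For example, assuming the default \code{parquet} format, the full cache could be loaded with something like:
#' \preformatted{db <- arrow::open_dataset(
#'   Sys.getenv("RECEPTIVITI_CACHE"),
#'   partitioning = arrow::schema(bin = arrow::string()),
#'   format = "parquet"
#' )
#' # collect scores, dropping the placeholder row used to initialize the cache
#' scores <- as.data.frame(dplyr::collect(dplyr::filter(db, text_hash != "")))}
#'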
#' You can set the \code{cache} argument to \code{FALSE} to prevent the cache from being used, which might make sense if
#' you don't expect to need to reprocess it.
#'
#' You can also set the \code{clear_cache} argument to \code{TRUE} to clear the cache before it is used again, which may be useful
#' if the cache has gotten big, or you know new results will be returned. Even if a cached result exists, it will be
#' reprocessed if it does not have all of the variables of new results, but this depends on there being at least 1 uncached
#' result. If, for instance, you add a framework to your account and want to reprocess a previously processed set of texts,
#' you would need to first clear the cache.
#'
#' Either way, duplicated texts within the same call will only be sent once.
#'
#' The \code{request_cache} argument controls a more temporary cache of each bundle request. This is cleared when the
#' R session ends. You might want to set this to \code{FALSE} if a new framework becomes available on your account
#' and you want to process a set of texts you already processed in the current R session without restarting.
#'
#' Another temporary cache is made when \code{in_memory} is \code{FALSE}, which is the default when processing
#' in parallel (when \code{cores} is over \code{1} or \code{use_future} is \code{TRUE}). This contains
#' a file for each unique bundle, which is read in as needed by the parallel workers.
#'
#' @section Parallelization:
#' \code{text}s are split into bundles based on the \code{bundle_size} argument. Each bundle represents
#' a single request to the API, which is why they are limited to 1,000 texts and a total size of 10 MB.
#' When there is more than one bundle and either \code{cores} is greater than 1 or \code{use_future} is \code{TRUE} (and you've
#' externally specified a \code{\link[future]{plan}}), bundles are processed by multiple cores.
#'
#' Using \code{future} also allows for progress bars to be specified externally with \code{\link[progressr]{handlers}}; see examples.
#' @examples
#' \dontrun{
#'
#' # check that the API is available, and your credentials work
#' receptiviti_status()
#'
#' # score a single text
#' single <- receptiviti("a text to score")
#'
#' # score multiple texts, and write results to a file
#' multi <- receptiviti(c("first text to score", "second text"), "filename.csv")
#'
#' # score many texts in separate files
#' ## defaults to look for .txt files
#' file_results <- receptiviti("./path/to/txt_folder")
#'
#' ## could be .csv
#' file_results <- receptiviti(
#'   "./path/to/csv_folder",
#'   text_column = "text", file_type = "csv"
#' )
#'
#' # score many texts from a file, with a progress bar
#' ## set up cores and progress bar (only necessary if you want the progress bar)
#' future::plan("multisession")
#' progressr::handlers(global = TRUE)
#' progressr::handlers("progress")
#'
#' ## make request
#' results <- receptiviti(
#'   "./path/to/largefile.csv",
#'   text_column = "text", use_future = TRUE
#' )
#' }
#' @importFrom curl new_handle curl_fetch_memory curl_fetch_disk
#' @importFrom jsonlite toJSON fromJSON read_json
#' @importFrom utils object.size
#' @importFrom digest digest
#' @importFrom parallel detectCores makeCluster clusterExport parLapplyLB parLapply stopCluster
#' @importFrom future.apply future_lapply
#' @importFrom progressr progressor
#' @importFrom arrow read_csv_arrow write_csv_arrow schema string write_dataset open_dataset
#' @importFrom dplyr filter compute collect select
#' @export

receptiviti <- function(text, output = NULL, id = NULL, text_column = NULL, id_column = NULL, file_type = "txt", return_text = FALSE,
                        api_args = getOption("receptiviti.api_args", list()), frameworks = getOption("receptiviti.frameworks", "all"),
                        framework_prefix = TRUE, as_list = FALSE, bundle_size = 1000, bundle_byte_limit = 75e5, collapse_lines = FALSE,
                        retry_limit = 50, clear_cache = FALSE, clear_scratch_cache = TRUE, request_cache = TRUE,
                        cores = detectCores() - 1, use_future = FALSE, in_memory = TRUE, verbose = FALSE, overwrite = FALSE,
                        compress = FALSE, make_request = TRUE, text_as_paths = FALSE, cache = Sys.getenv("RECEPTIVITI_CACHE", ""),
                        cache_overwrite = FALSE, cache_format = Sys.getenv("RECEPTIVITI_CACHE_FORMAT", "parquet"),
                        key = Sys.getenv("RECEPTIVITI_KEY"), secret = Sys.getenv("RECEPTIVITI_SECRET"), url = Sys.getenv("RECEPTIVITI_URL")) {
  # check input
  final_res <- text_hash <- bin <- NULL
  if (!is.null(output)) {
    if (!file.exists(output) && file.exists(paste0(output, ".xz"))) output <- paste0(output, ".xz")
    if (!overwrite && file.exists(output)) stop("output file already exists; use overwrite = TRUE to overwrite it", call. = FALSE)
  }
  if (cache == "") {
    temp <- dirname(tempdir())
    if (basename(temp) == "working_dir") temp <- dirname(dirname(temp))
    cache <- paste0(temp, "/receptiviti_cache")
    if (!dir.exists(cache)) {
      if (interactive() && !isFALSE(getOption("receptiviti.cache_prompt")) &&
        grepl("^(?:[Yy1]|$)", readline("Do you want to establish a default cache? [Y/n] "))) {
        dir.create(cache, FALSE)
      } else {
        options(receptiviti.cache_prompt = FALSE)
        cache <- ""
      }
    }
  }
  st <- proc.time()[[3]]
  if (missing(text)) stop("enter text as the first argument", call. = FALSE)
  if (text_as_paths) {
    if (anyNA(text)) stop("NAs are not allowed in text when being treated as file paths", call. = FALSE)
    if (!all(file.exists(text))) stop("not all of the files in text exist", call. = FALSE)
  }
  read_in <- FALSE
  if (text_as_paths || (is.character(text) && !anyNA(text) && all(nchar(text) < 500))) {
    if (length(text) == 1 && dir.exists(text)) {
      if (verbose) message("reading in texts from directory: ", text, " (", round(proc.time()[[3]] - st, 4), ")")
      text_as_paths <- TRUE
      text <- list.files(text, file_type, full.names = TRUE)
    } else if (text_as_paths || all(file.exists(text))) {
      text_as_paths <- FALSE
      if (verbose) message("reading in texts from file list (", round(proc.time()[[3]] - st, 4), ")")
      if (missing(id_column)) names(text) <- if (length(id) != length(text)) text else id
      if (all(grepl("\\.csv", text, TRUE))) {
        if (is.null(text_column)) stop("text appears to point to csv files, but text_column was not specified", call. = FALSE)
        read_in <- TRUE
        text <- unlist(lapply(text, function(f) {
          if (file.exists(f)) {
            d <- tryCatch(
              read_csv_arrow(f, col_select = c(text_column, id_column)),
              error = function(e) NULL
            )
            if (is.null(d)) stop("failed to read in file ", f, call. = FALSE)
            d <- if (!is.null(id_column) && id_column %in% colnames(d)) {
              structure(d[, text_column, drop = TRUE], names = d[, id_column, drop = TRUE])
            } else {
              d[, text_column, drop = TRUE]
            }
            if (collapse_lines) d <- paste(d, collapse = " ")
            d
          } else {
            f
          }
        }))
      } else {
        text <- unlist(lapply(text, function(f) {
          if (file.exists(f)) {
            d <- readLines(f, warn = FALSE, skipNul = TRUE)
            if (collapse_lines) d <- paste(d, collapse = " ")
            d
          } else {
            f
          }
        }))
      }
      if (!collapse_lines) id <- names(text)
    }
  }
  if (is.null(dim(text))) {
    if (!read_in) {
      if (!text_as_paths && !is.null(text_column)) stop("text_column is specified, but text has no columns", call. = FALSE)
      if (!is.null(id_column)) stop("id_column is specified, but text has no columns", call. = FALSE)
    }
  } else {
    if (length(id) == 1 && id %in% colnames(text)) id_column <- id
    if (!is.null(id_column)) {
      if (id_column %in% colnames(text)) {
        id <- text[, id_column, drop = TRUE]
      } else {
        stop("id_column not found in text", call. = FALSE)
      }
    }
    if (!is.null(text_column)) {
      if (text_column %in% colnames(text)) {
        text <- text[, text_column, drop = TRUE]
      } else {
        if (!text_as_paths) stop("text_column not found in text", call. = FALSE)
      }
    }
    if (!is.null(dim(text))) {
      if (ncol(text) == 1) {
        text <- text[, 1, drop = TRUE]
      } else {
        stop("text has dimensions, but no text_column column", call. = FALSE)
      }
    }
  }
  if (!is.character(text)) text <- as.character(text)
  if (length(id) && !is.character(id)) id <- as.character(id)
  provided_id <- FALSE
  if (length(id)) {
    if (length(id) != length(text)) stop("id is not the same length as text", call. = FALSE)
    if (anyDuplicated(id)) stop("id contains duplicate values", call. = FALSE)
    provided_id <- TRUE
  } else {
    id <- paste0("t", seq_along(text))
  }
  if (!is.numeric(retry_limit)) retry_limit <- 0
  url <- paste0(sub("(?:/v\\d+)?/+$", "", url), "/v1/")
  if (!is.list(api_args)) api_args <- as.list(api_args)
  args_hash <- if (length(api_args)) digest::digest(api_args, algo = "crc32") else ""

  # ping API
  if (make_request) {
    if (verbose) message("pinging API (", round(proc.time()[[3]] - st, 4), ")")
    ping <- receptiviti_status(url, key, secret, verbose = FALSE)
    if (ping$status_code != 200) stop(ping$status_message, call. = FALSE)
  }

  # prepare text
  if (verbose) message("preparing text (", round(proc.time()[[3]] - st, 4), ")")
  data <- data.frame(text = text, id = id, stringsAsFactors = FALSE)
  text <- data[!is.na(data$text) & data$text != "" & !duplicated(data$text), ]
  if (!nrow(text)) stop("no valid texts to process", call. = FALSE)
  if (!is.numeric(bundle_size)) bundle_size <- 1000
  n <- ceiling(nrow(text) / min(1000, max(1, bundle_size)))
  bundles <- split(text, sort(rep_len(seq_len(n), nrow(text))))
  size_fun <- if (text_as_paths) function(b) sum(file.size(b$text)) else object.size
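  # split any bundle over the byte limit into sequentially filled sub-bundles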
  for (i in rev(seq_along(bundles))) {
    size <- size_fun(bundles[[i]])
    if (size > bundle_byte_limit) {
      sizes <- vapply(seq_len(nrow(bundles[[i]])), function(r) as.numeric(size_fun(bundles[[i]][r, ])), 0)
      if (any(sizes > bundle_byte_limit)) {
        stop(
          "one of your texts is over the individual size limit (", bundle_byte_limit / 1e6, " MB)",
          call. = FALSE
        )
      }
      bins <- rep(1, length(sizes))
      bin_size <- 0
      bi <- 1
      for (ti in seq_along(bins)) {
        bin_size <- bin_size + sizes[ti]
        if (bin_size > bundle_byte_limit) {
          bin_size <- sizes[ti]
          bi <- bi + 1
        }
        bins[ti] <- bi
      }
      bundles <- c(bundles[-i], unname(split(bundles[[i]], paste0(i, ".", bins))))
    }
  }
  n <- length(bundles)
  bundle_ref <- if (n == 1) "bundle" else "bundles"
  if (verbose) message("prepared text in ", n, " ", bundle_ref, " (", round(proc.time()[[3]] - st, 4), ")")

  # prepare cache
  if (is.character(cache)) {
    temp <- paste0(normalizePath(cache, "/", FALSE), "/")
    cache <- TRUE
    if (clear_cache) unlink(temp, recursive = TRUE)
    dir.create(temp, FALSE)
  } else {
    temp <- NULL
    cache <- FALSE
  }

  check_cache <- cache && !cache_overwrite
  endpoint <- paste0(url, "framework/bulk")
  auth <- paste0(key, ":", secret)
  if (missing(in_memory) && (use_future || cores > 1) && n > cores) in_memory <- FALSE
  request_scratch <- NULL
  if (!in_memory) {
    if (verbose) message("writing ", bundle_ref, " to disc (", round(proc.time()[[3]] - st, 4), ")")
    request_scratch <- paste0(tempdir(), "/receptiviti_request_scratch/")
    dir.create(request_scratch, FALSE)
    if (clear_scratch_cache) on.exit(unlink(request_scratch, recursive = TRUE))
    bundles <- vapply(bundles, function(b) {
      scratch_bundle <- paste0(request_scratch, digest(b), ".rds")
      if (!file.exists(scratch_bundle)) saveRDS(b, scratch_bundle, compress = FALSE)
      scratch_bundle
    }, "", USE.NAMES = FALSE)
  }

doprocess <- function(bundles, cores, future) {
|
| 350 | 5x |
env <- parent.frame() |
| 351 | 5x |
if (future) {
|
| 352 | 1x |
eval(expression(future.apply::future_lapply(bundles, process)), envir = env) |
| 353 |
} else {
|
|
| 354 | 4x |
cl <- parallel::makeCluster(cores) |
| 355 | 4x |
parallel::clusterExport(cl, ls(envir = env), env) |
| 356 | 4x |
on.exit(parallel::stopCluster(cl)) |
| 357 | 4x |
(if (length(bundles) > cores * 2) parallel::parLapplyLB else parallel::parLapply)(cl, bundles, process) |
| 358 |
} |
|
| 359 |
} |
|
| 360 | ||
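  # sends one bundle to the bulk endpoint; responses are cached in files keyed by a
  # hash of the endpoint, credentials, and body, and rate-limited requests (code 1420)
  # are retried up to retry_limit times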
  request <- function(body, bin, ids, attempt = retry_limit) {
    unpack <- function(d) {
      if (is.list(d)) as.data.frame(lapply(d, unpack), optional = TRUE) else d
    }
    json <- jsonlite::toJSON(unname(body), auto_unbox = TRUE)
    temp_file <- paste0(tempdir(), "/", digest::digest(paste0(endpoint, auth, json), serialize = FALSE), ".json")
    if (!request_cache) unlink(temp_file)
    res <- NULL
    if (!file.exists(temp_file)) {
      if (make_request) {
        handler <- tryCatch(
          curl::new_handle(httpauth = 1, userpwd = auth, copypostfields = json),
          error = function(e) e$message
        )
        if (is.character(handler)) {
          stop(if (grepl("libcurl", handler, fixed = TRUE)) {
            "libcurl encountered an error; try setting the bundle_byte_limit argument to a smaller value"
          } else {
            paste("failed to create handler:", handler)
          }, call. = FALSE)
        }
        res <- curl::curl_fetch_disk(endpoint, temp_file, handler)
      } else {
        stop("make_request is FALSE, but there are texts with no cached results", call. = FALSE)
      }
    }
    result <- if (file.exists(temp_file)) {
      if (is.null(res$type) || grepl("application/json", res$type, fixed = TRUE)) {
        tryCatch(
          jsonlite::read_json(temp_file, simplifyVector = TRUE),
          error = function(e) list(message = "invalid response format")
        )
      } else {
        list(message = "invalid response format")
      }
    } else {
      list(message = rawToChar(res$content))
    }
    if (!is.null(result$results)) {
      result <- result$results
      if ("error" %in% names(result)) {
        su <- !is.na(result$error$code)
        errors <- result[su & !duplicated(result$error$code), "error"]
        warning(
          if (sum(su) > 1) "some texts were invalid: " else "a text was invalid: ",
          paste(
            do.call(paste0, data.frame("(", errors$code, ") ", errors$message, stringsAsFactors = FALSE)),
            collapse = "; "
          ),
          call. = FALSE
        )
      }
      result <- unpack(result[!names(result) %in% c("response_id", "language", "version", "error")])
      if (!is.null(result) && nrow(result)) {
        colnames(result)[1] <- "text_hash"
        cbind(id = ids, bin = bin, result)
      }
    } else {
      unlink(temp_file)
      if (length(result$message) == 1 && substr(result$message, 1, 1) == "{") {
        result <- jsonlite::fromJSON(result$message)
      }
      if (attempt > 0 && ((length(result$code) == 1 && result$code == 1420) || (
        length(result$message) == 1 && result$message == "invalid response format"
      ))) {
        wait_time <- as.numeric(regmatches(result$message, regexec("[0-9]+(?:\\.[0-9]+)?", result$message)))
        Sys.sleep(if (is.na(wait_time)) 1 else wait_time / 1e3)
        request(body, bin, ids, attempt - 1)
      } else {
        stop(paste0(if (length(result$code)) {
          paste0(
            if (is.null(res$status_code)) 200 else res$status_code, " (", result$code, "): "
          )
        }, result$message), call. = FALSE)
      }
    }
  }

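  # scores a single bundle: hashes each text (prefixed with args_hash) to form its cache
  # key, pulls any hits from the Arrow cache, requests the rest, and fills rows for
  # unscorable texts with NAs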
  process <- function(bundle) {
    opts <- getOption("stringsAsFactors")
    options("stringsAsFactors" = FALSE)
    on.exit(options("stringsAsFactors" = opts))
    if (is.character(bundle)) bundle <- readRDS(bundle)
    text <- bundle$text
    bin <- NULL
    if (text_as_paths) {
      if (all(grepl("\\.csv", text, TRUE))) {
        if (is.null(text_column)) stop("files appear to be csv, but no text_column was specified", call. = FALSE)
        text <- vapply(text, function(f) paste(arrow::read_csv_arrow(f)[, text_column], collapse = " "), "")
      } else {
        text <- vapply(text, function(f) paste(readLines(f, warn = FALSE, skipNul = TRUE), collapse = " "), "")
      }
    }
    bundle$hashes <- paste0(args_hash, vapply(text, digest::digest, "", serialize = FALSE))
    initial <- paste0("h", substr(bundle$hashes, 1, 1))
    set <- !is.na(text) & text != "" & text != "logical(0)" & !duplicated(bundle$hashes)
    res_cached <- res_fresh <- NULL
    if (check_cache && dir.exists(paste0(temp, "bin=h"))) {
      db <- arrow::open_dataset(temp, partitioning = arrow::schema(bin = arrow::string()), format = cache_format)
      cached <- if (!is.null(db$schema$GetFieldByName("text_hash"))) {
        tryCatch(
          dplyr::compute(dplyr::filter(db, bin %in% unique(initial), text_hash %in% bundle$hashes)),
          error = function(e) matrix(integer(), 0)
        )
      } else {
        matrix(integer(), 0)
      }
      if (nrow(cached)) {
        cached <- as.data.frame(cached$to_data_frame())
        if (anyDuplicated(cached$text_hash)) cached <- cached[!duplicated(cached$text_hash), ]
        rownames(cached) <- cached$text_hash
        cached_set <- which(bundle$hashes %in% cached$text_hash)
        set[cached_set] <- FALSE
        res_cached <- cbind(id = bundle[cached_set, "id"], cached[bundle[cached_set, "hashes"], ])
      }
    }
    valid_options <- names(api_args)
    if (any(set)) {
      set <- which(set)
      res_fresh <- request(lapply(
        set, function(i) c(api_args, list(content = text[[i]], request_id = bundle[i, "hashes"]))
      ), initial[set], bundle[set, "id"])
      valid_options <- valid_options[valid_options %in% colnames(res_fresh)]
      if (length(valid_options)) {
        res_fresh <- res_fresh[, !colnames(res_fresh) %in% valid_options, drop = FALSE]
      }
      if (check_cache && !is.null(res_cached) && !all(colnames(res_cached) %in% colnames(res_fresh))) {
        res_cached <- NULL
        res_fresh <- rbind(
          res_fresh,
          request(lapply(
            cached_set, function(i) c(api_args, list(content = text[[i]], request_id = bundle[i, "hashes"]))
          ), initial[cached_set], bundle[cached_set, "id"])
        )
      }
    }
    res <- rbind(res_cached, res_fresh)
    if (length(valid_options)) for (n in valid_options) res[[n]] <- api_args[[n]]
    missing_ids <- !bundle$id %in% res$id
    if (any(missing_ids)) {
      varnames <- colnames(res)[colnames(res) != "id"]
      res <- rbind(res, cbind(
        id = bundle[missing_ids, "id"],
        as.data.frame(matrix(NA, sum(missing_ids), length(varnames), dimnames = list(NULL, varnames)))
      ))
      res$text_hash <- structure(bundle$hashes, names = bundle$id)[res$id]
    }
    prog()
    res
  }

  # make request(s)
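  # the worker functions get a minimal environment so that parallel::clusterExport
  # only has to copy the objects listed below to each worker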
  cores <- if (is.numeric(cores)) max(1, min(length(bundles), cores)) else 1
  call_env <- new.env(parent = globalenv())
  prog <- progressor(along = bundles)
  environment(doprocess) <- call_env
  environment(request) <- call_env
  environment(process) <- call_env
  for (name in c(
    "doprocess", "request", "process", "text_column", "prog", "make_request", "check_cache", "endpoint",
    "temp", "use_future", "cores", "bundles", "cache_format", "request_cache", "auth",
    "text_as_paths", "retry_limit", "api_args", "args_hash"
  )) {
    call_env[[name]] <- get(name)
  }
  results <- if (use_future || cores > 1) {
    if (verbose) {
      message(
        "processing ", bundle_ref, " using ", if (use_future) "future backend" else paste(cores, "cores"),
        " (", round(proc.time()[[3]] - st, 4), ")"
      )
    }
    eval(expression(doprocess(bundles, cores, use_future)), envir = call_env)
  } else {
    if (verbose) message("processing ", bundle_ref, " sequentially (", round(proc.time()[[3]] - st, 4), ")")
    lapply(bundles, process)
  }
  if (verbose) message("done retrieving; preparing final results (", round(proc.time()[[3]] - st, 4), ")")
  final_res <- do.call(rbind, results)
  if (length(api_args)) {
    su <- !names(api_args) %in% colnames(final_res)
    if (any(su)) warning("unrecognized api_args: ", paste(names(api_args)[su], collapse = ", "), call. = FALSE)
  }

  # update cache
  if (!is.null(temp)) {
    if (verbose) message("checking cache (", round(proc.time()[[3]] - st, 4), ")")
    initialized <- dir.exists(paste0(temp, "bin=h"))
    if (initialized) {
      db <- arrow::open_dataset(temp, partitioning = arrow::schema(bin = arrow::string()), format = cache_format)
      if (db$num_cols != (ncol(final_res) - 1)) {
        if (verbose) message("clearing existing cache since columns did not align (", round(proc.time()[[3]] - st, 4), ")")
        if (clear_cache) unlink(temp, recursive = TRUE)
        dir.create(temp, FALSE)
        initialized <- FALSE
      }
    }
    if (!initialized) {
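      # seed the cache with a placeholder row (text_hash == "") that stores score
      # columns as doubles, so that later appends don't clash on column types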
      su <- !colnames(final_res) %in% c("id", names(api_args))
      if (sum(su) > 2) {
        initial <- final_res[1, su]
        initial$text_hash <- ""
        initial$bin <- "h"
        initial[, !colnames(initial) %in% c(
          "bin", "text_hash", "summary.word_count", "summary.sentence_count"
        )] <- .1
        initial <- rbind(initial, final_res[, colnames(initial)])
        if (verbose) {
          message(
            "initializing cache with ", nrow(final_res), " result",
            if (nrow(final_res) > 1) "s", " (", round(proc.time()[[3]] - st, 4), ")"
          )
        }
        arrow::write_dataset(initial, temp, partitioning = "bin", format = cache_format)
      }
    } else {
fresh <- final_res[ |
| 578 | 24x |
!duplicated(final_res$text_hash), !colnames(final_res) %in% c("text_hash", names(api_args)),
|
| 579 | 24x |
drop = FALSE |
| 580 |
] |
|
| 581 | 24x |
cached <- dplyr::filter(db, bin %in% unique(fresh$bin), text_hash %in% fresh$text_hash) |
| 582 | 24x |
if (!any(dim(cached) == 0) || nrow(cached) != nrow(fresh)) {
|
| 583 | 24x |
uncached_hashes <- if (nrow(cached)) {
|
| 584 | ! |
!fresh$text_hash %in% dplyr::collect(dplyr::select(cached, text_hash))[, 1] |
| 585 |
} else {
|
|
| 586 | 24x |
rep(TRUE, nrow(fresh)) |
| 587 |
} |
|
| 588 | 24x |
if (any(uncached_hashes)) {
|
| 589 | 24x |
if (verbose) {
|
| 590 | 1x |
message( |
| 591 | 1x |
"updating cache with ", sum(uncached_hashes), " result", |
| 592 | 1x |
if (sum(uncached_hashes) > 1) "s", " (", round(proc.time()[[3]] - st, 4), ")"
|
| 593 |
) |
|
| 594 |
} |
|
| 595 | 24x |
arrow::write_dataset(fresh[uncached_hashes, ], temp, partitioning = "bin", format = cache_format) |
| 596 |
} |
|
| 597 |
} |
|
| 598 |
} |
|
| 599 |
} |
|
| 600 | ||
| 601 |
# prepare final results |
|
| 602 | 2x |
if (verbose) message("preparing output (", round(proc.time()[[3]] - st, 4), ")")
|
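  # align scores with the original inputs: map each text to its hash, then map hashes
  # back to scored rows so results come out in input order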
  rownames(final_res) <- final_res$id
  rownames(data) <- data$id
  data$text_hash <- structure(final_res$text_hash, names = data[final_res$id, "text"])[data$text]
  final_res <- cbind(
    data[, c(if (return_text) "text", if (provided_id) "id", "text_hash"), drop = FALSE],
    final_res[
      structure(final_res$id, names = final_res$text_hash)[data$text_hash],
      !colnames(final_res) %in% c("id", "bin", "text_hash", "custom"),
      drop = FALSE
    ]
  )
  row.names(final_res) <- NULL
  if (!is.null(output)) {
    if (!grepl("\\.csv", output, TRUE)) output <- paste0(output, ".csv")
    if (compress && !grepl(".xz", output, fixed = TRUE)) output <- paste0(output, ".xz")
    if (grepl(".xz", output, fixed = TRUE)) compress <- TRUE
    if (verbose) message("writing results to file: ", output, " (", round(proc.time()[[3]] - st, 4), ")")
    dir.create(dirname(output), FALSE, TRUE)
    if (overwrite) unlink(output)
    if (compress) output <- xzfile(output)
    write_csv_arrow(final_res, file = output)
  }

  if (is.character(frameworks) && frameworks[1] != "all") {
    if (verbose) message("selecting frameworks (", round(proc.time()[[3]] - st, 4), ")")
    vars <- colnames(final_res)
    sel <- grepl(paste0("^(?:", paste(tolower(frameworks), collapse = "|"), ")"), vars)
    if (any(sel)) {
      if (missing(framework_prefix) && (length(frameworks) == 1 && frameworks != "all")) framework_prefix <- FALSE
      sel <- unique(c("text", "id", "text_hash", names(api_args), vars[sel]))
      sel <- sel[sel %in% vars]
      final_res <- final_res[, sel]
    } else {
      warning("frameworks did not match any columns -- returning all", call. = FALSE)
    }
  }
  if (as_list) {
    if (missing(framework_prefix)) framework_prefix <- FALSE
    inall <- c("text", "id", "text_hash", names(api_args))
    cols <- colnames(final_res)
    inall <- inall[inall %in% cols]
    pre <- sub("\\..*$", "", cols)
    pre <- unique(pre[!pre %in% inall])
    final_res <- lapply(structure(pre, names = pre), function(f) {
      res <- final_res[, c(inall, grep(paste0("^", f), cols, value = TRUE))]
      if (!framework_prefix) colnames(res) <- sub("^.+\\.", "", colnames(res))
      res
    })
  } else if (!framework_prefix) colnames(final_res) <- sub("^.+\\.", "", colnames(final_res))
  if (verbose) message("done (", round(proc.time()[[3]] - st, 4), ")")
  invisible(final_res)
}