Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,6 @@ scratch/
PR.md
.DS_Store
TPLYR2_REQUIREMENTS.MD

/.quarto/
**/*.quarto_ipynb
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: Tplyr
Title: A Traceability Focused Grammar of Clinical Data Summary
Version: 1.3.0
Version: 1.3.0.9000
Authors@R:
c(
person(given = "Eli",
Expand Down Expand Up @@ -46,7 +46,7 @@ License: MIT + file LICENSE
URL: https://atorus-research.github.io/Tplyr/, https://github.com/atorus-research/Tplyr
BugReports: https://github.com/atorus-research/Tplyr/issues
Encoding: UTF-8
Depends: R (>= 3.5.0)
Depends: R (>= 4.1.0)
Imports:
rlang (>= 0.4.6),
assertthat (>= 0.2.1),
Expand Down
135 changes: 91 additions & 44 deletions R/sort.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,36 @@

#### Helpers ####

#' Find row indices for special rows (total, missing, missing subjects)
#'
#' Locates the positions of total, missing, and missing subjects rows within
#' formatted data by matching labels in the last row_label column.
#'
#' @param formatted_data The formatted data frame
#' @param missing_string String identifying missing rows
#' @param total_row_label Label for the total row
#' @param missing_subjects_row_label Label for missing subjects row
#'
#' @return A named list with elements `missing`, `total`, and
#' `missing_subjects`, each NULL or an integer vector of row indices
#' @noRd
find_special_row_indices <- function(formatted_data,
missing_string = NULL,
total_row_label = NULL,
missing_subjects_row_label = NULL) {
label_row_ind <- which(names(formatted_data) %in%
tail(vars_select(names(formatted_data),
starts_with("row_label")), 1))
label_values <- unlist(formatted_data[, label_row_ind])

list(
missing = if (!is.null(missing_string)) which(label_values %in% missing_string),
total = if (!is.null(total_row_label)) which(label_values %in% total_row_label),
missing_subjects = if (!is.null(missing_subjects_row_label))
which(label_values %in% missing_subjects_row_label)
)
}

#' Check for VARN variable
#'
#' For needed by variables, looks for a <by>N. e.g. VISITN
Expand Down Expand Up @@ -360,10 +390,20 @@ add_order_columns.shift_layer <- function(x) {
)
}

# Find explicit indices of special rows
total_row_label <- env_get(x, "total_row_label", default = NULL)
missing_subjects_row_label <- env_get(x, "missing_subjects_row_label", default = NULL)
row_inds <- find_special_row_indices(formatted_data,
total_row_label = total_row_label,
missing_subjects_row_label = missing_subjects_row_label)

# The logic is the same now for a byvarn so reuse that function
formatted_data[, paste0("ord_layer_", formatted_col_index)] <-
get_data_order_byvarn(formatted_data, fact_df, as_name(target_var$row),
formatted_col_index, total_row_sort_value = total_row_sort_value,
formatted_col_index,
total_index = row_inds$total,
total_row_sort_value = total_row_sort_value,
missing_subjects_index = row_inds$missing_subjects,
missing_subjects_sort_value = missing_subjects_sort_value)

# BIND: Bind necessary variables back into layer
Expand Down Expand Up @@ -460,31 +500,18 @@ get_data_order_count <- function(formatted_data, formatted_col_index,
# Switch for the sorting method
if (order_count_method == "bycount") {

# Get the index of the row with the missing and total names
label_row_ind <- which(names(formatted_data) %in%
tail(vars_select(names(formatted_data),
starts_with("row_label")), 1))

missing_index <- NULL
total_index <- NULL
missing_subjects_index <- NULL

if (!is.null(missing_string)) {
missing_index <- which(unlist(formatted_data[, label_row_ind]) %in% missing_string)
}
if (!is.null(total_row_label)) {
total_index <- which(unlist(formatted_data[, label_row_ind]) %in% total_row_label)
}
if (!is.null(missing_subjects_row_label)) {
missing_subjects_index <- which(unlist(formatted_data[, label_row_ind]) %in% missing_subjects_row_label)
}
# Get the indices of special rows (missing, total, missing subjects)
row_inds <- find_special_row_indices(formatted_data,
missing_string = missing_string,
total_row_label = total_row_label,
missing_subjects_row_label = missing_subjects_row_label)

# No processing is needed here just pass in the needed info
get_data_order_bycount(numeric_data, ordering_cols,
treat_var, by, cols, result_order_var, target_var,
missing_index, missing_sort_value,
total_index, total_row_sort_value,
missing_subjects_index, missing_subjects_sort_value,
row_inds$missing, missing_sort_value,
row_inds$total, total_row_sort_value,
row_inds$missing_subjects, missing_subjects_sort_value,
break_ties = break_ties,
numeric_cutoff = numeric_cutoff,
numeric_cutoff_stat = numeric_cutoff_stat,
Expand All @@ -498,7 +525,7 @@ get_data_order_count <- function(formatted_data, formatted_col_index,

if (!is.null(missing_count_list)) {
varN_name <- names(varn_df)[2]
varn_df[, 1] <- as.character(varn_df[, 1])
varn_df[[1]] <- as.character(varn_df[[1]])
if (is.null(missing_sort_value)) {
varn_df <- varn_df %>%
bind_rows(tibble(
Expand All @@ -514,8 +541,16 @@ get_data_order_count <- function(formatted_data, formatted_col_index,
}
}

# Find the explicit indices of special rows
row_inds <- find_special_row_indices(formatted_data,
total_row_label = total_row_label,
missing_subjects_row_label = missing_subjects_row_label)

get_data_order_byvarn(formatted_data, varn_df, as_name(target_var[[1]]),
formatted_col_index, total_row_sort_value = total_row_sort_value,
formatted_col_index,
total_index = row_inds$total,
total_row_sort_value = total_row_sort_value,
missing_subjects_index = row_inds$missing_subjects,
missing_subjects_sort_value = missing_subjects_sort_value)

# Here it is 'byfactor'
Expand Down Expand Up @@ -568,9 +603,17 @@ get_data_order_count <- function(formatted_data, formatted_col_index,
}
}

# Find the explicit indices of special rows
row_inds <- find_special_row_indices(formatted_data,
total_row_label = total_row_label,
missing_subjects_row_label = missing_subjects_row_label)

# The logic is the same now for a byvarn so reuse that function
get_data_order_byvarn(formatted_data, fact_df, as_name(target_var[[1]]),
formatted_col_index, total_row_sort_value = total_row_sort_value,
formatted_col_index,
total_index = row_inds$total,
total_row_sort_value = total_row_sort_value,
missing_subjects_index = row_inds$missing_subjects,
missing_subjects_sort_value = missing_subjects_sort_value)
}
}
Expand Down Expand Up @@ -667,37 +710,40 @@ get_data_order_bycount <- function(numeric_data, ordering_cols,
}

get_data_order_byvarn <- function(formatted_data, by_varn_df, by_var, by_column_index,
indentation = "", total_row_sort_value = NULL,
indentation = "",
total_index = NULL, total_row_sort_value = NULL,
missing_subjects_index = NULL,
missing_subjects_sort_value = NULL) {

# Pull out the by values in the formatted data.
by_values <- unlist(formatted_data[, by_column_index])

# Look up the VARN value for each row. Unmatched rows get a default
# value that places them at the end.
default_sort <- max(unlist(by_varn_df[, 2])) + 1
varns <- map_dbl(by_values, function(a_by) {

# Row containing the index and the value
ind_row <- by_varn_df %>%
# Converting by_var to a symbol didn't work here for some reason but this
# works just as well.
filter(.data[[as_name(by_var)]] == a_by)

# If the row is length zero it is a total row. Just add one so it appears on the bottom or use the sort_value
if (nrow(ind_row) == 0) {
# Flag to determine where total row is positioned
if(!is.null(total_row_sort_value)) {
total_row_sort_value
} else if (!is.null(missing_subjects_sort_value)){
missing_subjects_sort_value
} else {
max(by_varn_df[,2]) + 1
}
default_sort
} else {
# Index is always in the second row
as.double(unlist(ind_row[, 2]))
}
})

# Remove the names so its just an unnamed numeric vecetor
# Apply sort values to specific total and missing subjects rows by index
if (!is.null(total_index) && !is.null(total_row_sort_value)) {
varns[total_index] <- total_row_sort_value
}

if (!is.null(missing_subjects_index) && !is.null(missing_subjects_sort_value)) {
varns[missing_subjects_index] <- missing_subjects_sort_value
}

# Remove the names so its just an unnamed numeric vector
unname(varns)

}
Expand Down Expand Up @@ -775,11 +821,12 @@ add_data_order_nested_vectorized <- function(formatted_data, final_col, numeric_
# - by[[1]] becomes row_label1 (contains NA for outer rows)
# - summary_var becomes row_label{length(by)+1} (contains actual outer values)
# So we need to look at column final_col + 1, not final_col
all_outer$..outer_order <- all_outer %>%
replace_by_string_names(c(by, quo(summary_var))) %>%
get_data_order_byvarn(varn_df, by[[1]], final_col + 1,
total_row_sort_value = total_row_sort_value,
missing_subjects_sort_value = missing_subjects_sort_value)
renamed_outer <- all_outer %>%
replace_by_string_names(c(by, quo(summary_var)))
# No total/missing subjects rows expected at the outer nesting level,
# so no indices are passed
all_outer$..outer_order <-
get_data_order_byvarn(renamed_outer, varn_df, by[[1]], final_col + 1)

} else if (order_count_method[1] == "bycount") {
all_outer$..outer_order <- get_data_order_bycount(
Expand Down
4 changes: 3 additions & 1 deletion R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -256,11 +256,13 @@ extract_character_from_quo <- function(var_list) {
#' @param dat Dataframe to strip of variable attributes
#'
#' @return Dataframe with variable attributes removed, except for factor levels
#' and other structurally necessary attributes (e.g. units for difftime,
#' tzone for POSIXct)
#' @noRd
clean_attr <- function(dat) {
for (n in names(dat)) {
for (a in names(attributes(dat[[n]]))) {
if (!a %in% c('levels', 'class', 'names', 'row.names', 'groups')) {
if (!a %in% c('levels', 'class', 'names', 'row.names', 'groups', 'units', 'tzone')) {
attr(dat[[n]], a) <- NULL
}
}
Expand Down
41 changes: 41 additions & 0 deletions tests/testthat/test-sort.R
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,47 @@ test_that("Nested counts with by variables process properly", {

})

# Added to address #202
test_that("byvarn sorting with a total row only applies sort_value to the total row", {
t <- tplyr_table(tplyr_adsl, TRT01A) %>%
add_layer(
group_count(AGEGR1) %>%
set_order_count_method("byvarn") %>%
add_total_row(fmt = f_str("xx", n), count_missings = FALSE, sort_value = -Inf) %>%
set_total_row_label("n")
)
b_t <- build(t)

# The total row should have -Inf, other rows should have their AGEGR1N values
total_mask <- b_t$row_label1 == "n"
expect_true(all(b_t$ord_layer_1[total_mask] == -Inf))
expect_true(all(b_t$ord_layer_1[!total_mask] != -Inf))

# Non-total rows should have distinct sort values matching AGEGR1N
non_total_sorts <- b_t$ord_layer_1[!total_mask]
agegr1n_values <- sort(unique(tplyr_adsl$AGEGR1N))
expect_equal(sort(non_total_sorts), agegr1n_values)

# Same test with a by variable and set_missing_count, which exercises
# the as.character() coercion path for varn_df
t2 <- tplyr_table(tplyr_adsl, TRT01A) %>%
add_layer(
group_count(AGEGR1, by = "Age Categories n (%)") %>%
set_order_count_method("byvarn") %>%
add_total_row(fmt = f_str("xx", n), count_missings = FALSE, sort_value = -Inf) %>%
set_total_row_label("n") %>%
set_missing_count(fmt = f_str("xx", n), denom_ignore = TRUE, "Missing" = NA)
)
b_t2 <- build(t2)

total_mask2 <- b_t2$row_label2 == "n"
missing_mask2 <- b_t2$row_label2 == "Missing"
data_mask2 <- !total_mask2 & !missing_mask2

expect_true(all(b_t2$ord_layer_2[total_mask2] == -Inf))
expect_equal(sort(b_t2$ord_layer_2[data_mask2]), sort(unique(tplyr_adsl$AGEGR1N)))
})

test_that("Sorting functions work correctly after refactoring", {
# Test all three sorting methods to ensure functionality is preserved

Expand Down
6 changes: 5 additions & 1 deletion vignettes/desc.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -186,8 +186,12 @@ tplyr_table(tplyr_adsl, TRT01P) %>%
group_desc(AGE) %>%
set_format_strings("Mean" = f_str('xx.xx', mean))
) %>%
build() %>%
build() %>%
kable()
```

```{r echo=FALSE}
options(tplyr.custom_summaries = NULL)
```

Note that the table code used to produce the output is the same. Now **Tplyr** used the custom summary function for `mean` as specified in the `tplyr.custom_summaries` option. Also note the use of `rlang::quos()`. We've done our best to mask this from the user everywhere possible and make the interfaces clean and intuitive, but a great deal of **Tplyr** is built using 'rlang' and non-standard evaluation. Within this option is one of the very few instances where a user needs to concern themselves with the use of quosures. If you'd like to learn more about non-standard evaluation and quosures, we recommend [Section IV](https://adv-r.hadley.nz/metaprogramming.html) in Advanced R.
6 changes: 5 additions & 1 deletion vignettes/options.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,11 @@ tplyr_table(tplyr_adsl, TRT01P) %>%
kable()
```

## IBM Rounding
```{r include=FALSE}
options(op)
```

## IBM Rounding

In certain cases users may want to match tables produced by other languages that IBM rounding.**Tplyr** offers the option 'tplyr.IBMRounding' to change the default rounding behavior of **Tplyr** tables. Review var1_4 in the tables below.

Expand Down