## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----setup, message=FALSE----------------------------------------------------- library(scoringutils) library(data.table) library(ggplot2) library(magrittr) library(magrittr) #pipe operator ## ----eval=require("ggdist", quietly = TRUE)----------------------------------- #" @title Plot Predictions vs True Values #" #" @description #" Make a plot of observed and predicted values #" #" @param data a data.frame that follows the same specifications outlined in #" [score()]. To customise your plotting, you can filter your data using the #" function [make_NA()]. #" @param by character vector with column names that denote categories by which #" the plot should be stratified. If for example you want to have a facetted #" plot, this should be a character vector with the columns used in facetting #" (note that the facetting still needs to be done outside of the function call) #" @param x character vector of length one that denotes the name of the variable #" @param interval_range numeric vector indicating the interval ranges to plot. #" If 0 is included in `interval_range`, the median prediction will be shown. #" @return ggplot object with a plot of true vs predicted values #" @importFrom ggplot2 ggplot scale_colour_manual scale_fill_manual theme_light #" @importFrom ggplot2 facet_wrap facet_grid aes geom_line .data geom_point #" @importFrom data.table dcast #" @importFrom ggdist geom_lineribbon #" @export #" @examples #" library(ggplot2) #" library(magrittr) #" #" example_sample_continuous %>% #" make_NA ( #" what = "truth", #" target_end_date >= "2021-07-22", #" target_end_date < "2021-05-01" #" ) %>% #" make_NA ( #" what = "forecast", #" model != "EuroCOVIDhub-ensemble", #" forecast_date != "2021-06-07" #" ) %>% #" plot_predictions ( #" x = "target_end_date", #" by = c("target_type", "location"), #" interval_range = c(0, 50, 90, 95) #" ) + #" facet_wrap(~ location + target_type, scales = "free_y") + #" aes(fill = model, color = model) #" #" example_sample_continuous %>% #" make_NA ( #" what = "truth", #" target_end_date >= "2021-07-22", #" target_end_date < "2021-05-01" #" ) %>% #" make_NA ( #" what = "forecast", #" forecast_date != "2021-06-07" #" ) %>% #" plot_predictions ( #" x = "target_end_date", #" by = c("target_type", "location"), #" interval_range = 0 #" ) + #" facet_wrap(~ location + target_type, scales = "free_y") + #" aes(fill = model, color = model) library(ggdist) plot_predictions <- function(data, by = NULL, x = "date", interval_range = c(0, 50, 90)) { # split truth data and forecasts in order to apply different filtering truth_data <- data.table::as.data.table(data)[!is.na(observed)] forecasts <- data.table::as.data.table(data)[!is.na(predicted)] del_cols <- colnames(truth_data)[!(colnames(truth_data) %in% c(by, "observed", x))] truth_data <- unique(suppressWarnings(truth_data[, eval(del_cols) := NULL])) # find out what type of predictions we have. convert sample based to # interval range data if ("quantile_level" %in% colnames(data)) { forecasts <- scoringutils:::quantile_to_interval( forecasts, keep_quantile_col = FALSE ) } else if ("sample_id" %in% colnames(data)) { # using a scoringutils internal function forecasts <- scoringutils:::sample_to_interval_long( as_forecast_sample(forecasts), interval_range = interval_range, keep_quantile_col = FALSE ) } # select appropriate boundaries and pivot wider select <- forecasts$interval_range %in% setdiff(interval_range, 0) intervals <- forecasts[select, ] # delete quantile column in intervals if present. This is important for # pivoting if ("quantile_level" %in% names(intervals)) { intervals[, quantile_level := NULL] } plot <- ggplot(data = data, aes(x = .data[[x]])) + theme_scoringutils() + ylab("True and predicted values") if (nrow(intervals) != 0) { # pivot wider and convert range to a factor intervals <- data.table::dcast(intervals, ... ~ boundary, value.var = "predicted") # only plot interval ranges if there are interval ranges to plot plot <- plot + ggdist::geom_lineribbon( data = intervals, aes( ymin = lower, ymax = upper, # We use the fill_ramp aesthetic for this instead of the default fill # because we want to keep fill to be able to use it for other # variables fill_ramp = factor( interval_range, levels = sort(unique(interval_range), decreasing = TRUE) ) ), lwd = 0.4 ) + ggdist::scale_fill_ramp_discrete( name = "interval_range", # range argument was added to make sure that the line for the median # and the ribbon don"t have the same opacity, making the line # invisible range = c(0.15, 0.75) ) } # We could treat this step as part of ggdist::geom_lineribbon() but we treat # it separately here to deal with the case when only the median is provided # (in which case ggdist::geom_lineribbon() will fail) if (0 %in% interval_range) { select_median <- forecasts$interval_range == 0 & forecasts$boundary == "lower" median <- forecasts[select_median] if (nrow(median) > 0) { plot <- plot + geom_line( data = median, mapping = aes(y = predicted), lwd = 0.4 ) } } # add observed values if (nrow(truth_data) > 0) { plot <- plot + geom_point( data = truth_data, show.legend = FALSE, inherit.aes = FALSE, aes(x = .data[[x]], y = observed), color = "black", size = 0.5 ) + geom_line( data = truth_data, inherit.aes = FALSE, show.legend = FALSE, aes(x = .data[[x]], y = observed), linetype = 1, color = "grey40", lwd = 0.2 ) } return(plot) } ## ----------------------------------------------------------------------------- #" @title Make Rows NA in Data for Plotting #" #" @description #" Filters the data and turns values into `NA` before the data gets passed to #" [plot_predictions()]. The reason to do this is to this is that it allows to #" "filter" prediction and truth data separately. Any value that is NA will then #" be removed in the subsequent call to [plot_predictions()]. #" #" @inheritParams score #" @param what character vector that determines which values should be turned #" into `NA`. If `what = "truth"`, values in the column "observed" will be #" turned into `NA`. If `what = "forecast"`, values in the column "prediction" #" will be turned into `NA`. If `what = "both"`, values in both column will be #" turned into `NA`. #" @param ... logical statements used to filter the data #" @return A data.table #" @importFrom rlang enexprs #" @keywords plotting #" @export #" #" @examples #" make_NA ( #" example_sample_continuous, #" what = "truth", #" target_end_date >= "2021-07-22", #" target_end_date < "2021-05-01" #" ) make_NA <- function(data = NULL, what = c("truth", "forecast", "both"), ...) { stopifnot(is.data.frame(data)) data <- as.data.table(data) what <- match.arg(what) # turn ... arguments into expressions args <- enexprs(...) vars <- NULL if (what %in% c("forecast", "both")) { vars <- c(vars, "predicted") } if (what %in% c("truth", "both")) { vars <- c(vars, "observed") } for (expr in args) { data <- data[eval(expr), eval(vars) := NA_real_] } return(data[]) } ## ----eval=require("ggdist", quietly = TRUE)----------------------------------- median_forecasts <- example_quantile[quantile_level == 0.5] median_forecasts %>% make_NA(what = "truth", target_end_date <= "2021-05-01", target_end_date > "2021-07-22") %>% make_NA(what = "forecast", model != "EuroCOVIDhub-ensemble", forecast_date != "2021-06-07") %>% plot_predictions( by = c("location", "target_type"), x = "target_end_date" ) + facet_wrap(location ~ target_type, scales = "free_y") ## ----eval=require("ggdist", quiet = TRUE)------------------------------------- example_quantile %>% make_NA(what = "truth", target_end_date <= "2021-05-01", target_end_date > "2021-07-22") %>% make_NA(what = "forecast", model != "EuroCOVIDhub-ensemble", forecast_date != "2021-06-07") %>% plot_predictions( by = c("location", "target_type"), x = "target_end_date", interval_range = c(0, 10, 20, 30, 40, 50, 60) ) + facet_wrap(location ~ target_type, scales = "free_y") ## ----eval=require("ggdist", quietly = TRUE)----------------------------------- example_sample_continuous %>% make_NA(what = "truth", target_end_date <= "2021-05-01", target_end_date > "2021-07-22") %>% make_NA(what = "forecast", model != "EuroCOVIDhub-ensemble", forecast_date != "2021-06-07") %>% plot_predictions( by = c("location", "target_type"), x = "target_end_date", interval_range = c(0, 50, 90, 95) ) + facet_wrap(location ~ target_type, scales = "free_y") ## ----eval=require("ggdist", quietly = TRUE)----------------------------------- example_quantile %>% make_NA(what = "truth", target_end_date > "2021-07-15", target_end_date <= "2021-05-22") %>% make_NA(what = "forecast", !(model %in% c("EuroCOVIDhub-ensemble", "EuroCOVIDhub-baseline")), forecast_date != "2021-06-28") %>% plot_predictions(x = "target_end_date", by = c("target_type", "location")) + aes(colour = model, fill = model) + facet_wrap(target_type ~ location, ncol = 4, scales = "free_y") + labs(x = "Target end date") ## ----------------------------------------------------------------------------- #" @title Plot Metrics by Range of the Prediction Interval #" #" @description #" Visualise the metrics by range, e.g. if you are interested how different #" interval ranges contribute to the overall interval score, or how #" sharpness / dispersion changes by range. #" #" @param scores A data.frame of scores based on quantile forecasts as #" produced by [score()] or [summarise_scores()]. Note that "range" must be included #" in the `by` argument when running [summarise_scores()] #" @param y The variable from the scores you want to show on the y-Axis. #" This could be something like "wis" (the default) or "dispersion" #" @param x The variable from the scores you want to show on the x-Axis. #" Usually this will be "model" #" @param colour Character vector of length one used to determine a variable #" for colouring dots. The Default is "range". #" @return A ggplot2 object showing a contributions from the three components of #" the weighted interval score #" @importFrom ggplot2 ggplot aes aes geom_point geom_line #" expand_limits theme theme_light element_text scale_color_continuous labs #" @export #" @examples #" library(ggplot2) #" ex <- data.table::copy(example_quantile) #" ex$range <- scoringutils:::get_range_from_quantile(ex$quantile) #" scores <- suppressWarnings(score(as_forecast_quantile(ex), metrics = list("wis" = wis))) #" summarised <- summarise_scores( #" scores, #" by = c("model", "target_type", "range") #" ) #" plot_interval_ranges(summarised, x = "model") + #" facet_wrap(~target_type, scales = "free") plot_interval_ranges <- function(scores, y = "wis", x = "model", colour = "range") { plot <- ggplot( scores, aes( x = .data[[x]], y = .data[[y]], colour = .data[[colour]] ) ) + geom_point(size = 2) + geom_line(aes(group = range), colour = "black", linewidth = 0.01 ) + scale_color_continuous(low = "steelblue", high = "salmon") + theme_scoringutils() + expand_limits(y = 0) + theme( legend.position = "right", axis.text.x = element_text( angle = 90, vjust = 1, hjust = 1 ) ) return(plot) } ## ----------------------------------------------------------------------------- range_example <- copy(example_quantile) %>% na.omit() %>% .[, range := scoringutils:::get_range_from_quantile(quantile_level)] sum_scores <- range_example %>% as_forecast_quantile() %>% score(metrics = list(wis = wis, dispersion = dispersion_quantile)) %>% summarise_scores(by = c("model", "target_type", "range")) %>% suppressWarnings() plot_interval_ranges(sum_scores, x = "model") + facet_wrap(~target_type, scales = "free") ## ----------------------------------------------------------------------------- plot_interval_ranges(sum_scores, y = "dispersion", x = "model") + facet_wrap(~target_type, scales = "free_y") ## ----------------------------------------------------------------------------- #' @title Plot Coloured Score Table #' #' @description #' Plots a coloured table of summarised scores obtained using #' [score()]. #' #' @param y the variable to be shown on the y-axis. Instead of a single character string, #' you can also specify a vector with column names, e.g. #' `y = c("model", "location")`. These column names will be concatenated #' to create a unique row identifier (e.g. "model1_location1"). #' @param by A character vector that determines how the colour shading for the #' plot gets computed. By default (`NULL`), shading will be determined per #' metric, but you can provide additional column names (see examples). #' @param metrics A character vector with the metrics to show. If set to #' `NULL` (default), all metrics present in `scores` will be shown. #' #' @returns A ggplot object with a coloured table of summarised scores #' @inheritParams get_pairwise_comparisons #' @importFrom ggplot2 ggplot aes element_blank element_text labs coord_cartesian coord_flip #' @importFrom data.table setDT melt #' @importFrom stats sd #' @export #' #' @examples #' library(ggplot2) #' library(magrittr) # pipe operator #' \dontshow{ #' data.table::setDTthreads(2) # restricts number of cores used on CRAN #' } #' #' scores <- score(as_forecast_quantile(example_quantile)) %>% #' summarise_scores(by = c("model", "target_type")) %>% #' summarise_scores(by = c("model", "target_type"), fun = signif, digits = 2) #' #' plot_score_table(scores, y = "model", by = "target_type") + #' facet_wrap(~target_type, ncol = 1) #' #' # can also put target description on the y-axis #' plot_score_table(scores, #' y = c("model", "target_type"), #' by = "target_type") plot_score_table <- function(scores, y = "model", by = NULL, metrics = NULL) { # identify metrics ----------------------------------------------------------- id_vars <- get_forecast_unit(scores) metrics <- get_metrics(scores) cols_to_delete <- names(scores)[!(names(scores) %in% c(metrics, id_vars))] suppressWarnings(scores[, eval(cols_to_delete) := NULL]) # compute scaled values ------------------------------------------------------ # scaling is done in order to colour the different scores # for most metrics larger is worse, but others like bias are better if they # are close to zero and deviations in both directions are bad # define which metrics are scaled using min (larger is worse) and # which not (metrics like bias where deviations in both directions are bad) metrics_zero_good <- c("bias", "interval_coverage_deviation") metrics_no_color <- "coverage" metrics_min_good <- setdiff(metrics, c( metrics_zero_good, metrics_no_color )) # write scale functions that can be used in data.table scale <- function(x) { scaled <- x / sd(x, na.rm = TRUE) return(scaled) } scale_min_good <- function(x) { scaled <- (x - min(x)) / sd(x, na.rm = TRUE) return(scaled) } # pivot longer and add scaled values df <- data.table::melt(scores, value.vars = metrics, id.vars = id_vars, variable.name = "metric" ) df[metric %in% metrics_min_good, value_scaled := scale_min_good(value), by = c("metric", by) ] df[metric %in% metrics_zero_good, value_scaled := scale(value), by = c("metric", by) ] df[metric %in% metrics_no_color, value_scaled := 0, by = c("metric", by) ] # create identifier column for plot ------------------------------------------ # if there is only one column, leave column as is. Reason to do that is that # users can then pass in a factor and keep the ordering of that column intact if (length(y) > 1) { df[, identifCol := do.call(paste, c(.SD, sep = "_")), .SDcols = y[y %in% names(df)]] } else { setnames(df, old = eval(y), new = "identifCol") } # plot ----------------------------------------------------------------------- # make plot with all metrics that are not NA plot <- ggplot( df[!is.na(value), ], aes(y = identifCol, x = metric) ) + geom_tile(aes(fill = value_scaled), colour = "white", show.legend = FALSE) + geom_text(aes(y = identifCol, label = value)) + scale_fill_gradient2(low = "steelblue", high = "salmon") + theme_scoringutils() + theme( legend.title = element_blank(), legend.position = "none", axis.text.x = element_text( angle = 90, vjust = 1, hjust = 1 ) ) + labs(x = "", y = "") + coord_cartesian(expand = FALSE) return(plot) } ## ----------------------------------------------------------------------------- scores <- score(as_forecast_quantile(example_quantile)) %>% summarise_scores(by = c("model", "target_type")) %>% summarise_scores(by = c("model", "target_type"), fun = signif, digits = 2) plot_score_table(scores, y = "model", by = "target_type") + facet_wrap(~target_type, ncol = 1) ## ----------------------------------------------------------------------------- # can also put target description on the y-axis plot_score_table(scores, y = c("model", "target_type"), by = "target_type")