% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/calculateTopLoadingGeneShifts.R,
%   R/plot.calculateTopLoadingGeneShiftsObject.R
\name{calculateTopLoadingGeneShifts}
\alias{calculateTopLoadingGeneShifts}
\alias{plot.calculateTopLoadingGeneShiftsObject}
\title{Calculate Top Loading Gene Expression Shifts}
\usage{
calculateTopLoadingGeneShifts(
  query_data,
  reference_data,
  query_cell_type_col,
  ref_cell_type_col,
  cell_types = NULL,
  pc_subset = 1:5,
  n_top_loadings = 50,
  p_value_threshold = 0.05,
  adjust_method = "fdr",
  assay_name = "logcounts",
  detect_anomalies = FALSE,
  anomaly_comparison = FALSE,
  anomaly_threshold = 0.6,
  n_tree = 500,
  max_cells_query = 5000,
  max_cells_ref = 5000
)

\method{plot}{calculateTopLoadingGeneShiftsObject}(
  x,
  cell_type,
  pc_subset = 1:3,
  plot_type = c("heatmap", "barplot", "boxplot"),
  plot_by = c("p_adjusted", "top_loading"),
  n_genes = 10,
  significance_threshold = 0.05,
  show_anomalies = FALSE,
  pseudo_bulk = FALSE,
  cluster_cols = FALSE,
  draw_plot = TRUE,
  show_all_query = TRUE,
  max_cells_ref = NULL,
  max_cells_query = NULL,
  ...
)
}
\arguments{
\item{query_data}{A \code{\linkS4class{SingleCellExperiment}} object containing numeric expression matrix for the query cells.}

\item{reference_data}{A \code{\linkS4class{SingleCellExperiment}} object containing numeric expression matrix for the reference cells.}

\item{query_cell_type_col}{The column name in the \code{colData} of \code{query_data} that identifies the cell types.}

\item{ref_cell_type_col}{The column name in the \code{colData} of \code{reference_data} that identifies the cell types.}

\item{cell_types}{A character vector specifying the cell types to analyze. If NULL, all common cell types are used.}

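\item{pc_subset}{A numeric vector specifying which principal components to use. For \code{calculateTopLoadingGeneShifts}, these are the PCs to analyze (default is 1:5). For the \code{plot} method, these are the PCs to plot (default is 1:3).}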

\item{n_top_loadings}{Number of top loading genes to analyze per PC. Default is 50.}

\item{p_value_threshold}{P-value threshold for statistical significance. Default is 0.05.}

\item{adjust_method}{Method for multiple testing correction. Default is "fdr".}

\item{assay_name}{Name of the assay on which to perform computations. Default is "logcounts".}

\item{detect_anomalies}{Logical indicating whether to perform anomaly detection using isolation forests.
Default is FALSE.}

\item{anomaly_comparison}{Logical indicating whether to perform statistical comparisons
between non-anomalous reference cells and anomalous query cells instead of all-vs-all
comparisons. When TRUE, only non-anomalous reference cells are compared against only
anomalous query cells for each cell type. Requires detect_anomalies = TRUE. Default is FALSE.}

\item{anomaly_threshold}{A numeric value specifying the threshold for identifying anomalies when
\code{detect_anomalies} is TRUE. Default is 0.6.}

\item{n_tree}{An integer specifying the number of trees for the isolation forest when
\code{detect_anomalies} is TRUE. Default is 500.}

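\item{max_cells_query}{Maximum number of query cells to include. For
\code{calculateTopLoadingGeneShifts}, the default is 5000. For the \code{plot} method,
this limits the query cells included in the plot; if NULL, all available query cells
are plotted (default is NULL).}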

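\item{max_cells_ref}{Maximum number of reference cells to include. For
\code{calculateTopLoadingGeneShifts}, the default is 5000. For the \code{plot} method,
this limits the reference cells included in the plot; if NULL, all available reference
cells are plotted (default is NULL).}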

\item{x}{An object of class \code{calculateTopLoadingGeneShiftsObject}.}

\item{cell_type}{A character string specifying the cell type to plot (must be exactly one).}

\item{plot_type}{A character string specifying visualization type. Either "heatmap", "barplot", or "boxplot".
Default is "heatmap".}

\item{plot_by}{A character string specifying the gene selection method when \code{n_genes} is not NULL.
Either "top_loading" or "p_adjusted". Default is "p_adjusted".}

\item{n_genes}{Number of top genes to show per PC. Can be NULL if \code{significance_threshold} is set.
Default is 10.}

\item{significance_threshold}{If not NULL, a numeric value between 0 and 1. Used for gene
selection or annotation. Default is 0.05.}

\item{show_anomalies}{Logical indicating whether to display anomaly status annotations.
Default is FALSE. Requires anomaly results to be present in the object.}

\item{pseudo_bulk}{Logical indicating whether to create pseudo-bulk profiles instead of
showing individual cells. When TRUE, expression values are averaged within groups
(dataset and optionally anomaly status). Not compatible with boxplot visualization.
Required for barplot visualization. Default is FALSE.}

\item{cluster_cols}{Logical indicating whether to cluster columns in the heatmap when
\code{pseudo_bulk = TRUE}. When TRUE, columns (pseudo-bulk profiles) will be
hierarchically clustered. When FALSE, columns maintain their original ordering
(Query groups followed by Reference groups). Only applicable when
\code{pseudo_bulk = TRUE} and \code{plot_type = "heatmap"}. Default is FALSE.}

\item{draw_plot}{Logical indicating whether to draw the plot immediately (TRUE) or return
the undrawn plot object (FALSE). For heatmaps, FALSE returns a ComplexHeatmap
object that can be further customized before drawing. Default is TRUE.}

\item{show_all_query}{Logical indicating whether to show the yellow bar for all query vs reference
comparison. Default is TRUE. When FALSE, only green and red bars are shown.}

\item{...}{Additional arguments passed to \code{\link[ComplexHeatmap]{draw}} when drawing heatmaps. Ignored for other plot types.}
}
\value{
For \code{calculateTopLoadingGeneShifts}, a list containing:
\itemize{
  \item PC results: Named elements for each PC (e.g., "PC1", "PC2") containing data frames with gene-level analysis results.
  \item expression_data: Matrix of expression values for all analyzed genes (genes × cells).
  \item cell_metadata: Data frame with columns: cell_id, dataset, cell_type, original_index, and optionally anomaly_status.
  \item gene_metadata: Data frame with columns: gene, pc, loading for all analyzed genes.
  \item percent_var: Named numeric vector of global percent variance explained for each analyzed PC.
  \item cell_type_variance: A data frame detailing the percent of variance a global PC explains within specific cell types for both query and reference datasets.
  \item anomaly_results: If \code{detect_anomalies} is TRUE, contains the full output from \code{detectAnomaly}.
}

The \code{cell_type_variance} data frame contains columns: pc, cell_type, dataset, percent_variance.
When anomaly detection is enabled, \code{cell_metadata} includes an additional \code{anomaly_status} column.

For the \code{plot} method, a plot object. For heatmaps when \code{draw_plot = FALSE}, returns a ComplexHeatmap object.
For boxplots and barplots, returns a ggplot2 object.
}
\description{
This function identifies genes with the highest loadings for specified principal components
and performs statistical tests to detect distributional differences between query and reference data.
It also calculates the proportion of variance explained by each principal component within
specific cell types. Optionally, it can detect anomalous cells using isolation forests.

The \code{plot} method creates visualizations showing expression distributions for top loading genes
that exhibit distributional differences between query and reference datasets. Can display
results as elegant complex heatmaps, information-rich summary boxplots, or pseudo-bulk fold
change barplots. Optionally displays anomaly status when available.
}
\details{
This function extracts the top loading genes for each specified principal component from the reference
PCA space and performs distributional comparisons between query and reference data. For each gene,
it performs statistical tests to identify genes that may be causing PC-specific alignment issues
between datasets. A key feature is the calculation of cell-type-specific variance explained by
global PCs, providing a more nuanced view of how major biological axes affect individual populations.
When anomaly detection is enabled, isolation forests are used to identify anomalous cells based on
their PCA projections.

When \code{anomaly_comparison = TRUE}, the statistical analysis focuses specifically on
comparing non-anomalous reference cells against anomalous query cells. This can help
identify genes that are differentially expressed between "normal" reference cells and
potentially problematic query cells, providing insights into what makes certain query
cells anomalous.

The \code{plot} method visualizes the results from \code{calculateTopLoadingGeneShifts}.
The "heatmap" option displays a hierarchically clustered set of genes.
The "boxplot" option creates a two-panel plot using \pkg{ggplot2}: the left panel shows
horizontal expression boxplots for up to 5 PCs, while the right panel displays their
corresponding PC loadings and adjusted p-values.
The "barplot" option creates horizontal barplots showing log2 fold changes between
pseudo-bulk expression profiles (query vs reference), with genes ordered identically
to the heatmap clustering. Bars show comparisons for query non-anomaly (green),
optionally all query cells (yellow), and query anomaly cells (red) versus reference.
When anomaly detection results are available and \code{show_anomalies} is TRUE,
additional annotation bars or visual cues highlight anomalous cells.
}
\seealso{
\code{\link{plot.calculateTopLoadingGeneShiftsObject}}, \code{\link{detectAnomaly}}

\code{\link{calculateTopLoadingGeneShifts}}
}
\author{
Anthony Christidis, \email{anthony-alexander_christidis@hms.harvard.edu}
}
