percell_workflow: Per-Cell Annotation Workflow Example

Description

Example workflow for using SlimR's per-cell annotation functions

Arguments

Overview

The per-cell annotation workflow in SlimR provides an alternative to cluster-based annotation by scoring and labeling individual cells based on marker expression. This is useful when:

- Clusters contain mixed cell types
- You want finer-grained annotations
- Cell states exist on a continuum
- UMAP spatial context can improve annotation quality

Basic Workflow


# 1. Prepare your Seurat object (must have normalized data)
library(SlimR)
library(Seurat)

# 2. Create or load marker list
Markers_list <- Markers_filter_Cellmarker2(
    Cellmarker2,
    species = "Human",
    tissue_class = "Intestine"
)

# 3. Run per-cell annotation
result <- Celltype_Calculate_PerCell(
    seurat_obj = sce,
    gene_list = Markers_list,
    species = "Human",
    method = "weighted",           # "weighted", "mean", or "AUCell"
    min_expression = 0.1,
    min_score = 0.1,
    verbose = TRUE
)

# 4. Annotate Seurat object
sce <- Celltype_Annotation_PerCell(
    seurat_obj = sce,
    SlimR_percell_result = result,
    plot_UMAP = TRUE,
    plot_confidence = TRUE,
    annotation_col = "Cell_type_PerCell"
)

# 5. Verify annotations
dotplot <- Celltype_Verification_PerCell(
    seurat_obj = sce,
    SlimR_percell_result = result,
    gene_number = 5,
    annotation_col = "Cell_type_PerCell"
)
print(dotplot)

Advanced

UMAP Spatial Smoothing:


# Use UMAP coordinates to smooth predictions via k-NN
# This reduces noise and improves consistency in spatial regions
result_smooth <- Celltype_Calculate_PerCell(
    seurat_obj = sce,
    gene_list = Markers_list,
    species = "Human",
    use_umap_smoothing = TRUE,
    k_neighbors = 20,              # Number of neighbors to consider
    smoothing_weight = 0.3,        # 30% weight given to smoothing
    verbose = TRUE
)
# Compare smoothed vs unsmoothed
sce$Cell_type_Smooth <- result_smooth$Cell_annotations$Predicted_cell_type
sce$Cell_type_Raw <- result$Cell_annotations$Predicted_cell_type
DimPlot(sce, group.by = "Cell_type_Raw") | 
  DimPlot(sce, group.by = "Cell_type_Smooth")

Scoring Methods Comparison


# Method 1: Weighted (recommended for most cases)
# Combines expression with marker specificity and detection rate
result_weighted <- Celltype_Calculate_PerCell(
    seurat_obj = sce,
    gene_list = Markers_list,
    species = "Human",
    method = "weighted"
)

# Method 2: Mean (simple, fast)
# Just averages normalized marker expression
result_mean <- Celltype_Calculate_PerCell(
    seurat_obj = sce,
    gene_list = Markers_list,
    species = "Human",
    method = "mean"
)

# Method 3: AUCell (rank-based, robust to batch effects)
# Scores based on proportion of markers in the top 5% of ranked genes
result_aucell <- Celltype_Calculate_PerCell(
    seurat_obj = sce,
    gene_list = Markers_list,
    species = "Human",
    method = "AUCell"
)

Comparing Cluster vs Per-Cell Annotation


# Cluster-based annotation (original SlimR approach)
cluster_result <- Celltype_Calculate(
    seurat_obj = sce,
    gene_list = Markers_list,
    species = "Human",
    cluster_col = "seurat_clusters"
)

sce <- Celltype_Annotation(
    seurat_obj = sce,
    cluster_col = "seurat_clusters",
    SlimR_anno_result = cluster_result,
    annotation_col = "Cell_type_Cluster"
)

# Per-cell annotation
percell_result <- Celltype_Calculate_PerCell(
    seurat_obj = sce,
    gene_list = Markers_list,
    species = "Human"
)

sce <- Celltype_Annotation_PerCell(
    seurat_obj = sce,
    SlimR_percell_result = percell_result,
    annotation_col = "Cell_type_PerCell"
)

# Compare
library(ggplot2)
library(patchwork)

p1 <- DimPlot(sce, group.by = "Cell_type_Cluster") + ggtitle("Cluster-based")
p2 <- DimPlot(sce, group.by = "Cell_type_PerCell") + ggtitle("Per-cell")

p1 | p2

# Check agreement
table(sce$Cell_type_Cluster, sce$Cell_type_PerCell)

Performance Optimization


# For large datasets, adjust chunk_size to manage memory
result <- Celltype_Calculate_PerCell(
    seurat_obj = sce,
    gene_list = Markers_list,
    species = "Human",
    chunk_size = 10000,  # Process 10k cells at a time
    verbose = TRUE
)

# For UMAP smoothing, install RANN for 10-100x speedup
# install.packages("RANN")

result_smooth <- Celltype_Calculate_PerCell(
    seurat_obj = sce,
    gene_list = Markers_list,
    species = "Human",
    use_umap_smoothing = TRUE,
    k_neighbors = 15
    # RANN will be used automatically if installed
)

Accessing Results


# Cell-level annotations
head(result$Cell_annotations)
#   Cell_barcode Predicted_cell_type Max_score Confidence
# 1  AAACCTGAG... Enterocyte          0.85      0.62
# 2  AAACCTGCA... Goblet cell         0.72      0.45

# Summary statistics
result$Summary
#   Cell_type   Count Percentage
# 1 Enterocyte  5432  45.2
# 2 Goblet cell 2156  17.9

# Full probability matrix (if return_scores = TRUE)
result$Probability_matrix[1:5, 1:3]
#              Enterocyte Goblet_cell Stem_cell
# AAACCTGAG... 0.85       0.10        0.05

# Extract high-confidence cells
high_conf <- result$Cell_annotations$Cell_barcode[
    result$Cell_annotations$Confidence > 0.5
]

# Extract uncertain cells for manual review
uncertain <- result$Cell_annotations$Cell_barcode[
    result$Cell_annotations$Confidence < 0.2
]